9#include <unordered_map>
174 struct Options *options =
nullptr) {
185 [[nodiscard]] std::string
convert();
225 [[nodiscard]]
bool ok()
const;
240 return html_ == c.html_ && option == c.option;
// Explicit conversion to bool: evaluates to whatever ok() returns, so an
// instance can be tested directly in a condition. Being `explicit`, it does
// not take part in implicit conversions to integer types.
246 inline explicit operator bool()
const {
return ok(); };
// HTML attribute names, held as compile-time C-string constants.
250 static constexpr const char *kAttributeHref =
"href";
251 static constexpr const char *kAttributeAlt =
"alt";
252 static constexpr const char *kAttributeTitle =
"title";
253 static constexpr const char *kAttributeClass =
"class";
254 static constexpr const char *kAttributeSrc =
"src";
// NOTE(review): the identifier below is spelled "Attrinute" rather than
// "Attribute". It is left unchanged here because code outside this view may
// refer to it by this name.
255 static constexpr const char *kAttrinuteAlign =
"align";
// HTML tag names (without angle brackets), held as compile-time C-string
// constants.
257 static constexpr const char *kTagAnchor =
"a";
258 static constexpr const char *kTagBreak =
"br";
259 static constexpr const char *kTagCode =
"code";
260 static constexpr const char *kTagDiv =
"div";
261 static constexpr const char *kTagHead =
"head";
262 static constexpr const char *kTagLink =
"link";
263 static constexpr const char *kTagListItem =
"li";
264 static constexpr const char *kTagMeta =
"meta";
265 static constexpr const char *kTagNav =
"nav";
266 static constexpr const char *kTagNoScript =
"noscript";
267 static constexpr const char *kTagOption =
"option";
268 static constexpr const char *kTagOrderedList =
"ol";
269 static constexpr const char *kTagParagraph =
"p";
270 static constexpr const char *kTagPre =
"pre";
271 static constexpr const char *kTagScript =
"script";
272 static constexpr const char *kTagSpan =
"span";
273 static constexpr const char *kTagStyle =
"style";
274 static constexpr const char *kTagTemplate =
"template";
275 static constexpr const char *kTagTitle =
"title";
276 static constexpr const char *kTagUnorderedList =
"ul";
277 static constexpr const char *kTagImg =
"img";
// The <hr> tag. NOTE(review): the identifier is spelled "Seperator" (not
// "Separator"); kept as-is since other code may use this name.
278 static constexpr const char *kTagSeperator =
"hr";
// Tag names for inline text formatting. Several styles have two constants
// because HTML offers two tags for them (<b>/<strong>, <em>/<i>, <del>/<s>).
281 static constexpr const char *kTagBold =
"b";
282 static constexpr const char *kTagStrong =
"strong";
283 static constexpr const char *kTagItalic =
"em";
284 static constexpr const char *kTagItalic2 =
"i";
285 static constexpr const char *kTagCitation =
"cite";
286 static constexpr const char *kTagDefinition =
"dfn";
287 static constexpr const char *kTagUnderline =
"u";
// NOTE(review): "Strighthrought" is a misspelling of "Strikethrough"; the
// identifiers are left unchanged because other code may refer to them.
288 static constexpr const char *kTagStrighthrought =
"del";
289 static constexpr const char *kTagStrighthrought2 =
"s";
// The <blockquote> tag.
291 static constexpr const char *kTagBlockquote =
"blockquote";
// Tag names of the six HTML heading levels, <h1> through <h6>.
294 static constexpr const char *kTagHeader1 =
"h1";
295 static constexpr const char *kTagHeader2 =
"h2";
296 static constexpr const char *kTagHeader3 =
"h3";
297 static constexpr const char *kTagHeader4 =
"h4";
298 static constexpr const char *kTagHeader5 =
"h5";
299 static constexpr const char *kTagHeader6 =
"h6";
// Tag names of the HTML table elements: the table itself, a row, a header
// cell and a data cell.
302 static constexpr const char *kTagTable =
"table";
303 static constexpr const char *kTagTableRow =
"tr";
304 static constexpr const char *kTagTableHeader =
"th";
305 static constexpr const char *kTagTableData =
"td";
307 size_t index_ch_in_html_ = 0;
309 bool is_closing_tag_ =
false;
310 bool is_in_attribute_value_ =
false;
311 bool is_in_code_ =
false;
312 bool is_in_list_ =
false;
313 bool is_in_p_ =
false;
314 bool is_in_pre_ =
false;
315 bool is_in_table_ =
false;
316 bool is_in_table_row_ =
false;
317 bool is_in_tag_ =
false;
318 bool is_self_closing_tag_ =
false;
321 bool is_in_ordered_list_ =
false;
322 uint8_t index_ol = 0;
325 size_t table_start = 0;
328 uint8_t index_li = 0;
330 uint8_t index_blockquote = 0;
332 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
333 char prev_ch_in_html_ =
'x';
337 uint16_t offset_lt_ = 0;
338 std::string current_tag_;
339 std::string prev_tag_;
342 std::string tableLine;
344 size_t chars_in_curr_line_ = 0;
352 virtual void OnHasLeftOpeningTag(
Converter *c) = 0;
353 virtual void OnHasLeftClosingTag(
Converter *c) = 0;
359 struct TagIgnored : Tag {
360 void OnHasLeftOpeningTag(
Converter *c)
override{};
361 void OnHasLeftClosingTag(
Converter *c)
override{};
364 struct TagAnchor : Tag {
365 void OnHasLeftOpeningTag(
Converter *c)
override;
366 void OnHasLeftClosingTag(
Converter *c)
override;
368 std::string current_href_;
369 std::string current_title_;
372 struct TagBold : Tag {
373 void OnHasLeftOpeningTag(
Converter *c)
override;
374 void OnHasLeftClosingTag(
Converter *c)
override;
377 struct TagItalic : Tag {
378 void OnHasLeftOpeningTag(
Converter *c)
override;
379 void OnHasLeftClosingTag(
Converter *c)
override;
382 struct TagUnderline : Tag {
383 void OnHasLeftOpeningTag(
Converter *c)
override;
384 void OnHasLeftClosingTag(
Converter *c)
override;
387 struct TagStrikethrought : Tag {
388 void OnHasLeftOpeningTag(
Converter *c)
override;
389 void OnHasLeftClosingTag(
Converter *c)
override;
392 struct TagBreak : Tag {
393 void OnHasLeftOpeningTag(
Converter *c)
override;
394 void OnHasLeftClosingTag(
Converter *c)
override;
397 struct TagDiv : Tag {
398 void OnHasLeftOpeningTag(
Converter *c)
override;
399 void OnHasLeftClosingTag(
Converter *c)
override;
402 struct TagHeader1 : Tag {
403 void OnHasLeftOpeningTag(
Converter *c)
override;
404 void OnHasLeftClosingTag(
Converter *c)
override;
407 struct TagHeader2 : Tag {
408 void OnHasLeftOpeningTag(
Converter *c)
override;
409 void OnHasLeftClosingTag(
Converter *c)
override;
412 struct TagHeader3 : Tag {
413 void OnHasLeftOpeningTag(
Converter *c)
override;
414 void OnHasLeftClosingTag(
Converter *c)
override;
417 struct TagHeader4 : Tag {
418 void OnHasLeftOpeningTag(
Converter *c)
override;
419 void OnHasLeftClosingTag(
Converter *c)
override;
422 struct TagHeader5 : Tag {
423 void OnHasLeftOpeningTag(
Converter *c)
override;
424 void OnHasLeftClosingTag(
Converter *c)
override;
427 struct TagHeader6 : Tag {
428 void OnHasLeftOpeningTag(
Converter *c)
override;
429 void OnHasLeftClosingTag(
Converter *c)
override;
432 struct TagListItem : Tag {
433 void OnHasLeftOpeningTag(
Converter *c)
override;
434 void OnHasLeftClosingTag(
Converter *c)
override;
437 struct TagOption : Tag {
438 void OnHasLeftOpeningTag(
Converter *c)
override;
439 void OnHasLeftClosingTag(
Converter *c)
override;
442 struct TagOrderedList : Tag {
443 void OnHasLeftOpeningTag(
Converter *c)
override;
444 void OnHasLeftClosingTag(
Converter *c)
override;
447 struct TagParagraph : Tag {
448 void OnHasLeftOpeningTag(
Converter *c)
override;
449 void OnHasLeftClosingTag(
Converter *c)
override;
452 struct TagPre : Tag {
453 void OnHasLeftOpeningTag(
Converter *c)
override;
454 void OnHasLeftClosingTag(
Converter *c)
override;
457 struct TagCode : Tag {
458 void OnHasLeftOpeningTag(
Converter *c)
override;
459 void OnHasLeftClosingTag(
Converter *c)
override;
462 struct TagSpan : Tag {
463 void OnHasLeftOpeningTag(
Converter *c)
override;
464 void OnHasLeftClosingTag(
Converter *c)
override;
467 struct TagTitle : Tag {
468 void OnHasLeftOpeningTag(
Converter *c)
override;
469 void OnHasLeftClosingTag(
Converter *c)
override;
472 struct TagUnorderedList : Tag {
473 void OnHasLeftOpeningTag(
Converter *c)
override;
474 void OnHasLeftClosingTag(
Converter *c)
override;
477 struct TagImage : Tag {
478 void OnHasLeftOpeningTag(
Converter *c)
override;
479 void OnHasLeftClosingTag(
Converter *c)
override;
482 struct TagSeperator : Tag {
483 void OnHasLeftOpeningTag(
Converter *c)
override;
484 void OnHasLeftClosingTag(
Converter *c)
override;
487 struct TagTable : Tag {
488 void OnHasLeftOpeningTag(
Converter *c)
override;
489 void OnHasLeftClosingTag(
Converter *c)
override;
492 struct TagTableRow : Tag {
493 void OnHasLeftOpeningTag(
Converter *c)
override;
494 void OnHasLeftClosingTag(
Converter *c)
override;
497 struct TagTableHeader : Tag {
498 void OnHasLeftOpeningTag(
Converter *c)
override;
499 void OnHasLeftClosingTag(
Converter *c)
override;
502 struct TagTableData : Tag {
503 void OnHasLeftOpeningTag(
Converter *c)
override;
504 void OnHasLeftClosingTag(
Converter *c)
override;
507 struct TagBlockquote : Tag {
508 void OnHasLeftOpeningTag(
Converter *c)
override;
509 void OnHasLeftClosingTag(
Converter *c)
override;
512 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
514 explicit Converter(std::string *html,
struct Options *options);
516 void CleanUpMarkdown();
519 static void LTrim(std::string *s);
522 Converter *RTrim(std::string *s,
bool trim_only_blank =
false);
529 void TidyAllLines(std::string *str);
531 std::string ExtractAttributeFromTagLeftOf(
const std::string &attr);
533 void TurnLineIntoHeader1();
535 void TurnLineIntoHeader2();
538 void OnHasEnteredTag();
548 bool ParseCharInTag(
char ch);
553 inline static bool TagContainsAttributesToHide(std::string *tag) {
556 return (*tag).find(
" aria=\"hidden\"") != string::npos ||
557 (*tag).find(
"display:none") != string::npos ||
558 (*tag).find(
"visibility:hidden") != string::npos ||
559 (*tag).find(
"opacity:0") != string::npos ||
560 (*tag).find(
"Details-content--hidden-not-important") != string::npos;
563 Converter *ShortenMarkdown(
size_t chars = 1);
564 inline bool shortIfPrevCh(
char prev) {
565 if (prev_ch_in_md_ == prev) {
576 bool ParseCharInTagContent(
char ch);
579 bool ReplacePreviousSpaceInLineByNewline();
581 static inline bool IsIgnoredTag(
const std::string &tag) {
582 return (tag[0] ==
'-' || kTagTemplate == tag || kTagStyle == tag ||
583 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
588 [[nodiscard]]
bool IsInIgnoredTag()
const;
597inline std::string
Convert(std::string &html,
bool *ok =
nullptr) {
605#ifndef PYTHON_BINDINGS
606inline std::string
Convert(std::string &&html,
bool *ok =
nullptr) {
Class for converting HTML to Markdown.
std::string convert()
Convert HTML into Markdown.
Converter * appendToMd(char ch)
Append a char to the Markdown.
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly (in the HTML).
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
bool operator==(const Converter &c) const
void reset()
Reset the generated Markdown.
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
std::string Convert(std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Options for the conversion from HTML to Markdown.
bool operator==(html2md::Options o) const
int softBreak
softBreak Wrap after ... characters when the next space is reached and as long as it's not in a list,...
int hardBreak
hardBreak Force a break after ... characters in a line
char orderedList
The char used after the number of the item.
bool formatTable
Whether to format Markdown tables.
bool splitLines
Add a new line when a certain number of characters is reached.
char unorderedList
The char used for unordered lists.
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.