9#include <unordered_map>
194 struct Options *options =
nullptr) {
// Convert HTML into Markdown.
// Returns the result as a std::string; [[nodiscard]] makes the compiler warn
// when a caller drops that return value.
205 [[nodiscard]] std::string
convert();
253 const std::string &replacement) {
254 htmlSymbolConversions_[htmlSymbol] = replacement;
264 htmlSymbolConversions_.erase(htmlSymbol);
// Checks if everything was closed properly (in the HTML).
// Const member: does not modify the converter. The boolean conversion
// operator of this class forwards to this function.
278 [[nodiscard]]
bool ok()
const;
293 return html_ == c.html_ && option == c.option;
299 inline explicit operator bool()
const {
return ok(); };
// HTML attribute names, kept as compile-time C-string constants.
303 static constexpr const char *kAttributeHref =
"href";
304 static constexpr const char *kAttributeAlt =
"alt";
305 static constexpr const char *kAttributeTitle =
"title";
306 static constexpr const char *kAttributeClass =
"class";
307 static constexpr const char *kAttributeSrc =
"src";
// NOTE(review): the identifier is misspelled ("Attrinute" instead of
// "Attribute"). It is left untouched here because code outside this view may
// refer to it by this exact name.
308 static constexpr const char *kAttrinuteAlign =
"align";
// HTML tag names, kept as compile-time C-string constants.
// kTagTemplate, kTagStyle, kTagScript, kTagNoScript and kTagNav are the names
// IsIgnoredTag() compares a tag against.
310 static constexpr const char *kTagAnchor =
"a";
311 static constexpr const char *kTagBreak =
"br";
312 static constexpr const char *kTagCode =
"code";
313 static constexpr const char *kTagDiv =
"div";
314 static constexpr const char *kTagHead =
"head";
315 static constexpr const char *kTagLink =
"link";
316 static constexpr const char *kTagListItem =
"li";
317 static constexpr const char *kTagMeta =
"meta";
318 static constexpr const char *kTagNav =
"nav";
319 static constexpr const char *kTagNoScript =
"noscript";
320 static constexpr const char *kTagOption =
"option";
321 static constexpr const char *kTagOrderedList =
"ol";
322 static constexpr const char *kTagParagraph =
"p";
323 static constexpr const char *kTagPre =
"pre";
324 static constexpr const char *kTagScript =
"script";
325 static constexpr const char *kTagSpan =
"span";
326 static constexpr const char *kTagStyle =
"style";
327 static constexpr const char *kTagTemplate =
"template";
328 static constexpr const char *kTagTitle =
"title";
329 static constexpr const char *kTagUnorderedList =
"ul";
330 static constexpr const char *kTagImg =
"img";
// NOTE(review): "Seperator" is a misspelling of "Separator"; the name is kept
// as-is because other code (e.g. the TagSeperator handler type) uses the same
// spelling.
331 static constexpr const char *kTagSeperator =
"hr";
// Tag names for inline text formatting. Where two HTML tags exist for the
// same kind of formatting, the second constant carries a "2" suffix.
334 static constexpr const char *kTagBold =
"b";
335 static constexpr const char *kTagStrong =
"strong";
336 static constexpr const char *kTagItalic =
"em";
337 static constexpr const char *kTagItalic2 =
"i";
338 static constexpr const char *kTagCitation =
"cite";
339 static constexpr const char *kTagDefinition =
"dfn";
340 static constexpr const char *kTagUnderline =
"u";
// NOTE(review): "Strighthrought" is a misspelling of "Strikethrough"; both
// identifiers below are left unchanged because code outside this view may
// reference them.
341 static constexpr const char *kTagStrighthrought =
"del";
342 static constexpr const char *kTagStrighthrought2 =
"s";
// Block quotation tag name.
344 static constexpr const char *kTagBlockquote =
"blockquote";
// Tag names of the six HTML heading levels, h1 through h6.
347 static constexpr const char *kTagHeader1 =
"h1";
348 static constexpr const char *kTagHeader2 =
"h2";
349 static constexpr const char *kTagHeader3 =
"h3";
350 static constexpr const char *kTagHeader4 =
"h4";
351 static constexpr const char *kTagHeader5 =
"h5";
352 static constexpr const char *kTagHeader6 =
"h6";
// Tag names of the HTML table elements: table, row, header cell, data cell.
355 static constexpr const char *kTagTable =
"table";
356 static constexpr const char *kTagTableRow =
"tr";
357 static constexpr const char *kTagTableHeader =
"th";
358 static constexpr const char *kTagTableData =
"td";
// Mutable state of the converter. The code that reads and writes these
// members is not part of this view, so the notes below are hedged.

// Presumably the position of the character currently processed in the HTML
// input -- TODO confirm.
360 size_t index_ch_in_html_ = 0;
// Boolean flags, all starting out false. Judging by their names they record
// which construct is currently being processed -- verify against the code
// that sets them.
362 bool is_closing_tag_ =
false;
363 bool is_in_attribute_value_ =
false;
364 bool is_in_code_ =
false;
365 bool is_in_list_ =
false;
366 bool is_in_p_ =
false;
367 bool is_in_pre_ =
false;
368 bool is_in_table_ =
false;
369 bool is_in_table_row_ =
false;
370 bool is_in_tag_ =
false;
371 bool is_self_closing_tag_ =
false;
374 bool is_in_ordered_list_ =
false;
// NOTE(review): index_ol, table_start, index_li, index_blockquote and
// tableLine lack the trailing underscore used by the other members here.
// The uint8_t counters can hold at most 255.
375 uint8_t index_ol = 0;
378 size_t table_start = 0;
381 uint8_t index_li = 0;
383 uint8_t index_blockquote = 0;
// prev_ch_in_md_ is the value shortIfPrevCh() compares against; presumably
// these are the last and second-to-last characters written to the Markdown.
385 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
// Starts out as 'x' rather than 0; the reason is not visible here --
// TODO confirm.
386 char prev_ch_in_html_ =
'x';
// 16-bit value, so it cannot exceed 65535. Meaning not visible here.
390 uint16_t offset_lt_ = 0;
391 std::string current_tag_;
392 std::string prev_tag_;
395 std::string tableLine;
// Presumably the number of characters in the current output line -- TODO
// confirm against the line-breaking code.
397 size_t chars_in_curr_line_ = 0;
// Table of HTML symbol conversions: each key is an HTML symbol (a character
// entity such as "&lt;") and each value is its replacement text. Entries are
// added through `htmlSymbolConversions_[htmlSymbol] = replacement` and removed
// through `htmlSymbolConversions_.erase(htmlSymbol)` elsewhere in this class.
//
// The keys must be the escaped entity spellings. If they are written as the
// already-decoded characters, the quote entry becomes an unterminated string
// literal and every remaining entry maps a character onto itself.
// "&nbsp;" is replaced by a regular space.
std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
    {"&quot;", "\""}, {"&lt;", "<"},   {"&gt;", ">"},
    {"&amp;", "&"},   {"&nbsp;", " "}, {"&rarr;", "→"}};
// Pure virtual hook that every concrete tag handler must override.
// Receives the Converter as `c`. Judging by the name it runs once the
// converter has moved past a tag's opening part -- confirm at the call site.
409 virtual void OnHasLeftOpeningTag(
Converter *c) = 0;
// Pure virtual hook that every concrete tag handler must override.
// Receives the Converter as `c`. Judging by the name it runs once the
// converter has moved past a tag's closing part -- confirm at the call site.
410 virtual void OnHasLeftClosingTag(
Converter *c) = 0;
416 struct TagIgnored : Tag {
417 void OnHasLeftOpeningTag(
Converter *c)
override {};
418 void OnHasLeftClosingTag(
Converter *c)
override {};
421 struct TagAnchor : Tag {
422 void OnHasLeftOpeningTag(
Converter *c)
override;
423 void OnHasLeftClosingTag(
Converter *c)
override;
425 std::string current_href_;
426 std::string current_title_;
429 struct TagBold : Tag {
430 void OnHasLeftOpeningTag(
Converter *c)
override;
431 void OnHasLeftClosingTag(
Converter *c)
override;
434 struct TagItalic : Tag {
435 void OnHasLeftOpeningTag(
Converter *c)
override;
436 void OnHasLeftClosingTag(
Converter *c)
override;
439 struct TagUnderline : Tag {
440 void OnHasLeftOpeningTag(
Converter *c)
override;
441 void OnHasLeftClosingTag(
Converter *c)
override;
444 struct TagStrikethrought : Tag {
445 void OnHasLeftOpeningTag(
Converter *c)
override;
446 void OnHasLeftClosingTag(
Converter *c)
override;
449 struct TagBreak : Tag {
450 void OnHasLeftOpeningTag(
Converter *c)
override;
451 void OnHasLeftClosingTag(
Converter *c)
override;
454 struct TagDiv : Tag {
455 void OnHasLeftOpeningTag(
Converter *c)
override;
456 void OnHasLeftClosingTag(
Converter *c)
override;
459 struct TagHeader1 : Tag {
460 void OnHasLeftOpeningTag(
Converter *c)
override;
461 void OnHasLeftClosingTag(
Converter *c)
override;
464 struct TagHeader2 : Tag {
465 void OnHasLeftOpeningTag(
Converter *c)
override;
466 void OnHasLeftClosingTag(
Converter *c)
override;
469 struct TagHeader3 : Tag {
470 void OnHasLeftOpeningTag(
Converter *c)
override;
471 void OnHasLeftClosingTag(
Converter *c)
override;
474 struct TagHeader4 : Tag {
475 void OnHasLeftOpeningTag(
Converter *c)
override;
476 void OnHasLeftClosingTag(
Converter *c)
override;
479 struct TagHeader5 : Tag {
480 void OnHasLeftOpeningTag(
Converter *c)
override;
481 void OnHasLeftClosingTag(
Converter *c)
override;
484 struct TagHeader6 : Tag {
485 void OnHasLeftOpeningTag(
Converter *c)
override;
486 void OnHasLeftClosingTag(
Converter *c)
override;
489 struct TagListItem : Tag {
490 void OnHasLeftOpeningTag(
Converter *c)
override;
491 void OnHasLeftClosingTag(
Converter *c)
override;
494 struct TagOption : Tag {
495 void OnHasLeftOpeningTag(
Converter *c)
override;
496 void OnHasLeftClosingTag(
Converter *c)
override;
499 struct TagOrderedList : Tag {
500 void OnHasLeftOpeningTag(
Converter *c)
override;
501 void OnHasLeftClosingTag(
Converter *c)
override;
504 struct TagParagraph : Tag {
505 void OnHasLeftOpeningTag(
Converter *c)
override;
506 void OnHasLeftClosingTag(
Converter *c)
override;
509 struct TagPre : Tag {
510 void OnHasLeftOpeningTag(
Converter *c)
override;
511 void OnHasLeftClosingTag(
Converter *c)
override;
514 struct TagCode : Tag {
515 void OnHasLeftOpeningTag(
Converter *c)
override;
516 void OnHasLeftClosingTag(
Converter *c)
override;
519 struct TagSpan : Tag {
520 void OnHasLeftOpeningTag(
Converter *c)
override;
521 void OnHasLeftClosingTag(
Converter *c)
override;
524 struct TagTitle : Tag {
525 void OnHasLeftOpeningTag(
Converter *c)
override;
526 void OnHasLeftClosingTag(
Converter *c)
override;
529 struct TagUnorderedList : Tag {
530 void OnHasLeftOpeningTag(
Converter *c)
override;
531 void OnHasLeftClosingTag(
Converter *c)
override;
534 struct TagImage : Tag {
535 void OnHasLeftOpeningTag(
Converter *c)
override;
536 void OnHasLeftClosingTag(
Converter *c)
override;
539 struct TagSeperator : Tag {
540 void OnHasLeftOpeningTag(
Converter *c)
override;
541 void OnHasLeftClosingTag(
Converter *c)
override;
544 struct TagTable : Tag {
545 void OnHasLeftOpeningTag(
Converter *c)
override;
546 void OnHasLeftClosingTag(
Converter *c)
override;
549 struct TagTableRow : Tag {
550 void OnHasLeftOpeningTag(
Converter *c)
override;
551 void OnHasLeftClosingTag(
Converter *c)
override;
554 struct TagTableHeader : Tag {
555 void OnHasLeftOpeningTag(
Converter *c)
override;
556 void OnHasLeftClosingTag(
Converter *c)
override;
559 struct TagTableData : Tag {
560 void OnHasLeftOpeningTag(
Converter *c)
override;
561 void OnHasLeftClosingTag(
Converter *c)
override;
564 struct TagBlockquote : Tag {
565 void OnHasLeftOpeningTag(
Converter *c)
override;
566 void OnHasLeftClosingTag(
Converter *c)
override;
// Registry of tag handlers: string key -> shared Tag instance.
// Presumably keyed by the HTML tag name (the kTag... constants); the code
// that fills it is not visible here -- TODO confirm.
569 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
// Constructor overload taking the HTML by pointer; unlike the reference
// overload, `options` has no default argument here.
571 explicit Converter(
const std::string *html,
struct Options *options);
// Helper declarations; the definitions live outside this view, so the notes
// below state only what the signatures show.
573 void CleanUpMarkdown();
// Static: operates solely on the string passed in via `s`.
576 static void LTrim(std::string *s);
// Returns a Converter pointer; trim_only_blank defaults to false.
579 Converter *RTrim(std::string *s,
bool trim_only_blank =
false);
586 void TidyAllLines(std::string *str);
// Takes an attribute name (e.g. one of the kAttribute... constants) and
// returns a std::string; presumably the attribute's value -- TODO confirm.
588 std::string ExtractAttributeFromTagLeftOf(
const std::string &attr);
590 void TurnLineIntoHeader1();
592 void TurnLineIntoHeader2();
595 void OnHasEnteredTag();
// Consumes one character; the meaning of the bool result is not visible here.
605 bool ParseCharInTag(
char ch);
610 inline static bool TagContainsAttributesToHide(std::string *tag) {
613 return (*tag).find(
" aria=\"hidden\"") != string::npos ||
614 (*tag).find(
"display:none") != string::npos ||
615 (*tag).find(
"visibility:hidden") != string::npos ||
616 (*tag).find(
"opacity:0") != string::npos ||
617 (*tag).find(
"Details-content--hidden-not-important") != string::npos;
// `chars` defaults to 1. Presumably removes that many trailing characters
// from the generated Markdown -- TODO confirm against the definition.
// Returns a Converter pointer.
620 Converter *ShortenMarkdown(
size_t chars = 1);
621 inline bool shortIfPrevCh(
char prev) {
622 if (prev_ch_in_md_ == prev) {
// Consumes one character; by its name the counterpart of ParseCharInTag()
// for text between tags. The meaning of the bool result is not visible here.
633 bool ParseCharInTagContent(
char ch);
// The meaning of the bool result is not visible here; presumably it reports
// whether a replacement took place -- TODO confirm.
636 bool ReplacePreviousSpaceInLineByNewline();
638 static inline bool IsIgnoredTag(
const std::string &tag) {
639 return (tag[0] ==
'-' || kTagTemplate == tag || kTagStyle == tag ||
640 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
// Const query whose result must not be discarded. Presumably reports whether
// the converter is currently inside a tag for which IsIgnoredTag() is true --
// TODO confirm against the definition.
645 [[nodiscard]]
bool IsInIgnoredTag()
const;
654inline std::string
Convert(
const std::string &html,
bool *ok =
nullptr) {
662#ifndef PYTHON_BINDINGS
663inline std::string
Convert(
const std::string &&html,
bool *ok =
nullptr) {
Class for converting HTML to Markdown.
std::string convert()
Convert HTML into Markdown.
void addHtmlSymbolConversion(const std::string &htmlSymbol, const std::string &replacement)
Add an HTML symbol conversion.
Converter * appendToMd(char ch)
Append a char to the Markdown.
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly (in the HTML).
void removeHtmlSymbolConversion(const std::string &htmlSymbol)
Remove an HTML symbol conversion.
bool operator==(const Converter &c) const
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
void reset()
Reset the generated Markdown.
void clearHtmlSymbolConversions()
Clear all HTML symbol conversions.
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
std::string Convert(const std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Options for the conversion from HTML to Markdown.
bool operator==(html2md::Options o) const
int softBreak
softBreak: Wrap after ... characters when the next space is reached and as long as it's not in a list,...
bool compressWhitespace
Whether to compress whitespace (tabs, multiple spaces) into a single space.
int hardBreak
hardBreak: Force a break after ... characters in a line.
char orderedList
The char used after the number of the item.
bool formatTable
Whether to format Markdown tables.
bool splitLines
Add new line when a certain number of characters is reached.
char unorderedList
The char used for unordered lists.
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.
bool forceLeftTrim
Whether to force left trim of lines in the final Markdown output.