13using std::make_shared;
// Returns true when `str` begins with `prefix`.
//
// An empty `prefix` matches every string. The size check runs first, so a
// prefix longer than `str` is rejected without comparing any characters.
bool startsWith(const std::string &str, const std::string &prefix) {
  return str.size() >= prefix.size() &&
         str.compare(0, prefix.size(), prefix) == 0;
}
// Returns true when `str` ends with `suffix`.
//
// An empty `suffix` matches every string. The size check runs first, which
// also keeps `str.size() - suffix.size()` from wrapping around.
bool endsWith(const std::string &str, const std::string &suffix) {
  return str.size() >= suffix.size() &&
         str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
28size_t ReplaceAll(
string *haystack,
const string &needle,
29 const string &replacement) {
31 size_t pos = (*haystack).find(needle);
33 size_t amount_replaced = 0;
36 while (pos != string::npos) {
38 (*haystack).replace(pos, needle.size(), replacement);
41 pos = (*haystack).find(needle, pos + replacement.size());
46 return amount_replaced;
49size_t ReplaceAll(
string *haystack,
const string &needle,
const char c) {
50 return ReplaceAll(haystack, needle,
string({c}));
54vector<string> Split(
string const &str,
char delimiter) {
55 vector<string> result;
56 std::stringstream iss(str);
58 for (
string token; getline(iss, token, delimiter);)
59 result.push_back(token);
64string Repeat(
const string &str,
size_t amount) {
72 for (
size_t i = 0; i < amount; ++i)
88 auto tagIgnored = make_shared<Converter::TagIgnored>();
89 tags_[kTagHead] = tagIgnored;
90 tags_[kTagMeta] = tagIgnored;
91 tags_[kTagNav] = tagIgnored;
92 tags_[kTagNoScript] = tagIgnored;
93 tags_[kTagScript] = tagIgnored;
94 tags_[kTagStyle] = tagIgnored;
95 tags_[kTagTemplate] = tagIgnored;
98 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
99 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
100 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
101 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
102 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
103 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
104 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
105 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
106 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
107 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
108 tags_[kTagOption] = make_shared<Converter::TagOption>();
109 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
110 tags_[kTagPre] = make_shared<Converter::TagPre>();
111 tags_[kTagCode] = make_shared<Converter::TagCode>();
112 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
113 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
114 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
115 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
116 tags_[kTagImg] = make_shared<Converter::TagImage>();
117 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
120 auto tagBold = make_shared<Converter::TagBold>();
121 tags_[kTagBold] = tagBold;
122 tags_[kTagStrong] = tagBold;
124 auto tagItalic = make_shared<Converter::TagItalic>();
125 tags_[kTagItalic] = tagItalic;
126 tags_[kTagItalic2] = tagItalic;
127 tags_[kTagDefinition] = tagItalic;
128 tags_[kTagCitation] = tagItalic;
130 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
132 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
133 tags_[kTagStrighthrought] = tagStrighthrought;
134 tags_[kTagStrighthrought2] = tagStrighthrought;
136 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
139 tags_[kTagTable] = make_shared<Converter::TagTable>();
140 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
141 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
142 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
145void Converter::CleanUpMarkdown() {
148 ReplaceAll(&md_,
" , ",
", ");
150 ReplaceAll(&md_,
"\n.\n",
".\n");
151 ReplaceAll(&md_,
"\n↵\n",
" ↵\n");
152 ReplaceAll(&md_,
"\n*\n",
"\n");
153 ReplaceAll(&md_,
"\n. ",
".\n");
155 ReplaceAll(&md_,
""",
'"');
156 ReplaceAll(&md_,
"<",
"<");
157 ReplaceAll(&md_,
">",
">");
158 ReplaceAll(&html_,
"&",
'&');
159 ReplaceAll(&html_,
" ",
' ');
160 ReplaceAll(&html_,
"→",
"→");
162 ReplaceAll(&md_,
"\t\t ",
"\t\t");
166 if (IsInIgnoredTag())
169 if (index_blockquote != 0 && ch ==
'\n') {
172 chars_in_curr_line_ = 0;
173 appendToMd(Repeat(
"> ", index_blockquote));
182 chars_in_curr_line_ = 0;
184 ++chars_in_curr_line_;
191 if (IsInIgnoredTag())
196 auto str_len = strlen(str);
198 for (
auto i = 0; i < str_len; ++i) {
200 chars_in_curr_line_ = 0;
202 ++chars_in_curr_line_;
209 UpdatePrevChFromMd();
211 if (prev_ch_in_md_ ==
'\n' ||
212 (prev_ch_in_md_ ==
'*' && prev_prev_ch_in_md_ ==
'*'))
215 return appendToMd(
' ');
218bool Converter::ok()
const {
219 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
220 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
223void Converter::LTrim(
string *s) {
224 (*s).erase((*s).begin(),
225 find_if((*s).begin(), (*s).end(),
226 [](
unsigned char ch) { return !std::isspace(ch); }));
229Converter *Converter::RTrim(
string *s,
bool trim_only_blank) {
230 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
231 [trim_only_blank](
unsigned char ch) {
245Converter *Converter::Trim(
string *s) {
246 if (!startsWith(*s,
"\t"))
249 if (!(startsWith(*s,
" "), endsWith(*s,
" ")))
255void Converter::TidyAllLines(
string *str) {
256 auto lines = Split(*str,
'\n');
259 uint8_t amount_newlines = 0;
260 bool in_code_block =
false;
262 for (
auto line : lines) {
263 if (startsWith(line,
"```") || startsWith(line,
"~~~"))
264 in_code_block = !in_code_block;
273 if (amount_newlines < 2 && !res.empty()) {
287string Converter::ExtractAttributeFromTagLeftOf(
const string &attr) {
289 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
292 auto offset_attr = tag.find(attr);
294 if (offset_attr == string::npos)
298 auto offset_equals = tag.find(
'=', offset_attr);
300 if (offset_equals == string::npos)
304 auto offset_double_quote = tag.find(
'"', offset_equals);
305 auto offset_single_quote = tag.find(
'\'', offset_equals);
307 bool has_double_quote = offset_double_quote != string::npos;
308 bool has_single_quote = offset_single_quote != string::npos;
310 if (!has_double_quote && !has_single_quote)
313 char wrapping_quote = 0;
315 size_t offset_opening_quote = 0;
316 size_t offset_closing_quote = 0;
318 if (has_double_quote) {
319 if (!has_single_quote) {
320 wrapping_quote =
'"';
321 offset_opening_quote = offset_double_quote;
323 if (offset_double_quote < offset_single_quote) {
324 wrapping_quote =
'"';
325 offset_opening_quote = offset_double_quote;
327 wrapping_quote =
'\'';
328 offset_opening_quote = offset_single_quote;
333 wrapping_quote =
'\'';
334 offset_opening_quote = offset_single_quote;
337 if (offset_opening_quote == string::npos)
340 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
342 if (offset_closing_quote == string::npos)
345 return tag.substr(offset_opening_quote + 1,
346 offset_closing_quote - 1 - offset_opening_quote);
349void Converter::TurnLineIntoHeader1() {
350 appendToMd(
'\n' + Repeat(
"=", chars_in_curr_line_) +
"\n\n");
352 chars_in_curr_line_ = 0;
355void Converter::TurnLineIntoHeader2() {
356 appendToMd(
'\n' + Repeat(
"-", chars_in_curr_line_) +
"\n\n");
358 chars_in_curr_line_ = 0;
361string Converter::convert() {
363 if (index_ch_in_html_ == html_.size())
368 for (
char ch : html_) {
371 if (!is_in_tag_ && ch ==
'<') {
380 ParseCharInTagContent(ch);
388void Converter::OnHasEnteredTag() {
389 offset_lt_ = index_ch_in_html_;
391 prev_tag_ = current_tag_;
395 UpdatePrevChFromMd();
399Converter *Converter::UpdatePrevChFromMd() {
401 prev_ch_in_md_ = md_[md_.length() - 1];
403 if (md_.length() > 1)
404 prev_prev_ch_in_md_ = md_[md_.length() - 2];
410bool Converter::ParseCharInTag(
char ch) {
411 if (ch ==
'/' && !is_in_attribute_value_) {
412 is_closing_tag_ = current_tag_.empty();
413 is_self_closing_tag_ = !is_closing_tag_;
419 return OnHasLeftTag();
422 if (is_in_attribute_value_) {
423 is_in_attribute_value_ =
false;
424 }
else if (current_tag_[current_tag_.length() - 1] ==
'=') {
425 is_in_attribute_value_ =
true;
436bool Converter::OnHasLeftTag() {
439 UpdatePrevChFromMd();
441 if (!is_closing_tag_)
442 if (TagContainsAttributesToHide(¤t_tag_))
445 current_tag_ = Split(current_tag_,
' ')[0];
447 auto tag = tags_[current_tag_];
452 if (!is_closing_tag_) {
453 tag->OnHasLeftOpeningTag(
this);
455 if (is_closing_tag_ || is_self_closing_tag_) {
456 is_closing_tag_ =
false;
458 tag->OnHasLeftClosingTag(
this);
464Converter *Converter::ShortenMarkdown(
size_t chars) {
465 md_ = md_.substr(0, md_.length() - chars);
467 if (chars > chars_in_curr_line_)
468 chars_in_curr_line_ = 0;
470 chars_in_curr_line_ = chars_in_curr_line_ - chars;
472 return this->UpdatePrevChFromMd();
475bool Converter::ParseCharInTagContent(
char ch) {
479 if (index_blockquote != 0 && ch ==
'\n')
480 appendToMd(Repeat(
"> ", index_blockquote));
485 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
486 prev_ch_in_html_ = ch;
492 if (index_blockquote != 0) {
494 chars_in_curr_line_ = 0;
495 appendToMd(Repeat(
"> ", index_blockquote));
513 ++chars_in_curr_line_;
517 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
518 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
522 chars_in_curr_line_ = 0;
523 }
else if (chars_in_curr_line_ > option.hardBreak) {
524 ReplacePreviousSpaceInLineByNewline();
531bool Converter::ReplacePreviousSpaceInLineByNewline() {
532 if (current_tag_ == kTagParagraph ||
533 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
536 auto offset = md_.length() - 1;
538 if (md_.length() == 0)
542 if (md_[offset] ==
'\n')
545 if (md_[offset] ==
' ') {
547 chars_in_curr_line_ = md_.length() - offset;
553 }
while (offset > 0);
558void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
559 if (c->prev_tag_ == kTagImg)
562 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
565 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
568void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
569 if (!c->shortIfPrevCh(
'[')) {
570 c->appendToMd(
"](")->appendToMd(current_href_);
573 if (!current_title_.empty()) {
574 c->appendToMd(
" \"")->appendToMd(current_title_)->appendToMd(
'"');
575 current_title_.clear();
580 if (c->prev_tag_ == kTagImg)
585void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
589void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
593void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
597void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
601void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
602 c->appendToMd(
"<u>");
605void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
606 c->appendToMd(
"</u>");
609void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
613void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
617void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
618 if (c->is_in_list_) {
619 c->appendToMd(
" \n");
620 c->appendToMd(Repeat(
" ", c->index_li));
621 }
else if (c->is_in_table_) {
622 c->appendToMd(
"<br>");
623 }
else if (!c->is_in_p_) {
624 c->appendToMd(
"\n<br>\n\n");
625 }
else if (c->md_.length() > 0)
626 c->appendToMd(
" \n");
629void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
631void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
632 if (c->prev_ch_in_md_ !=
'\n')
635 if (c->prev_prev_ch_in_md_ !=
'\n')
639void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
641void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
642 c->appendToMd(
"\n# ");
645void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
646 if (c->prev_prev_ch_in_md_ !=
' ')
650void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
651 c->appendToMd(
"\n## ");
654void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
655 if (c->prev_prev_ch_in_md_ !=
' ')
659void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
660 c->appendToMd(
"\n### ");
663void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
664 if (c->prev_prev_ch_in_md_ !=
' ')
668void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
669 c->appendToMd(
"\n#### ");
672void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
673 if (c->prev_prev_ch_in_md_ !=
' ')
677void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
678 c->appendToMd(
"\n##### ");
681void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
682 if (c->prev_prev_ch_in_md_ !=
' ')
686void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
687 c->appendToMd(
"\n###### ");
690void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
691 if (c->prev_prev_ch_in_md_ !=
' ')
695void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
699 if (!c->is_in_ordered_list_) {
700 c->appendToMd(
string({c->option.unorderedList,
' '}));
706 string num = std::to_string(c->index_ol);
707 num.append({c->option.orderedList,
' '});
711void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
715 if (c->prev_ch_in_md_ !=
'\n')
719void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
721void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
722 if (c->md_.length() > 0)
723 c->appendToMd(
" \n");
726void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
730 c->is_in_list_ =
true;
731 c->is_in_ordered_list_ =
true;
736 c->ReplacePreviousSpaceInLineByNewline();
741void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
745 c->is_in_ordered_list_ =
false;
747 if (c->index_li != 0)
750 c->is_in_list_ = c->index_li != 0;
755void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
758 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
759 c->appendToMd(
"\n\t");
760 else if (!c->is_in_list_)
764void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
770 if (c->index_blockquote != 0)
771 c->appendToMd(Repeat(
"> ", c->index_blockquote));
774void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
775 c->is_in_pre_ =
true;
777 if (c->prev_ch_in_md_ !=
'\n')
780 if (c->prev_prev_ch_in_md_ !=
'\n')
783 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
784 c->ShortenMarkdown(2);
787 c->appendToMd(
"\t\t");
789 c->appendToMd(
"```");
792void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
793 c->is_in_pre_ =
false;
798 c->appendToMd(
"```");
802void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
803 c->is_in_code_ =
true;
809 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
811 if (startsWith(code,
"language-"))
820void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
821 c->is_in_code_ =
false;
829void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
831void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
833void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
835void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
836 c->TurnLineIntoHeader1();
839void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
840 if (c->is_in_list_ || c->is_in_table_)
843 c->is_in_list_ =
true;
850void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
854 if (c->index_li != 0)
857 c->is_in_list_ = c->index_li != 0;
859 if (c->prev_prev_ch_in_md_ ==
'\n' && c->prev_ch_in_md_ ==
'\n')
860 c->ShortenMarkdown();
861 else if (c->prev_ch_in_md_ !=
'\n')
865void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
866 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ !=
'\n')
870 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
872 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
874 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
875 if (!title.empty()) {
876 c->appendToMd(
" \"")->appendToMd(title)->appendToMd(
'"');
882void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
883 if (c->prev_tag_ == kTagAnchor)
887void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
888 c->appendToMd(
"\n---\n");
891void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
893void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
894 c->is_in_table_ =
true;
896 c->table_start = c->md_.length();
899void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
900 c->is_in_table_ =
false;
903 if (!c->option.formatTable)
906 string table = c->md_.substr(c->table_start);
908 c->ShortenMarkdown(c->md_.size() - c->table_start);
909 c->appendToMd(table);
912void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
916void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
917 c->UpdatePrevChFromMd();
918 if (c->prev_ch_in_md_ ==
'|')
923 if (!c->tableLine.empty()) {
924 if (c->prev_ch_in_md_ !=
'\n')
927 c->tableLine.append(
"|\n");
928 c->appendToMd(c->tableLine);
929 c->tableLine.clear();
933void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
934 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
938 if (align ==
"left" || align ==
"center")
943 if (align ==
"right" || align ==
"center")
948 c->tableLine.append(line);
953void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
955void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
956 if (c->prev_prev_ch_in_md_ !=
'|')
960void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
962void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
963 ++c->index_blockquote;
966void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
967 --c->index_blockquote;
968 c->ShortenMarkdown(2);
971void Converter::reset() {
974 prev_prev_ch_in_md_ = 0;
975 index_ch_in_html_ = 0;
978bool Converter::IsInIgnoredTag()
const {
979 if (current_tag_ == kTagTitle && !option.includeTitle)
982 return IsIgnoredTag(current_tag_);
Class for converting HTML to Markdown.
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
std::string formatMarkdownTable(const std::string &inputTable)