13using std::make_shared;
18bool startsWith(
const string &str,
const string &prefix) {
19 return str.size() >= prefix.size() &&
20 0 == str.compare(0, prefix.size(), prefix);
23bool endsWith(
const string &str,
const string &suffix) {
24 return str.size() >= suffix.size() &&
25 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
28size_t ReplaceAll(
string *haystack,
const string &needle,
29 const string &replacement) {
31 size_t pos = (*haystack).find(needle);
33 size_t amount_replaced = 0;
36 while (pos != string::npos) {
38 (*haystack).replace(pos, needle.size(), replacement);
41 pos = (*haystack).find(needle, pos + replacement.size());
46 return amount_replaced;
49size_t ReplaceAll(
string *haystack,
const string &needle,
const char c) {
50 return ReplaceAll(haystack, needle,
string({c}));
54vector<string> Split(
string const &str,
char delimiter) {
55 vector<string> result;
56 std::stringstream iss(str);
58 for (
string token; getline(iss, token, delimiter);)
59 result.push_back(token);
64string Repeat(
const string &str,
size_t amount) {
72 for (
size_t i = 0; i < amount; ++i)
88 auto tagIgnored = make_shared<Converter::TagIgnored>();
89 tags_[kTagHead] = tagIgnored;
90 tags_[kTagMeta] = tagIgnored;
91 tags_[kTagNav] = tagIgnored;
92 tags_[kTagNoScript] = tagIgnored;
93 tags_[kTagScript] = tagIgnored;
94 tags_[kTagStyle] = tagIgnored;
95 tags_[kTagTemplate] = tagIgnored;
98 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
99 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
100 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
101 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
102 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
103 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
104 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
105 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
106 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
107 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
108 tags_[kTagOption] = make_shared<Converter::TagOption>();
109 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
110 tags_[kTagPre] = make_shared<Converter::TagPre>();
111 tags_[kTagCode] = make_shared<Converter::TagCode>();
112 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
113 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
114 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
115 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
116 tags_[kTagImg] = make_shared<Converter::TagImage>();
117 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
120 auto tagBold = make_shared<Converter::TagBold>();
121 tags_[kTagBold] = tagBold;
122 tags_[kTagStrong] = tagBold;
124 auto tagItalic = make_shared<Converter::TagItalic>();
125 tags_[kTagItalic] = tagItalic;
126 tags_[kTagItalic2] = tagItalic;
127 tags_[kTagDefinition] = tagItalic;
128 tags_[kTagCitation] = tagItalic;
130 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
132 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
133 tags_[kTagStrighthrought] = tagStrighthrought;
134 tags_[kTagStrighthrought2] = tagStrighthrought;
136 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
139 tags_[kTagTable] = make_shared<Converter::TagTable>();
140 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
141 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
142 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
145void Converter::CleanUpMarkdown() {
148 ReplaceAll(&md_,
" , ",
", ");
150 ReplaceAll(&md_,
"\n.\n",
".\n");
151 ReplaceAll(&md_,
"\n↵\n",
" ↵\n");
152 ReplaceAll(&md_,
"\n*\n",
"\n");
153 ReplaceAll(&md_,
"\n. ",
".\n");
155 ReplaceAll(&md_,
""",
'"');
156 ReplaceAll(&md_,
"<",
"<");
157 ReplaceAll(&md_,
">",
">");
158 ReplaceAll(&md_,
"&",
'&');
159 ReplaceAll(&md_,
" ",
' ');
160 ReplaceAll(&md_,
"→",
"→");
162 ReplaceAll(&md_,
"\t\t ",
"\t\t");
166 if (IsInIgnoredTag())
169 if (index_blockquote != 0 && ch ==
'\n') {
172 chars_in_curr_line_ = 0;
182 chars_in_curr_line_ = 0;
184 ++chars_in_curr_line_;
190 if (IsInIgnoredTag())
195 auto str_len = strlen(str);
197 for (
auto i = 0; i < str_len; ++i) {
199 chars_in_curr_line_ = 0;
201 ++chars_in_curr_line_;
208 UpdatePrevChFromMd();
210 if (prev_ch_in_md_ ==
'\n' ||
211 (prev_ch_in_md_ ==
'*' && prev_prev_ch_in_md_ ==
'*'))
218 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
219 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
222void Converter::LTrim(
string *s) {
223 (*s).erase((*s).begin(),
224 find_if((*s).begin(), (*s).end(),
225 [](
unsigned char ch) { return !std::isspace(ch); }));
228Converter *Converter::RTrim(
string *s,
bool trim_only_blank) {
229 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
230 [trim_only_blank](
unsigned char ch) {
244Converter *Converter::Trim(
string *s) {
245 if (!startsWith(*s,
"\t"))
248 if (!(startsWith(*s,
" "), endsWith(*s,
" ")))
254void Converter::TidyAllLines(
string *str) {
255 auto lines = Split(*str,
'\n');
258 uint8_t amount_newlines = 0;
259 bool in_code_block =
false;
261 for (
auto line : lines) {
262 if (startsWith(line,
"```") || startsWith(line,
"~~~"))
263 in_code_block = !in_code_block;
272 if (amount_newlines < 2 && !res.empty()) {
286string Converter::ExtractAttributeFromTagLeftOf(
const string &attr) {
288 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
291 auto offset_attr = tag.find(attr);
293 if (offset_attr == string::npos)
297 auto offset_equals = tag.find(
'=', offset_attr);
299 if (offset_equals == string::npos)
303 auto offset_double_quote = tag.find(
'"', offset_equals);
304 auto offset_single_quote = tag.find(
'\'', offset_equals);
306 bool has_double_quote = offset_double_quote != string::npos;
307 bool has_single_quote = offset_single_quote != string::npos;
309 if (!has_double_quote && !has_single_quote)
312 char wrapping_quote = 0;
314 size_t offset_opening_quote = 0;
315 size_t offset_closing_quote = 0;
317 if (has_double_quote) {
318 if (!has_single_quote) {
319 wrapping_quote =
'"';
320 offset_opening_quote = offset_double_quote;
322 if (offset_double_quote < offset_single_quote) {
323 wrapping_quote =
'"';
324 offset_opening_quote = offset_double_quote;
326 wrapping_quote =
'\'';
327 offset_opening_quote = offset_single_quote;
332 wrapping_quote =
'\'';
333 offset_opening_quote = offset_single_quote;
336 if (offset_opening_quote == string::npos)
339 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
341 if (offset_closing_quote == string::npos)
344 return tag.substr(offset_opening_quote + 1,
345 offset_closing_quote - 1 - offset_opening_quote);
348void Converter::TurnLineIntoHeader1() {
349 appendToMd(
'\n' + Repeat(
"=", chars_in_curr_line_) +
"\n\n");
351 chars_in_curr_line_ = 0;
354void Converter::TurnLineIntoHeader2() {
355 appendToMd(
'\n' + Repeat(
"-", chars_in_curr_line_) +
"\n\n");
357 chars_in_curr_line_ = 0;
362 if (index_ch_in_html_ == html_.size())
367 for (
char ch : html_) {
370 if (!is_in_tag_ && ch ==
'<') {
379 ParseCharInTagContent(ch);
387void Converter::OnHasEnteredTag() {
388 offset_lt_ = index_ch_in_html_;
390 prev_tag_ = current_tag_;
394 UpdatePrevChFromMd();
398Converter *Converter::UpdatePrevChFromMd() {
400 prev_ch_in_md_ = md_[md_.length() - 1];
402 if (md_.length() > 1)
403 prev_prev_ch_in_md_ = md_[md_.length() - 2];
409bool Converter::ParseCharInTag(
char ch) {
410 if (ch ==
'/' && !is_in_attribute_value_) {
411 is_closing_tag_ = current_tag_.empty();
412 is_self_closing_tag_ = !is_closing_tag_;
418 return OnHasLeftTag();
421 if (is_in_attribute_value_) {
422 is_in_attribute_value_ =
false;
423 }
else if (current_tag_[current_tag_.length() - 1] ==
'=') {
424 is_in_attribute_value_ =
true;
435bool Converter::OnHasLeftTag() {
438 UpdatePrevChFromMd();
440 if (!is_closing_tag_)
441 if (TagContainsAttributesToHide(¤t_tag_))
444 auto cut_tags = Split(current_tag_,
' ');
445 if (cut_tags.empty())
448 current_tag_ = cut_tags[0];
450 auto tag = tags_[current_tag_];
455 if (!is_closing_tag_) {
456 tag->OnHasLeftOpeningTag(
this);
458 if (is_closing_tag_ || is_self_closing_tag_) {
459 is_closing_tag_ =
false;
461 tag->OnHasLeftClosingTag(
this);
467Converter *Converter::ShortenMarkdown(
size_t chars) {
468 md_ = md_.substr(0, md_.length() - chars);
470 if (chars > chars_in_curr_line_)
471 chars_in_curr_line_ = 0;
473 chars_in_curr_line_ = chars_in_curr_line_ - chars;
475 return this->UpdatePrevChFromMd();
478bool Converter::ParseCharInTagContent(
char ch) {
482 if (index_blockquote != 0 && ch ==
'\n')
483 appendToMd(Repeat(
"> ", index_blockquote));
488 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
489 prev_ch_in_html_ = ch;
495 if (index_blockquote != 0) {
497 chars_in_curr_line_ = 0;
498 appendToMd(Repeat(
"> ", index_blockquote));
516 ++chars_in_curr_line_;
520 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
521 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
525 chars_in_curr_line_ = 0;
526 }
else if (chars_in_curr_line_ > option.hardBreak) {
527 ReplacePreviousSpaceInLineByNewline();
534bool Converter::ReplacePreviousSpaceInLineByNewline() {
535 if (current_tag_ == kTagParagraph ||
536 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
539 auto offset = md_.length() - 1;
541 if (md_.length() == 0)
545 if (md_[offset] ==
'\n')
548 if (md_[offset] ==
' ') {
550 chars_in_curr_line_ = md_.length() - offset;
556 }
while (offset > 0);
561void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
562 if (c->prev_tag_ == kTagImg)
565 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
568 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
571void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
572 if (!c->shortIfPrevCh(
'[')) {
573 c->appendToMd(
"](")->appendToMd(current_href_);
576 if (!current_title_.empty()) {
577 c->appendToMd(
" \"")->appendToMd(current_title_)->appendToMd(
'"');
578 current_title_.clear();
583 if (c->prev_tag_ == kTagImg)
588void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
592void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
596void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
600void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
604void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
605 c->appendToMd(
"<u>");
608void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
609 c->appendToMd(
"</u>");
612void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
616void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
620void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
621 if (c->is_in_list_) {
622 c->appendToMd(
" \n");
623 c->appendToMd(Repeat(
" ", c->index_li));
624 }
else if (c->is_in_table_) {
625 c->appendToMd(
"<br>");
626 }
else if (!c->md_.empty())
627 c->appendToMd(
" \n");
630void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
632void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
633 if (c->prev_ch_in_md_ !=
'\n')
636 if (c->prev_prev_ch_in_md_ !=
'\n')
640void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
642void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
643 c->appendToMd(
"\n# ");
646void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
647 if (c->prev_prev_ch_in_md_ !=
' ')
651void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
652 c->appendToMd(
"\n## ");
655void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
656 if (c->prev_prev_ch_in_md_ !=
' ')
660void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
661 c->appendToMd(
"\n### ");
664void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
665 if (c->prev_prev_ch_in_md_ !=
' ')
669void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
670 c->appendToMd(
"\n#### ");
673void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
674 if (c->prev_prev_ch_in_md_ !=
' ')
678void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
679 c->appendToMd(
"\n##### ");
682void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
683 if (c->prev_prev_ch_in_md_ !=
' ')
687void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
688 c->appendToMd(
"\n###### ");
691void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
692 if (c->prev_prev_ch_in_md_ !=
' ')
696void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
700 if (!c->is_in_ordered_list_) {
701 c->appendToMd(
string({c->option.unorderedList,
' '}));
707 string num = std::to_string(c->index_ol);
708 num.append({c->option.orderedList,
' '});
712void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
716 if (c->prev_ch_in_md_ !=
'\n')
720void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
722void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
723 if (c->md_.length() > 0)
724 c->appendToMd(
" \n");
727void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
731 c->is_in_list_ =
true;
732 c->is_in_ordered_list_ =
true;
737 c->ReplacePreviousSpaceInLineByNewline();
742void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
746 c->is_in_ordered_list_ =
false;
748 if (c->index_li != 0)
751 c->is_in_list_ = c->index_li != 0;
756void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
759 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
760 c->appendToMd(
"\n\t");
761 else if (!c->is_in_list_)
765void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
771 if (c->index_blockquote != 0)
772 c->appendToMd(Repeat(
"> ", c->index_blockquote));
775void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
776 c->is_in_pre_ =
true;
778 if (c->prev_ch_in_md_ !=
'\n')
781 if (c->prev_prev_ch_in_md_ !=
'\n')
784 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
785 c->ShortenMarkdown(2);
788 c->appendToMd(
"\t\t");
790 c->appendToMd(
"```");
793void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
794 c->is_in_pre_ =
false;
799 c->appendToMd(
"```");
803void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
804 c->is_in_code_ =
true;
810 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
812 if (startsWith(code,
"language-"))
821void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
822 c->is_in_code_ =
false;
830void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
832void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
834void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
836void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
837 c->TurnLineIntoHeader1();
840void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
841 if (c->is_in_list_ || c->is_in_table_)
844 c->is_in_list_ =
true;
851void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
855 if (c->index_li != 0)
858 c->is_in_list_ = c->index_li != 0;
860 if (c->prev_prev_ch_in_md_ ==
'\n' && c->prev_ch_in_md_ ==
'\n')
861 c->ShortenMarkdown();
862 else if (c->prev_ch_in_md_ !=
'\n')
866void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
867 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ !=
'\n')
871 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
873 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
875 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
876 if (!title.empty()) {
877 c->appendToMd(
" \"")->appendToMd(title)->appendToMd(
'"');
883void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
884 if (c->prev_tag_ == kTagAnchor)
888void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
889 c->appendToMd(
"\n---\n");
892void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
894void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
895 c->is_in_table_ =
true;
897 c->table_start = c->md_.length();
900void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
901 c->is_in_table_ =
false;
904 if (!c->option.formatTable)
907 string table = c->md_.substr(c->table_start);
909 c->ShortenMarkdown(c->md_.size() - c->table_start);
910 c->appendToMd(table);
913void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
917void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
918 c->UpdatePrevChFromMd();
919 if (c->prev_ch_in_md_ ==
'|')
924 if (!c->tableLine.empty()) {
925 if (c->prev_ch_in_md_ !=
'\n')
928 c->tableLine.append(
"|\n");
929 c->appendToMd(c->tableLine);
930 c->tableLine.clear();
934void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
935 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
939 if (align ==
"left" || align ==
"center")
944 if (align ==
"right" || align ==
"center")
949 c->tableLine.append(line);
954void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
956void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
957 if (c->prev_prev_ch_in_md_ !=
'|')
961void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
963void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
964 ++c->index_blockquote;
967void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
968 --c->index_blockquote;
969 c->ShortenMarkdown(2);
975 prev_prev_ch_in_md_ = 0;
976 index_ch_in_html_ = 0;
979bool Converter::IsInIgnoredTag()
const {
980 if (current_tag_ == kTagTitle && !option.includeTitle)
983 return IsIgnoredTag(current_tag_);
std::string convert()
Convert HTML into Markdown.
Converter * appendToMd(char ch)
Append a char to the Markdown.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly(in the HTML).
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
void reset()
Reset the generated Markdown.
Options for the conversion from HTML to Markdown.
std::string formatMarkdownTable(const std::string &inputTable)