13using std::make_shared;
/// Checks whether `str` begins with `prefix`.
///
/// @param str    String to inspect.
/// @param prefix Candidate prefix; an empty prefix always matches.
/// @return true when the first `prefix.size()` characters of `str` equal
///         `prefix`, false otherwise (including when `str` is shorter).
bool startsWith(const std::string &str, const std::string &prefix) {
  // Length check first: a string shorter than the prefix can never match.
  return str.size() >= prefix.size() &&
         str.compare(0, prefix.size(), prefix) == 0;
}
/// Checks whether `str` ends with `suffix`.
///
/// @param str    String to inspect.
/// @param suffix Candidate suffix; an empty suffix always matches.
/// @return true when the last `suffix.size()` characters of `str` equal
///         `suffix`, false otherwise (including when `str` is shorter).
bool endsWith(const std::string &str, const std::string &suffix) {
  // The length check must come first: it keeps the start offset
  // `str.size() - suffix.size()` from wrapping around below zero.
  return str.size() >= suffix.size() &&
         str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
/// Replaces every occurrence of `needle` in `*haystack` with `replacement`.
///
/// The string is scanned left to right. After each replacement the search
/// resumes directly behind the inserted text, so the replacement itself is
/// never rescanned (replacing "a" by "aa" terminates).
///
/// @param haystack    String to modify in place; must not be null.
/// @param needle      Text to search for. An empty needle replaces nothing.
/// @param replacement Text inserted in place of each occurrence.
/// @return Number of replacements performed.
size_t ReplaceAll(std::string *haystack, const std::string &needle,
                  const std::string &replacement) {
  // An empty needle is found at every position, so the loop below would
  // never terminate. Treat it as "nothing to replace".
  if (needle.empty())
    return 0;

  size_t amount_replaced = 0;

  for (size_t pos = haystack->find(needle); pos != std::string::npos;
       pos = haystack->find(needle, pos + replacement.size())) {
    haystack->replace(pos, needle.size(), replacement);
    ++amount_replaced;
  }

  return amount_replaced;
}

/// Convenience overload: replaces every occurrence of `needle` in
/// `*haystack` with the single character `c`.
///
/// @return Number of replacements performed.
size_t ReplaceAll(std::string *haystack, const std::string &needle,
                  const char c) {
  return ReplaceAll(haystack, needle, std::string(1, c));
}
/// Splits `str` at every occurrence of `delimiter`.
///
/// Tokens are extracted with std::getline, so consecutive delimiters yield
/// empty tokens, a trailing delimiter does not add a final empty token, and
/// an empty input yields an empty vector.
///
/// @param str       Text to split.
/// @param delimiter Separator character; it is not included in the tokens.
/// @return The tokens in their original order.
std::vector<std::string> Split(std::string const &str, char delimiter) {
  std::vector<std::string> result;
  std::istringstream iss(str);

  std::string token;
  while (std::getline(iss, token, delimiter))
    result.push_back(token);

  return result;
}
64string Repeat(
const string &str,
size_t amount) {
72 for (
size_t i = 0; i < amount; ++i)
78string toLower(
const string &str) {
80 lower.reserve(str.size());
95 md_.reserve(html->size() * 0.8);
99 auto tagIgnored = make_shared<Converter::TagIgnored>();
100 tags_[kTagHead] = tagIgnored;
101 tags_[kTagMeta] = tagIgnored;
102 tags_[kTagNav] = tagIgnored;
103 tags_[kTagNoScript] = tagIgnored;
104 tags_[kTagScript] = tagIgnored;
105 tags_[kTagStyle] = tagIgnored;
106 tags_[kTagTemplate] = tagIgnored;
109 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
110 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
111 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
112 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
113 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
114 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
115 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
116 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
117 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
118 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
119 tags_[kTagOption] = make_shared<Converter::TagOption>();
120 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
121 tags_[kTagPre] = make_shared<Converter::TagPre>();
122 tags_[kTagCode] = make_shared<Converter::TagCode>();
123 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
124 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
125 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
126 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
127 tags_[kTagImg] = make_shared<Converter::TagImage>();
128 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
131 auto tagBold = make_shared<Converter::TagBold>();
132 tags_[kTagBold] = tagBold;
133 tags_[kTagStrong] = tagBold;
135 auto tagItalic = make_shared<Converter::TagItalic>();
136 tags_[kTagItalic] = tagItalic;
137 tags_[kTagItalic2] = tagItalic;
138 tags_[kTagDefinition] = tagItalic;
139 tags_[kTagCitation] = tagItalic;
141 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
143 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
144 tags_[kTagStrighthrought] = tagStrighthrought;
145 tags_[kTagStrighthrought2] = tagStrighthrought;
147 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
150 tags_[kTagTable] = make_shared<Converter::TagTable>();
151 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
152 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
153 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
156void Converter::CleanUpMarkdown() {
159 buffer.reserve(md_.size());
162 for (
size_t i = 0; i < md_.size();) {
163 bool replaced =
false;
166 for (
const auto &symbol_replacement : htmlSymbolConversions_) {
167 const std::string &symbol = symbol_replacement.first;
168 const std::string &replacement = symbol_replacement.second;
170 if (md_.compare(i, symbol.size(), symbol) == 0) {
171 buffer.append(replacement);
179 buffer.push_back(md_[i++]);
187 const char *replacements[][2] = {
188 {
" , ",
", "}, {
"\n.\n",
".\n"}, {
"\n↵\n",
" ↵\n"}, {
"\n*\n",
"\n"},
189 {
"\n. ",
".\n"}, {
"\t\t ",
"\t\t"},
192 for (
const auto &replacement : replacements) {
193 ReplaceAll(&md_, replacement[0], replacement[1]);
198 if (IsInIgnoredTag())
201 if (index_blockquote != 0 && ch ==
'\n') {
204 chars_in_curr_line_ = 0;
214 chars_in_curr_line_ = 0;
216 ++chars_in_curr_line_;
222 if (IsInIgnoredTag())
227 auto str_len = strlen(str);
229 for (
auto i = 0; i < str_len; ++i) {
231 chars_in_curr_line_ = 0;
233 ++chars_in_curr_line_;
240 UpdatePrevChFromMd();
242 if (prev_ch_in_md_ ==
'\n' ||
243 (prev_ch_in_md_ ==
'*' && prev_prev_ch_in_md_ ==
'*'))
250 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
251 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
// Removes leading whitespace from *s in place: everything from the start of
// the string up to (not including) the first character for which
// std::isspace() is false gets erased. A string made only of whitespace
// becomes empty, because find_if then returns end().
254void Converter::LTrim(
string *s) {
255 (*s).erase((*s).begin(),
256 find_if((*s).begin(), (*s).end(),
// The lambda takes unsigned char so std::isspace never sees a negative value.
257 [](
unsigned char ch) { return !std::isspace(ch); }));
260Converter *Converter::RTrim(
string *s,
bool trim_only_blank) {
261 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
262 [trim_only_blank](
unsigned char ch) {
276Converter *Converter::Trim(
string *s) {
277 if (!startsWith(*s,
"\t"))
280 if (!(startsWith(*s,
" "), endsWith(*s,
" ")))
286void Converter::TidyAllLines(
string *str) {
287 auto lines = Split(*str,
'\n');
290 uint8_t amount_newlines = 0;
291 bool in_code_block =
false;
293 for (
auto line : lines) {
294 if (startsWith(line,
"```") || startsWith(line,
"~~~"))
295 in_code_block = !in_code_block;
304 if (amount_newlines < 2 && !res.empty()) {
318string Converter::ExtractAttributeFromTagLeftOf(
const string &attr) {
320 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
321 string lowerTag = toLower(tag);
324 auto offset_attr = lowerTag.find(attr);
326 if (offset_attr == string::npos)
330 auto offset_equals = tag.find(
'=', offset_attr);
332 if (offset_equals == string::npos)
336 auto offset_double_quote = tag.find(
'"', offset_equals);
337 auto offset_single_quote = tag.find(
'\'', offset_equals);
339 bool has_double_quote = offset_double_quote != string::npos;
340 bool has_single_quote = offset_single_quote != string::npos;
342 if (!has_double_quote && !has_single_quote)
345 char wrapping_quote = 0;
347 size_t offset_opening_quote = 0;
348 size_t offset_closing_quote = 0;
350 if (has_double_quote) {
351 if (!has_single_quote) {
352 wrapping_quote =
'"';
353 offset_opening_quote = offset_double_quote;
355 if (offset_double_quote < offset_single_quote) {
356 wrapping_quote =
'"';
357 offset_opening_quote = offset_double_quote;
359 wrapping_quote =
'\'';
360 offset_opening_quote = offset_single_quote;
365 wrapping_quote =
'\'';
366 offset_opening_quote = offset_single_quote;
369 if (offset_opening_quote == string::npos)
372 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
374 if (offset_closing_quote == string::npos)
377 return tag.substr(offset_opening_quote + 1,
378 offset_closing_quote - 1 - offset_opening_quote);
// Appends, via appendToMd, a newline followed by
// Repeat("=", chars_in_curr_line_) and "\n\n", then resets the per-line
// character counter.
// NOTE(review): Repeat's body is not visible here; presumably it yields "="
// repeated chars_in_curr_line_ times (an underline as wide as the current
// line) — confirm.
381void Converter::TurnLineIntoHeader1() {
382 appendToMd(
'\n' + Repeat(
"=", chars_in_curr_line_) +
"\n\n");
// The appended text ends with newlines, so counting starts over.
384 chars_in_curr_line_ = 0;
// Appends, via appendToMd, a newline followed by
// Repeat("-", chars_in_curr_line_) and "\n\n", then resets the per-line
// character counter.
// NOTE(review): Repeat's body is not visible here; presumably it yields "-"
// repeated chars_in_curr_line_ times (an underline as wide as the current
// line) — confirm.
387void Converter::TurnLineIntoHeader2() {
388 appendToMd(
'\n' + Repeat(
"-", chars_in_curr_line_) +
"\n\n");
// The appended text ends with newlines, so counting starts over.
390 chars_in_curr_line_ = 0;
395 if (index_ch_in_html_ == html_.size())
400 for (
char ch : html_) {
403 if (!is_in_tag_ && ch ==
'<') {
412 ParseCharInTagContent(ch);
420void Converter::OnHasEnteredTag() {
421 offset_lt_ = index_ch_in_html_;
423 is_closing_tag_ =
false;
424 prev_tag_ = current_tag_;
428 UpdatePrevChFromMd();
432Converter *Converter::UpdatePrevChFromMd() {
434 prev_ch_in_md_ = md_[md_.length() - 1];
436 if (md_.length() > 1)
437 prev_prev_ch_in_md_ = md_[md_.length() - 2];
443bool Converter::ParseCharInTag(
char ch) {
444 static bool skipping_leading_whitespace =
true;
446 if (ch ==
'/' && !is_in_attribute_value_) {
447 is_closing_tag_ = current_tag_.empty();
448 is_self_closing_tag_ = !is_closing_tag_;
449 skipping_leading_whitespace =
true;
455 while (!current_tag_.empty() && std::isspace(current_tag_.back())) {
456 current_tag_.pop_back();
458 skipping_leading_whitespace =
true;
459 if (!is_self_closing_tag_)
460 return OnHasLeftTag();
463 is_self_closing_tag_ =
false;
464 is_closing_tag_ =
true;
465 return OnHasLeftTag();
470 if (is_in_attribute_value_) {
471 is_in_attribute_value_ =
false;
473 size_t pos = current_tag_.length();
474 while (pos > 0 && isspace(current_tag_[pos - 1])) {
477 if (pos > 0 && current_tag_[pos - 1] ==
'=') {
478 is_in_attribute_value_ =
true;
481 skipping_leading_whitespace =
false;
486 if (isspace(ch) && skipping_leading_whitespace) {
491 skipping_leading_whitespace =
false;
492 current_tag_ += tolower(ch);
496bool Converter::OnHasLeftTag() {
499 UpdatePrevChFromMd();
501 if (!is_closing_tag_)
502 if (TagContainsAttributesToHide(¤t_tag_))
505 auto cut_tags = Split(current_tag_,
' ');
506 if (cut_tags.empty())
509 current_tag_ = cut_tags[0];
511 auto tag = tags_[current_tag_];
516 if (!is_closing_tag_) {
517 tag->OnHasLeftOpeningTag(
this);
520 is_closing_tag_ =
false;
522 tag->OnHasLeftClosingTag(
this);
528Converter *Converter::ShortenMarkdown(
size_t chars) {
529 md_ = md_.substr(0, md_.length() - chars);
531 if (chars > chars_in_curr_line_)
532 chars_in_curr_line_ = 0;
534 chars_in_curr_line_ = chars_in_curr_line_ - chars;
536 return this->UpdatePrevChFromMd();
539bool Converter::ParseCharInTagContent(
char ch) {
543 if (index_blockquote != 0 && ch ==
'\n')
544 appendToMd(Repeat(
"> ", index_blockquote));
549 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
550 prev_ch_in_html_ = ch;
556 if (index_blockquote != 0) {
558 chars_in_curr_line_ = 0;
559 appendToMd(Repeat(
"> ", index_blockquote));
577 ++chars_in_curr_line_;
581 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
582 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
586 chars_in_curr_line_ = 0;
587 }
else if (chars_in_curr_line_ > option.hardBreak) {
588 ReplacePreviousSpaceInLineByNewline();
595bool Converter::ReplacePreviousSpaceInLineByNewline() {
596 if (current_tag_ == kTagParagraph ||
597 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
600 auto offset = md_.length() - 1;
602 if (md_.length() == 0)
606 if (md_[offset] ==
'\n')
609 if (md_[offset] ==
' ') {
611 chars_in_curr_line_ = md_.length() - offset;
617 }
while (offset > 0);
622void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
623 if (c->prev_tag_ == kTagImg)
626 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
629 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
632void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
633 if (!c->shortIfPrevCh(
'[')) {
634 c->appendToMd(
"](")->appendToMd(current_href_);
637 if (!current_title_.empty()) {
638 c->appendToMd(
" \"")->appendToMd(current_title_)->appendToMd(
'"');
639 current_title_.clear();
644 if (c->prev_tag_ == kTagImg)
649void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
653void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
657void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
661void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
// Appends the literal HTML "<u>" to the converter's Markdown output via
// appendToMd; the tag is passed through as HTML rather than translated.
665void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
666 c->appendToMd(
"<u>");
// Appends the literal HTML "</u>" to the converter's Markdown output via
// appendToMd, matching the "<u>" emitted by the opening-tag handler.
669void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
670 c->appendToMd(
"</u>");
673void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
677void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
// Emits a line break whose form depends on the converter's current context:
//  - inside a list: appends the line-break literal, then
//    Repeat(<space literal>, index_li)
//    (NOTE(review): presumably indentation matching the list depth — Repeat's
//    body is not visible here, confirm);
//  - inside a table: appends the literal "<br>";
//  - otherwise: appends the line-break literal, but only if md_ already
//    holds some output. With an empty md_ nothing is emitted.
681void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
682 if (c->is_in_list_) {
683 c->appendToMd(
" \n");
684 c->appendToMd(Repeat(
" ", c->index_li));
685 }
else if (c->is_in_table_) {
686 c->appendToMd(
"<br>");
687 }
else if (!c->md_.empty())
688 c->appendToMd(
" \n");
// No-op: the body is empty, so nothing happens when this callback runs.
691void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
693void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
694 if (c->prev_ch_in_md_ !=
'\n')
697 if (c->prev_prev_ch_in_md_ !=
'\n')
// No-op: the body is empty, so nothing happens when this callback runs.
701void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
// Appends a newline followed by "# " (one '#' and a space) to the Markdown
// output via appendToMd, i.e. the prefix of a level-1 ATX heading.
703void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
704 c->appendToMd(
"\n# ");
707void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
708 if (c->prev_prev_ch_in_md_ !=
' ')
// Appends a newline followed by "## " (two '#' and a space) to the Markdown
// output via appendToMd, i.e. the prefix of a level-2 ATX heading.
712void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
713 c->appendToMd(
"\n## ");
716void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
717 if (c->prev_prev_ch_in_md_ !=
' ')
// Appends a newline followed by "### " (three '#' and a space) to the
// Markdown output via appendToMd, i.e. the prefix of a level-3 ATX heading.
721void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
722 c->appendToMd(
"\n### ");
725void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
726 if (c->prev_prev_ch_in_md_ !=
' ')
// Appends a newline followed by "#### " (four '#' and a space) to the
// Markdown output via appendToMd, i.e. the prefix of a level-4 ATX heading.
730void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
731 c->appendToMd(
"\n#### ");
734void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
735 if (c->prev_prev_ch_in_md_ !=
' ')
// Appends a newline followed by "##### " (five '#' and a space) to the
// Markdown output via appendToMd, i.e. the prefix of a level-5 ATX heading.
739void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
740 c->appendToMd(
"\n##### ");
743void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
744 if (c->prev_prev_ch_in_md_ !=
' ')
// Appends a newline followed by "###### " (six '#' and a space) to the
// Markdown output via appendToMd, i.e. the prefix of a level-6 ATX heading.
748void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
749 c->appendToMd(
"\n###### ");
752void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
753 if (c->prev_prev_ch_in_md_ !=
' ')
757void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
761 if (!c->is_in_ordered_list_) {
762 c->appendToMd(
string({c->option.unorderedList,
' '}));
768 string num = std::to_string(c->index_ol);
769 num.append({c->option.orderedList,
' '});
773void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
777 if (c->prev_ch_in_md_ !=
'\n')
// No-op: the body is empty, so nothing happens when this callback runs.
781void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
// Appends the line-break literal below to the Markdown output, but only when
// md_ already contains something. With an empty md_ nothing is emitted.
783void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
784 if (c->md_.length() > 0)
785 c->appendToMd(
" \n");
788void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
792 c->is_in_list_ =
true;
793 c->is_in_ordered_list_ =
true;
798 c->ReplacePreviousSpaceInLineByNewline();
803void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
807 c->is_in_ordered_list_ =
false;
809 if (c->index_li != 0)
812 c->is_in_list_ = c->index_li != 0;
817void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
820 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
821 c->appendToMd(
"\n\t");
822 else if (!c->is_in_list_)
826void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
832 if (c->index_blockquote != 0)
833 c->appendToMd(Repeat(
"> ", c->index_blockquote));
836void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
837 c->is_in_pre_ =
true;
839 if (c->prev_ch_in_md_ !=
'\n')
842 if (c->prev_prev_ch_in_md_ !=
'\n')
845 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
846 c->ShortenMarkdown(2);
849 c->appendToMd(
"\t\t");
851 c->appendToMd(
"```");
854void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
855 c->is_in_pre_ =
false;
860 c->appendToMd(
"```");
864void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
865 c->is_in_code_ =
true;
871 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
873 if (startsWith(code,
"language-"))
882void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
883 c->is_in_code_ =
false;
// No-op: the body is empty, so nothing happens when this callback runs.
891void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
// No-op: the body is empty, so nothing happens when this callback runs.
893void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
// No-op: the body is empty, so nothing happens when this callback runs.
895void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
// Delegates to Converter::TurnLineIntoHeader1() on the given converter, so
// the line written so far is finished as a level-1 heading.
897void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
898 c->TurnLineIntoHeader1();
901void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
902 if (c->is_in_list_ || c->is_in_table_)
905 c->is_in_list_ =
true;
912void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
916 if (c->index_li != 0)
919 c->is_in_list_ = c->index_li != 0;
921 if (c->prev_prev_ch_in_md_ ==
'\n' && c->prev_ch_in_md_ ==
'\n')
922 c->ShortenMarkdown();
923 else if (c->prev_ch_in_md_ !=
'\n')
927void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
928 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ !=
'\n')
932 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
934 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
936 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
937 if (!title.empty()) {
938 c->appendToMd(
" \"")->appendToMd(title)->appendToMd(
'"');
944void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
945 if (c->prev_tag_ == kTagAnchor)
// Appends "\n---\n" to the Markdown output via appendToMd: a "---" line
// (Markdown horizontal rule) with a newline on each side.
949void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
950 c->appendToMd(
"\n---\n");
// No-op: the body is empty, so nothing happens when this callback runs.
953void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
// Marks the converter as being inside a table and records the current length
// of md_ in table_start. The closing-tag handler uses that offset with
// md_.substr(table_start) to pick out the text produced for the table.
955void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
956 c->is_in_table_ =
true;
958 c->table_start = c->md_.length();
961void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
962 c->is_in_table_ =
false;
965 if (!c->option.formatTable)
968 string table = c->md_.substr(c->table_start);
970 c->ShortenMarkdown(c->md_.size() - c->table_start);
971 c->appendToMd(table);
974void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
978void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
979 c->UpdatePrevChFromMd();
980 if (c->prev_ch_in_md_ ==
'|')
985 if (!c->tableLine.empty()) {
986 if (c->prev_ch_in_md_ !=
'\n')
989 c->tableLine.append(
"|\n");
990 c->appendToMd(c->tableLine);
991 c->tableLine.clear();
995void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
996 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
1000 if (align ==
"left" || align ==
"center")
1005 if (align ==
"right" || align ==
"center")
1010 c->tableLine.append(line);
1012 c->appendToMd(
"| ");
// No-op: the body is empty, so nothing happens when this callback runs.
1015void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
// Appends the cell prefix "| " to the Markdown output unless
// prev_prev_ch_in_md_ is already '|'.
// NOTE(review): presumably this avoids a doubled separator when the output
// already ends in "| " — confirm against how prev_prev_ch_in_md_ is kept.
1017void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
1018 if (c->prev_prev_ch_in_md_ !=
'|')
1019 c->appendToMd(
"| ");
// No-op: the body is empty, so nothing happens when this callback runs.
1022void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
// Enters one more blockquote level: increments index_blockquote, appends a
// newline, then appends Repeat("> ", index_blockquote).
// NOTE(review): Repeat's body is not visible here; presumably it yields one
// "> " marker per nesting level — confirm.
1024void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
1025 ++c->index_blockquote;
1026 c->appendToMd(
"\n");
1027 c->appendToMd(Repeat(
"> ", c->index_blockquote));
// Leaves one blockquote level: decrements index_blockquote, then, if the
// Markdown output currently ends with "> ", removes those two characters
// through ShortenMarkdown(2).
1030void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
1031 --c->index_blockquote;
// The length() >= 2 test already implies a non-empty md_; it also keeps the
// offset passed to substr() from wrapping around.
1033 if (!c->md_.empty() && c->md_.length() >= 2 &&
1034 c->md_.substr(c->md_.length() - 2) ==
"> ") {
1035 c->ShortenMarkdown(2);
1042 prev_prev_ch_in_md_ = 0;
1043 index_ch_in_html_ = 0;
1046bool Converter::IsInIgnoredTag()
const {
1047 if (current_tag_ == kTagTitle && !option.includeTitle)
1050 return IsIgnoredTag(current_tag_);
std::string convert()
Convert HTML into Markdown.
Converter * appendToMd(char ch)
Append a char to the Markdown.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly (in the HTML).
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
void reset()
Reset the generated Markdown.
Options for the conversion from HTML to Markdown.
std::string formatMarkdownTable(const std::string &inputTable)