html2md  v1.7.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.cpp
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#include "html2md.h"
5#include "table.h"
6
7#include <algorithm>
8#include <cstring>
9#include <memory>
10#include <sstream>
11#include <vector>
12
13using std::make_shared;
14using std::string;
15using std::vector;
16
17namespace {
// Returns true when `str` begins with `prefix`. An empty prefix always
// matches.
bool startsWith(const string &str, const string &prefix) {
  if (prefix.size() > str.size())
    return false;

  return str.compare(0, prefix.size(), prefix) == 0;
}
22
// Returns true when `str` finishes with `suffix`. An empty suffix always
// matches.
bool endsWith(const string &str, const string &suffix) {
  if (suffix.size() > str.size())
    return false;

  const size_t tail_start = str.size() - suffix.size();
  return str.compare(tail_start, suffix.size(), suffix) == 0;
}
27
// Replaces every non-overlapping occurrence of `needle` in `*haystack` with
// `replacement`, scanning left to right. Text inserted by a replacement is
// never rescanned.
//
// Returns the number of replacements performed. An empty `needle` matches
// nothing: the string is left untouched and 0 is returned.
size_t ReplaceAll(string *haystack, const string &needle,
                  const string &replacement) {
  // string::find("") succeeds at every position, so without this guard an
  // empty needle would keep inserting `replacement` forever.
  if (needle.empty())
    return 0;

  size_t amount_replaced = 0;

  for (size_t pos = haystack->find(needle); pos != string::npos;
       // Resume the search right after the text that was just inserted
       pos = haystack->find(needle, pos + replacement.size())) {
    haystack->replace(pos, needle.size(), replacement);
    ++amount_replaced;
  }

  return amount_replaced;
}
48
49size_t ReplaceAll(string *haystack, const string &needle, const char c) {
50 return ReplaceAll(haystack, needle, string({c}));
51}
52
// Breaks `str` into the pieces separated by `delimiter`. Pieces are read with
// getline, so empty pieces between adjacent delimiters are kept, while an
// empty input (or the empty piece after a trailing delimiter) produces no
// entry.
vector<string> Split(string const &str, char delimiter) {
  std::stringstream stream(str);
  vector<string> pieces;
  string piece;

  while (getline(stream, piece, delimiter))
    pieces.push_back(piece);

  return pieces;
}
63
// Returns `str` concatenated with itself `amount` times (an empty string when
// `amount` is zero).
string Repeat(const string &str, size_t amount) {
  string out;

  while (amount-- > 0)
    out += str;

  return out;
}
77
// Returns a copy of `str` with every character passed through tolower().
//
// Each byte is converted to unsigned char first: the <cctype> functions have
// undefined behaviour for negative arguments other than EOF, which is what a
// plain (signed) char holding a non-ASCII byte would produce.
string toLower(const string &str) {
  string lower;
  lower.reserve(str.size());
  for (const char ch : str) {
    lower += static_cast<char>(tolower(static_cast<unsigned char>(ch)));
  }
  return lower;
}
86
87} // namespace
88
89namespace html2md {
90
91Converter::Converter(const string *html, Options *options) : html_(*html) {
92 if (options)
93 option = *options;
94
95 md_.reserve(html->size() * 0.8);
96 tags_.reserve(41);
97
98 // non-printing tags
99 auto tagIgnored = make_shared<Converter::TagIgnored>();
100 tags_[kTagHead] = tagIgnored;
101 tags_[kTagMeta] = tagIgnored;
102 tags_[kTagNav] = tagIgnored;
103 tags_[kTagNoScript] = tagIgnored;
104 tags_[kTagScript] = tagIgnored;
105 tags_[kTagStyle] = tagIgnored;
106 tags_[kTagTemplate] = tagIgnored;
107
108 // printing tags
109 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
110 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
111 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
112 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
113 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
114 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
115 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
116 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
117 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
118 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
119 tags_[kTagOption] = make_shared<Converter::TagOption>();
120 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
121 tags_[kTagPre] = make_shared<Converter::TagPre>();
122 tags_[kTagCode] = make_shared<Converter::TagCode>();
123 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
124 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
125 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
126 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
127 tags_[kTagImg] = make_shared<Converter::TagImage>();
128 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
129
130 // Text formatting
131 auto tagBold = make_shared<Converter::TagBold>();
132 tags_[kTagBold] = tagBold;
133 tags_[kTagStrong] = tagBold;
134
135 auto tagItalic = make_shared<Converter::TagItalic>();
136 tags_[kTagItalic] = tagItalic;
137 tags_[kTagItalic2] = tagItalic;
138 tags_[kTagDefinition] = tagItalic;
139 tags_[kTagCitation] = tagItalic;
140
141 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
142
143 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
144 tags_[kTagStrighthrought] = tagStrighthrought;
145 tags_[kTagStrighthrought2] = tagStrighthrought;
146
147 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
148
149 // Tables
150 tags_[kTagTable] = make_shared<Converter::TagTable>();
151 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
152 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
153 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
154}
155
156void Converter::CleanUpMarkdown() {
157 TidyAllLines(&md_);
158 std::string buffer;
159 buffer.reserve(md_.size());
160
161 // Replace HTML symbols during the initial pass
162 for (size_t i = 0; i < md_.size();) {
163 bool replaced = false;
164
165 // C++11 compatible iteration over htmlSymbolConversions_
166 for (const auto &symbol_replacement : htmlSymbolConversions_) {
167 const std::string &symbol = symbol_replacement.first;
168 const std::string &replacement = symbol_replacement.second;
169
170 if (md_.compare(i, symbol.size(), symbol) == 0) {
171 buffer.append(replacement);
172 i += symbol.size();
173 replaced = true;
174 break;
175 }
176 }
177
178 if (!replaced) {
179 buffer.push_back(md_[i++]);
180 }
181 }
182
183 // Use swap instead of move assignment for better pre-C++11 compatibility
184 md_.swap(buffer);
185
186 // Optimized replacement sequence
187 const char *replacements[][2] = {
188 {" , ", ", "}, {"\n.\n", ".\n"}, {"\n↵\n", " ↵\n"}, {"\n*\n", "\n"},
189 {"\n. ", ".\n"}, {"\t\t ", "\t\t"},
190 };
191
192 for (const auto &replacement : replacements) {
193 ReplaceAll(&md_, replacement[0], replacement[1]);
194 }
195}
196
198 if (IsInIgnoredTag())
199 return this;
200
201 if (index_blockquote != 0 && ch == '\n') {
202 if (is_in_pre_) {
203 md_ += ch;
204 chars_in_curr_line_ = 0;
205 appendToMd(Repeat("> ", index_blockquote));
206 }
207
208 return this;
209 }
210
211 md_ += ch;
212
213 if (ch == '\n')
214 chars_in_curr_line_ = 0;
215 else
216 ++chars_in_curr_line_;
217
218 return this;
219}
220
222 if (IsInIgnoredTag())
223 return this;
224
225 md_ += str;
226
227 auto str_len = strlen(str);
228
229 for (auto i = 0; i < str_len; ++i) {
230 if (str[i] == '\n')
231 chars_in_curr_line_ = 0;
232 else
233 ++chars_in_curr_line_;
234 }
235
236 return this;
237}
238
240 UpdatePrevChFromMd();
241
242 if (prev_ch_in_md_ == '\n' ||
243 (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*'))
244 return this;
245
246 return appendToMd(' ');
247}
248
249bool Converter::ok() const {
250 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
251 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
252}
253
254void Converter::LTrim(string *s) {
255 (*s).erase((*s).begin(),
256 find_if((*s).begin(), (*s).end(),
257 [](unsigned char ch) { return !std::isspace(ch); }));
258}
259
260Converter *Converter::RTrim(string *s, bool trim_only_blank) {
261 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
262 [trim_only_blank](unsigned char ch) {
263 if (trim_only_blank)
264 return !isblank(ch);
265
266 return !isspace(ch);
267 })
268 .base(),
269 (*s).end());
270
271 return this;
272}
273
274// NOTE: Pay attention when changing one of the trim functions. It can break the
275// output!
276Converter *Converter::Trim(string *s) {
277 if (!startsWith(*s, "\t"))
278 LTrim(s);
279
280 if (!(startsWith(*s, " "), endsWith(*s, " ")))
281 RTrim(s);
282
283 return this;
284}
285
286void Converter::TidyAllLines(string *str) {
287 auto lines = Split(*str, '\n');
288 string res;
289
290 uint8_t amount_newlines = 0;
291 bool in_code_block = false;
292
293 for (auto line : lines) {
294 if (startsWith(line, "```") || startsWith(line, "~~~"))
295 in_code_block = !in_code_block;
296 if (in_code_block) {
297 res += line + '\n';
298 continue;
299 }
300
301 Trim(&line);
302
303 if (line.empty()) {
304 if (amount_newlines < 2 && !res.empty()) {
305 res += '\n';
306 amount_newlines++;
307 }
308 } else {
309 amount_newlines = 0;
310
311 res += line + '\n';
312 }
313 }
314
315 *str = res;
316}
317
318string Converter::ExtractAttributeFromTagLeftOf(const string &attr) {
319 // Extract the whole tag from current offset, e.g. from '>', backwards
320 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
321 string lowerTag = toLower(tag); // Convert tag to lowercase for comparison
322
323 // locate given attribute (case-insensitive)
324 auto offset_attr = lowerTag.find(attr);
325
326 if (offset_attr == string::npos)
327 return "";
328
329 // locate attribute-value pair's '='
330 auto offset_equals = tag.find('=', offset_attr);
331
332 if (offset_equals == string::npos)
333 return "";
334
335 // locate value's surrounding quotes
336 auto offset_double_quote = tag.find('"', offset_equals);
337 auto offset_single_quote = tag.find('\'', offset_equals);
338
339 bool has_double_quote = offset_double_quote != string::npos;
340 bool has_single_quote = offset_single_quote != string::npos;
341
342 if (!has_double_quote && !has_single_quote)
343 return "";
344
345 char wrapping_quote = 0;
346
347 size_t offset_opening_quote = 0;
348 size_t offset_closing_quote = 0;
349
350 if (has_double_quote) {
351 if (!has_single_quote) {
352 wrapping_quote = '"';
353 offset_opening_quote = offset_double_quote;
354 } else {
355 if (offset_double_quote < offset_single_quote) {
356 wrapping_quote = '"';
357 offset_opening_quote = offset_double_quote;
358 } else {
359 wrapping_quote = '\'';
360 offset_opening_quote = offset_single_quote;
361 }
362 }
363 } else {
364 // has only single quote
365 wrapping_quote = '\'';
366 offset_opening_quote = offset_single_quote;
367 }
368
369 if (offset_opening_quote == string::npos)
370 return "";
371
372 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
373
374 if (offset_closing_quote == string::npos)
375 return "";
376
377 return tag.substr(offset_opening_quote + 1,
378 offset_closing_quote - 1 - offset_opening_quote);
379}
380
381void Converter::TurnLineIntoHeader1() {
382 appendToMd('\n' + Repeat("=", chars_in_curr_line_) + "\n\n");
383
384 chars_in_curr_line_ = 0;
385}
386
387void Converter::TurnLineIntoHeader2() {
388 appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n");
389
390 chars_in_curr_line_ = 0;
391}
392
394 // We already converted
395 if (index_ch_in_html_ == html_.size())
396 return md_;
397
398 reset();
399
400 for (char ch : html_) {
401 ++index_ch_in_html_;
402
403 if (!is_in_tag_ && ch == '<') {
404 OnHasEnteredTag();
405
406 continue;
407 }
408
409 if (is_in_tag_)
410 ParseCharInTag(ch);
411 else
412 ParseCharInTagContent(ch);
413 }
414
415 CleanUpMarkdown();
416
417 return md_;
418}
419
420void Converter::OnHasEnteredTag() {
421 offset_lt_ = index_ch_in_html_;
422 is_in_tag_ = true;
423 is_closing_tag_ = false;
424 prev_tag_ = current_tag_;
425 current_tag_ = "";
426
427 if (!md_.empty()) {
428 UpdatePrevChFromMd();
429 }
430}
431
432Converter *Converter::UpdatePrevChFromMd() {
433 if (!md_.empty()) {
434 prev_ch_in_md_ = md_[md_.length() - 1];
435
436 if (md_.length() > 1)
437 prev_prev_ch_in_md_ = md_[md_.length() - 2];
438 }
439
440 return this;
441}
442
443bool Converter::ParseCharInTag(char ch) {
444 static bool skipping_leading_whitespace = true;
445
446 if (ch == '/' && !is_in_attribute_value_) {
447 is_closing_tag_ = current_tag_.empty();
448 is_self_closing_tag_ = !is_closing_tag_;
449 skipping_leading_whitespace = true; // Reset for next tag
450 return true;
451 }
452
453 if (ch == '>') {
454 // Trim trailing whitespace by removing characters from current_tag_
455 while (!current_tag_.empty() && std::isspace(current_tag_.back())) {
456 current_tag_.pop_back();
457 }
458 skipping_leading_whitespace = true; // Reset for next tag
459 if (!is_self_closing_tag_)
460 return OnHasLeftTag();
461 else {
462 OnHasLeftTag();
463 is_self_closing_tag_ = false;
464 is_closing_tag_ = true;
465 return OnHasLeftTag();
466 }
467 }
468
469 if (ch == '"') {
470 if (is_in_attribute_value_) {
471 is_in_attribute_value_ = false;
472 } else {
473 size_t pos = current_tag_.length();
474 while (pos > 0 && isspace(current_tag_[pos - 1])) {
475 pos--;
476 }
477 if (pos > 0 && current_tag_[pos - 1] == '=') {
478 is_in_attribute_value_ = true;
479 }
480 }
481 skipping_leading_whitespace = false; // Stop skipping after attribute
482 return true;
483 }
484
485 // Handle whitespace: skip leading whitespace, keep others
486 if (isspace(ch) && skipping_leading_whitespace) {
487 return true; // Ignore leading whitespace
488 }
489
490 // Once we encounter a non-whitespace character, stop skipping
491 skipping_leading_whitespace = false;
492 current_tag_ += tolower(ch);
493 return false;
494}
495
496bool Converter::OnHasLeftTag() {
497 is_in_tag_ = false;
498
499 UpdatePrevChFromMd();
500
501 if (!is_closing_tag_)
502 if (TagContainsAttributesToHide(&current_tag_))
503 return true;
504
505 auto cut_tags = Split(current_tag_, ' ');
506 if (cut_tags.empty())
507 return true;
508
509 current_tag_ = cut_tags[0];
510
511 auto tag = tags_[current_tag_];
512
513 if (!tag)
514 return true;
515
516 if (!is_closing_tag_) {
517 tag->OnHasLeftOpeningTag(this);
518 }
519 else {
520 is_closing_tag_ = false;
521
522 tag->OnHasLeftClosingTag(this);
523 }
524
525 return true;
526}
527
528Converter *Converter::ShortenMarkdown(size_t chars) {
529 md_ = md_.substr(0, md_.length() - chars);
530
531 if (chars > chars_in_curr_line_)
532 chars_in_curr_line_ = 0;
533 else
534 chars_in_curr_line_ = chars_in_curr_line_ - chars;
535
536 return this->UpdatePrevChFromMd();
537}
538
539bool Converter::ParseCharInTagContent(char ch) {
540 if (is_in_code_) {
541 md_ += ch;
542
543 if (index_blockquote != 0 && ch == '\n')
544 appendToMd(Repeat("> ", index_blockquote));
545
546 return true;
547 }
548
549 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
550 prev_ch_in_html_ = ch;
551
552 return true;
553 }
554
555 if (ch == '\n') {
556 if (index_blockquote != 0) {
557 md_ += '\n';
558 chars_in_curr_line_ = 0;
559 appendToMd(Repeat("> ", index_blockquote));
560 }
561
562 return true;
563 }
564
565 switch (ch) {
566 case '*':
567 appendToMd("\\*");
568 break;
569 case '`':
570 appendToMd("\\`");
571 break;
572 case '\\':
573 appendToMd("\\\\");
574 break;
575 default:
576 md_ += ch;
577 ++chars_in_curr_line_;
578 break;
579 }
580
581 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
582 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
583 option.splitLines) {
584 if (ch == ' ') { // If the next char is - it will become a list
585 md_ += '\n';
586 chars_in_curr_line_ = 0;
587 } else if (chars_in_curr_line_ > option.hardBreak) {
588 ReplacePreviousSpaceInLineByNewline();
589 }
590 }
591
592 return false;
593}
594
595bool Converter::ReplacePreviousSpaceInLineByNewline() {
596 if (current_tag_ == kTagParagraph ||
597 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
598 return false;
599
600 auto offset = md_.length() - 1;
601
602 if (md_.length() == 0)
603 return true;
604
605 do {
606 if (md_[offset] == '\n')
607 return false;
608
609 if (md_[offset] == ' ') {
610 md_[offset] = '\n';
611 chars_in_curr_line_ = md_.length() - offset;
612
613 return true;
614 }
615
616 --offset;
617 } while (offset > 0);
618
619 return false;
620}
621
622void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
623 if (c->prev_tag_ == kTagImg)
624 c->appendToMd('\n');
625
626 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
627
628 c->appendToMd('[');
629 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
630}
631
632void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
633 if (!c->shortIfPrevCh('[')) {
634 c->appendToMd("](")->appendToMd(current_href_);
635
636 // If title is set append it
637 if (!current_title_.empty()) {
638 c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"');
639 current_title_.clear();
640 }
641
642 c->appendToMd(')');
643
644 if (c->prev_tag_ == kTagImg)
645 c->appendToMd('\n');
646 }
647}
648
649void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
650 c->appendToMd("**");
651}
652
653void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
654 c->appendToMd("**");
655}
656
657void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
658 c->appendToMd('*');
659}
660
661void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
662 c->appendToMd('*');
663}
664
665void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
666 c->appendToMd("<u>");
667}
668
669void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
670 c->appendToMd("</u>");
671}
672
673void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
674 c->appendToMd('~');
675}
676
677void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
678 c->appendToMd('~');
679}
680
681void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
682 if (c->is_in_list_) { // When it's in a list, it's not in a paragraph
683 c->appendToMd(" \n");
684 c->appendToMd(Repeat(" ", c->index_li));
685 } else if (c->is_in_table_) {
686 c->appendToMd("<br>");
687 } else if (!c->md_.empty())
688 c->appendToMd(" \n");
689}
690
691void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
692
693void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
694 if (c->prev_ch_in_md_ != '\n')
695 c->appendToMd('\n');
696
697 if (c->prev_prev_ch_in_md_ != '\n')
698 c->appendToMd('\n');
699}
700
701void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
702
703void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
704 c->appendToMd("\n# ");
705}
706
707void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
708 if (c->prev_prev_ch_in_md_ != ' ')
709 c->appendToMd('\n');
710}
711
712void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
713 c->appendToMd("\n## ");
714}
715
716void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
717 if (c->prev_prev_ch_in_md_ != ' ')
718 c->appendToMd('\n');
719}
720
721void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
722 c->appendToMd("\n### ");
723}
724
725void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
726 if (c->prev_prev_ch_in_md_ != ' ')
727 c->appendToMd('\n');
728}
729
730void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
731 c->appendToMd("\n#### ");
732}
733
734void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
735 if (c->prev_prev_ch_in_md_ != ' ')
736 c->appendToMd('\n');
737}
738
739void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
740 c->appendToMd("\n##### ");
741}
742
743void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
744 if (c->prev_prev_ch_in_md_ != ' ')
745 c->appendToMd('\n');
746}
747
748void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
749 c->appendToMd("\n###### ");
750}
751
752void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
753 if (c->prev_prev_ch_in_md_ != ' ')
754 c->appendToMd('\n');
755}
756
757void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
758 if (c->is_in_table_)
759 return;
760
761 if (!c->is_in_ordered_list_) {
762 c->appendToMd(string({c->option.unorderedList, ' '}));
763 return;
764 }
765
766 ++c->index_ol;
767
768 string num = std::to_string(c->index_ol);
769 num.append({c->option.orderedList, ' '});
770 c->appendToMd(num);
771}
772
773void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
774 if (c->is_in_table_)
775 return;
776
777 if (c->prev_ch_in_md_ != '\n')
778 c->appendToMd('\n');
779}
780
781void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
782
783void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
784 if (c->md_.length() > 0)
785 c->appendToMd(" \n");
786}
787
788void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
789 if (c->is_in_table_)
790 return;
791
792 c->is_in_list_ = true;
793 c->is_in_ordered_list_ = true;
794 c->index_ol = 0;
795
796 ++c->index_li;
797
798 c->ReplacePreviousSpaceInLineByNewline();
799
800 c->appendToMd('\n');
801}
802
803void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
804 if (c->is_in_table_)
805 return;
806
807 c->is_in_ordered_list_ = false;
808
809 if (c->index_li != 0)
810 --c->index_li;
811
812 c->is_in_list_ = c->index_li != 0;
813
814 c->appendToMd('\n');
815}
816
817void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
818 c->is_in_p_ = true;
819
820 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
821 c->appendToMd("\n\t");
822 else if (!c->is_in_list_)
823 c->appendToMd('\n');
824}
825
826void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
827 c->is_in_p_ = false;
828
829 if (!c->md_.empty())
830 c->appendToMd("\n"); // Workaround \n restriction for blockquotes
831
832 if (c->index_blockquote != 0)
833 c->appendToMd(Repeat("> ", c->index_blockquote));
834}
835
836void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
837 c->is_in_pre_ = true;
838
839 if (c->prev_ch_in_md_ != '\n')
840 c->appendToMd('\n');
841
842 if (c->prev_prev_ch_in_md_ != '\n')
843 c->appendToMd('\n');
844
845 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
846 c->ShortenMarkdown(2);
847
848 if (c->is_in_list_)
849 c->appendToMd("\t\t");
850 else
851 c->appendToMd("```");
852}
853
854void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
855 c->is_in_pre_ = false;
856
857 if (c->is_in_list_)
858 return;
859
860 c->appendToMd("```");
861 c->appendToMd('\n'); // Don't combine because of blockquote
862}
863
864void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
865 c->is_in_code_ = true;
866
867 if (c->is_in_pre_) {
868 if (c->is_in_list_)
869 return;
870
871 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
872 if (!code.empty()) {
873 if (startsWith(code, "language-"))
874 code.erase(0, 9); // remove language-
875 c->appendToMd(code);
876 }
877 c->appendToMd('\n');
878 } else
879 c->appendToMd('`');
880}
881
882void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
883 c->is_in_code_ = false;
884
885 if (c->is_in_pre_)
886 return;
887
888 c->appendToMd('`');
889}
890
891void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
892
893void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
894
895void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
896
897void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
898 c->TurnLineIntoHeader1();
899}
900
901void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
902 if (c->is_in_list_ || c->is_in_table_)
903 return;
904
905 c->is_in_list_ = true;
906
907 ++c->index_li;
908
909 c->appendToMd('\n');
910}
911
912void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
913 if (c->is_in_table_)
914 return;
915
916 if (c->index_li != 0)
917 --c->index_li;
918
919 c->is_in_list_ = c->index_li != 0;
920
921 if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n')
922 c->ShortenMarkdown();
923 else if (c->prev_ch_in_md_ != '\n')
924 c->appendToMd('\n');
925}
926
927void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
928 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n')
929 c->appendToMd('\n');
930
931 c->appendToMd("![")
932 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
933 ->appendToMd("](")
934 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
935
936 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
937 if (!title.empty()) {
938 c->appendToMd(" \"")->appendToMd(title)->appendToMd('"');
939 }
940
941 c->appendToMd(")");
942}
943
944void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
945 if (c->prev_tag_ == kTagAnchor)
946 c->appendToMd('\n');
947}
948
949void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
950 c->appendToMd("\n---\n"); // NOTE: We can make this an option
951}
952
953void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
954
955void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
956 c->is_in_table_ = true;
957 c->appendToMd('\n');
958 c->table_start = c->md_.length();
959}
960
961void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
962 c->is_in_table_ = false;
963 c->appendToMd('\n');
964
965 if (!c->option.formatTable)
966 return;
967
968 string table = c->md_.substr(c->table_start);
969 table = formatMarkdownTable(table);
970 c->ShortenMarkdown(c->md_.size() - c->table_start);
971 c->appendToMd(table);
972}
973
974void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
975 c->appendToMd('\n');
976}
977
978void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
979 c->UpdatePrevChFromMd();
980 if (c->prev_ch_in_md_ == '|')
981 c->appendToMd('\n'); // There's a bug
982 else
983 c->appendToMd('|');
984
985 if (!c->tableLine.empty()) {
986 if (c->prev_ch_in_md_ != '\n')
987 c->appendToMd('\n');
988
989 c->tableLine.append("|\n");
990 c->appendToMd(c->tableLine);
991 c->tableLine.clear();
992 }
993}
994
995void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
996 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
997
998 string line = "| ";
999
1000 if (align == "left" || align == "center")
1001 line += ':';
1002
1003 line += '-';
1004
1005 if (align == "right" || align == "center")
1006 line += ": ";
1007 else
1008 line += ' ';
1009
1010 c->tableLine.append(line);
1011
1012 c->appendToMd("| ");
1013}
1014
1015void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
1016
1017void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
1018 if (c->prev_prev_ch_in_md_ != '|')
1019 c->appendToMd("| ");
1020}
1021
1022void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
1023
1024void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
1025 ++c->index_blockquote;
1026 c->appendToMd("\n");
1027 c->appendToMd(Repeat("> ", c->index_blockquote));
1028}
1029
1030void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
1031 --c->index_blockquote;
1032 // Only shorten if a "> " was added (i.e., a newline was processed in the blockquote)
1033 if (!c->md_.empty() && c->md_.length() >= 2 &&
1034 c->md_.substr(c->md_.length() - 2) == "> ") {
1035 c->ShortenMarkdown(2); // Remove the '> ' only if it exists
1036 }
1037}
1038
1040 md_.clear();
1041 prev_ch_in_md_ = 0;
1042 prev_prev_ch_in_md_ = 0;
1043 index_ch_in_html_ = 0;
1044}
1045
1046bool Converter::IsInIgnoredTag() const {
1047 if (current_tag_ == kTagTitle && !option.includeTitle)
1048 return true;
1049
1050 return IsIgnoredTag(current_tag_);
1051}
1052} // namespace html2md
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:393
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:197
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:239
bool ok() const
Checks whether everything was closed properly (in the HTML).
Definition html2md.cpp:249
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:173
void reset()
Reset the generated Markdown.
Definition html2md.cpp:1039
html2md namespace
Definition html2md.h:21
Options for the conversion from HTML to Markdown.
Definition html2md.h:38
std::string formatMarkdownTable(const std::string &inputTable)