html2md  v1.8.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.cpp
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#include "html2md.h"
5#include "table.h"
6
#include <algorithm>
#include <cctype>
#include <cstring>
#include <memory>
#include <sstream>
#include <vector>
12
13using std::make_shared;
14using std::string;
15using std::vector;
16
17namespace {
// Returns true when `str` begins with `prefix`. An empty prefix always
// matches; a prefix longer than `str` never does.
bool startsWith(const string &str, const string &prefix) {
  if (prefix.size() > str.size())
    return false;

  return std::equal(prefix.begin(), prefix.end(), str.begin());
}
22
// Returns true when `str` finishes with `suffix`. An empty suffix always
// matches; a suffix longer than `str` never does.
bool endsWith(const string &str, const string &suffix) {
  if (suffix.size() > str.size())
    return false;

  // Compare back to front so no offset arithmetic is needed.
  return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
27
// Replaces every occurrence of `needle` in `*haystack` with `replacement`,
// scanning left to right, and returns how many replacements were made.
//
// The search resumes directly after the inserted text, so a replacement that
// itself contains the needle is never rescanned.
//
// An empty needle matches at every position and the scan would never finish,
// so it is treated as "nothing to replace" and 0 is returned.
size_t ReplaceAll(string *haystack, const string &needle,
                  const string &replacement) {
  if (needle.empty())
    return 0;

  size_t amount_replaced = 0;

  for (size_t pos = haystack->find(needle); pos != string::npos;
       pos = haystack->find(needle, pos + replacement.size())) {
    haystack->replace(pos, needle.size(), replacement);
    ++amount_replaced;
  }

  return amount_replaced;
}
48
49size_t ReplaceAll(string *haystack, const string &needle, const char c) {
50 return ReplaceAll(haystack, needle, string({c}));
51}
52
// Splits `str` at every `delimiter` and returns the pieces in order.
// Follows std::getline semantics: empty fields between two delimiters are
// kept, a delimiter at the very end does not produce a trailing empty piece,
// and an empty input yields an empty vector.
vector<string> Split(string const &str, char delimiter) {
  vector<string> pieces;
  std::stringstream stream(str);
  string piece;

  while (std::getline(stream, piece, delimiter))
    pieces.push_back(piece);

  return pieces;
}
63
// Returns `str` concatenated with itself `amount` times; the result is empty
// when `amount` is 0.
string Repeat(const string &str, size_t amount) {
  string out;

  while (amount-- > 0)
    out += str;

  return out;
}
77
// Returns a copy of `str` with every character passed through std::tolower.
//
// Each character is converted to unsigned char first: the <cctype> functions
// are only defined for values representable as unsigned char (or EOF), so
// handing them a negative plain `char` (any byte >= 0x80 where char is
// signed) is undefined behaviour.
string toLower(const string &str) {
  string lower;
  lower.reserve(str.size());

  for (const char ch : str) {
    lower += static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }

  return lower;
}
86
87} // namespace
88
89namespace html2md {
90
91Converter::Converter(const string *html, Options *options) : html_(*html) {
92 if (options)
93 option = *options;
94
95 md_.reserve(html->size() * 0.8);
96 tags_.reserve(41);
97
98 // non-printing tags
99 auto tagIgnored = make_shared<Converter::TagIgnored>();
100 tags_[kTagHead] = tagIgnored;
101 tags_[kTagMeta] = tagIgnored;
102 tags_[kTagNav] = tagIgnored;
103 tags_[kTagNoScript] = tagIgnored;
104 tags_[kTagScript] = tagIgnored;
105 tags_[kTagStyle] = tagIgnored;
106 tags_[kTagTemplate] = tagIgnored;
107
108 // printing tags
109 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
110 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
111 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
112 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
113 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
114 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
115 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
116 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
117 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
118 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
119 tags_[kTagOption] = make_shared<Converter::TagOption>();
120 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
121 tags_[kTagPre] = make_shared<Converter::TagPre>();
122 tags_[kTagCode] = make_shared<Converter::TagCode>();
123 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
124 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
125 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
126 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
127 tags_[kTagImg] = make_shared<Converter::TagImage>();
128 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
129
130 // Text formatting
131 auto tagBold = make_shared<Converter::TagBold>();
132 tags_[kTagBold] = tagBold;
133 tags_[kTagStrong] = tagBold;
134
135 auto tagItalic = make_shared<Converter::TagItalic>();
136 tags_[kTagItalic] = tagItalic;
137 tags_[kTagItalic2] = tagItalic;
138 tags_[kTagDefinition] = tagItalic;
139 tags_[kTagCitation] = tagItalic;
140
141 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
142
143 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
144 tags_[kTagStrighthrought] = tagStrighthrought;
145 tags_[kTagStrighthrought2] = tagStrighthrought;
146
147 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
148
149 // Tables
150 tags_[kTagTable] = make_shared<Converter::TagTable>();
151 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
152 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
153 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
154}
155
156void Converter::CleanUpMarkdown() {
157 TidyAllLines(&md_);
158 std::string buffer;
159 buffer.reserve(md_.size());
160
161 // Replace HTML symbols during the initial pass
162 for (size_t i = 0; i < md_.size();) {
163 bool replaced = false;
164
165 // C++11 compatible iteration over htmlSymbolConversions_
166 for (const auto &symbol_replacement : htmlSymbolConversions_) {
167 const std::string &symbol = symbol_replacement.first;
168 const std::string &replacement = symbol_replacement.second;
169
170 if (md_.compare(i, symbol.size(), symbol) == 0) {
171 buffer.append(replacement);
172 i += symbol.size();
173 replaced = true;
174 break;
175 }
176 }
177
178 if (!replaced) {
179 buffer.push_back(md_[i++]);
180 }
181 }
182
183 // Use swap instead of move assignment for better pre-C++11 compatibility
184 md_.swap(buffer);
185
186 // Optimized replacement sequence
187 const char *replacements[][2] = {
188 {" , ", ", "}, {"\n.\n", ".\n"}, {"\n↵\n", " ↵\n"}, {"\n*\n", "\n"},
189 {"\n. ", ".\n"}, {"\t\t ", "\t\t"},
190 };
191
192 for (const auto &replacement : replacements) {
193 ReplaceAll(&md_, replacement[0], replacement[1]);
194 }
195}
196
198 if (IsInIgnoredTag())
199 return this;
200
201 if (index_blockquote != 0 && ch == '\n') {
202 if (is_in_pre_) {
203 md_ += ch;
204 chars_in_curr_line_ = 0;
205 appendToMd(Repeat("> ", index_blockquote));
206 }
207
208 return this;
209 }
210
211 md_ += ch;
212
213 if (ch == '\n')
214 chars_in_curr_line_ = 0;
215 else
216 ++chars_in_curr_line_;
217
218 return this;
219}
220
222 if (IsInIgnoredTag())
223 return this;
224
225 md_ += str;
226
227 auto str_len = strlen(str);
228
229 for (auto i = 0; i < str_len; ++i) {
230 if (str[i] == '\n')
231 chars_in_curr_line_ = 0;
232 else
233 ++chars_in_curr_line_;
234 }
235
236 return this;
237}
238
240 UpdatePrevChFromMd();
241
242 if (prev_ch_in_md_ == '\n' ||
243 (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*'))
244 return this;
245
246 return appendToMd(' ');
247}
248
249bool Converter::ok() const {
250 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
251 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
252}
253
254void Converter::LTrim(string *s) {
255 (*s).erase((*s).begin(),
256 find_if((*s).begin(), (*s).end(),
257 [](unsigned char ch) { return !std::isspace(ch); }));
258}
259
260Converter *Converter::RTrim(string *s, bool trim_only_blank) {
261 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
262 [trim_only_blank](unsigned char ch) {
263 if (trim_only_blank)
264 return !isblank(ch);
265
266 return !isspace(ch);
267 })
268 .base(),
269 (*s).end());
270
271 return this;
272}
273
274// NOTE: Pay attention when changing one of the trim functions. It can break the
275// output!
276Converter *Converter::Trim(string *s) {
277 if (!startsWith(*s, "\t") || option.forceLeftTrim)
278 LTrim(s);
279
280 if (!(startsWith(*s, " "), endsWith(*s, " ")))
281 RTrim(s);
282
283 return this;
284}
285
286void Converter::TidyAllLines(string *str) {
287 auto lines = Split(*str, '\n');
288 string res;
289
290 uint8_t amount_newlines = 0;
291 bool in_code_block = false;
292
293 for (auto line : lines) {
294 if (startsWith(line, "```") || startsWith(line, "~~~"))
295 in_code_block = !in_code_block;
296 if (in_code_block) {
297 res += line + '\n';
298 continue;
299 }
300
301 Trim(&line);
302
303 if (line.empty()) {
304 if (amount_newlines < 2 && !res.empty()) {
305 res += '\n';
306 amount_newlines++;
307 }
308 } else {
309 amount_newlines = 0;
310
311 res += line + '\n';
312 }
313 }
314
315 *str = res;
316}
317
318string Converter::ExtractAttributeFromTagLeftOf(const string &attr) {
319 // Extract the whole tag from current offset, e.g. from '>', backwards
320 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
321 string lowerTag = toLower(tag); // Convert tag to lowercase for comparison
322
323 // locate given attribute (case-insensitive)
324 auto offset_attr = lowerTag.find(attr);
325
326 if (offset_attr == string::npos)
327 return "";
328
329 // locate attribute-value pair's '='
330 auto offset_equals = tag.find('=', offset_attr);
331
332 if (offset_equals == string::npos)
333 return "";
334
335 // locate value's surrounding quotes
336 auto offset_double_quote = tag.find('"', offset_equals);
337 auto offset_single_quote = tag.find('\'', offset_equals);
338
339 bool has_double_quote = offset_double_quote != string::npos;
340 bool has_single_quote = offset_single_quote != string::npos;
341
342 if (!has_double_quote && !has_single_quote)
343 return "";
344
345 char wrapping_quote = 0;
346
347 size_t offset_opening_quote = 0;
348 size_t offset_closing_quote = 0;
349
350 if (has_double_quote) {
351 if (!has_single_quote) {
352 wrapping_quote = '"';
353 offset_opening_quote = offset_double_quote;
354 } else {
355 if (offset_double_quote < offset_single_quote) {
356 wrapping_quote = '"';
357 offset_opening_quote = offset_double_quote;
358 } else {
359 wrapping_quote = '\'';
360 offset_opening_quote = offset_single_quote;
361 }
362 }
363 } else {
364 // has only single quote
365 wrapping_quote = '\'';
366 offset_opening_quote = offset_single_quote;
367 }
368
369 if (offset_opening_quote == string::npos)
370 return "";
371
372 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
373
374 if (offset_closing_quote == string::npos)
375 return "";
376
377 return tag.substr(offset_opening_quote + 1,
378 offset_closing_quote - 1 - offset_opening_quote);
379}
380
381void Converter::TurnLineIntoHeader1() {
382 appendToMd('\n' + Repeat("=", chars_in_curr_line_) + "\n\n");
383
384 chars_in_curr_line_ = 0;
385}
386
387void Converter::TurnLineIntoHeader2() {
388 appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n");
389
390 chars_in_curr_line_ = 0;
391}
392
394 // We already converted
395 if (index_ch_in_html_ == html_.size())
396 return md_;
397
398 reset();
399
400 for (char ch : html_) {
401 ++index_ch_in_html_;
402
403 if (!is_in_tag_ && ch == '<') {
404 OnHasEnteredTag();
405
406 continue;
407 }
408
409 if (is_in_tag_)
410 ParseCharInTag(ch);
411 else
412 ParseCharInTagContent(ch);
413 }
414
415 CleanUpMarkdown();
416
417 return md_;
418}
419
420void Converter::OnHasEnteredTag() {
421 offset_lt_ = index_ch_in_html_;
422 is_in_tag_ = true;
423 is_closing_tag_ = false;
424 prev_tag_ = current_tag_;
425 current_tag_ = "";
426
427 if (!md_.empty()) {
428 UpdatePrevChFromMd();
429 }
430}
431
432Converter *Converter::UpdatePrevChFromMd() {
433 if (!md_.empty()) {
434 prev_ch_in_md_ = md_[md_.length() - 1];
435
436 if (md_.length() > 1)
437 prev_prev_ch_in_md_ = md_[md_.length() - 2];
438 }
439
440 return this;
441}
442
443bool Converter::ParseCharInTag(char ch) {
444 static bool skipping_leading_whitespace = true;
445
446 if (ch == '/' && !is_in_attribute_value_) {
447 is_closing_tag_ = current_tag_.empty();
448 is_self_closing_tag_ = !is_closing_tag_;
449 skipping_leading_whitespace = true; // Reset for next tag
450 return true;
451 }
452
453 if (ch == '>') {
454 // Trim trailing whitespace by removing characters from current_tag_
455 while (!current_tag_.empty() && std::isspace(current_tag_.back())) {
456 current_tag_.pop_back();
457 }
458 skipping_leading_whitespace = true; // Reset for next tag
459 if (!is_self_closing_tag_)
460 return OnHasLeftTag();
461 else {
462 OnHasLeftTag();
463 is_self_closing_tag_ = false;
464 is_closing_tag_ = true;
465 return OnHasLeftTag();
466 }
467 }
468
469 if (ch == '"') {
470 if (is_in_attribute_value_) {
471 is_in_attribute_value_ = false;
472 } else {
473 size_t pos = current_tag_.length();
474 while (pos > 0 && isspace(current_tag_[pos - 1])) {
475 pos--;
476 }
477 if (pos > 0 && current_tag_[pos - 1] == '=') {
478 is_in_attribute_value_ = true;
479 }
480 }
481 skipping_leading_whitespace = false; // Stop skipping after attribute
482 return true;
483 }
484
485 // Handle whitespace: skip leading whitespace, keep others
486 if (isspace(ch) && skipping_leading_whitespace) {
487 return true; // Ignore leading whitespace
488 }
489
490 // Once we encounter a non-whitespace character, stop skipping
491 skipping_leading_whitespace = false;
492 current_tag_ += tolower(ch);
493 return false;
494}
495
496bool Converter::OnHasLeftTag() {
497 is_in_tag_ = false;
498
499 UpdatePrevChFromMd();
500
501 if (!is_closing_tag_)
502 if (TagContainsAttributesToHide(&current_tag_))
503 return true;
504
505 auto cut_tags = Split(current_tag_, ' ');
506 if (cut_tags.empty())
507 return true;
508
509 current_tag_ = cut_tags[0];
510
511 auto tag = tags_[current_tag_];
512
513 if (!tag)
514 return true;
515
516 if (!is_closing_tag_) {
517 tag->OnHasLeftOpeningTag(this);
518 }
519 else {
520 is_closing_tag_ = false;
521
522 tag->OnHasLeftClosingTag(this);
523 }
524
525 return true;
526}
527
528Converter *Converter::ShortenMarkdown(size_t chars) {
529 md_ = md_.substr(0, md_.length() - chars);
530
531 if (chars > chars_in_curr_line_)
532 chars_in_curr_line_ = 0;
533 else
534 chars_in_curr_line_ = chars_in_curr_line_ - chars;
535
536 return this->UpdatePrevChFromMd();
537}
538
539bool Converter::ParseCharInTagContent(char ch) {
540 if (is_in_code_) {
541 md_ += ch;
542
543 if (index_blockquote != 0 && ch == '\n')
544 appendToMd(Repeat("> ", index_blockquote));
545
546 return true;
547 }
548
549 if (option.compressWhitespace && !is_in_pre_) {
550 if (ch == '\t')
551 ch = ' ';
552
553 if (ch == ' ') {
554 UpdatePrevChFromMd();
555 if (prev_ch_in_md_ == ' ' || prev_ch_in_md_ == '\n')
556 return true;
557 }
558 }
559
560 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
561 prev_ch_in_html_ = ch;
562
563 return true;
564 }
565
566 if (ch == '\n') {
567 if (index_blockquote != 0) {
568 md_ += '\n';
569 chars_in_curr_line_ = 0;
570 appendToMd(Repeat("> ", index_blockquote));
571 }
572
573 return true;
574 }
575
576 switch (ch) {
577 case '*':
578 appendToMd("\\*");
579 break;
580 case '`':
581 appendToMd("\\`");
582 break;
583 case '\\':
584 appendToMd("\\\\");
585 break;
586 default:
587 md_ += ch;
588 ++chars_in_curr_line_;
589 break;
590 }
591
592 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
593 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
594 option.splitLines) {
595 if (ch == ' ') { // If the next char is - it will become a list
596 md_ += '\n';
597 chars_in_curr_line_ = 0;
598 } else if (chars_in_curr_line_ > option.hardBreak) {
599 ReplacePreviousSpaceInLineByNewline();
600 }
601 }
602
603 return false;
604}
605
606bool Converter::ReplacePreviousSpaceInLineByNewline() {
607 if (current_tag_ == kTagParagraph ||
608 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
609 return false;
610
611 auto offset = md_.length() - 1;
612
613 if (md_.length() == 0)
614 return true;
615
616 do {
617 if (md_[offset] == '\n')
618 return false;
619
620 if (md_[offset] == ' ') {
621 md_[offset] = '\n';
622 chars_in_curr_line_ = md_.length() - offset;
623
624 return true;
625 }
626
627 --offset;
628 } while (offset > 0);
629
630 return false;
631}
632
633void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
634 if (c->prev_tag_ == kTagImg)
635 c->appendToMd('\n');
636
637 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
638
639 c->appendToMd('[');
640 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
641}
642
643void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
644 if (!c->shortIfPrevCh('[')) {
645 c->appendToMd("](")->appendToMd(current_href_);
646
647 // If title is set append it
648 if (!current_title_.empty()) {
649 c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"');
650 current_title_.clear();
651 }
652
653 c->appendToMd(')');
654
655 if (c->prev_tag_ == kTagImg)
656 c->appendToMd('\n');
657 }
658}
659
660void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
661 c->appendToMd("**");
662}
663
664void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
665 c->appendToMd("**");
666}
667
668void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
669 c->appendToMd('*');
670}
671
672void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
673 c->appendToMd('*');
674}
675
676void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
677 c->appendToMd("<u>");
678}
679
680void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
681 c->appendToMd("</u>");
682}
683
684void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
685 c->appendToMd('~');
686}
687
688void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
689 c->appendToMd('~');
690}
691
692void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
693 if (c->is_in_list_) { // When it's in a list, it's not in a paragraph
694 c->appendToMd(" \n");
695 c->appendToMd(Repeat(" ", c->index_li));
696 } else if (c->is_in_table_) {
697 c->appendToMd("<br>");
698 } else if (!c->md_.empty())
699 c->appendToMd(" \n");
700}
701
702void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
703
704void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
705 if (c->prev_ch_in_md_ != '\n')
706 c->appendToMd('\n');
707
708 if (c->prev_prev_ch_in_md_ != '\n')
709 c->appendToMd('\n');
710}
711
712void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
713
714void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
715 c->appendToMd("\n# ");
716}
717
718void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
719 if (c->prev_prev_ch_in_md_ != ' ')
720 c->appendToMd('\n');
721}
722
723void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
724 c->appendToMd("\n## ");
725}
726
727void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
728 if (c->prev_prev_ch_in_md_ != ' ')
729 c->appendToMd('\n');
730}
731
732void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
733 c->appendToMd("\n### ");
734}
735
736void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
737 if (c->prev_prev_ch_in_md_ != ' ')
738 c->appendToMd('\n');
739}
740
741void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
742 c->appendToMd("\n#### ");
743}
744
745void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
746 if (c->prev_prev_ch_in_md_ != ' ')
747 c->appendToMd('\n');
748}
749
750void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
751 c->appendToMd("\n##### ");
752}
753
754void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
755 if (c->prev_prev_ch_in_md_ != ' ')
756 c->appendToMd('\n');
757}
758
759void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
760 c->appendToMd("\n###### ");
761}
762
763void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
764 if (c->prev_prev_ch_in_md_ != ' ')
765 c->appendToMd('\n');
766}
767
768void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
769 if (c->is_in_table_)
770 return;
771
772 if (!c->is_in_ordered_list_) {
773 c->appendToMd(string({c->option.unorderedList, ' '}));
774 return;
775 }
776
777 ++c->index_ol;
778
779 string num = std::to_string(c->index_ol);
780 num.append({c->option.orderedList, ' '});
781 c->appendToMd(num);
782}
783
784void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
785 if (c->is_in_table_)
786 return;
787
788 if (c->prev_ch_in_md_ != '\n')
789 c->appendToMd('\n');
790}
791
792void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
793
794void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
795 if (c->md_.length() > 0)
796 c->appendToMd(" \n");
797}
798
799void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
800 if (c->is_in_table_)
801 return;
802
803 c->is_in_list_ = true;
804 c->is_in_ordered_list_ = true;
805 c->index_ol = 0;
806
807 ++c->index_li;
808
809 c->ReplacePreviousSpaceInLineByNewline();
810
811 c->appendToMd('\n');
812}
813
814void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
815 if (c->is_in_table_)
816 return;
817
818 c->is_in_ordered_list_ = false;
819
820 if (c->index_li != 0)
821 --c->index_li;
822
823 c->is_in_list_ = c->index_li != 0;
824
825 c->appendToMd('\n');
826}
827
828void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
829 c->is_in_p_ = true;
830
831 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
832 c->appendToMd("\n\t");
833 else if (!c->is_in_list_)
834 c->appendToMd('\n');
835}
836
837void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
838 c->is_in_p_ = false;
839
840 if (!c->md_.empty())
841 c->appendToMd("\n"); // Workaround \n restriction for blockquotes
842
843 if (c->index_blockquote != 0)
844 c->appendToMd(Repeat("> ", c->index_blockquote));
845}
846
847void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
848 c->is_in_pre_ = true;
849
850 if (c->prev_ch_in_md_ != '\n')
851 c->appendToMd('\n');
852
853 if (c->prev_prev_ch_in_md_ != '\n')
854 c->appendToMd('\n');
855
856 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
857 c->ShortenMarkdown(2);
858
859 if (c->is_in_list_)
860 c->appendToMd("\t\t");
861 else
862 c->appendToMd("```");
863}
864
865void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
866 c->is_in_pre_ = false;
867
868 if (c->is_in_list_)
869 return;
870
871 c->appendToMd("```");
872 c->appendToMd('\n'); // Don't combine because of blockquote
873}
874
875void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
876 c->is_in_code_ = true;
877
878 if (c->is_in_pre_) {
879 if (c->is_in_list_)
880 return;
881
882 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
883 if (!code.empty()) {
884 if (startsWith(code, "language-"))
885 code.erase(0, 9); // remove language-
886 c->appendToMd(code);
887 }
888 c->appendToMd('\n');
889 } else
890 c->appendToMd('`');
891}
892
893void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
894 c->is_in_code_ = false;
895
896 if (c->is_in_pre_)
897 return;
898
899 c->appendToMd('`');
900}
901
902void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
903
904void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
905
906void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
907
908void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
909 c->TurnLineIntoHeader1();
910}
911
912void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
913 if (c->is_in_list_ || c->is_in_table_)
914 return;
915
916 c->is_in_list_ = true;
917
918 ++c->index_li;
919
920 c->appendToMd('\n');
921}
922
923void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
924 if (c->is_in_table_)
925 return;
926
927 if (c->index_li != 0)
928 --c->index_li;
929
930 c->is_in_list_ = c->index_li != 0;
931
932 if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n')
933 c->ShortenMarkdown();
934 else if (c->prev_ch_in_md_ != '\n')
935 c->appendToMd('\n');
936}
937
938void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
939 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n')
940 c->appendToMd('\n');
941
942 c->appendToMd("![")
943 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
944 ->appendToMd("](")
945 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
946
947 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
948 if (!title.empty()) {
949 c->appendToMd(" \"")->appendToMd(title)->appendToMd('"');
950 }
951
952 c->appendToMd(")");
953}
954
955void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
956 if (c->prev_tag_ == kTagAnchor)
957 c->appendToMd('\n');
958}
959
960void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
961 c->appendToMd("\n---\n"); // NOTE: We can make this an option
962}
963
964void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
965
966void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
967 c->is_in_table_ = true;
968 c->appendToMd('\n');
969 c->table_start = c->md_.length();
970}
971
972void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
973 c->is_in_table_ = false;
974 c->appendToMd('\n');
975
976 if (!c->option.formatTable)
977 return;
978
979 string table = c->md_.substr(c->table_start);
980 table = formatMarkdownTable(table);
981 c->ShortenMarkdown(c->md_.size() - c->table_start);
982 c->appendToMd(table);
983}
984
985void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
986 c->appendToMd('\n');
987}
988
989void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
990 c->UpdatePrevChFromMd();
991 if (c->prev_ch_in_md_ == '|')
992 c->appendToMd('\n'); // There's a bug
993 else
994 c->appendToMd('|');
995
996 if (!c->tableLine.empty()) {
997 if (c->prev_ch_in_md_ != '\n')
998 c->appendToMd('\n');
999
1000 c->tableLine.append("|\n");
1001 c->appendToMd(c->tableLine);
1002 c->tableLine.clear();
1003 }
1004}
1005
1006void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
1007 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
1008
1009 string line = "| ";
1010
1011 if (align == "left" || align == "center")
1012 line += ':';
1013
1014 line += '-';
1015
1016 if (align == "right" || align == "center")
1017 line += ": ";
1018 else
1019 line += ' ';
1020
1021 c->tableLine.append(line);
1022
1023 c->appendToMd("| ");
1024}
1025
1026void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
1027
1028void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
1029 if (c->prev_prev_ch_in_md_ != '|')
1030 c->appendToMd("| ");
1031}
1032
1033void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
1034
1035void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
1036 ++c->index_blockquote;
1037 c->appendToMd("\n");
1038 c->appendToMd(Repeat("> ", c->index_blockquote));
1039}
1040
1041void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
1042 --c->index_blockquote;
1043 // Only shorten if a "> " was added (i.e., a newline was processed in the blockquote)
1044 if (!c->md_.empty() && c->md_.length() >= 2 &&
1045 c->md_.substr(c->md_.length() - 2) == "> ") {
1046 c->ShortenMarkdown(2); // Remove the '> ' only if it exists
1047 }
1048}
1049
1051 md_.clear();
1052 prev_ch_in_md_ = 0;
1053 prev_prev_ch_in_md_ = 0;
1054 index_ch_in_html_ = 0;
1055}
1056
1057bool Converter::IsInIgnoredTag() const {
1058 if (current_tag_ == kTagTitle && !option.includeTitle)
1059 return true;
1060
1061 return IsIgnoredTag(current_tag_);
1062}
1063} // namespace html2md
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:393
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:197
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:239
bool ok() const
Checks if everything was closed properly (in the HTML).
Definition html2md.cpp:249
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:193
void reset()
Reset the generated Markdown.
Definition html2md.cpp:1050
html2md namespace
Definition html2md.h:22
Options for the conversion from HTML to Markdown.
Definition html2md.h:39
std::string formatMarkdownTable(const std::string &inputTable)