html2md  v1.6.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.cpp
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
#include "html2md.h"
#include "table.h"

#include <algorithm>
#include <cctype>
#include <cstring>
#include <memory>
#include <sstream>
#include <vector>
12
13using std::make_shared;
14using std::string;
15using std::vector;
16
17namespace {
// Returns true when `str` begins with `prefix` (an empty prefix always
// matches).
bool startsWith(const string &str, const string &prefix) {
  // Anchoring rfind at position 0 only ever tests the very first position.
  return str.rfind(prefix, 0) == 0;
}
22
// Returns true when `str` ends with `suffix` (an empty suffix always matches).
bool endsWith(const string &str, const string &suffix) {
  if (suffix.size() > str.size())
    return false;

  // Compare both strings back to front over the length of the suffix.
  return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
27
// Replaces every non-overlapping occurrence of `needle` in `*haystack` with
// `replacement`, scanning left to right. Text inserted by a replacement is
// never rescanned.
//
// Returns the number of replacements performed. A null `haystack` or an empty
// `needle` is treated as "nothing to replace" and returns 0; an empty needle
// would otherwise match at every position and never terminate.
size_t ReplaceAll(string *haystack, const string &needle,
                  const string &replacement) {
  if (haystack == nullptr || needle.empty())
    return 0;

  size_t amount_replaced = 0;

  for (size_t pos = haystack->find(needle); pos != string::npos;
       pos = haystack->find(needle, pos + replacement.size())) {
    haystack->replace(pos, needle.size(), replacement);
    ++amount_replaced;
  }

  return amount_replaced;
}
48
49size_t ReplaceAll(string *haystack, const string &needle, const char c) {
50 return ReplaceAll(haystack, needle, string({c}));
51}
52
// Splits `str` at every `delimiter` and returns the pieces in order.
// Follows std::getline semantics: an empty input yields no pieces and a
// trailing delimiter does not produce a trailing empty piece.
vector<string> Split(string const &str, char delimiter) {
  std::stringstream stream(str);
  vector<string> pieces;
  string piece;

  while (getline(stream, piece, delimiter))
    pieces.push_back(piece);

  return pieces;
}
63
// Returns `str` concatenated with itself `amount` times (empty for 0).
string Repeat(const string &str, size_t amount) {
  string repeated;

  while (amount-- > 0)
    repeated += str;

  return repeated;
}
77} // namespace
78
79namespace html2md {
80
81Converter::Converter(string *html, Options *options) : html_(*html) {
82 if (options)
83 option = *options;
84
85 tags_.reserve(41);
86
87 // non-printing tags
88 auto tagIgnored = make_shared<Converter::TagIgnored>();
89 tags_[kTagHead] = tagIgnored;
90 tags_[kTagMeta] = tagIgnored;
91 tags_[kTagNav] = tagIgnored;
92 tags_[kTagNoScript] = tagIgnored;
93 tags_[kTagScript] = tagIgnored;
94 tags_[kTagStyle] = tagIgnored;
95 tags_[kTagTemplate] = tagIgnored;
96
97 // printing tags
98 tags_[kTagAnchor] = make_shared<Converter::TagAnchor>();
99 tags_[kTagBreak] = make_shared<Converter::TagBreak>();
100 tags_[kTagDiv] = make_shared<Converter::TagDiv>();
101 tags_[kTagHeader1] = make_shared<Converter::TagHeader1>();
102 tags_[kTagHeader2] = make_shared<Converter::TagHeader2>();
103 tags_[kTagHeader3] = make_shared<Converter::TagHeader3>();
104 tags_[kTagHeader4] = make_shared<Converter::TagHeader4>();
105 tags_[kTagHeader5] = make_shared<Converter::TagHeader5>();
106 tags_[kTagHeader6] = make_shared<Converter::TagHeader6>();
107 tags_[kTagListItem] = make_shared<Converter::TagListItem>();
108 tags_[kTagOption] = make_shared<Converter::TagOption>();
109 tags_[kTagOrderedList] = make_shared<Converter::TagOrderedList>();
110 tags_[kTagPre] = make_shared<Converter::TagPre>();
111 tags_[kTagCode] = make_shared<Converter::TagCode>();
112 tags_[kTagParagraph] = make_shared<Converter::TagParagraph>();
113 tags_[kTagSpan] = make_shared<Converter::TagSpan>();
114 tags_[kTagUnorderedList] = make_shared<Converter::TagUnorderedList>();
115 tags_[kTagTitle] = make_shared<Converter::TagTitle>();
116 tags_[kTagImg] = make_shared<Converter::TagImage>();
117 tags_[kTagSeperator] = make_shared<Converter::TagSeperator>();
118
119 // Text formatting
120 auto tagBold = make_shared<Converter::TagBold>();
121 tags_[kTagBold] = tagBold;
122 tags_[kTagStrong] = tagBold;
123
124 auto tagItalic = make_shared<Converter::TagItalic>();
125 tags_[kTagItalic] = tagItalic;
126 tags_[kTagItalic2] = tagItalic;
127 tags_[kTagDefinition] = tagItalic;
128 tags_[kTagCitation] = tagItalic;
129
130 tags_[kTagUnderline] = make_shared<Converter::TagUnderline>();
131
132 auto tagStrighthrought = make_shared<Converter::TagStrikethrought>();
133 tags_[kTagStrighthrought] = tagStrighthrought;
134 tags_[kTagStrighthrought2] = tagStrighthrought;
135
136 tags_[kTagBlockquote] = make_shared<Converter::TagBlockquote>();
137
138 // Tables
139 tags_[kTagTable] = make_shared<Converter::TagTable>();
140 tags_[kTagTableRow] = make_shared<Converter::TagTableRow>();
141 tags_[kTagTableHeader] = make_shared<Converter::TagTableHeader>();
142 tags_[kTagTableData] = make_shared<Converter::TagTableData>();
143}
144
145void Converter::CleanUpMarkdown() {
146 TidyAllLines(&md_);
147
148 ReplaceAll(&md_, " , ", ", ");
149
150 ReplaceAll(&md_, "\n.\n", ".\n");
151 ReplaceAll(&md_, "\n↵\n", " ↵\n");
152 ReplaceAll(&md_, "\n*\n", "\n");
153 ReplaceAll(&md_, "\n. ", ".\n");
154
155 ReplaceAll(&md_, "&quot;", '"');
156 ReplaceAll(&md_, "&lt;", "<");
157 ReplaceAll(&md_, "&gt;", ">");
158 ReplaceAll(&html_, "&amp;", '&');
159 ReplaceAll(&html_, "&nbsp;", ' ');
160 ReplaceAll(&html_, "&rarr;", "→");
161
162 ReplaceAll(&md_, "\t\t ", "\t\t");
163}
164
165Converter *Converter::appendToMd(char ch) {
166 if (IsInIgnoredTag())
167 return this;
168
169 if (index_blockquote != 0 && ch == '\n') {
170 if (is_in_pre_) {
171 md_ += ch;
172 chars_in_curr_line_ = 0;
173 appendToMd(Repeat("> ", index_blockquote));
174 }
175
176 return this;
177 }
178
179 md_ += ch;
180
181 if (ch == '\n')
182 chars_in_curr_line_ = 0;
183 else
184 ++chars_in_curr_line_;
185
186 return this;
187}
188
189Converter *Converter::appendToMd(const char *str)
190{
191 if (IsInIgnoredTag())
192 return this;
193
194 md_ += str;
195
196 auto str_len = strlen(str);
197
198 for (auto i = 0; i < str_len; ++i) {
199 if (str[i] == '\n')
200 chars_in_curr_line_ = 0;
201 else
202 ++chars_in_curr_line_;
203 }
204
205 return this;
206}
207
208Converter *Converter::appendBlank() {
209 UpdatePrevChFromMd();
210
211 if (prev_ch_in_md_ == '\n' ||
212 (prev_ch_in_md_ == '*' && prev_prev_ch_in_md_ == '*'))
213 return this;
214
215 return appendToMd(' ');
216}
217
218bool Converter::ok() const {
219 return !is_in_pre_ && !is_in_list_ && !is_in_p_ && !is_in_table_ &&
220 !is_in_tag_ && index_blockquote == 0 && index_li == 0;
221}
222
223void Converter::LTrim(string *s) {
224 (*s).erase((*s).begin(),
225 find_if((*s).begin(), (*s).end(),
226 [](unsigned char ch) { return !std::isspace(ch); }));
227}
228
229Converter *Converter::RTrim(string *s, bool trim_only_blank) {
230 (*s).erase(find_if((*s).rbegin(), (*s).rend(),
231 [trim_only_blank](unsigned char ch) {
232 if (trim_only_blank)
233 return !isblank(ch);
234
235 return !isspace(ch);
236 })
237 .base(),
238 (*s).end());
239
240 return this;
241}
242
243// NOTE: Pay attention when changing one of the trim functions. It can break the
244// output!
245Converter *Converter::Trim(string *s) {
246 if (!startsWith(*s, "\t"))
247 LTrim(s);
248
249 if (!(startsWith(*s, " "), endsWith(*s, " ")))
250 RTrim(s);
251
252 return this;
253}
254
255void Converter::TidyAllLines(string *str) {
256 auto lines = Split(*str, '\n');
257 string res;
258
259 uint8_t amount_newlines = 0;
260 bool in_code_block = false;
261
262 for (auto line : lines) {
263 if (startsWith(line, "```") || startsWith(line, "~~~"))
264 in_code_block = !in_code_block;
265 if (in_code_block) {
266 res += line + '\n';
267 continue;
268 }
269
270 Trim(&line);
271
272 if (line.empty()) {
273 if (amount_newlines < 2 && !res.empty()) {
274 res += '\n';
275 amount_newlines++;
276 }
277 } else {
278 amount_newlines = 0;
279
280 res += line + '\n';
281 }
282 }
283
284 *str = res;
285}
286
287string Converter::ExtractAttributeFromTagLeftOf(const string &attr) {
288 // Extract the whole tag from current offset, e.g. from '>', backwards
289 auto tag = html_.substr(offset_lt_, index_ch_in_html_ - offset_lt_);
290
291 // locate given attribute
292 auto offset_attr = tag.find(attr);
293
294 if (offset_attr == string::npos)
295 return "";
296
297 // locate attribute-value pair's '='
298 auto offset_equals = tag.find('=', offset_attr);
299
300 if (offset_equals == string::npos)
301 return "";
302
303 // locate value's surrounding quotes
304 auto offset_double_quote = tag.find('"', offset_equals);
305 auto offset_single_quote = tag.find('\'', offset_equals);
306
307 bool has_double_quote = offset_double_quote != string::npos;
308 bool has_single_quote = offset_single_quote != string::npos;
309
310 if (!has_double_quote && !has_single_quote)
311 return "";
312
313 char wrapping_quote = 0;
314
315 size_t offset_opening_quote = 0;
316 size_t offset_closing_quote = 0;
317
318 if (has_double_quote) {
319 if (!has_single_quote) {
320 wrapping_quote = '"';
321 offset_opening_quote = offset_double_quote;
322 } else {
323 if (offset_double_quote < offset_single_quote) {
324 wrapping_quote = '"';
325 offset_opening_quote = offset_double_quote;
326 } else {
327 wrapping_quote = '\'';
328 offset_opening_quote = offset_single_quote;
329 }
330 }
331 } else {
332 // has only single quote
333 wrapping_quote = '\'';
334 offset_opening_quote = offset_single_quote;
335 }
336
337 if (offset_opening_quote == string::npos)
338 return "";
339
340 offset_closing_quote = tag.find(wrapping_quote, offset_opening_quote + 1);
341
342 if (offset_closing_quote == string::npos)
343 return "";
344
345 return tag.substr(offset_opening_quote + 1,
346 offset_closing_quote - 1 - offset_opening_quote);
347}
348
349void Converter::TurnLineIntoHeader1() {
350 appendToMd('\n' + Repeat("=", chars_in_curr_line_) + "\n\n");
351
352 chars_in_curr_line_ = 0;
353}
354
355void Converter::TurnLineIntoHeader2() {
356 appendToMd('\n' + Repeat("-", chars_in_curr_line_) + "\n\n");
357
358 chars_in_curr_line_ = 0;
359}
360
361string Converter::convert() {
362 // We already converted
363 if (index_ch_in_html_ == html_.size())
364 return md_;
365
366 reset();
367
368 for (char ch : html_) {
369 ++index_ch_in_html_;
370
371 if (!is_in_tag_ && ch == '<') {
372 OnHasEnteredTag();
373
374 continue;
375 }
376
377 if (is_in_tag_)
378 ParseCharInTag(ch);
379 else
380 ParseCharInTagContent(ch);
381 }
382
383 CleanUpMarkdown();
384
385 return md_;
386}
387
388void Converter::OnHasEnteredTag() {
389 offset_lt_ = index_ch_in_html_;
390 is_in_tag_ = true;
391 prev_tag_ = current_tag_;
392 current_tag_ = "";
393
394 if (!md_.empty()) {
395 UpdatePrevChFromMd();
396 }
397}
398
399Converter *Converter::UpdatePrevChFromMd() {
400 if (!md_.empty()) {
401 prev_ch_in_md_ = md_[md_.length() - 1];
402
403 if (md_.length() > 1)
404 prev_prev_ch_in_md_ = md_[md_.length() - 2];
405 }
406
407 return this;
408}
409
410bool Converter::ParseCharInTag(char ch) {
411 if (ch == '/' && !is_in_attribute_value_) {
412 is_closing_tag_ = current_tag_.empty();
413 is_self_closing_tag_ = !is_closing_tag_;
414
415 return true;
416 }
417
418 if (ch == '>')
419 return OnHasLeftTag();
420
421 if (ch == '"') {
422 if (is_in_attribute_value_) {
423 is_in_attribute_value_ = false;
424 } else if (current_tag_[current_tag_.length() - 1] == '=') {
425 is_in_attribute_value_ = true;
426 }
427
428 return true;
429 }
430
431 current_tag_ += ch;
432
433 return false;
434}
435
436bool Converter::OnHasLeftTag() {
437 is_in_tag_ = false;
438
439 UpdatePrevChFromMd();
440
441 if (!is_closing_tag_)
442 if (TagContainsAttributesToHide(&current_tag_))
443 return true;
444
445 current_tag_ = Split(current_tag_, ' ')[0];
446
447 auto tag = tags_[current_tag_];
448
449 if (!tag)
450 return true;
451
452 if (!is_closing_tag_) {
453 tag->OnHasLeftOpeningTag(this);
454 }
455 if (is_closing_tag_ || is_self_closing_tag_) {
456 is_closing_tag_ = false;
457
458 tag->OnHasLeftClosingTag(this);
459 }
460
461 return true;
462}
463
464Converter *Converter::ShortenMarkdown(size_t chars) {
465 md_ = md_.substr(0, md_.length() - chars);
466
467 if (chars > chars_in_curr_line_)
468 chars_in_curr_line_ = 0;
469 else
470 chars_in_curr_line_ = chars_in_curr_line_ - chars;
471
472 return this->UpdatePrevChFromMd();
473}
474
475bool Converter::ParseCharInTagContent(char ch) {
476 if (is_in_code_) {
477 md_ += ch;
478
479 if (index_blockquote != 0 && ch == '\n')
480 appendToMd(Repeat("> ", index_blockquote));
481
482 return true;
483 }
484
485 if (IsInIgnoredTag() || current_tag_ == kTagLink) {
486 prev_ch_in_html_ = ch;
487
488 return true;
489 }
490
491 if (ch == '\n') {
492 if (index_blockquote != 0) {
493 md_ += '\n';
494 chars_in_curr_line_ = 0;
495 appendToMd(Repeat("> ", index_blockquote));
496 }
497
498 return true;
499 }
500
501 switch (ch) {
502 case '*':
503 appendToMd("\\*");
504 break;
505 case '`':
506 appendToMd("\\`");
507 break;
508 case '\\':
509 appendToMd("\\\\");
510 break;
511 default:
512 md_ += ch;
513 ++chars_in_curr_line_;
514 break;
515 }
516
517 if (chars_in_curr_line_ > option.softBreak && !is_in_table_ && !is_in_list_ &&
518 current_tag_ != kTagImg && current_tag_ != kTagAnchor &&
519 option.splitLines) {
520 if (ch == ' ') { // If the next char is - it will become a list
521 md_ += '\n';
522 chars_in_curr_line_ = 0;
523 } else if (chars_in_curr_line_ > option.hardBreak) {
524 ReplacePreviousSpaceInLineByNewline();
525 }
526 }
527
528 return false;
529}
530
531bool Converter::ReplacePreviousSpaceInLineByNewline() {
532 if (current_tag_ == kTagParagraph ||
533 is_in_table_ && (prev_tag_ != kTagCode && prev_tag_ != kTagPre))
534 return false;
535
536 auto offset = md_.length() - 1;
537
538 if (md_.length() == 0)
539 return true;
540
541 do {
542 if (md_[offset] == '\n')
543 return false;
544
545 if (md_[offset] == ' ') {
546 md_[offset] = '\n';
547 chars_in_curr_line_ = md_.length() - offset;
548
549 return true;
550 }
551
552 --offset;
553 } while (offset > 0);
554
555 return false;
556}
557
558void Converter::TagAnchor::OnHasLeftOpeningTag(Converter *c) {
559 if (c->prev_tag_ == kTagImg)
560 c->appendToMd('\n');
561
562 current_title_ = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
563
564 c->appendToMd('[');
565 current_href_ = c->ExtractAttributeFromTagLeftOf(kAttributeHref);
566}
567
568void Converter::TagAnchor::OnHasLeftClosingTag(Converter *c) {
569 if (!c->shortIfPrevCh('[')) {
570 c->appendToMd("](")->appendToMd(current_href_);
571
572 // If title is set append it
573 if (!current_title_.empty()) {
574 c->appendToMd(" \"")->appendToMd(current_title_)->appendToMd('"');
575 current_title_.clear();
576 }
577
578 c->appendToMd(')');
579
580 if (c->prev_tag_ == kTagImg)
581 c->appendToMd('\n');
582 }
583}
584
585void Converter::TagBold::OnHasLeftOpeningTag(Converter *c) {
586 c->appendToMd("**");
587}
588
589void Converter::TagBold::OnHasLeftClosingTag(Converter *c) {
590 c->appendToMd("**");
591}
592
593void Converter::TagItalic::OnHasLeftOpeningTag(Converter *c) {
594 c->appendToMd('*');
595}
596
597void Converter::TagItalic::OnHasLeftClosingTag(Converter *c) {
598 c->appendToMd('*');
599}
600
601void Converter::TagUnderline::OnHasLeftOpeningTag(Converter *c) {
602 c->appendToMd("<u>");
603}
604
605void Converter::TagUnderline::OnHasLeftClosingTag(Converter *c) {
606 c->appendToMd("</u>");
607}
608
609void Converter::TagStrikethrought::OnHasLeftOpeningTag(Converter *c) {
610 c->appendToMd('~');
611}
612
613void Converter::TagStrikethrought::OnHasLeftClosingTag(Converter *c) {
614 c->appendToMd('~');
615}
616
617void Converter::TagBreak::OnHasLeftOpeningTag(Converter *c) {
618 if (c->is_in_list_) { // When it's in a list, it's not in a paragraph
619 c->appendToMd(" \n");
620 c->appendToMd(Repeat(" ", c->index_li));
621 } else if (c->is_in_table_) {
622 c->appendToMd("<br>");
623 } else if (!c->is_in_p_) {
624 c->appendToMd("\n<br>\n\n");
625 } else if (c->md_.length() > 0)
626 c->appendToMd(" \n");
627}
628
629void Converter::TagBreak::OnHasLeftClosingTag(Converter *c) {}
630
631void Converter::TagDiv::OnHasLeftOpeningTag(Converter *c) {
632 if (c->prev_ch_in_md_ != '\n')
633 c->appendToMd('\n');
634
635 if (c->prev_prev_ch_in_md_ != '\n')
636 c->appendToMd('\n');
637}
638
639void Converter::TagDiv::OnHasLeftClosingTag(Converter *c) {}
640
641void Converter::TagHeader1::OnHasLeftOpeningTag(Converter *c) {
642 c->appendToMd("\n# ");
643}
644
645void Converter::TagHeader1::OnHasLeftClosingTag(Converter *c) {
646 if (c->prev_prev_ch_in_md_ != ' ')
647 c->appendToMd('\n');
648}
649
650void Converter::TagHeader2::OnHasLeftOpeningTag(Converter *c) {
651 c->appendToMd("\n## ");
652}
653
654void Converter::TagHeader2::OnHasLeftClosingTag(Converter *c) {
655 if (c->prev_prev_ch_in_md_ != ' ')
656 c->appendToMd('\n');
657}
658
659void Converter::TagHeader3::OnHasLeftOpeningTag(Converter *c) {
660 c->appendToMd("\n### ");
661}
662
663void Converter::TagHeader3::OnHasLeftClosingTag(Converter *c) {
664 if (c->prev_prev_ch_in_md_ != ' ')
665 c->appendToMd('\n');
666}
667
668void Converter::TagHeader4::OnHasLeftOpeningTag(Converter *c) {
669 c->appendToMd("\n#### ");
670}
671
672void Converter::TagHeader4::OnHasLeftClosingTag(Converter *c) {
673 if (c->prev_prev_ch_in_md_ != ' ')
674 c->appendToMd('\n');
675}
676
677void Converter::TagHeader5::OnHasLeftOpeningTag(Converter *c) {
678 c->appendToMd("\n##### ");
679}
680
681void Converter::TagHeader5::OnHasLeftClosingTag(Converter *c) {
682 if (c->prev_prev_ch_in_md_ != ' ')
683 c->appendToMd('\n');
684}
685
686void Converter::TagHeader6::OnHasLeftOpeningTag(Converter *c) {
687 c->appendToMd("\n###### ");
688}
689
690void Converter::TagHeader6::OnHasLeftClosingTag(Converter *c) {
691 if (c->prev_prev_ch_in_md_ != ' ')
692 c->appendToMd('\n');
693}
694
695void Converter::TagListItem::OnHasLeftOpeningTag(Converter *c) {
696 if (c->is_in_table_)
697 return;
698
699 if (!c->is_in_ordered_list_) {
700 c->appendToMd(string({c->option.unorderedList, ' '}));
701 return;
702 }
703
704 ++c->index_ol;
705
706 string num = std::to_string(c->index_ol);
707 num.append({c->option.orderedList, ' '});
708 c->appendToMd(num);
709}
710
711void Converter::TagListItem::OnHasLeftClosingTag(Converter *c) {
712 if (c->is_in_table_)
713 return;
714
715 if (c->prev_ch_in_md_ != '\n')
716 c->appendToMd('\n');
717}
718
719void Converter::TagOption::OnHasLeftOpeningTag(Converter *c) {}
720
721void Converter::TagOption::OnHasLeftClosingTag(Converter *c) {
722 if (c->md_.length() > 0)
723 c->appendToMd(" \n");
724}
725
726void Converter::TagOrderedList::OnHasLeftOpeningTag(Converter *c) {
727 if (c->is_in_table_)
728 return;
729
730 c->is_in_list_ = true;
731 c->is_in_ordered_list_ = true;
732 c->index_ol = 0;
733
734 ++c->index_li;
735
736 c->ReplacePreviousSpaceInLineByNewline();
737
738 c->appendToMd('\n');
739}
740
741void Converter::TagOrderedList::OnHasLeftClosingTag(Converter *c) {
742 if (c->is_in_table_)
743 return;
744
745 c->is_in_ordered_list_ = false;
746
747 if (c->index_li != 0)
748 --c->index_li;
749
750 c->is_in_list_ = c->index_li != 0;
751
752 c->appendToMd('\n');
753}
754
755void Converter::TagParagraph::OnHasLeftOpeningTag(Converter *c) {
756 c->is_in_p_ = true;
757
758 if (c->is_in_list_ && c->prev_tag_ == kTagParagraph)
759 c->appendToMd("\n\t");
760 else if (!c->is_in_list_)
761 c->appendToMd('\n');
762}
763
764void Converter::TagParagraph::OnHasLeftClosingTag(Converter *c) {
765 c->is_in_p_ = false;
766
767 if (!c->md_.empty())
768 c->appendToMd("\n"); // Workaround \n restriction for blockquotes
769
770 if (c->index_blockquote != 0)
771 c->appendToMd(Repeat("> ", c->index_blockquote));
772}
773
774void Converter::TagPre::OnHasLeftOpeningTag(Converter *c) {
775 c->is_in_pre_ = true;
776
777 if (c->prev_ch_in_md_ != '\n')
778 c->appendToMd('\n');
779
780 if (c->prev_prev_ch_in_md_ != '\n')
781 c->appendToMd('\n');
782
783 if (c->is_in_list_ && c->prev_tag_ != kTagParagraph)
784 c->ShortenMarkdown(2);
785
786 if (c->is_in_list_)
787 c->appendToMd("\t\t");
788 else
789 c->appendToMd("```");
790}
791
792void Converter::TagPre::OnHasLeftClosingTag(Converter *c) {
793 c->is_in_pre_ = false;
794
795 if (c->is_in_list_)
796 return;
797
798 c->appendToMd("```");
799 c->appendToMd('\n'); // Don't combine because of blockquote
800}
801
802void Converter::TagCode::OnHasLeftOpeningTag(Converter *c) {
803 c->is_in_code_ = true;
804
805 if (c->is_in_pre_) {
806 if (c->is_in_list_)
807 return;
808
809 auto code = c->ExtractAttributeFromTagLeftOf(kAttributeClass);
810 if (!code.empty()) {
811 if (startsWith(code, "language-"))
812 code.erase(0, 9); // remove language-
813 c->appendToMd(code);
814 }
815 c->appendToMd('\n');
816 } else
817 c->appendToMd('`');
818}
819
820void Converter::TagCode::OnHasLeftClosingTag(Converter *c) {
821 c->is_in_code_ = false;
822
823 if (c->is_in_pre_)
824 return;
825
826 c->appendToMd('`');
827}
828
829void Converter::TagSpan::OnHasLeftOpeningTag(Converter *c) {}
830
831void Converter::TagSpan::OnHasLeftClosingTag(Converter *c) {}
832
833void Converter::TagTitle::OnHasLeftOpeningTag(Converter *c) {}
834
835void Converter::TagTitle::OnHasLeftClosingTag(Converter *c) {
836 c->TurnLineIntoHeader1();
837}
838
839void Converter::TagUnorderedList::OnHasLeftOpeningTag(Converter *c) {
840 if (c->is_in_list_ || c->is_in_table_)
841 return;
842
843 c->is_in_list_ = true;
844
845 ++c->index_li;
846
847 c->appendToMd('\n');
848}
849
850void Converter::TagUnorderedList::OnHasLeftClosingTag(Converter *c) {
851 if (c->is_in_table_)
852 return;
853
854 if (c->index_li != 0)
855 --c->index_li;
856
857 c->is_in_list_ = c->index_li != 0;
858
859 if (c->prev_prev_ch_in_md_ == '\n' && c->prev_ch_in_md_ == '\n')
860 c->ShortenMarkdown();
861 else if (c->prev_ch_in_md_ != '\n')
862 c->appendToMd('\n');
863}
864
865void Converter::TagImage::OnHasLeftOpeningTag(Converter *c) {
866 if (c->prev_tag_ != kTagAnchor && c->prev_ch_in_md_ != '\n')
867 c->appendToMd('\n');
868
869 c->appendToMd("![")
870 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeAlt))
871 ->appendToMd("](")
872 ->appendToMd(c->ExtractAttributeFromTagLeftOf(kAttributeSrc));
873
874 auto title = c->ExtractAttributeFromTagLeftOf(kAttributeTitle);
875 if (!title.empty()) {
876 c->appendToMd(" \"")->appendToMd(title)->appendToMd('"');
877 }
878
879 c->appendToMd(")");
880}
881
882void Converter::TagImage::OnHasLeftClosingTag(Converter *c) {
883 if (c->prev_tag_ == kTagAnchor)
884 c->appendToMd('\n');
885}
886
887void Converter::TagSeperator::OnHasLeftOpeningTag(Converter *c) {
888 c->appendToMd("\n---\n"); // NOTE: We can make this an option
889}
890
891void Converter::TagSeperator::OnHasLeftClosingTag(Converter *c) {}
892
893void Converter::TagTable::OnHasLeftOpeningTag(Converter *c) {
894 c->is_in_table_ = true;
895 c->appendToMd('\n');
896 c->table_start = c->md_.length();
897}
898
899void Converter::TagTable::OnHasLeftClosingTag(Converter *c) {
900 c->is_in_table_ = false;
901 c->appendToMd('\n');
902
903 if (!c->option.formatTable)
904 return;
905
906 string table = c->md_.substr(c->table_start);
907 table = formatMarkdownTable(table);
908 c->ShortenMarkdown(c->md_.size() - c->table_start);
909 c->appendToMd(table);
910}
911
912void Converter::TagTableRow::OnHasLeftOpeningTag(Converter *c) {
913 c->appendToMd('\n');
914}
915
916void Converter::TagTableRow::OnHasLeftClosingTag(Converter *c) {
917 c->UpdatePrevChFromMd();
918 if (c->prev_ch_in_md_ == '|')
919 c->appendToMd('\n'); // There's a bug
920 else
921 c->appendToMd('|');
922
923 if (!c->tableLine.empty()) {
924 if (c->prev_ch_in_md_ != '\n')
925 c->appendToMd('\n');
926
927 c->tableLine.append("|\n");
928 c->appendToMd(c->tableLine);
929 c->tableLine.clear();
930 }
931}
932
933void Converter::TagTableHeader::OnHasLeftOpeningTag(Converter *c) {
934 auto align = c->ExtractAttributeFromTagLeftOf(kAttrinuteAlign);
935
936 string line = "| ";
937
938 if (align == "left" || align == "center")
939 line += ':';
940
941 line += '-';
942
943 if (align == "right" || align == "center")
944 line += ": ";
945 else
946 line += ' ';
947
948 c->tableLine.append(line);
949
950 c->appendToMd("| ");
951}
952
953void Converter::TagTableHeader::OnHasLeftClosingTag(Converter *c) {}
954
955void Converter::TagTableData::OnHasLeftOpeningTag(Converter *c) {
956 if (c->prev_prev_ch_in_md_ != '|')
957 c->appendToMd("| ");
958}
959
960void Converter::TagTableData::OnHasLeftClosingTag(Converter *c) {}
961
962void Converter::TagBlockquote::OnHasLeftOpeningTag(Converter *c) {
963 ++c->index_blockquote;
964}
965
966void Converter::TagBlockquote::OnHasLeftClosingTag(Converter *c) {
967 --c->index_blockquote;
968 c->ShortenMarkdown(2); // Remove the '> '
969}
970
971void Converter::reset() {
972 md_.clear();
973 prev_ch_in_md_ = 0;
974 prev_prev_ch_in_md_ = 0;
975 index_ch_in_html_ = 0;
976}
977
978bool Converter::IsInIgnoredTag() const {
979 if (current_tag_ == kTagTitle && !option.includeTitle)
980 return true;
981
982 return IsIgnoredTag(current_tag_);
983}
984} // namespace html2md
Class for converting HTML to Markdown.
Definition html2md.h:159
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:173
html2md namespace
Definition html2md.h:21
std::string formatMarkdownTable(const std::string &inputTable)