html2md  v1.8.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.h
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#ifndef HTML2MD_H
5#define HTML2MD_H
6
7#include <memory>
8#include <string>
9#include <unordered_map>
10#include <cstdint>
11
22namespace html2md {
23
// Options for the conversion from HTML to Markdown.
//
// Every field has an in-class default, so a default-constructed Options is a
// complete configuration. The rendered listing folds header lines 127-133; the
// documentation index places `bool operator==(html2md::Options o) const` there.
39struct Options {
// Add new line when a certain number of characters is reached.
46 bool splitLines = true;
47
// Wrap after this many characters once the next space is reached, as long as
// the text is not in a list (the description in the documentation index is
// truncated after "list,").
52 int softBreak = 80;
53
// Force a break after this many characters in a line.
57 int hardBreak = 100;
58
// The char used for unordered lists.
75 char unorderedList = '-';
76
// The char used after the number of the item in ordered lists.
91 char orderedList = '.';
92
// Whether title is added as h1 heading at the very beginning of the markdown.
100 bool includeTitle = true;
101
// Whether to format Markdown tables.
108 bool formatTable = true;
109
// Whether to force left trim of lines in the final Markdown output.
116 bool forceLeftTrim = false;
117
// Whether to compress whitespace (tabs, multiple spaces) into a single space.
125 bool compressWhitespace = false;
126
134};
135
180public:
// Standard initializer: takes the HTML to convert as parameter.
//
// html:    the HTML input, passed on by address to the private constructor.
// options: optional settings; defaults to nullptr.
//          NOTE(review): presumably nullptr selects the default Options -
//          confirm in the private constructor's definition.
193 explicit inline Converter(const std::string &html,
194 struct Options *options = nullptr) {
// Build a temporary with the private pointer-taking constructor and assign it
// onto this object.
195 *this = Converter(&html, options);
196 }
197
// Convert HTML into Markdown and return the result. [[nodiscard]]: the
// compiler warns when the returned string is discarded.
205 [[nodiscard]] std::string convert();
206
// Append a char to the Markdown. Returns a Converter pointer
// (presumably `this`, to allow chaining - confirm in html2md.cpp).
212 Converter *appendToMd(char ch);
213
// Append a C string to the Markdown. Same return convention as the char
// overload.
219 Converter *appendToMd(const char *str);
220
// Append a string to the Markdown.
//
// Forwards s.c_str() to the `const char *` overload and returns that
// overload's result.
// NOTE(review): because only the C string is passed, text after an embedded
// '\0' in `s` is presumably not seen by the callee - confirm.
226 inline Converter *appendToMd(const std::string &s) {
227 return appendToMd(s.c_str());
228 }
229
239
// Add an HTML symbol conversion.
//
// htmlSymbol:  key in htmlSymbolConversions_ (the built-in keys are written as
//              full entities such as "&amp;").
// replacement: text stored for that key.
// Assignment through operator[] means an existing entry for htmlSymbol is
// overwritten.
252 void addHtmlSymbolConversion(const std::string &htmlSymbol,
253 const std::string &replacement) {
254 htmlSymbolConversions_[htmlSymbol] = replacement;
255 }
256
// Remove an HTML symbol conversion.
//
// Erases the entry keyed by htmlSymbol from htmlSymbolConversions_; erasing a
// key that is not present leaves the map unchanged.
263 void removeHtmlSymbolConversion(const std::string &htmlSymbol) {
264 htmlSymbolConversions_.erase(htmlSymbol);
265 }
266
// Clear all HTML symbol conversions, including the built-in defaults that
// htmlSymbolConversions_ is initialised with.
271 void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); }
272
// Checks if everything was closed properly (in the HTML). Defined out of line.
278 [[nodiscard]] bool ok() const;
279
// Reset the generated Markdown. Defined out of line.
283 void reset();
284
290 inline bool operator==(const Converter *c) const { return *this == *c; }
291
// Two converters are equal when their input HTML and their options are equal.
// Only html_ and option take part in the comparison; the generated Markdown
// and the symbol-conversion table are not compared.
292 inline bool operator==(const Converter &c) const {
293 return html_ == c.html_ && option == c.option;
294 }
295
// Explicit conversion to bool: yields the result of ok(). Being explicit, it
// applies in contexts such as `if (converter)` but not in implicit
// conversions. The ';' after the body is a redundant empty declaration.
299 inline explicit operator bool() const { return ok(); };
300
301private:
// Names of the HTML attributes and tags referenced by the converter, as
// lower-case C strings. Several identifiers are misspelled (kAttrinuteAlign,
// kTagSeperator, kTagStrighthrought, kTagStrighthrought2); the spellings are
// left untouched here because their uses are outside this listing.
302 // Attributes
303 static constexpr const char *kAttributeHref = "href";
304 static constexpr const char *kAttributeAlt = "alt";
305 static constexpr const char *kAttributeTitle = "title";
306 static constexpr const char *kAttributeClass = "class";
307 static constexpr const char *kAttributeSrc = "src";
// sic: "Attrinute"
308 static constexpr const char *kAttrinuteAlign = "align";
309
// Structural and miscellaneous tags
310 static constexpr const char *kTagAnchor = "a";
311 static constexpr const char *kTagBreak = "br";
312 static constexpr const char *kTagCode = "code";
313 static constexpr const char *kTagDiv = "div";
314 static constexpr const char *kTagHead = "head";
315 static constexpr const char *kTagLink = "link";
316 static constexpr const char *kTagListItem = "li";
317 static constexpr const char *kTagMeta = "meta";
318 static constexpr const char *kTagNav = "nav";
319 static constexpr const char *kTagNoScript = "noscript";
320 static constexpr const char *kTagOption = "option";
321 static constexpr const char *kTagOrderedList = "ol";
322 static constexpr const char *kTagParagraph = "p";
323 static constexpr const char *kTagPre = "pre";
324 static constexpr const char *kTagScript = "script";
325 static constexpr const char *kTagSpan = "span";
326 static constexpr const char *kTagStyle = "style";
327 static constexpr const char *kTagTemplate = "template";
// Same string as kAttributeTitle; this one names the <title> element.
328 static constexpr const char *kTagTitle = "title";
329 static constexpr const char *kTagUnorderedList = "ul";
330 static constexpr const char *kTagImg = "img";
// sic: "Seperator"
331 static constexpr const char *kTagSeperator = "hr";
332
333 // Text format
334 static constexpr const char *kTagBold = "b";
335 static constexpr const char *kTagStrong = "strong";
336 static constexpr const char *kTagItalic = "em";
337 static constexpr const char *kTagItalic2 = "i";
338 static constexpr const char *kTagCitation = "cite";
339 static constexpr const char *kTagDefinition = "dfn";
340 static constexpr const char *kTagUnderline = "u";
// sic: "Strighthrought"
341 static constexpr const char *kTagStrighthrought = "del";
342 static constexpr const char *kTagStrighthrought2 = "s";
343
344 static constexpr const char *kTagBlockquote = "blockquote";
345
346 // Header
347 static constexpr const char *kTagHeader1 = "h1";
348 static constexpr const char *kTagHeader2 = "h2";
349 static constexpr const char *kTagHeader3 = "h3";
350 static constexpr const char *kTagHeader4 = "h4";
351 static constexpr const char *kTagHeader5 = "h5";
352 static constexpr const char *kTagHeader6 = "h6";
353
354 // Table
355 static constexpr const char *kTagTable = "table";
356 static constexpr const char *kTagTableRow = "tr";
357 static constexpr const char *kTagTableHeader = "th";
358 static constexpr const char *kTagTableData = "td";
359
// Mutable parser/output state of a Converter. Every scalar member has an
// in-class initializer. How each member is used is decided in html2md.cpp,
// which is not part of this listing; notes marked NOTE(review) are unverified.

// NOTE(review): presumably the read position inside html_ - confirm.
360 size_t index_ch_in_html_ = 0;
361
// Parser context flags, all cleared initially.
362 bool is_closing_tag_ = false;
363 bool is_in_attribute_value_ = false;
364 bool is_in_code_ = false;
365 bool is_in_list_ = false;
366 bool is_in_p_ = false;
367 bool is_in_pre_ = false;
368 bool is_in_table_ = false;
369 bool is_in_table_row_ = false;
370 bool is_in_tag_ = false;
371 bool is_self_closing_tag_ = false;
372
373 // relevant for <li> only, false = is in unordered list
374 bool is_in_ordered_list_ = false;
// 8-bit counter: holds values up to 255 only.
375 uint8_t index_ol = 0;
376
377 // store the table start
378 size_t table_start = 0;
379
380 // number of lists
// 8-bit counter: holds values up to 255 only.
381 uint8_t index_li = 0;
382
// 8-bit counter. NOTE(review): presumably the blockquote nesting depth -
// confirm.
383 uint8_t index_blockquote = 0;
384
// The two most recent Markdown characters ('\0' until set) and the most recent
// HTML character (initialised to the placeholder 'x').
385 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
386 char prev_ch_in_html_ = 'x';
387
// Input HTML; one of the two members compared by operator==.
388 std::string html_;
389
// 16-bit value. NOTE(review): the name suggests an offset relative to a '<' -
// confirm.
390 uint16_t offset_lt_ = 0;
391 std::string current_tag_;
392 std::string prev_tag_;
393
394 // Line which separates header from data
395 std::string tableLine;
396
397 size_t chars_in_curr_line_ = 0;
398
// NOTE(review): presumably the Markdown produced so far - confirm.
399 std::string md_;
400
// Conversion settings; one of the two members compared by operator==.
401 Options option;
402
// Table mapping HTML entities to their replacement text. It starts with the
// six entries below and is edited at run time through
// addHtmlSymbolConversion(), removeHtmlSymbolConversion() and
// clearHtmlSymbolConversions().
403 std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
404 {"&quot;", "\""}, {"&lt;", "<"}, {"&gt;", ">"},
405 {"&amp;", "&"}, {"&nbsp;", " "}, {"&rarr;", "→"}};
406
407 // Tag: base class for tag types
408 struct Tag {
409 virtual void OnHasLeftOpeningTag(Converter *c) = 0;
410 virtual void OnHasLeftClosingTag(Converter *c) = 0;
411 };
412
413 // Tag types
414
415 // tags that are not printed (nav, script, noscript, ...)
// Both hooks have empty bodies, so this handler does nothing itself.
// The ';' after each body is a redundant empty declaration.
416 struct TagIgnored : Tag {
417 void OnHasLeftOpeningTag(Converter *c) override {};
418 void OnHasLeftClosingTag(Converter *c) override {};
419 };
420
// Handler for anchors; both hooks are defined out of line. Unlike the other
// handlers it keeps per-object state.
421 struct TagAnchor : Tag {
422 void OnHasLeftOpeningTag(Converter *c) override;
423 void OnHasLeftClosingTag(Converter *c) override;
424
// NOTE(review): presumably the href/title of the anchor currently being
// processed, stored on opening and used on closing - confirm in html2md.cpp.
425 std::string current_href_;
426 std::string current_title_;
427 };
428
// Text-format handlers. Each one only declares the two Tag hooks; the
// definitions, and the registration of handlers under tag names, are outside
// this listing.

// Bold text handler.
429 struct TagBold : Tag {
430 void OnHasLeftOpeningTag(Converter *c) override;
431 void OnHasLeftClosingTag(Converter *c) override;
432 };
433
// Italic text handler.
434 struct TagItalic : Tag {
435 void OnHasLeftOpeningTag(Converter *c) override;
436 void OnHasLeftClosingTag(Converter *c) override;
437 };
438
// Underlined text handler.
439 struct TagUnderline : Tag {
440 void OnHasLeftOpeningTag(Converter *c) override;
441 void OnHasLeftClosingTag(Converter *c) override;
442 };
443
// Strikethrough text handler (type name spelled "Strikethrought").
444 struct TagStrikethrought : Tag {
445 void OnHasLeftOpeningTag(Converter *c) override;
446 void OnHasLeftClosingTag(Converter *c) override;
447 };
448
// Line-break handler; hooks defined out of line.
449 struct TagBreak : Tag {
450 void OnHasLeftOpeningTag(Converter *c) override;
451 void OnHasLeftClosingTag(Converter *c) override;
452 };
453
// Div handler; hooks defined out of line.
454 struct TagDiv : Tag {
455 void OnHasLeftOpeningTag(Converter *c) override;
456 void OnHasLeftClosingTag(Converter *c) override;
457 };
458
// Heading handlers, one type per heading level 1-6. All six have the same
// shape: the two Tag hooks, defined out of line.
459 struct TagHeader1 : Tag {
460 void OnHasLeftOpeningTag(Converter *c) override;
461 void OnHasLeftClosingTag(Converter *c) override;
462 };
463
464 struct TagHeader2 : Tag {
465 void OnHasLeftOpeningTag(Converter *c) override;
466 void OnHasLeftClosingTag(Converter *c) override;
467 };
468
469 struct TagHeader3 : Tag {
470 void OnHasLeftOpeningTag(Converter *c) override;
471 void OnHasLeftClosingTag(Converter *c) override;
472 };
473
474 struct TagHeader4 : Tag {
475 void OnHasLeftOpeningTag(Converter *c) override;
476 void OnHasLeftClosingTag(Converter *c) override;
477 };
478
479 struct TagHeader5 : Tag {
480 void OnHasLeftOpeningTag(Converter *c) override;
481 void OnHasLeftClosingTag(Converter *c) override;
482 };
483
484 struct TagHeader6 : Tag {
485 void OnHasLeftOpeningTag(Converter *c) override;
486 void OnHasLeftClosingTag(Converter *c) override;
487 };
488
// List item handler; hooks defined out of line.
489 struct TagListItem : Tag {
490 void OnHasLeftOpeningTag(Converter *c) override;
491 void OnHasLeftClosingTag(Converter *c) override;
492 };
493
// Option element handler; hooks defined out of line.
494 struct TagOption : Tag {
495 void OnHasLeftOpeningTag(Converter *c) override;
496 void OnHasLeftClosingTag(Converter *c) override;
497 };
498
// Ordered list handler; hooks defined out of line.
499 struct TagOrderedList : Tag {
500 void OnHasLeftOpeningTag(Converter *c) override;
501 void OnHasLeftClosingTag(Converter *c) override;
502 };
503
// Paragraph handler; hooks defined out of line.
504 struct TagParagraph : Tag {
505 void OnHasLeftOpeningTag(Converter *c) override;
506 void OnHasLeftClosingTag(Converter *c) override;
507 };
508
// Preformatted block handler; hooks defined out of line.
509 struct TagPre : Tag {
510 void OnHasLeftOpeningTag(Converter *c) override;
511 void OnHasLeftClosingTag(Converter *c) override;
512 };
513
// Code handler; hooks defined out of line.
514 struct TagCode : Tag {
515 void OnHasLeftOpeningTag(Converter *c) override;
516 void OnHasLeftClosingTag(Converter *c) override;
517 };
518
// Span handler; hooks defined out of line.
519 struct TagSpan : Tag {
520 void OnHasLeftOpeningTag(Converter *c) override;
521 void OnHasLeftClosingTag(Converter *c) override;
522 };
523
// Title handler; hooks defined out of line.
// NOTE(review): presumably honours Options::includeTitle - confirm.
524 struct TagTitle : Tag {
525 void OnHasLeftOpeningTag(Converter *c) override;
526 void OnHasLeftClosingTag(Converter *c) override;
527 };
528
// Unordered list handler; hooks defined out of line.
529 struct TagUnorderedList : Tag {
530 void OnHasLeftOpeningTag(Converter *c) override;
531 void OnHasLeftClosingTag(Converter *c) override;
532 };
533
// Image handler; hooks defined out of line.
534 struct TagImage : Tag {
535 void OnHasLeftOpeningTag(Converter *c) override;
536 void OnHasLeftClosingTag(Converter *c) override;
537 };
538
// Separator handler (type name spelled "Seperator", matching kTagSeperator);
// hooks defined out of line.
539 struct TagSeperator : Tag {
540 void OnHasLeftOpeningTag(Converter *c) override;
541 void OnHasLeftClosingTag(Converter *c) override;
542 };
543
// Table handlers: table, row, header cell and data cell. Each declares the two
// Tag hooks, defined out of line.
544 struct TagTable : Tag {
545 void OnHasLeftOpeningTag(Converter *c) override;
546 void OnHasLeftClosingTag(Converter *c) override;
547 };
548
549 struct TagTableRow : Tag {
550 void OnHasLeftOpeningTag(Converter *c) override;
551 void OnHasLeftClosingTag(Converter *c) override;
552 };
553
554 struct TagTableHeader : Tag {
555 void OnHasLeftOpeningTag(Converter *c) override;
556 void OnHasLeftClosingTag(Converter *c) override;
557 };
558
559 struct TagTableData : Tag {
560 void OnHasLeftOpeningTag(Converter *c) override;
561 void OnHasLeftClosingTag(Converter *c) override;
562 };
563
// Blockquote handler; hooks defined out of line.
564 struct TagBlockquote : Tag {
565 void OnHasLeftOpeningTag(Converter *c) override;
566 void OnHasLeftClosingTag(Converter *c) override;
567 };
568
// Handlers keyed by string, each held through a shared_ptr to the Tag base.
// NOTE(review): presumably keyed by tag name and filled by the private
// constructor - confirm in html2md.cpp.
569 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
570
// Private constructor taking the HTML and the options by pointer; the public
// constructor delegates to it. Defined out of line.
571 explicit Converter(const std::string *html, struct Options *options);
572
// Private helpers; all are defined out of line, so only their signatures are
// documented here.

573 void CleanUpMarkdown();
574
575 // Trim from start (in place)
576 static void LTrim(std::string *s);
577
578 // Trim from end (in place)
// trim_only_blank defaults to false.
579 Converter *RTrim(std::string *s, bool trim_only_blank = false);
580
581 // Trim from both ends (in place)
582 Converter *Trim(std::string *s);
583
584 // 1. trim all lines
585 // 2. reduce consecutive newlines to maximum 3
586 void TidyAllLines(std::string *str);
587
// Returns an attribute value as a string.
// NOTE(review): the name suggests it reads `attr` from the tag to the left of
// the current position - confirm.
588 std::string ExtractAttributeFromTagLeftOf(const std::string &attr);
589
590 void TurnLineIntoHeader1();
591
592 void TurnLineIntoHeader2();
593
594 // Current char: '<'
595 void OnHasEnteredTag();
596
597 Converter *UpdatePrevChFromMd();
598
// Takes a single character. NOTE(review): the meaning of the bool result is
// not visible in this listing.
605 bool ParseCharInTag(char ch);
606
607 // Current char: '>'
608 bool OnHasLeftTag();
609
// Reports whether the text of a tag contains one of the markers that flag its
// content as hidden.
//
// The check is a plain, case-sensitive substring search over *tag, so it only
// matches the exact spellings listed below (for example "display:none" without
// a space after the colon).
//
// tag: text to inspect; must not be null.
// Returns true when at least one marker occurs in *tag.
inline static bool TagContainsAttributesToHide(std::string *tag) {
  const std::string &text = *tag;
  const auto contains = [&text](const char *needle) {
    return text.find(needle) != std::string::npos;
  };

  // ` aria="hidden"` is the spelling the original check used and is kept for
  // compatibility; ` aria-hidden="true"` is the standard WAI-ARIA attribute
  // that the original check never matched.
  return contains(" aria=\"hidden\"") ||
         contains(" aria-hidden=\"true\"") ||
         contains("display:none") ||
         contains("visibility:hidden") ||
         contains("opacity:0") ||
         contains("Details-content--hidden-not-important");
}
619
// Defined out of line; `chars` defaults to 1.
// NOTE(review): presumably removes `chars` characters from the end of the
// Markdown - confirm.
620 Converter *ShortenMarkdown(size_t chars = 1);
// If prev_ch_in_md_ equals `prev`, calls ShortenMarkdown() with its default
// argument and returns true; otherwise does nothing and returns false.
// The ';' after the body is a redundant empty declaration.
621 inline bool shortIfPrevCh(char prev) {
622 if (prev_ch_in_md_ == prev) {
623 ShortenMarkdown();
624 return true;
625 }
626 return false;
627 };
628
// Takes a single character; defined out of line. NOTE(review): the meaning of
// the bool result is not visible in this listing.
633 bool ParseCharInTagContent(char ch);
634
635 // Replace previous space (if any) in current markdown line by newline
636 bool ReplacePreviousSpaceInLineByNewline();
637
// Returns true when `tag` starts with '-' or is exactly one of "template",
// "style", "script", "noscript" or "nav".
//
// For an empty string, tag[0] refers to the terminating null character, so the
// '-' test is false and the function returns false.
// NOTE(review): the '-' test presumably catches HTML comments ("!--...") once
// the leading '!' has been consumed - confirm against the parser.
638 static inline bool IsIgnoredTag(const std::string &tag) {
639 return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
640 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
641
642 // meta: not ignored to tolerate if closing is omitted
643 }
644
// Const query defined out of line. NOTE(review): presumably reports whether
// the parser is currently inside a tag for which IsIgnoredTag() holds -
// confirm.
645 [[nodiscard]] bool IsInIgnoredTag() const;
646}; // Converter
647
// Static wrapper around the Converter class.
//
// html: HTML input to convert.
// ok:   optional out-parameter; when non-null it receives Converter::ok()
//       evaluated after the conversion.
// Returns the Markdown produced by Converter::convert(). The Converter is
// built without an Options pointer (the constructor's default, nullptr).
654inline std::string Convert(const std::string &html, bool *ok = nullptr) {
655 Converter c(html);
656 auto md = c.convert();
657 if (ok != nullptr)
658 *ok = c.ok();
659 return md;
660}
661
// Overload for rvalue arguments, compiled only when PYTHON_BINDINGS is not
// defined. Inside the body `html` is a named (lvalue) expression, so the call
// resolves to the `const std::string &` overload rather than recursing.
662#ifndef PYTHON_BINDINGS
663inline std::string Convert(const std::string &&html, bool *ok = nullptr) {
664 return Convert(html, ok);
665}
666#endif
667
668} // namespace html2md
669
670#endif // HTML2MD_H
Class for converting HTML to Markdown.
Definition html2md.h:179
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:393
void addHtmlSymbolConversion(const std::string &htmlSymbol, const std::string &replacement)
Add an HTML symbol conversion.
Definition html2md.h:252
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:197
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Definition html2md.h:290
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:239
bool ok() const
Checks if everything was closed properly (in the HTML).
Definition html2md.cpp:249
void removeHtmlSymbolConversion(const std::string &htmlSymbol)
Remove an HTML symbol conversion.
Definition html2md.h:263
bool operator==(const Converter &c) const
Definition html2md.h:292
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:193
void reset()
Reset the generated Markdown.
Definition html2md.cpp:1050
void clearHtmlSymbolConversions()
Clear all HTML symbol conversions.
Definition html2md.h:271
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
Definition html2md.h:226
html2md namespace
Definition html2md.h:22
std::string Convert(const std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Definition html2md.h:654
Options for the conversion from HTML to Markdown.
Definition html2md.h:39
bool operator==(html2md::Options o) const
Definition html2md.h:127
int softBreak
softBreak Wrap after ... characters when the next space is reached and as long as it's not in a list,...
Definition html2md.h:52
bool compressWhitespace
Whether to compress whitespace (tabs, multiple spaces) into a single space.
Definition html2md.h:125
int hardBreak
hardBreak Force a break after ... characters in a line
Definition html2md.h:57
char orderedList
The char used after the number of the item.
Definition html2md.h:91
bool formatTable
Whether to format Markdown tables.
Definition html2md.h:108
bool splitLines
Add new line when a certain number of characters is reached.
Definition html2md.h:46
char unorderedList
The char used for unordered lists.
Definition html2md.h:75
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.
Definition html2md.h:100
bool forceLeftTrim
Whether to force left trim of lines in the final Markdown output.
Definition html2md.h:116