html2md  v1.7.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.h
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#ifndef HTML2MD_H
5#define HTML2MD_H
6
7#include <memory>
8#include <string>
9#include <unordered_map>
10
21namespace html2md {
22
/// Options for the conversion from HTML to Markdown.
struct Options {
  /// Add new line when a certain number of characters is reached.
  bool splitLines = true;

  /// Wrap after this many characters when the next space is reached.
  int softBreak = 80;

  /// Force a break after this many characters in a line.
  int hardBreak = 100;

  /// The char used for unordered lists.
  char unorderedList = '-';

  /// The char used after the number of the item in ordered lists.
  char orderedList = '.';

  /// Whether the title is added as h1 heading at the very beginning of the
  /// Markdown.
  bool includeTitle = true;

  /// Whether to format Markdown tables.
  bool formatTable = true;

  /// Two option sets compare equal only when every field matches.
  ///
  /// The comparison in the listing was cut off after the second operand; it
  /// is restored here so that all seven fields take part.
  inline bool operator==(const Options &o) const {
    return splitLines == o.splitLines && softBreak == o.softBreak &&
           hardBreak == o.hardBreak && unorderedList == o.unorderedList &&
           orderedList == o.orderedList && includeTitle == o.includeTitle &&
           formatTable == o.formatTable;
  }
};
115
160public:
173 explicit inline Converter(const std::string &html,
174 struct Options *options = nullptr) {
175 *this = Converter(&html, options);
176 }
177
// Convert HTML into Markdown. The result must not be discarded
// ([[nodiscard]]).
185 [[nodiscard]] std::string convert();
186
// Append a char to the Markdown.
// NOTE(review): presumably returns `this` for call chaining — confirm in
// html2md.cpp.
192 Converter *appendToMd(char ch);
193
// Append a C string to the Markdown (overload of the char version above).
199 Converter *appendToMd(const char *str);
200
206 inline Converter *appendToMd(const std::string &s) {
207 return appendToMd(s.c_str());
208 }
209
219
232 void addHtmlSymbolConversion(const std::string &htmlSymbol,
233 const std::string &replacement) {
234 htmlSymbolConversions_[htmlSymbol] = replacement;
235 }
236
243 void removeHtmlSymbolConversion(const std::string &htmlSymbol) {
244 htmlSymbolConversions_.erase(htmlSymbol);
245 }
246
251 void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); }
252
// Checks if everything was closed properly (in the HTML).
258 [[nodiscard]] bool ok() const;
259
// Reset the generated Markdown.
263 void reset();
264
270 inline bool operator==(const Converter *c) const { return *this == *c; }
271
272 inline bool operator==(const Converter &c) const {
273 return html_ == c.html_ && option == c.option;
274 }
275
279 inline explicit operator bool() const { return ok(); };
280
281private:
// String constants for the HTML attribute and tag names this class refers to.
// All are compile-time `const char *` values.
282 // Attributes
283 static constexpr const char *kAttributeHref = "href";
284 static constexpr const char *kAttributeAlt = "alt";
285 static constexpr const char *kAttributeTitle = "title";
286 static constexpr const char *kAttributeClass = "class";
287 static constexpr const char *kAttributeSrc = "src";
// NOTE(review): identifier is misspelled ("Attrinute"); renaming it requires
// updating every use outside this listing as well.
288 static constexpr const char *kAttrinuteAlign = "align";
289
// Structural / block-level tag names
290 static constexpr const char *kTagAnchor = "a";
291 static constexpr const char *kTagBreak = "br";
292 static constexpr const char *kTagCode = "code";
293 static constexpr const char *kTagDiv = "div";
294 static constexpr const char *kTagHead = "head";
295 static constexpr const char *kTagLink = "link";
296 static constexpr const char *kTagListItem = "li";
297 static constexpr const char *kTagMeta = "meta";
298 static constexpr const char *kTagNav = "nav";
299 static constexpr const char *kTagNoScript = "noscript";
300 static constexpr const char *kTagOption = "option";
301 static constexpr const char *kTagOrderedList = "ol";
302 static constexpr const char *kTagParagraph = "p";
303 static constexpr const char *kTagPre = "pre";
304 static constexpr const char *kTagScript = "script";
305 static constexpr const char *kTagSpan = "span";
306 static constexpr const char *kTagStyle = "style";
307 static constexpr const char *kTagTemplate = "template";
308 static constexpr const char *kTagTitle = "title";
309 static constexpr const char *kTagUnorderedList = "ul";
310 static constexpr const char *kTagImg = "img";
// NOTE(review): "Seperator" is a misspelling of "Separator"; the value is the
// <hr> tag name.
311 static constexpr const char *kTagSeperator = "hr";
312
313 // Text format
314 static constexpr const char *kTagBold = "b";
315 static constexpr const char *kTagStrong = "strong";
316 static constexpr const char *kTagItalic = "em";
317 static constexpr const char *kTagItalic2 = "i";
318 static constexpr const char *kTagCitation = "cite";
319 static constexpr const char *kTagDefinition = "dfn";
320 static constexpr const char *kTagUnderline = "u";
// NOTE(review): "Strighthrought" is a misspelling of "Strikethrough"; the two
// values are the <del> and <s> tag names.
321 static constexpr const char *kTagStrighthrought = "del";
322 static constexpr const char *kTagStrighthrought2 = "s";
323
324 static constexpr const char *kTagBlockquote = "blockquote";
325
326 // Header
327 static constexpr const char *kTagHeader1 = "h1";
328 static constexpr const char *kTagHeader2 = "h2";
329 static constexpr const char *kTagHeader3 = "h3";
330 static constexpr const char *kTagHeader4 = "h4";
331 static constexpr const char *kTagHeader5 = "h5";
332 static constexpr const char *kTagHeader6 = "h6";
333
334 // Table
335 static constexpr const char *kTagTable = "table";
336 static constexpr const char *kTagTableRow = "tr";
337 static constexpr const char *kTagTableHeader = "th";
338 static constexpr const char *kTagTableData = "td";
339
// Mutable parser state. Every member has an in-class initializer, so a freshly
// constructed object starts from the values shown here.
//
// NOTE(review): uint8_t / uint16_t are used below, but the header only
// includes <memory>, <string> and <unordered_map>; <cstdint> is not included
// directly — confirm it is reliably pulled in, or add it.
340 size_t index_ch_in_html_ = 0;
341
// Flags describing where the parser currently is.
342 bool is_closing_tag_ = false;
343 bool is_in_attribute_value_ = false;
344 bool is_in_code_ = false;
345 bool is_in_list_ = false;
346 bool is_in_p_ = false;
347 bool is_in_pre_ = false;
348 bool is_in_table_ = false;
349 bool is_in_table_row_ = false;
350 bool is_in_tag_ = false;
351 bool is_self_closing_tag_ = false;
352
353 // relevant for <li> only, false = is in unordered list
354 bool is_in_ordered_list_ = false;
// 8-bit counter: can only represent values up to 255.
355 uint8_t index_ol = 0;
356
357 // store the table start
358 size_t table_start = 0;
359
360 // number of lists
361 uint8_t index_li = 0;
362
// 8-bit counter (max 255). NOTE(review): presumably the blockquote nesting
// depth — confirm in html2md.cpp.
363 uint8_t index_blockquote = 0;
364
365 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
366 char prev_ch_in_html_ = 'x';
367
// Input HTML; also one of the two values compared by operator==.
368 std::string html_;
369
370 uint16_t offset_lt_ = 0;
371 std::string current_tag_;
372 std::string prev_tag_;
373
374 // Line which separates header from data
375 std::string tableLine;
376
377 size_t chars_in_curr_line_ = 0;
378
// NOTE(review): presumably the Markdown output buffer — confirm.
379 std::string md_;
380
// Conversion options; the second value compared by operator==.
381 Options option;
382
// HTML symbol -> replacement table, pre-populated with six defaults. It is
// edited through addHtmlSymbolConversion(), removeHtmlSymbolConversion() and
// clearHtmlSymbolConversions().
383 std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
384 {"&quot;", "\""}, {"&lt;", "<"}, {"&gt;", ">"},
385 {"&amp;", "&"}, {"&nbsp;", " "}, {"&rarr;", "→"}};
386
387 // Tag: base class for tag types
388 struct Tag {
389 virtual void OnHasLeftOpeningTag(Converter *c) = 0;
390 virtual void OnHasLeftClosingTag(Converter *c) = 0;
391 };
392
393 // Tag types
394
395 // tags that are not printed (nav, script, noscript, ...)
396 struct TagIgnored : Tag {
397 void OnHasLeftOpeningTag(Converter *c) override {};
398 void OnHasLeftClosingTag(Converter *c) override {};
399 };
400
// Anchor handler. Unlike the other handlers declared in this header it keeps
// per-link state in two string members. Callback bodies are defined outside
// this header.
401 struct TagAnchor : Tag {
402 void OnHasLeftOpeningTag(Converter *c) override;
403 void OnHasLeftClosingTag(Converter *c) override;
404
// NOTE(review): presumably filled from the "href" / "title" attributes of the
// current <a> tag — confirm in html2md.cpp.
405 std::string current_href_;
406 std::string current_title_;
407 };
408
// Stateless concrete Tag handlers. Each one only declares the two callback
// overrides; the bodies are defined outside this header. Which HTML tag name
// is bound to which handler is decided where tags_ is populated (not visible
// in this listing).

// Inline text formatting
409 struct TagBold : Tag {
410 void OnHasLeftOpeningTag(Converter *c) override;
411 void OnHasLeftClosingTag(Converter *c) override;
412 };
413
414 struct TagItalic : Tag {
415 void OnHasLeftOpeningTag(Converter *c) override;
416 void OnHasLeftClosingTag(Converter *c) override;
417 };
418
419 struct TagUnderline : Tag {
420 void OnHasLeftOpeningTag(Converter *c) override;
421 void OnHasLeftClosingTag(Converter *c) override;
422 };
423
// NOTE(review): "Strikethrought" is a misspelling of "Strikethrough".
424 struct TagStrikethrought : Tag {
425 void OnHasLeftOpeningTag(Converter *c) override;
426 void OnHasLeftClosingTag(Converter *c) override;
427 };
428
429 struct TagBreak : Tag {
430 void OnHasLeftOpeningTag(Converter *c) override;
431 void OnHasLeftClosingTag(Converter *c) override;
432 };
433
434 struct TagDiv : Tag {
435 void OnHasLeftOpeningTag(Converter *c) override;
436 void OnHasLeftClosingTag(Converter *c) override;
437 };
438
// Headings, one handler type per level
439 struct TagHeader1 : Tag {
440 void OnHasLeftOpeningTag(Converter *c) override;
441 void OnHasLeftClosingTag(Converter *c) override;
442 };
443
444 struct TagHeader2 : Tag {
445 void OnHasLeftOpeningTag(Converter *c) override;
446 void OnHasLeftClosingTag(Converter *c) override;
447 };
448
449 struct TagHeader3 : Tag {
450 void OnHasLeftOpeningTag(Converter *c) override;
451 void OnHasLeftClosingTag(Converter *c) override;
452 };
453
454 struct TagHeader4 : Tag {
455 void OnHasLeftOpeningTag(Converter *c) override;
456 void OnHasLeftClosingTag(Converter *c) override;
457 };
458
459 struct TagHeader5 : Tag {
460 void OnHasLeftOpeningTag(Converter *c) override;
461 void OnHasLeftClosingTag(Converter *c) override;
462 };
463
464 struct TagHeader6 : Tag {
465 void OnHasLeftOpeningTag(Converter *c) override;
466 void OnHasLeftClosingTag(Converter *c) override;
467 };
468
// Lists, paragraphs and preformatted / code content
469 struct TagListItem : Tag {
470 void OnHasLeftOpeningTag(Converter *c) override;
471 void OnHasLeftClosingTag(Converter *c) override;
472 };
473
474 struct TagOption : Tag {
475 void OnHasLeftOpeningTag(Converter *c) override;
476 void OnHasLeftClosingTag(Converter *c) override;
477 };
478
479 struct TagOrderedList : Tag {
480 void OnHasLeftOpeningTag(Converter *c) override;
481 void OnHasLeftClosingTag(Converter *c) override;
482 };
483
484 struct TagParagraph : Tag {
485 void OnHasLeftOpeningTag(Converter *c) override;
486 void OnHasLeftClosingTag(Converter *c) override;
487 };
488
489 struct TagPre : Tag {
490 void OnHasLeftOpeningTag(Converter *c) override;
491 void OnHasLeftClosingTag(Converter *c) override;
492 };
493
494 struct TagCode : Tag {
495 void OnHasLeftOpeningTag(Converter *c) override;
496 void OnHasLeftClosingTag(Converter *c) override;
497 };
498
499 struct TagSpan : Tag {
500 void OnHasLeftOpeningTag(Converter *c) override;
501 void OnHasLeftClosingTag(Converter *c) override;
502 };
503
504 struct TagTitle : Tag {
505 void OnHasLeftOpeningTag(Converter *c) override;
506 void OnHasLeftClosingTag(Converter *c) override;
507 };
508
509 struct TagUnorderedList : Tag {
510 void OnHasLeftOpeningTag(Converter *c) override;
511 void OnHasLeftClosingTag(Converter *c) override;
512 };
513
514 struct TagImage : Tag {
515 void OnHasLeftOpeningTag(Converter *c) override;
516 void OnHasLeftClosingTag(Converter *c) override;
517 };
518
// NOTE(review): "Seperator" is a misspelling of "Separator".
519 struct TagSeperator : Tag {
520 void OnHasLeftOpeningTag(Converter *c) override;
521 void OnHasLeftClosingTag(Converter *c) override;
522 };
523
// Tables
524 struct TagTable : Tag {
525 void OnHasLeftOpeningTag(Converter *c) override;
526 void OnHasLeftClosingTag(Converter *c) override;
527 };
528
529 struct TagTableRow : Tag {
530 void OnHasLeftOpeningTag(Converter *c) override;
531 void OnHasLeftClosingTag(Converter *c) override;
532 };
533
534 struct TagTableHeader : Tag {
535 void OnHasLeftOpeningTag(Converter *c) override;
536 void OnHasLeftClosingTag(Converter *c) override;
537 };
538
539 struct TagTableData : Tag {
540 void OnHasLeftOpeningTag(Converter *c) override;
541 void OnHasLeftClosingTag(Converter *c) override;
542 };
543
544 struct TagBlockquote : Tag {
545 void OnHasLeftOpeningTag(Converter *c) override;
546 void OnHasLeftClosingTag(Converter *c) override;
547 };
548
// Tag handlers keyed by a string, each held through a shared_ptr to the Tag
// base. NOTE(review): the key is presumably the HTML tag name — confirm where
// the map is filled.
549 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
550
// Private constructor taking the HTML by pointer; the public constructor
// passes the address of its `html` argument and its `options` pointer here.
551 explicit Converter(const std::string *html, struct Options *options);
552
553 void CleanUpMarkdown();
554
555 // Trim from start (in place)
556 static void LTrim(std::string *s);
557
558 // Trim from end (in place)
// `trim_only_blank` defaults to false. NOTE(review): its exact effect is not
// visible in this header — confirm in html2md.cpp.
559 Converter *RTrim(std::string *s, bool trim_only_blank = false);
560
561 // Trim from both ends (in place)
562 Converter *Trim(std::string *s);
563
564 // 1. trim all lines
565 // 2. reduce consecutive newlines to maximum 3
566 void TidyAllLines(std::string *str);
567
// Takes an attribute name and returns a string. NOTE(review): presumably the
// attribute's value read from the tag left of the current position — confirm.
568 std::string ExtractAttributeFromTagLeftOf(const std::string &attr);
569
570 void TurnLineIntoHeader1();
571
572 void TurnLineIntoHeader2();
573
574 // Current char: '<'
575 void OnHasEnteredTag();
576
577 Converter *UpdatePrevChFromMd();
578
// Handles one character while inside a tag. NOTE(review): the meaning of the
// bool result is not visible in this header.
585 bool ParseCharInTag(char ch);
586
587 // Current char: '>'
588 bool OnHasLeftTag();
589
// Reports whether the raw tag text contains one of the known "hidden" markers.
//
// @param tag  Tag text to scan. A null pointer is treated as "nothing to hide"
//             instead of being dereferenced.
// @return true if at least one marker occurs anywhere in *tag.
//
// NOTE(review): the first marker is the literal ` aria="hidden"`; the standard
// ARIA attribute is spelled `aria-hidden="true"`. The literal is kept as-is —
// confirm whether that is intended.
inline static bool TagContainsAttributesToHide(std::string *tag) {
  if (tag == nullptr) {
    return false;
  }

  const char *const kHiddenMarkers[] = {
      " aria=\"hidden\"",
      "display:none",
      "visibility:hidden",
      "opacity:0",
      "Details-content--hidden-not-important",
  };

  for (const char *marker : kHiddenMarkers) {
    if (tag->find(marker) != std::string::npos) {
      return true;
    }
  }
  return false;
}
599
// `chars` defaults to 1. NOTE(review): presumably removes that many trailing
// characters from the Markdown output — confirm in html2md.cpp.
600 Converter *ShortenMarkdown(size_t chars = 1);
601 inline bool shortIfPrevCh(char prev) {
602 if (prev_ch_in_md_ == prev) {
603 ShortenMarkdown();
604 return true;
605 }
606 return false;
607 };
608
// Handles one character of tag content. NOTE(review): the meaning of the bool
// result is not visible in this header.
613 bool ParseCharInTagContent(char ch);
614
615 // Replace previous space (if any) in current markdown line by newline
616 bool ReplacePreviousSpaceInLineByNewline();
617
618 static inline bool IsIgnoredTag(const std::string &tag) {
619 return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
620 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
621
622 // meta: not ignored to tolerate if closing is omitted
623 }
624
// Const query; the result must not be discarded. NOTE(review): presumably
// reports whether the parser is currently inside a tag accepted by
// IsIgnoredTag() — confirm in html2md.cpp.
625 [[nodiscard]] bool IsInIgnoredTag() const;
626}; // Converter
627
634inline std::string Convert(const std::string &html, bool *ok = nullptr) {
635 Converter c(html);
636 auto md = c.convert();
637 if (ok != nullptr)
638 *ok = c.ok();
639 return md;
640}
641
642#ifndef PYTHON_BINDINGS
643inline std::string Convert(const std::string &&html, bool *ok = nullptr) {
644 return Convert(html, ok);
645}
646#endif
647
648} // namespace html2md
649
650#endif // HTML2MD_H
Class for converting HTML to Markdown.
Definition html2md.h:159
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:393
void addHtmlSymbolConversion(const std::string &htmlSymbol, const std::string &replacement)
Add an HTML symbol conversion.
Definition html2md.h:232
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:197
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Definition html2md.h:270
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:239
bool ok() const
Checks if everything was closed properly (in the HTML).
Definition html2md.cpp:249
void removeHtmlSymbolConversion(const std::string &htmlSymbol)
Remove an HTML symbol conversion.
Definition html2md.h:243
bool operator==(const Converter &c) const
Definition html2md.h:272
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:173
void reset()
Reset the generated Markdown.
Definition html2md.cpp:1039
void clearHtmlSymbolConversions()
Clear all HTML symbol conversions.
Definition html2md.h:251
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
Definition html2md.h:206
html2md namespace
Definition html2md.h:21
std::string Convert(const std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Definition html2md.h:634
Options for the conversion from HTML to Markdown.
Definition html2md.h:38
bool operator==(html2md::Options o) const
Definition html2md.h:109
int softBreak
softBreak Wrap after ... characters when the next space is reached and as long as it's not in a list,...
Definition html2md.h:51
int hardBreak
hardBreak Force a break after ... characters in a line
Definition html2md.h:56
char orderedList
The char used after the number of the item.
Definition html2md.h:90
bool formatTable
Whetever to format Markdown Tables.
Definition html2md.h:107
bool splitLines
Add new line when a certain number of characters is reached.
Definition html2md.h:45
char unorderedList
The char used for unordered lists.
Definition html2md.h:74
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.
Definition html2md.h:99