html2md  v1.6.0
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.h
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#ifndef HTML2MD_H
5#define HTML2MD_H
6
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
10
21namespace html2md {
22
/// Options for the conversion from HTML to Markdown.
struct Options {
  /// Add new line when a certain number of characters is reached.
  bool splitLines = true;

  /// Wrap after this many characters when the next space is reached
  /// (and as long as it's not in a list, ...).
  int softBreak = 80;

  /// Force a break after this many characters in a line.
  int hardBreak = 100;

  /// The char used for unordered lists.
  char unorderedList = '-';

  /// The char used after the number of the item in ordered lists.
  char orderedList = '.';

  /// Whether title is added as h1 heading at the very beginning of the
  /// markdown.
  bool includeTitle = true;

  /// Whether to format Markdown tables.
  bool formatTable = true;

  /// Two option sets compare equal when every field matches.
  ///
  /// NOTE(review): the comparison was truncated in the listing this block was
  /// taken from; it is rebuilt here to cover all seven fields. Confirm that
  /// including `formatTable` matches the intended semantics.
  ///
  /// @param o the options to compare against
  /// @return true if all fields are equal
  bool operator==(const Options &o) const {
    return splitLines == o.splitLines && unorderedList == o.unorderedList &&
           orderedList == o.orderedList && includeTitle == o.includeTitle &&
           softBreak == o.softBreak && hardBreak == o.hardBreak &&
           formatTable == o.formatTable;
  }
};
115
160public:
173 explicit inline Converter(std::string &html,
174 struct Options *options = nullptr) {
175 *this = Converter(&html, options);
176 }
177
// Convert HTML into Markdown.
// Returns the result as a std::string; [[nodiscard]] makes the compiler warn
// when a caller drops it.
185 [[nodiscard]] std::string convert();
186
// Append a char to the Markdown.
// Returns a Converter pointer (presumably `this`, to allow call chaining;
// confirm against the definition in html2md.cpp).
192 Converter *appendToMd(char ch);
193
// Overload of appendToMd taking a C string.
// NOTE(review): presumably expects a non-null, NUL-terminated string; the
// definition is not visible here.
199 Converter *appendToMd(const char *str);
200
206 inline Converter *appendToMd(const std::string &s) {
207 return appendToMd(s.c_str());
208 }
209
219
// Checks if everything was closed properly (in the HTML).
// Const: does not modify the converter. [[nodiscard]] warns when the result
// is ignored.
225 [[nodiscard]] bool ok() const;
226
// Reset the generated Markdown.
230 void reset();
231
237 inline bool operator==(const Converter *c) const { return *this == *c; }
238
239 inline bool operator==(const Converter &c) const {
240 return html_ == c.html_ && option == c.option;
241 }
242
246 inline explicit operator bool() const { return ok(); };
247
248private:
// String constants for the HTML attribute and tag names used by the
// converter. All are compile-time `const char *` literals.
249 // Attributes
250 static constexpr const char *kAttributeHref = "href";
251 static constexpr const char *kAttributeAlt = "alt";
252 static constexpr const char *kAttributeTitle = "title";
253 static constexpr const char *kAttributeClass = "class";
254 static constexpr const char *kAttributeSrc = "src";
// NOTE(review): identifier is misspelled ("Attrinute"); left untouched because
// code outside this view may reference it.
255 static constexpr const char *kAttrinuteAlign = "align";
256
// Structural tags
257 static constexpr const char *kTagAnchor = "a";
258 static constexpr const char *kTagBreak = "br";
259 static constexpr const char *kTagCode = "code";
260 static constexpr const char *kTagDiv = "div";
261 static constexpr const char *kTagHead = "head";
262 static constexpr const char *kTagLink = "link";
263 static constexpr const char *kTagListItem = "li";
264 static constexpr const char *kTagMeta = "meta";
265 static constexpr const char *kTagNav = "nav";
266 static constexpr const char *kTagNoScript = "noscript";
267 static constexpr const char *kTagOption = "option";
268 static constexpr const char *kTagOrderedList = "ol";
269 static constexpr const char *kTagParagraph = "p";
270 static constexpr const char *kTagPre = "pre";
271 static constexpr const char *kTagScript = "script";
272 static constexpr const char *kTagSpan = "span";
273 static constexpr const char *kTagStyle = "style";
274 static constexpr const char *kTagTemplate = "template";
275 static constexpr const char *kTagTitle = "title";
276 static constexpr const char *kTagUnorderedList = "ul";
277 static constexpr const char *kTagImg = "img";
// NOTE(review): "Seperator" is a misspelling of "Separator"; kept as-is for
// the same reason as above.
278 static constexpr const char *kTagSeperator = "hr";
279
280 // Text format
281 static constexpr const char *kTagBold = "b";
282 static constexpr const char *kTagStrong = "strong";
283 static constexpr const char *kTagItalic = "em";
284 static constexpr const char *kTagItalic2 = "i";
285 static constexpr const char *kTagCitation = "cite";
286 static constexpr const char *kTagDefinition = "dfn";
287 static constexpr const char *kTagUnderline = "u";
// NOTE(review): "Strighthrought" is a misspelling of "Strikethrough"; kept
// as-is because the names may be referenced outside this view.
288 static constexpr const char *kTagStrighthrought = "del";
289 static constexpr const char *kTagStrighthrought2 = "s";
290
291 static constexpr const char *kTagBlockquote = "blockquote";
292
293 // Header
294 static constexpr const char *kTagHeader1 = "h1";
295 static constexpr const char *kTagHeader2 = "h2";
296 static constexpr const char *kTagHeader3 = "h3";
297 static constexpr const char *kTagHeader4 = "h4";
298 static constexpr const char *kTagHeader5 = "h5";
299 static constexpr const char *kTagHeader6 = "h6";
300
301 // Table
302 static constexpr const char *kTagTable = "table";
303 static constexpr const char *kTagTableRow = "tr";
304 static constexpr const char *kTagTableHeader = "th";
305 static constexpr const char *kTagTableData = "td";
306
// Converter state. Every scalar member has an in-class initializer; the
// std::string members are default-constructed (empty). The code that reads
// and writes these members is outside this view, so the notes below are
// limited to what the declarations themselves show.
// NOTE(review): presumably the index of the current character in html_ —
// confirm in html2md.cpp.
307 size_t index_ch_in_html_ = 0;
308
// Boolean context flags, all initially false.
309 bool is_closing_tag_ = false;
310 bool is_in_attribute_value_ = false;
311 bool is_in_code_ = false;
312 bool is_in_list_ = false;
313 bool is_in_p_ = false;
314 bool is_in_pre_ = false;
315 bool is_in_table_ = false;
316 bool is_in_table_row_ = false;
317 bool is_in_tag_ = false;
318 bool is_self_closing_tag_ = false;
319
320 // relevant for <li> only, false = is in unordered list
321 bool is_in_ordered_list_ = false;
// NOTE(review): 8-bit counter; if this is the running item number of an
// ordered list it wraps after 255 — confirm that is acceptable.
322 uint8_t index_ol = 0;
323
324 // store the table start
325 size_t table_start = 0;
326
327 // number of lists
328 uint8_t index_li = 0;
329
// NOTE(review): presumably the current blockquote nesting depth — confirm.
330 uint8_t index_blockquote = 0;
331
// Previous characters; note that prev_ch_in_html_ starts as 'x' rather
// than 0.
332 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
333 char prev_ch_in_html_ = 'x';
334
// The HTML text held by the converter; compared in operator==.
335 std::string html_;
336
// NOTE(review): 16-bit value; presumably an offset related to '<' — confirm
// that it cannot exceed 65535.
337 uint16_t offset_lt_ = 0;
338 std::string current_tag_;
339 std::string prev_tag_;
340
341 // Line which separates header from data
342 std::string tableLine;
343
344 size_t chars_in_curr_line_ = 0;
345
// NOTE(review): presumably the Markdown output buffer — confirm.
346 std::string md_;
347
// Conversion options held by value; compared in operator==.
348 Options option;
349
350 // Tag: base class for tag types
351 struct Tag {
352 virtual void OnHasLeftOpeningTag(Converter *c) = 0;
353 virtual void OnHasLeftClosingTag(Converter *c) = 0;
354 };
355
356 // Tag types
357
358 // tags that are not printed (nav, script, noscript, ...)
359 struct TagIgnored : Tag {
360 void OnHasLeftOpeningTag(Converter *c) override{};
361 void OnHasLeftClosingTag(Converter *c) override{};
362 };
363
// Tag handler named for anchors. Unlike the other handlers it carries state:
// two strings, presumably the href and title of the anchor being processed —
// confirm against the definitions in html2md.cpp.
364 struct TagAnchor : Tag {
365 void OnHasLeftOpeningTag(Converter *c) override;
366 void OnHasLeftClosingTag(Converter *c) override;
367
368 std::string current_href_;
369 std::string current_title_;
370 };
371
// Stateless tag handlers. Each struct derives from Tag and declares overrides
// of both callbacks; the definitions are not part of this header listing.
// The struct names mirror the kTag* constants; how each handler is registered
// is decided outside this view.

// Text formatting
372 struct TagBold : Tag {
373 void OnHasLeftOpeningTag(Converter *c) override;
374 void OnHasLeftClosingTag(Converter *c) override;
375 };
376
377 struct TagItalic : Tag {
378 void OnHasLeftOpeningTag(Converter *c) override;
379 void OnHasLeftClosingTag(Converter *c) override;
380 };
381
382 struct TagUnderline : Tag {
383 void OnHasLeftOpeningTag(Converter *c) override;
384 void OnHasLeftClosingTag(Converter *c) override;
385 };
386
// NOTE(review): "Strikethrought" is a misspelling of "Strikethrough"; kept
// as-is because the name is referenced outside this view.
387 struct TagStrikethrought : Tag {
388 void OnHasLeftOpeningTag(Converter *c) override;
389 void OnHasLeftClosingTag(Converter *c) override;
390 };
391
// Structure
392 struct TagBreak : Tag {
393 void OnHasLeftOpeningTag(Converter *c) override;
394 void OnHasLeftClosingTag(Converter *c) override;
395 };
396
397 struct TagDiv : Tag {
398 void OnHasLeftOpeningTag(Converter *c) override;
399 void OnHasLeftClosingTag(Converter *c) override;
400 };
401
// Headings: one handler type per level
402 struct TagHeader1 : Tag {
403 void OnHasLeftOpeningTag(Converter *c) override;
404 void OnHasLeftClosingTag(Converter *c) override;
405 };
406
407 struct TagHeader2 : Tag {
408 void OnHasLeftOpeningTag(Converter *c) override;
409 void OnHasLeftClosingTag(Converter *c) override;
410 };
411
412 struct TagHeader3 : Tag {
413 void OnHasLeftOpeningTag(Converter *c) override;
414 void OnHasLeftClosingTag(Converter *c) override;
415 };
416
417 struct TagHeader4 : Tag {
418 void OnHasLeftOpeningTag(Converter *c) override;
419 void OnHasLeftClosingTag(Converter *c) override;
420 };
421
422 struct TagHeader5 : Tag {
423 void OnHasLeftOpeningTag(Converter *c) override;
424 void OnHasLeftClosingTag(Converter *c) override;
425 };
426
427 struct TagHeader6 : Tag {
428 void OnHasLeftOpeningTag(Converter *c) override;
429 void OnHasLeftClosingTag(Converter *c) override;
430 };
431
// Lists and block content
432 struct TagListItem : Tag {
433 void OnHasLeftOpeningTag(Converter *c) override;
434 void OnHasLeftClosingTag(Converter *c) override;
435 };
436
437 struct TagOption : Tag {
438 void OnHasLeftOpeningTag(Converter *c) override;
439 void OnHasLeftClosingTag(Converter *c) override;
440 };
441
442 struct TagOrderedList : Tag {
443 void OnHasLeftOpeningTag(Converter *c) override;
444 void OnHasLeftClosingTag(Converter *c) override;
445 };
446
447 struct TagParagraph : Tag {
448 void OnHasLeftOpeningTag(Converter *c) override;
449 void OnHasLeftClosingTag(Converter *c) override;
450 };
451
452 struct TagPre : Tag {
453 void OnHasLeftOpeningTag(Converter *c) override;
454 void OnHasLeftClosingTag(Converter *c) override;
455 };
456
457 struct TagCode : Tag {
458 void OnHasLeftOpeningTag(Converter *c) override;
459 void OnHasLeftClosingTag(Converter *c) override;
460 };
461
462 struct TagSpan : Tag {
463 void OnHasLeftOpeningTag(Converter *c) override;
464 void OnHasLeftClosingTag(Converter *c) override;
465 };
466
467 struct TagTitle : Tag {
468 void OnHasLeftOpeningTag(Converter *c) override;
469 void OnHasLeftClosingTag(Converter *c) override;
470 };
471
472 struct TagUnorderedList : Tag {
473 void OnHasLeftOpeningTag(Converter *c) override;
474 void OnHasLeftClosingTag(Converter *c) override;
475 };
476
477 struct TagImage : Tag {
478 void OnHasLeftOpeningTag(Converter *c) override;
479 void OnHasLeftClosingTag(Converter *c) override;
480 };
481
// NOTE(review): "Seperator" is a misspelling of "Separator"; kept as-is.
482 struct TagSeperator : Tag {
483 void OnHasLeftOpeningTag(Converter *c) override;
484 void OnHasLeftClosingTag(Converter *c) override;
485 };
486
// Tables
487 struct TagTable : Tag {
488 void OnHasLeftOpeningTag(Converter *c) override;
489 void OnHasLeftClosingTag(Converter *c) override;
490 };
491
492 struct TagTableRow : Tag {
493 void OnHasLeftOpeningTag(Converter *c) override;
494 void OnHasLeftClosingTag(Converter *c) override;
495 };
496
497 struct TagTableHeader : Tag {
498 void OnHasLeftOpeningTag(Converter *c) override;
499 void OnHasLeftClosingTag(Converter *c) override;
500 };
501
502 struct TagTableData : Tag {
503 void OnHasLeftOpeningTag(Converter *c) override;
504 void OnHasLeftClosingTag(Converter *c) override;
505 };
506
507 struct TagBlockquote : Tag {
508 void OnHasLeftOpeningTag(Converter *c) override;
509 void OnHasLeftClosingTag(Converter *c) override;
510 };
511
// Private helper declarations. All definitions are outside this listing, so
// the notes below only restate the signatures and the existing comments.

// Maps a string key to a shared Tag handler (presumably keyed by tag name —
// confirm where the map is filled in html2md.cpp).
512 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
513
// Private constructor taking the HTML and the options by pointer; the public
// reference-taking constructor forwards to it.
514 explicit Converter(std::string *html, struct Options *options);
515
516 void CleanUpMarkdown();
517
518 // Trim from start (in place)
519 static void LTrim(std::string *s);
520
521 // Trim from end (in place)
// Non-static, unlike LTrim, and returns a Converter pointer.
// NOTE(review): `trim_only_blank` presumably restricts trimming to blanks —
// confirm in the definition.
522 Converter *RTrim(std::string *s, bool trim_only_blank = false);
523
524 // Trim from both ends (in place)
525 Converter *Trim(std::string *s);
526
527 // 1. trim all lines
528 // 2. reduce consecutive newlines to maximum 3
529 void TidyAllLines(std::string *str);
530
// Returns an attribute value as a string.
// NOTE(review): judging by the name, it is read from the tag to the left of
// the current position — confirm in the definition.
531 std::string ExtractAttributeFromTagLeftOf(const std::string &attr);
532
533 void TurnLineIntoHeader1();
534
535 void TurnLineIntoHeader2();
536
537 // Current char: '<'
538 void OnHasEnteredTag();
539
540 Converter *UpdatePrevChFromMd();
541
// Handles one character while inside a tag.
// NOTE(review): the meaning of the bool result is not visible here.
548 bool ParseCharInTag(char ch);
549
550 // Current char: '>'
551 bool OnHasLeftTag();
552
// Reports whether the given tag text contains any of a fixed set of
// substrings that mark an element as hidden.
//
// The needles are tried in order and the search stops at the first hit.
// `tag` must not be null; it is dereferenced unconditionally.
//
// NOTE(review): the first needle is ` aria="hidden"`, which does not match
// the standard `aria-hidden="true"` attribute — confirm whether that is
// intended.
inline static bool TagContainsAttributesToHide(std::string *tag) {
  static constexpr const char *kHiddenMarkers[] = {
      " aria=\"hidden\"",
      "display:none",
      "visibility:hidden",
      "opacity:0",
      "Details-content--hidden-not-important",
  };

  const std::string &text = *tag;
  for (const char *marker : kHiddenMarkers) {
    if (text.find(marker) != std::string::npos) {
      return true;
    }
  }
  return false;
}
562
563 Converter *ShortenMarkdown(size_t chars = 1);
564 inline bool shortIfPrevCh(char prev) {
565 if (prev_ch_in_md_ == prev) {
566 ShortenMarkdown();
567 return true;
568 }
569 return false;
570 };
571
// Handles one character of tag content.
// NOTE(review): the meaning of the bool result is not visible in this header.
576 bool ParseCharInTagContent(char ch);
577
578 // Replace previous space (if any) in current markdown line by newline
// NOTE(review): the bool result presumably tells whether a replacement took
// place — confirm in the definition.
579 bool ReplacePreviousSpaceInLineByNewline();
580
581 static inline bool IsIgnoredTag(const std::string &tag) {
582 return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
583 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
584
585 // meta: not ignored to tolerate if closing is omitted
586 }
587
// Const query; [[nodiscard]] warns when the result is dropped.
// NOTE(review): presumably reports whether the parser is currently inside a
// tag accepted by IsIgnoredTag() — confirm against the definition.
588 [[nodiscard]] bool IsInIgnoredTag() const;
589}; // Converter
590
597inline std::string Convert(std::string &html, bool *ok = nullptr) {
598 Converter c(html);
599 auto md = c.convert();
600 if (ok != nullptr)
601 *ok = c.ok();
602 return md;
603}
604
605#ifndef PYTHON_BINDINGS
606inline std::string Convert(std::string &&html, bool *ok = nullptr) {
607 return Convert(html, ok);
608}
609#endif
610
611} // namespace html2md
612
613#endif // HTML2MD_H
Class for converting HTML to Markdown.
Definition html2md.h:159
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:361
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:165
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Definition html2md.h:237
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:208
bool ok() const
Checks if everything was closed properly (in the HTML).
Definition html2md.cpp:218
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:173
bool operator==(const Converter &c) const
Definition html2md.h:239
void reset()
Reset the generated Markdown.
Definition html2md.cpp:971
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
Definition html2md.h:206
html2md namespace
Definition html2md.h:21
std::string Convert(std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Definition html2md.h:597
Options for the conversion from HTML to Markdown.
Definition html2md.h:38
bool operator==(html2md::Options o) const
Definition html2md.h:109
int softBreak
softBreak Wrap after ... characters when the next space is reached and as long as it's not in a list,...
Definition html2md.h:51
int hardBreak
hardBreak Force a break after ... characters in a line
Definition html2md.h:56
char orderedList
The char used after the number of the item.
Definition html2md.h:90
bool formatTable
Whetever to format Markdown Tables.
Definition html2md.h:107
bool splitLines
Add new line when a certain number of characters is reached.
Definition html2md.h:45
char unorderedList
The char used for unordered lists.
Definition html2md.h:74
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.
Definition html2md.h:99