html2md  v1.5.4
Simple and fast HTML to Markdown converter
Loading...
Searching...
No Matches
html2md.h
Go to the documentation of this file.
1// Copyright (c) Tim Gromeyer
2// Licensed under the MIT License - https://opensource.org/licenses/MIT
3
4#ifndef HTML2MD_H
5#define HTML2MD_H
6
7#include <map>
8#include <memory>
9#include <string>
10#include <unordered_map>
11#include <vector>
12
23namespace html2md {
24
40struct Options {
44 bool splitLines = true;
45
62 char unorderedList = '-';
63
78 char orderedList = '.';
79
87 bool includeTitle = true;
88
95 bool formatTable = true;
96
97 inline bool operator==(html2md::Options o) const {
100 };
101};
102
147public:
160 explicit inline Converter(std::string &html,
161 struct Options *options = nullptr) {
162 *this = Converter(&html, options);
163 }
164
172 [[nodiscard]] std::string convert();
173
179 Converter *appendToMd(char ch);
180
186 Converter *appendToMd(const char *str);
187
193 inline Converter *appendToMd(const std::string &s) {
194 return appendToMd(s.c_str());
195 }
196
206
212 [[nodiscard]] bool ok() const;
213
217 void reset();
218
224 inline bool operator==(const Converter *c) const { return *this == *c; }
225
226 inline bool operator==(const Converter &c) const {
227 return html_ == c.html_ && option == c.option;
228 }
229
233 inline explicit operator bool() const { return ok(); };
234
235private:
236 // Attributes
237 static constexpr const char *kAttributeHref = "href";
238 static constexpr const char *kAttributeAlt = "alt";
239 static constexpr const char *kAttributeTitle = "title";
240 static constexpr const char *kAttributeClass = "class";
241 static constexpr const char *kAttributeSrc = "src";
242 static constexpr const char *kAttrinuteAlign = "align";
243
244 static constexpr const char *kTagAnchor = "a";
245 static constexpr const char *kTagBreak = "br";
246 static constexpr const char *kTagCode = "code";
247 static constexpr const char *kTagDiv = "div";
248 static constexpr const char *kTagHead = "head";
249 static constexpr const char *kTagLink = "link";
250 static constexpr const char *kTagListItem = "li";
251 static constexpr const char *kTagMeta = "meta";
252 static constexpr const char *kTagNav = "nav";
253 static constexpr const char *kTagNoScript = "noscript";
254 static constexpr const char *kTagOption = "option";
255 static constexpr const char *kTagOrderedList = "ol";
256 static constexpr const char *kTagParagraph = "p";
257 static constexpr const char *kTagPre = "pre";
258 static constexpr const char *kTagScript = "script";
259 static constexpr const char *kTagSpan = "span";
260 static constexpr const char *kTagStyle = "style";
261 static constexpr const char *kTagTemplate = "template";
262 static constexpr const char *kTagTitle = "title";
263 static constexpr const char *kTagUnorderedList = "ul";
264 static constexpr const char *kTagImg = "img";
265 static constexpr const char *kTagSeperator = "hr";
266
267 // Text format
268 static constexpr const char *kTagBold = "b";
269 static constexpr const char *kTagStrong = "strong";
270 static constexpr const char *kTagItalic = "em";
271 static constexpr const char *kTagItalic2 = "i";
272 static constexpr const char *kTagCitation = "cite";
273 static constexpr const char *kTagDefinition = "dfn";
274 static constexpr const char *kTagUnderline = "u";
275 static constexpr const char *kTagStrighthrought = "del";
276 static constexpr const char *kTagStrighthrought2 = "s";
277
278 static constexpr const char *kTagBlockquote = "blockquote";
279
280 // Header
281 static constexpr const char *kTagHeader1 = "h1";
282 static constexpr const char *kTagHeader2 = "h2";
283 static constexpr const char *kTagHeader3 = "h3";
284 static constexpr const char *kTagHeader4 = "h4";
285 static constexpr const char *kTagHeader5 = "h5";
286 static constexpr const char *kTagHeader6 = "h6";
287
288 // Table
289 static constexpr const char *kTagTable = "table";
290 static constexpr const char *kTagTableRow = "tr";
291 static constexpr const char *kTagTableHeader = "th";
292 static constexpr const char *kTagTableData = "td";
293
294 size_t index_ch_in_html_ = 0;
295
296 bool is_closing_tag_ = false;
297 bool is_in_attribute_value_ = false;
298 bool is_in_code_ = false;
299 bool is_in_list_ = false;
300 bool is_in_p_ = false;
301 bool is_in_pre_ = false;
302 bool is_in_table_ = false;
303 bool is_in_table_row_ = false;
304 bool is_in_tag_ = false;
305
306 // relevant for <li> only, false = is in unordered list
307 bool is_in_ordered_list_ = false;
308 uint8_t index_ol = 0;
309
310 // store the table start
311 size_t table_start = 0;
312
313 // number of lists
314 uint8_t index_li = 0;
315
316 uint8_t index_blockquote = 0;
317
318 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
319 char prev_ch_in_html_ = 'x';
320
321 std::string html_;
322
323 uint16_t offset_lt_ = 0;
324 std::string current_tag_;
325 std::string prev_tag_;
326
327 // Line which separates header from data
328 std::string tableLine;
329
330 size_t chars_in_curr_line_ = 0;
331
332 std::string md_;
333
334 Options option;
335
336 // Tag: base class for tag types
337 struct Tag {
338 virtual void OnHasLeftOpeningTag(Converter *c) = 0;
339 virtual void OnHasLeftClosingTag(Converter *c) = 0;
340 };
341
342 // Tag types
343
344 // tags that are not printed (nav, script, noscript, ...)
345 struct TagIgnored : Tag {
346 void OnHasLeftOpeningTag(Converter *c) override{};
347 void OnHasLeftClosingTag(Converter *c) override{};
348 };
349
350 struct TagAnchor : Tag {
351 void OnHasLeftOpeningTag(Converter *c) override;
352 void OnHasLeftClosingTag(Converter *c) override;
353
354 std::string current_href_;
355 std::string current_title_;
356 };
357
358 struct TagBold : Tag {
359 void OnHasLeftOpeningTag(Converter *c) override;
360 void OnHasLeftClosingTag(Converter *c) override;
361 };
362
363 struct TagItalic : Tag {
364 void OnHasLeftOpeningTag(Converter *c) override;
365 void OnHasLeftClosingTag(Converter *c) override;
366 };
367
368 struct TagUnderline : Tag {
369 void OnHasLeftOpeningTag(Converter *c) override;
370 void OnHasLeftClosingTag(Converter *c) override;
371 };
372
373 struct TagStrikethrought : Tag {
374 void OnHasLeftOpeningTag(Converter *c) override;
375 void OnHasLeftClosingTag(Converter *c) override;
376 };
377
378 struct TagBreak : Tag {
379 void OnHasLeftOpeningTag(Converter *c) override;
380 void OnHasLeftClosingTag(Converter *c) override;
381 };
382
383 struct TagDiv : Tag {
384 void OnHasLeftOpeningTag(Converter *c) override;
385 void OnHasLeftClosingTag(Converter *c) override;
386 };
387
388 struct TagHeader1 : Tag {
389 void OnHasLeftOpeningTag(Converter *c) override;
390 void OnHasLeftClosingTag(Converter *c) override;
391 };
392
393 struct TagHeader2 : Tag {
394 void OnHasLeftOpeningTag(Converter *c) override;
395 void OnHasLeftClosingTag(Converter *c) override;
396 };
397
398 struct TagHeader3 : Tag {
399 void OnHasLeftOpeningTag(Converter *c) override;
400 void OnHasLeftClosingTag(Converter *c) override;
401 };
402
403 struct TagHeader4 : Tag {
404 void OnHasLeftOpeningTag(Converter *c) override;
405 void OnHasLeftClosingTag(Converter *c) override;
406 };
407
408 struct TagHeader5 : Tag {
409 void OnHasLeftOpeningTag(Converter *c) override;
410 void OnHasLeftClosingTag(Converter *c) override;
411 };
412
413 struct TagHeader6 : Tag {
414 void OnHasLeftOpeningTag(Converter *c) override;
415 void OnHasLeftClosingTag(Converter *c) override;
416 };
417
418 struct TagListItem : Tag {
419 void OnHasLeftOpeningTag(Converter *c) override;
420 void OnHasLeftClosingTag(Converter *c) override;
421 };
422
423 struct TagOption : Tag {
424 void OnHasLeftOpeningTag(Converter *c) override;
425 void OnHasLeftClosingTag(Converter *c) override;
426 };
427
428 struct TagOrderedList : Tag {
429 void OnHasLeftOpeningTag(Converter *c) override;
430 void OnHasLeftClosingTag(Converter *c) override;
431 };
432
433 struct TagParagraph : Tag {
434 void OnHasLeftOpeningTag(Converter *c) override;
435 void OnHasLeftClosingTag(Converter *c) override;
436 };
437
438 struct TagPre : Tag {
439 void OnHasLeftOpeningTag(Converter *c) override;
440 void OnHasLeftClosingTag(Converter *c) override;
441 };
442
443 struct TagCode : Tag {
444 void OnHasLeftOpeningTag(Converter *c) override;
445 void OnHasLeftClosingTag(Converter *c) override;
446 };
447
448 struct TagSpan : Tag {
449 void OnHasLeftOpeningTag(Converter *c) override;
450 void OnHasLeftClosingTag(Converter *c) override;
451 };
452
453 struct TagTitle : Tag {
454 void OnHasLeftOpeningTag(Converter *c) override;
455 void OnHasLeftClosingTag(Converter *c) override;
456 };
457
458 struct TagUnorderedList : Tag {
459 void OnHasLeftOpeningTag(Converter *c) override;
460 void OnHasLeftClosingTag(Converter *c) override;
461 };
462
463 struct TagImage : Tag {
464 void OnHasLeftOpeningTag(Converter *c) override;
465 void OnHasLeftClosingTag(Converter *c) override;
466 };
467
468 struct TagSeperator : Tag {
469 void OnHasLeftOpeningTag(Converter *c) override;
470 void OnHasLeftClosingTag(Converter *c) override;
471 };
472
473 struct TagTable : Tag {
474 void OnHasLeftOpeningTag(Converter *c) override;
475 void OnHasLeftClosingTag(Converter *c) override;
476 };
477
478 struct TagTableRow : Tag {
479 void OnHasLeftOpeningTag(Converter *c) override;
480 void OnHasLeftClosingTag(Converter *c) override;
481 };
482
483 struct TagTableHeader : Tag {
484 void OnHasLeftOpeningTag(Converter *c) override;
485 void OnHasLeftClosingTag(Converter *c) override;
486 };
487
488 struct TagTableData : Tag {
489 void OnHasLeftOpeningTag(Converter *c) override;
490 void OnHasLeftClosingTag(Converter *c) override;
491 };
492
493 struct TagBlockquote : Tag {
494 void OnHasLeftOpeningTag(Converter *c) override;
495 void OnHasLeftClosingTag(Converter *c) override;
496 };
497
498 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
499
500 explicit Converter(std::string *html, struct Options *options);
501
502 void CleanUpMarkdown();
503
504 // Trim from start (in place)
505 static void LTrim(std::string *s);
506
507 // Trim from end (in place)
508 Converter *RTrim(std::string *s, bool trim_only_blank = false);
509
510 // Trim from both ends (in place)
511 Converter *Trim(std::string *s);
512
513 // 1. trim all lines
514 // 2. reduce consecutive newlines to maximum 3
515 void TidyAllLines(std::string *str);
516
517 std::string ExtractAttributeFromTagLeftOf(const std::string &attr);
518
519 void TurnLineIntoHeader1();
520
521 void TurnLineIntoHeader2();
522
523 // Current char: '<'
524 void OnHasEnteredTag();
525
526 Converter *UpdatePrevChFromMd();
527
534 bool ParseCharInTag(char ch);
535
536 // Current char: '>'
537 bool OnHasLeftTag();
538
// Report whether the raw text of a tag contains one of the substrings this
// converter treats as a "hidden" marker.
//
// tag:     raw tag text to scan; may be nullptr.
// returns: true if any marker occurs as a substring, false otherwise
//          (including when `tag` is nullptr).
inline static bool TagContainsAttributesToHide(std::string *tag) {
  // Nothing to scan; also avoids dereferencing a null pointer.
  if (tag == nullptr) {
    return false;
  }

  // NOTE(review): the first marker is ` aria="hidden"`; the standard ARIA
  // attribute is spelled `aria-hidden="true"`, which this does not match.
  // Confirm which spelling is intended before changing it.
  static constexpr const char *kHiddenMarkers[] = {
      " aria=\"hidden\"",
      "display:none",
      "visibility:hidden",
      "opacity:0",
      "Details-content--hidden-not-important",
  };

  for (const char *marker : kHiddenMarkers) {
    if (tag->find(marker) != std::string::npos) {
      return true;
    }
  }
  return false;
}
548
549 Converter *ShortenMarkdown(size_t chars = 1);
550 inline bool shortIfPrevCh(char prev) {
551 if (prev_ch_in_md_ == prev) {
552 ShortenMarkdown();
553 return true;
554 }
555 return false;
556 };
557
562 bool ParseCharInTagContent(char ch);
563
564 // Replace previous space (if any) in current markdown line by newline
565 bool ReplacePreviousSpaceInLineByNewline();
566
567 static inline bool IsIgnoredTag(const std::string &tag) {
568 return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
569 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
570
571 // meta: not ignored to tolerate if closing is omitted
572 }
573
574 [[nodiscard]] bool IsInIgnoredTag() const;
575}; // Converter
576
583inline std::string Convert(std::string &html, bool *ok = nullptr) {
584 Converter c(html);
585 auto md = c.convert();
586 if (ok != nullptr)
587 *ok = c.ok();
588 return md;
589}
590
591#ifndef PYTHON_BINDINGS
592inline std::string Convert(std::string &&html, bool *ok = nullptr) {
593 return Convert(html, ok);
594}
595#endif
596
597} // namespace html2md
598
599#endif // HTML2MD_H
Class for converting HTML to Markdown.
Definition html2md.h:146
std::string convert()
Convert HTML into Markdown.
Definition html2md.cpp:363
Converter * appendToMd(char ch)
Append a char to the Markdown.
Definition html2md.cpp:167
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Definition html2md.h:224
Converter * appendBlank()
Appends a ' ' in certain cases.
Definition html2md.cpp:210
bool ok() const
Checks if everything was closed properly (in the HTML).
Definition html2md.cpp:220
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
Definition html2md.h:160
bool operator==(const Converter &c) const
Definition html2md.h:226
void reset()
Reset the generated Markdown.
Definition html2md.cpp:974
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
Definition html2md.h:193
html2md namespace
Definition html2md.h:23
std::string Convert(std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Definition html2md.h:583
Options for the conversion from HTML to Markdown.
Definition html2md.h:40
bool operator==(html2md::Options o) const
Definition html2md.h:97
char orderedList
The char used after the number of the item.
Definition html2md.h:78
bool formatTable
Whether to format Markdown tables.
Definition html2md.h:95
bool splitLines
Add new line when a certain number of characters is reached.
Definition html2md.h:44
char unorderedList
The char used for unordered lists.
Definition html2md.h:62
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.
Definition html2md.h:87