9#include <unordered_map>
174 struct Options *options =
nullptr) {
185 [[nodiscard]] std::string
convert();
233 const std::string &replacement) {
234 htmlSymbolConversions_[htmlSymbol] = replacement;
244 htmlSymbolConversions_.erase(htmlSymbol);
258 [[nodiscard]]
bool ok()
const;
273 return html_ == c.html_ && option == c.option;
279 inline explicit operator bool()
const {
return ok(); };
283 static constexpr const char *kAttributeHref =
"href";
284 static constexpr const char *kAttributeAlt =
"alt";
285 static constexpr const char *kAttributeTitle =
"title";
286 static constexpr const char *kAttributeClass =
"class";
287 static constexpr const char *kAttributeSrc =
"src";
288 static constexpr const char *kAttrinuteAlign =
"align";
290 static constexpr const char *kTagAnchor =
"a";
291 static constexpr const char *kTagBreak =
"br";
292 static constexpr const char *kTagCode =
"code";
293 static constexpr const char *kTagDiv =
"div";
294 static constexpr const char *kTagHead =
"head";
295 static constexpr const char *kTagLink =
"link";
296 static constexpr const char *kTagListItem =
"li";
297 static constexpr const char *kTagMeta =
"meta";
298 static constexpr const char *kTagNav =
"nav";
299 static constexpr const char *kTagNoScript =
"noscript";
300 static constexpr const char *kTagOption =
"option";
301 static constexpr const char *kTagOrderedList =
"ol";
302 static constexpr const char *kTagParagraph =
"p";
303 static constexpr const char *kTagPre =
"pre";
304 static constexpr const char *kTagScript =
"script";
305 static constexpr const char *kTagSpan =
"span";
306 static constexpr const char *kTagStyle =
"style";
307 static constexpr const char *kTagTemplate =
"template";
308 static constexpr const char *kTagTitle =
"title";
309 static constexpr const char *kTagUnorderedList =
"ul";
310 static constexpr const char *kTagImg =
"img";
311 static constexpr const char *kTagSeperator =
"hr";
314 static constexpr const char *kTagBold =
"b";
315 static constexpr const char *kTagStrong =
"strong";
316 static constexpr const char *kTagItalic =
"em";
317 static constexpr const char *kTagItalic2 =
"i";
318 static constexpr const char *kTagCitation =
"cite";
319 static constexpr const char *kTagDefinition =
"dfn";
320 static constexpr const char *kTagUnderline =
"u";
321 static constexpr const char *kTagStrighthrought =
"del";
322 static constexpr const char *kTagStrighthrought2 =
"s";
324 static constexpr const char *kTagBlockquote =
"blockquote";
327 static constexpr const char *kTagHeader1 =
"h1";
328 static constexpr const char *kTagHeader2 =
"h2";
329 static constexpr const char *kTagHeader3 =
"h3";
330 static constexpr const char *kTagHeader4 =
"h4";
331 static constexpr const char *kTagHeader5 =
"h5";
332 static constexpr const char *kTagHeader6 =
"h6";
335 static constexpr const char *kTagTable =
"table";
336 static constexpr const char *kTagTableRow =
"tr";
337 static constexpr const char *kTagTableHeader =
"th";
338 static constexpr const char *kTagTableData =
"td";
340 size_t index_ch_in_html_ = 0;
342 bool is_closing_tag_ =
false;
343 bool is_in_attribute_value_ =
false;
344 bool is_in_code_ =
false;
345 bool is_in_list_ =
false;
346 bool is_in_p_ =
false;
347 bool is_in_pre_ =
false;
348 bool is_in_table_ =
false;
349 bool is_in_table_row_ =
false;
350 bool is_in_tag_ =
false;
351 bool is_self_closing_tag_ =
false;
354 bool is_in_ordered_list_ =
false;
355 uint8_t index_ol = 0;
358 size_t table_start = 0;
361 uint8_t index_li = 0;
363 uint8_t index_blockquote = 0;
365 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
366 char prev_ch_in_html_ =
'x';
370 uint16_t offset_lt_ = 0;
371 std::string current_tag_;
372 std::string prev_tag_;
375 std::string tableLine;
377 size_t chars_in_curr_line_ = 0;
// Lookup table from an HTML character entity, spelled exactly as it occurs in
// the markup (e.g. "&lt;"), to the text that replaces it. Keys must stay in
// their escaped form: if they are decoded, every entry maps a character to
// itself and the quote entry is no longer a valid string literal.
// Entries are added and removed at runtime through operator[] / erase().
std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
    {"&quot;", "\""}, {"&lt;", "<"},   {"&gt;", ">"},
    {"&amp;", "&"},   {"&nbsp;", " "}, {"&rarr;", "→"}};
389 virtual void OnHasLeftOpeningTag(
Converter *c) = 0;
390 virtual void OnHasLeftClosingTag(
Converter *c) = 0;
396 struct TagIgnored : Tag {
397 void OnHasLeftOpeningTag(
Converter *c)
override {};
398 void OnHasLeftClosingTag(
Converter *c)
override {};
401 struct TagAnchor : Tag {
402 void OnHasLeftOpeningTag(
Converter *c)
override;
403 void OnHasLeftClosingTag(
Converter *c)
override;
405 std::string current_href_;
406 std::string current_title_;
409 struct TagBold : Tag {
410 void OnHasLeftOpeningTag(
Converter *c)
override;
411 void OnHasLeftClosingTag(
Converter *c)
override;
414 struct TagItalic : Tag {
415 void OnHasLeftOpeningTag(
Converter *c)
override;
416 void OnHasLeftClosingTag(
Converter *c)
override;
419 struct TagUnderline : Tag {
420 void OnHasLeftOpeningTag(
Converter *c)
override;
421 void OnHasLeftClosingTag(
Converter *c)
override;
424 struct TagStrikethrought : Tag {
425 void OnHasLeftOpeningTag(
Converter *c)
override;
426 void OnHasLeftClosingTag(
Converter *c)
override;
429 struct TagBreak : Tag {
430 void OnHasLeftOpeningTag(
Converter *c)
override;
431 void OnHasLeftClosingTag(
Converter *c)
override;
434 struct TagDiv : Tag {
435 void OnHasLeftOpeningTag(
Converter *c)
override;
436 void OnHasLeftClosingTag(
Converter *c)
override;
439 struct TagHeader1 : Tag {
440 void OnHasLeftOpeningTag(
Converter *c)
override;
441 void OnHasLeftClosingTag(
Converter *c)
override;
444 struct TagHeader2 : Tag {
445 void OnHasLeftOpeningTag(
Converter *c)
override;
446 void OnHasLeftClosingTag(
Converter *c)
override;
449 struct TagHeader3 : Tag {
450 void OnHasLeftOpeningTag(
Converter *c)
override;
451 void OnHasLeftClosingTag(
Converter *c)
override;
454 struct TagHeader4 : Tag {
455 void OnHasLeftOpeningTag(
Converter *c)
override;
456 void OnHasLeftClosingTag(
Converter *c)
override;
459 struct TagHeader5 : Tag {
460 void OnHasLeftOpeningTag(
Converter *c)
override;
461 void OnHasLeftClosingTag(
Converter *c)
override;
464 struct TagHeader6 : Tag {
465 void OnHasLeftOpeningTag(
Converter *c)
override;
466 void OnHasLeftClosingTag(
Converter *c)
override;
469 struct TagListItem : Tag {
470 void OnHasLeftOpeningTag(
Converter *c)
override;
471 void OnHasLeftClosingTag(
Converter *c)
override;
474 struct TagOption : Tag {
475 void OnHasLeftOpeningTag(
Converter *c)
override;
476 void OnHasLeftClosingTag(
Converter *c)
override;
479 struct TagOrderedList : Tag {
480 void OnHasLeftOpeningTag(
Converter *c)
override;
481 void OnHasLeftClosingTag(
Converter *c)
override;
484 struct TagParagraph : Tag {
485 void OnHasLeftOpeningTag(
Converter *c)
override;
486 void OnHasLeftClosingTag(
Converter *c)
override;
489 struct TagPre : Tag {
490 void OnHasLeftOpeningTag(
Converter *c)
override;
491 void OnHasLeftClosingTag(
Converter *c)
override;
494 struct TagCode : Tag {
495 void OnHasLeftOpeningTag(
Converter *c)
override;
496 void OnHasLeftClosingTag(
Converter *c)
override;
499 struct TagSpan : Tag {
500 void OnHasLeftOpeningTag(
Converter *c)
override;
501 void OnHasLeftClosingTag(
Converter *c)
override;
504 struct TagTitle : Tag {
505 void OnHasLeftOpeningTag(
Converter *c)
override;
506 void OnHasLeftClosingTag(
Converter *c)
override;
509 struct TagUnorderedList : Tag {
510 void OnHasLeftOpeningTag(
Converter *c)
override;
511 void OnHasLeftClosingTag(
Converter *c)
override;
514 struct TagImage : Tag {
515 void OnHasLeftOpeningTag(
Converter *c)
override;
516 void OnHasLeftClosingTag(
Converter *c)
override;
519 struct TagSeperator : Tag {
520 void OnHasLeftOpeningTag(
Converter *c)
override;
521 void OnHasLeftClosingTag(
Converter *c)
override;
524 struct TagTable : Tag {
525 void OnHasLeftOpeningTag(
Converter *c)
override;
526 void OnHasLeftClosingTag(
Converter *c)
override;
529 struct TagTableRow : Tag {
530 void OnHasLeftOpeningTag(
Converter *c)
override;
531 void OnHasLeftClosingTag(
Converter *c)
override;
534 struct TagTableHeader : Tag {
535 void OnHasLeftOpeningTag(
Converter *c)
override;
536 void OnHasLeftClosingTag(
Converter *c)
override;
539 struct TagTableData : Tag {
540 void OnHasLeftOpeningTag(
Converter *c)
override;
541 void OnHasLeftClosingTag(
Converter *c)
override;
544 struct TagBlockquote : Tag {
545 void OnHasLeftOpeningTag(
Converter *c)
override;
546 void OnHasLeftClosingTag(
Converter *c)
override;
549 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
551 explicit Converter(
const std::string *html,
struct Options *options);
553 void CleanUpMarkdown();
556 static void LTrim(std::string *s);
559 Converter *RTrim(std::string *s,
bool trim_only_blank =
false);
566 void TidyAllLines(std::string *str);
568 std::string ExtractAttributeFromTagLeftOf(
const std::string &attr);
570 void TurnLineIntoHeader1();
572 void TurnLineIntoHeader2();
575 void OnHasEnteredTag();
585 bool ParseCharInTag(
char ch);
590 inline static bool TagContainsAttributesToHide(std::string *tag) {
593 return (*tag).find(
" aria=\"hidden\"") != string::npos ||
594 (*tag).find(
"display:none") != string::npos ||
595 (*tag).find(
"visibility:hidden") != string::npos ||
596 (*tag).find(
"opacity:0") != string::npos ||
597 (*tag).find(
"Details-content--hidden-not-important") != string::npos;
600 Converter *ShortenMarkdown(
size_t chars = 1);
601 inline bool shortIfPrevCh(
char prev) {
602 if (prev_ch_in_md_ == prev) {
613 bool ParseCharInTagContent(
char ch);
616 bool ReplacePreviousSpaceInLineByNewline();
618 static inline bool IsIgnoredTag(
const std::string &tag) {
619 return (tag[0] ==
'-' || kTagTemplate == tag || kTagStyle == tag ||
620 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
625 [[nodiscard]]
bool IsInIgnoredTag()
const;
634inline std::string
Convert(
const std::string &html,
bool *ok =
nullptr) {
642#ifndef PYTHON_BINDINGS
643inline std::string
Convert(
const std::string &&html,
bool *ok =
nullptr) {
Class for converting HTML to Markdown.
std::string convert()
Convert HTML into Markdown.
void addHtmlSymbolConversion(const std::string &htmlSymbol, const std::string &replacement)
Add an HTML symbol conversion.
Converter * appendToMd(char ch)
Append a char to the Markdown.
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly (in the HTML).
void removeHtmlSymbolConversion(const std::string &htmlSymbol)
Remove an HTML symbol conversion.
bool operator==(const Converter &c) const
Converter(const std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
void reset()
Reset the generated Markdown.
void clearHtmlSymbolConversions()
Clear all HTML symbol conversions.
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
std::string Convert(const std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Options for the conversion from HTML to Markdown.
bool operator==(html2md::Options o) const
int softBreak
softBreak Wrap after ... characters when the next space is reached and as long as it's not in a list,...
int hardBreak
hardBreak Force a break after ... characters in a line
char orderedList
The char used after the number of the item.
bool formatTable
Whether to format Markdown tables.
bool splitLines
Add new line when a certain number of characters is reached.
char unorderedList
The char used for unordered lists.
bool includeTitle
Whether title is added as h1 heading at the very beginning of the markdown.