10#include <unordered_map>
161 struct Options *options =
nullptr) {
// Convert HTML into Markdown and return the result as a string.
// [[nodiscard]]: the compiler warns if a caller ignores the return value.
172 [[nodiscard]] std::string
convert();
// Checks whether everything was closed properly in the HTML.
// const: does not modify the converter. [[nodiscard]]: the result should be used.
212 [[nodiscard]]
bool ok()
const;
227 return html_ == c.html_ && option == c.option;
// Explicit boolean conversion: yields exactly what ok() returns.
// `explicit` restricts it to contextual conversions (e.g. `if (converter)`)
// and static_cast<bool>, so it never takes part in implicit conversions.
233 inline explicit operator bool()
const {
return ok(); };
// HTML attribute names as C-string literals.
237 static constexpr const char *kAttributeHref =
"href";
238 static constexpr const char *kAttributeAlt =
"alt";
239 static constexpr const char *kAttributeTitle =
"title";
240 static constexpr const char *kAttributeClass =
"class";
241 static constexpr const char *kAttributeSrc =
"src";
// NOTE(review): "kAttrinuteAlign" looks like a misspelling of "kAttributeAlign".
// The identifier is left unchanged because its use sites are not visible here.
242 static constexpr const char *kAttrinuteAlign =
"align";
// HTML tag names (without angle brackets) as C-string literals.
244 static constexpr const char *kTagAnchor =
"a";
245 static constexpr const char *kTagBreak =
"br";
246 static constexpr const char *kTagCode =
"code";
247 static constexpr const char *kTagDiv =
"div";
248 static constexpr const char *kTagHead =
"head";
249 static constexpr const char *kTagLink =
"link";
250 static constexpr const char *kTagListItem =
"li";
251 static constexpr const char *kTagMeta =
"meta";
252 static constexpr const char *kTagNav =
"nav";
253 static constexpr const char *kTagNoScript =
"noscript";
254 static constexpr const char *kTagOption =
"option";
255 static constexpr const char *kTagOrderedList =
"ol";
256 static constexpr const char *kTagParagraph =
"p";
257 static constexpr const char *kTagPre =
"pre";
258 static constexpr const char *kTagScript =
"script";
259 static constexpr const char *kTagSpan =
"span";
260 static constexpr const char *kTagStyle =
"style";
261 static constexpr const char *kTagTemplate =
"template";
262 static constexpr const char *kTagTitle =
"title";
263 static constexpr const char *kTagUnorderedList =
"ul";
264 static constexpr const char *kTagImg =
"img";
// NOTE(review): "Seperator" is a misspelling of "Separator"; the value is the
// <hr> tag name. Identifier left unchanged because use sites are not visible here.
265 static constexpr const char *kTagSeperator =
"hr";
// Inline text-formatting tag names. Some styles have two spellings:
// "b"/"strong", "em"/"i" and "del"/"s".
268 static constexpr const char *kTagBold =
"b";
269 static constexpr const char *kTagStrong =
"strong";
270 static constexpr const char *kTagItalic =
"em";
271 static constexpr const char *kTagItalic2 =
"i";
272 static constexpr const char *kTagCitation =
"cite";
273 static constexpr const char *kTagDefinition =
"dfn";
274 static constexpr const char *kTagUnderline =
"u";
// NOTE(review): "Strighthrought" looks like a misspelling of "Strikethrough".
// Identifiers left unchanged because their use sites are not visible here.
275 static constexpr const char *kTagStrighthrought =
"del";
276 static constexpr const char *kTagStrighthrought2 =
"s";
// Block quotation tag name.
278 static constexpr const char *kTagBlockquote =
"blockquote";
// Heading tag names, levels 1 through 6.
281 static constexpr const char *kTagHeader1 =
"h1";
282 static constexpr const char *kTagHeader2 =
"h2";
283 static constexpr const char *kTagHeader3 =
"h3";
284 static constexpr const char *kTagHeader4 =
"h4";
285 static constexpr const char *kTagHeader5 =
"h5";
286 static constexpr const char *kTagHeader6 =
"h6";
// Table-related tag names: table, row, header cell, data cell.
289 static constexpr const char *kTagTable =
"table";
290 static constexpr const char *kTagTableRow =
"tr";
291 static constexpr const char *kTagTableHeader =
"th";
292 static constexpr const char *kTagTableData =
"td";
// Mutable parser state. Every member has an in-class initializer except the
// std::string members, which default-construct to empty.
// Presumably the read position inside the HTML input — TODO confirm.
294 size_t index_ch_in_html_ = 0;
// Boolean state flags, all initially false.
296 bool is_closing_tag_ =
false;
297 bool is_in_attribute_value_ =
false;
298 bool is_in_code_ =
false;
299 bool is_in_list_ =
false;
300 bool is_in_p_ =
false;
301 bool is_in_pre_ =
false;
302 bool is_in_table_ =
false;
303 bool is_in_table_row_ =
false;
304 bool is_in_tag_ =
false;
307 bool is_in_ordered_list_ =
false;
// NOTE(review): the uint8_t counters below can hold at most 255 and wrap
// silently on overflow; confirm the implementation never exceeds that.
// NOTE(review): index_ol, table_start, index_li, index_blockquote and
// tableLine lack the trailing underscore used by the other members.
308 uint8_t index_ol = 0;
311 size_t table_start = 0;
314 uint8_t index_li = 0;
316 uint8_t index_blockquote = 0;
// Presumably the last two characters appended to the Markdown output — TODO confirm.
318 char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
// Starts as 'x' rather than 0; the reason is not visible in this excerpt.
319 char prev_ch_in_html_ =
'x';
// NOTE(review): uint16_t caps this value at 65535 — confirm that is sufficient.
323 uint16_t offset_lt_ = 0;
324 std::string current_tag_;
325 std::string prev_tag_;
328 std::string tableLine;
330 size_t chars_in_curr_line_ = 0;
// Pure-virtual callback pair that every concrete Tag must override.
// Each receives the Converter as `c`. The names suggest they fire once the
// parser has passed the opening / closing tag; the call sites are not
// visible in this excerpt — TODO confirm.
338 virtual void OnHasLeftOpeningTag(
Converter *c) = 0;
339 virtual void OnHasLeftClosingTag(
Converter *c) = 0;
// Tag implementation whose two callbacks have empty bodies: both are no-ops.
345 struct TagIgnored : Tag {
346 void OnHasLeftOpeningTag(
Converter *c)
override{};
347 void OnHasLeftClosingTag(
Converter *c)
override{};
// Tag implementation that overrides both callbacks (defined outside this
// excerpt) and carries two string members of its own.
350 struct TagAnchor : Tag {
351 void OnHasLeftOpeningTag(
Converter *c)
override;
352 void OnHasLeftClosingTag(
Converter *c)
override;
// Presumably the href and title of the anchor being processed — TODO confirm.
354 std::string current_href_;
355 std::string current_title_;
// Tag implementations that only declare the two overridden callbacks; the
// definitions live outside this excerpt.
358 struct TagBold : Tag {
359 void OnHasLeftOpeningTag(
Converter *c)
override;
360 void OnHasLeftClosingTag(
Converter *c)
override;
363 struct TagItalic : Tag {
364 void OnHasLeftOpeningTag(
Converter *c)
override;
365 void OnHasLeftClosingTag(
Converter *c)
override;
368 struct TagUnderline : Tag {
369 void OnHasLeftOpeningTag(
Converter *c)
override;
370 void OnHasLeftClosingTag(
Converter *c)
override;
// NOTE(review): "Strikethrought" looks like a misspelling of "Strikethrough".
373 struct TagStrikethrought : Tag {
374 void OnHasLeftOpeningTag(
Converter *c)
override;
375 void OnHasLeftClosingTag(
Converter *c)
override;
378 struct TagBreak : Tag {
379 void OnHasLeftOpeningTag(
Converter *c)
override;
380 void OnHasLeftClosingTag(
Converter *c)
override;
383 struct TagDiv : Tag {
384 void OnHasLeftOpeningTag(
Converter *c)
override;
385 void OnHasLeftClosingTag(
Converter *c)
override;
// One Tag implementation per heading level (1-6). Each only declares the two
// overridden callbacks; the definitions live outside this excerpt.
388 struct TagHeader1 : Tag {
389 void OnHasLeftOpeningTag(
Converter *c)
override;
390 void OnHasLeftClosingTag(
Converter *c)
override;
393 struct TagHeader2 : Tag {
394 void OnHasLeftOpeningTag(
Converter *c)
override;
395 void OnHasLeftClosingTag(
Converter *c)
override;
398 struct TagHeader3 : Tag {
399 void OnHasLeftOpeningTag(
Converter *c)
override;
400 void OnHasLeftClosingTag(
Converter *c)
override;
403 struct TagHeader4 : Tag {
404 void OnHasLeftOpeningTag(
Converter *c)
override;
405 void OnHasLeftClosingTag(
Converter *c)
override;
408 struct TagHeader5 : Tag {
409 void OnHasLeftOpeningTag(
Converter *c)
override;
410 void OnHasLeftClosingTag(
Converter *c)
override;
413 struct TagHeader6 : Tag {
414 void OnHasLeftOpeningTag(
Converter *c)
override;
415 void OnHasLeftClosingTag(
Converter *c)
override;
// Further Tag implementations. Each only declares the two overridden
// callbacks; the definitions live outside this excerpt.
418 struct TagListItem : Tag {
419 void OnHasLeftOpeningTag(
Converter *c)
override;
420 void OnHasLeftClosingTag(
Converter *c)
override;
423 struct TagOption : Tag {
424 void OnHasLeftOpeningTag(
Converter *c)
override;
425 void OnHasLeftClosingTag(
Converter *c)
override;
428 struct TagOrderedList : Tag {
429 void OnHasLeftOpeningTag(
Converter *c)
override;
430 void OnHasLeftClosingTag(
Converter *c)
override;
433 struct TagParagraph : Tag {
434 void OnHasLeftOpeningTag(
Converter *c)
override;
435 void OnHasLeftClosingTag(
Converter *c)
override;
438 struct TagPre : Tag {
439 void OnHasLeftOpeningTag(
Converter *c)
override;
440 void OnHasLeftClosingTag(
Converter *c)
override;
443 struct TagCode : Tag {
444 void OnHasLeftOpeningTag(
Converter *c)
override;
445 void OnHasLeftClosingTag(
Converter *c)
override;
448 struct TagSpan : Tag {
449 void OnHasLeftOpeningTag(
Converter *c)
override;
450 void OnHasLeftClosingTag(
Converter *c)
override;
453 struct TagTitle : Tag {
454 void OnHasLeftOpeningTag(
Converter *c)
override;
455 void OnHasLeftClosingTag(
Converter *c)
override;
458 struct TagUnorderedList : Tag {
459 void OnHasLeftOpeningTag(
Converter *c)
override;
460 void OnHasLeftClosingTag(
Converter *c)
override;
463 struct TagImage : Tag {
464 void OnHasLeftOpeningTag(
Converter *c)
override;
465 void OnHasLeftClosingTag(
Converter *c)
override;
// NOTE(review): "Seperator" is a misspelling of "Separator".
468 struct TagSeperator : Tag {
469 void OnHasLeftOpeningTag(
Converter *c)
override;
470 void OnHasLeftClosingTag(
Converter *c)
override;
// Table-related Tag implementations followed by the blockquote one. Each only
// declares the two overridden callbacks; the definitions live outside this
// excerpt.
473 struct TagTable : Tag {
474 void OnHasLeftOpeningTag(
Converter *c)
override;
475 void OnHasLeftClosingTag(
Converter *c)
override;
478 struct TagTableRow : Tag {
479 void OnHasLeftOpeningTag(
Converter *c)
override;
480 void OnHasLeftClosingTag(
Converter *c)
override;
483 struct TagTableHeader : Tag {
484 void OnHasLeftOpeningTag(
Converter *c)
override;
485 void OnHasLeftClosingTag(
Converter *c)
override;
488 struct TagTableData : Tag {
489 void OnHasLeftOpeningTag(
Converter *c)
override;
490 void OnHasLeftClosingTag(
Converter *c)
override;
493 struct TagBlockquote : Tag {
494 void OnHasLeftOpeningTag(
Converter *c)
override;
495 void OnHasLeftClosingTag(
Converter *c)
override;
// Lookup table from a string key to a shared Tag instance.
// Presumably keyed by HTML tag name; the code that fills it is not visible
// in this excerpt — TODO confirm.
498 std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
// Helper declarations; all implementations live outside this excerpt.
// Pointer-taking constructor overload. `explicit` only affects
// copy-list-initialization here because it takes two arguments.
500 explicit Converter(std::string *html,
struct Options *options);
502 void CleanUpMarkdown();
// Static: operates only on the string passed in, modifying it through the pointer.
505 static void LTrim(std::string *s);
// Non-static, unlike LTrim; returns a Converter pointer.
// trim_only_blank defaults to false; its exact effect is defined in the
// implementation, which is not visible here.
508 Converter *RTrim(std::string *s,
bool trim_only_blank =
false);
515 void TidyAllLines(std::string *str);
// Returns the value as a new string; takes the attribute name by const reference.
517 std::string ExtractAttributeFromTagLeftOf(
const std::string &attr);
519 void TurnLineIntoHeader1();
521 void TurnLineIntoHeader2();
524 void OnHasEnteredTag();
// Takes one character; the meaning of the bool result is defined in the
// implementation, which is not visible here.
534 bool ParseCharInTag(
char ch);
// Returns true when the text in *tag contains at least one of the substrings
// listed below. This is a plain std::string::find search, so a match anywhere
// in the tag text counts and the comparison is case-sensitive.
// `tag` is dereferenced without a null check; callers must pass a valid pointer.
// NOTE(review): `string::npos` is written without `std::`, so it relies on a
// using-declaration that is not visible in this excerpt.
539 inline static bool TagContainsAttributesToHide(std::string *tag) {
// NOTE(review): the WAI-ARIA attribute is spelled aria-hidden="true"; the
// literal below (aria="hidden") may never match real markup — confirm intent.
542 return (*tag).find(
" aria=\"hidden\"") != string::npos ||
543 (*tag).find(
"display:none") != string::npos ||
544 (*tag).find(
"visibility:hidden") != string::npos ||
545 (*tag).find(
"opacity:0") != string::npos ||
546 (*tag).find(
"Details-content--hidden-not-important") != string::npos;
// Declaration only; `chars` defaults to 1. Presumably removes that many
// trailing characters from the Markdown output — TODO confirm.
549 Converter *ShortenMarkdown(
size_t chars = 1);
// Compares the stored prev_ch_in_md_ with `prev`. The body of the branch and
// the return statements are not visible in this excerpt.
550 inline bool shortIfPrevCh(
char prev) {
551 if (prev_ch_in_md_ == prev) {
// Declarations only; the meaning of each bool result is defined in the
// implementations, which are not visible here.
562 bool ParseCharInTagContent(
char ch);
565 bool ReplacePreviousSpaceInLineByNewline();
// Returns true when `tag` starts with '-' or equals one of the template,
// style, script, noscript or nav tag names.
// tag[0] on an empty std::string yields the terminating '\0', so an empty tag
// fails the first test instead of reading out of bounds.
// The `kTagX == tag` comparisons use the const char* / std::string overload
// of operator==, so they compare contents, not pointers.
567 static inline bool IsIgnoredTag(
const std::string &tag) {
568 return (tag[0] ==
'-' || kTagTemplate == tag || kTagStyle == tag ||
569 kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
// Declaration only. const: does not modify the converter.
// Presumably reports whether the parser is currently inside a tag for which
// IsIgnoredTag() is true — TODO confirm against the implementation.
574 [[nodiscard]]
bool IsInIgnoredTag()
const;
// Free-function wrapper around the Converter class (body not visible in this
// excerpt). Takes the HTML by non-const lvalue reference and returns a string.
// `ok` is optional and defaults to nullptr.
583inline std::string
Convert(std::string &html,
bool *ok =
nullptr) {
// Rvalue-reference overload with the same defaults; it is compiled only when
// PYTHON_BINDINGS is not defined.
591#ifndef PYTHON_BINDINGS
592inline std::string
Convert(std::string &&html,
bool *ok =
nullptr) {
Class for converting HTML to Markdown.
std::string convert()
Convert HTML into Markdown.
Converter * appendToMd(char ch)
Append a char to the Markdown.
bool operator==(const Converter *c) const
Checks if the HTML matches and the options are the same.
Converter * appendBlank()
Appends a ' ' in certain cases.
bool ok() const
Checks if everything was closed properly (in the HTML).
Converter(std::string &html, struct Options *options=nullptr)
Standard initializer, takes HTML as parameter. Also prepares everything.
bool operator==(const Converter &c) const
void reset()
Reset the generated Markdown.
Converter * appendToMd(const std::string &s)
Append a string to the Markdown.
std::string Convert(std::string &html, bool *ok=nullptr)
Static wrapper around the Converter class.
Options for the conversion from HTML to Markdown.
bool operator==(html2md::Options o) const
char orderedList
The char used after the number of the item.
bool formatTable
Whether to format Markdown tables.
bool splitLines
Add a new line when a certain number of characters is reached.
char unorderedList
The char used for unordered lists.
bool includeTitle
Whether the title is added as an h1 heading at the very beginning of the Markdown.