From 71384263fbf76970cff8c8f6810596326beda45e Mon Sep 17 00:00:00 2001 From: anchal7299 Date: Mon, 5 Jul 2021 19:39:39 +0530 Subject: [PATCH] Fix new line inside emphasis --- html2text/__init__.py | 37 ++++++++++++++++++++++----- test/new_line_in_emphasis.html | 1 + test/new_line_in_emphasis.md | 3 +++ test/test_new_line_inside_emphasis.py | 8 ++++++ 4 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 test/new_line_in_emphasis.html create mode 100644 test/new_line_in_emphasis.md create mode 100644 test/test_new_line_inside_emphasis.py diff --git a/html2text/__init__.py b/html2text/__init__.py index 7e1a279..0184858 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -86,7 +86,7 @@ def __init__( self.tag_callback = None self.open_quote = config.OPEN_QUOTE # covered in cli self.close_quote = config.CLOSE_QUOTE # covered in cli - + if out is None: self.out = self.outtextf else: @@ -120,6 +120,8 @@ def __init__( self.tag_stack = ( [] ) # type: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] + self.emphasis_tag_stack = {} + self.remove_space = False self.emphasis = 0 self.drop_white_space = 0 self.inheader = False @@ -302,10 +304,19 @@ def handle_tag( ) -> None: self.current_tag = tag + if tag in ["b","em","i","u"]: + if start: + if tag in self.emphasis_tag_stack: + self.emphasis_tag_stack[tag] += 1 + else: + self.emphasis_tag_stack[tag] = 1 + elif list(self.emphasis_tag_stack.keys()): + self.emphasis_tag_stack.popitem() + if self.tag_callback is not None: if self.tag_callback(self, tag, attrs, start) is True: return - + # first thing inside the anchor tag is another tag # that produces some output if ( @@ -372,10 +383,24 @@ def handle_tag( self.p() if tag == "br" and start: + for key in list(self.emphasis_tag_stack.keys())[::-1]: + if(key == "b"): + self.o(self.strong_mark) + elif key in ["em","i","u"]: + self.o(self.emphasis_mark) + if self.blockquote > 0: self.o(" \n> ") else: self.o(" \n") + + for key in 
list(self.emphasis_tag_stack.keys()): + if(key == "b"): + self.o(self.strong_mark) + elif key in ["em","i","u"]: + self.o(self.emphasis_mark) + self.remove_space = True + self.drop_white_space = 1 if tag == "hr" and start: self.p() @@ -640,11 +665,11 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None: # https://spec.commonmark.org/0.28/#motivation # TODO: line up
<ol><li>s > 9 correctly. parent_list = None - for list in self.list: + for item in self.list: self.o( - " " if parent_list == "ol" and list.name == "ul" else " " + " " if parent_list == "ol" and item.name == "ul" else " " ) - parent_list = list.name + parent_list = item.name if li.name == "ul": self.o(self.ul_item_mark + " ") @@ -743,7 +768,7 @@ def o( self.abbr_data += data if not self.quiet: - if self.google_doc: + if self.google_doc or self.remove_space: # prevent white space immediately after 'begin emphasis' # marks ('**' and '_') lstripped_data = data.lstrip() diff --git a/test/new_line_in_emphasis.html b/test/new_line_in_emphasis.html new file mode 100644 index 0000000..4290ada --- /dev/null +++ b/test/new_line_in_emphasis.html @@ -0,0 +1 @@ +<b>Our multiline<br>bold text</b>
    \ No newline at end of file diff --git a/test/new_line_in_emphasis.md b/test/new_line_in_emphasis.md new file mode 100644 index 0000000..df3a7ae --- /dev/null +++ b/test/new_line_in_emphasis.md @@ -0,0 +1,3 @@ +**Our multiline** +**bold text** + diff --git a/test/test_new_line_inside_emphasis.py b/test/test_new_line_inside_emphasis.py new file mode 100644 index 0000000..b5d69a4 --- /dev/null +++ b/test/test_new_line_inside_emphasis.py @@ -0,0 +1,8 @@ +import html2text + +def test_emphasis_with_new_line(): + h = html2text.HTML2Text() + html = "<b>Our multiline<br>bold text</b>" + result = h.handle(html) + assert result == '**Our multiline** \n**bold text**\n\n' + \ No newline at end of file