Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include all nodes with text #885

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
6 changes: 3 additions & 3 deletions newspaper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,12 @@ def fulltext(html, language='en'):

extractor = ContentExtractor(config)
document_cleaner = DocumentCleaner(config)
output_formatter = OutputFormatter(config)
output_formatter = OutputFormatter(config, extractor)

doc = config.get_parser().fromstring(html)
doc = document_cleaner.clean(doc)

top_node = extractor.calculate_best_node(doc)
top_node, extra_nodes = extractor.calculate_best_node(doc)
top_node = extractor.post_cleanup(top_node)
text, article_html = output_formatter.get_formatted(top_node)
text, article_html = output_formatter.get_formatted(top_node, extra_nodes)
return text
7 changes: 3 additions & 4 deletions newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def parse(self):
self.link_hash = parse_candidate.link_hash # MD5

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)
output_formatter = OutputFormatter(self.config, self.extractor)

title = self.extractor.get_title(self.clean_doc)
self.set_title(title)
Expand Down Expand Up @@ -270,16 +270,15 @@ def parse(self):
# Before any computations on the body, clean DOM object
self.doc = document_cleaner.clean(self.doc)

self.top_node = self.extractor.calculate_best_node(self.doc)
self.top_node, extra_nodes = self.extractor.calculate_best_node(self.doc)
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())

self.top_node = self.extractor.post_cleanup(self.top_node)
self.clean_top_node = copy.deepcopy(self.top_node)

text, article_html = output_formatter.get_formatted(
self.top_node)
text, article_html = output_formatter.get_formatted(self.top_node, extra_nodes)
self.set_article_html(article_html)
self.set_text(text)

Expand Down
34 changes: 28 additions & 6 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,10 @@ def calculate_best_node(self, doc):
i = 0
parent_nodes = []
nodes_with_text = []
# A dictionary mapping each index of nodes_with_text (node a) to a list of
# indices into parent_nodes; the parent_nodes entries at those indices
# (nodes b, c, etc.) are the parents or grandparents of node a.
nodes_wtext_parent_map = {}

for node in nodes_to_check:
text_node = self.parser.getText(node)
Expand All @@ -791,7 +795,7 @@ def calculate_best_node(self, doc):
negative_scoring = 0
bottom_negativescore_nodes = float(nodes_number) * 0.25

for node in nodes_with_text:
for idx, node in enumerate(nodes_with_text):
boost_score = float(0)
# boost
if self.is_boostable(node):
Expand All @@ -817,7 +821,12 @@ def calculate_best_node(self, doc):
self.update_score(parent_node, upscore)
self.update_node_count(parent_node, 1)

# For this index in the loop, we haven't added any parent nodes yet
nodes_wtext_parent_map[idx] = []

if parent_node not in parent_nodes:
# Map this index of nodes_with_text to the index of the parent when added to parent_nodes
nodes_wtext_parent_map[idx].append(len(parent_nodes))
parent_nodes.append(parent_node)

# Parent of parent node
Expand All @@ -826,21 +835,34 @@ def calculate_best_node(self, doc):
self.update_node_count(parent_parent_node, 1)
self.update_score(parent_parent_node, upscore / 2)
if parent_parent_node not in parent_nodes:
# Update nodes_wtext_parent_map in same manner as before
nodes_wtext_parent_map[idx].append(len(parent_nodes))
parent_nodes.append(parent_parent_node)
cnt += 1
i += 1

top_node_score = 0
for e in parent_nodes:
# The index of top_node within parent_nodes
top_node_index = -1
for idx, e in enumerate(parent_nodes):
score = self.get_score(e)

if score > top_node_score:
if score > top_node_score or top_node is None:
top_node = e
top_node_index = idx
top_node_score = score

if top_node is None:
top_node = e
return top_node
# Nodes with text that are not related to top_node
unrelated_nodes_wtext = []
# Now that top_node has been determined, loop through parent mappings to populate unrelated_nodes_wtext
for nodes_wtext_idx in nodes_wtext_parent_map:
# Obtain the indices of the parents for this node (index)
parents = nodes_wtext_parent_map[nodes_wtext_idx]
# If this node is not related to the top node, add it to the list
if top_node_index not in parents:
unrelated_nodes_wtext.append(nodes_with_text[nodes_wtext_idx])

return top_node, unrelated_nodes_wtext

def is_boostable(self, node):
"""A lot of times the first paragraph might be the caption under an image
Expand Down
157 changes: 138 additions & 19 deletions newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,43 @@
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

from html import unescape
import logging

from copy import deepcopy
from html import unescape
from lxml import etree
from .text import innerTrim


log = logging.getLogger(__name__)


def _prepare_txt(txt):
    """Normalize *txt* into a list of trimmed strings ready for a text list.

    Returns an empty list when *txt* is empty or None.
    """
    if not txt:
        return []
    # NOTE(review): r'\n' splits on the literal two-character sequence
    # backslash-n, not on a newline — presumably intentional (innerTrim
    # appears to normalize whitespace first); confirm before changing.
    pieces = innerTrim(unescape(txt)).split(r'\n')
    return [piece.strip(' ') for piece in pieces]


# A small method to update a txts list with a given string list
def _update_text_list(txts, to_add, index=None):
if index is not None:
# If we are given an index, insert the list's elements at the specified index
txts[index:0] = to_add
else:
# Else add the list's elements to the end of txts
txts.extend(to_add)


class OutputFormatter(object):

def __init__(self, config):
def __init__(self, config, extractor):
self.top_node = None
self.config = config
self.extractor = extractor
self.parser = self.config.get_parser()
self.language = config.language
self.stopwords_class = config.stopwords_class
Expand All @@ -38,46 +61,142 @@ def update_language(self, meta_lang):
def get_top_node(self):
    # Accessor for the top node most recently passed to get_formatted().
    return self.top_node

def get_formatted(self, top_node):
def get_formatted(self, top_node, extra_nodes=None):
    """Returns the body text of an article, and also the body article
    html if specified. Returns in (text, html) form.

    ``extra_nodes`` is an optional list of additional text-bearing nodes
    (as produced by the extractor) to merge into the result.
    """
    # Fix: a mutable default argument ([]) is shared across calls in
    # Python; default to None and create a fresh list per call instead.
    if extra_nodes is None:
        extra_nodes = []
    self.top_node = top_node
    html, text = '', ''

    self.remove_negativescores_nodes()

    if self.config.keep_article_html:
        # NOTE(review): this value is overwritten by convert_to_text()
        # below; kept in case clean_article_html() has side effects on
        # the tree — confirm before removing.
        html = self.convert_to_html()

    # Take a copy of top_node before editing it further
    top_node_copy = deepcopy(self.top_node)
    self.links_to_text()
    self.add_newline_to_br()
    self.add_newline_to_li()
    self.replace_with_text()
    self.remove_empty_tags()
    self.remove_trailing_media_div()
    text, html = self.convert_to_text(extra_nodes, top_node_copy)
    return (text, html)

def convert_to_text(self):
def convert_to_text(self, extra_nodes, html_to_update):
    """Gather the text of the top node's children, then delegate to
    add_missing_text() to fold in any text from *extra_nodes* and
    produce the final (text, html) pair."""
    collected = []
    for child in list(self.get_top_node()):
        text = None
        try:
            text = self.parser.getText(child)
        except ValueError as err:  # lxml error
            log.info('%s ignoring lxml node error: %s', __title__, err)
        # Append this child's prepared text fragments to the running list
        collected.extend(_prepare_txt(text))
    return self.add_missing_text(collected, extra_nodes, html_to_update)

def add_missing_text(self, txts, extra_nodes, html_to_update):
    """A method to return (text, html) given the current text and html so far (txts list and html_to_update).
    The method uses extra_nodes to consider any text that needs to be added before returning final text and html."""
    # Keep track of the current index we are on for the text and html
    current_idx, html_idx = 0, 0
    # For each additional node we have...
    for extra in extra_nodes:
        # Ignore nodes without usable text or with a high link density.
        # Fix: the original `extra.text is None` check let an empty
        # string through, which made stripped_txts empty and crashed
        # at stripped_txts[0] below when keep_article_html was set.
        if not extra.text or self.extractor.is_highlink_density(extra):
            continue
        # Prepare the node's text if it were to be added; count the length of the list to be added
        stripped_txts = _prepare_txt(extra.text)
        txt_count = len(stripped_txts)
        # Check the text is not already within the final txts list
        match = set(stripped_txts).intersection(txts)
        node_found = bool(match)
        # In regards to the html, take a copy of this node before parsing any hyperlinks
        extra_pre_parsed = deepcopy(extra)
        self.parser.stripTags(extra, 'a')
        # If the text is already in the txts list, update current_idx to be where the node's text is + 1
        if node_found:
            # In case of multiple entries for this node's text, find the
            # max (latest) occurrence of the text within txts
            current_idx = max(txts.index(m) for m in match) + 1
        # If the current node's text has not been added to the final txts list
        else:
            # Reuse stripped_txts rather than recomputing _prepare_txt()
            _update_text_list(txts, stripped_txts, index=current_idx)
            # Update current_idx to be incremented by how many entries were added to txts
            current_idx += txt_count
        # Update the html if it should be updated
        if self.config.keep_article_html:
            html_idx, html_to_update = self.insert_missing_html(
                extra_pre_parsed, html_to_update, html_idx,
                node_found, stripped_txts[0])
    # Return final string based on txts list and html string
    return '\n\n'.join(txts), self.convert_to_html(html_to_update)

if txt:
txt = unescape(txt)
txt_lis = innerTrim(txt).split(r'\n')
txt_lis = [n.strip(' ') for n in txt_lis]
txts.extend(txt_lis)
return '\n\n'.join(txts)
def insert_missing_html(self, node_pre_parsed, html_to_update, html_idx, text_found, node_text):
    """A method that updates html by checking if node_text should be inserted into html_to_update. The method then
    returns the updated html and a new html-index being the position in html_to_update after the insertion.

    Fix: warnings now go through the module-level ``log`` logger
    (``logging.getLogger(__name__)``) instead of the root logger, for
    consistency with the rest of this module."""
    # Message to warn with in case of search errors or no results
    truncated = '\'' + node_text[:30] + '...\''
    warning_msg = 'Could not determine position of element with text ' + truncated \
        + ' Duplicates may occur in article html.'
    # Matching element(s) given node_text
    found_html = None
    try:
        # Do a starts-with search in case a sentence has been split
        found_html = html_to_update.xpath('//*[starts-with(text(), $nodetext)]', nodetext=node_text)
    except etree.XPathEvalError:
        log.warning(warning_msg + ' Error searching for text.')
    # If we found a match
    if found_html:
        # Report if multiple matches found
        if len(found_html) > 1:
            log.warning('Multiple matches for ' + truncated + ' in html, article html may be disordered.')
        # Flag to check if we found the match's position in html_to_update
        pos_found = False
        # The current node we are checking whilst finding the position
        current_node = found_html[0]
        # Whilst we haven't found the position and we still have a current_node to check
        while not pos_found and current_node is not None:
            try:
                # Attempt to find its position in relation to the rest of the html and return found index + 1
                html_idx = html_to_update.index(current_node)
                return html_idx + 1, html_to_update
            except ValueError:
                # If the element is not found, try the current node's parent
                parent = current_node.getparent()
                # If we have exhausted the search via parent nodes, exit loop
                if current_node == parent:
                    break
                # Set current node to be its parent to continue the search
                current_node = parent
        # Warn if node_text is found in the html but we couldn't find the element's position
        log.warning(
            warning_msg + ' Could not trace element with this text or parent element after xpath match.')
    # No matches with the xpath search
    else:
        # Attempt to search for the html elements from position html_idx and onwards based on text
        for search_idx, search_elem in enumerate(html_to_update[html_idx:]):
            # If we found the element this way, exit loop and return the updated html_idx
            if search_elem.text and innerTrim(search_elem.text) in node_text:
                return html_idx + search_idx + 1, html_to_update
            # Whilst we are on this node, check its descendants; cannot use
            # self.parser.childNodesWithText() as this creates nodes with text and duplicates text in final html
            search_elem_children = self.parser.childNodes(search_elem)
            # Do same text check for each child and return updated html_idx if there's a match
            for search_elem_child in search_elem_children:
                if search_elem_child.text and innerTrim(search_elem_child.text) in node_text:
                    return html_idx + search_idx + 1, html_to_update
    # Warn if text originally included node_text because it would have been expected to appear in the final html
    if text_found:
        log.warning(warning_msg + ' Article text originally included this text.')
    # If we haven't returned an updated html_idx, then update html with element and return both index and html
    html_to_update.insert(html_idx, node_pre_parsed)
    return html_idx + 1, html_to_update

def convert_to_html(self):
cleaned_node = self.parser.clean_article_html(self.get_top_node())
def convert_to_html(self, node=None):
    """Serialize *node* (defaulting to the top node) after cleaning it."""
    target = node if node is not None else self.get_top_node()
    cleaned = self.parser.clean_article_html(target)
    return self.parser.nodeToString(cleaned)

def add_newline_to_br(self):
Expand Down