Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include all nodes with text #885

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
6 changes: 3 additions & 3 deletions newspaper/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,12 @@ def fulltext(html, language='en'):

extractor = ContentExtractor(config)
document_cleaner = DocumentCleaner(config)
output_formatter = OutputFormatter(config)
output_formatter = OutputFormatter(config, extractor)

doc = config.get_parser().fromstring(html)
doc = document_cleaner.clean(doc)

top_node = extractor.calculate_best_node(doc)
top_node, extra_nodes = extractor.calculate_best_node(doc)
top_node = extractor.post_cleanup(top_node)
text, article_html = output_formatter.get_formatted(top_node)
text, article_html = output_formatter.get_formatted(top_node, extra_nodes)
return text
7 changes: 3 additions & 4 deletions newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def parse(self):
self.link_hash = parse_candidate.link_hash # MD5

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)
output_formatter = OutputFormatter(self.config, self.extractor)

title = self.extractor.get_title(self.clean_doc)
self.set_title(title)
Expand Down Expand Up @@ -270,16 +270,15 @@ def parse(self):
# Before any computations on the body, clean DOM object
self.doc = document_cleaner.clean(self.doc)

self.top_node = self.extractor.calculate_best_node(self.doc)
self.top_node, extra_nodes = self.extractor.calculate_best_node(self.doc)
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())

self.top_node = self.extractor.post_cleanup(self.top_node)
self.clean_top_node = copy.deepcopy(self.top_node)

text, article_html = output_formatter.get_formatted(
self.top_node)
text, article_html = output_formatter.get_formatted(self.top_node, extra_nodes)
self.set_article_html(article_html)
self.set_text(text)

Expand Down
34 changes: 28 additions & 6 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,10 @@ def calculate_best_node(self, doc):
i = 0
parent_nodes = []
nodes_with_text = []
# A dictionary mapping each index of nodes_with_text (node a) to a list of
# indices into parent_nodes; the parent_nodes entries at those indices
# (nodes b, c, etc.) are the parents or grandparents of node a.
nodes_wtext_parent_map = {}

for node in nodes_to_check:
text_node = self.parser.getText(node)
Expand All @@ -791,7 +795,7 @@ def calculate_best_node(self, doc):
negative_scoring = 0
bottom_negativescore_nodes = float(nodes_number) * 0.25

for node in nodes_with_text:
for idx, node in enumerate(nodes_with_text):
boost_score = float(0)
# boost
if self.is_boostable(node):
Expand All @@ -817,7 +821,12 @@ def calculate_best_node(self, doc):
self.update_score(parent_node, upscore)
self.update_node_count(parent_node, 1)

# For this index in the loop, we haven't added any parent nodes yet
nodes_wtext_parent_map[idx] = []

if parent_node not in parent_nodes:
# Map this index of nodes_with_text to the index of the parent when added to parent_nodes
nodes_wtext_parent_map[idx].append(len(parent_nodes))
parent_nodes.append(parent_node)

# Parent of parent node
Expand All @@ -826,21 +835,34 @@ def calculate_best_node(self, doc):
self.update_node_count(parent_parent_node, 1)
self.update_score(parent_parent_node, upscore / 2)
if parent_parent_node not in parent_nodes:
# Update nodes_wtext_parent_map in same manner as before
nodes_wtext_parent_map[idx].append(len(parent_nodes))
parent_nodes.append(parent_parent_node)
cnt += 1
i += 1

top_node_score = 0
for e in parent_nodes:
# The index of top_node within parent_nodes
top_node_index = -1
for idx, e in enumerate(parent_nodes):
score = self.get_score(e)

if score > top_node_score:
if score > top_node_score or top_node is None:
top_node = e
top_node_index = idx
top_node_score = score

if top_node is None:
top_node = e
return top_node
# Nodes with text that are not related to top_node
unrelated_nodes_wtext = []
# Now that top_node has been determined, loop through parent mappings to populate unrelated_nodes_wtext
for nodes_wtext_idx in nodes_wtext_parent_map:
# Obtain the indices of the parents for this node (index)
parents = nodes_wtext_parent_map[nodes_wtext_idx]
# If this node is not related to the top node, add it to the list
if top_node_index not in parents:
unrelated_nodes_wtext.append(nodes_with_text[nodes_wtext_idx])

return top_node, unrelated_nodes_wtext

def is_boostable(self, node):
"""A lot of times the first paragraph might be the caption under an image
Expand Down
157 changes: 138 additions & 19 deletions newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,43 @@
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

from html import unescape
import logging

from copy import deepcopy
from html import unescape
from lxml import etree
from .text import innerTrim


log = logging.getLogger(__name__)


def _prepare_txt(txt):
    """Normalize *txt* into a list of trimmed strings ready for a text list.

    Returns an empty list when *txt* is empty or None.
    """
    if not txt:
        return []
    # NOTE(review): r'\n' splits on the literal two-character sequence
    # backslash-n, not on a newline — presumably intentional (innerTrim
    # appears to normalize whitespace first); confirm before changing.
    pieces = innerTrim(unescape(txt)).split(r'\n')
    return [piece.strip(' ') for piece in pieces]


# A small method to update a txts list with a given string list
def _update_text_list(txts, to_add, index=None):
if index is not None:
# If we are given an index, insert the list's elements at the specified index
txts[index:0] = to_add
else:
# Else add the list's elements to the end of txts
txts.extend(to_add)


class OutputFormatter(object):

def __init__(self, config):
def __init__(self, config, extractor):
self.top_node = None
self.config = config
self.extractor = extractor
self.parser = self.config.get_parser()
self.language = config.language
self.stopwords_class = config.stopwords_class
Expand All @@ -38,46 +61,142 @@ def update_language(self, meta_lang):
def get_top_node(self):
    # Accessor for the top node most recently passed to get_formatted().
    return self.top_node

def get_formatted(self, top_node):
def get_formatted(self, top_node, extra_nodes=None):
    """Returns the body text of an article, and also the body article
    html if specified. Returns in (text, html) form.

    ``extra_nodes`` is an optional list of additional text-bearing nodes
    (as produced by the extractor) to merge into the result.
    """
    # Fix: a mutable default argument ([]) is shared across calls in
    # Python; default to None and create a fresh list per call instead.
    if extra_nodes is None:
        extra_nodes = []
    self.top_node = top_node
    html, text = '', ''

    self.remove_negativescores_nodes()

    if self.config.keep_article_html:
        # NOTE(review): this value is overwritten by convert_to_text()
        # below; kept in case clean_article_html() has side effects on
        # the tree — confirm before removing.
        html = self.convert_to_html()

    # Take a copy of top_node before editing it further
    top_node_copy = deepcopy(self.top_node)
    self.links_to_text()
    self.add_newline_to_br()
    self.add_newline_to_li()
    self.replace_with_text()
    self.remove_empty_tags()
    self.remove_trailing_media_div()
    text, html = self.convert_to_text(extra_nodes, top_node_copy)
    return (text, html)

def convert_to_text(self):
def convert_to_text(self, extra_nodes, html_to_update):
    """Gather the text of the top node's children, then delegate to
    add_missing_text() to fold in any text from *extra_nodes* and
    produce the final (text, html) pair."""
    collected = []
    for child in list(self.get_top_node()):
        text = None
        try:
            text = self.parser.getText(child)
        except ValueError as err:  # lxml error
            log.info('%s ignoring lxml node error: %s', __title__, err)
        # Append this child's prepared text fragments to the running list
        collected.extend(_prepare_txt(text))
    return self.add_missing_text(collected, extra_nodes, html_to_update)

def add_missing_text(self, txts, extra_nodes, html_to_update):
    """A method to return (text, html) given the current text and html so far (txts list and html_to_update).
    The method uses extra_nodes to consider any text that needs to be added before returning final text and html."""
    # Keep track of the current index we are on for the text and html
    current_idx, html_idx = 0, 0
    # For each additional node we have...
    for extra in extra_nodes:
        # Ignore nodes without usable text or with a high link density.
        # Fix: the original `extra.text is None` check let an empty
        # string through, which made stripped_txts empty and crashed
        # at stripped_txts[0] below when keep_article_html was set.
        if not extra.text or self.extractor.is_highlink_density(extra):
            continue
        # Prepare the node's text if it were to be added; count the length of the list to be added
        stripped_txts = _prepare_txt(extra.text)
        txt_count = len(stripped_txts)
        # Check the text is not already within the final txts list
        match = set(stripped_txts).intersection(txts)
        node_found = bool(match)
        # In regards to the html, take a copy of this node before parsing any hyperlinks
        extra_pre_parsed = deepcopy(extra)
        self.parser.stripTags(extra, 'a')
        # If the text is already in the txts list, update current_idx to be where the node's text is + 1
        if node_found:
            # In case of multiple entries for this node's text, find the
            # max (latest) occurrence of the text within txts
            current_idx = max(txts.index(m) for m in match) + 1
        # If the current node's text has not been added to the final txts list
        else:
            # Reuse stripped_txts rather than recomputing _prepare_txt()
            _update_text_list(txts, stripped_txts, index=current_idx)
            # Update current_idx to be incremented by how many entries were added to txts
            current_idx += txt_count
        # Update the html if it should be updated
        if self.config.keep_article_html:
            html_idx, html_to_update = self.insert_missing_html(
                extra_pre_parsed, html_to_update, html_idx,
                node_found, stripped_txts[0])
    # Return final string based on txts list and html string
    return '\n\n'.join(txts), self.convert_to_html(html_to_update)

if txt:
txt = unescape(txt)
txt_lis = innerTrim(txt).split(r'\n')
txt_lis = [n.strip(' ') for n in txt_lis]
txts.extend(txt_lis)
return '\n\n'.join(txts)
def insert_missing_html(self, node_pre_parsed, html_to_update, html_idx, text_found, node_text):
    """A method that updates html by checking if node_text should be inserted into html_to_update. The method then
    returns the updated html and a new html-index being the position in html_to_update after the insertion.

    Fix: warnings now go through the module-level ``log`` logger
    (``logging.getLogger(__name__)``) instead of the root logger, for
    consistency with the rest of this module."""
    # Message to warn with in case of search errors or no results
    truncated = '\'' + node_text[:30] + '...\''
    warning_msg = 'Could not determine position of element with text ' + truncated \
        + ' Duplicates may occur in article html.'
    # Matching element(s) given node_text
    found_html = None
    try:
        # Do a starts-with search in case a sentence has been split
        found_html = html_to_update.xpath('//*[starts-with(text(), $nodetext)]', nodetext=node_text)
    except etree.XPathEvalError:
        log.warning(warning_msg + ' Error searching for text.')
    # If we found a match
    if found_html:
        # Report if multiple matches found
        if len(found_html) > 1:
            log.warning('Multiple matches for ' + truncated + ' in html, article html may be disordered.')
        # Flag to check if we found the match's position in html_to_update
        pos_found = False
        # The current node we are checking whilst finding the position
        current_node = found_html[0]
        # Whilst we haven't found the position and we still have a current_node to check
        while not pos_found and current_node is not None:
            try:
                # Attempt to find its position in relation to the rest of the html and return found index + 1
                html_idx = html_to_update.index(current_node)
                return html_idx + 1, html_to_update
            except ValueError:
                # If the element is not found, try the current node's parent
                parent = current_node.getparent()
                # If we have exhausted the search via parent nodes, exit loop
                if current_node == parent:
                    break
                # Set current node to be its parent to continue the search
                current_node = parent
        # Warn if node_text is found in the html but we couldn't find the element's position
        log.warning(
            warning_msg + ' Could not trace element with this text or parent element after xpath match.')
    # No matches with the xpath search
    else:
        # Attempt to search for the html elements from position html_idx and onwards based on text
        for search_idx, search_elem in enumerate(html_to_update[html_idx:]):
            # If we found the element this way, exit loop and return the updated html_idx
            if search_elem.text and innerTrim(search_elem.text) in node_text:
                return html_idx + search_idx + 1, html_to_update
            # Whilst we are on this node, check its descendants; cannot use
            # self.parser.childNodesWithText() as this creates nodes with text and duplicates text in final html
            search_elem_children = self.parser.childNodes(search_elem)
            # Do same text check for each child and return updated html_idx if there's a match
            for search_elem_child in search_elem_children:
                if search_elem_child.text and innerTrim(search_elem_child.text) in node_text:
                    return html_idx + search_idx + 1, html_to_update
    # Warn if text originally included node_text because it would have been expected to appear in the final html
    if text_found:
        log.warning(warning_msg + ' Article text originally included this text.')
    # If we haven't returned an updated html_idx, then update html with element and return both index and html
    html_to_update.insert(html_idx, node_pre_parsed)
    return html_idx + 1, html_to_update

def convert_to_html(self):
cleaned_node = self.parser.clean_article_html(self.get_top_node())
def convert_to_html(self, node=None):
    """Serialize *node* (defaulting to the top node) after cleaning it."""
    target = node if node is not None else self.get_top_node()
    cleaned = self.parser.clean_article_html(target)
    return self.parser.nodeToString(cleaned)

def add_newline_to_br(self):
Expand Down