Allow newspaper to work on news websites like carbon-pulse #975

Open · wants to merge 7 commits into master
2 changes: 2 additions & 0 deletions newspaper/configuration.py
@@ -11,6 +11,7 @@
 __copyright__ = 'Copyright 2014, Lucas Ou-Yang'
 
 import logging
+import requests
 
 from .parsers import Parser
 from .text import (StopWords, StopWordsArabic, StopWordsChinese,
@@ -63,6 +64,7 @@ def __init__(self):
         # Unique stopword classes for oriental languages, don't toggle
         self.stopwords_class = StopWords
 
+        self.session = requests.Session()
         self.browser_user_agent = 'newspaper/%s' % __version__
         self.headers = {}
         self.request_timeout = 7
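For context, a minimal sketch of what the shared `requests.Session` on the configuration enables, assuming the `Config` name exported by `newspaper` and the `session` attribute added in this change; the header and retry setup below is purely illustrative, not part of the library:

```python
import newspaper
from requests.adapters import HTTPAdapter

# Build a configuration whose session carries custom headers and a
# retrying adapter; every download made through this config can then
# reuse the same pooled connections (assuming this PR's config.session).
config = newspaper.Config()
config.session.headers.update({'Accept-Language': 'en-US,en;q=0.9'})
config.session.mount('https://', HTTPAdapter(max_retries=3))

paper = newspaper.build('https://carbon-pulse.com',
                        config=config, memoize_articles=False)
print(paper.size())
```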
29 changes: 20 additions & 9 deletions newspaper/extractors.py
@@ -681,18 +681,29 @@ def get_category_urls(self, source_url, doc):
                                'subdomain' % p_url))
                     continue
                 else:
-                    valid_categories.append(scheme + '://' + domain)
-                    # TODO account for case where category is in form
-                    # http://subdomain.domain.tld/category/ <-- still legal!
+                    if subdomain_contains:
+                        valid_categories.append(scheme + '://' + domain)
+                        # TODO account for case where category is in form
+                        # http://subdomain.domain.tld/category/ <-- still legal!
+                    else:
+                        # support for urls like
+                        # http://domain.tld/category/[category]
+                        path_chunks = [x for x in path.split('/') if len(x) > 0]
+                        path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                        path_chunks = [x for x in path_chunks if not x.isnumeric()]
+
+                        if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
+                            valid_categories.append('//' + domain + path)
             else:
                 # we want a path with just one subdir
                 # cnn.com/world and cnn.com/world/ are both valid_categories
+                # carbon-pulse.com/category/international/ is a valid_categories
                 path_chunks = [x for x in path.split('/') if len(x) > 0]
-                if 'index.html' in path_chunks:
-                    path_chunks.remove('index.html')
+                path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                path_chunks = [x for x in path_chunks if not x.isnumeric()]
 
                 if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
-                    valid_categories.append(domain + path)
+                    valid_categories.append(path)
                 else:
                     if self.config.verbose:
                         print(('elim category url %s for >1 path chunks '
@@ -709,8 +720,9 @@ get_category_urls(self, source_url, doc):
             'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
             'howto', 'how to', 'faq', 'terms', 'charts', 'services',
             'contact', 'plus', 'admin', 'login', 'signup', 'register',
-            'developer', 'proxy']
+            'developer', 'proxy', 'what-we-offer', 'staff']
 
+        valid_categories = list(set(valid_categories))
         _valid_categories = []
 
         # TODO Stop spamming urlparse and tldextract calls...
@@ -747,8 +759,7 @@ get_category_urls(self, source_url, doc):
 
         _valid_categories = list(set(_valid_categories))
 
-        category_urls = [urls.prepare_url(p_url, source_url)
-                         for p_url in _valid_categories]
+        category_urls = [urls.prepare_url(p_url, source_url) for p_url in _valid_categories]
         category_urls = [c for c in category_urls if c is not None]
         return category_urls
 
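To make the intent of the new path filtering easier to see, here is a small standalone sketch of the same chunking logic (a re-expression of the diff above, not the library code itself), applied to two representative category URLs:

```python
from urllib.parse import urlparse

def category_path_chunks(url):
    """Mirror of the filtering added above: split the path, drop
    'index.html' and 'category' segments, and drop purely numeric ones."""
    path = urlparse(url).path
    chunks = [x for x in path.split('/') if len(x) > 0]
    chunks = [x for x in chunks if x not in {'index.html', 'category'}]
    chunks = [x for x in chunks if not x.isnumeric()]
    return chunks

for url in ('http://cnn.com/world/',
            'http://carbon-pulse.com/category/international/'):
    chunks = category_path_chunks(url)
    # A URL stays a category candidate when exactly one short chunk remains.
    print(url, chunks, len(chunks) == 1 and len(chunks[0]) < 14)
```

Both URLs reduce to a single short chunk ('world' and 'international'), so carbon-pulse's `/category/...` pages are now kept alongside the cnn.com-style single-subdirectory categories.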
6 changes: 4 additions & 2 deletions newspaper/network.py
@@ -55,11 +55,12 @@ def get_html_2XX_only(url, config=None, response=None):
     timeout = config.request_timeout
     proxies = config.proxies
     headers = config.headers
+    session = config.session
 
     if response is not None:
         return _get_html_from_response(response, config)
 
-    response = requests.get(
+    response = session.get(
         url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
 
     html = _get_html_from_response(response, config)
@@ -102,11 +103,12 @@ def __init__(self, url, config=None):
         self.timeout = config.request_timeout
         self.proxies = config.proxies
         self.headers = config.headers
+        self.session = config.session
         self.resp = None
 
     def send(self):
         try:
-            self.resp = requests.get(self.url, **get_request_kwargs(
+            self.resp = self.session.get(self.url, **get_request_kwargs(
                 self.timeout, self.useragent, self.proxies, self.headers))
             if self.config.http_success_only:
                 self.resp.raise_for_status()
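One practical consequence of routing both download paths through `config.session` is that cookies persist between requests, which is what a login-protected source such as carbon-pulse needs. A rough sketch of that flow, assuming the `config.session` introduced here is reused by `newspaper.build`; the login URL and form field names are placeholders, not the real carbon-pulse endpoint:

```python
import newspaper

config = newspaper.Config()

# Hypothetical login step: the session keeps whatever cookies the site
# sets, and the same session is then reused by get_html_2XX_only() and
# MRequest.send() for every subsequent download.
config.session.post(
    'https://example-paywalled-site.test/login',      # placeholder URL
    data={'username': 'me', 'password': 'secret'})    # placeholder fields

paper = newspaper.build('https://example-paywalled-site.test',
                        config=config, memoize_articles=False)
for article in paper.articles[:5]:
    article.download()
    article.parse()
    print(article.title)
```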
5 changes: 5 additions & 0 deletions newspaper/urls.py
@@ -211,6 +211,11 @@ def valid_url(url, verbose=False, test=False):
         if verbose: print('%s verified for being a slug' % url)
         return True
 
+    # Allow for paths like /[numeric] (eg: https://carbon-pulse.com/226570/)
+    if len(path_chunks) == 1 and path_chunks[0].isnumeric():
+        if verbose: print('%s verified for isnumeric' % url)
+        return True
+
     # There must be at least 2 subpaths
     if len(path_chunks) <= 1:
         if verbose: print('%s caught for path chunks too small' % url)
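The effect of the new branch is easiest to check in isolation; a minimal standalone sketch of the added test, rather than a call into `valid_url` (which applies several other checks before and after this one):

```python
# Standalone illustration of the check added above: a single, purely
# numeric path segment (carbon-pulse style article IDs) is now accepted
# as a plausible article URL instead of being rejected for having too
# few path chunks.
def has_single_numeric_chunk(path):
    path_chunks = [x for x in path.split('/') if len(x) > 0]
    return len(path_chunks) == 1 and path_chunks[0].isnumeric()

print(has_single_numeric_chunk('/226570/'))         # True  -> verified for isnumeric
print(has_single_numeric_chunk('/2015/01/226570'))  # False -> falls through to later checks
```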
2 changes: 2 additions & 0 deletions newspaper/utils.py
@@ -235,6 +235,7 @@ def inner_function(*args, **kwargs):
             # call the decorated function...
             result = function(*args, **kwargs)
             # ... and save the cached object for next time
+            os.makedirs(cache_folder, exist_ok=True)
             pickle.dump(result, open(filepath, "wb"))
             return result
         return inner_function
@@ -324,6 +325,7 @@ def memoize_articles(source, articles):
         memo_text = ''
 
     # TODO if source: source.write_upload_times(prev_length, new_length)
+    os.makedirs(settings.MEMO_DIR, exist_ok=True)
     ff = codecs.open(d_pth, 'w', 'utf-8')
     ff.write(memo_text)
     ff.close()
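Both hunks guard against the cache or memo directory not existing yet. A tiny sketch of why `exist_ok=True` is the right flag here (generic paths, not the library's actual settings):

```python
import os
import tempfile

# Creating the directory with exist_ok=True is a no-op when it already
# exists, so the later open()/pickle.dump() calls no longer fail with
# FileNotFoundError on a fresh environment where the cache directory
# has not been created.
cache_folder = os.path.join(tempfile.gettempdir(), 'newspaper-demo-cache')
os.makedirs(cache_folder, exist_ok=True)   # first call creates it
os.makedirs(cache_folder, exist_ok=True)   # second call is harmless

with open(os.path.join(cache_folder, 'memoized'), 'w') as fh:
    fh.write('http://example.com/article')
```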