Allow newspaper to work on news websites like carbon-pulse #975

Open · wants to merge 7 commits into master
2 changes: 2 additions & 0 deletions newspaper/configuration.py
@@ -11,6 +11,7 @@
 __copyright__ = 'Copyright 2014, Lucas Ou-Yang'
 
 import logging
+import requests
 
 from .parsers import Parser
 from .text import (StopWords, StopWordsArabic, StopWordsChinese,
@@ -63,6 +64,7 @@ def __init__(self):
         # Unique stopword classes for oriental languages, don't toggle
         self.stopwords_class = StopWords
 
+        self.session = requests.Session()
         self.browser_user_agent = 'newspaper/%s' % __version__
         self.headers = {}
         self.request_timeout = 7
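For context, a minimal sketch of what the shared `requests.Session` on the configuration enables, assuming the `Config` name exported by `newspaper` and the `session` attribute added in this change; the header and retry setup below is purely illustrative, not part of the library:

```python
import newspaper
from requests.adapters import HTTPAdapter

# Build a configuration whose session carries custom headers and a
# retrying adapter; every download made through this config can then
# reuse the same pooled connections (assuming this PR's config.session).
config = newspaper.Config()
config.session.headers.update({'Accept-Language': 'en-US,en;q=0.9'})
config.session.mount('https://', HTTPAdapter(max_retries=3))

paper = newspaper.build('https://carbon-pulse.com',
                        config=config, memoize_articles=False)
print(paper.size())
```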
29 changes: 20 additions & 9 deletions newspaper/extractors.py
@@ -681,18 +681,29 @@ def get_category_urls(self, source_url, doc):
                                'subdomain' % p_url))
                     continue
                 else:
-                    valid_categories.append(scheme + '://' + domain)
-                    # TODO account for case where category is in form
-                    # http://subdomain.domain.tld/category/ <-- still legal!
+                    if subdomain_contains:
+                        valid_categories.append(scheme + '://' + domain)
+                        # TODO account for case where category is in form
+                        # http://subdomain.domain.tld/category/ <-- still legal!
+                    else:
+                        # support for urls like
+                        # http://domain.tld/category/[category]
+                        path_chunks = [x for x in path.split('/') if len(x) > 0]
+                        path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                        path_chunks = [x for x in path_chunks if not x.isnumeric()]
+
+                        if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
+                            valid_categories.append('//' + domain + path)
             else:
                 # we want a path with just one subdir
                 # cnn.com/world and cnn.com/world/ are both valid_categories
+                # carbon-pulse.com/category/international/ is a valid_categories
                 path_chunks = [x for x in path.split('/') if len(x) > 0]
-                if 'index.html' in path_chunks:
-                    path_chunks.remove('index.html')
+                path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                path_chunks = [x for x in path_chunks if not x.isnumeric()]
 
                 if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
-                    valid_categories.append(domain + path)
+                    valid_categories.append(path)
                 else:
                     if self.config.verbose:
                         print(('elim category url %s for >1 path chunks '
@@ -709,8 +720,9 @@ get_category_urls(self, source_url, doc):
             'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
             'howto', 'how to', 'faq', 'terms', 'charts', 'services',
             'contact', 'plus', 'admin', 'login', 'signup', 'register',
-            'developer', 'proxy']
+            'developer', 'proxy', 'what-we-offer', 'staff']
 
+        valid_categories = list(set(valid_categories))
         _valid_categories = []
 
         # TODO Stop spamming urlparse and tldextract calls...
@@ -747,8 +759,7 @@ get_category_urls(self, source_url, doc):
 
         _valid_categories = list(set(_valid_categories))
 
-        category_urls = [urls.prepare_url(p_url, source_url)
-                         for p_url in _valid_categories]
+        category_urls = [urls.prepare_url(p_url, source_url) for p_url in _valid_categories]
         category_urls = [c for c in category_urls if c is not None]
         return category_urls
 
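To make the intent of the new path filtering easier to see, here is a small standalone sketch of the same chunking logic (a re-expression of the diff above, not the library code itself), applied to two representative category URLs:

```python
from urllib.parse import urlparse

def category_path_chunks(url):
    """Mirror of the filtering added above: split the path, drop
    'index.html' and 'category' segments, and drop purely numeric ones."""
    path = urlparse(url).path
    chunks = [x for x in path.split('/') if len(x) > 0]
    chunks = [x for x in chunks if x not in {'index.html', 'category'}]
    chunks = [x for x in chunks if not x.isnumeric()]
    return chunks

for url in ('http://cnn.com/world/',
            'http://carbon-pulse.com/category/international/'):
    chunks = category_path_chunks(url)
    # A URL stays a category candidate when exactly one short chunk remains.
    print(url, chunks, len(chunks) == 1 and len(chunks[0]) < 14)
```

Both URLs reduce to a single short chunk ('world' and 'international'), so carbon-pulse's `/category/...` pages are now kept alongside the cnn.com-style single-subdirectory categories.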
6 changes: 4 additions & 2 deletions newspaper/network.py
@@ -55,11 +55,12 @@ def get_html_2XX_only(url, config=None, response=None):
     timeout = config.request_timeout
     proxies = config.proxies
     headers = config.headers
+    session = config.session
 
     if response is not None:
         return _get_html_from_response(response, config)
 
-    response = requests.get(
+    response = session.get(
         url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
 
     html = _get_html_from_response(response, config)
@@ -102,11 +103,12 @@ def __init__(self, url, config=None):
         self.timeout = config.request_timeout
         self.proxies = config.proxies
         self.headers = config.headers
+        self.session = config.session
         self.resp = None
 
     def send(self):
         try:
-            self.resp = requests.get(self.url, **get_request_kwargs(
+            self.resp = self.session.get(self.url, **get_request_kwargs(
                 self.timeout, self.useragent, self.proxies, self.headers))
             if self.config.http_success_only:
                 self.resp.raise_for_status()
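One practical consequence of routing both download paths through `config.session` is that cookies persist between requests, which is what a login-protected source such as carbon-pulse needs. A rough sketch of that flow, assuming the `config.session` introduced here is reused by `newspaper.build`; the login URL and form field names are placeholders, not the real carbon-pulse endpoint:

```python
import newspaper

config = newspaper.Config()

# Hypothetical login step: the session keeps whatever cookies the site
# sets, and the same session is then reused by get_html_2XX_only() and
# MRequest.send() for every subsequent download.
config.session.post(
    'https://example-paywalled-site.test/login',      # placeholder URL
    data={'username': 'me', 'password': 'secret'})    # placeholder fields

paper = newspaper.build('https://example-paywalled-site.test',
                        config=config, memoize_articles=False)
for article in paper.articles[:5]:
    article.download()
    article.parse()
    print(article.title)
```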
5 changes: 5 additions & 0 deletions newspaper/urls.py
@@ -211,6 +211,11 @@ def valid_url(url, verbose=False, test=False):
         if verbose: print('%s verified for being a slug' % url)
         return True
 
+    # Allow for paths like /[numeric] (eg: https://carbon-pulse.com/226570/)
+    if len(path_chunks) == 1 and path_chunks[0].isnumeric():
+        if verbose: print('%s verified for isnumeric' % url)
+        return True
+
     # There must be at least 2 subpaths
     if len(path_chunks) <= 1:
         if verbose: print('%s caught for path chunks too small' % url)
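The effect of the new branch is easiest to check in isolation; a minimal standalone sketch of the added test, rather than a call into `valid_url` (which applies several other checks before and after this one):

```python
# Standalone illustration of the check added above: a single, purely
# numeric path segment (carbon-pulse style article IDs) is now accepted
# as a plausible article URL instead of being rejected for having too
# few path chunks.
def has_single_numeric_chunk(path):
    path_chunks = [x for x in path.split('/') if len(x) > 0]
    return len(path_chunks) == 1 and path_chunks[0].isnumeric()

print(has_single_numeric_chunk('/226570/'))         # True  -> verified for isnumeric
print(has_single_numeric_chunk('/2015/01/226570'))  # False -> falls through to later checks
```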
2 changes: 2 additions & 0 deletions newspaper/utils.py
@@ -235,6 +235,7 @@ def inner_function(*args, **kwargs):
             # call the decorated function...
             result = function(*args, **kwargs)
             # ... and save the cached object for next time
+            os.makedirs(cache_folder, exist_ok=True)
             pickle.dump(result, open(filepath, "wb"))
             return result
         return inner_function
@@ -324,6 +325,7 @@ def memoize_articles(source, articles):
         memo_text = ''
 
     # TODO if source: source.write_upload_times(prev_length, new_length)
+    os.makedirs(settings.MEMO_DIR, exist_ok=True)
     ff = codecs.open(d_pth, 'w', 'utf-8')
     ff.write(memo_text)
     ff.close()
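Both hunks guard against the cache or memo directory not existing yet. A tiny sketch of why `exist_ok=True` is the right flag here (generic paths, not the library's actual settings):

```python
import os
import tempfile

# Creating the directory with exist_ok=True is a no-op when it already
# exists, so the later open()/pickle.dump() calls no longer fail with
# FileNotFoundError on a fresh environment where the cache directory
# has not been created.
cache_folder = os.path.join(tempfile.gettempdir(), 'newspaper-demo-cache')
os.makedirs(cache_folder, exist_ok=True)   # first call creates it
os.makedirs(cache_folder, exist_ok=True)   # second call is harmless

with open(os.path.join(cache_folder, 'memoized'), 'w') as fh:
    fh.write('http://example.com/article')
```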