New function for handling metadata #187

Open
wants to merge 3 commits into master
1 change: 1 addition & 0 deletions README.md
@@ -49,6 +49,7 @@ Photon can extract the following data while crawling:
- JavaScript files & Endpoints present in them
- Strings matching custom regex pattern
- Subdomains & DNS related data
- Metadata from sites

The extracted information is saved in an organized manner or can be [exported as json](https://github.com/s0md3v/Photon/wiki/Usage#export-formatted-result).

60 changes: 60 additions & 0 deletions core/metadata.py
@@ -0,0 +1,60 @@
import extruct


def extract_metadata(text, url):
"""Extract all metadata present in the page and return a dictionary of metadata lists.

Args:
url (string): URL of page from which to extract metadata.

Returns:
metadata (dict): Dictionary of json-ld, microdata, and opengraph lists.
Each of the lists present within the dictionary contains multiple dictionaries.
"""

    metadata = extruct.extract(text,
                               base_url=url,
                               uniform=True,
                               syntaxes=['json-ld',
                                         'microdata',
                                         'opengraph'])
    return metadata


def get_dictionary_by_key_value(dictionary, target_key, target_value):
"""Return a dictionary that contains a target key value pair.

Args:
dictionary: Metadata dictionary containing lists of other dictionaries.
target_key: Target key to search for within a dictionary inside a list.
target_value: Target value to search for within a dictionary inside a list.

Returns:
target_dictionary: Target dictionary that contains target key value pair.
"""
result = None
for key in dictionary:
if len(dictionary[key]) > 0:
for item in dictionary[key]:
if item[target_key] == target_value:
return item
return result


def get_dictionary_by_key(dictionary, target_key):
"""Return a dictionary that contains a target key.

Args:
dictionary: Metadata dictionary containing lists of other dictionaries.
target_key: Target key to search for within a dictionary inside a list.

Returns:
target_dictionary: Target dictionary that contains target key value pair.
"""
result = None
for key in dictionary:
if len(dictionary[key]) > 0:
for item in dictionary[key]:
if item[target_key]:
return item
return result
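Taken together, a minimal usage sketch of the new helpers (the URL, the use of `requests`, and the `Article` type are illustrative assumptions; real results depend on each page's markup):

```python
import requests  # for illustration only; Photon itself fetches pages via core.requester

from core.metadata import extract_metadata, get_dictionary_by_key_value

url = 'https://example.com/article'  # hypothetical page with structured-data markup
html = requests.get(url).text

# Dictionary of the form {'json-ld': [...], 'microdata': [...], 'opengraph': [...]}
metadata = extract_metadata(html, url)

# Find the first extracted dictionary that declares itself an Article
article = get_dictionary_by_key_value(metadata, '@type', 'Article')
if article is not None:
    print(article.get('headline'))
```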
15 changes: 12 additions & 3 deletions photon.py
@@ -36,6 +36,7 @@
from core.prompt import prompt
from core.requester import requester
from core.updater import updater
from core.metadata import extract_metadata, get_dictionary_by_key_value, get_dictionary_by_key
from core.utils import (luhn,
proxy_type,
is_good_proxy,
@@ -80,6 +81,7 @@
type=float)
parser.add_argument('-p', '--proxy', help='Proxy server IP:PORT or DOMAIN:PORT', dest='proxies',
type=proxy_type)
parser.add_argument('-m', '--metadata', help='extract page metadata', dest='metadata', action='store_true')

# Switches
parser.add_argument('--clone', help='clone the website locally', dest='clone',
@@ -142,6 +144,7 @@
crawl_level = args.level or 2 # Crawling level
thread_count = args.threads or 2 # Number of threads
only_urls = bool(args.only_urls) # Only URLs mode is off by default
has_metadata = bool(args.metadata) # Metadata extraction is off by default

# Variables we are gonna use later to store stuff
keys = set() # High entropy strings, prolly secret keys
@@ -158,6 +161,7 @@
processed = set(['dummy']) # URLs that have been crawled
# URLs that belong to the target i.e. in-scope
internal = set(args.seeds)
metadata = [] # Extracted page metadata; a list, since the extracted dicts aren't hashable

everything = []
bad_scripts = set() # Unclean javascript file urls
@@ -239,6 +243,11 @@ def remove_file(url):
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)

    # Collect page metadata when the --metadata switch is set
    if has_metadata:
        metadata.append(extract_metadata(response, url))

    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
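For reference, each entry appended to `metadata` is one `extract_metadata()` result for a crawled page. A hypothetical entry might look like the following (the exact keys come from the page's own markup, and the uniform OpenGraph shape shown here is an assumption):

```python
# Hypothetical shape of a single extract_metadata() result
entry = {
    'json-ld': [{'@context': 'https://schema.org',
                 '@type': 'Article',
                 'headline': 'Example headline'}],
    'microdata': [],
    'opengraph': [{'@type': 'website',
                   'og:title': 'Example title'}],
}
```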
@@ -378,9 +387,9 @@ def jscanner(url):
os.mkdir(output_dir) # create a new directory

datasets = [files, intel, robots, custom, failed, internal, scripts,
external, fuzzable, endpoints, keys]
external, fuzzable, endpoints, keys, metadata]
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
'scripts', 'external', 'fuzzable', 'endpoints', 'keys', 'metadata']

writer(datasets, dataset_names, output_dir)
# Printing out results
@@ -399,7 +408,7 @@ def jscanner(url):
'custom': list(custom), 'failed': list(failed), 'internal': list(internal),
'scripts': list(scripts), 'external': list(external),
'fuzzable': list(fuzzable), 'endpoints': list(endpoints),
'keys': list(keys)
'keys': list(keys), 'metadata': list(metadata)
}

if args.dns:
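With these changes, metadata extraction stays opt-in: a run along the lines of `python photon.py -u "https://example.com" -m` (using Photon's existing `-u`/`--url` option for the seed URL) would collect one metadata entry per crawled page, write it alongside the other datasets in the output directory, and include it under the `metadata` key when the result is exported as JSON.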