New function for handling metadata #187

Open
wants to merge 3 commits into master
1 change: 1 addition & 0 deletions README.md
@@ -49,6 +49,7 @@ Photon can extract the following data while crawling:
- JavaScript files & Endpoints present in them
- Strings matching custom regex pattern
- Subdomains & DNS related data
- Metadata from sites

The extracted information is saved in an organized manner or can be [exported as json](https://github.com/s0md3v/Photon/wiki/Usage#export-formatted-result).

60 changes: 60 additions & 0 deletions core/metadata.py
@@ -0,0 +1,60 @@
import extruct


def extract_metadata(text, url):
"""Extract all metadata present in the page and return a dictionary of metadata lists.

Args:
url (string): URL of page from which to extract metadata.

Returns:
metadata (dict): Dictionary of json-ld, microdata, and opengraph lists.
Each of the lists present within the dictionary contains multiple dictionaries.
"""

    metadata = extruct.extract(text,
                               base_url=url,
                               uniform=True,
                               syntaxes=['json-ld',
                                         'microdata',
                                         'opengraph'])
    return metadata


def get_dictionary_by_key_value(dictionary, target_key, target_value):
"""Return a dictionary that contains a target key value pair.

Args:
dictionary: Metadata dictionary containing lists of other dictionaries.
target_key: Target key to search for within a dictionary inside a list.
target_value: Target value to search for within a dictionary inside a list.

Returns:
target_dictionary: Target dictionary that contains target key value pair.
"""
result = None
for key in dictionary:
if len(dictionary[key]) > 0:
for item in dictionary[key]:
if item[target_key] == target_value:
return item
return result


def get_dictionary_by_key(dictionary, target_key):
"""Return a dictionary that contains a target key.

Args:
dictionary: Metadata dictionary containing lists of other dictionaries.
target_key: Target key to search for within a dictionary inside a list.

Returns:
target_dictionary: Target dictionary that contains target key value pair.
"""
result = None
for key in dictionary:
if len(dictionary[key]) > 0:
for item in dictionary[key]:
if item[target_key]:
return item
return result
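Taken together, a minimal usage sketch of the new helpers (the URL, the use of `requests`, and the `Article` type are illustrative assumptions; real results depend on each page's markup):

```python
import requests  # for illustration only; Photon itself fetches pages via core.requester

from core.metadata import extract_metadata, get_dictionary_by_key_value

url = 'https://example.com/article'  # hypothetical page with structured-data markup
html = requests.get(url).text

# Dictionary of the form {'json-ld': [...], 'microdata': [...], 'opengraph': [...]}
metadata = extract_metadata(html, url)

# Find the first extracted dictionary that declares itself an Article
article = get_dictionary_by_key_value(metadata, '@type', 'Article')
if article is not None:
    print(article.get('headline'))
```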
15 changes: 12 additions & 3 deletions photon.py
@@ -36,6 +36,7 @@
from core.prompt import prompt
from core.requester import requester
from core.updater import updater
from core.metadata import extract_metadata, get_dictionary_by_key_value, get_dictionary_by_key
from core.utils import (luhn,
proxy_type,
is_good_proxy,
@@ -80,6 +81,7 @@
type=float)
parser.add_argument('-p', '--proxy', help='Proxy server IP:PORT or DOMAIN:PORT', dest='proxies',
type=proxy_type)
parser.add_argument('-m', '--metadata', help='extract page metadata', dest='metadata', action='store_true')

# Switches
parser.add_argument('--clone', help='clone the website locally', dest='clone',
@@ -142,6 +144,7 @@
crawl_level = args.level or 2 # Crawling level
thread_count = args.threads or 2 # Number of threads
only_urls = bool(args.only_urls) # Only URLs mode is off by default
has_metadata = bool(args.metadata) # Metadata extraction is off by default

# Variables we are gonna use later to store stuff
keys = set() # High entropy strings, prolly secret keys
@@ -158,6 +161,7 @@
processed = set(['dummy']) # URLs that have been crawled
# URLs that belong to the target i.e. in-scope
internal = set(args.seeds)
metadata = [] # Extracted page metadata; a list, since the extracted dicts aren't hashable

everything = []
bad_scripts = set() # Unclean javascript file urls
@@ -239,6 +243,11 @@ def remove_file(url):
def extractor(url):
    """Extract details from the response body."""
    response = requester(url, main_url, delay, cook, headers, timeout, host, proxies, user_agents, failed, processed)

    # Collect page metadata when the --metadata switch is set
    if has_metadata:
        metadata.append(extract_metadata(response, url))

    if clone:
        mirror(url, response)
    matches = rhref.findall(response)
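For reference, each entry appended to `metadata` is one `extract_metadata()` result for a crawled page. A hypothetical entry might look like the following (the exact keys come from the page's own markup, and the uniform OpenGraph shape shown here is an assumption):

```python
# Hypothetical shape of a single extract_metadata() result
entry = {
    'json-ld': [{'@context': 'https://schema.org',
                 '@type': 'Article',
                 'headline': 'Example headline'}],
    'microdata': [],
    'opengraph': [{'@type': 'website',
                   'og:title': 'Example title'}],
}
```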
@@ -378,9 +387,9 @@ def jscanner(url):
os.mkdir(output_dir) # create a new directory

datasets = [files, intel, robots, custom, failed, internal, scripts,
external, fuzzable, endpoints, keys]
external, fuzzable, endpoints, keys, metadata]
dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal',
'scripts', 'external', 'fuzzable', 'endpoints', 'keys']
'scripts', 'external', 'fuzzable', 'endpoints', 'keys', 'metadata']

writer(datasets, dataset_names, output_dir)
# Printing out results
@@ -399,7 +408,7 @@ def jscanner(url):
'custom': list(custom), 'failed': list(failed), 'internal': list(internal),
'scripts': list(scripts), 'external': list(external),
'fuzzable': list(fuzzable), 'endpoints': list(endpoints),
'keys': list(keys)
'keys': list(keys), 'metadata': list(metadata)
}

if args.dns:
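With these changes, metadata extraction stays opt-in: a run along the lines of `python photon.py -u "https://example.com" -m` (using Photon's existing `-u`/`--url` option for the seed URL) would collect one metadata entry per crawled page, write it alongside the other datasets in the output directory, and include it under the `metadata` key when the result is exported as JSON.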