-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsite_list_parse.py
109 lines (82 loc) · 3.84 KB
/
site_list_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
from domain import Domain
import time
import threading
class Site_List_Parse():
    """Scan every domain named in a text file and write the results as JSON.

    The input file is expected to hold one domain per line. Each distinct,
    non-blank entry is scanned on its own thread via ``get_info_from_url``;
    the per-domain results are collected in ``output_dict`` (keyed by domain)
    and dumped to ``path_to_output`` once every thread has finished.
    """

    # Seconds to wait after starting each worker thread. The original author
    # observed that, without this stagger, some IP lookups came back
    # incomplete; the root cause is not visible from this file.
    THREAD_START_DELAY = 0.2

    def __init__(self, path_to_url_list: str, path_to_output: str) -> None:
        """Load the domain list and prepare empty result containers.

        Args:
            path_to_url_list: Path to a text file with one domain per line.
            path_to_output: Path of the JSON file that ``parse`` will write.

        Raises:
            OSError: If ``path_to_url_list`` cannot be opened for reading.
        """
        self.path_to_url_list = path_to_url_list
        self.url_list = []
        # Entries are stripped *before* the duplicate check so that "a.com\n"
        # and a final "a.com" without a newline count as the same domain.
        # Two threads scanning the same domain must never be started.
        seen = set()
        with open(path_to_url_list, "r") as u:
            for line in u:
                url = line.strip()
                if url and url not in seen:
                    seen.add(url)
                    self.url_list.append(url)
        self.path_to_output = path_to_output
        self.output_dict = {}
        self.threads = set()

    def parse(self) -> None:
        """Scan every loaded domain concurrently and write the JSON report.

        One thread per domain is started (staggered by
        ``THREAD_START_DELAY``), all threads are joined, and ``output_dict``
        is written to ``path_to_output`` with sorted keys and 4-space
        indentation. Afterwards a diagnostic line is printed for every
        domain that produced no entry.

        Raises:
            OSError: If ``path_to_output`` cannot be opened for writing.
        """
        with open(self.path_to_output, "w") as o:
            for url in self.url_list:
                # Entries are already stripped by __init__, but url_list is a
                # public attribute, so tolerate callers that edited it.
                url = url.strip()
                if not url:
                    continue
                thread = threading.Thread(target=self.get_info_from_url, args=[url])
                thread.start()
                self.threads.add(thread)
                time.sleep(self.THREAD_START_DELAY)
            for thread in self.threads:
                thread.join()
            # put the dict in human readable format in the output file
            json.dump(self.output_dict, o, sort_keys=True, indent=4)
        # Report domains with no result: get_info_from_url returns early for
        # invalid URLs, and a worker that raised never stores an entry.
        for u in self.url_list:
            if u.strip() not in self.output_dict:
                print(f"why isn't {u} in the dict?")

    @staticmethod
    def _unique(items):
        """Return ``items`` as a list without repeats, keeping first-seen order.

        Uses equality rather than hashing so unhashable entries are accepted.
        """
        unique = []
        for item in items:
            if item not in unique:
                unique.append(item)
        return unique

    def get_info_from_url(self, url):
        """Collect all scan results for ``url`` and store them in ``output_dict``.

        Nothing is stored when ``Domain.is_valid_url()`` reports the URL as
        invalid. Otherwise ``output_dict[url]`` is set to a dict with the keys
        ``scan_time``, ``ipv4_addresses``, ``ipv6_addresses``, ``rdns_names``,
        ``http_server``, ``geo_locations``, ``insecure_http``,
        ``redirect_to_https``, ``hsts``, ``tls_versions`` and ``root_ca``.

        Args:
            url: The domain to scan (already stripped of whitespace).
        """
        epoch_time = time.time()
        site = Domain(url)
        if not site.is_valid_url():
            # No entry is recorded for an invalid website.
            return
        ipv4s, ipv6s, rdns_names = site.get_ip_addresses()
        server = site.get_server_type()

        # The address lists are always present, and are empty when nothing
        # was found. Falsy addresses and empty rDNS names are dropped.
        # NOTE(review): entries are assumed to be JSON-serialisable values
        # (presumably strings) - confirm against Domain.get_ip_addresses().
        url_dict = {
            "scan_time": epoch_time,
            "ipv4_addresses": self._unique(ip for ip in ipv4s if ip),
            "ipv6_addresses": self._unique(ip for ip in ipv6s if ip),
            "rdns_names": self._unique(name for name in rdns_names if name != ""),
            "http_server": server if server else None,
        }

        # Geo lookup is driven by the de-duplicated IPv4 list.
        url_dict['geo_locations'] = site.get_geo(url_dict["ipv4_addresses"])

        # Insecure HTTP, HTTPS redirect / HSTS, and TLS checks.
        url_dict['insecure_http'] = site.test_insecure_http()
        redirect_and_hsts = site.test_redirect_to_https_and_hsts()
        url_dict['redirect_to_https'] = redirect_and_hsts['https']
        url_dict['hsts'] = redirect_and_hsts['hsts']
        url_dict['tls_versions'] = site.test_tls()

        url_dict['root_ca'] = site.get_root_ca()

        # Each thread writes only its own key, and __init__ guarantees that
        # no two threads are given the same domain.
        self.output_dict[url] = url_dict