copy_webpage.py
#!/usr/bin/python
import sys
import os
import urllib2
import urlparse
import codecs
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
from resource_acquisition_manager import *
from page_resource_copier import *
url_list = []


def recursive_webpage_cursor(url_cursor, file_path, root_replacement, file_depth, root_directory):
    """Clone the page at url_cursor into file_path, rewriting its links to be
    depth-relative, then recurse into every internal hyper-link that has not
    been cloned yet."""
    #Record that the present web-page is being tracked
    url_list.append(url_cursor)
    #Acquire http response from url
    request = urllib2.Request(url_cursor)
    url_components = urlparse.urlparse(url_cursor)
    file_name = url_components.path.split('/')[-1]
    root_path = url_components.netloc
    try:
        response = urllib2.urlopen(request)
        html = response.read()
    except urllib2.URLError as e:
        raise Exception("%s returned an error: %s" % (url_cursor, e))
    url_components = urlparse.urlparse(url_components.path)
    current_directory = check_create_directory(url_components.path, file_path, root_directory)
    #Add this web-page to the current directory unless it already exists
    if file_name == '':
        file_loc = os.path.join(current_directory, 'index.html')
    elif file_name.endswith('.html'):
        file_loc = os.path.join(current_directory, file_name)
    else:
        file_loc = os.path.join(current_directory, file_name + '.html')
    modified_soup = BeautifulSoup(html, from_encoding='utf8')
    #Modify links in file so that they function as relative links in the final
    #output html
    for link in modified_soup.find_all('a'):
        original_link = link['href']
        #Make sure that all absolute path files and the index file use the right
        #web address
        if 'http://' + root_path + '/' == link['href']:
            depth_relative_link_slashes = '../' * file_depth
            updated_link = depth_relative_link_slashes + 'index.html'
        else:
            #Make sure that all non-index files point to html files
            if not link['href'].endswith('.html') and '#' not in link['href']:
                updated_link = link['href'] + '.html'
            else:
                updated_link = link['href']
            #Snip the first forward slash off of relative hyperlinks
            if updated_link.startswith('/'):
                depth_relative_link_slashes = '../' * file_depth
                updated_link = depth_relative_link_slashes + updated_link[1:]
        link['href'] = updated_link
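    #For example, with file_depth = 2 an href of '/docs/intro' is rewritten to
    #'../../docs/intro.html', and an href equal to 'http://' + root_path + '/'
    #is rewritten to '../../index.html'.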
    #Find all css file locations
    for css_link in modified_soup.find_all('link'):
        depth_relative_link_slashes = '../' * file_depth
        updated_link = depth_relative_link_slashes + css_link['href'][1:]
        css_link['href'] = updated_link
    #Acquire all image files
    modified_soup = get_page_resources(file_depth, modified_soup, file_path, url_cursor, root_directory)
    #Check to make sure we're not modifying a link that's already inside
    #of the cloned url list; if so, add this link, too.
    if original_link not in url_list:
        url_list.append(updated_link)
    html_file = None
    try:
        html_file = codecs.open(file_loc, 'w', 'utf8')
        output_string = modified_soup.prettify('ascii')
        html_file.write(output_string)
    except IOError as e:
        print 'IO Write Error: %s' % e
        sys.exit(1)
    finally:
        if html_file is not None:
            html_file.close()
    #Acquire CSS Files
    acquire_css_files(html, modified_soup, url_cursor, file_path, root_directory)
    #Use beautiful soup to iterate over all hyper-links and resources recursively
    soup = BeautifulSoup(html)
    #Report results
    print("%s cloned..." % url_cursor)
    #Find all the hyper-links on this page
    hyperlinks = soup.find_all('a')
    #Drop anchors that carry no href attribute at all
    hyperlinks = filter(lambda link: link.has_attr('href'), hyperlinks)
    #Filter out all hyper-links which are mailto links
    hyperlinks = filter(lambda link: 'mailto:' not in link['href'], hyperlinks)
    #Filter out bookmark hyper-links
    hyperlinks = filter(lambda link: '#' not in link['href'], hyperlinks)
    #Filter out external hyper-links (e.g. don't copy all of google.com)
    hyperlinks = filter(lambda link: not link['href'].startswith('http://') or link['href'].startswith('http://' + root_path), hyperlinks)
    #Recursively travel through each of these hyper-links, repeating this
    #process on pages that haven't already been cloned.
    for link in hyperlinks:
        next_page = urlparse.urljoin(url_cursor, link['href'])
        if next_page not in url_list:
            recursive_webpage_cursor(next_page, file_path, root_replacement, next_page.count('/') - 3, root_directory)
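

#Hypothetical usage sketch, not part of the original module: one way the
#recursive cloner above might be driven from the command line. The argument
#layout (a start URL plus an output directory) and the choice of the site's
#host name as the root directory are assumptions based only on the parameters
#of recursive_webpage_cursor, not on the project's actual entry point.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'usage: copy_webpage.py <start_url> <output_path>'
        sys.exit(1)
    start_url = sys.argv[1]
    output_path = sys.argv[2]
    #Start at depth 0; deeper pages derive their own depth from their URL,
    #mirroring the next_page.count('/') - 3 calculation in the recursive calls.
    start_host = urlparse.urlparse(start_url).netloc
    recursive_webpage_cursor(start_url, output_path, '', 0, start_host)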