-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebScraper.py
78 lines (58 loc) · 2.61 KB
/
webScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Web Crawler
import os
import urllib

import mechanize
def _extract_image_urls(html):
    """Return the full-size image URLs referenced by a wallhaven results page.

    Each result is located by its ``figure id="thumb-`` marker; the text
    between the following ``th-`` and the next double quote is taken as the
    unique part of the image file name and appended to the full-size prefix.

    Args:
        html: Markup of the search results page, as a string.

    Returns:
        A list of image URLs in page order (empty when nothing matches).
    """
    # Standard format of how the image file is stored on the wallhaven
    # website; only the unique image name + extension is missing at the end.
    standard = 'http://wallpapers.wallhaven.cc/wallpapers/full/wallhaven-'
    figure_marker = 'figure id="thumb-'
    thumb_marker = 'th-'

    images = []
    # Scan with a moving offset instead of re-slicing the page on every hit,
    # which copied the remaining markup each iteration.
    position = html.find(figure_marker)
    while position != -1:
        begin = html.find(thumb_marker, position)
        if begin == -1:
            # No thumbnail reference after this figure: nothing more to parse.
            break
        begin += len(thumb_marker)
        end = html.find('"', begin)
        if end == -1:
            # Truncated markup: the attribute is never closed.
            break
        images.append(standard + html[begin:end])
        position = html.find(figure_marker, end)
    return images


def get_wallpapers(url, directory):
    """Download every wallpaper listed on a wallhaven search results page.

    The page at ``url`` is fetched with a mechanize browser, the image URLs
    are extracted from its markup and printed, and each image is then saved
    as ``wallbase_img_<n>.jpg`` (numbered from 1) inside ``directory``.

    Args:
        url: Address of the search results page to scrape.
        directory: Folder in which the downloaded images are stored. A
            trailing path separator is optional.

    Returns:
        None. Progress is reported on standard output.
    """
    browser = mechanize.Browser()
    browser.set_handle_equiv(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(True)
    browser.addheaders = [('user-agent', ' Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
    try:
        # Create string of html code of requested webpage
        html = browser.open(url).read()
    finally:
        # Release the browser even when the request or the read fails.
        browser.close()

    images = _extract_image_urls(html)

    # Print image file names to verify correct parsing
    for image in images:
        print(image)

    print('Downloading images..')
    for count, pic in enumerate(images, 1):
        print("Image " + str(count))
        # os.path.join supplies the separator when `directory` lacks a
        # trailing one, so files land inside the folder, not next to it.
        target = os.path.join(directory, 'wallbase_img_' + str(count) + '.jpg')
        # Retrieve the current pic and store in specified local location
        urllib.urlretrieve(pic, target)
    print('Download complete.')
def main():
    """Prompt for a target directory and a search term, then download.

    The search term is URL-quoted and appended to the wallhaven search
    address; the resulting URL is printed and handed to ``get_wallpapers``
    together with the directory entered by the user.
    """
    base_url = 'http://alpha.wallhaven.cc/search?q='
    # Strip stray whitespace typed around the answers so it does not end up
    # in the file path or in the query string.
    user_direc = raw_input('Enter the directory path to where you would like to store your images: ').strip()
    user_search = raw_input("Enter the type of wallpaper you're searching for (ex. 'batman'): ").strip()
    user_search = urllib.quote(user_search)
    url = base_url + user_search
    # Call form, matching the print(...) usage in the rest of the file.
    print(url)
    get_wallpapers(url, user_direc)


if __name__ == "__main__":
    main()