-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimagedownloader.py
executable file
·65 lines (56 loc) · 1.91 KB
/
imagedownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
"""This is a concurrent downloader for a list of image urls. Don't go pointing
this at random sites; it doesn't respect robots.txt and it is pretty brutal
about how quickly it fetches pages.
The code for this is very short; this is perhaps a good indication
that this is making the most effective use of the primitives at hand.
The fetch function does all the work of making http requests,
saving the responses to disk, and picking up the next pending url. The
GreenPool acts as sort of a job coordinator (and concurrency controller of
course)
"""
from __future__ import with_statement
from eventlet.green import urllib2
import eventlet
import sys, os, hashlib, time
import layout
# Sent as the User-Agent header on every request made by fetch().
user_agent = 'Wikipedia 1.0 Bot'

# Capacity of the green thread pool; fetchall() starts this many fetches.
pool_size = 20
pool = eventlet.GreenPool(size=pool_size)
class TimeoutException(Exception):
    """Raised by the eventlet timeout that guards a download in fetch()."""
def _download(url, fn):
    """Fetch *url* and save the response body to the file *fn*.

    The request and the body read are limited to 5 seconds in total.
    Network and file errors are reported on stdout and otherwise ignored,
    so one bad URL does not stop the worker that is processing it.

    Raises:
        TimeoutException: the request or read did not finish in 5 seconds.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', user_agent)
    try:
        with eventlet.Timeout(5, TimeoutException):
            res = urllib2.build_opener().open(request)
            try:
                data = res.read()
            finally:
                res.close()
        # Write only after the whole body has arrived, so a failed or
        # timed-out request never leaves an empty or partial file behind
        # (main() skips any URL whose target file already exists).
        # Binary mode: the downloads are expected to be images, not text.
        with open(fn, 'wb') as out:
            out.write(data)
    except (urllib2.URLError, IOError) as e:
        # URLError also covers HTTPError (its subclass).
        print('%s %s' % (e, url))


def fetch(url_fn, urlfns):
    """Download one URL, then keep draining the shared list of pending jobs.

    Args:
        url_fn: ``(url, fn)`` pair -- the address to fetch and the path the
            response body is saved to.
        urlfns: list of pending ``(url, fn)`` pairs shared by all workers.
            Entries are popped from the end whenever this worker is free.

    A download that times out is retried immediately; there is no retry
    limit. Returns once the job in hand is finished and *urlfns* is empty.
    """
    url, fn = url_fn
    while True:
        print('pool %s %s' % (pool.running(), pool.waiting()))
        print('%s fetching %s' % (len(urlfns), url))
        try:
            _download(url, fn)
        except TimeoutException:
            # retry the same job
            print('timeout %s' % url)
            continue
        if not urlfns:
            return
        # Take the next job in this same greenthread rather than spawning a
        # new one per URL: looping keeps the worker count fixed and the call
        # stack flat no matter how long the list is.
        url, fn = urlfns.pop()
def fetchall(urlfns):
    """Download every pending ``(url, fn)`` pair and wait until all are done.

    Args:
        urlfns: list of ``(url, fn)`` pairs. The list is consumed: entries
            are popped here and by the fetches this function starts.

    Starts at most ``pool_size`` fetches, and fewer when the list is shorter
    than that (an empty list starts none), then blocks on the pool.
    """
    for _ in range(pool_size):
        # Popping from an exhausted list would raise IndexError.
        if not urlfns:
            break
        pool.spawn_n(fetch, urlfns.pop(), urlfns)
    pool.waitall()
def main():
    """Command-line entry point: ``imagedownloader.py URLS_FILE IMAGES_DIR``.

    Reads one URL per line from URLS_FILE, maps each URL to a file name
    under IMAGES_DIR with ``layout.ext_img_url2fn``, and downloads every
    URL whose target file does not exist yet. Blank lines are ignored.

    Exits with a usage message when the two arguments are not supplied.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: %s URLS_FILE IMAGES_DIR' % sys.argv[0])
    urls_fn, images_dir = sys.argv[1], sys.argv[2]

    urlfns = []
    with open(urls_fn) as urls_file:
        for line in urls_file:
            url = line.strip()
            if not url:
                # A blank line would otherwise be queued as an empty URL.
                continue
            fn = os.path.join(images_dir, layout.ext_img_url2fn(url))
            if not os.path.exists(fn):
                urlfns.append((url, fn))
    fetchall(urlfns)
# Start the download only when run as a script, not when imported.
if __name__ == "__main__":
    main()