-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_image.py
152 lines (135 loc) · 4.63 KB
/
get_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import time
from lxml import html
import re
import sys
def get_response(url,referer = "https://girl-atlas.com/album/58d15fcc92d302622dc57a80",count = 1,timeout = 30):
    """GET *url* with browser-like headers, retrying on any exception.

    *count* tracks the attempt number across recursive retries; on the
    6th entry (after 5 real requests) the function gives up.
    Returns the requests.Response, or None when the retry budget is spent.
    NOTE: the original default referer contained a stray trailing "'"
    inside the URL string; it is removed here.
    """
    # Fill in the request headers for this attempt.
    print("try %s url:%s" % (str(count), url))
    count += 1
    if count > 6:
        return None
    headers = {
        # 'Host': "www.girls-altas.com",
        # "Content-type": "image/jpeg",
        # "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "referer": referer,
    }
    try:
        return requests.get(url, headers = headers, timeout = timeout)
    except Exception as e:  # was "except Exception, e" -- Py2-only syntax
        print('str(Exception):\t%s' % str(Exception))
        print('str(e):\t\t%s' % str(e))
        print("*" * 100)
        return get_response(url, referer = referer, count = count, timeout = timeout)
def getAvUrl(str_class,url):
    """Turn a site-relative link of the form /<str_class>/... into an
    absolute girl-atlas URL; return None for links that do not match."""
    base_url = 'https://girl-atlas.com'
    matched = re.match(r"^/%s/.*" % str_class, url)
    if matched is None:
        return None
    return base_url + matched.group(0)
def getEncode(title):
    """Return the UTF-8 byte encoding of *title* (used as a map() callback
    to normalize scraped album titles)."""
    encoded = title.encode("utf-8")
    return encoded
def get_tag_urls(url = 'https://girl-atlas.com/'):
    """Collect all unique absolute /tag/... links found on *url*.

    Returns a de-duplicated list of tag URLs (order unspecified because of
    the set pass), or None when the page could not be fetched.
    """
    response = get_response(url)
    if response is None:
        return None
    parsed_body = html.fromstring(response.text)
    hrefs = parsed_body.xpath('//a/@href')
    # Replaces the original filler list + map(getAvUrl, ...) pairing with a
    # single comprehension: keep only links of the form /tag/..., absolutized.
    tags = [getAvUrl("tag", href) for href in hrefs]
    tags = [item for item in tags if item is not None]
    return list(set(tags))
def get_album_urls(url = 'https://girl-atlas.com/', first_page = 221, last_page = 256):
    """Build the paginated album-listing URLs ?p=first_page .. ?p=last_page.

    The site has 256 listing pages in total; the default window resumes at
    page 221 (presumably where a previous run stopped -- the bound was
    hard-coded before). Pass first_page/last_page to cover a different range.
    """
    return [url + "?p=%s" % str(i) for i in range(first_page, last_page + 1)]
def get_page_urls(urls):
    """Collect unique absolute /album/... links from every listing page in *urls*.

    BUG FIX: the original returned None as soon as one page failed to
    fetch, discarding every album URL already collected; unreachable
    pages are now skipped instead.
    """
    album = []
    for url in urls:
        print(url)
        response = get_response(url)
        if response is None:
            continue  # skip this page, keep what we already have
        parsed_body = html.fromstring(response.text)
        hrefs = parsed_body.xpath('//a/@href')
        found = [getAvUrl("album", href) for href in hrefs]
        album.extend(item for item in found if item is not None)
    return list(set(album))
# Fetch each girl album page, scrape its image URLs, and download them.
def get_girl_urls(page_urls):
    """For every album page in *page_urls*, scrape {title: image_url}
    pairs and download each image into ./data/<title>.jpg.

    Returns the list of per-page dicts, or None when *page_urls* is None.
    """
    if page_urls is None:
        return None
    girl_urls = []
    count = 0
    start_dir = './data/'
    for url in page_urls:
        print(url)
        response = get_response(url)
        if response is None:
            continue  # BUG FIX: original dereferenced response.text on a failed fetch
        parsed_body = html.fromstring(response.text)
        # Image sources may live in @src or in the lazy-load @delay attribute.
        girl = parsed_body.xpath('/html/body/div[2]/section/div/div[1]/div[1]/ul/li/img/@src|/html/body/div[2]/section/div/div[1]/div[1]/ul/li/img/@delay')
        title = parsed_body.xpath('/html/body/div[2]/section/div/div[1]/div[1]/ul/li/img/@title')
        title = [getEncode(t) for t in title]
        pic = dict(zip(title, girl))
        for k, v in pic.items():
            count += 1
            print("%s %s %s" % (count, k, v))
            if "/" in k:
                k = k.replace("/", "-")  # "/" in a title would act as a path separator
            # BUG FIX: download before opening the file, and skip on failure --
            # the original opened first and crashed on r.content (None), leaving
            # an empty file behind.
            r = get_response(v)
            if r is None:
                continue
            with open(start_dir + k + ".jpg", 'wb') as f:
                f.write(r.content)
        girl_urls.append(pic)
    return girl_urls
# Download every image referenced by the scraped {title: image_url} dicts.
def get_images(girl_list):
    """Download each image in *girl_list* (a list of {title: image_url}
    dicts) into the default folder ./data/<title>.jpg."""
    if girl_list is None:
        return None
    count = 0
    # Default destination folder for the images.
    start_dir = './data/'
    for pic in girl_list:
        for k, v in pic.items():
            count += 1
            print("%s %s %s" % (count, k, v))
            # CONSISTENCY FIX: sanitize "/" like get_girl_urls does, otherwise
            # the title is treated as a sub-directory and open() fails.
            if "/" in k:
                k = k.replace("/", "-")
            # BUG FIX: fetch before opening; the original opened the file first
            # and crashed on r.content when get_response returned None.
            r = get_response(v)
            if r is None:
                continue
            with open(start_dir + k + ".jpg", 'wb') as f:
                f.write(r.content)
if __name__ == '__main__':
    try:
        # Python 2 only: force UTF-8 as the default codec so the byte-string
        # titles from getEncode can be concatenated into file paths.
        reload(sys)
        sys.setdefaultencoding('utf8')
    except NameError:
        pass  # Python 3: reload() is gone and UTF-8 is already the default
    start_time = time.time()
    print(start_time)
    ## listing pages reached through album pagination
    album_urls = get_album_urls()
    print(album_urls)
    album_page_urls = get_page_urls(album_urls)
    ## listing pages reached through tag links
    tag_urls = get_tag_urls()
    print(tag_urls)
    tag_page_urls = get_page_urls(tag_urls)
    # BUG FIX: the album page_urls were overwritten by the tag result and
    # never scraped; crawl both sets (None-safe in case a fetch failed).
    page_urls = (album_page_urls or []) + (tag_page_urls or [])
    girl_urls = get_girl_urls(page_urls)
    stop_time = time.time()
    # BUG FIX: stop_time was captured but unused (a third time.time() call
    # was subtracted instead); also fixed the "elasped" typo in the message.
    elapsed_time = stop_time - start_time
    print("")
    print("elapsed %s seconds!!!!" % elapsed_time)