-
Notifications
You must be signed in to change notification settings - Fork 2
/
p1_tags.py
129 lines (105 loc) · 4.52 KB
/
p1_tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# 2020-7-2 17:37:34
import re
import string
import urllib
import math
import time
import random
import os
import urllib.request
from urllib.parse import quote
# ---- constants ----
delta = 2.5
# the threshold of a good book
threshold = (9.0 - delta) * math.log(1000)
# the number of books per page (except for the last one)
# NOTE(review): not referenced by the crawl loop in this file, which
# defines its own BooksPerPage -- confirm which value is intended.
BookPerPage = 25

# HTTP request headers sent with every page fetch (a desktop Chrome
# User-Agent string).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.122 Safari/537.36'
    ),
}
def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Characters outside ``string.printable`` (e.g. non-ASCII tag names in
    the URL) are percent-encoded before the request is sent. The request
    carries the module-level ``headers`` dictionary.

    Raises whatever ``urllib.request.urlopen`` raises when the request
    fails, and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    url_new = quote(url, safe=string.printable)
    req = urllib.request.Request(url=url_new, headers=headers)
    # The context manager closes the response even when read() or
    # decode() raises, so a failed page does not leak the connection.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('UTF-8')
def getBookInfo(html):
    """Extract book records from the HTML text of one listing page.

    Two patterns are applied to the whole page:

    * entries whose title link contains an extra ``<span>`` (presumably
      the subtitle) -- each match is a tuple of 10 captured groups;
    * entries without that ``<span>`` -- each match is a tuple of 9 groups.

    Returns a single list holding every match of the first pattern
    followed by every match of the second one (so the result is not in
    page order). An empty list means nothing matched.
    """
    pattern_with_span = re.compile(r'<a href="https://book.douban.com/subject/(.*)/" title="(.*)" \n (.*)>\n\n (.*)\n\n\n \n <span style="font-size:12px;"> : (.*) </span>\n\n </a>\n\n </h2>\n <div class="pub">\n \n \n (.*)\n\n </div>\n\n\n \n \n \n \n <div class="star clearfix">\n <span class="allstar(.*)"></span>\n <span class="rating_nums">(.*)</span>\n\n <span class="pl">\n (.*)人评价(.*)\n </span>\n </div>')
    pattern_plain = re.compile(r'<a href="https://book.douban.com/subject/(.*)/" title="(.*)" \n (.*)>\n\n (.*)\n\n\n \n\n </a>\n\n </h2>\n <div class="pub">\n \n \n (.*)\n\n </div>\n\n\n \n \n \n \n <div class="star clearfix">\n <span class="allstar(.*)"></span>\n <span class="rating_nums">(.*)</span>\n\n <span class="pl">\n (.*)人评价(.*)\n </span>\n </div>')

    books = pattern_with_span.findall(html)
    books.extend(pattern_plain.findall(html))
    return books
# most of the results are stored in this file
ResultFile = 'Books_tags'
# Start the combined result file afresh with its column header. UTF-8 is
# given explicitly because every later append to this file uses UTF-8.
with open(ResultFile, 'w', encoding='UTF-8') as FileHandle:
    FileHandle.write("ID, rating, votes, title, author, (translator), publisher, date, price\n")

# make a directory to store the results for each Tags separately
TagsDir = 'Tags'
# exist_ok avoids the check-then-create race of exists() followed by mkdir()
os.makedirs(TagsDir, exist_ok=True)

# import the Tags ID from a text file (comma-separated tag names)
TagsFile = 'Tags_unique'
with open(TagsFile, 'r', encoding='UTF-8') as FileHandle:
    sTag = FileHandle.read()  # set of tags
# Strip surrounding whitespace (e.g. the newline that ends the file) and
# drop empty entries (e.g. from a trailing comma); either would otherwise
# end up inside the request URL and the per-tag output file name.
sTag = [tag.strip() for tag in sTag.split(',') if tag.strip()]
# For a quick check of the code, sTag can be overridden here with a short
# hand-written list of tags.
nTag = len(sTag)
# Crawl every tag in sTag: fetch up to PageNumber listing pages per tag,
# extract the books on each page, and append one line per book to both the
# combined result file (ResultFile) and a per-tag file inside TagsDir.
iTag = 0
for iTag, cTag in enumerate(sTag, start=1):  # current tag
    print("\nProcessing Tags {} of {}.\nTag: {}".format(iTag, nTag, cTag))
    PageNumber = 50  # grab 50 pages, a tuning parameter
    BooksPerPage = 20  # 20 books in each page
    BookNumber = BooksPerPage * PageNumber  # not used below
    # Build the path from TagsDir instead of repeating the directory name.
    TagsFile = os.path.join(TagsDir, cTag)
    # Both output files stay open for the whole tag rather than being
    # reopened for every single book; the with-block closes (and flushes)
    # them even if an error interrupts the crawl.
    with open(TagsFile, 'w', encoding='UTF-8') as TagHandle, \
            open(ResultFile, 'a', encoding='UTF-8') as ResultHandle:
        TagHandle.write("Tag: {} \n\n".format(cTag))
        TagHandle.write("ID, rating, votes, title, author, (translator), publisher, date, price\n")
        # crawl the pages in a Tags
        for i in range(PageNumber):
            print("Page {}".format(i + 1))
            url = 'https://book.douban.com/tag/{}?start={}'.format(cTag, i * BooksPerPage)
            try:
                html = getHtml(url)
            except Exception as err:  # for broken links
                # Best effort: give up on this tag but keep crawling the
                # others. Unlike a bare except, Ctrl-C still stops the run.
                print("Failed to fetch {}: {}".format(url, err))
                break
            BookInfo = getBookInfo(html)
            if not BookInfo:
                # no books matched: assume we are past the last page
                break
            for tmp in BookInfo:
                # book information; the field positions depend on which
                # pattern in getBookInfo produced the tuple
                if len(tmp) == 9:
                    ID, title = tmp[0], tmp[1]
                    others, rating, votes = tmp[4], tmp[6], tmp[7]
                elif len(tmp) == 10:
                    ID, title = tmp[0], tmp[1]
                    others, rating, votes = tmp[5], tmp[7], tmp[8]
                else:
                    # unknown layout: skip it instead of silently reusing
                    # the previous book's values
                    continue
                if rating == '':
                    rating = '0.0'
                # drop the first character -- presumably the "(" that opens
                # the vote count; confirm against the page markup
                votes = votes[1:]
                others = others.replace(' /', ',')
                try:
                    ID = int(ID)
                    votes = int(votes)
                except ValueError:
                    # a non-numeric ID or vote count must not abort the
                    # whole crawl; report the record and move on
                    print("Skipping malformed record: {!r}".format(tmp))
                    continue
                record = "{:0>8d}, {}, {:0>6d}, {}, {}\n".format(ID, rating, votes, title, others)
                ResultHandle.write(record)
                TagHandle.write(record)