scrape.py
#!/usr/bin/env python3
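"""Scrape Wattpad stories into EPUB files.

Usage (a sketch inferred from main() below; not documented in the original):
    ./scrape.py STORY_URL [STORY_URL ...]
    cat urls.txt | ./scrape.py   # URLs may also be piped in, one per line
"""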
import sys
import io
import re

import requests
import dateutil.parser
from genshi.input import HTML
import smartypants

import ez_epub

# Set up the session so we don't hit the Android download app page
session = requests.session()
# No user agent. Wattpad now blocks all user agents containing "Python".
session.headers['User-Agent'] = ''

# Used by the Android app normally
# Example parameters are what Android provides
API_STORYINFO = 'https://www.wattpad.com/api/v3/stories/'  # 9876543?drafts=0&include_deleted=1

# Used by the website and the Android app normally
API_STORYTEXT = 'https://www.wattpad.com/apiv2/storytext'  # ?id=23456789
# The webpage uses a page parameter: ?id=23456789&page=1
# Android uses these parameters: ?id=23456789&increment_read_count=1&include_paragraph_id=1&output=text_zip
# As of 2015-06-15, this returns HTML instead of JSON; output=json will get JSON again
API_CHAPTERINFO = 'https://www.wattpad.com/apiv2/info'  # ?id=23456789

# Documented API
API_GETCATEGORIES = 'https://www.wattpad.com/apiv2/getcategories'

ILLEGAL_FILENAME_CHARACTERS = str.maketrans(r'.<>:"/\|?*^', '-----------')
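# Example: 'Story: a/b?'.translate(ILLEGAL_FILENAME_CHARACTERS) -> 'Story- a-b-'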

# Fix up the categories data; this could probably be cached too
categories = session.get(API_GETCATEGORIES).json()
categories = {int(k): v for k, v in categories.items()}
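
# A minimal caching sketch for the comment above (hypothetical, not part of the
# original script): persist the category map to a local JSON file so repeated
# runs skip the network round trip. CATEGORY_CACHE is an assumed filename.
#
#   import json, os
#   CATEGORY_CACHE = 'categories.json'
#   if os.path.exists(CATEGORY_CACHE):
#       with open(CATEGORY_CACHE) as f:
#           raw = json.load(f)
#   else:
#       raw = session.get(API_GETCATEGORIES).json()
#       with open(CATEGORY_CACHE, 'w') as f:
#           json.dump(raw, f)
#   categories = {int(k): v for k, v in raw.items()}  # JSON keys are strings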


def download_story(story_id):
    # TODO: probably use {'drafts': 0, 'include_deleted': 0}
    storyinfo = session.get(API_STORYINFO + story_id, params={'drafts': 1, 'include_deleted': 1}).json()

    story_title = storyinfo['title']
    story_description = storyinfo['description']
    story_createDate = dateutil.parser.parse(storyinfo['createDate'])
    story_modifyDate = dateutil.parser.parse(storyinfo['modifyDate'])
    story_author = storyinfo['user']['name']
    story_categories = [categories[c] for c in storyinfo['categories'] if c in categories]  # category can be 0
    story_rating = storyinfo['rating']  # TODO: I think 4 is adult?
    story_cover = io.BytesIO(session.get(storyinfo['cover']).content)
    story_url = storyinfo['url']

    print('Story "{story_title}": {story_id}'.format(story_title=story_title, story_id=story_id))

    # Set up the epub
    book = ez_epub.Book()
    book.title = story_title
    book.authors = [story_author]
    book.sections = []
    book.impl.addCover(fileobj=story_cover)
    book.impl.description = HTML(story_description, encoding='utf-8')  # TODO: not sure if this is HTML or text
    book.impl.url = story_url
    book.impl.addMeta('publisher', 'Wattpad - scraped')
    book.impl.addMeta('source', story_url)

    for part in storyinfo['parts']:
        chapter_title = part['title']
        chapter_id = part['id']

        if part['draft']:
            print('Skipping "{chapter_title}": {chapter_id}, part is draft'.format(chapter_title=chapter_title, chapter_id=chapter_id))
            continue

        if 'deleted' in part and part['deleted']:
            print('Skipping "{chapter_title}": {chapter_id}, part is deleted'.format(chapter_title=chapter_title, chapter_id=chapter_id))
            continue

        # TODO: could intelligently only redownload modified parts
        chapter_modifyDate = dateutil.parser.parse(part['modifyDate'])

        print('Downloading "{chapter_title}": {chapter_id}'.format(chapter_title=chapter_title, chapter_id=chapter_id))
        chapter_html = session.get(API_STORYTEXT, params={'id': chapter_id, 'output': 'json'}).json()['text']
        chapter_html = smartypants.smartypants(chapter_html)

        section = ez_epub.Section()
        section.html = HTML(chapter_html, encoding='utf-8')
        section.title = chapter_title
        book.sections.append(section)

    print('Saving epub')
    book.make('./{title}'.format(title=book.title.translate(ILLEGAL_FILENAME_CHARACTERS)))
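

# Example (hypothetical id, reusing the placeholder from the API_STORYINFO
# comment above): download_story('9876543') would fetch the story metadata,
# download each non-draft, non-deleted part, and write the EPUB to the current
# directory under the sanitized story title.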


def get_story_id(url):
    # Extract the id number from the url
    match = re.search(r'\d+', url)
    if not match:
        return None

    # Check if it's a valid id of a story
    url_id = match.group()
    storyinfo_req = session.get(API_STORYINFO + url_id)
    if storyinfo_req.ok:
        return url_id

    # If not, check if it's a chapter id and retrieve the story id
    chapterinfo_req = session.get(API_CHAPTERINFO, params={'id': url_id})
    if not chapterinfo_req.ok:
        return None

    story_url = chapterinfo_req.json()['url']
    story_id = re.search(r'\d+', story_url).group()
    return story_id
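
# Examples (hypothetical ids, reusing the placeholders from the API comments
# above; note re.search(r'\d+', ...) takes the first run of digits in the URL):
#   get_story_id('https://www.wattpad.com/story/9876543-some-title')  # -> '9876543'
#   get_story_id('https://www.wattpad.com/23456789-some-chapter')     # -> the parent story's id
#   get_story_id('https://www.wattpad.com/')                          # -> None (no digits)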


def main():
    if sys.argv[1:]:
        story_urls = sys.argv[1:]
    else:
        story_urls = sys.stdin

    for story_url in story_urls:
        story_id = get_story_id(story_url)
        if story_id:
            download_story(story_id)
        else:
            print('ERROR: could not retrieve story', story_url)


if __name__ == '__main__':
    main()