# archiver.py
from scraper import get_chapter_list, get_search_ids
from scraper import get_search_page_interactive_ids, all_search_urls
from aggregator import get_chapters, get_story_info_safe
from htmlformat import formatChapter, formatIntro, formatOutline
import os
import re

def barf(filepath, text):
    # Write text to filepath as UTF-8, overwriting any existing file.
    with open(filepath, 'w', encoding='utf-8') as o:
        o.write(text)

def is_item_id(string):
    # True when string looks like an item directory name: '<digits>-<slug>'.
    # re.match returns None when there is no match.
    return re.match(r'\d+-.+', string) is not None
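# Illustrative checks (the item_id below is a hypothetical placeholder; real
# ids come from writing.com URLs):
#   is_item_id('2181910-The-Wandering-Stars')  -> True
#   is_item_id('templates')                    -> False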

''' Returns a list of descent strings [str1, str2, ..., strN] such that the local
archive is missing those chapters from the remote story.
This is not strictly the list of missing chapters: each missing chapter's predecessor
is also returned, because that chapter's choice links have to be updated. To accomplish
that, the naive approach taken here is to delete and redownload every chapter that has
newly existing choices. '''
def get_missing_chapters(canon_descents, directory, story_id):
    canon_chapters = [x + ".html" for x in canon_descents]
    existing_chapters = set(os.listdir(directory)) - {'intro.html', 'outline.html'}
    missing_chapters_all = list(set(canon_chapters) - existing_chapters)
    # Also collect the chapter preceding each missing chapter, so that its choice
    # links get regenerated (naively, by deleting it and redownloading it).
    missing_chapters_connections = set(missing_chapters_all)
    for ch in missing_chapters_all:
        # A descent is a string of choice digits, so the predecessor's filename
        # is obtained by dropping the final digit before the '.html' suffix.
        c = ch[:-6] + '.html'
        if c != '.html':
            try:
                os.remove(os.path.join(directory, c))
            except FileNotFoundError:
                pass
            missing_chapters_connections.add(c)
    if len(canon_chapters) == len(missing_chapters_all):
        print('# {}: Downloading for the first time.'.format(story_id))
    else:
        print('# {}: Updating with {} chapters.'.format(story_id, len(missing_chapters_connections)))
    return [x[:-5] for x in missing_chapters_connections]
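# Illustrative walkthrough (hypothetical descents; assumes single-digit choice
# numbers, as the predecessor computation above does):
#   canon_descents = ['1', '11', '12', '121'], and only '121.html' is absent
#   locally. The function returns ['121', '12'] (in arbitrary order): '12.html'
#   is deleted and queued for redownload so that its rendered choice list links
#   to the newly added chapter '121'.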

''' Returns the item_ids of everything currently present in ./archive/. '''
def get_existing_archives():
    return filter(is_item_id, os.listdir('archive'))

''' Archives the story designated by its id (the 'item_id' in writing.com URLs) and
downloads its missing chapters.
Returns a map {descent : str -> error : Exception} describing any errors encountered
while downloading chapters. '''
def archive(story_id):
    # Directory layout. Hard-coded because the stylesheet in
    # ./templates/style.css is referenced with a relative path.
    archive_dir = "archive"
    story_root = archive_dir + "/" + story_id + "/"
    if not os.path.exists(story_root):
        os.makedirs(story_root)
    print('# {}: Gathering info.'.format(story_id))
    # Basic info
    info = get_story_info_safe(story_id)
    barf(story_root + "intro.html", formatIntro(info))
    # Outline
    canon_descents, canon_names = get_chapter_list(story_id)
    barf(story_root + "outline.html", formatOutline(info.pretty_title, canon_descents, canon_names))
    # Download the missing chapters and write each one out as HTML.
    error_chapters = {}
    missing_chapters = get_missing_chapters(canon_descents, story_root, story_id)
    for descent, chapter in get_chapters(story_id, missing_chapters, threads_per_batch=10):
        if isinstance(chapter, Exception):
            error_chapters[descent] = chapter
        else:
            barf(story_root + descent + ".html", formatChapter(chapter, descent, canon_descents))
    if len(error_chapters) > 0:
        print('# {}: Finished with {} errors. Try again; if the problem persists, contact the developer.'.format(story_id, len(error_chapters)))
    else:
        print('# {}: Finished!'.format(story_id))
    return error_chapters
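# The returned error map is keyed by descent string; illustratively (the exact
# exception types depend on what get_chapters yields on failure):
#   {'121': <TimeoutError>, '2': <ConnectionError>}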

''' Calls archive on every interactive listed in this search page and all subsequent
pages. Stories are not archived in parallel, because that would overload the server.
NOTE: this only works when given a URL in which the page number is specified as a
query parameter; normally the page is specified via POST. To obtain a usable URL, go
to a search page, click the magnifying-glass icon above the result list, and use the
resulting URL.
Returns a mapping of item_id to the error-chapters map described in the output of
archive. '''
def archive_search(search_url):
    if search_url.find('&page=') < 0:
        raise ValueError('Incorrect URL given to archive_search. Please see the note in its source code.')
    print('# Gathering item_ids of the search...')
    ids = list(get_search_ids(search_url))
    error_chapters = {}
    for idx, id in enumerate(ids):
        print('### Archiving item_id {}/{}'.format(idx + 1, len(ids)))
        error_chapters[id] = archive(id)
    return error_chapters
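# Usage sketch (the query string here is a hypothetical placeholder; the only
# requirement checked above is that '&page=' appears in the URL):
#   errors_by_story = archive_search('https://www.writing.com/main/search?...&page=1')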

''' Updates every existing archive.
Returns a mapping of item_id to the error-chapters map described in the output of
archive. '''
def update_archive():
    ids = list(get_existing_archives())
    error_chapters = {}
    for i, id in enumerate(ids):
        print('### Updating archive {}/{}'.format(i + 1, len(ids)))
        error_chapters[id] = archive(id)
    return error_chapters
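
# Minimal usage sketch, assuming the repository layout this module expects
# (./archive/ and ./templates/ relative to the working directory). The item_id
# below is a hypothetical placeholder.
if __name__ == '__main__':
    errors = archive('2181910-The-Wandering-Stars')
    for descent, err in errors.items():
        print('Could not fetch chapter {}: {}'.format(descent, err))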