-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbooktoki.py
166 lines (127 loc) · 4.75 KB
/
booktoki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
이 스크립트는 더 이상 기능하지 않습니다.
https://github.com/STR-HK/hdl-stubs/ 에서 최신 버전의 스크립트를 다운로드하세요.
This script is no longer functional.
Please download the latest version of the script from https://github.com/STR-HK/hdl-stubs/
このスクリプトはもう機能しません。
最新バージョンのスクリプトを https://github.com/STR-HK/hdl-stubs/ からダウンロードしてください。
此脚本已不再可用。
请从 https://github.com/STR-HK/hdl-stubs/ 下载最新版本的脚本。
Este script ya no funciona.
Por favor, descarga la última versión del script desde https://github.com/STR-HK/hdl-stubs/.
Ce script n'est plus fonctionnel.
Veuillez télécharger la dernière version du script depuis https://github.com/STR-HK/hdl-stubs/.
Этот скрипт больше не работает.
Пожалуйста, скачайте последнюю версию скрипта с https://github.com/STR-HK/hdl-stubs/.
"""
# coding: utf8
# title: Add Booktoki.com
# author: github.com/STR-HK/hdl-stubs
# comment: Created at 12022/09/17
from fileinput import filename
from io import BytesIO
import re
import requests
from utils import Soup, LazyUrl, Downloader, Session, clean_title
import clf2
import json
class Image(object):
    """A single image to download, plus the file name to save it under.

    Attributes set by ``__init__``:
        filename: ``name`` followed by the extension chosen for ``src``.
        url: a ``LazyUrl`` built from ``src``, a callable that returns
            ``src`` unchanged, and this instance.
    """

    # Suffixes that are kept as-is; anything else is saved as ".jpg".
    _KNOWN_EXTS = ("jpg", "jpeg", "bmp", "png", "gif", "webm", "webp")

    def __init__(self, src, name):
        """Build the file name and lazy URL for the image at *src*.

        Args:
            src: URL of the image.
            name: File name without extension.
        """
        self.filename = f"{name}{self._extension_of(src)}"
        self.url = LazyUrl(src, lambda _: src, self)

    @staticmethod
    def _extension_of(src):
        """Return the extension (with leading dot) to use for *src*.

        The query string and fragment are dropped before the suffix is
        read, so ``.../a.png?v=1`` yields ``.png`` instead of falling back
        to ``.jpg``. The original letter case of a recognised suffix is
        preserved; unrecognised or missing suffixes give ``.jpg``.
        """
        path = src.split("#", 1)[0].split("?", 1)[0]
        suffix = path.rsplit(".", 1)[-1]
        if suffix.lower() not in Image._KNOWN_EXTS:
            return ".jpg"
        return f".{suffix}"
@Downloader.register
class Downloader_Booktoki(Downloader):
    """Downloader plugin for Booktoki novel pages.

    ``read`` queues the cover image and one in-memory ``.txt`` buffer per
    page listed on the novel's index page.
    """

    # Plugin settings read by the host's Downloader machinery.
    type = "booktoki"
    URLS = [r"regex:booktoki[0-9]*\.com"]
    MAX_CORE = 4
    icon = "https://booktoki153.com/img/book/favicon-32x32.png"

    def read(self):
        """Fill ``self.urls`` / ``self.filenames`` for the novel at ``self.url``.

        Sets ``self.artist`` and ``self.title``, appends the cover image's
        lazy URL to ``self.urls``, then for every page returned by
        ``get_pages_list`` appends a ``BytesIO`` holding the UTF-8 encoded
        page text and records its file name in ``self.filenames``.
        """
        soup = get_soup(self.url)
        artist = get_artist(soup)
        title = f"[{artist}] {get_title(soup)}"
        self.artist = artist

        src = soup.find("div", class_="view-img").find("img")["src"]
        img = Image(src, "cover")
        self.urls.append(img.url)

        pages = get_pages_list(soup)
        self.print_(pages)
        total = len(pages)

        # NOTE(review): `try_n` is not imported in the visible part of this
        # file; presumably the host environment provides it - confirm.
        for n, page in enumerate(pages, 1):
            # The title doubles as a progress indicator while pages are read.
            self.title = f"{title} ({n}/{total})"
            self.print_(f"Reading: {n}/{total}")
            pagesoup = get_soup(page)

            @try_n(4)
            def ctt_foo():
                # Parse the already-fetched page first; if that fails,
                # fetch the page again and parse the fresh copy.
                try:
                    return get_content(pagesoup)
                except Exception:
                    return get_content(get_soup(page))

            content = ctt_foo()

            f = BytesIO()
            # NOTE(review): this turns every space in the text into a
            # newline; the character being replaced may originally have
            # been a different whitespace character - confirm.
            f.write(content.replace(" ", "\n").encode("UTF-8"))
            f.seek(0)  # rewind the buffer to its start

            @try_n(4)
            def fne_foo():
                # Same fallback as above, for the page's file name.
                try:
                    return f"{get_page_title(pagesoup)}.txt"
                except Exception:
                    return f"{get_page_title(get_soup(page))}.txt"

            self.filenames[f] = fne_foo()
            self.urls.append(f)

        # Drop the progress suffix once every page has been read.
        self.title = title
def get_soup(url):
    """Fetch *url* via ``clf2.solve`` and parse its ``"html"`` entry with ``Soup``."""
    solved = clf2.solve(url)
    return Soup(solved["html"])
def get_pages_list(soup):
    """Return the page links of a novel index page, in reverse document order.

    Looks up ``ul.list-body``, takes the ``href`` of the first ``<a>`` in
    each ``li.list-item``, and returns those hrefs as a list reversed
    relative to how they appear in the markup.
    """
    entries = soup.find("ul", class_="list-body").find_all("li", class_="list-item")
    hrefs = [entry.find("a")["href"] for entry in entries]
    return hrefs[::-1]
def get_info_list(soup):
    """Extract the title, detail fields and summary from a novel's info box.

    Reads the ``div.view-content`` elements inside ``div.col-sm-8``:
    the first gives the title, the second is split on non-breaking
    spaces (``\\xa0``) into detail fields, the third gives the summary.
    All text is stripped of surrounding whitespace.

    Returns:
        ``[title, details, summary]`` where ``details`` is a list of
        strings (expected to be ``[platform, tags, artist]``).
    """
    blocks = soup.find("div", class_="col-sm-8").find_all("div", class_="view-content")
    details = [field.strip() for field in blocks[1].get_text().split("\xa0")]
    title = blocks[0].get_text().strip()
    summary = blocks[2].get_text().strip()
    return [title, details, summary]
def get_content(soup):
    """Return the text of a page's ``div#novel_content`` element.

    When the element contains ``<p>`` tags, each paragraph's text is
    emitted followed by a newline; an empty paragraph contributes an
    extra newline (i.e. a blank line). When there are no ``<p>`` tags the
    element's whole text is returned unchanged.
    """
    novel = soup.find("div", {"id": "novel_content"})
    paragraphs = novel.find_all("p")
    if not paragraphs:
        return novel.get_text()
    # `or "\n"` substitutes a newline for paragraphs whose text is empty.
    return "".join((p.get_text() or "\n") + "\n" for p in paragraphs)
def get_title(soup):
    """Return the title, i.e. item 0 of ``get_info_list(soup)``."""
    info = get_info_list(soup)
    return info[0]
def get_artist(soup):
    """Return the artist, i.e. the third detail field of ``get_info_list(soup)``."""
    details = get_info_list(soup)[1]
    return details[2]
def get_page_title(soup):
    """Return a cleaned title for a single page.

    Takes the text of ``div.toon-title``, removes every occurrence of the
    text of its first ``<span>``, strips surrounding whitespace and passes
    the result through ``clean_title``.
    """
    heading = soup.find("div", class_="toon-title")
    span_text = heading.find("span").get_text()
    remainder = heading.get_text().replace(span_text, "")
    return clean_title(remainder.strip())
# Emitted when this module is executed.
# NOTE(review): `log` is not imported in the visible file; presumably the
# host environment provides it - confirm.
log("Site Added: Booktoki")