[AllInOne] booktoki_v3.py
# coding: utf8
# title: Add Booktoki.com
# author: github.com/STR-HK/hdl-stubs
# comment: Created at 2023/10/23
from io import BytesIO
from utils import Soup, LazyUrl, Downloader, clean_title, try_n, log  # try_n/log: assumed exported by utils
import clf2
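# clf2 appears to be the host app's Cloudflare-bypass helper: clf2.solve(url)
# returns the rendered page, so get_soup() below routes every fetch through it.
# (Inferred from usage in this file, not from a documented clf2 API.)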
class Image(object):
    def __init__(self, src, name):
        # Derive the extension from the URL; fall back to .jpg for unknown types
        ext = ".{}".format(src.split(".")[-1])
        if ext.lower()[1:] not in ["jpg", "jpeg", "bmp", "png", "gif", "webm", "webp"]:
            ext = ".jpg"
        self.filename = f"{name}{ext}"
        self.url = LazyUrl(src, lambda _: src, self)
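# @Downloader.register adds this class to the host application's downloader
# registry: the host matches input URLs against URLS and calls read(), which
# is expected to fill in self.urls, self.filenames, and self.title.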
@Downloader.register
class Downloader_Booktoki(Downloader):
    type = "booktoki"
    URLS = [r"regex:booktoki[0-9]*\.com"]
    MAX_CORE = 4
    icon = "https://manatoki.net/img/book/favicon-32x32.png"
    def read(self):
        soup = get_soup(self.url)
        artist = self.get_artist(soup)
        title = f"[{artist}] {self.get_title(soup)} 全"
        self.artist = artist

        # Cover image, if the series page has one
        cover_div = soup.find("div", class_="view-img")
        img_candidate = cover_div.find("img") if cover_div else None
        if img_candidate:
            src = img_candidate["src"]
            img = Image(src, "cover")
            self.urls.append(img.url)

        content_titles = []
        contents = []
        pages = get_pages_list(soup)
        self.print_(pages)
        for n, page in enumerate(pages):
            self.title = f"{title} ({n+1}/{len(pages)})"
            self.print_(f"Reading: {n+1}/{len(pages)}")
            pagesoup = get_soup(page)

            @try_n(4)
            def content_getter():
                try:
                    return get_content(pagesoup)
                except Exception:
                    # retry against a freshly fetched copy of the page
                    return get_content(get_soup(page))

            contents.append(content_getter().replace(" ", "\n"))

            @try_n(4)
            def title_getter():
                try:
                    return f"{n+1}화 | {get_page_title(pagesoup)}"
                except Exception:
                    return f"{n+1}화 | {get_page_title(get_soup(page))}"

            content_titles.append(title_getter())

        # Stitch every chapter into one text blob, closed with an end marker
        full_content = ""
        for chapter_title, content in zip(content_titles, contents):
            full_content += chapter_title + "\n\n"
            full_content += content + "\n\n\n"
        full_content += "終"

        f = BytesIO()
        f.write(full_content.encode("UTF-8"))
        f.seek(0)
        self.filenames[f] = title + ".txt"
        self.urls.append(f)
        self.title = title
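    # The series info box holds three view-content blocks: the title, a
    # NBSP-separated detail line (platform / tags / artist), and the summary.
    # get_info_list() splits them apart for the getters below.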
    def get_info_list(self, soup: Soup) -> list:
        """-> [title, [platform, tags, artist], summary]"""
        infobox = soup.find("div", class_="col-sm-8")
        contents = infobox.find_all("div", class_="view-content")
        details = contents[1].get_text().split("\xa0")
        for n, detail in enumerate(details):
            details[n] = detail.strip()
        return [
            contents[0].get_text().strip(),
            details,
            contents[2].get_text().strip(),
        ]

    def get_title(self, soup: Soup):
        return clean_title(self.get_info_list(soup)[0])

    def get_artist(self, soup: Soup):
        return clean_title(self.get_info_list(soup)[1][2])
def get_soup(url: str):
    return Soup(clf2.solve(url)["html"])
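# Collect every chapter link from ul.list-body; the list is reversed,
# which suggests the page lists chapters newest-first.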
def get_pages_list(soup: Soup):
    pages_list = []
    list_body = soup.find("ul", class_="list-body")
    list_items = list_body.find_all("li", class_="list-item")
    for list_item in list_items:
        page = list_item.find("a")["href"]
        pages_list.append(page)
    pages_list.reverse()
    return pages_list
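# Chapter bodies live in div#novel_content: paragraph-tagged text is joined
# with newlines (empty <p> tags become blank lines), and pages without <p>
# tags fall back to the container's raw text.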
def get_content(soup: Soup):
    novel = soup.find("div", {"id": "novel_content"})
    ps = novel.find_all("p")
    if ps:
        text = ""
        for p in ps:
            if p.get_text() != "":
                text += p.get_text()
            else:
                text += "\n"
            text += "\n"
    else:
        text = novel.get_text()
    return text
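# The chapter heading (div.toon-title) embeds an extra <span>; its text is
# stripped out so only the chapter title itself remains.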
def get_page_title(soup: Soup):
    full = soup.find("div", class_="toon-title")
    span = full.find("span").get_text()
    title = full.get_text().replace(span, "").strip()
    return clean_title(title)
log(f"Site Added: Booktoki")