新增gyutto.com站点刮削

Yuukiy · Dec 12, 2023 · 6d37c2d · 6d37c2d
1 parent f2a9106
commit 6d37c2d
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 0 deletions.
diff --git a/core/avid.py b/core/avid.py
@@ -30,6 +30,10 @@ def get_id(filepath: str) -> str:
         match = re.search(r'getchu[-_]*(\d+)', filename, re.I)
         if match:
             return 'GETCHU-' + match.group(1)
+    elif 'gyutto' in filename_lc:
+        match = re.search(r'gyutto-(\d+)', filename, re.I)
+        if match:
+            return 'GYUTTO-' + match.group(1)
     else:
         # 先尝试移除可疑域名进行匹配，如果匹配不到再使用原始文件名进行匹配
         no_domain = re.sub(r'\w{3,10}\.(com|net|app|xyz)', '', filename, flags=re.I)
@@ -114,6 +118,9 @@ def guess_av_type(avid: str) -> str:
     match = re.match(r'^GETCHU-(\d+)',avid,re.I)
     if match:
         return 'getchu'
+    match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
+    if match:
+        return 'gyutto'
     # 如果传入的avid完全匹配cid的模式，则将影片归类为cid
     cid = get_cid(avid)
     if cid == avid:

diff --git a/core/config.ini b/core/config.ini
@@ -32,6 +32,7 @@ normal = airav,avsox,javbus,javdb,javlib,jav321,mgstage,prestige
 fc2 = fc2,fc2fan,javdb,msin,javmenu
 cid = fanza
 getchu = dl_getchu
+gyutto = gyutto
 
 [Crawler]
 # 爬虫至少要获取到哪些字段才可以视为抓取成功？

diff --git a/web/gyutto.py b/web/gyutto.py
@@ -0,0 +1,89 @@
+"""从https://gyutto.com/官网抓取数据"""
+import os
+import sys
+import logging
+import time
+
+# Put the parent of this file's directory at the front of sys.path —
+# presumably so the `web` and `core` packages below resolve when this
+# module is executed as a script.
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from web.base import resp2html, request_get
+from web.exceptions import *
+from core.datatype import MovieInfo
+
+logger = logging.getLogger(__name__)
+
+# Example item page: https://dl.gyutto.com/i/item266923
+# NOTE(review): the example above uses https://dl.gyutto.com while base_url
+# is http://gyutto.com — confirm the site redirects as expected.
+base_url = 'http://gyutto.com'
+# Encoding handed to resp2html when parsing responses from the site.
+base_encode = 'euc-jp'
+
+def get_movie_title(html):
+    """Return the text of the first ``<h1>`` element of the page.
+
+    Args:
+        html: Parsed document exposing an ``xpath()`` method.
+
+    Returns:
+        The ``text`` of the first ``<h1>`` element, or ``None`` when the page
+        contains no ``<h1>`` at all.
+    """
+    headings = html.xpath("//h1")
+    # Without this guard an empty result list fell through and the attribute
+    # access below raised AttributeError on the list itself.
+    if not headings:
+        return None
+    return headings[0].text
+
+def get_movie_img(html, index=1):
+    """Collect image URLs from the ``<img>`` tags inside ``a.highslide`` links.
+
+    Args:
+        html: Parsed document exposing an ``xpath()`` method.
+        index: ``0`` selects only the first image; any other value selects
+            every matching image.
+
+    Returns:
+        For ``index == 0``: the ``src`` attribute of the first image, or
+        ``None`` when the page has no matching image.
+        Otherwise: a list with the ``src`` of every matching image (empty when
+        there is none).
+    """
+    sources = [img.get('src') for img in html.xpath("//a[@class='highslide']/img")]
+    if index == 0:
+        # A single image was requested: never hand back a list in its place.
+        return sources[0] if sources else None
+    return sources
+
+def _parse_publish_date(date_str):
+    """Convert a ``YYYY年MM月DD日`` string to ``YYYY-MM-DD``; ``None`` if it does not match."""
+    try:
+        parsed = time.strptime(date_str.strip(), "%Y年%m月%d日")
+    except ValueError:
+        logger.debug("Unrecognized gyutto publish date: %r", date_str)
+        return None
+    return time.strftime("%Y-%m-%d", parsed)
+
+
+def parse_data(movie: MovieInfo):
+    """Fetch the gyutto item page for ``movie.dvdid`` and fill in ``movie``.
+
+    Sets ``title``, ``cover``, ``preview_pics``, ``dvdid`` (upper-cased),
+    ``url``, ``producer``, ``publish_date``, ``genre`` and ``plot`` on
+    ``movie``. ``producer``, ``publish_date``, ``genre`` and ``plot`` are set
+    to ``None`` when the page does not provide them.
+
+    Args:
+        movie: Object whose ``dvdid`` has the form ``GYUTTO-<number>``
+            (case-insensitive).
+
+    Raises:
+        ValueError: ``movie.dvdid`` does not start with ``GYUTTO-``.
+        MovieNotFoundError: The site answered with HTTP 404.
+    """
+    prefix = 'GYUTTO-'
+    id_uc = movie.dvdid.upper()
+    if not id_uc.startswith(prefix):
+        raise ValueError('Invalid gyutto number: ' + movie.dvdid)
+    # Strip only the leading prefix; str.replace would remove every occurrence.
+    gyutto_id = id_uc[len(prefix):]
+    # Fetch the item page.
+    url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1'
+    r = request_get(url, delay_raise=True)
+    if r.status_code == 404:
+        raise MovieNotFoundError(__name__, movie.dvdid)
+    html = resp2html(r, base_encode)
+
+    # Start from None so a page lacking a row no longer raises
+    # UnboundLocalError at the assignments below.
+    producer = genre = publish_date = None
+    for row in html.xpath("//dl[@class='BasicInfo clearfix']"):
+        labels = row.xpath(".//dt/text()")
+        if not labels:
+            # A <dl> without <dt> text carries no field label; skip it.
+            continue
+        label = labels[0].strip()
+        if label == "サークル":
+            producer = ''.join(row.xpath(".//dd/a/text()"))
+        elif label == "ジャンル":
+            genre = row.xpath(".//dd/a/text()")
+        elif label == "配信開始日":
+            publish_date = _parse_publish_date(''.join(row.xpath(".//dd/text()")))
+
+    plot_nodes = html.xpath("//div[@class='unit_DetailLead']/p/text()")
+    plot = plot_nodes[0] if plot_nodes else None
+
+    movie.title = get_movie_title(html)
+    movie.cover = get_movie_img(html, 0)
+    movie.preview_pics = get_movie_img(html)
+    movie.dvdid = id_uc
+    movie.url = url
+    movie.producer = producer
+    # actress and duration are not populated by this crawler.
+    movie.publish_date = publish_date
+    movie.genre = genre
+    movie.plot = plot
+
+if __name__ == "__main__":
+    # Manual smoke test: scrape one known item and log any crawler failure.
+    import pretty_errors
+
+    pretty_errors.configure(display_link=True)
+    # Set the root logger's second handler (index 1) to DEBUG.
+    second_handler = logger.root.handlers[1]
+    second_handler.level = logging.DEBUG
+    movie = MovieInfo('gyutto-266923')
+
+    try:
+        parse_data(movie)
+    except CrawlerError as err:
+        logger.error(err, exc_info=1)