Skip to content

Commit

Permalink
新增gyutto.com站点刮削
Browse files Browse the repository at this point in the history
  • Loading branch information
musnow authored and Yuukiy committed Dec 12, 2023
1 parent f2a9106 commit 6d37c2d
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 0 deletions.
7 changes: 7 additions & 0 deletions core/avid.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ def get_id(filepath: str) -> str:
match = re.search(r'getchu[-_]*(\d+)', filename, re.I)
if match:
return 'GETCHU-' + match.group(1)
elif 'gyutto' in filename_lc:
match = re.search(r'gyutto-(\d+)', filename, re.I)
if match:
return 'GYUTTO-' + match.group(1)
else:
# 先尝试移除可疑域名进行匹配,如果匹配不到再使用原始文件名进行匹配
no_domain = re.sub(r'\w{3,10}\.(com|net|app|xyz)', '', filename, flags=re.I)
Expand Down Expand Up @@ -114,6 +118,9 @@ def guess_av_type(avid: str) -> str:
match = re.match(r'^GETCHU-(\d+)',avid,re.I)
if match:
return 'getchu'
match = re.match(r'^GYUTTO-(\d+)',avid,re.I)
if match:
return 'gyutto'
# 如果传入的avid完全匹配cid的模式,则将影片归类为cid
cid = get_cid(avid)
if cid == avid:
Expand Down
1 change: 1 addition & 0 deletions core/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ normal = airav,avsox,javbus,javdb,javlib,jav321,mgstage,prestige
fc2 = fc2,fc2fan,javdb,msin,javmenu
cid = fanza
getchu = dl_getchu
gyutto = gyutto

[Crawler]
# 爬虫至少要获取到哪些字段才可以视为抓取成功?
Expand Down
89 changes: 89 additions & 0 deletions web/gyutto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""从https://gyutto.com/官网抓取数据"""
import os
import sys
import logging
import time

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from web.base import resp2html, request_get
from web.exceptions import *
from core.datatype import MovieInfo

# Module-level logger named after this module
logger = logging.getLogger(__name__)

# Example item page: https://dl.gyutto.com/i/item266923
# NOTE(review): the example above uses https://dl.gyutto.com while base_url is
# plain http://gyutto.com -- presumably the site redirects; confirm.
base_url = 'http://gyutto.com'
# Encoding passed to resp2html when decoding pages fetched from this site
base_encode = 'euc-jp'

def get_movie_title(html):
    """Return the text of the first ``<h1>`` element of the page.

    Args:
        html: Parsed page; only its ``xpath`` method is used.

    Returns:
        The ``text`` attribute of the first ``//h1`` match, or ``None`` when
        the page has no ``<h1>`` element at all.
    """
    headings = html.xpath("//h1")
    if not headings:
        # No heading on the page: report a missing title instead of failing
        # with AttributeError on the empty result list.
        return None
    return headings[0].text

def get_movie_img(html, index=1):
    """Collect the ``src`` of the images wrapped in ``a.highslide`` links.

    Args:
        html: Parsed page; only its ``xpath`` method is used.
        index: ``0`` requests only the first image; any other value
            (default ``1``) requests the full list.

    Returns:
        When ``index == 0``: the ``src`` of the first image, or ``None`` if
        the page has no matching image.
        Otherwise: a list with the ``src`` of every matching image, in the
        order the XPath query returned them (empty if there are none).
    """
    nodes = html.xpath("//a[@class='highslide']/img")
    if index == 0:
        # A single value was requested, so never fall through to the list
        # result: a missing cover is None, not an empty list.
        return nodes[0].get('src') if nodes else None
    return [node.get('src') for node in nodes]

def parse_data(movie: MovieInfo):
    """Fetch the gyutto item page for ``movie.dvdid`` and fill in ``movie``.

    The item number is taken from the ID (``GYUTTO-<number>``, matched
    case-insensitively) and the page ``{base_url}/i/item<number>`` is
    requested. ``title``, ``cover``, ``preview_pics``, ``dvdid`` and ``url``
    are always assigned; ``producer``, ``genre``, ``publish_date`` and
    ``plot`` are assigned only when the corresponding element exists on the
    page, otherwise the attribute is left untouched.

    Args:
        movie: Object carrying the ID in ``dvdid``; it is populated in place.

    Raises:
        ValueError: ``movie.dvdid`` does not start with ``GYUTTO-``, or the
            release date text does not match ``%Y年%m月%d日``.
        MovieNotFoundError: The item page answered with HTTP 404.
    """
    prefix = 'GYUTTO-'
    id_uc = movie.dvdid.upper()
    if not id_uc.startswith(prefix):
        raise ValueError('Invalid gyutto number: ' + movie.dvdid)
    # Strip only the leading prefix; str.replace would remove every occurrence.
    gyutto_id = id_uc[len(prefix):]

    # Fetch the item page
    url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1'
    r = request_get(url, delay_raise=True)
    if r.status_code == 404:
        raise MovieNotFoundError(__name__, movie.dvdid)
    html = resp2html(r, base_encode)

    # Optional fields are gathered here so that a row missing from the page
    # cannot leave a local name unbound.
    details = {}
    for row in html.xpath("//dl[@class='BasicInfo clearfix']"):
        labels = row.xpath(".//dt/text()")
        if not labels:
            # A <dl> without any <dt> text carries nothing we can identify.
            continue
        label = labels[0]
        if label == "サークル":
            details['producer'] = ''.join(row.xpath(".//dd/a/text()"))
        elif label == "ジャンル":
            details['genre'] = row.xpath(".//dd/a/text()")
        elif label == "配信開始日":
            # Surrounding whitespace would make strptime reject the text.
            date_str = ''.join(row.xpath(".//dd/text()")).strip()
            date_time = time.strptime(date_str, "%Y年%m月%d日")
            details['publish_date'] = time.strftime("%Y-%m-%d", date_time)

    plot_nodes = html.xpath("//div[@class='unit_DetailLead']/p/text()")
    if plot_nodes:
        details['plot'] = plot_nodes[0]

    movie.title = get_movie_title(html)
    movie.cover = get_movie_img(html, 0)
    movie.preview_pics = get_movie_img(html)
    movie.dvdid = id_uc
    movie.url = url
    # actress and duration are not extracted by this crawler.
    for field, value in details.items():
        setattr(movie, field, value)

if __name__ == "__main__":
    # Manual smoke run: crawl one known item and log any crawler failure.
    import pretty_errors

    pretty_errors.configure(display_link=True)
    # Let the second handler attached to the root logger emit DEBUG records.
    logging.getLogger().handlers[1].setLevel(logging.DEBUG)
    movie = MovieInfo('gyutto-266923')

    try:
        parse_data(movie)
    except CrawlerError as err:
        logger.error(err, exc_info=True)

0 comments on commit 6d37c2d

Please sign in to comment.