This repository has been archived by the owner on Jan 3, 2025. It is now read-only.

Commit

Merge pull request #12 from ZombieFly/Support-Bwiki
✨ feat: change the bwiki adaptation method
ZombieFly authored and PreviousRepoCommits committed Oct 8, 2022
2 parents 660040e + e7d652b commit 590ed5d
Showing 2 changed files with 108 additions and 34 deletions.
61 changes: 61 additions & 0 deletions nonebot_plugin_wiki/mediawiki/util.py
@@ -1,7 +1,9 @@
from __future__ import print_function, unicode_literals

from typing import Union, Tuple, List
import sys
import functools
from html.parser import HTMLParser


def debug(fn):
@@ -14,6 +16,65 @@ def wrapper(*args, **kwargs):
return wrapper


def get_from_attrs(attr_list, target):
if not target:
return False
if isinstance(target, str):
for attr in attr_list:
if target == attr[0]:
return attr[1]
if isinstance(target, tuple):
for attr in attr_list:
if target[0] == attr[0] and target[1] in attr[1]:
return True
if isinstance(target, list) and len(target) == 2:
find = target[0]
fetch = target[1]
got = False
temp = None
for attr in attr_list:
if find[0] == attr[0] and find[1] in attr[1]:
got = True
if fetch[0] == attr[0]:
temp = attr[1]
if temp and got:
return temp
return False
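
# Illustrative usage sketch (hypothetical attrs list, not part of this commit's diff).
# get_from_attrs inspects an HTMLParser-style attrs list and supports three target forms:
#
#     attrs = [("class", "wiki-bot infobox"), ("data-title", "Intro")]
#     get_from_attrs(attrs, "data-title")                                 # str    -> "Intro" (attribute value)
#     get_from_attrs(attrs, ("class", "wiki-bot"))                        # tuple  -> True (substring match)
#     get_from_attrs(attrs, [("class", "wiki-bot"), ("data-title", "")])  # [find, fetch] -> "Intro"
#     get_from_attrs(attrs, ("id", "nav"))                                # no match -> False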


class SimpleInnerParser(HTMLParser):
def __init__(
self,
target_tag='p',
target_attr: Union[str, Tuple, List[Tuple]] = None, # type: ignore
text_context=True
):
super().__init__()
self.output = []
self.open_write = False
self.target_tag = target_tag
self.target_attr = target_attr
self.text_context = text_context

def handle_starttag(self, tag, attrs):
if tag == self.target_tag or not self.target_tag:
checker = get_from_attrs(
attrs, self.target_attr) if self.target_attr else True
self.open_write = True and checker
if value := get_from_attrs(attrs, self.target_attr):
if not self.text_context:
self.output.append(str(value).strip())
self.open_write = False

def handle_endtag(self, tag):
if tag == self.target_tag:
self.open_write = False

def handle_data(self, data: str):
if self.open_write and self.text_context and data.strip():
self.output.append(data.strip())
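
# Illustrative usage sketch (hypothetical HTML, not part of this commit's diff).
# bwiki_summary in wikipedia.py drives this parser with an empty target tag and the
# attribute filter ("class", "wiki-bot"), so only text inside elements carrying that
# class is collected:
#
#     parser = SimpleInnerParser("", ("class", "wiki-bot"))
#     parser.feed(rendered_html)            # HTML returned by the MediaWiki parse API
#     parser.close()
#     intro = "\n".join(parser.output)      # text found inside class="wiki-bot" elements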


class cache(object):

def __init__(self, fn):
81 changes: 47 additions & 34 deletions nonebot_plugin_wiki/mediawiki/wikipedia.py
@@ -10,6 +10,7 @@

from .exceptions import (
ApiReturnError,
NoExtractError,
PageError,
DisambiguationError,
RedirectError,
@@ -262,13 +263,41 @@ async def random(pages=1):
return titles


async def bwiki_summary(title: str):
"""接受来自BWiki的标题,返回文字简介
Args:
title: 标题
Returns:
str: 简介
"""
params = {
"action": "parse",
"format": "json",
"disablelimitreport": False,
"redirects": True,
"disableeditsection": True,
"page": title
}

request = await _wiki_request(params)

to_feed = request["parse"]["text"]["*"]
wikibot_parser = util.SimpleInnerParser("", ("class", "wiki-bot"))
wikibot_parser.feed(to_feed)
wikibot_parser.close()

return '\n'.join(wikibot_parser.output)
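
# Illustrative shape of the parse-API response consumed above (trimmed; the rendered
# HTML sits under request["parse"]["text"]["*"] and is what gets fed to the parser):
#
#     {"parse": {"text": {"*": "<div class=\"mw-parser-output\">...</div>"}}}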


async def summary(
title,
sentences=0,
chars=0,
auto_suggest=True,
redirect=True
) -> Tuple[int, str]: # type: ignore
) -> Tuple[int, str]: # type: ignore
'''
    Page id and plain-text summary
Keyword arguments:
@@ -292,35 +321,21 @@ async def summary(
'titles': title,

}
if "wiki.biligame" not in API_URL:
if sentences:
query_params['exsentences'] = sentences
elif chars:
query_params['exchars'] = chars
else:
query_params['exintro'] = ''
request = await _wiki_request(query_params)
try:
summary = (request['query']['pages'][pageid]['extract']).strip()
except KeyError as e:
raise NoExtractError from e

if sentences:
query_params['exsentences'] = sentences
elif chars:
query_params['exchars'] = chars
else:
query_params['exintro'] = ''
request = await _wiki_request(query_params)
try:
summary = (request['query']['pages'][pageid]['extract']).strip()
except KeyError:

        # bili wiki parsing
global USER_AGENT

headers = {
'User-Agent': USER_AGENT
}

async with httpx.AsyncClient(proxies=PROXIES, timeout=None) as client:
r = await client.get(page_info.url, headers=headers)

r = r.text

summary = re.sub(r"</?(.+?)>", "", ''.join(re.compile(
r'<p><b>(.*?)\n<p>\n<div').findall(r)))
if summary == '':
summary = re.sub(r"</?(.+?)>", "", ''.join(re.compile(
r'<p><b>(.*?)\n</p>').findall(r)))
summary = await bwiki_summary(title)

return [pageid, summary] # type: ignore

Expand Down Expand Up @@ -805,7 +820,7 @@ async def _wiki_request(params):
global PROXIES

params['format'] = 'json'
if not ('action' in params):
if 'action' not in params:
params['action'] = 'query'

headers = {
@@ -823,16 +838,14 @@
time.sleep(int(wait_time.total_seconds()))

global RETRY_TIMES
for times in range(RETRY_TIMES):
for _ in range(RETRY_TIMES):
async with httpx.AsyncClient(proxies=PROXIES, timeout=None) as client:
r = await client.get(API_URL, params=params, headers=headers)

ret = r.json()
# print(ret)

if 'error' in ret:
if ' a temporary problem' in ret['error']['info']:
pass
else:
if 'error' not in ret:
if RATE_LIMIT:
RATE_LIMIT_LAST_CALL = datetime.now()
return ret
