Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: 获取原创微博长内容修改为接口调用 #406

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions weibo_spider/parser/comment_parser.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
import logging
import random
import requests
from time import sleep

from .parser import Parser
from .util import handle_garbled, handle_html
from .util import handle_html, get_long_weibo_detail

logger = logging.getLogger('spider.comment_parser')


class CommentParser(Parser):
def __init__(self, cookie, weibo_id):
self.cookie = cookie
self.weibo_id = weibo_id
self.url = 'https://weibo.cn/comment/' + weibo_id
self.selector = handle_html(self.cookie, self.url)

def get_long_weibo(self):
"""获取长原创微博"""
try:
for i in range(5):
self.selector = handle_html(self.cookie, self.url)
if self.selector is not None:
info = self.selector.xpath("//div[@class='c']")[1]
wb_content = handle_garbled(info)
wb_time = info.xpath("//span[@class='ct']/text()")[0]
weibo_content = wb_content[wb_content.find(':') +
1:wb_content.rfind(wb_time)]
if weibo_content is not None:
return weibo_content
weibo_content = get_long_weibo_detail(self.cookie, self.weibo_id)
if weibo_content is not None:
return weibo_content
sleep(random.randint(6, 10))
except Exception:
logger.exception(u'网络出错')
Expand Down
14 changes: 14 additions & 0 deletions weibo_spider/parser/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import hashlib
import json
import logging
import re
import sys

import requests
Expand Down Expand Up @@ -118,3 +119,16 @@ def string_to_int(string):
elif string.endswith(u'亿'):
string = float(string[:-1]) * 100000000
return int(string)


def get_long_weibo_detail(cookie, id):
    """Fetch the full plain text of a long weibo via the m.weibo.cn API.

    Requests ``https://m.weibo.cn/statuses/show?id=<id>``, reads
    ``data.text`` from the JSON reply, turns ``<br />`` into newlines and
    strips every remaining HTML tag.

    Args:
        cookie: Value sent verbatim as the ``Cookie`` request header.
        id: Weibo id as a string; it is concatenated onto the URL. The name
            shadows the builtin but is kept so existing callers keep working.

    Returns:
        The cleaned text, or ``None`` when the response status is not 200 or
        any error occurs (network failure, timeout, non-JSON body, missing
        ``data``/``text`` keys). Errors are logged and never raised.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    # The HTTP header is spelled "User-Agent"; with an underscore the server
    # never sees this value as the user agent and Requests still sends its
    # default "python-requests/x.y" one.
    headers = {'User-Agent': user_agent, 'Cookie': cookie}
    try:
        # Requests waits forever without an explicit timeout; bound the call
        # so a stalled connection becomes a logged error and a None result.
        resp = requests.get("https://m.weibo.cn/statuses/show?id=" + id,
                            headers=headers,
                            timeout=10)
        if resp.status_code != 200:
            return None
        content = resp.json()['data']['text'].replace("<br />", "\n")
        # Drop every remaining opening/closing tag (links, emoji spans, ...).
        return re.sub(r"</?[^>]+>", "", content)
    except Exception as e:
        # Best-effort helper: log the failure and report "no content".
        logger.exception(e)
        return None