Commit 6bd6c0fe authored by litaolemo

update

parent d1db5b38
......@@ -27,7 +27,7 @@ from crawler.crawler_sys.utils import connect_with_redis
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
......@@ -70,7 +70,7 @@ class Crawler_weibo():
page_res = retry_get_url(url,headers=headers,proxies=0)
page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text,flags=re.DOTALL)[0]
page_json = json.loads(page_json_context)
text = page_json[0]["status"]["text"]
text = dehtml(page_json[0]["status"]["text"])
repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
......@@ -204,7 +204,7 @@ class Crawler_weibo():
if __name__ == '__main__':
test = Crawler_weibo()
url = 'https://weibo.com/p/1003061669879400/home?from=page_100306&mod=TAB#place'
url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place'
# releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
# nnn = test.video_page(url)
# kw = '任正非 BBC'
......
This diff is collapsed.
This diff is collapsed.
# -*- coding:UTF-8 -*-
# @Time : 2020/7/22 19:53
# @File : html_to_str.py
# @email : litao@igengmei.com
# @author : litao
from .HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
class _DeHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.__text = []
def handle_data(self, data):
text = data.strip()
if len(text) > 0:
text = sub('[ \t\r\n]+', ' ', text)
self.__text.append(text + ' ')
def handle_starttag(self, tag, attrs):
if tag == 'p':
self.__text.append('\n\n')
elif tag == 'br':
self.__text.append('\n')
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self.__text.append('\n\n')
def text(self):
return ''.join(self.__text).strip()
def dehtml(text):
    """Return ``text`` with its HTML markup removed.

    The input is fed through ``_DeHTMLParser`` and the collected plain
    text is returned.  This is a best-effort conversion: if parsing
    raises, the traceback is printed to stderr and the original ``text``
    is returned unchanged.

    :param text: HTML fragment to convert.
    :return: the extracted plain text, or ``text`` itself on failure.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        print_exc(file=stderr)
        return text
......@@ -5,3 +5,4 @@ absl-py==0.9.0
dkl==0.2.15
redis==3.5.3
elasticsearch==7.8.0
HTMLParser==0.0.2
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment