Commit d1db5b38 authored by litaolemo's avatar litaolemo

update 增加微博爬虫,测试ok

parent aa4469d5
# crawler
1. 部署在BJ-PaaS-test-nvwa001/srv/apps/
1. 部署在BJ-GM-Prod-Cos-faiss001/srv/apps/
2. 创建虚拟环境 conda activate crawler_env/conda deactivate
\ No newline at end of file
......@@ -41,7 +41,7 @@ def get_proxy_from_redis():
def func_get_proxy_to_redis():
# chance = random.random()
auth = kdl.Auth("997803479675913", "er2siw6i58c61s387sqljvovoz8zybaq")
auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")
client = kdl.Client(auth)
# 获取订单到期时间, 返回时间字符串
......
......@@ -1792,7 +1792,7 @@ class Crawler_toutiao():
# pass
data_lis.append(res)
if len(data_lis) >= 10:
if len(data_lis) >= 100:
output_result(result_Lst=data_lis,
platform=self.platform,
output_to_file=output_to_file,
......
......@@ -27,7 +27,7 @@ import requests
import re
import datetime
import json
import aiohttp
# import aiohttp
import random
from bs4 import BeautifulSoup
from multiprocessing import Pool
......@@ -45,7 +45,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
class Crawler_v_qq():
......
# -*- coding:UTF-8 -*-
# @Time : 2020/7/22 14:42
# @File : crawler_weibo.py
# @email : litao@igengmei.com
# @author : litao
# -*- coding: utf-8 -*-
import os
import copy
import requests
import re
import datetime
import json
# import aiohttp
import random
# from bs4 import BeautifulSoup
# from multiprocessing import Pool
# from multiprocessing import Process
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils import connect_with_redis
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from write_data_into_es.func_get_releaser_id import *
class Crawler_weibo():
    """Crawl a Weibo user's timeline through the m.weibo.cn mobile endpoints.

    ``releaser_page`` is a generator that walks the paginated
    ``api/container/getIndex`` feed and yields one dict per post.
    ``get_single_page`` fetches the full text of a post that the feed flags
    as long text.
    """

    def __init__(self, timeout=None, platform='weibo'):
        """Set up request defaults and the standard field template.

        Args:
            timeout: timeout value handed to ``retry_get_url``; 10 is used
                when ``None`` is given (presumably seconds -- confirm against
                ``retry_get_url``).
            platform: platform label stored on the instance and written into
                ``video_data['platform']``.
        """
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['describe', 'repost_count', 'isOriginal',
                       'video_id']
        for popk in pop_key_Lst:
            self.video_data.pop(popk)

    @staticmethod
    def get_video_image(data):
        """Return the value stored under ``"pic_496x280"`` in ``data``."""
        video_photo_url = data["pic_496x280"]
        return video_photo_url

    @staticmethod
    def get_single_page(mid):
        """Fetch one post page and extract its text and counters.

        Args:
            mid: post id interpolated into ``https://m.weibo.cn/status/<mid>``.

        Returns:
            Tuple ``(text, repost_count, comment_count, favorite_count)``.
            The three counters are passed through ``trans_play_count``.

        Raises:
            IndexError: the page holds no ``render_data = ...[0]`` payload.
            ValueError: the captured payload is not valid JSON.
            KeyError: the payload lacks one of the expected fields.
        """
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        # NOTE(review): proxies=0 presumably disables proxying -- confirm
        # against retry_get_url.
        page_res = retry_get_url(url, headers=headers, proxies=0)
        # The page embeds its data as a JSON array assigned to render_data
        # and then indexed with [0]; capture only the array literal.
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        status = json.loads(page_json_context)[0]["status"]
        text = status["text"]
        repost_count = trans_play_count(status["reposts_count"])
        comment_count = trans_play_count(status["comments_count"])
        favorite_count = trans_play_count(status["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

    def get_releaser_id(self, releaserUrl):
        """Resolve ``releaserUrl`` with the module-level ``get_releaser_id``."""
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    # NOTE: an earlier, disabled experiment read an XSRF token from
    # https://m.weibo.cn/api/config (field data.st). The timeline requests
    # below are sent without cookies and without an x-xsrf-token header.

    def _build_record(self, one, mblog, releaser_id):
        """Turn one feed card and its ``mblog`` payload into an output dict."""
        mid = mblog.get("mid")
        forward_text = ""
        forward_user = ""
        retweeted = mblog.get("retweeted_status")
        if one.get("source") == "绿洲":
            text_type = "绿洲"
        elif retweeted:
            text_type = "转发"
            forward_text = retweeted.get("raw_text")
            # The original author entry may be missing; keep the row anyway.
            forward_user = (retweeted.get("user") or {}).get("screen_name", "")
        else:
            text_type = one.get("source")
        if mblog.get("isLongText"):
            # The feed does not carry the full body of long posts, so fetch
            # the post page. Its counters are ignored; the feed's are used.
            text, _, _, _ = self.get_single_page(mid)
        else:
            text = mblog["raw_text"]
        return {
            "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
            "url": one["scheme"],
            "releaser": mblog["user"]["screen_name"],
            "repost_count": trans_play_count(mblog["reposts_count"]),
            "comment_count": trans_play_count(mblog["comments_count"]),
            "favorite_count": trans_play_count(mblog["attitudes_count"]),
            "title": text.replace("\u200b", ""),
            "wb_type": text_type,
            "forward_user": forward_user,
            "forward_text": forward_text,
            "mid": mid,
            "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
            "releaser_id_str": "weibo_%s" % releaser_id
        }

    def releaser_page(self, releaserUrl,
                      output_to_file=False, filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None, proxies_num=None):
        """Yield one dict per post found on a releaser's timeline.

        Args:
            releaserUrl: URL (or id) understood by ``get_releaser_id``.
            releaser_page_num_max: upper bound on the number of pages
                requested.
            proxies_num: forwarded to ``retry_get_url`` as ``proxies`` when
                truthy; otherwise the request is made without that argument.
            output_to_file, filepath, output_to_es_raw,
            output_to_es_register, push_to_redis, es_index, doc_type:
                accepted for signature compatibility; not used here.

        Yields:
            dict with the keys ``release_time``, ``url``, ``releaser``,
            ``repost_count``, ``comment_count``, ``favorite_count``,
            ``title``, ``wb_type``, ``forward_user``, ``forward_text``,
            ``mid``, ``releaserUrl`` and ``releaser_id_str``.

        Paging stops when the releaser id cannot be resolved, a request
        fails or returns a non-200 status, the body is not JSON, or the
        response carries no ``since_id`` cursor for a next page.
        """
        print('Processing releaserUrl %s' % releaserUrl)
        # The resolver returns None on failure and (id, containerid) on
        # success; a bare id is also accepted. Check before unpacking.
        releaser_info = self.get_releaser_id(releaserUrl)
        if isinstance(releaser_info, (tuple, list)):
            releaser_id = releaser_info[0] if releaser_info else None
        else:
            releaser_id = releaser_info
        if releaser_id is None:
            print('Can not resolve releaser id from %s' % releaserUrl)
            return

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "mweibo-pwa": "1",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        # Profile page of the user: /u/<id>?uid=<id>&t=0
        headers["referer"] = "https://m.weibo.cn/u/{0}?uid={0}&t=0".format(releaser_id)

        request_kwargs = {"headers": headers, "timeout": self.timeout}
        if proxies_num:
            request_kwargs["proxies"] = proxies_num

        pagenum = 0
        has_more = True
        since_id = 0
        while pagenum <= releaser_page_num_max and has_more:
            pagenum += 1
            url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(releaser_id, releaser_id, releaser_id, since_id)
            print('Page number": "%d' % pagenum)
            try:
                get_page = retry_get_url(url, **request_kwargs)
            except Exception as e:
                print('request failed for %s: %s' % (url, e))
                break
            # since_id is unchanged after a failed request, so retrying here
            # would only repeat the same call; stop instead.
            if get_page is None or get_page.status_code != 200:
                break
            try:
                page_json = get_page.json()
            except ValueError as e:
                print('response of %s is not JSON: %s' % (url, e))
                break

            data = page_json.get("data") or {}
            card_info = data.get("cardlistInfo") or {}
            total = card_info.get("total")
            # NOTE(review): compares a page number with "total" as the
            # original code did -- confirm what "total" counts.
            if total is not None and pagenum > total:
                break
            since_id = card_info.get("since_id")
            if not since_id:
                # No cursor for a next page: emit this page, then stop.
                has_more = False

            for one in data.get("cards") or []:
                mblog = None
                try:
                    mblog = one.get("mblog")
                    if not mblog:
                        # Without an "mblog" entry no record can be built.
                        continue
                    res_dic = self._build_record(one, mblog, releaser_id)
                except Exception as e:
                    print(mblog)
                    print("row formate error %s" % e)
                    continue
                # Yield outside the try so exceptions thrown in by the
                # consumer are not mistaken for row errors.
                yield res_dic

    def get_releaser_follower_num(self, releaserUrl):
        """Not implemented; always returns ``None``."""
        pass
if __name__ == '__main__':
    # Manual smoke run: crawl one releaser timeline and print each record.
    crawler = Crawler_weibo()
    sample_url = 'https://weibo.com/p/1003061669879400/home?from=page_100306&mod=TAB#place'
    records = crawler.releaser_page(
        sample_url,
        output_to_es_raw=True,
        es_index='crawler-data-raw',
        releaser_page_num_max=400,
        proxies_num=0,
    )
    for record in records:
        print(record)
\ No newline at end of file
......@@ -130,7 +130,7 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
year = func_inyear(input_time)
if year != str(0):
input_time = year + '-' + input_time
real_time = real_time = int(datetime.datetime.strptime(input_time,
real_time = int(datetime.datetime.strptime(input_time,
'%Y-%m-%d').timestamp()*1e3)
else:
print('error in {input_time}'.format(input_time=input_time))
......@@ -143,5 +143,31 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
real_time = 0
print('unsuitable format %s' % input_time)
return real_time
\ No newline at end of file
def weibo_parse_time(publish_time):
    """Normalise a Weibo display time and convert it to a timestamp.

    Anything from the first '来自' onwards is dropped. The remaining text is
    then rewritten according to the first marker it contains:

    * '刚刚'  -> the current local time, minute resolution
    * '分钟'  -> the current local time minus the leading number of minutes
    * '今天'  -> today's date followed by the text after the third character
    * '月'    -> the current year plus fixed slices of the text; presumably
                the layout is 'MM月DD日 HH:MM' -- confirm against real data

    Text with none of these markers is passed on unchanged.

    Returns:
        Whatever ``trans_strtime_to_timestamp`` returns for the rewritten
        string.
    """
    stamp = publish_time.split('来自')[0]
    minute_fmt = '%Y-%m-%d %H:%M'
    if '刚刚' in stamp:
        stamp = datetime.datetime.now().strftime(minute_fmt)
    elif '分钟' in stamp:
        # Text before the first '分钟' is the number of minutes elapsed.
        elapsed = datetime.timedelta(minutes=int(stamp.split('分钟')[0]))
        stamp = (datetime.datetime.now() - elapsed).strftime(minute_fmt)
    elif '今天' in stamp:
        stamp = '%s %s' % (datetime.datetime.now().strftime('%Y-%m-%d'),
                           stamp[3:])
    elif '月' in stamp:
        stamp = '%s-%s-%s %s' % (datetime.datetime.now().strftime('%Y'),
                                 stamp[0:2], stamp[3:5], stamp[7:12])
    return trans_strtime_to_timestamp(stamp)
if __name__ == "__main__":
    # Quick manual check with a month-day string.
    sample_timestamp = trans_strtime_to_timestamp("06-03")
    print(sample_timestamp)
\ No newline at end of file
lxml==4.5.1
requests==2.23.0
tqdm==4.46.1
absl-py==0.9.0
kdl==0.2.15
redis==3.5.3
elasticsearch==7.8.0
......@@ -43,7 +43,7 @@ def write_es(target_index,platform="short_video"):
'abstract': '《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数。',
'article_id': 6851819988165394958, 'releaser_id_str': 'toutiao_103497952048',
'video_img': 'http://p26-tt.byteimg.com/img/pgc-image/7448338f2712460d968e65062b249a40~720x380_cs.webp',
'id': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
'aid': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
'content': '<h1 class="pgc-h-arrow-right">饿了吗?戳右边关注我们,每天给您送上最新出炉的娱乐硬核大餐!</h1><p>上周六(7月18日)《这就是街舞》火热开播,引来无数热爱街舞的小伙伴前来观看,除了选手们精彩的表演,最令人期待的还是四位导师的表现。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/1cde4effb617411fb66467228f5fc20e" img_width="494" img_height="585" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数(选手晋级票)。</p><p>最终,张艺兴以第一名的成绩拿到最多毛巾,而45岁的钟汉良力压90后王一博成为倒数第二。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f7d980bc497a4f98963d76dbd5ec08cc" img_width="1582" img_height="887" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>7月21日,有网友发帖质疑,是因为王一博年纪最小,红的时间最短所以用他挽尊吗?并质疑钟汉良是否暗箱操作。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/ba8cd99c5cb2429aa46d99c5a5f62918" img_width="636" img_height="126" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>该网友提出两个疑点,一是投票阶段的视频虽然经过剪辑,却能看出最开始的时候都没人投钟汉良,最后是怎么比王一博多的?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/e0a38a7a89bc4763b551b5011def58f1" img_width="619" img_height="403" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>二是投票结束时钟汉良表情凝重,而王一博表情轻松,看起来很有信心。钟汉良有在节目里说能用耳朵听见有多少人投自己。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/546810450d7b4f0c929479eea207b3ad" img_width="855" img_height="465" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实网友有这样的猜想并不奇怪,在节目播出时便有圈内人发博表示搞不懂为什么王一博会垫底。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/0eca18a8e15243e0b03ff1e8cd90f0af" img_width="919" img_height="384" inline="0" 
alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>而对钟汉良是否能胜任《这街》导师身份的话题也持续讨论未断过。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/8831a69ccbef4b8fae65497356fedfcb" img_width="775" img_height="323" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>在大多数人心中,张艺兴、王嘉尔、王一博三个年轻人都是在街舞方面有所造诣的唱跳艺人。</p><p>钟汉良不是演员吗?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f55235255cee43dd8fce9f091f987176" img_width="879" img_height="518" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>诚然,如今已有45岁的钟汉良在跳舞方面已经不如年轻人有爆发力,但节目中也可以看出他有跳街舞的基础。</p><p>许多人不知道的是,他出道前其实是无线的舞蹈艺人,没靠脸吃饭的时候,他也是拥有一身舞艺,不一定比王一博差。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/7448338f2712460d968e65062b249a40" img_width="759" img_height="502" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1993年钟汉良加入TVB的舞蹈艺员训练班,学习现代舞、民族舞、芭蕾舞,而他最擅长的是拉丁舞,当时曾为梅艳芳、张国荣等天王巨星担任伴舞,更有一个当芭蕾舞者的梦想。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a5d6f1c59bad422eb4631b5d6a8feadc" img_width="506" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>但因为他生的白净,身高183cm,长相帅气,于是很快被调至艺员部拍电视剧。那段时间他连续出演《少年五虎》、《刑事侦缉档案》、《第三类法庭》为自己积累了不少观众缘。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a05a5086d9164c909058276ce4a9f3f5" img_width="594" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1995年,他获台湾唱作人邰正宵发掘,赴台湾发展并成为唱跳歌手,并出了自己的首张专辑《OREA》。</p><div class="pgc-img"><img 
src="https://p3.pstatp.com/large/pgc-image/59f1d258c5274cfebe109a2917d4b2e8" img_width="400" img_height="262" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>钟汉良当了5年的歌手都没有什么水花,直到2000年后,他开始转向内地拍剧,才开始小有名气。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/73663a4cf05841d694c42b3cb9d91be3" img_width="524" img_height="345" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实这期间他并不是没有尝试再回到舞台上,2006年他就参加了《舞林大会》,但这时候大家已经开始淡忘他曾是一位唱跳歌手。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/dcc4137db0a24d28b5595ca771a2e8f5" img_width="1311" img_height="684" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>直到2015年,他成立自己的工作室,再次重拾歌手的身份,除了出个人音乐专辑外,还开过演唱会,如今加盟《这就是街舞》,可以尽情展示他的舞技,也算是一场多年来当舞者的夙愿。</p><p>他在街舞也确实玩的很开心。虽然依然有质疑的声音出现,但相信在后面的节目中能在他身上看到更多惊喜。</p><p>#钟汉良暗箱操作#、#钟汉良跳舞#、#钟汉良王一博#</p><p>作者:每天都想吃榴莲</p><p>责编:阿叉</p>'}
_id = "test0"
......
......@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
def weibo(releaserUrl, **kwargs):
    """Extract the Weibo user id and container id from a releaser URL.

    Recognised inputs:

    * ``.../u/<digits>``: both ids are the digits.
    * ``.../p/<digits>``: the container id is the digits; the user id drops
      the first six digits when there are 15 or more.
    * any other string containing ``/``: both ids are the first run of
      digits.
    * a string made of digits only: both ids are that string.

    Args:
        releaserUrl: URL or bare numeric id, as a string.
        **kwargs: accepted for interface compatibility; ignored.

    Returns:
        Tuple ``(releaser_id, containerid)`` of strings, or ``None`` when no
        id can be extracted or ``releaserUrl`` is not a string.
    """
    try:
        if "/u/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/u/(\d+)", releaserUrl)[0]
        elif "/p/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/p/(\d+)", releaserUrl)[0]
            if len(releaser_id) >= 15:
                # Drop the first six digits of a long page id to get the
                # user id; the container id keeps the full value.
                releaser_id = releaser_id[6:]
        elif "/" in releaserUrl:
            releaser_id = containerid = re.findall(r"(\d+)", releaserUrl)[0]
        else:
            # A bare numeric id. The earlier code stored it under a
            # misspelled name, so this branch always ended in None.
            candidate = releaserUrl.strip()
            if not re.fullmatch(r"\d+", candidate):
                return None
            releaser_id = containerid = candidate
        return releaser_id, containerid
    except (IndexError, TypeError, AttributeError):
        # IndexError: no digits matched. TypeError/AttributeError: the input
        # was not a string.
        return None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment