Commit d1db5b38 authored by litaolemo's avatar litaolemo

update 增加微博爬虫,测试ok

parent aa4469d5
# crawler
1. 部署在BJ-PaaS-test-nvwa001/srv/apps/
1. 部署在BJ-GM-Prod-Cos-faiss001/srv/apps/
2. 创建虚拟环境 conda activate crawler_env/conda deactivate
\ No newline at end of file
......@@ -41,7 +41,7 @@ def get_proxy_from_redis():
def func_get_proxy_to_redis():
# chance = random.random()
auth = kdl.Auth("997803479675913", "er2siw6i58c61s387sqljvovoz8zybaq")
auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")
client = kdl.Client(auth)
# 获取订单到期时间, 返回时间字符串
......
......@@ -1792,7 +1792,7 @@ class Crawler_toutiao():
# pass
data_lis.append(res)
if len(data_lis) >= 10:
if len(data_lis) >= 100:
output_result(result_Lst=data_lis,
platform=self.platform,
output_to_file=output_to_file,
......
......@@ -27,7 +27,7 @@ import requests
import re
import datetime
import json
import aiohttp
# import aiohttp
import random
from bs4 import BeautifulSoup
from multiprocessing import Pool
......@@ -45,7 +45,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
class Crawler_v_qq():
......
This diff is collapsed.
......@@ -130,7 +130,7 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
year = func_inyear(input_time)
if year != str(0):
input_time = year + '-' + input_time
real_time = real_time = int(datetime.datetime.strptime(input_time,
real_time = int(datetime.datetime.strptime(input_time,
'%Y-%m-%d').timestamp()*1e3)
else:
print('error in {input_time}'.format(input_time=input_time))
......@@ -143,5 +143,31 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
real_time = 0
print('unsuitable format %s' % input_time)
return real_time
\ No newline at end of file
def weibo_parse_time(publish_time):
    """Normalise a Weibo display time and convert it via trans_strtime_to_timestamp.

    Everything from the ``来自`` marker onwards is discarded and surrounding
    whitespace is removed.  The remaining text is then rewritten according to
    the first marker it contains:

    * ``刚刚``  -> the current time, formatted ``%Y-%m-%d %H:%M``
    * ``分钟``  -> the current time minus the leading number of minutes
    * ``今天``  -> today's date followed by the text after the 3rd character
    * ``月``    -> current year plus month/day/clock taken from fixed
      positions (characters 0-1, 3-4 and 7-11 respectively)
    * otherwise the text is passed through unchanged.

    Args:
        publish_time: The raw time text (str).

    Returns:
        Whatever ``trans_strtime_to_timestamp`` returns for the rewritten text.

    Raises:
        ValueError: If the text before ``分钟`` is not an integer.
    """
    # Read the clock once so every branch works from the same instant.
    now = datetime.datetime.now()
    # Strip so that the positional slices below are not shifted by padding.
    publish_time = publish_time.split('来自')[0].strip()

    if '刚刚' in publish_time:
        publish_time = now.strftime('%Y-%m-%d %H:%M')
    elif '分钟' in publish_time:
        minutes_ago = int(publish_time[:publish_time.find('分钟')])
        posted_at = now - datetime.timedelta(minutes=minutes_ago)
        publish_time = posted_at.strftime('%Y-%m-%d %H:%M')
    elif '今天' in publish_time:
        # Skip the two marker characters and the separator that follows them.
        clock = publish_time[3:]
        publish_time = now.strftime('%Y-%m-%d') + ' ' + clock
    elif '月' in publish_time:
        # NOTE(review): assumes zero-padded "MM月DD日 HH:MM" text — confirm
        # against the crawler that feeds this function.
        month = publish_time[0:2]
        day = publish_time[3:5]
        clock = publish_time[7:12]
        publish_time = now.strftime('%Y') + '-' + month + '-' + day + ' ' + clock

    return trans_strtime_to_timestamp(publish_time)
if __name__ == "__main__":
    # Manual smoke check: convert a month-day string that carries no year.
    sample_input = "06-03"
    converted = trans_strtime_to_timestamp(sample_input)
    print(converted)
\ No newline at end of file
lxml==4.5.1
requests==2.23.0
tqdm==4.46.1
absl-py==0.9.0
kdl==0.2.15
redis==3.5.3
elasticsearch==7.8.0
......@@ -43,7 +43,7 @@ def write_es(target_index,platform="short_video"):
'abstract': '《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数。',
'article_id': 6851819988165394958, 'releaser_id_str': 'toutiao_103497952048',
'video_img': 'http://p26-tt.byteimg.com/img/pgc-image/7448338f2712460d968e65062b249a40~720x380_cs.webp',
'id': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
'aid': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
'content': '<h1 class="pgc-h-arrow-right">饿了吗?戳右边关注我们,每天给您送上最新出炉的娱乐硬核大餐!</h1><p>上周六(7月18日)《这就是街舞》火热开播,引来无数热爱街舞的小伙伴前来观看,除了选手们精彩的表演,最令人期待的还是四位导师的表现。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/1cde4effb617411fb66467228f5fc20e" img_width="494" img_height="585" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数(选手晋级票)。</p><p>最终,张艺兴以第一名的成绩拿到最多毛巾,而45岁的钟汉良力压90后王一博成为倒数第二。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f7d980bc497a4f98963d76dbd5ec08cc" img_width="1582" img_height="887" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>7月21日,有网友发帖质疑,是因为王一博年纪最小,红的时间最短所以用他挽尊吗?并质疑钟汉良是否暗箱操作。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/ba8cd99c5cb2429aa46d99c5a5f62918" img_width="636" img_height="126" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>该网友提出两个疑点,一是投票阶段的视频虽然经过剪辑,却能看出最开始的时候都没人投钟汉良,最后是怎么比王一博多的?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/e0a38a7a89bc4763b551b5011def58f1" img_width="619" img_height="403" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>二是投票结束时钟汉良表情凝重,而王一博表情轻松,看起来很有信心。钟汉良有在节目里说能用耳朵听见有多少人投自己。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/546810450d7b4f0c929479eea207b3ad" img_width="855" img_height="465" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实网友有这样的猜想并不奇怪,在节目播出时便有圈内人发博表示搞不懂为什么王一博会垫底。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/0eca18a8e15243e0b03ff1e8cd90f0af" img_width="919" img_height="384" inline="0" 
alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>而对钟汉良是否能胜任《这街》导师身份的话题也持续讨论未断过。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/8831a69ccbef4b8fae65497356fedfcb" img_width="775" img_height="323" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>在大多数人心中,张艺兴、王嘉尔、王一博三个年轻人都是在街舞方面有所造诣的唱跳艺人。</p><p>钟汉良不是演员吗?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f55235255cee43dd8fce9f091f987176" img_width="879" img_height="518" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>诚然,如今已有45岁的钟汉良在跳舞方面已经不如年轻人有爆发力,但节目中也可以看出他有跳街舞的基础。</p><p>许多人不知道的是,他出道前其实是无线的舞蹈艺人,没靠脸吃饭的时候,他也是拥有一身舞艺,不一定比王一博差。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/7448338f2712460d968e65062b249a40" img_width="759" img_height="502" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1993年钟汉良加入TVB的舞蹈艺员训练班,学习现代舞、民族舞、芭蕾舞,而他最擅长的是拉丁舞,当时曾为梅艳芳、张国荣等天王巨星担任伴舞,更有一个当芭蕾舞者的梦想。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a5d6f1c59bad422eb4631b5d6a8feadc" img_width="506" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>但因为他生的白净,身高183cm,长相帅气,于是很快被调至艺员部拍电视剧。那段时间他连续出演《少年五虎》、《刑事侦缉档案》、《第三类法庭》为自己积累了不少观众缘。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a05a5086d9164c909058276ce4a9f3f5" img_width="594" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1995年,他获台湾唱作人邰正宵发掘,赴台湾发展并成为唱跳歌手,并出了自己的首张专辑《OREA》。</p><div class="pgc-img"><img 
src="https://p3.pstatp.com/large/pgc-image/59f1d258c5274cfebe109a2917d4b2e8" img_width="400" img_height="262" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>钟汉良当了5年的歌手都没有什么水花,直到2000年后,他开始转向内地拍剧,才开始小有名气。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/73663a4cf05841d694c42b3cb9d91be3" img_width="524" img_height="345" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实这期间他并不是没有尝试再回到舞台上,2006年他就参加了《舞林大会》,但这时候大家已经开始淡忘他曾是一位唱跳歌手。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/dcc4137db0a24d28b5595ca771a2e8f5" img_width="1311" img_height="684" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>直到2015年,他成立自己的工作室,再次重拾歌手的身份,除了出个人音乐专辑外,还开过演唱会,如今加盟《这就是街舞》,可以尽情展示他的舞技,也算是一场多年来当舞者的夙愿。</p><p>他在街舞也确实玩的很开心。虽然依然有质疑的声音出现,但相信在后面的节目中能在他身上看到更多惊喜。</p><p>#钟汉良暗箱操作#、#钟汉良跳舞#、#钟汉良王一博#</p><p>作者:每天都想吃榴莲</p><p>责编:阿叉</p>'}
_id = "test0"
......
......@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
def weibo(releaserUrl, **kwargs):
    """Extract the Weibo releaser id and container id from a URL or bare id.

    The input is matched against these forms, in order:

    * contains ``/u/``: the digits after ``/u/`` are both ids.
    * contains ``/p/``: the digits after ``/p/`` are the container id; when
      that run is 15 or more characters long its first 6 characters are
      dropped to form the releaser id, otherwise both ids are the same.
    * contains ``/``: the first run of digits anywhere in the text is both ids.
    * no ``/`` at all: the text must parse as an integer and is then used
      (whitespace-stripped) as both ids.

    Args:
        releaserUrl: Profile URL or numeric id, as a string.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(releaser_id, containerid)`` tuple of strings, or ``None`` when
        no id can be extracted.
    """
    try:
        if "/u/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/u/(\d+)", releaserUrl)[0]
        elif "/p/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/p/(\d+)", releaserUrl)[0]
            if len(releaser_id) >= 15:
                # Long page ids carry a 6-character prefix before the user id.
                releaser_id = releaser_id[6:]
        elif "/" in releaserUrl:
            releaser_id = containerid = re.findall(r"(\d+)", releaserUrl)[0]
        else:
            # Bare id: int() is used purely as validation.  The previous code
            # stored the result under a misspelled name, so this branch could
            # never return an id.
            int(releaserUrl)
            releaser_id = containerid = releaserUrl.strip()
        return releaser_id, containerid
    except (IndexError, TypeError, ValueError, AttributeError):
        # IndexError: no digits matched; ValueError: bare text is not numeric;
        # TypeError/AttributeError: input is not a string.
        return None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment