Commit 699ae037 authored by litaolemo's avatar litaolemo

update

parent 2838ff4e
......@@ -5,4 +5,4 @@
3. source /root/anaconda3/bin/activate
4. 创建虚拟环境 conda activate crawler_env/conda deactivate
5. 抓取程序 python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
6. 写入抓取url程序 python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5
\ No newline at end of file
6. 写入抓取url程序 python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
\ No newline at end of file
......@@ -33,6 +33,7 @@ def feed_url_into_redis(dict_Lst, platform,
print('Pushed data into redis')
return True
def pull_url_from_es(platform, release_time_lower_bdr=None):
"""
Just pull urls from es index crawler-url-register.
......
......@@ -9,12 +9,28 @@ import datetime
import time
import sys
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from gm_upload import upload, upload_file
import requests
import os
import copy
import re
import HTMLParser
import pymysql
from crawler.crawler_sys.utils.output_results import retry_get_url
from lxml import html
from lxml.html.clean import Cleaner
import random
# from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
db='mimas_test', charset='utf8')
cur = conn.cursor()
class push_rule(object):
def __init__(self, repost_count_ni=None, comment_count_ni=None, favorite_count_ni=None, time_range=5):
def __init__(self, repost_count_ni=None, comment_count_ni=None, favorite_count_ni=None, time_range=5, level=3):
"""
传入增量计算规则
如 5分钟点赞量增长200
......@@ -29,6 +45,7 @@ class push_rule(object):
self._comment_count_ni = comment_count_ni
self._favorite_count_ni = favorite_count_ni
self._time_range = time_range
self.level = level
try:
self.repost_per_min = self._repost_count_ni / time_range
except:
......@@ -89,8 +106,12 @@ class push_rule(object):
return False
def scan_from_redis(push_rule_class):
def scan_from_redis(push_rule_class_list) -> Dict:
# len_id_list = rds.llen("doc_id")
set_name = "exists_doc_id_set_%s" % datetime.datetime.now().strftime("%Y-%m-%d")
rds.sadd(set_name, "test")
rds.expire(set_name, 259200)
out_ts = datetime.datetime.now().timestamp() * 1e3 - 86400000
while True:
doc_id = rds.lpop("doc_id")
if doc_id:
......@@ -110,29 +131,222 @@ def scan_from_redis(push_rule_class):
comment_count = one_data.get("comment_count")
favorite_count = one_data.get("favorite_count")
continue
for push_bool in push_rule_class_list:
bool_res = push_bool.parse_data(fetch_time_last=fetch_time, repost_count_last=repost_count,
comment_count_last=comment_count,
favorite_count_last=favorite_count,
comment_count=one_data.get("comment_count"),
favorite_count=one_data.get("favorite_count"),
repost_count=one_data.get("repost_count"), parse_mode="and",
fetch_time=one_data.get("fetch_time"))
# print(bool_res)
if bool_res:
one_data["level"] = push_bool.level
if one_data["release_time"] < out_ts:
continue
set_name = "exists_doc_id_set_%s" % datetime.datetime.fromtimestamp(
one_data["release_time"] / 1e3).strftime("%Y-%m-%d")
if rds.sismember(set_name, one_data["doc_id"]):
continue
else:
rds.sadd(set_name, one_data["doc_id"])
yield one_data
bool_res = push_rule_class.parse_data(fetch_time_last=fetch_time, repost_count_last=repost_count,
comment_count_last=comment_count,
favorite_count_last=favorite_count,
comment_count=one_data.get("comment_count"),
favorite_count=one_data.get("favorite_count"),
repost_count=one_data.get("repost_count"), parse_mode="and",
fetch_time=one_data.get("fetch_time"))
print(bool_res)
if bool_res:
pass
print(res_list)
print(res_list)
# else:
# time.sleep(1)
else:
time.sleep(5)
# HTML tag whitelists used by gm_convert_html_tags when cleaning rich text.
WHITE_TAGS = {
"basic": ["div", "p", "span", "img", "br", "video", 'a'], # tentatively used for the mini-program and for crawled data
"all": [
"div", "p", "span", "img", "br", "video", "audio", "a", "b", "strong", "i", "ul", "ol", "li", "em", "h1",
"h2", "h3", "h4", "h5", "h6", "iframe",
] # every whitelisted tag that may be displayed
}
def _get_rich_text(rich_text):
"""
富文本标签转成标签
:param rich_text:
:return:
"""
try:
h = HTMLParser.HTMLParser()
rich_text = h.unescape(rich_text.decode("utf-8").replace("&amp;", "&").replace("\n", "<br>")) # 富文本标签转成标签对象
return rich_text
except:
return rich_text
def gm_convert_html_tags(rich_text, all_tags=False, remove_tags=None):
    """
    Re-clean rich text, stripping every tag and attribute that is not whitelisted.

    :param rich_text: rich text (HTML) to clean
    :param all_tags: use the full ``WHITE_TAGS["all"]`` whitelist instead of
        ``WHITE_TAGS["basic"]``
    :param remove_tags: whitelisted tags that should nevertheless be dropped
    :return: the cleaned HTML as ``str``; ``""`` for empty input or when the
        whole fragment is a single link that has to be removed
    """
    if not rich_text:
        return ""

    # Tag cleaning: pick the whitelist and subtract the caller's exclusions.
    tags = WHITE_TAGS["all"] if all_tags else WHITE_TAGS["basic"]
    if remove_tags:
        tags = [tag for tag in tags if tag not in remove_tags]

    # NOTE(review): "class"/"style" are only allowed when "a" is NOT among the
    # tags (elif) - kept as in the original; confirm this is intended.
    safe_attrs = ["src"]
    if "a" in tags:
        safe_attrs.append("href")
    elif all_tags:
        safe_attrs.extend(["class", "style"])

    kw = {
        "remove_unknown_tags": False,
        "allow_tags": tags,
        "safe_attrs": safe_attrs,
    }
    if "iframe" in tags:
        # Keep embedded content so whitelisted iframes are not stripped.
        kw["embedded"] = False

    rich_text = Cleaner(**kw).clean_html(rich_text)

    # Add presentation attributes after cleaning so they are not filtered out.
    element_obj = html.fromstring(rich_text)
    for element in element_obj.xpath(u"//img|//video"):
        if not all_tags:  # mini-program, ordinary users, crawled data
            element.attrib["width"] = "100%"  # images/videos span the full width
        if element.tag == "video" and all_tags:
            element.attrib["class"] = "js_richtext_video"

    # Remove links whose target does not start with gengmei://
    for item in element_obj.xpath("//a[not(starts-with(@href, 'gengmei://'))]"):
        if item is element_obj:
            # The fragment is nothing but a disallowed link: nothing remains.
            return ""
        # drop_tree() removes the link and its children but, unlike
        # getparent().remove(), keeps the text that follows the link.
        item.drop_tree()

    # Colour the remaining links.
    for item in element_obj.xpath("//a"):
        item.attrib["style"] = 'color:#3FB5AF'

    return html.tostring(element_obj, encoding="unicode")
def push_data_to_user(res_data: Dict) -> Dict:
    """
    Convert one crawled record into the shape that can be stored.

    Every URL in ``res_data["img_list"]`` is downloaded with ``retry_get_url``
    and re-uploaded with ``upload``; the resulting ``(uploaded_url, image_info)``
    pairs are stored under ``res_data["qiniu_img_list"]``. For douban records
    the original image URLs inside ``content`` are replaced by the uploaded
    ones. Finally ``content`` is cleaned with ``gm_convert_html_tags``; for
    weibo the title is used as content and the title is blanked.

    :param res_data: crawled record; must contain ``platform`` (and ``title``
        for weibo). It is modified in place.
    :return: the same ``res_data`` dict
    """
    platform = res_data["platform"]
    img_list = res_data.get("img_list")
    qiniu_img_list = []
    if img_list:
        for img_url in img_list:
            img_wb = retry_get_url(img_url).content
            res = upload(img_wb)
            print(res)
            # presumably "-imageinfo" is a storage-side style suffix that
            # returns image metadata as JSON - TODO confirm
            img_info_json = retry_get_url(res + "-imageinfo").json()
            qiniu_img_list.append((res, img_info_json))

        # Swap the source image URLs for the uploaded ones.
        if platform == "douban":
            content = res_data.get("content")
            if content:
                for img_url, (uploaded_url, _info) in zip(img_list, qiniu_img_list):
                    # str.replace returns a new string; the result must be kept.
                    content = content.replace(img_url, uploaded_url)
                res_data["content"] = content
        if platform in ("weibo", "douban"):
            res_data["qiniu_img_list"] = qiniu_img_list

    if platform == "weibo":
        res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True)
        res_data["title"] = ""
    elif platform == "douban":
        res_data["content"] = gm_convert_html_tags(res_data.get("content"), all_tags=True)
    return res_data
# Pool of user ids; write_data_into_mysql picks one with random.choice as the
# user_id of each row it inserts.
user_id_list = [33524704, 33524711, 33524716, 33524731, 33524740, 33524697, 33524707, 33524712, 33524717, 33524724,
33524755, 33524762, 33524779, 33524766, 33524782]
# Numeric image-type codes keyed by upper-case format name.
# write_data_into_mysql looks up the upper-cased file extension of an uploaded
# image URL in this mapping.
img_type = {
"OTHER": 1,
# any other image type
"GIF": 2,
# animated GIF
"JPG": 3,
# JPG image
"JPEG": 4,
# JPEG image
"PNG": 5,
# PNG image
"BMP": 6,
# BMP bitmap
"WEBP": 7,
# WEBP image
"TIFF": 8,
# TIFF image
}
def write_data_into_mysql(res_data):
    """
    Store one crawled record in MySQL.

    The record is first normalised by ``push_data_to_user``. A row is inserted
    into ``api_tractate``; when that succeeds, one row per uploaded image is
    inserted into ``api_tractate_images``. All values are passed as query
    parameters because ``content``/``title`` are scraped text and routinely
    contain quotes.

    :param res_data: crawled record as yielded by ``scan_from_redis``
    :return: None
    """
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    data = push_data_to_user(res_data)

    # NOTE(review): only records with a non-empty title are inserted, yet
    # push_data_to_user blanks the title of weibo records - confirm whether
    # title-less records should be inserted without the title column.
    tractate_id = None
    if data.get("title"):
        sql_query = (
            "insert into api_tractate "
            "(user_id,content,is_online,status,platform,content_level,is_excellent,"
            "create_time,last_modified,user_del,low_quality,low_quality_deal,"
            "platform_id,pgc_type,title) "
            "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        params = (
            random.choice(user_id_list),  # user_id
            data["content"],              # content
            1,                            # is_online
            2,                            # status
            3,                            # platform
            data["level"],                # content_level
            0,                            # is_excellent
            now_str,                      # create_time
            now_str,                      # last_modified
            0,                            # user_del
            0,                            # low_quality
            0,                            # low_quality_deal
            data["doc_id"],               # platform_id
            1,                            # pgc_type
            data["title"],                # title
        )
        try:
            res = cur.execute(sql_query, params)
            inserted_id = int(conn.insert_id())
            if res:
                conn.commit()
                tractate_id = inserted_id
        except Exception as e:
            print("commit error %s" % e)
            conn.rollback()

    # Without a parent row there is nothing the images could be attached to.
    if tractate_id is None:
        return

    content = data.get("content") or ""
    image_sql = (
        "insert into api_tractate_images "
        "(tractate_id,image_url,width,height,image_url_source,image_type,"
        "image_webp,create_time,update_time) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    for image_url, image_info in data.get("qiniu_img_list") or []:
        # 2 when the uploaded image is referenced inside the content, else 1.
        image_url_source = 2 if image_url in content else 1
        # Unknown extensions fall back to the "OTHER" code instead of None.
        image_type = img_type.get(image_url.split(".")[-1].upper(), img_type["OTHER"])
        try:
            cur.execute(image_sql, (
                tractate_id,
                image_url,
                image_info["width"],
                image_info["height"],
                image_url_source,
                image_type,
                "",  # NOTE(review): no webp variant is produced here - confirm the expected value
                now_str,
                now_str,
            ))
            conn.commit()
        except Exception as e:
            print("commit error %s" % e)
            conn.rollback()
def task_main():
    """
    Entry point: evaluate the push rules against crawled data and store hits.

    Builds the rule list, iterates over the records yielded by
    ``scan_from_redis`` and writes each one with ``write_data_into_mysql``.
    The module-level cursor and connection are closed when the loop ends,
    including when it ends with an exception.
    """
    # Instantiate the rules. Order matters: higher priority first, lower last.
    push_rule_class1 = push_rule(favorite_count_ni=0.000000001, time_range=5, level=5)
    push_rule_class2 = push_rule(comment_count_ni=0.0000000001, time_range=5, level=3)
    rules_list = [
        push_rule_class1,
        push_rule_class2,
    ]
    try:
        # Process crawled data; every yielded record has to be sent to the backend.
        for res_data in scan_from_redis(rules_list):
            write_data_into_mysql(res_data)
    finally:
        cur.close()
        conn.close()
task_main()
# -*- coding:UTF-8 -*-
# @Time : 2020/7/28 16:31
# @File : __init__.py.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
......@@ -80,7 +80,7 @@ class CrawlerDouban():
comment_count = trans_play_count(page_json["comments_count"])
favorite_count = trans_play_count(page_json["like_count"])
collection_count = trans_play_count(page_json["collections_count"])
img_list = re.findall('img src=".*?"',content)
img_list = re.findall(r'"(http.*?[jpg|webp]{1}?)"',content)
dic = {
"content":content,
"repost_count":repost_count,
......@@ -185,7 +185,7 @@ class CrawlerDouban():
res_dic["doc_id"] = doc_id
res_dic.update(self.get_single_page(mid,proxies_num))
# print(res_dic)
yield res_dic
except Exception as e:
print("single data parse error %s " %e)
......
git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
lxml==4.5.1
requests==2.23.0
tqdm==4.46.1
......@@ -5,7 +6,6 @@ absl-py==0.9.0
kdl==0.2.15
redis==3.5.3
elasticsearch==7.8.0
qiniu==7.2.8
aiohttp==3.6.2
bs4==4.9.1
selenium==3.141.0
......@@ -13,3 +13,4 @@ fontTools==4.13.0
numpy==1.19.1
pandas==1.0.5
pymysql==0.10.0
qiniu==7.1.4
\ No newline at end of file
......@@ -18,58 +18,104 @@ import datetime
import elasticsearch
from write_data_into_es.func_cal_doc_id import *
import urllib
from urllib.parse import parse_qs,urlparse
from urllib.parse import parse_qs, urlparse
# from func_get_releaser_id import *
hosts = '172.16.32.37'
port = 9200
user = 'zhouyujiang'
passwd = '8tM9JDN2LVxM'
http_auth = (user, passwd)
es = elasticsearch.Elasticsearch(hosts=hosts, port=port)
def write_es(target_index,platform="short_video"):
count = 0
bulk_all_body = ""
if True:
dic = {
'platform': 'toutiao', 'title': '45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看',
'url': 'http://toutiao.com/group/6851819988165394958/?iid=0&app=news_article', 'duration': 0,
'releaser': '盖饭娱乐官方号', 'play_count': 14350, 'favorite_count': 43, 'comment_count': 131, 'repost_count': 2,
'data_provider': 'gengmei',
'releaserUrl': 'https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=103497952048&media_id=1609675594821640&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article',
'release_time': 1595315782000, 'fetch_time': 1595384097955,
'abstract': '《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数。',
'article_id': 6851819988165394958, 'releaser_id_str': 'toutiao_103497952048',
'video_img': 'http://p26-tt.byteimg.com/img/pgc-image/7448338f2712460d968e65062b249a40~720x380_cs.webp',
'aid': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
'content': '<h1 class="pgc-h-arrow-right">饿了吗?戳右边关注我们,每天给您送上最新出炉的娱乐硬核大餐!</h1><p>上周六(7月18日)《这就是街舞》火热开播,引来无数热爱街舞的小伙伴前来观看,除了选手们精彩的表演,最令人期待的还是四位导师的表现。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/1cde4effb617411fb66467228f5fc20e" img_width="494" img_height="585" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数(选手晋级票)。</p><p>最终,张艺兴以第一名的成绩拿到最多毛巾,而45岁的钟汉良力压90后王一博成为倒数第二。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f7d980bc497a4f98963d76dbd5ec08cc" img_width="1582" img_height="887" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>7月21日,有网友发帖质疑,是因为王一博年纪最小,红的时间最短所以用他挽尊吗?并质疑钟汉良是否暗箱操作。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/ba8cd99c5cb2429aa46d99c5a5f62918" img_width="636" img_height="126" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>该网友提出两个疑点,一是投票阶段的视频虽然经过剪辑,却能看出最开始的时候都没人投钟汉良,最后是怎么比王一博多的?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/e0a38a7a89bc4763b551b5011def58f1" img_width="619" img_height="403" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>二是投票结束时钟汉良表情凝重,而王一博表情轻松,看起来很有信心。钟汉良有在节目里说能用耳朵听见有多少人投自己。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/546810450d7b4f0c929479eea207b3ad" img_width="855" img_height="465" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实网友有这样的猜想并不奇怪,在节目播出时便有圈内人发博表示搞不懂为什么王一博会垫底。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/0eca18a8e15243e0b03ff1e8cd90f0af" img_width="919" img_height="384" inline="0" 
alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>而对钟汉良是否能胜任《这街》导师身份的话题也持续讨论未断过。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/8831a69ccbef4b8fae65497356fedfcb" img_width="775" img_height="323" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>在大多数人心中,张艺兴、王嘉尔、王一博三个年轻人都是在街舞方面有所造诣的唱跳艺人。</p><p>钟汉良不是演员吗?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f55235255cee43dd8fce9f091f987176" img_width="879" img_height="518" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>诚然,如今已有45岁的钟汉良在跳舞方面已经不如年轻人有爆发力,但节目中也可以看出他有跳街舞的基础。</p><p>许多人不知道的是,他出道前其实是无线的舞蹈艺人,没靠脸吃饭的时候,他也是拥有一身舞艺,不一定比王一博差。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/7448338f2712460d968e65062b249a40" img_width="759" img_height="502" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1993年钟汉良加入TVB的舞蹈艺员训练班,学习现代舞、民族舞、芭蕾舞,而他最擅长的是拉丁舞,当时曾为梅艳芳、张国荣等天王巨星担任伴舞,更有一个当芭蕾舞者的梦想。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a5d6f1c59bad422eb4631b5d6a8feadc" img_width="506" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>但因为他生的白净,身高183cm,长相帅气,于是很快被调至艺员部拍电视剧。那段时间他连续出演《少年五虎》、《刑事侦缉档案》、《第三类法庭》为自己积累了不少观众缘。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a05a5086d9164c909058276ce4a9f3f5" img_width="594" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1995年,他获台湾唱作人邰正宵发掘,赴台湾发展并成为唱跳歌手,并出了自己的首张专辑《OREA》。</p><div class="pgc-img"><img 
src="https://p3.pstatp.com/large/pgc-image/59f1d258c5274cfebe109a2917d4b2e8" img_width="400" img_height="262" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>钟汉良当了5年的歌手都没有什么水花,直到2000年后,他开始转向内地拍剧,才开始小有名气。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/73663a4cf05841d694c42b3cb9d91be3" img_width="524" img_height="345" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实这期间他并不是没有尝试再回到舞台上,2006年他就参加了《舞林大会》,但这时候大家已经开始淡忘他曾是一位唱跳歌手。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/dcc4137db0a24d28b5595ca771a2e8f5" img_width="1311" img_height="684" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>直到2015年,他成立自己的工作室,再次重拾歌手的身份,除了出个人音乐专辑外,还开过演唱会,如今加盟《这就是街舞》,可以尽情展示他的舞技,也算是一场多年来当舞者的夙愿。</p><p>他在街舞也确实玩的很开心。虽然依然有质疑的声音出现,但相信在后面的节目中能在他身上看到更多惊喜。</p><p>#钟汉良暗箱操作#、#钟汉良跳舞#、#钟汉良王一博#</p><p>作者:每天都想吃榴莲</p><p>责编:阿叉</p>'}
_id = "test0"
bulk_head = '{"index": {"_id":"%s"}}' % _id
data_str = json.dumps(dic, ensure_ascii=False)
bulk_one_body = bulk_head + '\n' + data_str + '\n'
bulk_all_body += bulk_one_body
count += 1
if count % 500 == 0:
eror_dic = es.bulk(index=target_index,
body=bulk_all_body, request_timeout=500)
bulk_all_body = ''
if eror_dic['errors'] is True:
print(eror_dic['items'])
print(bulk_all_body)
print(count)
# hosts = '172.16.32.37'
# port = 9200
# user = 'zhouyujiang'
# passwd = '8tM9JDN2LVxM'
# http_auth = (user, passwd)
# es = elasticsearch.Elasticsearch(hosts=hosts, port=port)
#
#
# def write_es(target_index,platform="short_video"):
# count = 0
# bulk_all_body = ""
#
# if True:
# dic = {
# 'platform': 'toutiao', 'title': '45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看',
# 'url': 'http://toutiao.com/group/6851819988165394958/?iid=0&app=news_article', 'duration': 0,
# 'releaser': '盖饭娱乐官方号', 'play_count': 14350, 'favorite_count': 43, 'comment_count': 131, 'repost_count': 2,
# 'data_provider': 'gengmei',
# 'releaserUrl': 'https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=103497952048&media_id=1609675594821640&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article',
# 'release_time': 1595315782000, 'fetch_time': 1595384097955,
# 'abstract': '《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数。',
# 'article_id': 6851819988165394958, 'releaser_id_str': 'toutiao_103497952048',
# 'video_img': 'http://p26-tt.byteimg.com/img/pgc-image/7448338f2712460d968e65062b249a40~720x380_cs.webp',
# 'aid': 'toutiao_6851819988165394958', 'high_quality_flag': 0, 'releaser_followers_count': 264759,
# 'content': '<h1 class="pgc-h-arrow-right">饿了吗?戳右边关注我们,每天给您送上最新出炉的娱乐硬核大餐!</h1><p>上周六(7月18日)《这就是街舞》火热开播,引来无数热爱街舞的小伙伴前来观看,除了选手们精彩的表演,最令人期待的还是四位导师的表现。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/1cde4effb617411fb66467228f5fc20e" img_width="494" img_height="585" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数(选手晋级票)。</p><p>最终,张艺兴以第一名的成绩拿到最多毛巾,而45岁的钟汉良力压90后王一博成为倒数第二。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f7d980bc497a4f98963d76dbd5ec08cc" img_width="1582" img_height="887" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>7月21日,有网友发帖质疑,是因为王一博年纪最小,红的时间最短所以用他挽尊吗?并质疑钟汉良是否暗箱操作。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/ba8cd99c5cb2429aa46d99c5a5f62918" img_width="636" img_height="126" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>该网友提出两个疑点,一是投票阶段的视频虽然经过剪辑,却能看出最开始的时候都没人投钟汉良,最后是怎么比王一博多的?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/e0a38a7a89bc4763b551b5011def58f1" img_width="619" img_height="403" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>二是投票结束时钟汉良表情凝重,而王一博表情轻松,看起来很有信心。钟汉良有在节目里说能用耳朵听见有多少人投自己。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/546810450d7b4f0c929479eea207b3ad" img_width="855" img_height="465" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实网友有这样的猜想并不奇怪,在节目播出时便有圈内人发博表示搞不懂为什么王一博会垫底。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/0eca18a8e15243e0b03ff1e8cd90f0af" img_width="919" img_height="384" inline="0" 
alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>而对钟汉良是否能胜任《这街》导师身份的话题也持续讨论未断过。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/8831a69ccbef4b8fae65497356fedfcb" img_width="775" img_height="323" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>在大多数人心中,张艺兴、王嘉尔、王一博三个年轻人都是在街舞方面有所造诣的唱跳艺人。</p><p>钟汉良不是演员吗?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f55235255cee43dd8fce9f091f987176" img_width="879" img_height="518" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>诚然,如今已有45岁的钟汉良在跳舞方面已经不如年轻人有爆发力,但节目中也可以看出他有跳街舞的基础。</p><p>许多人不知道的是,他出道前其实是无线的舞蹈艺人,没靠脸吃饭的时候,他也是拥有一身舞艺,不一定比王一博差。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/7448338f2712460d968e65062b249a40" img_width="759" img_height="502" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1993年钟汉良加入TVB的舞蹈艺员训练班,学习现代舞、民族舞、芭蕾舞,而他最擅长的是拉丁舞,当时曾为梅艳芳、张国荣等天王巨星担任伴舞,更有一个当芭蕾舞者的梦想。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a5d6f1c59bad422eb4631b5d6a8feadc" img_width="506" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>但因为他生的白净,身高183cm,长相帅气,于是很快被调至艺员部拍电视剧。那段时间他连续出演《少年五虎》、《刑事侦缉档案》、《第三类法庭》为自己积累了不少观众缘。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a05a5086d9164c909058276ce4a9f3f5" img_width="594" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1995年,他获台湾唱作人邰正宵发掘,赴台湾发展并成为唱跳歌手,并出了自己的首张专辑《OREA》。</p><div class="pgc-img"><img 
src="https://p3.pstatp.com/large/pgc-image/59f1d258c5274cfebe109a2917d4b2e8" img_width="400" img_height="262" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>钟汉良当了5年的歌手都没有什么水花,直到2000年后,他开始转向内地拍剧,才开始小有名气。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/73663a4cf05841d694c42b3cb9d91be3" img_width="524" img_height="345" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实这期间他并不是没有尝试再回到舞台上,2006年他就参加了《舞林大会》,但这时候大家已经开始淡忘他曾是一位唱跳歌手。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/dcc4137db0a24d28b5595ca771a2e8f5" img_width="1311" img_height="684" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>直到2015年,他成立自己的工作室,再次重拾歌手的身份,除了出个人音乐专辑外,还开过演唱会,如今加盟《这就是街舞》,可以尽情展示他的舞技,也算是一场多年来当舞者的夙愿。</p><p>他在街舞也确实玩的很开心。虽然依然有质疑的声音出现,但相信在后面的节目中能在他身上看到更多惊喜。</p><p>#钟汉良暗箱操作#、#钟汉良跳舞#、#钟汉良王一博#</p><p>作者:每天都想吃榴莲</p><p>责编:阿叉</p>'}
#
# _id = "test0"
# bulk_head = '{"index": {"_id":"%s"}}' % _id
# data_str = json.dumps(dic, ensure_ascii=False)
# bulk_one_body = bulk_head + '\n' + data_str + '\n'
# bulk_all_body += bulk_one_body
# count += 1
# if count % 500 == 0:
# eror_dic = es.bulk(index=target_index,
# body=bulk_all_body, request_timeout=500)
# bulk_all_body = ''
# if eror_dic['errors'] is True:
# print(eror_dic['items'])
# print(bulk_all_body)
# print(count)
#
# if bulk_all_body != '':
# eror_dic = es.bulk(body=bulk_all_body,
# index=target_index,
# request_timeout=500)
# print(eror_dic)
# if eror_dic['errors'] is True:
# print(eror_dic)
# bulk_all_body = ''
# # print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url))
#
#
# write_es("crawler-data-raw_v1")
# import pymysql
#
# conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
# db='mimas_test', charset='utf8')
# cur = conn.cursor()
# res_data = {'release_time': 1595952088000, 'fetch_time': 1595989493952,
# 'url': 'https://www.douban.com/group/topic/186708104/', 'releaser': '易燃易暴食', 'repost_count': 6,
# 'comment_count': 153, 'favorite_count': 27, 'title': '李子柒明明就又黑又糙',
# 'releaserUrl': 'https://www.douban.com/people/129156222', 'releaser_id_str': 'douban_129156222',
# 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684400.webp', 'mid': '186708104',
# 'platform': 'douban', 'doc_id': 'douban_186708104',
# 'content': '<div><div><div><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684400.webp"></div></div><div><div><img src="https://img1.doubanio.com/view/group_topic/l/public/p317684407.webp"></div></div><div><div><img src="https://img1.doubanio.com/view/group_topic/l/public/p317684397.webp"></div></div><div><div><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684404.webp"></div></div><div><div><img src="https://img1.doubanio.com/view/group_topic/l/public/p317684409.webp"></div></div><div><div><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684401.webp"></div></div></div>',
# 'collection_count': 8,
# 'img_list': ['img src="https://img3.doubanio.com/view/group_topic/l/public/p317684400.webp"',
# 'img src="https://img1.doubanio.com/view/group_topic/l/public/p317684407.webp"',
# 'img src="https://img1.doubanio.com/view/group_topic/l/public/p317684397.webp"',
# 'img src="https://img9.doubanio.com/view/group_topic/l/public/p317684404.webp"',
# 'img src="https://img1.doubanio.com/view/group_topic/l/public/p317684409.webp"',
# 'img src="https://img3.doubanio.com/view/group_topic/l/public/p317684401.webp"']}
# sql_query = """insert into api_tractate
# (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type)
# values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type});""".format(user_id=33524706, content=res_data["content"], is_online=1, status=2, platform=3, content_level=3,
# is_excellent=0, create_time=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
# last_modified=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), user_del=0,
# low_quality=0, low_quality_deal=0, platform_id=res_data["doc_id"], pgc_type=1)
# res = cur.execute(sql_query)
# # print("最新ID为", int(cur.lastrowid))
# print("插入数据的ID", int(conn.insert_id()))
# # conn.commit()
# conn.commit()
# print(conn)
if bulk_all_body != '':
eror_dic = es.bulk(body=bulk_all_body,
index=target_index,
request_timeout=500)
print(eror_dic)
if eror_dic['errors'] is True:
print(eror_dic)
bulk_all_body = ''
# print(platform, releaser, 'end_have:', len(wirte_set), 'add:', len(set_url))
# print(res)
# cur.ex
# result = cur.fetchall()
# print(result)
#
# sql_query = """
# insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,image_webp,create_time,update_time)
# values ({tractate_id},{image_url},{width},{height},{image_url_source},{image_type},{image_webp},{create_time},{update_time})
# """.format(tractate_id,image_url=,width=,height=,image_url_source=,image_type=,update_time=)
from gm_upload import upload, upload_file
write_es("crawler-data-raw_v1")
\ No newline at end of file
res = upload_file("D:\lemo下载\p317377995.jpg")
print(res)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment