Commit 044201c0 authored by litaolemo

update 新增热点帖子抽取存库脚本

parent 699ae037
......@@ -247,14 +247,18 @@ def push_data_to_user(res_data: Dict) -> Dict:
:return:
"""
qiniu_img_list = []
content = ""
if res_data["img_list"]:
for img_url in res_data["img_list"]:
img_wb = retry_get_url(img_url).content
res = upload(img_wb)
print(res)
img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json()
qiniu_img_list.append((res, img_info_json))
try:
img_wb = retry_get_url(img_url).content
res = upload(img_wb)
print(res)
img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json()
qiniu_img_list.append((res+"-w", img_info_json))
except:
return {}
# 替换图片
if res_data["platform"] == "weibo":
......@@ -263,8 +267,10 @@ def push_data_to_user(res_data: Dict) -> Dict:
content = res_data.get("content")
if content:
for count, img_url in enumerate(res_data["img_list"]):
content.replace(img_url, qiniu_img_list[count][0])
# print(qiniu_img_list[count][0])
content = content.replace(img_url, qiniu_img_list[count][0])
res_data["qiniu_img_list"] = qiniu_img_list
res_data["content"] = content
if res_data["platform"] == "weibo":
res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True)
res_data["title"] = ""
......@@ -297,17 +303,15 @@ img_type = {
def write_data_into_mysql(res_data):
now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
data = push_data_to_user(res_data)
if data.get("title"):
sql_query = """insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=3,
content_level=data["level"],
is_excellent=0, create_time=now_str,
last_modified=now_str, user_del=0,
low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"])
try:
sql_query = """insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
user_id=random.choice(user_id_list), content=data["content"], is_online=1, status=2, platform=3,
content_level=data["level"],
is_excellent=0, create_time=now_str,
last_modified=now_str, user_del=0,
low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"])
res = cur.execute(sql_query)
tractate_id = int(conn.insert_id())
if res:
......@@ -322,15 +326,38 @@ def write_data_into_mysql(res_data):
else:
image_url_source = 1
try:
image_type = img_type.get(img_info[0].split(".")[-1].upper())
image_type = img_type.get(img_info[1]["format"].upper())
except:
image_type = 1
sql_query = """
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,image_webp,create_time,update_time)
values ({tractate_id},{image_url},{width},{height},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=img_info[1]["width"],
height=img_info[1]["heigh"], image_url_source=image_url_source, image_type=image_type,
create_time=now_str, update_time=now_str)
try:
width = img_info[1]["width"]
height = img_info[1]["height"]
except:
width = 0
height = 0
try:
if img_type == 7:
sql_query = """
insert into api_tractate_images (tractate_id,image_url,width,image_webp,height,image_url_source,image_type,image_webp,create_time,update_time)
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=width,
height=height, image_url_source=image_url_source,
image_type=image_type,image_webp=img_info[0],
create_time=now_str, update_time=now_str)
else:
sql_query = """
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,create_time,update_time)
values ({tractate_id},'{image_url}',{width},{height},{image_url_source},{image_type},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=width,
height=height, image_url_source=image_url_source, image_type=image_type,
create_time=now_str, update_time=now_str)
res = cur.execute(sql_query)
if res:
conn.commit()
except Exception as e:
print("commit error %s" % e)
conn.rollback()
def task_main():
......@@ -344,7 +371,8 @@ def task_main():
# 循环处理抓取数据,返回需要添加至后端的数据
for res_data in scan_from_redis(rules_list):
write_data_into_mysql(res_data)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
cur.close()
conn.close()
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment