Commit 044201c0 authored by litaolemo

update 新增热点帖子抽取存库脚本

parent 699ae037
...@@ -247,14 +247,18 @@ def push_data_to_user(res_data: Dict) -> Dict: ...@@ -247,14 +247,18 @@ def push_data_to_user(res_data: Dict) -> Dict:
:return: :return:
""" """
qiniu_img_list = [] qiniu_img_list = []
content = ""
if res_data["img_list"]: if res_data["img_list"]:
for img_url in res_data["img_list"]: for img_url in res_data["img_list"]:
try:
img_wb = retry_get_url(img_url).content img_wb = retry_get_url(img_url).content
res = upload(img_wb) res = upload(img_wb)
print(res) print(res)
img_info = retry_get_url(res + "-imageinfo") img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json() img_info_json = img_info.json()
qiniu_img_list.append((res, img_info_json)) qiniu_img_list.append((res+"-w", img_info_json))
except:
return {}
# 替换图片 # 替换图片
if res_data["platform"] == "weibo": if res_data["platform"] == "weibo":
...@@ -263,8 +267,10 @@ def push_data_to_user(res_data: Dict) -> Dict: ...@@ -263,8 +267,10 @@ def push_data_to_user(res_data: Dict) -> Dict:
content = res_data.get("content") content = res_data.get("content")
if content: if content:
for count, img_url in enumerate(res_data["img_list"]): for count, img_url in enumerate(res_data["img_list"]):
content.replace(img_url, qiniu_img_list[count][0]) # print(qiniu_img_list[count][0])
content = content.replace(img_url, qiniu_img_list[count][0])
res_data["qiniu_img_list"] = qiniu_img_list res_data["qiniu_img_list"] = qiniu_img_list
res_data["content"] = content
if res_data["platform"] == "weibo": if res_data["platform"] == "weibo":
res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True) res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True)
res_data["title"] = "" res_data["title"] = ""
...@@ -297,7 +303,7 @@ img_type = { ...@@ -297,7 +303,7 @@ img_type = {
def write_data_into_mysql(res_data): def write_data_into_mysql(res_data):
now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
data = push_data_to_user(res_data) data = push_data_to_user(res_data)
if data.get("title"): try:
sql_query = """insert into api_tractate sql_query = """insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title) (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format( values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
...@@ -306,8 +312,6 @@ def write_data_into_mysql(res_data): ...@@ -306,8 +312,6 @@ def write_data_into_mysql(res_data):
is_excellent=0, create_time=now_str, is_excellent=0, create_time=now_str,
last_modified=now_str, user_del=0, last_modified=now_str, user_del=0,
low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"]) low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=1, title=data["title"])
try:
res = cur.execute(sql_query) res = cur.execute(sql_query)
tractate_id = int(conn.insert_id()) tractate_id = int(conn.insert_id())
if res: if res:
...@@ -322,15 +326,38 @@ def write_data_into_mysql(res_data): ...@@ -322,15 +326,38 @@ def write_data_into_mysql(res_data):
else: else:
image_url_source = 1 image_url_source = 1
try: try:
image_type = img_type.get(img_info[0].split(".")[-1].upper()) image_type = img_type.get(img_info[1]["format"].upper())
except: except:
image_type = 1 image_type = 1
try:
width = img_info[1]["width"]
height = img_info[1]["height"]
except:
width = 0
height = 0
try:
if img_type == 7:
sql_query = """ sql_query = """
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,image_webp,create_time,update_time) insert into api_tractate_images (tractate_id,image_url,width,image_webp,height,image_url_source,image_type,image_webp,create_time,update_time)
values ({tractate_id},{image_url},{width},{height},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}') values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=img_info[1]["width"], """.format(tractate_id=tractate_id, image_url=img_info[0], width=width,
height=img_info[1]["heigh"], image_url_source=image_url_source, image_type=image_type, height=height, image_url_source=image_url_source,
image_type=image_type,image_webp=img_info[0],
create_time=now_str, update_time=now_str) create_time=now_str, update_time=now_str)
else:
sql_query = """
insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,create_time,update_time)
values ({tractate_id},'{image_url}',{width},{height},{image_url_source},{image_type},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=width,
height=height, image_url_source=image_url_source, image_type=image_type,
create_time=now_str, update_time=now_str)
res = cur.execute(sql_query)
if res:
conn.commit()
except Exception as e:
print("commit error %s" % e)
conn.rollback()
def task_main(): def task_main():
...@@ -344,7 +371,8 @@ def task_main(): ...@@ -344,7 +371,8 @@ def task_main():
# 循环处理抓取数据,返回需要添加至后端的数据 # 循环处理抓取数据,返回需要添加至后端的数据
for res_data in scan_from_redis(rules_list): for res_data in scan_from_redis(rules_list):
write_data_into_mysql(res_data) write_data_into_mysql(res_data)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test)
cur.close() cur.close()
conn.close() conn.close()
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment