Commit fd0899b4 authored by litaolemo

update

parent 044201c0
...@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register ...@@ -11,9 +11,8 @@ from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19) rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
def feed_url_into_redis(dict_Lst, platform, def feed_url_into_redis(dict_Lst, expire=0,
release_time_lower_bdr=None, ):
batch_str=None):
""" """
release_time_lower_bdr must be an int value represent release_time_lower_bdr must be an int value represent
timestamp in milliseconds if given. timestamp in milliseconds if given.
...@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform, ...@@ -27,7 +26,8 @@ def feed_url_into_redis(dict_Lst, platform,
doc_id = data_dict['doc_id'] doc_id = data_dict['doc_id']
sadd_c = rds.lpush(doc_id, json.dumps(data_dict)) sadd_c = rds.lpush(doc_id, json.dumps(data_dict))
res = rds.lpush("doc_id", doc_id) res = rds.lpush("doc_id", doc_id)
rds.expire(doc_id,259200) if expire:
rds.expire(doc_id,expire)
except: except:
print('Failed to push data into redis') print('Failed to push data into redis')
print('Pushed data into redis') print('Pushed data into redis')
......
...@@ -224,7 +224,8 @@ def single_thead(processe,name): ...@@ -224,7 +224,8 @@ def single_thead(processe,name):
filepath=None, filepath=None,
output_to_es_raw=output_to_es_raw, output_to_es_raw=output_to_es_raw,
es_index=es_index, es_index=es_index,
output_to_es_register=output_to_es_register) output_to_es_register=output_to_es_register,
expire=86400)
print(len(data_list)) print(len(data_list))
data_list.clear() data_list.clear()
......
...@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict: ...@@ -256,13 +256,15 @@ def push_data_to_user(res_data: Dict) -> Dict:
print(res) print(res)
img_info = retry_get_url(res + "-imageinfo") img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json() img_info_json = img_info.json()
qiniu_img_list.append((res+"-w", img_info_json)) qiniu_img_list.append((res + "-w", img_info_json))
except: except:
return {} return {}
# 替换图片 # 替换图片
if res_data["platform"] == "weibo": if res_data["platform"] == "weibo":
res_data["qiniu_img_list"] = qiniu_img_list res_data["qiniu_img_list"] = qiniu_img_list
if "http://t.cn/" in res_data["title"]:
res_data["title"] = res_data["title"].split("http://t.cn/")[0]
elif res_data["platform"] == "douban": elif res_data["platform"] == "douban":
content = res_data.get("content") content = res_data.get("content")
if content: if content:
...@@ -300,9 +302,12 @@ img_type = { ...@@ -300,9 +302,12 @@ img_type = {
# "TIFF图片类型") # "TIFF图片类型")
} }
def write_data_into_mysql(res_data): def write_data_into_mysql(res_data):
now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# 清洗数据为可以入库的格式
data = push_data_to_user(res_data) data = push_data_to_user(res_data)
tractate_id = 0
try: try:
sql_query = """insert into api_tractate sql_query = """insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title) (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
...@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data): ...@@ -342,7 +347,7 @@ def write_data_into_mysql(res_data):
values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}') values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
""".format(tractate_id=tractate_id, image_url=img_info[0], width=width, """.format(tractate_id=tractate_id, image_url=img_info[0], width=width,
height=height, image_url_source=image_url_source, height=height, image_url_source=image_url_source,
image_type=image_type,image_webp=img_info[0], image_type=image_type, image_webp=img_info[0],
create_time=now_str, update_time=now_str) create_time=now_str, update_time=now_str)
else: else:
sql_query = """ sql_query = """
...@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data): ...@@ -358,23 +363,39 @@ def write_data_into_mysql(res_data):
except Exception as e: except Exception as e:
print("commit error %s" % e) print("commit error %s" % e)
conn.rollback() conn.rollback()
if tractate_id:
return tractate_id
else:
return None
def task_main(): def task_main():
# 实例化数据判断规则 注意高优先级在前 低优先级在后 # 实例化数据判断规则 注意高优先级在前 低优先级在后
push_rule_class1 = push_rule(favorite_count_ni=0.000000001, time_range=5, level=5) push_rule_class1 = push_rule(comment_count_ni=20, time_range=5, level=5)
push_rule_class2 = push_rule(comment_count_ni=0.0000000001, time_range=5, level=3) push_rule_class2 = push_rule(comment_count_ni=5, time_range=5, level=3)
rules_list = [ rules_list = [
push_rule_class1, push_rule_class1,
push_rule_class2 push_rule_class2
] ]
# 循环处理抓取数据,返回需要添加至后端的数据 # 循环处理抓取数据,返回需要添加至后端的数据
for res_data in scan_from_redis(rules_list): for res_data in scan_from_redis(rules_list):
write_data_into_mysql(res_data) tractate_id = write_data_into_mysql(res_data)
if res_data["level"] == 5:
title_str = res_data["platform"] + "帖子内容审核"
body_str = """
问好:
有一篇新的优秀内容需要审核,帖子号为{tractate_id}
内容如下:
{content}
""".format(tractate_id=tractate_id, content=res_data["content"])
send_file_email("", "", email_group=["‎<duanyingrong@igengmei.com>"], cc_group=["litao@igengmei.com"],
email_msg_body_str=body_str, title_str=title_str)
# test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5} # test = {'release_time': 1595952037000, 'fetch_time': 1596012816514, 'url': 'https://www.douban.com/group/topic/186707979/', 'releaser': '🍫', 'repost_count': 40, 
'comment_count': 411, 'favorite_count': 144, 'title': '王俊凯终于还是举铁了', 'releaserUrl': 'https://www.douban.com/people/57762442', 'releaser_id_str': 'douban_57762442', 'video_img': 'https://img3.doubanio.com/view/group_topic/sqxs/public/p317684082.webp', 'mid': '186707979', 'platform': 'douban', 'doc_id': 'douban_186707979', 'content': '<div id=\'content\'><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp" width="500"/></div></div><div class="image-container image-float-center"><div class="image-wrapper"><img src="https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp" width="500"/></div></div><p></p></div>', 'collection_count': 107, 'img_list': ['https://img3.doubanio.com/view/group_topic/l/public/p317684082.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684064.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684093.webp', 'https://img9.doubanio.com/view/group_topic/l/public/p317684095.webp', 'https://img3.doubanio.com/view/group_topic/l/public/p317684052.webp'], 'level': 5}
# write_data_into_mysql(test) # write_data_into_mysql(test)
cur.close() cur.close()
conn.close() conn.close()
task_main() if __name__ == "__main__":
task_main()
...@@ -93,7 +93,7 @@ def output_result(result_Lst, platform, ...@@ -93,7 +93,7 @@ def output_result(result_Lst, platform,
# feed url into redis # feed url into redis
if push_to_redis: if push_to_redis:
feed_url_into_redis( feed_url_into_redis(
result_Lst, platform) result_Lst,expire=kwargs.get("expire"))
# output into file according to passed in parameters # output into file according to passed in parameters
if output_to_file is True and filepath is not None: if output_to_file is True and filepath is not None:
......
...@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[], ...@@ -44,7 +44,7 @@ def send_file_email(file_path, data_str, email_group=[],
outer['To'] = ','.join(email_group) outer['To'] = ','.join(email_group)
outer['Cc'] = ','.join(cc_group) outer['Cc'] = ','.join(cc_group)
if not sender: if not sender:
outer['From'] = 'litao@igengmei.com.cn' outer['From'] = 'litao@igengmei.com'
else: else:
outer['From'] = sender outer['From'] = sender
mail_service = 'smtp.exmail.qq.com' mail_service = 'smtp.exmail.qq.com'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment