# -*- coding:UTF-8 -*-
# @Time : 2021/1/8 16:41
# @File : xiaohongshu_to_rpc.py
# @email : litao@igengmei.com
# @author : litao
import copy
import datetime
import hashlib
import random
import time
import redis, json
from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
from crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from crawler.crawler_sys.scheduler.redis_to_rpc.rpc_config import *
# Xiaohongshu author ids that the importers in this file look up: a note whose
# author id is listed here is posted with level "5", any other with level "3".
gm_user_id_list = [
    "5cca9b3700000000120314c9",
    "5aa0f7bae8ac2b65bfcdaf0e",
    "5c20dd200000000007027c07",
    "5fe1c1ba0000000001006e65",
]
def xiaohongshu_pc(res_json, pid):
    """Push one note in the PC-page layout (``res_json["NoteView"]``) to the RPC service.

    Steps, in order:

    1. Every entry of ``NoteView.content.imageList`` is downloaded and
       re-uploaded with ``upload``; a failure on one image is printed and that
       image is skipped.
    2. The description plus the uploaded images is stored back into
       ``res_json["NoteView"]["content"]["desc_fix"]`` (the input is mutated).
    3. One payload is posted to ``cims/question/batch_create``, one to
       ``cims/answer/batch_create`` and, when the note carries comments, all
       comment payloads in one call to ``cims/reply/batch_create``.

    NOTE(review): the ``<p>``, ``<br>``, ``</p>`` and ``<img>`` literals below
    are a reconstruction. The reviewed text showed only a bare line break
    inside each of these (unterminated) string literals, which looks like
    stripped HTML markup -- confirm against the deployed version.

    :param res_json: decoded note record containing a ``"NoteView"`` mapping.
    :param pid: kept for interface compatibility; it is always replaced by the
        id found in the payload (``commentInfo.targetNoteId`` or, failing
        that, ``content.id``).
    :return: the payload dict that was built last. After comments were
        processed it holds the fields of the last comment; it is empty when
        building the payload failed early.
    :raises KeyError: when ``NoteView``, ``content``, ``content.imageList`` or
        both id fields are missing (these lookups are not guarded).
    """
    note = res_json["NoteView"]
    content = note["content"]
    video_dic = {}
    qiniu_img_list = []

    # A missing or non-mapping commentInfo falls back to the note's own id.
    try:
        pid = note["commentInfo"]["targetNoteId"]
    except (KeyError, TypeError):
        pid = content["id"]

    for img_url in content["imageList"]:
        try:
            source_url = "http:" + img_url["url"].replace(img_url["fileId"], img_url["traceId"])
            img_wb = retry_get_url(source_url).content
            res = upload(img_wb, img_type=99)
            # The image-info lookup is kept from the original flow: if it
            # raises, the image is skipped like any other download failure.
            retry_get_url(res + "-imageinfo").json()
            # NOTE(review): reconstructed tag -- the original may have added a
            # style suffix or size attributes taken from the image info.
            qiniu_img_list.append('<img src="' + res + '">')
        except Exception as e:
            # Best effort: one broken image must not drop the whole note.
            print("down load img error %s" % e)
            continue

    try:
        desc_fix = "<p>" + content["desc"].replace("\n", "<br>") + "".join(qiniu_img_list) + "</p>"
        content["desc_fix"] = desc_fix

        video_dic["level"] = "5" if note["author"]["id"] in gm_user_id_list else "3"
        video_dic["platform"] = "9"
        video_dic["platform_id"] = pid
        video_dic["platform_answer_id"] = pid
        video_dic["title"] = content["title"]

        # Work on a copy so the question user can be removed and the answer
        # is posted under a different user without touching the shared list.
        user_pool = list(user_id_list)
        question_user = random.choice(user_pool)
        user_pool.remove(question_user)
        video_dic["user_id"] = question_user
        create_time = datetime.datetime.strptime(content["time"], "%Y-%m-%d %H:%M")
        video_dic["create_time"] = create_time.timestamp()
        post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")

        video_dic["platform_question_id"] = pid
        video_dic["content"] = desc_fix
        video_dic["user_id"] = random.choice(user_pool)
        post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")

        try:
            # Two payload layouts are accepted for the comment list.
            if note.get("comments"):
                comments = note["comments"]["data"]
            elif note.get("commentInfo"):
                comments = note["commentInfo"]["comments"]
            else:
                comments = []

            comment_list = []
            for comment in comments:
                video_dic["content"] = comment["content"]
                video_dic["platform_id"] = comment["id"]
                video_dic["user_id"] = random.choice(majiayonghu_list)
                # Each reply is dated a random offset after the note itself.
                offset = datetime.timedelta(hours=random.randint(0, 24),
                                            minutes=random.randint(0, 60))
                video_dic["create_time"] = (create_time + offset).timestamp()
                comment_list.append(copy.deepcopy(video_dic))
            if comment_list:
                post_muilty_data(comment_list, "cims/reply/batch_create")
        except Exception as e:
            # Comment failures are reported but do not undo the posted note.
            print("comment error")
            print(e)
    except Exception as e:
        print(e)
    return video_dic
def xiaohongshu_xiaochengxu(res_json):
    """Push one note in the mini-program layout (``res_json["data"]``) to the RPC service.

    Steps, in order:

    1. Every entry of ``data.imageList`` is downloaded and re-uploaded with
       ``upload``; a failure on one image is printed and that image is skipped.
    2. One payload is posted to ``cims/question/batch_create``, one to
       ``cims/answer/batch_create`` and, when ``data.commentList`` is
       non-empty, all comment payloads in one call to
       ``cims/reply/batch_create``. Comments carry no id in this layout, so
       their ``platform_id`` is the MD5 hex digest of the commenting user's
       id concatenated with the comment text.

    NOTE(review): the ``<p>``, ``<br>``, ``</p>`` and ``<img>`` literals below
    are a reconstruction. The reviewed text showed only a bare line break (or
    nothing) inside these string literals, which looks like stripped HTML
    markup -- confirm against the deployed version.

    :param res_json: decoded note record containing a ``"data"`` mapping.
    :return: the payload dict that was built last. After comments were
        processed it holds the fields of the last comment; it is empty when
        the note has no id or building the payload failed early.
    :raises KeyError: when ``data`` or ``data.imageList`` is missing (these
        lookups are not guarded).
    """
    data = res_json["data"]
    video_dic = {}
    qiniu_img_list = []

    # Without an id nothing can be posted; stop before downloading and
    # uploading images for a note that would be dropped anyway.
    try:
        pid = data["id"]
    except (KeyError, TypeError) as e:
        print("note id missing, skip: %s" % e)
        return video_dic

    for img_url in data["imageList"]:
        try:
            source_url = img_url["url"].replace(img_url["fileId"], img_url["traceId"])
            img_wb = retry_get_url(source_url).content
            res = upload(img_wb, img_type=99)
            # The image-info lookup is kept from the original flow: if it
            # raises, the image is skipped like any other download failure.
            retry_get_url(res + "-imageinfo").json()
            # NOTE(review): reconstructed tag -- the original may have added a
            # style suffix or size attributes taken from the image info.
            qiniu_img_list.append('<img src="' + res + '">')
        except Exception as e:
            # Best effort: one broken image must not drop the whole note.
            print("down load img error %s" % e)
            continue

    try:
        desc_fix = "<p>" + data["desc"].replace("\n", "<br>") + "".join(qiniu_img_list) + "</p>"

        video_dic["level"] = "5" if data["user"]["id"] in gm_user_id_list else "3"
        video_dic["platform"] = "9"
        video_dic["platform_id"] = pid
        video_dic["platform_answer_id"] = pid
        video_dic["title"] = data["title"]

        # Work on a copy so the question user can be removed and the answer
        # is posted under a different user without touching the shared list.
        user_pool = list(user_id_list)
        question_user = random.choice(user_pool)
        user_pool.remove(question_user)
        video_dic["user_id"] = question_user
        create_time = datetime.datetime.strptime(data["time"], "%Y-%m-%d %H:%M")
        video_dic["create_time"] = create_time.timestamp()
        post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")

        video_dic["platform_question_id"] = pid
        video_dic["content"] = desc_fix
        video_dic["user_id"] = random.choice(user_pool)
        post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")

        try:
            comment_list = []
            for comment in data.get("commentList") or []:
                video_dic["content"] = comment["content"]
                digest_source = comment["user"]["id"] + comment["content"]
                video_dic["platform_id"] = hashlib.md5(digest_source.encode("utf8")).hexdigest()
                video_dic["user_id"] = random.choice(majiayonghu_list)
                # Each reply is dated a random offset after the note itself.
                offset = datetime.timedelta(hours=random.randint(0, 24),
                                            minutes=random.randint(0, 60))
                video_dic["create_time"] = (create_time + offset).timestamp()
                comment_list.append(copy.deepcopy(video_dic))
            if comment_list:
                post_muilty_data(comment_list, "cims/reply/batch_create")
        except Exception as e:
            # Comment failures are reported but do not undo the posted note.
            print("comment error")
            print(e)
    except Exception as e:
        print(e)
    return video_dic
# Polling loop: drains the "xiaohongshu" Redis hash (field = note id, value =
# JSON record) and remembers handled ids in the "xiaohongshu_exists_set" set.
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
while True:
    if rds.hlen("xiaohongshu"):
        for pid in rds.hkeys("xiaohongshu"):
            # Already handled earlier: just drop the queued duplicate.
            if rds.sismember("xiaohongshu_exists_set", pid):
                rds.hdel("xiaohongshu", pid)
                continue
            line = rds.hget("xiaohongshu", pid)
            if line is None:
                # The field was removed between HKEYS and HGET (for example
                # by another worker); json.loads(None) would raise TypeError
                # and stop the whole loop.
                continue
            res_json = json.loads(line)
            # The record layout decides which importer handles it.
            if res_json.get("NoteView"):
                xiaohongshu_pc(res_json, pid)
            elif res_json.get("data"):
                xiaohongshu_xiaochengxu(res_json)
            # The entry is dequeued and marked as seen whichever branch ran,
            # including records that matched neither layout.
            rds.hdel("xiaohongshu", pid)
            rds.sadd("xiaohongshu_exists_set", pid)
    else:
        # Nothing queued: wait before polling again.
        time.sleep(5)