Commit 53307e8c authored by litaolemo

Update the Xiaohongshu crawler

parent c68e576c
@@ -9,7 +9,7 @@ import random
import time
import redis, json
from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
from crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from crawler.crawler_sys.scheduler.redis_to_rpc.rpc_config import *
@@ -20,18 +20,7 @@ gm_user_id_list = [
'5c20dd200000000007027c07',
'5fe1c1ba0000000001006e65']
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
while True:
if rds.hlen("xiaohongshu"):
pid_list = rds.hkeys("xiaohongshu")
for pid in pid_list:
if rds.sismember("xiaohongshu_exists_set", pid):
rds.hdel("xiaohongshu", pid)
continue
line = rds.hget("xiaohongshu", pid)
res_json = json.loads(line)
def xiaohongshu_pc(res_json, pid):
video_dic = {}
qiniu_img_list = []
try:
@@ -40,7 +29,7 @@
pid = res_json["NoteView"]["content"]["id"]
for img_url in res_json["NoteView"]["content"]["imageList"]:
try:
img_wb = retry_get_url("http:" + img_url["url"].replace(img_url['fileId'],img_url['traceId'])).content
img_wb = retry_get_url("http:" + img_url["url"].replace(img_url['fileId'], img_url['traceId'])).content
res = upload(img_wb, img_type=99)
# print(res)
img_info = retry_get_url(res + "-imageinfo")
@@ -50,7 +39,8 @@
print("download img error %s" % e)
continue
try:
desc_fix = "<p>" + res_json["NoteView"]["content"]['desc'].replace('\n', '<br>') + "".join(qiniu_img_list) + "</p>"
desc_fix = "<p>" + res_json["NoteView"]["content"]['desc'].replace('\n', '<br>') + "".join(
qiniu_img_list) + "</p>"
res_json["NoteView"]["content"]["desc_fix"] = desc_fix
if res_json["NoteView"]["author"]['id'] in gm_user_id_list:
video_dic["level"] = "5"
@@ -86,7 +76,8 @@
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(
    hours=random.randint(0, 24), minutes=random.randint(0, 60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
elif res_json["NoteView"].get("commentInfo"):
@@ -97,9 +88,83 @@
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(
    hours=random.randint(0, 24), minutes=random.randint(0, 60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
if comment_list:
rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
except Exception as e:
print("comment error")
print(e)
except Exception as e:
print(e)
return video_dic
def xiaohongshu_xiaochengxu(res_json):
video_dic = {}
qiniu_img_list = []
try:
pid = res_json["data"]["id"]
except:
pass
for img_url in res_json["data"]["imageList"]:
try:
img_wb = retry_get_url(img_url["url"].replace(img_url['fileId'], img_url['traceId'])).content
res = upload(img_wb, img_type=99)
# print(res)
img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json()
qiniu_img_list.append('<img src="' + res + '-w">')
except Exception as e:
print("download img error %s" % e)
continue
try:
desc_fix = "<p>" + res_json["data"]['desc'].replace('\n', '<br>') + "".join(
qiniu_img_list) + "</p>"
if res_json["data"]["user"]['id'] in gm_user_id_list:
video_dic["level"] = "5"
else:
video_dic["level"] = "3"
video_dic["platform"] = "9"
video_dic["platform_id"] = pid
video_dic["platform_answer_id"] = pid
video_dic["title"] = res_json["data"]["title"]
user_id_list_copy = copy.deepcopy(user_id_list)
question_id = random.choice(user_id_list_copy)
user_id_list_copy.remove(question_id)
video_dic["user_id"] = question_id
create_time = datetime.datetime.strptime(res_json["data"]["time"],
'%Y-%m-%d %H:%M')
video_dic["create_time"] = create_time.timestamp()
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")
# print(rpc_res)
video_dic["platform_question_id"] = pid
video_dic["content"] = desc_fix
video_dic["user_id"] = random.choice(user_id_list_copy)
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")
comment_list = []
try:
if res_json["data"].get("commentList"):
# print(res_json["NoteView"].get("data"))
for comment in res_json["data"]["commentList"]:
video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id']
comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(
    hours=random.randint(0, 24), minutes=random.randint(0, 60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
if comment_list:
rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
except Exception as e:
@@ -107,7 +172,26 @@ while True:
print(e)
except Exception as e:
print(e)
return video_dic
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
while True:
if rds.hlen("xiaohongshu"):
pid_list = rds.hkeys("xiaohongshu")
for pid in pid_list:
if rds.sismember("xiaohongshu_exists_set", pid):
rds.hdel("xiaohongshu", pid)
continue
line = rds.hget("xiaohongshu", pid)
res_json = json.loads(line)
if res_json.get("NoteView"):
xiaohongshu_pc(res_json, pid)
elif res_json.get("data"):
xiaohongshu_xiaochengxu(res_json)
rds.hdel("xiaohongshu", pid)
rds.sadd("xiaohongshu_exists_set", pid)
else:
......
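The script above splits the old inline loop into two handlers and dispatches on payload shape: a record whose JSON carries a NoteView key came from the PC web page and goes to xiaohongshu_pc, while one carrying a data key came from the mini-program (xiaochengxu) API and goes to xiaohongshu_xiaochengxu; processed pids are removed from the "xiaohongshu" hash and remembered in the "xiaohongshu_exists_set" dedup set. A minimal smoke test for that dispatch, with an invented payload holding only the fields xiaohongshu_xiaochengxu reads (the Redis host below is a placeholder, and running the loop against the fixture will still attempt the question/answer RPC posts):

import json

import redis

# Hypothetical fixture: enqueue one fake mini-program note so the
# dispatch loop above picks it up. All values are invented; the field
# names mirror what xiaohongshu_xiaochengxu reads from res_json["data"].
rds = redis.StrictRedis(host='127.0.0.1', port=6379, db=20, decode_responses=True)
fake_note = {
    "data": {
        "id": "fake_note_001",
        "title": "test title",
        "desc": "first line\nsecond line",
        "time": "2020-08-05 12:00",       # parsed with '%Y-%m-%d %H:%M'
        "user": {"id": "not_a_gm_user"},  # not in gm_user_id_list -> level "3"
        "imageList": [],                  # empty: skips image download/upload
        "commentList": [],                # empty: skips the reply RPC
    }
}
rds.hset("xiaohongshu", fake_note["data"]["id"], json.dumps(fake_note))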
@@ -5,6 +5,7 @@
# @author : litao
import copy
import execjs
import redis
import requests
import json
@@ -248,7 +249,16 @@ class Crawler_xiaohongshu():
# break
if __name__ == '__main__':
test = Crawler_xiaohongshu()
# try:
# with open(r'D:\work_file\gengmei\crawler\crawler_sys\site_crawler_by_redis\xiaohongshu_js.js', 'r', encoding='utf-8') as f:
# js = f.read()
# except:
# with open('/srv/apps/crawler/crawler_sys/site_crawler_by_redis/xiaohongshu.js', 'r', encoding='utf-8') as f:
# js = f.read()
# # print(js)
# exec_js = execjs.compile(js)
# exec_js.call("get_sign", "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae")
# test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list = [
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
@@ -358,4 +368,8 @@ if __name__ == '__main__':
]
for url in url_list:
print(url)
try:
res = test.releaser_page(url, proxies_num=0)
except Exception as e:
print(e)
continue
const crypto = require('crypto');
/**
 * Generate the x-sign header:
 * `x-sign: 'X' + md5(url [+ '?' + query] + 'WSUDD')`
 * @param {string} url request url
 * @param {object} params query parameters
 */
function generateXSign(url, params = {}) {
const searchString = new URLSearchParams(params).toString();
const realUrl = `${url}${searchString ? '?' : ''}${searchString}WSUDD`;
const md5 = crypto.createHash('md5').update(realUrl).digest('hex');
return `X${md5}`;
}
module.exports = {
generateXSign,
};
\ No newline at end of file
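For reference, the same signing scheme in Python: a sketch mirroring generateXSign above, not part of the commit (the crawler loads the JS through execjs), with URLSearchParams serialization approximated by urllib's urlencode:

import hashlib
from urllib.parse import urlencode

def generate_x_sign(url, params=None):
    # 'X' + md5(url [+ '?' + query] + 'WSUDD'), hex digest, as in the JS above.
    query = urlencode(params or {})
    real_url = "%s%s%s%s" % (url, '?' if query else '', query, 'WSUDD')
    return "X" + hashlib.md5(real_url.encode("utf-8")).hexdigest()

# Example: sign the profile URL used in the __main__ block above.
print(generate_x_sign("https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae"))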