Commit 53307e8c authored by litaolemo

Update the Xiaohongshu crawler

parent c68e576c
@@ -9,7 +9,7 @@ import random
import time
import redis, json
-from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
+# from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
from crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from crawler.crawler_sys.scheduler.redis_to_rpc.rpc_config import *
@@ -20,18 +20,7 @@ gm_user_id_list = [
    '5c20dd200000000007027c07',
    '5fe1c1ba0000000001006e65']
-rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
-while True:
-    if rds.hlen("xiaohongshu"):
-        pid_list = rds.hkeys("xiaohongshu")
-        for pid in pid_list:
-            if rds.sismember("xiaohongshu_exists_set", pid):
-                rds.hdel("xiaohongshu", pid)
-                continue
-            line = rds.hget("xiaohongshu", pid)
-            res_json = json.loads(line)
+def xiaohongshu_pc(res_json,pid):
    video_dic = {}
    qiniu_img_list = []
    try:
@@ -40,7 +29,7 @@ while True:
        pid = res_json["NoteView"]["content"]["id"]
        for img_url in res_json["NoteView"]["content"]["imageList"]:
            try:
                img_wb = retry_get_url("http:" + img_url["url"].replace(img_url['fileId'], img_url['traceId'])).content
                res = upload(img_wb, img_type=99)
                # print(res)
                img_info = retry_get_url(res + "-imageinfo")
@@ -50,7 +39,8 @@
                print("down load img error %s" % e)
                continue
        try:
            desc_fix = "<p>" + res_json["NoteView"]["content"]['desc'].replace('\n', '<br>') + "".join(
                qiniu_img_list) + "</p>"
            res_json["NoteView"]["content"]["desc_fix"] = desc_fix
            if res_json["NoteView"]["author"]['id'] in gm_user_id_list:
                video_dic["level"] = "5"
@@ -86,7 +76,8 @@
                    video_dic["user_id"] = comment_id
                    comment_id_list_copy.remove(comment_id)
                    video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0, 24),
                                                                                 minutes=random.randint(0, 60))).timestamp()
                    comment_list.append(copy.deepcopy(video_dic))
                    # rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
                elif res_json["NoteView"].get("commentInfo"):
@@ -97,9 +88,83 @@ while True:
                    comment_id = random.choice(comment_id_list_copy)
                    video_dic["user_id"] = comment_id
                    comment_id_list_copy.remove(comment_id)
                    video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0, 24),
                                                                                 minutes=random.randint(0, 60))).timestamp()
+                    comment_list.append(copy.deepcopy(video_dic))
+                    # rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
+            if comment_list:
+                rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
+        except Exception as e:
+            print("comment error")
+            print(e)
+    except Exception as e:
+        print(e)
+    return video_dic

+def xiaohongshu_xiaochengxu(res_json):
+    video_dic = {}
+    qiniu_img_list = []
+    try:
+        pid = res_json["data"]["id"]
+    except:
+        pass
+    for img_url in res_json["data"]["imageList"]:
+        try:
+            img_wb = retry_get_url(img_url["url"].replace(img_url['fileId'], img_url['traceId'])).content
+            res = upload(img_wb, img_type=99)
+            # print(res)
+            img_info = retry_get_url(res + "-imageinfo")
+            img_info_json = img_info.json()
+            qiniu_img_list.append('<img src="' + res + '-w">')
+        except Exception as e:
+            print("down load img error %s" % e)
+            continue
+    try:
+        desc_fix = "<p>" + res_json["data"]['desc'].replace('\n', '<br>') + "".join(
+            qiniu_img_list) + "</p>"
+        if res_json["data"]["user"]['id'] in gm_user_id_list:
+            video_dic["level"] = "5"
+        else:
+            video_dic["level"] = "3"
+        video_dic["platform"] = "9"
+        video_dic["platform_id"] = pid
+        video_dic["platform_answer_id"] = pid
+        video_dic["title"] = res_json["data"]["title"]
+        user_id_list_copy = copy.deepcopy(user_id_list)
+        qustion_id = random.choice(user_id_list_copy)
+        user_id_list_copy.remove(qustion_id)
+        video_dic["user_id"] = qustion_id
+        create_time = datetime.datetime.strptime(res_json["data"]["time"],
+                                                 '%Y-%m-%d %H:%M')
+        video_dic["create_time"] = create_time.timestamp()
+        rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")
+        # print(rpc_res)
+        video_dic["platform_question_id"] = pid
+        video_dic["content"] = desc_fix
+        video_dic["user_id"] = random.choice(user_id_list_copy)
+        rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")
+        comment_list = []
+        try:
+            if res_json["data"].get("commentList"):
+                # print(res_json["NoteView"].get("data"))
+                for comment in res_json["data"]["commentList"]:
+                    video_dic["content"] = comment['content']
+                    video_dic["platform_id"] = comment['id']
+                    comment_id_list_copy = copy.deepcopy(majiayonghu_list)
+                    comment_id = random.choice(comment_id_list_copy)
+                    video_dic["user_id"] = comment_id
+                    comment_id_list_copy.remove(comment_id)
+                    video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0, 24),
+                                                                                 minutes=random.randint(0, 60))).timestamp()
                    comment_list.append(copy.deepcopy(video_dic))
                    # rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
+            # rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
            if comment_list:
                rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
        except Exception as e:
@@ -107,7 +172,26 @@ while True:
            print(e)
    except Exception as e:
        print(e)
+    return video_dic

+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
+while True:
+    if rds.hlen("xiaohongshu"):
+        pid_list = rds.hkeys("xiaohongshu")
+        for pid in pid_list:
+            if rds.sismember("xiaohongshu_exists_set", pid):
+                rds.hdel("xiaohongshu", pid)
                continue
+            line = rds.hget("xiaohongshu", pid)
+            res_json = json.loads(line)
+            if res_json.get("NoteView"):
+                xiaohongshu_pc(res_json, pid)
+            elif res_json.get("data"):
+                xiaohongshu_xiaochengxu(res_json)
            rds.hdel("xiaohongshu", pid)
            rds.sadd("xiaohongshu_exists_set", pid)
        else:
...

@@ -5,6 +5,7 @@
# @author : litao
import copy
+import execjs
import redis
import requests
import json
@@ -248,7 +249,16 @@ class Crawler_xiaohongshu():
        # break
if __name__ == '__main__':
-    test = Crawler_xiaohongshu()
+    # try:
+    #     with open(r'D:\work_file\gengmei\crawler\crawler_sys\site_crawler_by_redis\xiaohongshu_js.js', 'r', encoding='utf-8') as f:
+    #         js = f.read()
+    # except:
+    #     with open('/srv/apps/crawler/crawler_sys/site_crawler_by_redis/xiaohongshu.js', 'r', encoding='utf-8') as f:
+    #         js = f.read()
+    # # print(js)
+    # exec_js = execjs.compile(js)
+    # exec_js.call("get_sign", "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae")
+    # test = Crawler_xiaohongshu()
    releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
    url_list =[
        "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
@@ -358,4 +368,8 @@ if __name__ == '__main__':
    ]
    for url in url_list:
        print(url)
+        try:
            res = test.releaser_page(url,proxies_num=0)
+        except Exception as e:
+            print(e)
+            continue

const crypto = require('crypto');

/**
 * Generate the x-sign header:
 * `x-sign: 'X' + md5(url + 'WSUDD')`
 * @param {string} url request url
 * @param {object} params query parameters
 */
function generateXSign(url, params = {}) {
    const searchString = new URLSearchParams(params).toString();
    const realUrl = `${url}${searchString ? '?' : ''}${searchString}WSUDD`;
    const md5 = crypto.createHash('md5').update(realUrl).digest('hex');
    return `X${md5}`;
}

module.exports = {
    generateXSign,
};
\ No newline at end of file
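
For reference, the same signature can also be computed directly in Python instead of going through execjs. This is only a minimal sketch based on the JS docstring above (x-sign = 'X' + md5(url [+ '?' + query] + 'WSUDD')); the function name generate_x_sign and the hashlib/urlencode wiring are illustrative and not part of this repository:

import hashlib
from urllib.parse import urlencode

def generate_x_sign(url, params=None):
    # Mirror generateXSign(): md5 over url, an optional '?query', then the 'WSUDD' suffix.
    query = urlencode(params or {})
    real_url = url + ("?" + query if query else "") + "WSUDD"
    return "X" + hashlib.md5(real_url.encode("utf-8")).hexdigest()

# Example (hypothetical usage):
# headers["x-sign"] = generate_x_sign("https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae")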