Commit 51fd88eb authored by litaolemo's avatar litaolemo

update 小红书爬虫

parent e6c39b10
...@@ -222,6 +222,13 @@ def wangyi_news(releaserUrl,**kwargs): ...@@ -222,6 +222,13 @@ def wangyi_news(releaserUrl,**kwargs):
else: else:
return None return None
def xiaohongshu(releaserUrl, **kwargs):
    """Extract the xiaohongshu releaser id from a profile URL.

    Strips any query string, then returns everything after
    ``user/profile/``; returns None when the URL does not match.
    """
    base_url, _, _ = releaserUrl.partition("?")
    match = re.search(r"user/profile/(.*)", base_url)
    return match.group(1) if match else None
plantform_func = { plantform_func = {
"toutiao": toutiao, "toutiao": toutiao,
...@@ -233,6 +240,7 @@ plantform_func = { ...@@ -233,6 +240,7 @@ plantform_func = {
"kwai": kwai, "kwai": kwai,
"网易新闻": wangyi_news, "网易新闻": wangyi_news,
"抖音":douyin, "抖音":douyin,
"xiaohongshu":xiaohongshu
} }
......
# -*- coding:UTF-8 -*-
# @Time : 2021/1/4 13:39
# @File : crawler_xiaohongshu.py
# @email : litao@igengmei.com
# @author : litao
import copy
import requests
import json
import datetime
import re
# from . import bulk_write_into_es
import hashlib
import time
from selenium import webdriver
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import output_result
from crawler.gm_upload.gm_upload import upload, upload_file
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random, urllib
from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
# Pool of internal account ids; each crawled note is attributed to a random
# one of these when posted to the CIMS RPC.
# NOTE(review): presumably these are GM platform user ids — confirm against
# the CIMS service before extending the list.
user_id_list = [29865245,
                36426151,
                36426142,
                36427666,
                36427661,
                36427657,
                36427655,
                36427634,
                33524762,
                33524779,
                33524697,
                30963358,
                31293584,
                31358392,
                31358396,
                31358397,
                31358419,
                31358448,
                31358610,
                31358658,
                ]
class Crawler_xiaohongshu():
    """Crawler for xiaohongshu ("Little Red Book") releaser profiles.

    Downloads a releaser's profile page, pulls the note list out of the
    embedded ``window.__INITIAL_SSR_STATE__`` JSON, fetches each
    ``type == "normal"`` note's detail page, and pushes the note to the
    CIMS question-creation RPC via ``post_data``.
    """

    def __init__(self, timeout=None, platform='xiaohongshu'):
        """Initialise the crawler.

        :param timeout: HTTP timeout in seconds; defaults to 10 when None.
        :param platform: platform tag stamped into the standard field dict.
        """
        if timeout is None:
            self.timeout = 10
        else:
            self.timeout = timeout
        self.platform = platform
        self.TotalVideo_num = None
        self.midstepurl = None
        # Start from the shared standard video-field template and drop the
        # fields this platform never produces.
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
        for key in unused_key_list:
            self.video_data.pop(key)

    def get_one_page(self, page_id, proxies=0):
        """Fetch one note's detail page and return its embedded SSR state.

        :param page_id: xiaohongshu note id.
        :param proxies: proxy selector forwarded to ``retry_get_url``.
        :return: the parsed ``__INITIAL_SSR_STATE__`` dict for the note.
        """
        url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
            "referer": "https://www.xiaohongshu.com/explore",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        res = retry_get_url(url, headers=headers, proxies=proxies)
        res_text = res.text
        # The note data is assigned to a JS global inside a <script> tag;
        # ``undefined`` is not valid JSON, so map it to null before parsing.
        res_json_text = re.findall('__INITIAL_SSR_STATE__=(.*?)</script>', res_text, flags=re.DOTALL)[0]
        json_data = json.loads(res_json_text.replace("undefined", "null"))
        # NOTE(review): mirroring note images to qiniu used to happen here
        # but was commented out; removed the dead scaffolding.
        return json_data

    def get_releaser_id(self, releaserUrl):
        """Return the releaser id parsed from *releaserUrl*, or None."""
        return get_releaser_id(platform="xiaohongshu", releaserUrl=releaserUrl)

    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      es_index=None,
                      doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False, proxies_num=None, **kwargs):
        """Crawl one releaser profile and push its notes to CIMS.

        Only ``type == "normal"`` notes are pushed; each one is attributed
        to a random internal user id from ``user_id_list``.  The
        ``output_*`` / ``es_*`` / redis parameters belong to the common
        crawler interface and are currently unused for this platform.

        :param releaserUrl: profile URL containing ``user/profile/<id>``.
        :param proxies_num: proxy selector forwarded to ``retry_get_url``.
        """
        # Headers are built before releaserUrl is normalised so the referer
        # carries the caller-supplied URL, as before.
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
            "referer": releaserUrl,
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        releaser_id = self.get_releaser_id(releaserUrl)
        releaserUrl = 'https://www.xiaohongshu.com/user/profile/%s' % releaser_id
        self.video_data['releaserUrl'] = releaserUrl
        count = 1
        retry_time = 0
        # The profile URL is not paginated, so the loop exists only to retry
        # failed fetches; it breaks after the first successful page.
        while count <= releaser_page_num_max and count <= 1000:
            try:
                res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num)
            except Exception:
                # Bug fix: the original bare `except: continue` could spin
                # forever on a permanently failing URL; bound the retries.
                retry_time += 1
                if retry_time >= 5:
                    return
                continue
            page_text = res.text
            data_list = re.findall("window.__INITIAL_SSR_STATE__=(.*?)</script>", page_text)[0]
            data_json = json.loads(data_list)
            if data_json:
                print("get data at releaser: %s page: %s" % (releaser_id, count))
                count += 1
                for info_dic in data_json["ProfileLayout"]["noteData"]:
                    # Only plain image/text notes are pushed as questions;
                    # skip before doing any per-note work.
                    if info_dic["type"] != "normal":
                        continue
                    page_id = info_dic["id"]
                    title = info_dic["title"]
                    time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
                    page_data = self.get_one_page(page_id, proxies=proxies_num)
                    print(page_data)
                    pid = page_data["NoteView"]["content"]["id"]
                    video_dic = {
                        "platform": "9",  # CIMS numeric id for xiaohongshu
                        "platform_id": pid,
                        "title": title,
                        "user_id": random.choice(user_id_list),
                        "create_time": time_ts,
                    }
                    rpc_res = post_data(video_dic, "cims/question/batch_create")
                    # Bug fix: the original printed `res` (the profile-page
                    # HTTP response) instead of the RPC result.
                    print(rpc_res)
            break
if __name__ == '__main__':
    # Manual smoke test: crawl a single known releaser profile.
    crawler = Crawler_xiaohongshu()
    profile_url = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
    crawler.releaser_page(profile_url)
# -*- coding:UTF-8 -*-
# @Time : 2021/1/6 10:21
# @File : rpc_data_to_answer.py
# @email : litao@igengmei.com
# @author : litao
import json
import typing

import requests
"""
https://www.yuque.com/docs/share/f4abe44b-6593-46b4-b280-5c87e4db2c85?#
rpc: cims/question/batch_create 创建问题
rpc: cims/answer/batch_create 创建回答
rpc: cims/reply/batch_create 创建评论
"""
# Source-platform name -> CIMS numeric platform id.
# NOTE(review): only "xiaohongshu" (9) is referenced by the crawler in this
# commit; the other entries presumably mirror the CIMS id table — confirm.
platfrom_id_dict = {
    "zhihu": 0,
    "weixin": 1,
    "weibo": 2,
    "hera": 3,
    "insheadline": 7,
    "kyc": 8,
    "xiaohongshu": 9,
    "gm": 99
}
# For each RPC endpoint, the set of fields the payload may contain;
# post_data() strips any other keys before sending.
data_type_dict = {
    "cims/question/batch_create": ["platform", "platform_id", "title", "content", "user_id", "create_time", "is_online"],
    "cims/answer/batch_create": ["platform", "platform_id", "platform_question_id", "content", "user_id", "create_time", "is_online"],
    "cims/reply/batch_create": ["platform", "platform_id", "platform_answer_id", "content", "user_id", "create_time", "is_online"]
}
def post_data(data_dict: typing.Dict, rpc_type: str) -> typing.Dict:
    """Send one record to the local gaia batch RPC endpoint.

    Strips keys not accepted by *rpc_type* (mutating *data_dict* in place,
    as before), wraps the record in the batch envelope, and POSTs it.

    :param data_dict: record to send; filtered against ``data_type_dict``.
    :param rpc_type: RPC path, e.g. ``"cims/question/batch_create"``.
    :return: decoded JSON response from the batch endpoint.
    """
    import json  # local import: keeps this fix self-contained

    headers = {
        'X-GAIA-HELIOS-VERSION': '0.7.5',
    }
    # Bug fix: iterate over a snapshot of the keys — popping while iterating
    # the live dict raises RuntimeError in Python 3.
    allowed = data_type_dict[rpc_type]
    for key in list(data_dict):
        if key not in allowed:
            data_dict.pop(key)
    print(data_dict)
    # Bug fix: the original hand-built template was malformed JSON (missing
    # closing quote after the method name, and str(dict) emits single-quoted
    # Python repr).  Build the envelope with json.dumps instead.
    data = {
        'requests': json.dumps([{
            "params": {"replies": [data_dict]},
            "method": rpc_type,
            "timeout": 120,
        }])
    }
    response = requests.post('http://127.0.0.1:8003/v1/batch', headers=headers, data=data)
    print(response.text)
    return response.json()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment