From 51fd88eb3dd5320dd859447195522428a3bd3823 Mon Sep 17 00:00:00 2001
From: litaolemo <593516104@qq.com>
Date: Thu, 7 Jan 2021 11:27:00 +0800
Subject: [PATCH] update xiaohongshu crawler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crawler_sys/framework/func_get_releaser_id.py |   8 +
 .../site_crawler/crawler_xiaohongshu.py       | 186 ++++++++++++++++++
 crawler_sys/utils/rpc_data_to_answer.py       |  47 +++++
 3 files changed, 241 insertions(+)
 create mode 100644 crawler_sys/site_crawler/crawler_xiaohongshu.py
 create mode 100644 crawler_sys/utils/rpc_data_to_answer.py

diff --git a/crawler_sys/framework/func_get_releaser_id.py b/crawler_sys/framework/func_get_releaser_id.py
index 8a9830a..aff4f55 100644
--- a/crawler_sys/framework/func_get_releaser_id.py
+++ b/crawler_sys/framework/func_get_releaser_id.py
@@ -222,6 +222,13 @@ def wangyi_news(releaserUrl,**kwargs):
     else:
         return None
 
+def xiaohongshu(releaserUrl, **kwargs):
+    # Strip any query string, then treat everything after "user/profile/"
+    # as the releaser id.
+    releaserUrl = releaserUrl.split("?")[0]
+    res = re.findall(r"user/profile/(.*)", releaserUrl)
+    if res:
+        return res[0]
+    else:
+        return None
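+# Documentation-only example for the extractor above, using the profile URL
+# exercised in crawler_xiaohongshu.py's __main__ block:
+#   >>> xiaohongshu("https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae")
+#   '5abbb57211be1027a0c880ae'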
 
 plantform_func = {
     "toutiao": toutiao,
@@ -233,6 +240,7 @@ plantform_func = {
     "kwai": kwai,
     "网易新闻": wangyi_news,
     "抖音":douyin,
+    "xiaohongshu": xiaohongshu,
 }
diff --git a/crawler_sys/site_crawler/crawler_xiaohongshu.py b/crawler_sys/site_crawler/crawler_xiaohongshu.py
new file mode 100644
index 0000000..e6dbeaf
--- /dev/null
+++ b/crawler_sys/site_crawler/crawler_xiaohongshu.py
@@ -0,0 +1,186 @@
+# -*- coding:UTF-8 -*-
+# @Time : 2021/1/4 13:39
+# @File : crawler_xiaohongshu.py
+# @email : litao@igengmei.com
+# @author : litao
+import copy
+import requests
+import json
+import datetime
+import re
+# from . import bulk_write_into_es
+import hashlib
+import time
+from selenium import webdriver
+from crawler.crawler_sys.utils.output_results import retry_get_url
+from crawler_sys.framework.video_fields_std import Std_fields_video
+from crawler_sys.utils.output_results import output_result
+from crawler.gm_upload.gm_upload import upload, upload_file
+
+try:
+    from crawler_sys.framework.func_get_releaser_id import *
+except ImportError:
+    from func_get_releaser_id import *
+from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
+# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
+import random
+import urllib
+from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
+
+# Pool of internal user ids; each crawled note is attributed to a random one.
+user_id_list = [
+    29865245, 36426151, 36426142, 36427666, 36427661,
+    36427657, 36427655, 36427634, 33524762, 33524779,
+    33524697, 30963358, 31293584, 31358392, 31358396,
+    31358397, 31358419, 31358448, 31358610, 31358658,
+]
+
+
+class Crawler_xiaohongshu():
+    def __init__(self, timeout=None, platform='xiaohongshu'):
+        if timeout is None:
+            self.timeout = 10
+        else:
+            self.timeout = timeout
+        self.platform = platform
+        self.TotalVideo_num = None
+        self.midstepurl = None
+        std_fields = Std_fields_video()
+        self.video_data = std_fields.video_data
+        self.video_data['platform'] = self.platform
+        # These standard video fields do not apply to xiaohongshu notes.
+        unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
+        for key in unused_key_list:
+            self.video_data.pop(key)
+
+    def get_one_page(self, page_id, proxies=0):
+        url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id
+        headers = {
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+            "accept-encoding": "gzip, deflate, br",
+            "accept-language": "zh-CN,zh;q=0.9",
+            "cache-control": "max-age=0",
+            "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
+            "referer": "https://www.xiaohongshu.com/explore",
+            "sec-fetch-dest": "document",
+            "sec-fetch-mode": "navigate",
+            "sec-fetch-site": "same-origin",
+            "sec-fetch-user": "?1",
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+        }
+        res = retry_get_url(url, headers=headers, proxies=proxies)
+        res_text = res.text
+        # The note data is server-side rendered into the page as a JS
+        # assignment; cut the JSON blob out of the <script> tag.
+        res_json_text = re.findall('__INITIAL_SSR_STATE__=(.*?)</script>', res_text, flags=re.DOTALL)[0]
+        # JS "undefined" is not valid JSON, so map it to null before parsing.
+        json_data = json.loads(res_json_text.replace("undefined", "null"))
+        qiniu_img_list = []
+        # for img_url in json_data["NoteView"]["content"]["imageList"]:
+        #     try:
+        #         img_wb = retry_get_url(img_url["url"]).content
+        #         res = upload(img_wb, img_type=99)
+        #         print(res)
+        #         img_info = retry_get_url(res + "-imageinfo")
+        #         img_info_json = img_info.json()
+        #         qiniu_img_list.append((res + "-w", img_info_json))
+        #     except Exception as e:
+        #         print("download img error %s" % e)
+        #         return {}
+        # json_data["NoteView"]["content"]["qiniu_img_list"] = qiniu_img_list
+        return json_data
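+
+    # Rough shape of the parsed SSR state, inferred only from the lookups in
+    # this file (illustrative; the real payload has many more fields):
+    #   {"NoteView": {"content": {"id": "...", "imageList": [{"url": "..."}]}},
+    #    "ProfileLayout": {"noteData": [{"id": "...", "title": "...", "desc": "...",
+    #                                    "type": "normal", "time": "2021-01-04 13:39"}]}}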
+
+    def get_releaser_id(self, releaserUrl):
+        return get_releaser_id(platform="xiaohongshu", releaserUrl=releaserUrl)
+
+    def releaser_page(self, releaserUrl,
+                      output_to_file=False,
+                      filepath=None,
+                      releaser_page_num_max=30,
+                      output_to_es_raw=False,
+                      es_index=None,
+                      doc_type=None,
+                      output_to_es_register=False,
+                      push_to_redis=False,
+                      proxies_num=None, **kwargs):
+        headers = {
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+            "accept-encoding": "gzip, deflate",
+            "accept-language": "zh-CN,zh;q=0.9",
+            "cache-control": "max-age=0",
+            "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
+            "referer": releaserUrl,
+            "sec-fetch-dest": "document",
+            "sec-fetch-mode": "navigate",
+            "sec-fetch-site": "same-origin",
+            "sec-fetch-user": "?1",
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
+        }
+
+        count = 1
+        retry_time = 0
+        result_list = []
+        releaser_id = self.get_releaser_id(releaserUrl)
+        releaserUrl = 'https://www.xiaohongshu.com/user/profile/%s' % releaser_id
+        self.video_data['releaserUrl'] = releaserUrl
+        pcursor = 0
+
+        while count <= releaser_page_num_max and count <= 1000:
+            try:
+                res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num)
+            except:
+                continue
+            page_text = res.text
+            data_list = re.findall("window.__INITIAL_SSR_STATE__=(.*?)</script>", page_text)[0]
+            data_json = json.loads(data_list)
+            if data_json:
+                print("get data at releaser: %s page: %s" % (releaser_id, count))
+                count += 1
+                for info_dic in data_json["ProfileLayout"]["noteData"]:
+                    video_dic = {}
+                    page_id = info_dic["id"]
+                    title = info_dic["title"]
+                    desc = info_dic["desc"]
+                    time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
+                    # Only plain notes are handled; skip video and other types.
+                    if info_dic["type"] != "normal":
+                        continue
+                    page_data = self.get_one_page(page_id, proxies=proxies_num)
+                    print(page_data)
+                    answer = desc
+                    pid = page_data["NoteView"]["content"]["id"]
+                    # "9" is the xiaohongshu entry in rpc_data_to_answer's
+                    # platform id mapping.
+                    video_dic["platform"] = "9"
+                    video_dic["platform_id"] = pid
+                    video_dic["title"] = title
+                    # video_dic["content"] = answer
+                    video_dic["user_id"] = random.choice(user_id_list)
+                    video_dic["create_time"] = time_ts
+                    rpc_res = post_data(video_dic, "cims/question/batch_create")
+                    print(rpc_res)
+            # Pagination is not implemented yet, so only the first page of
+            # notes is processed.
+            break
+
+
+if __name__ == '__main__':
+    test = Crawler_xiaohongshu()
+    releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
+    res = test.releaser_page(releaserurl)
diff --git a/crawler_sys/utils/rpc_data_to_answer.py b/crawler_sys/utils/rpc_data_to_answer.py
new file mode 100644
index 0000000..e9c808a
--- /dev/null
+++ b/crawler_sys/utils/rpc_data_to_answer.py
@@ -0,0 +1,47 @@
+# -*- coding:UTF-8 -*-
+# @Time : 2021/1/6 10:21
+# @File : rpc_data_to_answer.py
+# @email : litao@igengmei.com
+# @author : litao
+
+import json
+import requests
+import typing
+
+"""
+https://www.yuque.com/docs/share/f4abe44b-6593-46b4-b280-5c87e4db2c85?#
+rpc: cims/question/batch_create  create a question
+rpc: cims/answer/batch_create    create an answer
+rpc: cims/reply/batch_create     create a reply (comment)
+"""
+
+platform_id_dict = {
+    "zhihu": 0,
+    "weixin": 1,
+    "weibo": 2,
+    "hera": 3,
+    "insheadline": 7,
+    "kyc": 8,
+    "xiaohongshu": 9,
+    "gm": 99,
+}
+
+# Whitelist of fields each rpc endpoint accepts.
+data_type_dict = {
+    "cims/question/batch_create": ["platform", "platform_id", "title", "content", "user_id", "create_time", "is_online"],
+    "cims/answer/batch_create": ["platform", "platform_id", "platform_question_id", "content", "user_id", "create_time", "is_online"],
+    "cims/reply/batch_create": ["platform", "platform_id", "platform_answer_id", "content", "user_id", "create_time", "is_online"],
+}
+
+
+def post_data(data_dict: typing.Dict, rpc_type: str) -> typing.Dict:
+    headers = {
+        'X-GAIA-HELIOS-VERSION': '0.7.5',
+    }
+    # Drop fields the target rpc does not accept; iterate over a copy of the
+    # keys because the dict is mutated while filtering.
+    for key in list(data_dict):
+        if key not in data_type_dict[rpc_type]:
+            data_dict.pop(key)
+    print(data_dict)
+    # Serialize with json.dumps instead of string interpolation so the batch
+    # body is always valid JSON.
+    data = {
+        'requests': json.dumps([{
+            "params": {"replies": [data_dict]},
+            "method": rpc_type,
+            "timeout": 120,
+        }])
+    }
+    response = requests.post('http://127.0.0.1:8003/v1/batch', headers=headers, data=data)
+    print(response.text)
+    return response.json()
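+
+# Usage sketch (illustrative values only: the note id, title and timestamp
+# below are made up; "9" is the xiaohongshu entry in platform_id_dict):
+#   post_data({"platform": "9",
+#              "platform_id": "5ff5f1800000000001000000",
+#              "title": "some note title",
+#              "user_id": 29865245,
+#              "create_time": 1609990620.0},
+#             "cims/question/batch_create")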
["platform","platform_id","platform_answer_id","content","user_id","create_time","is_online"] +} + + +def post_data(data_dict:typing.Dict,rpc_type:str) -> typing.Dict: + headers = { + 'X-GAIA-HELIOS-VERSION': '0.7.5', + } + for key in data_dict: + if key not in data_type_dict[rpc_type]: + data_dict.pop(key) + print(data_dict) + data = { + 'requests': '[{"params": {"replies": [{%s}]}, "method": "%s, "timeout": 120}]' % (str(data_dict),rpc_type) + } + response = requests.post('http://127.0.0.1:8003/v1/batch', headers=headers, data=data) + print(response.text) + return response.json() \ No newline at end of file -- 2.18.0