Commit 79bbb8d0 authored by litaolemo

update

parent cfb67895
# -*- coding:UTF-8 -*-
# @Time : 2021/1/14 19:53
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
# -*- coding:UTF-8 -*-
# @Time : 2021/1/14 19:54
# @File : rpc_config.py
# @email : litao@igengmei.com
# @author : litao
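# majiayonghu_list: numeric IDs of the in-house "majia" (sock-puppet) accounts;
# the consumer scripts below pick reply authors at random from this list.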
majiayonghu_list = [
36436814,
36436809,
36436805,
36436803,
36436800,
36436797,
36436794,
36436793,
36436787,
36436782,
36436769,
36436763,
36436758,
36436756,
36436749,
36436745,
36436738,
36436731,
36436729,
36436725,
36436720,
36436717,
36436716,
36436709,
36436703,
36436701,
36436690,
36436689,
36436685,
36436674,
36426171,
36426170,
36426169,
36426168,
36426167,
36426166,
36426165,
36426164,
36426163,
36426162,
36426161,
36426160,
36426159,
36426158,
36426157,
36426156,
36426155,
36426154,
36426153,
36426152,
36426150,
36426149,
36426148,
36426147,
36426146,
36426145,
36426143,
36426141,
36368922,
36368921,
36368920,
36368918,
36368917,
]
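# user_id_list: account IDs used as randomized authors for the auto-created
# questions and answers.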
user_id_list = [
29865245,
36426151,
36426142,
36427666,
36427661,
36427657,
36427655,
36427634,
33524762,
33524779,
33524697,
30963358,
31293584,
31358392,
31358396,
31358397,
31358419,
31358448,
31358610,
31358658,
]
@@ -12,6 +12,7 @@ import redis, json
from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
from crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from crawler.crawler_sys.scheduler.redis_to_rpc.rpc_config import *
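# Pipeline (weibo -> CIMS): drain posts cached in the redis hash "weibo",
# re-host each post's images via upload(), then push every post over RPC as
# a question/answer pair. Processed posts move to the "weibo_with_img" hash.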
gm_user_id_list = [
"3236957071",
@@ -26,160 +27,70 @@ gm_user_id_list = [
"7048594049",
]
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)
while True:
if rds.hlen("weibo"):
pid_list = rds.hkeys("weibo")
for pid in pid_list:
if rds.sismember("weibo_exists_set", pid):
rds.hdel("weibo", pid)
continue
res = rds.hget("weibo", pid)
res_json = json.loads(res)
video_dic = {}
qiniu_img_list = []
# print(res_json)
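# Skip posts whose text embeds a t.cn short link (likely truncated share
# stubs rather than full posts).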
if "http://t.cn/" in res_json["title"]:
continue
for img_url in res_json["img_list"]:
try:
img_wb = retry_get_url(img_url.replace("large", "sq480")).content
res = upload(img_wb, img_type=99)
# print(res)
img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json()
qiniu_img_list.append('<img src="' + res + '-w">')
except Exception as e:
print("down load img error %s" % e)
continue
print(qiniu_img_list)
try:
# if True:
try:
title = res_json["title"].split("\n")[0]
except:
title = res_json["title"]
desc_fix = "<p>" + res_json["title"].replace('\n', '<br>') + "".join(qiniu_img_list) + "</p>"
res_json["desc_fix"] = desc_fix
# print(desc_fix)
# f.write(json.dumps(res_json) + "\n")
# f.flush()
res = rds.hset("weibo_with_img", key=pid, value=json.dumps(res_json))
if res_json["releaser_id_str"].replace("weibo_","") in gm_user_id_list:
video_dic["level"] = "5"
else:
video_dic["level"] = "3"
video_dic["platform"] = "2"
video_dic["platform_id"] = pid
video_dic["platform_answer_id"] = pid
video_dic["title"] = title
user_id_list_copy = copy.deepcopy(user_id_list)
question_id = random.choice(user_id_list_copy)
user_id_list_copy.remove(question_id)
video_dic["user_id"] = question_id
create_time = int(res_json["release_time"]/1e3)
video_dic["create_time"] = create_time
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")
# print(rpc_res)
video_dic["platform_question_id"] = pid
video_dic["content"] = desc_fix
video_dic["user_id"] = random.choice(user_id_list_copy)
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")
except Exception as e:
print(e)
continue
rds.hdel("weibo", pid)
# break
\ No newline at end of file
@@ -6,11 +6,13 @@
import copy
import datetime
import random
import time
import redis, json
from crawler.crawler_sys.utils.rpc_data_to_answer import post_single_data,post_muilty_data
from crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from crawler.crawler_sys.scheduler.redis_to_rpc.rpc_config import *
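# Pipeline (xiaohongshu -> CIMS): same shape as the weibo consumer above, but
# notes also carry comments, which are batched into CIMS replies.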
gm_user_id_list = [
'5cca9b3700000000120314c9',
@@ -18,190 +20,94 @@ gm_user_id_list = [
'5c20dd200000000007027c07',
'5fe1c1ba0000000001006e65']
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)
while True:
if rds.hlen("xiaohongshu"):
pid_list = rds.hkeys("xiaohongshu")
for pid in pid_list:
if rds.sismember("xiaohongshu_exists_set", pid):
rds.hdel("xiaohongshu", pid)
continue
line = rds.hget("xiaohongshu", pid)
res_json = json.loads(line)
video_dic = {}
qiniu_img_list = []
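# Note payloads come in two shapes; prefer the comment target id, falling
# back to the note content id.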
try:
pid = res_json["NoteView"]["commentInfo"]["targetNoteId"]
except:
pid = res_json["NoteView"]["content"]["id"]
for img_url in res_json["NoteView"]["content"]["imageList"]:
try:
img_wb = retry_get_url("http:" + img_url["url"].replace(img_url['fileId'],img_url['traceId'])).content
res = upload(img_wb, img_type=99)
# print(res)
img_info = retry_get_url(res + "-imageinfo")
img_info_json = img_info.json()
qiniu_img_list.append('<img src="' + res + '-w">')
except Exception as e:
print("down load img error %s" % e)
continue
try:
desc_fix = "<p>" + res_json["NoteView"]["content"]['desc'].replace('\n', '<br>') + "".join(qiniu_img_list) + "</p>"
res_json["NoteView"]["content"]["desc_fix"] = desc_fix
if res_json["NoteView"]["author"]['id'] in gm_user_id_list:
video_dic["level"] = "5"
else:
video_dic["level"] = "3"
video_dic["platform"] = "9"
video_dic["platform_id"] = pid
video_dic["platform_answer_id"] = pid
video_dic["title"] = res_json["NoteView"]["content"]["title"]
f= open("josnfile.json","r",encoding='utf-8')
# rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)
# pid_list = rds.hkeys("xiaohongshu")
for line in f:
# for pid in f:
# res = rds.hget("xiaohongshu", pid)
# if rds.hexists("xiaohongshu_with_img", pid):
# continue
res_json = json.loads(line)
video_dic = {}
qiniu_img_list = []
# print(res_json)
try:
pid = res_json["NoteView"]["commentInfo"]["targetNoteId"]
except:
pid = res_json["NoteView"]["content"]["id"]
# for img_url in res_json["NoteView"]["content"]["imageList"]:
# try:
# img_wb = retry_get_url("http:" + img_url["url"].replace(img_url['fileId'],img_url['traceId'])).content
# res = upload(img_wb, img_type=99)
# # print(res)
# img_info = retry_get_url(res + "-imageinfo")
# img_info_json = img_info.json()
# qiniu_img_list.append('<img src="' + res + '-w">')
# except Exception as e:
# print("down load img error %s" % e)
# continue
# print(qiniu_img_list)
try:
# if True:
# desc_fix = "<p>" + res_json["NoteView"]["content"]['desc'].replace('\n', '<br>') + "".join(qiniu_img_list) + "</p>"
# res_json["NoteView"]["content"]["desc_fix"] = desc_fix
desc_fix = res_json["NoteView"]["content"]["desc_fix"]
# print(desc_fix)
# f.write(json.dumps(res_json) + "\n")
# f.flush()
# res = rds.hset("xiaohongshu_with_img", key=pid, value=json.dumps(res_json))
if res_json["NoteView"]["author"]['id'] in gm_user_id_list:
video_dic["level"] = "5"
else:
video_dic["level"] = "3"
video_dic["platform"] = "9"
video_dic["platform_id"] = pid
video_dic["platform_answer_id"] = pid
video_dic["title"] = res_json["NoteView"]["content"]["title"]
user_id_list_copy = copy.deepcopy(user_id_list)
qustion_id = random.choice(user_id_list_copy)
user_id_list_copy.remove(qustion_id)
video_dic["user_id"] = qustion_id
create_time = datetime.datetime.strptime(res_json["NoteView"]["content"]["time"],
'%Y-%m-%d %H:%M')
video_dic["create_time"] = create_time.timestamp()
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")
# print(rpc_res)
video_dic["platform_question_id"] = pid
video_dic["content"] = desc_fix
video_dic["user_id"] = random.choice(user_id_list_copy)
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")
comment_list = []
try:
if res_json["NoteView"].get("comments"):
# print(res_json["NoteView"].get("data"))
for comment in res_json["NoteView"]["comments"]["data"]:
video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id']
comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0, 24),
minutes=random.randint(0, 60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
elif res_json["NoteView"].get("commentInfo"):
for comment in res_json["NoteView"]["commentInfo"]["comments"]:
video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id']
comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0,24),minutes=random.randint(0,60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
if comment_list:
rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
except Exception as e:
print("comment error")
print(e)
except Exception as e:
print(e)
continue
# break
# f.flush()
# f.close()
\ No newline at end of file
user_id_list_copy = copy.deepcopy(user_id_list)
question_id = random.choice(user_id_list_copy)
user_id_list_copy.remove(question_id)
video_dic["user_id"] = question_id
create_time = datetime.datetime.strptime(res_json["NoteView"]["content"]["time"],
'%Y-%m-%d %H:%M')
video_dic["create_time"] = create_time.timestamp()
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/question/batch_create")
# print(rpc_res)
video_dic["platform_question_id"] = pid
video_dic["content"] = desc_fix
video_dic["user_id"] = random.choice(user_id_list_copy)
rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/answer/batch_create")
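# Batch the note's comments as CIMS replies: each reply gets a random
# sock-puppet author and a timestamp offset up to roughly a day after the note.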
comment_list = []
try:
if res_json["NoteView"].get("comments"):
# print(res_json["NoteView"].get("data"))
for comment in res_json["NoteView"]["comments"]["data"]:
video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id']
comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0, 24),
minutes=random.randint(0, 60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
elif res_json["NoteView"].get("commentInfo"):
for comment in res_json["NoteView"]["commentInfo"]["comments"]:
video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id']
comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id
comment_id_list_copy.remove(comment_id)
video_dic["create_time"] = (create_time + datetime.timedelta(hours=random.randint(0,24),minutes=random.randint(0,60))).timestamp()
comment_list.append(copy.deepcopy(video_dic))
# rpc_res = post_single_data(copy.deepcopy(video_dic), "cims/reply/batch_create")
if comment_list:
rpc_res = post_muilty_data(comment_list, "cims/reply/batch_create")
except Exception as e:
print("comment error")
print(e)
except Exception as e:
print(e)
continue
rds.hdel("xiaohongshu",pid)
else:
time.sleep(5)
# -*- coding:UTF-8 -*-
# @Time : 2021/1/14 14:51
# @File : crawler_xiaohongshu.py
# @email : litao@igengmei.com
# @author : litao
import copy
import redis
import requests
import json
import datetime
import re
# from . import bulk_write_into_es
import hashlib
import time
from selenium import webdriver
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import output_result
from crawler.gm_upload.gm_upload import upload, upload_file
from selenium.webdriver import ActionChains
try:
from write_data_into_es.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random, urllib
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)
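# Crawler for xiaohongshu note pages. Parsed notes are staged into the
# "xiaohongshu" redis hash for the RPC consumer above; headless-Chrome
# options are prepared below, though the webdriver itself stays commented out.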
class Crawler_xiaohongshu():
def __init__(self, timeout=None, platform='xiaohongshu'):
if timeout is None:
self.timeout = 10
else:
self.timeout = timeout
self.platform = platform
self.TotalVideo_num = None
self.midstepurl = None
std_fields = Std_fields_video()
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
self.chrome_options = webdriver.ChromeOptions()
self.chrome_options.add_argument('--headless')
self.chrome_options.add_argument('--disable-gpu')
self.chrome_options.add_argument("--no-sandbox")
# proxies_dic = get_proxy(1)
# proxies_dic_list = proxies_dic["http"].split(":")
# proxy_server = '--proxy-server=http://{ip}:{port}'.format(ip=proxies_dic_list[1].replace("/",""), port=int(proxies_dic_list[2]))
# print(proxy_server)
# self.chrome_options.add_argument(proxy_server)
# self.chrome_options.add_argument('User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"')
# self.chrome_options.add_argument('accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"')
# self.chrome_options.add_argument('accept-encoding="gzip, deflate"')
# self.chrome_options.add_argument('accept-language="zh-CN,zh;q=0.9"')
# self.chrome_options.add_argument('cache-control="max-age=0"')
# self.chrome_options.add_argument('referer="https://www.xiaohongshu.com/explore"')
# self.chrome_options.add_argument('sec-fetch-dest="document"')
# self.chrome_options.add_argument('sec-fetch-mode="navigate"')
# self.chrome_options.add_argument('sec-fetch-user="?1"')
# self.chrome_options.add_argument('upgrade-insecure-requests="1"')
self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
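# Disable image loading in headless Chrome to speed up page fetches.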
prefs = {"profile.managed_default_content_settings.images": 2}
self.chrome_options.add_experimental_option("prefs", prefs)
# self.driver = webdriver.Chrome(options=self.chrome_options)
def __exit__(self, exc_type, exc_value, traceback):
self.driver.close()
def get_one_page(self, page_id, proxies=0,cookies={}):
url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
# "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
"referer": "https://www.xiaohongshu.com/explore",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}
res = retry_get_url(url, headers=headers, proxies=proxies,cookies=cookies)
res_text = res.text
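# The note data is embedded in the HTML as a window.__INITIAL_SSR_STATE__
# assignment; extract the JSON blob and map JS "undefined" to null so that
# json.loads can parse it.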
res_json_text = re.findall('__INITIAL_SSR_STATE__=(.*?)</script>',res_text, flags=re.DOTALL)[0]
# scope = {}
json_data = json.loads(res_json_text.replace("undefined","null"))
return json_data
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform="xiaohongshu", releaserUrl=releaserUrl)
def releaser_page(self, releaserUrl,
output_to_file=False,
filepath=None,
releaser_page_num_max=30,
output_to_es_raw=False,
es_index=None,
doc_type=None,
output_to_es_register=False,
push_to_redis=False, proxies_num=None, **kwargs):
# self.driver.get("https://www.xiaohongshu.com/")
# time.sleep(1)
# self.driver.implicitly_wait(2)
# self.driver.add_cookie(cookie_dict={'name': 'timestamp2', 'value': '2021010899964852bd70ca4c0c991c6c'})
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# # print(page_element)
# self.driver.get("https://www.xiaohongshu.com/explore")
# self.driver.implicitly_wait(2)
# time.sleep(1)
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# # print(page_element)
#
# self.driver.find_element_by_xpath("/html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]").click()
# self.driver.implicitly_wait(2)
# time.sleep(1)
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# print(page_element)
# cookie = self.driver.get_cookies()
#
# return True
# print(self.driver.get_log("performance"))
# cookie_dic={}
# for k in cookie:
# cookie_dic[k["name"]] = k["value"]
# print(cookie_dic)
headers = {
"host":"www.xiaohongshu.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
# "cookie": "xhsTrackerId=a81077f9-661a-4731-c790-ac6fbbeaa44b; extra_exp_ids=gif_clt1,ques_exp2; xhsuid=2EFsw5qOMk70l1we; timestamp2=2021010899964852bd70ca4c0c991c6c; timestamp2.sig=Lj3xTHgJ-JO20IUULPRnAhACddlzUtd7AsUzrlJQbWc; xhs_spses.5dde=*; xhsTracker=url=index&searchengine=baidu; xhs_spid.5dde=4dc700089fbdde46.1610082780.1.1610083480.1610082780.d70776d0-eac9-4684-912e-130f0cdb86a1",
"pragma": "no-cache",
"sec-ch-ua": '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
"sec-ch-ua-mobile": "?0",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
}
count = 1
# has_more = True
retry_time = 0
result_list = []
releaser_id = self.get_releaser_id(releaserUrl)
releaserUrl = 'http://www.xiaohongshu.com/user/profile/%s' % releaser_id
pcursor = 0
cookie_dic = {'timestamp2': '2021010899964852bd70ca4c0c991c6c'}
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
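# Note: the "count <= 1" guard below limits the crawl to the first profile
# page regardless of releaser_page_num_max.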
while count <= releaser_page_num_max and count <= 1:
try:
print(releaserUrl)
res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num, cookies=cookie_dic)
except:
continue
# print(get_page.content)
# time.sleep(random.randint(1, 2))
page_text = res.text
# print(page_text)
data_list = re.findall("window.__INITIAL_SSR_STATE__=(.*?)</script>", page_text)[0]
# print(data_list)
data_json = json.loads(data_list.replace("undefined","null"))
# # print(data_list)
if data_json:
print("get data at releaser: %s page: %s" % (releaser_id, count))
count += 1
for info_dic in data_json["Main"]["notesDetail"]:
video_dic = {}
page_id = info_dic["id"]
title = info_dic["title"]
desc = info_dic.get("desc","")
time_ts = datetime.datetime.strptime(info_dic["time"],'%Y-%m-%d %H:%M').timestamp()
if info_dic["type"] != "normal":
continue
page_data = self.get_one_page(page_id,proxies=proxies_num,cookies=cookie_dic)
print(page_data)
answer = desc
try:
pid = page_data["NoteView"]["content"]["id"]
except:
print("get pid error")
continue
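# Stage the parsed note for the redis-to-rpc consumer, deduping on pid.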
if rds.hexists("xiaohongshu",pid):
continue
rds.hset("xiaohongshu",key=pid,value=json.dumps(page_data))
yield page_data
# break
def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
count_false = 0
for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
video_time = res["release_time"]
# print(res)
if video_time:
if start_time < video_time:
if video_time < end_time:
yield res
else:
count_false += 1
if count_false > allow:
break
else:
yield res
if __name__ == '__main__':
test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list =[
]
for url in url_list:
print(url)
# releaser_page is a generator; iterate to actually crawl and stage pages.
for page_data in test.releaser_page(url, proxies_num=0):
print(page_data)