Commit f247fe38 authored by litaolemo

update

parent d817d81c
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
# @author : litao # @author : litao
import copy import copy
import datetime import datetime
import hashlib
import random import random
import time import time
...@@ -153,7 +154,7 @@ def xiaohongshu_xiaochengxu(res_json): ...@@ -153,7 +154,7 @@ def xiaohongshu_xiaochengxu(res_json):
# print(res_json["NoteView"].get("data")) # print(res_json["NoteView"].get("data"))
for comment in res_json["data"]["commentList"]: for comment in res_json["data"]["commentList"]:
video_dic["content"] = comment['content'] video_dic["content"] = comment['content']
video_dic["platform_id"] = comment['id'] video_dic["platform_id"] = hashlib.md5(comment['user']['id']+comment['content'].encode("utf8")).hexdigest()
comment_id_list_copy = copy.deepcopy(majiayonghu_list) comment_id_list_copy = copy.deepcopy(majiayonghu_list)
comment_id = random.choice(comment_id_list_copy) comment_id = random.choice(comment_id_list_copy)
video_dic["user_id"] = comment_id video_dic["user_id"] = comment_id
......
...@@ -141,7 +141,7 @@ class Crawler_xiaohongshu(): ...@@ -141,7 +141,7 @@ class Crawler_xiaohongshu():
releaser_id = self.get_releaser_id(releaserUrl) releaser_id = self.get_releaser_id(releaserUrl)
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'} # proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
while count <= releaser_page_num_max and count <= 1: while count <= releaser_page_num_max:
releaserUrl = "https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/user/%s/notes?page=%s&page_size=15" % (releaser_id,str(count)) releaserUrl = "https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/user/%s/notes?page=%s&page_size=15" % (releaser_id,str(count))
sign = releaserUrl.replace("https://www.xiaohongshu.com", "") + "WSUDD" sign = releaserUrl.replace("https://www.xiaohongshu.com", "") + "WSUDD"
pid = "X" + hashlib.md5(sign.encode("utf8")).hexdigest() pid = "X" + hashlib.md5(sign.encode("utf8")).hexdigest()
...@@ -165,7 +165,7 @@ class Crawler_xiaohongshu(): ...@@ -165,7 +165,7 @@ class Crawler_xiaohongshu():
time.sleep(random.randint(1, 2)) time.sleep(random.randint(1, 2))
data_list = res.json() data_list = res.json()
if data_list["code"] != 0 or not data_list["success"]: if not data_list["data"]:
break break
if data_list: if data_list:
print("get data at releaser: %s page: %s" % (releaser_id, count)) print("get data at releaser: %s page: %s" % (releaser_id, count))
...@@ -179,8 +179,7 @@ class Crawler_xiaohongshu(): ...@@ -179,8 +179,7 @@ class Crawler_xiaohongshu():
time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp() time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
page_data = self.get_one_page_xiaochengxu(page_id, proxies=proxies_num) page_data = self.get_one_page_xiaochengxu(page_id, proxies=proxies_num)
# print(page_data) # print(page_data)
rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
rds.hset("xiaohongshu", key=pid, value=json.dumps(page_data))
yield page_data yield page_data
def releaser_page_by_pc(self, releaserUrl, def releaser_page_by_pc(self, releaserUrl,
...@@ -284,7 +283,7 @@ class Crawler_xiaohongshu(): ...@@ -284,7 +283,7 @@ class Crawler_xiaohongshu():
continue continue
if rds.hexists("xiaohongshu",pid): if rds.hexists("xiaohongshu",pid):
continue continue
rds.hset("xiaohongshu",key=pid,value=json.dumps(page_data)) rds.hset("xiaohongshu",key=page_id,value=json.dumps(page_data))
yield page_data yield page_data
# break # break
...@@ -313,7 +312,7 @@ if __name__ == '__main__': ...@@ -313,7 +312,7 @@ if __name__ == '__main__':
test = Crawler_xiaohongshu() test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae' releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list = [ url_list = [
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae", # "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
"https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3", "https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
"https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86", "https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
"https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740", "https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
...@@ -418,8 +417,11 @@ if __name__ == '__main__': ...@@ -418,8 +417,11 @@ if __name__ == '__main__':
'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07', 'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65', 'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
] ]
count =0
for url in url_list: for url in url_list:
print(url) print(url)
res = test.releaser_page(url,proxies_num=0) res = test.releaser_page(url,proxies_num=0)
for r in res: for r in res:
print(r) count += 1
print(count)
# pass
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment