update

f247fe38 · litaolemo · d817d81c · f247fe38 · f247fe38
Commit f247fe38 authored Jan 20, 2021 by litaolemo
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 8 deletions

xiaohongshu_to_rpc.py crawler_sys/scheduler/xiaohongshu_to_rpc.py +2 -1

crawler_xiaohongshu.py crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py +9 -7

No files found.
--- a/crawler_sys/scheduler/xiaohongshu_to_rpc.py
+++ b/crawler_sys/scheduler/xiaohongshu_to_rpc.py
@@ -5,6 +5,7 @@
 # @author : litao
 import copy
 import datetime
+import hashlib
 import random
 import time

@@ -153,7 +154,7 @@ def xiaohongshu_xiaochengxu(res_json):
                # print(res_json["NoteView"].get("data"))
                for comment in res_json["data"]["commentList"]:
                    video_dic["content"] = comment['content']
-                    video_dic["platform_id"] = comment['id']
+                    video_dic["platform_id"] = hashlib.md5((str(comment['user']['id']) + comment['content']).encode("utf8")).hexdigest()  # join as text, then encode once: md5() needs bytes and str + bytes raises TypeError
                    comment_id_list_copy = copy.deepcopy(majiayonghu_list)
                    comment_id = random.choice(comment_id_list_copy)
                    video_dic["user_id"] = comment_id

--- a/crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
+++ b/crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
@@ -141,7 +141,7 @@ class Crawler_xiaohongshu():
        releaser_id = self.get_releaser_id(releaserUrl)

        # proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
-        while count <= releaser_page_num_max and count <= 1:
+        while count <= releaser_page_num_max:
            releaserUrl = "https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/user/%s/notes?page=%s&page_size=15" % (releaser_id,str(count))
            sign = releaserUrl.replace("https://www.xiaohongshu.com", "") + "WSUDD"
            pid = "X" + hashlib.md5(sign.encode("utf8")).hexdigest()
@@ -165,7 +165,7 @@ class Crawler_xiaohongshu():

            time.sleep(random.randint(1, 2))
            data_list = res.json()
-            if data_list["code"] != 0 or not data_list["success"]:
+            if not data_list["data"]:
                break
            if data_list:
                print("get data at releaser: %s page: %s" % (releaser_id, count))
@@ -179,8 +179,7 @@ class Crawler_xiaohongshu():
                    time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
                    page_data = self.get_one_page_xiaochengxu(page_id, proxies=proxies_num)
                    # print(page_data)
-
-                    rds.hset("xiaohongshu", key=pid, value=json.dumps(page_data))
+                    rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
                    yield page_data

    def releaser_page_by_pc(self, releaserUrl,
@@ -284,7 +283,7 @@ class Crawler_xiaohongshu():
                        continue
                    if rds.hexists("xiaohongshu",pid):
                        continue
-                    rds.hset("xiaohongshu",key=pid,value=json.dumps(page_data))
+                    rds.hset("xiaohongshu",key=page_id,value=json.dumps(page_data))
                    yield page_data
                    # break

@@ -313,7 +312,7 @@ if __name__ == '__main__':
    test = Crawler_xiaohongshu()
    releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
    url_list = [
-        "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
+        # "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
        "https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
        "https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
        "https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
@@ -418,8 +417,11 @@ if __name__ == '__main__':
        'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
        'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
    ]
+    count =0
    for url in url_list:
        print(url)
        res = test.releaser_page(url,proxies_num=0)
        for r in res:
-            print(r)
+            count += 1
+            print(count)
+            # pass