backend / crawler · Commits

Commit 0aaa57e6 authored Jan 20, 2021 by litaolemo

update

parent 5223e9cc

Showing 1 changed file with 11 additions and 10 deletions

crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py  (+11, -10)
@@ -16,7 +16,6 @@ import requests
 import json
 import datetime
 import re
 # from . import bulk_write_into_es
 import hashlib
 import time
 from selenium import webdriver
@@ -29,12 +28,11 @@ from selenium import webdriver
 try:
     from write_data_into_es.func_get_releaser_id import *
 except:
     from func_get_releaser_id import *
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
 from crawler.write_data_into_es.func_get_releaser_id import *
 # from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
 import random, urllib
 # from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
 rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)

 class Crawler_xiaohongshu():
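The `rds` handle above is a plain redis-py client pointed at an internal host; `decode_responses=True` makes reads come back as `str` rather than `bytes`, which is what lets the crawler round-trip JSON through a hash with no explicit decoding. A minimal sketch of that round-trip, assuming a local Redis in place of the internal 172.18.51.10 box (the sample `page_id` and payload are made up):

import json
import redis

# Same connection shape as the diff, but pointed at localhost for illustration.
rds = redis.StrictRedis(host='localhost', port=6379, db=20, decode_responses=True)

page_id = '5fe1c1ba0000000001006e65'        # hypothetical note id
page_data = {'platform': 'xiaohongshu',     # made-up sample payload
             'release_time': 1611138600000}

# Store one note under the "xiaohongshu" hash, keyed by note id, as the diff does.
rds.hset('xiaohongshu', key=page_id, value=json.dumps(page_data))

# decode_responses=True returns str, so json.loads works directly on hget.
assert json.loads(rds.hget('xiaohongshu', page_id)) == page_data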
@@ -69,16 +67,16 @@ class Crawler_xiaohongshu():
         # self.chrome_options.add_argument('sec-fetch-user="?1"')
         # self.chrome_options.add_argument('upgrade-insecure-requests="1"')
-        self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+        # self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
         self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
         prefs = {"profile.managed_default_content_settings.images": 2}
-        self.chrome_options.add_experimental_option("prefs", prefs)
+        # self.chrome_options.add_experimental_option("prefs", prefs)
         # self.driver = webdriver.Chrome(options=self.chrome_options)

     def __exit__(self):
-        self.driver.close()
+        # self.driver.close()
         pass

     def get_one_page_xiaochengxu(self, page_id, proxies=0):
         url = "https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/%s/single_feed" % page_id
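`get_one_page_xiaochengxu` builds the WeChat-mini-program feed URL shown on the last line of the hunk; its body is not part of this diff. A sketch of what a plain-requests fetch of that endpoint could look like, assuming the endpoint returns JSON; the helper name, headers, timeout, and error handling here are illustrative, not the crawler's actual code:

import requests

def fetch_single_feed(page_id, proxies=None):
    # Endpoint shape taken from the diff; everything else is an assumption.
    url = ("https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/"
           "%s/single_feed" % page_id)
    headers = {
        # Browser-like UA; the real class drives Chrome through selenium instead.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    resp.raise_for_status()
    return resp.json()  # the response schema is not shown in this diff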
@@ -178,8 +176,10 @@
                     continue
                 time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
                 page_data = self.get_one_page_xiaochengxu(page_id, proxies=proxies_num)
                 page_data['release_time'] = int(time_ts * 1e3)
                 page_data['platform'] = 'xiaohongshu'
                 # print(page_data)
-                rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
+                # rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
                 yield page_data

     def releaser_page_by_pc(self, releaserUrl,
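The hunk's date handling is worth spelling out: the scraped `info_dic["time"]` string is parsed with `strptime`, converted to a Unix timestamp, and scaled to epoch milliseconds with `int(time_ts * 1e3)`, the same `* 1e3` convention used for `self.timestamp` earlier. The same conversion in isolation (the sample string is made up):

import datetime

# Sample value in the '%Y-%m-%d %H:%M' shape the crawler parses.
info_time = '2021-01-20 18:30'
time_ts = datetime.datetime.strptime(info_time, '%Y-%m-%d %H:%M').timestamp()

# Epoch milliseconds, matching page_data['release_time'] in the diff. Note that
# .timestamp() interprets the naive datetime in the crawling host's local timezone.
release_time = int(time_ts * 1e3)
print(release_time)  # 1611138600000 when the host runs in Asia/Shanghai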
@@ -312,7 +312,7 @@ if __name__ == '__main__':
     test = Crawler_xiaohongshu()
     releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
     url_list = [
-        # "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
+        "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
         "https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
         "https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
         "https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
@@ -417,6 +417,7 @@ if __name__ == '__main__':
         'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
+        'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
     ]
     print(len(url_list))
     count = 0
     for url in url_list:
         print(url)