Commit bc83352f
Authored Jan 08, 2021 by litaolemo
Commit message: update
Parent: 8ae3640f
Showing 3 changed files with 27 additions and 26 deletions:

  crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py   (+2, -2)
  crawler_sys/site_crawler/crawler_xiaohongshu.py            (+24, -24)
  crawler_sys/utils/output_results.py                        (+1, -0)
File: crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py
```diff
@@ -42,7 +42,7 @@ rds = func_get_redis()
 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()
-        proxies = {"http": "http://{}".format(one_proxy), "https": "https://{}".format(one_proxy)}
+        proxies = {"http": "http://{}".format(one_proxy), "https": "http://{}".format(one_proxy)}
         return proxies
     except Exception as e:
         print(e)
```
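The fix in this hunk is small but consequential: in a requests-style proxies mapping, the `"http"`/`"https"` keys select the *target* URL's scheme, while the value is the proxy's own address. A plain HTTP proxy is normally written with an `http://` scheme even under the `"https"` key, because requests tunnels TLS traffic through it with CONNECT; the old `"https://{}"` value instead told requests to speak TLS to the proxy itself. A minimal sketch, assuming a placeholder proxy address rather than one from this pool:

```python
import requests

one_proxy = "127.0.0.1:8888"  # placeholder host:port, as stored in Redis
proxies = {
    "http": "http://{}".format(one_proxy),
    # An HTTP proxy also carries https:// targets via CONNECT tunneling,
    # so the proxy URL keeps the http:// scheme under the "https" key.
    "https": "http://{}".format(one_proxy),
}
resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.json())
```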
```diff
@@ -100,7 +100,7 @@ def func_get_proxy_to_redis():
             rds.set(key, key, ex=int(seconds[key]) - 3)
         except Exception as e:
             print(e)
-    return ips[0]
+    return {"http": "http://{}".format(ips[0]), "https": "http://{}".format(ips[0])}


 def proxy_test(proxies):
```
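This hunk makes `func_get_proxy_to_redis` return a ready-to-use proxies mapping instead of a bare `ip:port` string, matching the shape `get_proxy_from_redis` produces above. The surrounding context also shows the pool's TTL trick: each proxy is stored with an expiry a few seconds shorter than its (presumably vendor-reported) lease, so `randomkey()` cannot hand back an address that is about to die. A minimal sketch of that pattern, assuming a local Redis and illustrative proxy addresses and lease times:

```python
import redis

rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)

# proxy -> remaining lease in seconds (illustrative values)
leases = {"1.2.3.4:8000": 60, "5.6.7.8:8000": 120}
for proxy, seconds in leases.items():
    # Expire 3 s early so a picked proxy is never on the edge of dying,
    # mirroring rds.set(key, key, ex=int(seconds[key]) - 3) above.
    rds.set(proxy, proxy, ex=int(seconds) - 3)

one_proxy = rds.randomkey()  # random pick from whatever is still alive
if one_proxy is not None:
    addr = one_proxy.decode()
    print({"http": "http://{}".format(addr), "https": "http://{}".format(addr)})
```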
File: crawler_sys/site_crawler/crawler_xiaohongshu.py
```diff
@@ -27,7 +27,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
 # from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
 import random, urllib
-from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
+# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data

 user_id_list = [29865245,
                 36426151,
```
```diff
@@ -64,7 +64,7 @@ class Crawler_xiaohongshu():
         self.video_data = std_fields.video_data
         self.video_data['platform'] = self.platform
         self.chrome_options = webdriver.ChromeOptions()
-        self.chrome_options.add_argument('--headless')
+        # self.chrome_options.add_argument('--headless')
         # self.chrome_options.add_argument('--disable-gpu')
         # self.chrome_options.add_argument("--no-sandbox")
         self.chrome_options.add_argument('User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"')
```
```diff
@@ -82,7 +82,7 @@ class Crawler_xiaohongshu():
         self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
         prefs = {"profile.managed_default_content_settings.images": 2}
         self.chrome_options.add_experimental_option("prefs", prefs)
-        self.driver = webdriver.Chrome(options=self.chrome_options)
+        # self.driver = webdriver.Chrome(options=self.chrome_options)

     def __exit__(self):
         self.driver.close()
```
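The line-64 and line-82 hunks together disable the Selenium path: both the `--headless` flag and the `webdriver.Chrome` construction are commented out, while the `ChromeOptions` setup (image blocking via `prefs`, a mobile Safari User-Agent) stays in place. For reference, a minimal self-contained sketch of this driver setup in the Selenium 3-era style the file's `find_element_by_xpath` calls imply; note that Chrome's stock flag is `--user-agent=...`, whereas the code above passes a bare `User-Agent="..."` argument:

```python
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run without a visible window
# Block image loading to cut page weight: 2 = block, 1 = allow.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS '
                            '13_2_3 like Mac OS X) AppleWebKit/605.1.15 '
                            '(KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1')

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://www.xiaohongshu.com/")
    print(driver.title)
finally:
    driver.quit()
```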
```diff
@@ -137,30 +137,30 @@ class Crawler_xiaohongshu():
                       doc_type=None,
                       output_to_es_register=False,
                       push_to_redis=False,
                       proxies_num=None,
                       **kwargs):
-        self.driver.get("https://www.xiaohongshu.com/")
-        self.driver.implicitly_wait(2)
-        time.sleep(1)
-        self.driver.get(releaserUrl)
-        self.driver.implicitly_wait(2)
-        time.sleep(1)
-        self.driver.refresh()
-        page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
-        print(page_element)
-        cookie = self.driver.get_cookies()
+        # self.driver.get("https://www.xiaohongshu.com/")
+        # self.driver.implicitly_wait(2)
+        # time.sleep(1)
+        # self.driver.get(releaserUrl)
+        # self.driver.implicitly_wait(2)
+        # time.sleep(1)
+        #
+        #
+        # self.driver.refresh()
+        # page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
+        # print(page_element)
+        # cookie = self.driver.get_cookies()
         # print(self.driver.get_log("performance"))
-        cookie_dic = {}
-        for k in cookie:
-            cookie_dic[k["name"]] = k["value"]
-        print(cookie_dic)
+        # cookie_dic={}
+        # for k in cookie:
+        #     cookie_dic[k["name"]] = k["value"]
+        # print(cookie_dic)
         headers = {
             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
             "accept-encoding": "gzip, deflate",
             "accept-language": "zh-CN,zh;q=0.9",
             "cache-control": "max-age=0",
-            "cookie": "xhsTracker=url=user-profile&xhsshare=CopyLink; xhsTrackerId=38ec6dcb-d882-45e8-c539-834386696a14; xhsuid=Dv4OTnGbDg9LivGu; timestamp2=202101062497d4bed842476b2618e0ea; timestamp2.sig=-Jax1vd_iNZtToaWYMOMoFUmCJwojKQPnfP8iMeOpAc; extra_exp_ids=gif_clt1,ques_clt1; xhs_spses.5dde=*; xhs_spid.5dde=59a50d47116c4333.1609921946.2.1609997747.1609922354.16a344a9-3e6e-43d8-b894-33a7205e181f",
+            # "cookie": "xhsTracker=url=user-profile&xhsshare=CopyLink; xhsTrackerId=38ec6dcb-d882-45e8-c539-834386696a14; xhsuid=Dv4OTnGbDg9LivGu; timestamp2=202101062497d4bed842476b2618e0ea; timestamp2.sig=-Jax1vd_iNZtToaWYMOMoFUmCJwojKQPnfP8iMeOpAc; extra_exp_ids=gif_clt1,ques_clt1; xhs_spses.5dde=*; xhs_spid.5dde=59a50d47116c4333.1609921946.2.1609997747.1609922354.16a344a9-3e6e-43d8-b894-33a7205e181f",
             "referer": releaserUrl,
             "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
```
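The deleted block drove a real browser to the profile page and flattened `driver.get_cookies()` (a list of cookie dicts) into the `{name: value}` mapping that requests expects; the commit retires that in favor of a hardcoded `cookie_dic` in the next hunk. A minimal sketch of the hand-off being removed, assuming a `driver` built as in the previous sketch and already navigated to the target site:

```python
import requests
# `driver` is assumed to be a live Selenium session, e.g. from the
# ChromeOptions sketch above, already on the target page.

cookie_dic = {}
for k in driver.get_cookies():           # each k is {"name": ..., "value": ..., ...}
    cookie_dic[k["name"]] = k["value"]   # keep only what requests needs

resp = requests.get("https://www.xiaohongshu.com/",
                    cookies=cookie_dic, timeout=10)
print(resp.status_code)
```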
```diff
@@ -178,12 +178,12 @@ class Crawler_xiaohongshu():
         releaserUrl = 'https://www.xiaohongshu.com/user/profile/%s' % releaser_id
         self.video_data['releaserUrl'] = releaserUrl
         pcursor = 0
+        cookie_dic = {'timestamp2.sig': 'QaPtkKr8VeAbx324ZSJgUSeLhjE2Lj1kDhdmZReaewo',
+                      'timestamp2': '20210108b8c577995da3b1aa5e9a7392',
+                      'xhsuid': 'cqq3glNpFsMgH50j',
+                      'xhs_spses.5dde': '*',
+                      'xhs_spid.5dde': 'fa1043ce96194610.1610072893.1.1610072895.1610072893.3536bab9-1e85-4a3a-8a46-37e694100de1',
+                      'extra_exp_ids': 'gif_clt1,ques_clt1',
+                      'xhsTrackerId': '591fba69-1884-4ab2-ca05-9ae70ab77d2e'}
         # print(proxies)
         # proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
         while count <= releaser_page_num_max and count <= 1000:
             try:
-                res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num)
+                res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num, cookies=cookie_dic)
             except:
                 continue
             # print(get_page.content)
```
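Passing `cookies=cookie_dic` works here only because `retry_get_url` forwards unrecognized keyword arguments straight to `requests.get` (see the output_results.py hunk below), so any `requests.get` parameter can ride along. A minimal sketch of that pass-through, using a hypothetical `fetch` stand-in and a placeholder cookie value:

```python
import requests

def fetch(url, timeout=10, **kwargs):
    # Stand-in for retry_get_url's happy path: **kwargs carries cookies=...
    return requests.get(url, timeout=timeout, **kwargs)

resp = fetch("https://httpbin.org/cookies",
             cookies={"timestamp2": "placeholder-value"})
print(resp.json())  # echoes the cookies the server received
```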
```diff
@@ -216,11 +216,11 @@ class Crawler_xiaohongshu():
                     # video_dic["content"] = anwser
                     video_dic["user_id"] = random.choice(user_id_list)
                     video_dic["create_time"] = time_ts
-                    rpc_res = post_data(video_dic, "cims/question/batch_create")
+                    # rpc_res = post_data(video_dic,"cims/question/batch_create")
                     print(res)
                     break

 if __name__ == '__main__':
     test = Crawler_xiaohongshu()
     releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
-    res = test.releaser_page(releaserurl, proxies_num=0)
+    res = test.releaser_page(releaserurl, proxies_num=1)
```
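The RPC hand-off to `cims/question/batch_create` is switched off here, together with its import in the line-27 hunk. `rpc_data_to_answer.py` is not part of this diff, so the sketch below is only a hypothetical reading of what a `post_data(data, path)` helper of this shape typically does; the base URL and payload handling are assumptions, not the repo's code:

```python
import requests

RPC_BASE = "http://rpc.example.internal/"  # placeholder; the real host is not in this diff

def post_data(data, path):
    """POST one record to an internal RPC route, e.g. 'cims/question/batch_create'."""
    resp = requests.post(RPC_BASE + path, json=data, timeout=10)
    resp.raise_for_status()
    return resp.json()
```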
File: crawler_sys/utils/output_results.py
```diff
@@ -338,6 +338,7 @@ def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
         try:
             if proxies:
                 proxies_dic = get_proxy(proxies)
+                print(proxies_dic)
                 if not proxies_dic:
                     get_resp = requests.get(url, timeout=timeout, **kwargs)
                 else:
```
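The one-line addition drops a `print(proxies_dic)` into `retry_get_url` so each attempt logs which proxy it drew. Reconstructed from the visible signature and context, a hedged sketch of the function's overall shape; the real implementation in output_results.py may differ beyond this hunk:

```python
import requests

def get_proxy(proxies_num):
    # Placeholder for the proxy-pool helper imported in the real file; it
    # returns a requests-style proxies dict, or None/{} when the pool is empty.
    return None

def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
    for _ in range(retrys):
        try:
            if proxies:
                proxies_dic = get_proxy(proxies)
                print(proxies_dic)  # the line this commit adds
                if not proxies_dic:
                    return requests.get(url, timeout=timeout, **kwargs)
                return requests.get(url, timeout=timeout,
                                    proxies=proxies_dic, **kwargs)
            return requests.get(url, timeout=timeout, **kwargs)
        except requests.RequestException:
            continue  # retry on network errors
    return None  # all retries failed; the crawler's callers guard with try/except
```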