Chengyang Zhong / crawler · Commits

Commit 4362749f, authored 4 years ago by litaolemo
Commit message: update
Parent: fee41916
Branches: master, xiangwan
Showing 11 changed files with 166 additions and 242 deletions (+166 −242)
Changed files:
  crawler_sys/framework/redis_interact.py                          +8   −3
  crawler_sys/framework/search_page_single_process.py              +89  −45
  crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py          +16  −2
  crawler_sys/scheduler/generate_redis_url_batch.py                +2   −2
  crawler_sys/site_crawler/crawler_toutiao.py                      +7   −7
  crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py          +12  −2
  crawler_sys/site_crawler/crawler_zhihu.py                        +17  −171
  crawler_sys/utils/output_results.py                              +11  −7
  requirements.txt                                                 +1   −1
  write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py   +1   −1
  write_data_into_es/func_cal_doc_id.py                            +2   −1
crawler_sys/framework/redis_interact.py

@@ -8,11 +8,16 @@ import redis, json
 from crawler_sys.framework.platform_redis_register import get_redis_list_name
 from crawler_sys.framework.es_crawler import scan_crawler_url_register

-rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
+def redis_path(redis_type=""):
+    if redis_type == "on_line":
+        rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
+    else:
+        rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
+    return rds

-def feed_url_into_redis(dict_Lst, expire=0,):
+def feed_url_into_redis(dict_Lst, expire=0, rds=redis_path):
     """
     release_time_lower_bdr must be an int value represent
     timestamp in milliseconds if given.
     ...
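Note on the change above: the module-level StrictRedis connection is replaced by redis_path(), which returns the 172.16.40.164 instance for "on_line" and the 172.18.51.10 instance otherwise, and feed_url_into_redis now takes the connection as an rds argument (its default is the redis_path function object itself, so callers are expected to pass a resolved connection). A minimal usage sketch, assuming only the names shown in the hunk; the URL dict below is a hypothetical payload:

    from crawler_sys.framework.redis_interact import redis_path, feed_url_into_redis

    rds = redis_path("on_line")                                            # production Redis; any other value selects 172.18.51.10
    url_dicts = [{"url": "https://example.com/a", "platform": "toutiao"}]  # hypothetical payload
    feed_url_into_redis(url_dicts, expire=3600, rds=rds)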
crawler_sys/framework/search_page_single_process.py

@@ -16,20 +16,18 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
 #                         '/crawler_sys/framework/config'
 #                         '/search_keywords.ini'),
 #                     help=('config file absolute path'))
-PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
+PARSER.add_argument('-p', '--platform', default=["toutiao", "weibo", "zhihu"], action='append',
                     help=('legal platform name is required'))
 PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                     help=('key_word_legal platform name is required'))
-PARSER.add_argument('-w', '--output_to_es_raw', default=True,
+PARSER.add_argument('-w', '--output_to_es_raw', default=False,
                     help=('output to es raw'))
-PARSER.add_argument('-g', '--output_to_es_register', default=False,
+PARSER.add_argument('-g', '--output_to_es_register', default=True,
                     help=('output to es register'))
+PARSER.add_argument('-n', '--maxpage', default=20,
+                    help=('maxpage'))
 ARGS = PARSER.parse_args()
 es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                              http_auth=('crawler', 'XBcasfo8dgfs'))

@@ -41,45 +39,92 @@ if ARGS.platform != []:
 #             "program will exit" % platform)
 #         sys.exit(0)
 # CONFIG = configparser.ConfigParser()
 # CONFIG.read(ARGS.conf, encoding='utf-8')
 OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
 OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
 #
 # def func_search_keywordlist(platform):
 #     search_body = {"query": {"bool": {"filter": []}}}
 #     search_resp = es_framework.search(index=index_target_releaser,
 #                                       doc_type=doc_type_target_releaser,
 #                                       body=search_body,
 #                                       size=0,
 #                                       request_timeout=100)
 #     total_hit = search_resp['hits']['total']
 #     releaser_dic = {}
 #     if total_hit > 0:
 #         print('Got %d releaser for platform %s.' % (total_hit, platform))
 #         scan_resp = scan(client=es_framework, query=search_body,
 #                          index=index_target_releaser,
 #                          doc_type=doc_type_target_releaser,
 #                          request_timeout=200)
 #         for line in scan_resp:
 #             try:
 #                 title = line['_source']['title']
 #                 page = line['_source']['page']
 #                 releaser_dic[title] = page
 #             except:
 #                 print('error in :', line)
 #                 continue
 #     else:
 #         print('Got zero hits.')
 #     return releaser_dic
 def func_search_keywordlist(platform):
-    search_body = {"query": {"bool": {"filter": []}}}
-    search_resp = es_framework.search(index=index_target_releaser,
-                                      doc_type=doc_type_target_releaser,
-                                      body=search_body,
-                                      size=0,
-                                      request_timeout=100)
-    total_hit = search_resp['hits']['total']
-    releaser_dic = {}
-    if total_hit > 0:
-        print('Got %d releaser for platform %s.' % (total_hit, platform))
-        scan_resp = scan(client=es_framework, query=search_body,
-                         index=index_target_releaser,
-                         doc_type=doc_type_target_releaser,
-                         request_timeout=200)
-        for line in scan_resp:
-            try:
-                title = line['_source']['title']
-                page = line['_source']['page']
-                releaser_dic[title] = page
-            except:
-                print('error in :', line)
-                continue
-    else:
-        print('Got zero hits.')
-    return releaser_dic
-if OUTPUT_TO_ES_RAW is True:
-    ES_INDEX = 'test2'
-    DOC_TYPE = 'doc'
-    print(ES_INDEX, DOC_TYPE)
+    res_dic = {}
+    res_list = ["比基尼线脱毛", "嗨体泪沟", "根据脸型选发型", "圆脸适合什么发型", "5热玛吉",
+                "耳软骨假体鼻综合", "肉毒素去法令纹", "吸脂瘦腹部", "嗨体填充泪沟", "6d小脸针",
+                "水剥离", "嗨体去颈纹", "胶原蛋白填充泪沟", "吸脂瘦全身", "肉毒素去狐臭",
+                "吸脂瘦腰部", "fotona4d", "嘴综合", "胸部下垂矫正", "5g天使光雕",
+                "唇综合", "SVF-gel脂肪胶", "嘴角上扬术", "嗨体注射", "脂肪填充修复",
+                "比基尼脱毛", "lams吸脂", "脂肪填充面部年轻化", "嗨体", "吸脂祛副乳",
+                "m22", "胸部提升", "fotona", "O型腿矫正", "肋骨鼻",
+                "欣颜", "唯颜", "垫眉骨", "咬肌切除", "背部吸脂",
+                "m22王者之冠", "bbl", "胶原蛋白填充祛黑眼圈", "热玛吉", "热玛吉5代",
+                ]
+    for l in res_list:
+        res_dic[l] = 1
+    return res_dic
+
+ES_INDEX = 'crawler-data-raw'
+print(ES_INDEX)
+pages = ARGS.maxpage
 for platform in PLATFORM_LIST:

@@ -92,12 +137,11 @@ for platform in PLATFORM_LIST:
         print("search keyword '%s' on platform %s" % (keyword, platform))
         search_pages = int(KEYWORD_dic[keyword])
         try:
-            if platform != "腾讯新闻":
-                crawler.search_page(keyword=keyword,
-                                    search_pages_max=search_pages,
-                                    output_to_es_raw=OUTPUT_TO_ES_RAW,
-                                    output_to_es_register=OUTPUT_TO_ES_REGISTER,
-                                    es_index=ES_INDEX,)
+            crawler.search_page(keyword=keyword,
+                                search_pages_max=search_pages,
+                                output_to_es_raw=OUTPUT_TO_ES_RAW,
+                                output_to_es_register=OUTPUT_TO_ES_REGISTER,
+                                es_index=ES_INDEX,)
         except Exception as e:
             print(e)
     ...
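With the defaults flipped above (raw output off, register output on) and the keyword source replaced by a fixed dictionary, a quick check of the new keyword function would behave roughly as below; this sketch assumes the hard-coded variant shown above is the one that ends up live in this module:

    KEYWORD_dic = func_search_keywordlist("toutiao")   # the platform argument is not used by the hard-coded variant
    print(KEYWORD_dic["热玛吉"])                        # every keyword maps to a page count of 1
    for keyword in KEYWORD_dic:
        search_pages = int(KEYWORD_dic[keyword])       # the crawl loop reads the page count like this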
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py

@@ -10,7 +10,7 @@
 """
 import redis, random
 import kdl, requests
+import sys
 # from redis.sentinel import Sentinel
 # sentinel = Sentinel([('192.168.17.65', 26379),

@@ -23,9 +23,23 @@ import kdl, requests
 # slave = sentinel.discover_slaves('ida_redis_master')
 # # 连接数据库
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True, password='ReDis!GmTx*0aN12')
+def func_get_redis():
+    sys_path = sys.path
+    for p in sys_path:
+        if "C:\\" in p:
+            stats = "test"
+            break
+    if stats == "on_line":
+        rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True,
+                                password='ReDis!GmTx*0aN12')
+    else:
+        rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
+    return rds
+
+rds = func_get_redis()

 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()
     ...
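The new func_get_redis() inspects sys.path and treats a path containing "C:\\" as a test machine; note that stats is only assigned inside that check, so the "on_line" comparison below it relies on stats being set elsewhere. A minimal sketch of how the module is consumed, using only names visible in the hunk:

    rds = func_get_redis()          # resolved once at import time, as shown above
    one_proxy = rds.randomkey()     # get_proxy_from_redis() draws a random proxy key like this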
crawler_sys/scheduler/generate_redis_url_batch.py

@@ -20,11 +20,11 @@ parser.add_argument('-d', '--days_from_now', default=30, type=int,
                           'default 30.'))
 args = parser.parse_args()

 def redis_url_batch_gen(platform, batch_str, release_time_lower_bdr):
     url_Lst = pull_url_from_es(platform, release_time_lower_bdr)
     if url_Lst != []:
-        redis_list_name, push_counter = feed_url_into_redis(
-            url_Lst, platform, batch_str=batch_str)
+        redis_list_name, push_counter = feed_url_into_redis(
+            url_Lst, platform,)
         return (redis_list_name, push_counter)
     else:
         return (None, None)
 ...
crawler_sys/site_crawler/crawler_toutiao.py

@@ -343,8 +343,7 @@ class Crawler_toutiao():
     def search_page_old(self, keyword, search_pages_max=12,
                         output_to_es_raw=False,
                         output_to_es_register=False,
-                        es_index=None, doc_type=None,
-                        proxies_num=0):
+                        es_index=None, proxies_num=0):
         headers_search = {
             "accept": "application/json, text/javascript",
             "accept-encoding": "gzip, deflate",

@@ -428,9 +427,10 @@ class Crawler_toutiao():
                             print("method get_web_article_info error %s" % e)
                         print(D0)
                         toutiao_Lst.append(D0)
-                except KeyError:
+                except Exception as e:
                     # It's totally ok to drop the last return data value.
                     # The search api just return something seems related to search
+                    print(e)
                     continue
             else:
                 break

@@ -440,7 +440,7 @@ class Crawler_toutiao():
                         output_to_es_raw=output_to_es_raw,
                         output_to_es_register=output_to_es_register,
                         es_index=es_index,
-                        doc_type=doc_type)
+                        )
                 toutiao_Lst.clear()
         if toutiao_Lst != []:

@@ -449,7 +449,7 @@ class Crawler_toutiao():
                     output_to_es_raw=output_to_es_raw,
                     output_to_es_register=output_to_es_register,
                     es_index=es_index,
-                    doc_type=doc_type)
+                    )
         return toutiao_Lst

@@ -461,7 +461,7 @@ class Crawler_toutiao():
             self.search_page_old(keyword, search_pages_max=search_pages_max,
                                  output_to_es_raw=output_to_es_raw,
                                  output_to_es_register=output_to_es_register,
                                  es_index=es_index,
-                                 doc_type=doc_type, proxies_num=proxies_num)
+                                 proxies_num=proxies_num)

     def find_releaser_id(self, releaserUrl):
         return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

@@ -1799,4 +1799,4 @@ if __name__ == '__main__':
     #                           doc_type='doc',
     #                           releaser_page_num_max=3, proxies_num=1))
     # test.releaser_page(u)
-    test.search_page("热玛吉五代")
+    test.search_page("比基尼线脱毛")
crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py

@@ -432,6 +432,7 @@ class Crawler_weibo():
             video_dic["releaserUrl"] = data["userinfo"].get('url')
             video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
             video_dic["img_list"] = re.findall('img src="(.*?)"', data["content"])
+            video_dic["mid"] = article_id
             return video_dic
         except Exception as e:
             print("single data row formate error %s" % e)

@@ -442,6 +443,7 @@ class Crawler_weibo():
                     output_to_es_register=False,
                     es_index=None, doc_type=None,
                     proxies_num=0):
+        count_false = 0
         headers_search = {
             "Accept": "application/json, text/plain, */*",
             "MWeibo-Pwa": "1",

@@ -463,6 +465,13 @@ class Crawler_weibo():
             if get_page.status_code != 200:
                 continue
             page_dict = get_page.json()
+            while page_dict['data'].get("msg") == '这里还没有内容':
+                get_page = retry_get_url(search_page_url, headers=headers_search)
+                page_dict = get_page.json()
+                count_false += 1
+                if count_false >= 3:
+                    continue
             if page_dict['data'].get("cards")[0].get("card_group"):
                 for one_line in page_dict['data'].get("cards")[0].get("card_group"):
                     try:

@@ -488,7 +497,7 @@ class Crawler_weibo():
                         # D0['play_count'] = play_count
                         # D0['comment_count'] = comment_count
                         # D0['favorite_count'] = favorite_count
-                        D0['article_id'] = article_id
+                        D0['mid'] = article_id
                         # D0['releaser'] = releaser
                         # D0['releaserUrl'] = releaserUrl
                         # D0['release_time'] = release_time

@@ -501,6 +510,7 @@ class Crawler_weibo():
                             D0.update(article_info)
                         except Exception as e:
                             print("method get_web_article_info error %s" % e)
+                            continue
                         print(D0)
                         weibo_Lst.append(D0)
                 except KeyError:

@@ -850,5 +860,5 @@ if __name__ == '__main__':
     # test_search2 = weibo.search_page(keyword, user_name, password)
     # test_repost = weibo.repost_page(weibo_id, user_name, password)
     # user_page = weibo.user_page(user_id, user_name, password)
-    weibo.search_page("迪丽热巴")
+    weibo.search_page("迪丽热巴", output_to_es_register=True, es_index="crawler-data-raw", search_pages_max=1)
     # print(user_page)
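The added while loop re-requests the search page while Weibo returns the '这里还没有内容' ("nothing here yet") placeholder, counting attempts in count_false. A condensed sketch of the same retry pattern, using only names from the hunks above; note that the committed code uses continue on the third failure, which re-evaluates the while condition rather than leaving the loop:

    count_false = 0
    page_dict = get_page.json()
    while page_dict['data'].get("msg") == '这里还没有内容':           # placeholder meaning "nothing here yet"
        get_page = retry_get_url(search_page_url, headers=headers_search)
        page_dict = get_page.json()
        count_false += 1
        if count_false >= 3:
            break                                                    # sketch only; the hunk above uses continue here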
crawler_sys/site_crawler/crawler_zhihu.py

@@ -24,8 +24,8 @@ import requests
 # import execjs
 import hashlib
 import requests
-from bs4 import BeautifulSoup
+import execjs
+# from bs4 import BeautifulSoup
 from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
 from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
 from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count

@@ -48,6 +48,9 @@ class Crawler_zhihu():
         self.video_data['platform'] = self.platform
         # remove fields that crawled data don't have
         pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
+        with open('./zhihu.js', 'r', encoding='utf-8') as f:
+            js = f.read()
+        self.exec_js = execjs.compile(js)
         for popk in pop_key_Lst:
             self.video_data.pop(popk)

@@ -71,7 +74,7 @@ class Crawler_zhihu():
         requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
         tres_json_test = requests_res.text
         res_json = json.loads(re.findall('<script id="js-initialData" type="text/json">(.*?)</script>', tres_json_test)[0])
-        print(res_json)
+        # print(res_json)
         data = res_json["initialState"]
         video_dic = {}
         video_dic["url"] = url

@@ -131,7 +134,7 @@ class Crawler_zhihu():
             pass
         return res_dict

-    def search_article_page(self, keyword, search_pages_max=12,
+    def search_article_page(self, keyword, search_pages_max=10,
                             output_to_es_raw=False,
                             output_to_es_register=False,
                             es_index=None,

@@ -151,7 +154,7 @@ class Crawler_zhihu():
             "x-app-za": "OS=Web",
             "x-requested-with": "fetch",
             "x-zse-83": "3_2.0",
-            "x-zse-86": "1.0_a_Yy6euBS_xfbM28ZhtycHU8gG2XoHtyGTxqHve8rXtY",
+            "x-zse-86": None,
             "referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)),
         }

@@ -163,6 +166,10 @@ class Crawler_zhihu():
         url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(urllib.parse.quote(keyword))
         offset = 0
+        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
+        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
+        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
+        res_list = []
         while offset <= search_pages_max * 20:
             offset += 20
             get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)

@@ -176,7 +183,6 @@ class Crawler_zhihu():
             # print(get_page.cookies.get_dict())
             cookies_dict.update(get_page.cookies.get_dict())
             headers_search.pop("x-zse-86", 0)
-            res_list = []
             if page_dict.get("data"):
                 for one_line in page_dict['data']:
                     try:

@@ -191,7 +197,7 @@ class Crawler_zhihu():
                         D0.update(res_data)
                     except Exception as e:
                         print("method get_web_article_info error %s" % e)
-                    print(D0)
+                    # print(D0)
                     res_list.append(D0)
                 except KeyError:
                     # It's totally ok to drop the last return data value.

@@ -231,137 +237,6 @@ class Crawler_zhihu():
                 es_index=es_index,
                 doc_type=doc_type,
                 proxies_num=proxies_num)

-    def repost_page(self, weibo_id, user_name, password):
-        total_page = 0
-        result_lst = []
-        cookie = self.manipulate_login(user_name=user_name, password=password)
-        # cookie = self.test_cookie(get_cookie)
-        if cookie is not None:
-            current_time = int(time.time() * 1000)
-            repost_url = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id + '&max_id=0&page=1&__rnd=' + str(current_time)
-            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
-            get_page.encoding = 'utf-8'
-            try:
-                page_dic = get_page.json()
-                total_page = page_dic['data']['page']['totalpage']
-                repost_info = page_dic['data']['html']
-                repost_soup = BeautifulSoup(repost_info, 'html.parser')
-                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
-                for line in repost_agg:
-                    try:
-                        one_repost = self.get_repost_info(line)
-                        result_lst.append(one_repost)
-                        print('get one repost')
-                    except:
-                        print('one repost data error')
-                        print(one_repost)
-            except:
-                print("can't get repost data")
-            time.sleep(6)
-        if cookie is not None and total_page != 0:
-            for page_num in range(1, total_page + 1):
-                current_time = int(time.time() * 1000)
-                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id
-                              + '&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
-                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
-                time.sleep(3)
-                get_page.encoding = 'utf-8'
-                try:
-                    page_dic = get_page.json()
-                    total_page = page_dic['data']['page']['totalpage']
-                    repost_info = page_dic['data']['html']
-                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
-                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
-                    for line in repost_agg:
-                        one_repost = self.get_repost_info(line)
-                        result_lst.append(one_repost)
-                        print('get one repost at %s' % page_num)
-                        print(one_repost)
-                except:
-                    print("can't get repost data")
-        if result_lst != []:
-            return result_lst
-        else:
-            print("can't get repost data")
-            return None

-    def user_page(self, user_id, user_name, password):
-        result_lst = []
-        cookie_pool = open('cookie_pool', 'r', encoding='utf-8')
-        for coo in cookie_pool:
-            print(coo)
-            cookie = json.loads(coo)
-        # cookie = self.manipulate_login(user_name=user_name,password=password)
-        # cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
-        if cookie is not None:
-            for page_num in range(1, 3):
-                first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
-                             '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
-                get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
-                get_page.encoding = 'utf-8'
-                page = get_page.text
-                soup = BeautifulSoup(page, 'html.parser')
-                sfa = soup.find_all('script')
-                find_content = ''
-                for line in sfa:
-                    if 'Pl_Official_MyProfileFeed__' in str(line):
-                        find_content = str(line)
-                        find_content = find_content.replace('<script>FM.view(', '').replace(')</script>', '')
-                # print(find_content)
-                find_content_dic = json.loads(find_content)
-                content_for_soup = find_content_dic['html']
-                soup_content = BeautifulSoup(content_for_soup, 'html.parser')
-                weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
-                # time.sleep(15)
-                for line_count, line in enumerate(weibo_lst):
-                    weibo_info = self.get_user_weibo_info(line, cookie)
-                    weibo_info['user_id'] = user_id
-                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
-                    result_lst.append(weibo_info)
-                    print('get data at element page: %s pagebar: %s' % (page_num, line_count))
-                get_parameter = soup.find_all('script', {'type': 'text/javascript'})
-                for line in get_parameter:
-                    if 'pid' in str(line) and 'oid' in str(line):
-                        parameter_str = str(line)
-                        parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
-                        domain = re.findall('\d+', ''.join(re.findall("pid]=\d+", parameter_str)))[0]
-                        special_id = re.findall('\d+', ''.join(re.findall("page_id]=\d+", parameter_str)))[0]
-                current_time = int(time.time() * 1000)
-                for pagebar in [0, 1]:
-                    user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=' + domain
-                                + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar)
-                                + '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id
-                                + '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num)
-                                + '&pre_page=1' '&domain_op=' + domain + '&__rnd=' + str(current_time))
-                    get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
-                    get_page.encoding = 'utf-8'
-                    try:
-                        page_dic = get_page.json()
-                        user_weibo_str = page_dic['data']
-                        user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
-                        user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
-                        # time.sleep(15)
-                        for line in user_weibo_agg:
-                            try:
-                                weibo_info = self.get_user_weibo_info(line, cookie)
-                                weibo_info['user_id'] = user_id
-                                weibo_info['user_url'] = 'https://weibo.com/' + user_id
-                                result_lst.append(weibo_info)
-                                print('get data at ajax page page_num: %s pagebar: %s' % (page_num, pagebar))
-                            except:
-                                print('one weibo_info error')
-                    except:
-                        print('page error at page_num: %s pagebar: %s' % (page_num, pagebar))
-        if result_lst != []:
-            return result_lst
-        else:
-            print("can't get repost data")
-            return None

     @staticmethod
     def get_single_page(mid):

@@ -530,40 +405,11 @@ class Crawler_zhihu():
 if __name__ == '__main__':
     zhihu = Crawler_zhihu()
     import os
     # import PyV8
     import execjs
     os.environ["EXECJS_RUNTIME"] = 'Node'
     # print(execjs.get().name )
     # os.environ["EXECJS_RUNTIME"] = 'Node'
     # print(execjs.get().name )
     # zhihu.get_serach_page_cookies("热玛吉")
     # zhihu.search_page("比基尼线脱毛")
+    zhihu.search_page("双眼皮", search_pages_max=1, output_to_es_register=True)
     # zhihu.get_single_answer_page("325099876","1209953121")
     # print(user_page)
-    if True:
-        # with PyV8.JSContext() as ctx:
-        url = "api/v4/search_v3?t=general&q=%E7%83%AD%E7%8E%9B%E5%90%89&correction=1&offset=20&limit=20&lc_idx=25&show_all_topics=0&search_hash_id=12d60c255d0be17b9830355a0d04de5b&vertical_info=0%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1"
-        referer = "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89"
-        f = "+".join(["3_2.0", url, referer, '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"'])
-        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
-        # with open('./zhihu_js.js', 'r') as f:
-        #     # print(f.read())
-        #     ctx.eval(f.read())
-        #     encrypt_str = ctx.locals.add('b',fmd5)
-        with open('./zhihu.js', 'r', encoding='utf-8') as f:
-            js = f.read()
-        encrypt_str = execjs.compile(js).call('b', fmd5)
-        headers = {
-            "referer": referer,
-            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
-            "cookie": 'd_c0="AACSLMY7lBGPTo9fXdy2pmiGQ4ZVVUcqzC4=|1594785557";',
-            "x-api-version": "3.0.91",
-            "x-zse-83": "3_2.0",
-            "x-zse-86": "1.0_%s" % encrypt_str,
-        }
-        print(headers)
-        r = requests.get("https://www.zhihu.com" + url, headers=headers)
-        print(r.text)
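The zhihu search request is now signed: the x-zse-86 header is "1.0_" plus the value returned by function b in zhihu.js, called on the md5 of "3_2.0", the request path, the referer, and the d_c0 cookie joined with "+". A minimal sketch of that signing flow under the same assumptions as the hunks above (zhihu.js sits next to the script, execjs has a Node runtime available, and the path/referer/d_c0 values below are placeholders):

    import hashlib
    import execjs

    path = "/api/v4/search_v3?t=general&q=..."                   # url with the https://www.zhihu.com prefix stripped
    referer = "https://www.zhihu.com/search?type=content&q=..."
    d_c0 = '"<d_c0 cookie value>"'                               # taken from the zhihu cookies

    f = "+".join(["3_2.0", path, referer, d_c0])
    fmd5 = hashlib.new('md5', f.encode()).hexdigest()
    with open('./zhihu.js', 'r', encoding='utf-8') as fh:
        exec_js = execjs.compile(fh.read())
    x_zse_86 = "1.0_" + exec_js.call("b", fmd5)                  # sent as the x-zse-86 request header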
crawler_sys/utils/output_results.py

@@ -15,7 +15,7 @@ import pymysql
 import requests
 from elasticsearch.exceptions import TransportError
 from crawler_sys.framework.redis_interact import feed_url_into_redis
-from crawler_sys.framework.redis_interact import rds
+from crawler_sys.framework.redis_interact import redis_path
 from crawler_sys.framework.es_ccr_index_defination import es_framework as es_site_crawler
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register

@@ -296,7 +296,9 @@ def output_result(result_Lst, platform,
                   push_to_redis=False,
                   output_to_test_mysql=False,
                   output_to_mimas_mysql=False,
-                  es_index=index_site_crawler, **kwargs):
+                  es_index=index_site_crawler,
+                  rds_path="on_line",
+                  **kwargs):
     # write data into es crawler-raw index
     if output_to_es_raw:
         bulk_write_into_es(result_Lst, es_index)

@@ -308,14 +310,16 @@ def output_result(result_Lst, platform,
                            index=es_index,
                            construct_id=True,
                            platform=platform,
                            )
     if output_to_test_mysql:
         pass

     # feed url into redis
     if push_to_redis:
+        rds = redis_path(rds_path)
         feed_url_into_redis(
-            result_Lst, expire=kwargs.get("expire"))
+            result_Lst, expire=kwargs.get("expire"), rds=rds)

     # output into file according to passed in parameters
     if output_to_file is True and filepath is not None:

@@ -451,7 +455,7 @@ def load_json_file_into_dict_Lst(filename, path):
     return data_Lst

-def crawl_a_url_and_update_redis(url, platform, urlhash, processID=-1):
+def crawl_a_url_and_update_redis(url, platform, urlhash, rds, processID=-1,):
     # find crawler
     # perform crawling, get the data
     # write es or output to files

@@ -469,7 +473,7 @@ def crawl_batch_task(url_Lst):
                            url_info['urlhash'])

-def scan_redis_to_crawl():
+def scan_redis_to_crawl(rds):
     batch_size = 1000
     cur = 0
     task_batchs = []

@@ -491,13 +495,13 @@ def scan_redis_to_crawl():
                                 'urlhash': urlhash})
             if len(task_batchs) == batch_size:
                 # multi-processing here
-                crawl_batch_task(task_batchs)
+                crawl_batch_task(rds, task_batchs)
                 task_batchs.clear()
         if cur == 0:
             break

-def remove_fetched_url_from_redis(remove_interval=10):
+def remove_fetched_url_from_redis(rds, remove_interval=10):
     time.sleep(remove_interval)
     cur = 0
     delete_counter = 0
     ...
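output_result no longer imports a module-level Redis connection; it now takes rds_path, resolves it through redis_path(), and hands the connection to feed_url_into_redis. A hypothetical call site under those assumptions:

    output_result(result_Lst, platform,
                  push_to_redis=True,
                  rds_path="on_line",        # resolved internally via redis_path(rds_path)
                  expire=3600)               # forwarded to feed_url_into_redis through **kwargs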
requirements.txt

-git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
+# git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
 lxml==4.5.1
 requests==2.23.0
 tqdm==4.46.1
 ...
write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py

@@ -5,7 +5,7 @@
 # @author : litao

-def calculate_douban_id(data_dic):
+def calculate_zhihu_id(data_dic):
     if "answer" in data_dic["url"]:
         return data_dic["_id"].replace("zhihu_", "")
     else:
     ...
write_data_into_es/func_cal_doc_id.py

@@ -17,6 +17,7 @@ from write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id import ca
 from write_data_into_es.calculate_doc_id.func_calculate_douyin_id import calculate_douyin_id
 from write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id import calculate_haokan_id
 from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id
+from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_zhihu_id
 from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id

@@ -32,7 +33,7 @@ def vid_cal_func(platform):
         "haokan": calculate_haokan_id,
         "weibo": calculate_weibo_id,
         "douban": calculate_douban_id,
-        "zhihu": calculate_douban_id,
+        "zhihu": calculate_zhihu_id,
     }

 def general_vid_cal_func(url):
 ...
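With the mapping fixed, the "zhihu" platform dispatches to calculate_zhihu_id instead of calculate_douban_id. A one-line usage sketch, assuming vid_cal_func returns the per-platform callable from the table above:

    doc_id = vid_cal_func("zhihu")({"url": "https://www.zhihu.com/question/1/answer/2",
                                    "_id": "zhihu_123"})    # answer URLs strip the "zhihu_" prefix -> "123"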