Chengyang Zhong / crawler · Commits

Commit fd388d44
authored Jul 24, 2020 by litaolemo
update
parent 5efa3f31
Showing 8 changed files with 49 additions and 52 deletions (+49 −52)
crawler_sys/framework/es_crawler.py                        +0  −16
crawler_sys/framework/es_target_releasers.py               +3  −2
crawler_sys/proxy_pool/__init__.py                         +6  −0
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py    +15 −10
crawler_sys/site_crawler_test/crawler_weibo.py             +13 −12
crawler_sys/utils/output_results.py                        +6  −6
write_data_into_es/func_get_releaser_id.py                 +6  −6
write_data_into_es/target_releaser_add.py                  +0  −0
crawler_sys/framework/es_crawler.py

@@ -67,19 +67,3 @@ def scan_index(index, doc_type, search_body):
     return (total_hit, scan_resp)
 
-
-def construct_id_for_url_register(platform, url):
-    if platform == 'new_tudou':
-        vid_bare = calculate_newTudou_video_id(url)
-        vid = 'new_tudou_%s' % vid_bare
-    elif platform == 'toutiao':
-        vid_bare = calculate_toutiao_video_id(url)
-        vid = 'toutiao_%s' % vid_bare
-    elif platform == '腾讯新闻':
-        c_time = str(int(time.time()))
-        vid = "tencent_news_%s_%s" % (url, c_time)
-    elif platform == '网易新闻':
-        vid = "163_news_%s" % calculate_wangyi_news_id(url)
-    else:
-        vid_bare = url
-        vid = vid_bare
-    return vid
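
Note: this helper is dropped in favor of cal_doc_id (see the crawler_sys/utils/output_results.py diff below). One motivation visible in the code itself: the '腾讯新闻' branch embedded the current timestamp in the ID, so the same URL never mapped to a stable doc ID. A standalone sketch of that behavior, with a hypothetical function name and URL:

# Standalone sketch (not repo code) of the removed helper's '腾讯新闻' branch:
# the doc ID embeds the current epoch second, so registering the same URL
# twice produces two different IDs.
import time

def old_style_tencent_news_id(url):          # hypothetical name
    c_time = str(int(time.time()))
    return "tencent_news_%s_%s" % (url, c_time)

url = "https://news.qq.com/some-article"     # hypothetical URL
first = old_style_tencent_news_id(url)
time.sleep(1)
second = old_style_tencent_news_id(url)
assert first != second                       # IDs drift with time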
crawler_sys/framework/es_target_releasers.py

@@ -9,13 +9,14 @@ import random
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
-#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
+# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
+
 es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
 index_target_releaser = 'target_releasers'
 doc_type_target_releaser = 'doc'
 
 def bulk_write_target_releasers(dict_Lst,
                                 index=index_target_releaser,
                                 doc_type=doc_type_target_releaser):

@@ -74,7 +75,7 @@ def get_releaserUrls_from_es(platform,
                 releaserUrl_Lst.append((releaserUrl, releaser))
             except:
-                print('error in :', line)
+                print('error in :', line)
                 continue
     else:
         print('Got zero hits.')
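
As the second hunk shows, get_releaserUrls_from_es accumulates (releaserUrl, releaser) tuples, so callers iterate pairs. A hedged usage sketch; the function's parameters beyond platform are not visible in this hunk, so calling it with only platform assumes the rest have defaults:

# Sketch of consuming the (releaserUrl, releaser) tuples built above;
# assumes get_releaserUrls_from_es's remaining parameters have defaults.
from crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es

pairs = get_releaserUrls_from_es(platform='weibo')
for releaserUrl, releaser in pairs:
    print(releaser, releaserUrl)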
crawler_sys/proxy_pool/__init__.py (new file, mode 100644)

+# -*- coding:UTF-8 -*-
+# @Time : 2020/7/24 10:51
+# @File : __init__.py
+# @email : litao@igengmei.com
+# @author : litao
\ No newline at end of file
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py

@@ -8,8 +8,8 @@
 Currently supported auth methods are "simple" and "hmacsha1"; "simple" is the default.
 Every method accepts the keyword argument sign_type to change the auth method.
 """
-import redis,random
-import kdl,requests
+import redis, random
+import kdl, requests
 # from redis.sentinel import Sentinel

@@ -25,6 +25,7 @@ import kdl,requests
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
 rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
+
 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()

@@ -32,14 +33,15 @@ def get_proxy_from_redis():
         password = "i9mmu0a3"
-        proxies = {"http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
-                   "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
+        proxies = {
+            "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
+            "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
         }
         return proxies
     except Exception as e:
         print(e)
         return None
+
 def func_get_proxy_to_redis():
     # chance = random.random()
     auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")

@@ -68,14 +70,13 @@ def func_get_proxy_to_redis():
     # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东')
     # print("dps proxy: ", ips)
     # Check private proxies' validity: returns a dict of ip: true/false
-    #ips = client.get_dps(1, sign_type='simple', format='json')
+    # ips = client.get_dps(1, sign_type='simple', format='json')
     # valids = client.check_dps_valid(ips)
     # print("valids: ", valids)
     # Get private proxies' remaining time: returns a dict of ip: seconds
-    ips = client.get_dps(1, format='json', dedup=1)
+    ips = client.get_dps(1, format='json', dedup=1)
     seconds = client.get_dps_valid_time(ips)
     # print("seconds: ", seconds)
     for key in seconds:

@@ -84,10 +85,12 @@ def func_get_proxy_to_redis():
     # Get metered IP balance (metered private proxies only)
     # balance = client.get_ip_balance(sign_type='hmacsha1')
     # print("balance: ", balance)
+
 def proxy_test(proxies):
     page_url = "http://dev.kdlapi.com/testproxy/"
     headers = {
-        "Accept-Encoding": "Gzip",  # gzip-compress the transfer for faster access
+        "Accept-Encoding": "Gzip",  # gzip-compress the transfer for faster access
     }
     res = requests.get(url=page_url, proxies=proxies, headers=headers)

@@ -95,6 +98,7 @@ def proxy_test(proxies):
     if res.status_code == 200:
         print(res.content.decode('utf-8'))  # print the page body
+
 def get_proxy_dic(max_proxies=None):
     if not max_proxies:
         max_proxies = 8

@@ -111,6 +115,7 @@ def get_proxy_dic(max_proxies=None):
     else:
         return get_proxy_from_redis()
+
 def get_proxy(proxies_num=None):
     if proxies_num:
         proxies = get_proxy_dic(max_proxies=proxies_num)

@@ -119,8 +124,9 @@ def get_proxy(proxies_num=None):
     else:
         return None
+
 if __name__ == "__main__":
     proxy_pool_dic = get_proxy(11)
     print(proxy_pool_dic)
     proxy_test(proxy_pool_dic)
-    print(get_proxy_from_redis())
\ No newline at end of file
+    print(get_proxy_from_redis())
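
Taken together: get_proxy(proxies_num) returns a requests-style proxies mapping (or None), and proxy_test() issues a request through it. A minimal end-to-end sketch, reusing the test URL from proxy_test(); the timeout value is an assumption:

# Sketch: fetch a page through a proxy obtained from the pool.
# get_proxy() may return None, so guard before passing it to requests.
import requests
from crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy

proxies = get_proxy(1)   # e.g. {'http': 'http://user:pwd@ip/', 'https': ...}
if proxies is not None:
    resp = requests.get("http://dev.kdlapi.com/testproxy/",
                        proxies=proxies, timeout=10)
    print(resp.status_code)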
crawler_sys/site_crawler_test/crawler_weibo.py

@@ -227,17 +227,17 @@ class Crawler_weibo():
                     pass
                 data_lis.append(res)
-                # if len(data_lis) >= 100:
-                #     output_result(result_Lst=data_lis,
-                #                   platform=self.platform,
-                #                   output_to_file=output_to_file,
-                #                   filepath=filepath,
-                #                   push_to_redis=push_to_redis,
-                #                   output_to_es_register=output_to_es_register,
-                #                   output_to_es_raw=output_to_es_raw,
-                #                   es_index=es_index,
-                #                   )
-                #     data_lis.clear()
+                if len(data_lis) >= 100:
+                    output_result(result_Lst=data_lis,
+                                  platform=self.platform,
+                                  output_to_file=output_to_file,
+                                  filepath=filepath,
+                                  push_to_redis=push_to_redis,
+                                  output_to_es_register=output_to_es_register,
+                                  output_to_es_raw=output_to_es_raw,
+                                  es_index=es_index,
+                                  )
+                    data_lis.clear()
             else:
                 count_false += 1
                 if count_false > 10:

@@ -297,7 +297,7 @@ if __name__ == '__main__':
     # for r in res:
    #     print(r)
     for u in url_list:
-        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
+        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                    es_index='crawler-data-raw', doc_type='doc', releaser_page_num_max=4000)
     # test.get_single_page(4524055937468233)
\ No newline at end of file
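
The re-enabled block in the first hunk is a standard batch-and-flush pattern: accumulate parsed posts and flush every 100 to cap memory. A minimal standalone sketch of the same idea; fetch_items and sink are hypothetical stand-ins for the crawler loop and output_result():

# Minimal batch-and-flush sketch; `fetch_items` and `sink` are hypothetical.
BATCH_SIZE = 100

def flush(batch, sink):
    if batch:
        sink(list(batch))  # e.g. bulk-write to Elasticsearch
        batch.clear()      # reuse the same list, as crawler_weibo.py does

def crawl(fetch_items, sink):
    batch = []
    for item in fetch_items():
        batch.append(item)
        if len(batch) >= BATCH_SIZE:
            flush(batch, sink)
    flush(batch, sink)     # trailing flush for a final partial batch
                           # (not visible in the hunk above)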
crawler_sys/utils/output_results.py

@@ -17,7 +17,7 @@ from crawler_sys.framework.es_ccr_index_defination import es_framework as es_sit
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
 from crawler_sys.framework.es_ccr_index_defination import fields_url_register
-from crawler_sys.framework.es_crawler import construct_id_for_url_register
+from write_data_into_es.func_cal_doc_id import cal_doc_id
 from crawler_sys.utils.write_into_file import write_str_into_file
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy

@@ -82,11 +82,11 @@ def output_result(result_Lst, platform,
     # write data into es crawler-url-register index
     if output_to_es_register:
-        data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
-        bulk_write_into_es(data_Lst_reg,
-                           index=index_url_register,
-                           platform=platform
+        # data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
+        bulk_write_into_es(result_Lst,
+                           index=es_index,
+                           construct_id=True,
+                           platform=platform,
                            )
     # feed url into redis

@@ -182,7 +182,7 @@ def bulk_write_into_es(dict_Lst,
     for line in dict_Lst:
         write_counter += 1
         if construct_id and platform is not None:
-            doc_id = construct_id_for_url_register(platform, line['url'])
+            doc_id = cal_doc_id(platform, url=line["url"], doc_id_type='all-time-url', data_dict=line)
             action_str = ('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                           % (index, doc_id))
         else:
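
With this change, the doc ID comes from cal_doc_id, a pure function of the platform, URL, and record, so re-registering the same URL is idempotent. A hedged sketch of how bulk_write_into_es assembles the newline-delimited bulk body around it; cal_doc_id's call signature is taken from the diff, the rest is simplified illustration:

# Simplified sketch of the bulk-body assembly in bulk_write_into_es().
import json
from write_data_into_es.func_cal_doc_id import cal_doc_id

def build_bulk_body(dict_Lst, index, platform):
    lines = []
    for line in dict_Lst:
        doc_id = cal_doc_id(platform, url=line["url"],
                            doc_id_type='all-time-url', data_dict=line)
        # action metadata line, then the document source line
        lines.append('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                     % (index, doc_id))
        lines.append(json.dumps(line))
    return '\n'.join(lines) + '\n'   # the ES bulk API requires a trailing newline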
write_data_into_es/func_get_releaser_id.py

@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
 def weibo(releaserUrl, **kwargs):
     try:
         containerid = ""
         if "/u/" in releaserUrl:
-            releaser_id = containerid = re.findall("/u/(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("/u/(\d+)", releaserUrl)[0]
         elif "/p/" in releaserUrl:
-            releaser_id = containerid = re.findall("/p/(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("/p/(\d+)", releaserUrl)[0]
             if len(releaser_id) >= 15:
                 releaser_id = releaser_id[6:]
         elif "/" in releaserUrl:
-            releaser_id = containerid = re.findall("(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("(\d+)", releaserUrl)[0]
         else:
             try:
-                releaserid = int(releaserUrl)
+                releaser_id = int(releaserUrl)
             except:
                 return None
-        return releaser_id, containerid
+        return releaser_id
     except:
         return None
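
After this change weibo() returns just the releaser ID instead of an (id, containerid) tuple, and the bare-integer branch now assigns releaser_id rather than the unused releaserid. Callers that unpacked the tuple must be updated. Illustrative calls against the updated function; the example URLs are made up but follow the three patterns the regexes match:

# Illustrative calls; example URLs are hypothetical.
from write_data_into_es.func_get_releaser_id import weibo

print(weibo("https://weibo.com/u/1669879400"))       # '1669879400' via the /u/ branch
print(weibo("https://weibo.com/p/1005051669879400")) # '1669879400' after the [6:] trim
print(weibo("1669879400"))                           # else branch: int(releaserUrl)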
write_data_into_es/target_releaser_add.py

(diff collapsed on the original page; contents not shown)