Commit fd388d44 authored by litaolemo

update

parent 5efa3f31
@@ -67,19 +67,3 @@ def scan_index(index, doc_type, search_body):
     return (total_hit, scan_resp)
 
 
-def construct_id_for_url_register(platform, url):
-    if platform == 'new_tudou':
-        vid_bare = calculate_newTudou_video_id(url)
-        vid = 'new_tudou_%s' % vid_bare
-    elif platform == 'toutiao':
-        vid_bare = calculate_toutiao_video_id(url)
-        vid = 'toutiao_%s' % vid_bare
-    elif platform == '腾讯新闻':
-        c_time = str(int(time.time()))
-        vid = "tencent_news_%s_%s" % (url, c_time)
-    elif platform == '网易新闻':
-        vid = "163_news_%s" % calculate_wangyi_news_id(url)
-    else:
-        vid_bare = url
-        vid = vid_bare
-    return vid
@@ -9,13 +9,14 @@ import random
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
 
-#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
+# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
 es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
 
 index_target_releaser = 'target_releasers'
 doc_type_target_releaser = 'doc'
 
+
 def bulk_write_target_releasers(dict_Lst,
                                 index=index_target_releaser,
                                 doc_type=doc_type_target_releaser):
@@ -74,7 +75,7 @@ def get_releaserUrls_from_es(platform,
                 releaserUrl_Lst.append((releaserUrl,releaser))
             except:
-                print('error in :' ,line)
+                print('error in :', line)
                 continue
     else:
         print('Got zero hits.')
@@ -0,0 +1,5 @@
+# -*- coding:UTF-8 -*-
+# @Time : 2020/7/24 10:51
+# @File : __init__.py
+# @email : litao@igengmei.com
+# @author : litao
\ No newline at end of file
@@ -8,8 +8,8 @@
 Two signing methods are currently supported, "simple" and "hmacsha1"; "simple" is used by default.
 Every method accepts the keyword argument sign_type to change the signing method.
 """
-import redis,random
-import kdl,requests
+import redis, random
+import kdl, requests
 
 # from redis.sentinel import Sentinel
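The docstring above notes that every client method accepts a sign_type keyword to switch between "simple" and "hmacsha1" signing. A minimal sketch of that switch, assuming the kdl SDK's usual Auth/Client pairing (the credentials here are placeholders, not the ones used in this file):

import kdl

# Placeholder credentials, not real order data.
auth = kdl.Auth("my_order_id", "my_api_key")
client = kdl.Client(auth)

# Default "simple" signing.
ips = client.get_dps(1, sign_type='simple', format='json')

# The same call signed with HMAC-SHA1 instead.
ips = client.get_dps(1, sign_type='hmacsha1', format='json')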
@@ -25,6 +25,7 @@ import kdl,requests
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
 rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
 
+
 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()
@@ -32,14 +33,15 @@ def get_proxy_from_redis():
         password = "i9mmu0a3"
         proxies = {
             "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
             "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
         }
         return proxies
     except Exception as e:
         print(e)
         return None
 
+
 def func_get_proxy_to_redis():
     # chance = random.random()
     auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")
@@ -68,14 +70,13 @@ def func_get_proxy_to_redis():
     # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东')
     # print("dps proxy: ", ips)
 
     # Check private-proxy validity: returns a dict of ip: true/false
-    #ips = client.get_dps(1, sign_type='simple', format='json')
+    # ips = client.get_dps(1, sign_type='simple', format='json')
     # valids = client.check_dps_valid(ips)
     # print("valids: ", valids)
 
-
     # Get remaining time of private proxies: returns a dict of ip: seconds left
-    ips = client.get_dps(1, format='json',dedup=1)
+    ips = client.get_dps(1, format='json', dedup=1)
     seconds = client.get_dps_valid_time(ips)
     # print("seconds: ", seconds)
     for key in seconds:
@@ -84,10 +85,12 @@ def func_get_proxy_to_redis():
 
     # Get the metered IP balance (metered private proxies only)
     # balance = client.get_ip_balance(sign_type='hmacsha1')
     # print("balance: ", balance)
+
+
 def proxy_test(proxies):
     page_url = "http://dev.kdlapi.com/testproxy/"
     headers = {
         "Accept-Encoding": "Gzip",  # use gzip compression so transfers are faster
     }
     res = requests.get(url=page_url, proxies=proxies, headers=headers)
@@ -95,6 +98,7 @@ def proxy_test(proxies):
     if res.status_code == 200:
         print(res.content.decode('utf-8'))  # print the page body
 
+
 def get_proxy_dic(max_proxies=None):
     if not max_proxies:
         max_proxies = 8
@@ -111,6 +115,7 @@ def get_proxy_dic(max_proxies=None):
     else:
         return get_proxy_from_redis()
 
+
 def get_proxy(proxies_num=None):
     if proxies_num:
         proxies = get_proxy_dic(max_proxies=proxies_num)
@@ -119,8 +124,9 @@ def get_proxy(proxies_num=None):
     else:
         return None
 
+
 if __name__ == "__main__":
     proxy_pool_dic = get_proxy(11)
     print(proxy_pool_dic)
     proxy_test(proxy_pool_dic)
     print(get_proxy_from_redis())
\ No newline at end of file
@@ -227,17 +227,17 @@ class Crawler_weibo():
                     pass
                 data_lis.append(res)
 
-                # if len(data_lis) >= 100:
-                #     output_result(result_Lst=data_lis,
-                #                   platform=self.platform,
-                #                   output_to_file=output_to_file,
-                #                   filepath=filepath,
-                #                   push_to_redis=push_to_redis,
-                #                   output_to_es_register=output_to_es_register,
-                #                   output_to_es_raw=output_to_es_raw,
-                #                   es_index=es_index,
-                #                   )
-                #     data_lis.clear()
+                if len(data_lis) >= 100:
+                    output_result(result_Lst=data_lis,
+                                  platform=self.platform,
+                                  output_to_file=output_to_file,
+                                  filepath=filepath,
+                                  push_to_redis=push_to_redis,
+                                  output_to_es_register=output_to_es_register,
+                                  output_to_es_raw=output_to_es_raw,
+                                  es_index=es_index,
+                                  )
+                    data_lis.clear()
             else:
                 count_false += 1
                 if count_false > 10:
@@ -297,7 +297,7 @@ if __name__ == '__main__':
     # for r in res:
     #     print(r)
     for u in url_list:
-        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
+        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                    es_index='crawler-data-raw',
                                    doc_type='doc', releaser_page_num_max=4000)
     # test.get_single_page(4524055937468233)
\ No newline at end of file
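The block re-enabled above drains data_lis every 100 results instead of accumulating the whole run in memory. A standalone sketch of that buffer-and-flush pattern, with a hypothetical flush_batch standing in for output_result:

BATCH_SIZE = 100  # same threshold as the re-enabled block

def flush_batch(batch):
    # Hypothetical stand-in for output_result(); consumes the batch synchronously.
    print("flushing %d items" % len(batch))

data_lis = []
for res in range(250):  # placeholder result stream
    data_lis.append(res)
    if len(data_lis) >= BATCH_SIZE:
        flush_batch(data_lis)
        data_lis.clear()  # safe only because flush_batch consumed the list synchronously

if data_lis:
    flush_batch(data_lis)  # flush whatever is left at the end of the run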
@@ -17,7 +17,7 @@ from crawler_sys.framework.es_ccr_index_defination import es_framework as es_sit
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
 from crawler_sys.framework.es_ccr_index_defination import fields_url_register
-from crawler_sys.framework.es_crawler import construct_id_for_url_register
+from write_data_into_es.func_cal_doc_id import cal_doc_id
 from crawler_sys.utils.write_into_file import write_str_into_file
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
@@ -82,11 +82,11 @@ def output_result(result_Lst, platform,
 
     # write data into es crawler-url-register index
     if output_to_es_register:
-        data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
-        bulk_write_into_es(data_Lst_reg,
-                           index=index_url_register,
+        # data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
+        bulk_write_into_es(result_Lst,
+                           index=es_index,
                            construct_id=True,
-                           platform=platform
+                           platform=platform,
                            )
 
     # feed url into redis
@@ -182,7 +182,7 @@ def bulk_write_into_es(dict_Lst,
     for line in dict_Lst:
         write_counter += 1
         if construct_id and platform is not None:
-            doc_id = construct_id_for_url_register(platform, line['url'])
+            doc_id = cal_doc_id(platform, url=line["url"], doc_id_type='all-time-url', data_dict=line)
             action_str = ('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                           % (index, doc_id))
         else:
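With the import swapped above, register IDs now come from cal_doc_id rather than the removed construct_id_for_url_register. A hedged sketch of the new call as it appears in the hunk, assuming the repo is on PYTHONPATH and using an illustrative record:

from write_data_into_es.func_cal_doc_id import cal_doc_id

# Illustrative record; real records come from dict_Lst.
line = {"url": "https://www.toutiao.com/a1234567890/", "platform": "toutiao"}
doc_id = cal_doc_id("toutiao",
                    url=line["url"],
                    doc_id_type='all-time-url',  # ID scheme used by the register path above
                    data_dict=line)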
@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
 
 
 def weibo(releaserUrl,**kwargs):
     try:
-        containerid = ""
         if "/u/" in releaserUrl:
-            releaser_id = containerid = re.findall("/u/(\d+)",releaserUrl)[0]
+            releaser_id = re.findall("/u/(\d+)",releaserUrl)[0]
         elif "/p/" in releaserUrl:
-            releaser_id = containerid = re.findall("/p/(\d+)",releaserUrl)[0]
+            releaser_id = re.findall("/p/(\d+)",releaserUrl)[0]
             if len(releaser_id) >= 15:
                 releaser_id = releaser_id[6:]
         elif "/" in releaserUrl:
-            releaser_id = containerid = re.findall("(\d+)",releaserUrl)[0]
+            releaser_id = re.findall("(\d+)",releaserUrl)[0]
         else:
             try:
-                releaserid = int(releaserUrl)
+                releaser_id = int(releaserUrl)
             except:
                 return None
-        return releaser_id,containerid
+        return releaser_id
     except:
         return None
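Since weibo() now returns a bare releaser_id instead of a (releaser_id, containerid) tuple, callers that unpacked two values need a matching update. A hedged sketch of an adjusted call site (the URL is illustrative):

# Before: releaser_id, containerid = weibo(releaserUrl)
releaser_id = weibo("https://weibo.com/u/1234567890")
if releaser_id is None:
    # weibo() returns None when no ID can be parsed from the URL
    raise ValueError("unrecognized weibo releaser URL")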