Commit fd388d44 authored by litaolemo

update

parent 5efa3f31
......@@ -67,19 +67,3 @@ def scan_index(index, doc_type, search_body):
    return (total_hit, scan_resp)


def construct_id_for_url_register(platform, url):
    if platform == 'new_tudou':
        vid_bare = calculate_newTudou_video_id(url)
        vid = 'new_tudou_%s' % vid_bare
    elif platform == 'toutiao':
        vid_bare = calculate_toutiao_video_id(url)
        vid = 'toutiao_%s' % vid_bare
    elif platform == '腾讯新闻':
        c_time = str(int(time.time()))
        vid = "tencent_news_%s_%s" % (url, c_time)
    elif platform == '网易新闻':
        vid = "163_news_%s" % calculate_wangyi_news_id(url)
    else:
        vid_bare = url
        vid = vid_bare
    return vid
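
The helper above is removed by this commit in favour of cal_doc_id from write_data_into_es.func_cal_doc_id (see the framework diff further down). It mapped a platform/URL pair to a document ID; note that the 腾讯新闻 branch salts the ID with the current timestamp, so the same URL registers under a fresh ID on every call. A self-contained sketch of the scheme, with the calculate_* helper stubbed since its implementation is not part of this diff:

import time

def calculate_newTudou_video_id(url):
    # hypothetical stub; the real helper lives elsewhere in the repo
    return url.rsplit("/", 1)[-1]

def construct_id(platform, url):
    if platform == 'new_tudou':
        return 'new_tudou_%s' % calculate_newTudou_video_id(url)
    elif platform == '腾讯新闻':
        # timestamp-salted: the same URL gets a different ID each call
        return "tencent_news_%s_%s" % (url, int(time.time()))
    else:
        return url  # fallback: the raw URL doubles as the ID

print(construct_id('new_tudou', 'http://video.tudou.com/v/abc123'))
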
......@@ -9,13 +9,14 @@ import random
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)

index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'


def bulk_write_target_releasers(dict_Lst,
                                index=index_target_releaser,
                                doc_type=doc_type_target_releaser):
......@@ -74,7 +75,7 @@ def get_releaserUrls_from_es(platform,
                releaserUrl_Lst.append((releaserUrl,releaser))
            except:
                print('error in :' ,line)
                print('error in :', line)
                continue
    else:
        print('Got zero hits.')
......
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 10:51
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
......@@ -8,8 +8,8 @@
Two auth methods are currently supported, "simple" and "hmacsha1"; "simple" is used by default.
All methods accept the keyword argument sign_type to switch the auth method.
"""
import redis,random
import kdl,requests
import redis, random
import kdl, requests
# from redis.sentinel import Sentinel
......@@ -25,6 +25,7 @@ import kdl,requests
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
def get_proxy_from_redis():
    try:
        one_proxy = rds.randomkey()
......@@ -32,14 +33,15 @@ def get_proxy_from_redis():
        password = "i9mmu0a3"
        proxies = {
            "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
            "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
        }
        return proxies
    except Exception as e:
        print(e)
        return None


def func_get_proxy_to_redis():
    # chance = random.random()
    auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")
......@@ -68,14 +70,13 @@ def func_get_proxy_to_redis():
    # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东')
    # print("dps proxy: ", ips)
    # check private-proxy validity: returns a dict of {ip: true/false}
    #ips = client.get_dps(1, sign_type='simple', format='json')
    # ips = client.get_dps(1, sign_type='simple', format='json')
    # valids = client.check_dps_valid(ips)
    # print("valids: ", valids)
    # get the remaining lifetime of private proxies: returns a dict of {ip: seconds}
    ips = client.get_dps(1, format='json',dedup=1)
    ips = client.get_dps(1, format='json', dedup=1)
    seconds = client.get_dps_valid_time(ips)
    # print("seconds: ", seconds)
    for key in seconds:
......@@ -84,10 +85,12 @@ def func_get_proxy_to_redis():
    # get the counted-plan IP balance (counted private-proxy plan only)
    # balance = client.get_ip_balance(sign_type='hmacsha1')
    # print("balance: ", balance)


def proxy_test(proxies):
    page_url = "http://dev.kdlapi.com/testproxy/"
    headers = {
        "Accept-Encoding": "Gzip",  # gzip-compress the response so the fetch is faster
    }
    res = requests.get(url=page_url, proxies=proxies, headers=headers)
......@@ -95,6 +98,7 @@ def proxy_test(proxies):
    if res.status_code == 200:
        print(res.content.decode('utf-8'))  # print the page content


def get_proxy_dic(max_proxies=None):
    if not max_proxies:
        max_proxies = 8
......@@ -111,6 +115,7 @@ def get_proxy_dic(max_proxies=None):
    else:
        return get_proxy_from_redis()


def get_proxy(proxies_num=None):
    if proxies_num:
        proxies = get_proxy_dic(max_proxies=proxies_num)
......@@ -119,8 +124,9 @@ def get_proxy(proxies_num=None):
    else:
        return None


if __name__ == "__main__":
    proxy_pool_dic = get_proxy(11)
    print(proxy_pool_dic)
    proxy_test(proxy_pool_dic)
    print(get_proxy_from_redis())
\ No newline at end of file
    print(get_proxy_from_redis())
......@@ -227,17 +227,17 @@ class Crawler_weibo():
                    pass
                data_lis.append(res)
                # if len(data_lis) >= 100:
                #     output_result(result_Lst=data_lis,
                #                   platform=self.platform,
                #                   output_to_file=output_to_file,
                #                   filepath=filepath,
                #                   push_to_redis=push_to_redis,
                #                   output_to_es_register=output_to_es_register,
                #                   output_to_es_raw=output_to_es_raw,
                #                   es_index=es_index,
                #                   )
                #     data_lis.clear()
                if len(data_lis) >= 100:
                    output_result(result_Lst=data_lis,
                                  platform=self.platform,
                                  output_to_file=output_to_file,
                                  filepath=filepath,
                                  push_to_redis=push_to_redis,
                                  output_to_es_register=output_to_es_register,
                                  output_to_es_raw=output_to_es_raw,
                                  es_index=es_index,
                                  )
                    data_lis.clear()
            else:
                count_false += 1
                if count_false > 10:
......@@ -297,7 +297,7 @@ if __name__ == '__main__':
    # for r in res:
    #     print(r)
    for u in url_list:
        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                   es_index='crawler-data-raw',
                                   doc_type='doc', releaser_page_num_max=4000)
    # test.get_single_page(4524055937468233)
\ No newline at end of file
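
The re-enabled block in the previous hunk flushes accumulated results through output_result() once every 100 items instead of holding everything in memory. A self-contained sketch of that accumulate-and-flush pattern, with sink() as a hypothetical stand-in for output_result():

BATCH_SIZE = 100

def sink(batch):
    # hypothetical stand-in for output_result()
    print("writing %d docs" % len(batch))

data_lis = []
for res in range(250):          # placeholder for crawled items
    data_lis.append(res)
    if len(data_lis) >= BATCH_SIZE:
        sink(data_lis)          # flush a full batch downstream
        data_lis.clear()
if data_lis:
    sink(data_lis)              # flush the remainder once the loop ends
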
......@@ -17,7 +17,7 @@ from crawler_sys.framework.es_ccr_index_defination import es_framework as es_sit
from crawler_sys.framework.es_ccr_index_defination import index_url_register
from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
from crawler_sys.framework.es_ccr_index_defination import fields_url_register
from crawler_sys.framework.es_crawler import construct_id_for_url_register
from write_data_into_es.func_cal_doc_id import cal_doc_id
from crawler_sys.utils.write_into_file import write_str_into_file
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
......@@ -82,11 +82,11 @@ def output_result(result_Lst, platform,
    # write data into es crawler-url-register index
    if output_to_es_register:
        data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
        bulk_write_into_es(data_Lst_reg,
                           index=index_url_register,
        # data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
        bulk_write_into_es(result_Lst,
                           index=es_index,
                           construct_id=True,
                           platform=platform
                           platform=platform,
                           )

    # feed url into redis
......@@ -182,7 +182,7 @@ def bulk_write_into_es(dict_Lst,
    for line in dict_Lst:
        write_counter += 1
        if construct_id and platform is not None:
            doc_id = construct_id_for_url_register(platform, line['url'])
            doc_id = cal_doc_id(platform, url=line["url"], doc_id_type='all-time-url',data_dict=line)
            action_str = ('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                          % (index, doc_id))
        else:
......
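
For context on the action_str built above: the Elasticsearch _bulk API takes newline-delimited JSON in which each action line is immediately followed by its document source. A self-contained sketch of assembling such a payload, with cal_doc_id stubbed (the real implementation lives in write_data_into_es.func_cal_doc_id and is not part of this diff):

import json

def cal_doc_id(platform, url, doc_id_type, data_dict):
    # hypothetical stub; the real ID scheme is defined elsewhere
    return "%s_%s" % (platform, url)

docs = [{"url": "http://example.com/v/1", "platform": "weibo"}]
bulk_lines = []
for line in docs:
    doc_id = cal_doc_id("weibo", url=line["url"],
                        doc_id_type='all-time-url', data_dict=line)
    # action line first, then the document source, one JSON object per line
    bulk_lines.append('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                      % ("crawler-data-raw", doc_id))
    bulk_lines.append(json.dumps(line))
payload = "\n".join(bulk_lines) + "\n"  # _bulk requires a trailing newline
print(payload)
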
......@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
def weibo(releaserUrl,**kwargs):
    try:
        containerid = ""
        if "/u/" in releaserUrl:
            releaser_id = containerid = re.findall("/u/(\d+)",releaserUrl)[0]
            releaser_id = re.findall("/u/(\d+)",releaserUrl)[0]
        elif "/p/" in releaserUrl:
            releaser_id = containerid =re.findall("/p/(\d+)",releaserUrl)[0]
            releaser_id = re.findall("/p/(\d+)",releaserUrl)[0]
            if len(releaser_id) >= 15:
                releaser_id = releaser_id[6:]
        elif "/" in releaserUrl:
            releaser_id = containerid= re.findall("(\d+)",releaserUrl)[0]
            releaser_id = re.findall("(\d+)",releaserUrl)[0]
        else:
            try:
                releaserid = int(releaserUrl)
                releaser_id = int(releaserUrl)
            except:
                return None
        return releaser_id,containerid
        return releaser_id
    except:
        return None
......
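
The reworked weibo() above now returns just the numeric releaser_id (the containerid plumbing is gone). A condensed, runnable copy for illustration, with example URLs that are not real accounts:

import re

def weibo(releaserUrl, **kwargs):
    try:
        if "/u/" in releaserUrl:
            releaser_id = re.findall(r"/u/(\d+)", releaserUrl)[0]
        elif "/p/" in releaserUrl:
            releaser_id = re.findall(r"/p/(\d+)", releaserUrl)[0]
            if len(releaser_id) >= 15:
                releaser_id = releaser_id[6:]  # drop the containerid prefix
        elif "/" in releaserUrl:
            releaser_id = re.findall(r"(\d+)", releaserUrl)[0]
        else:
            releaser_id = int(releaserUrl)
        return releaser_id
    except:
        return None

print(weibo("https://weibo.com/u/1234567890"))        # -> '1234567890'
print(weibo("https://weibo.com/p/1005051234567890"))  # -> '1234567890'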