Chengyang Zhong / crawler · Commits

Commit fd388d44
authored Jul 24, 2020 by litaolemo
update
parent 5efa3f31
Showing 8 changed files with 49 additions and 52 deletions (+49 −52)
crawler_sys/framework/es_crawler.py                        +0  −16
crawler_sys/framework/es_target_releasers.py               +3  −2
crawler_sys/proxy_pool/__init__.py                         +6  −0
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py    +15 −10
crawler_sys/site_crawler_test/crawler_weibo.py             +13 −12
crawler_sys/utils/output_results.py                        +6  −6
write_data_into_es/func_get_releaser_id.py                 +6  −6
write_data_into_es/target_releaser_add.py                  +0  −0
crawler_sys/framework/es_crawler.py

@@ -67,19 +67,3 @@ def scan_index(index, doc_type, search_body):
     return (total_hit, scan_resp)
 
-
-def construct_id_for_url_register(platform, url):
-    if platform == 'new_tudou':
-        vid_bare = calculate_newTudou_video_id(url)
-        vid = 'new_tudou_%s' % vid_bare
-    elif platform == 'toutiao':
-        vid_bare = calculate_toutiao_video_id(url)
-        vid = 'toutiao_%s' % vid_bare
-    elif platform == '腾讯新闻':
-        c_time = str(int(time.time()))
-        vid = "tencent_news_%s_%s" % (url, c_time)
-    elif platform == '网易新闻':
-        vid = "163_news_%s" % calculate_wangyi_news_id(url)
-    else:
-        vid_bare = url
-        vid = vid_bare
-    return vid
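
Note: this helper is dropped in favor of cal_doc_id (see the crawler_sys/utils/output_results.py diff below). One motivation visible in the code itself: the '腾讯新闻' branch embedded the current timestamp in the ID, so the same URL never mapped to a stable doc ID. A standalone sketch of that behavior, with a hypothetical function name and URL:

# Standalone sketch (not repo code) of the removed helper's '腾讯新闻' branch:
# the doc ID embeds the current epoch second, so registering the same URL
# twice produces two different IDs.
import time

def old_style_tencent_news_id(url):          # hypothetical name
    c_time = str(int(time.time()))
    return "tencent_news_%s_%s" % (url, c_time)

url = "https://news.qq.com/some-article"     # hypothetical URL
first = old_style_tencent_news_id(url)
time.sleep(1)
second = old_style_tencent_news_id(url)
assert first != second                       # IDs drift with time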
crawler_sys/framework/es_target_releasers.py

@@ -9,13 +9,14 @@ import random
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
-#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
+# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
+
 es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
 index_target_releaser = 'target_releasers'
 doc_type_target_releaser = 'doc'
 
 def bulk_write_target_releasers(dict_Lst,
                                 index=index_target_releaser,
                                 doc_type=doc_type_target_releaser):

@@ -74,7 +75,7 @@ def get_releaserUrls_from_es(platform,
                 releaserUrl_Lst.append((releaserUrl, releaser))
             except:
-                print('error in :', line)
+                print('error in :', line)
                 continue
     else:
         print('Got zero hits.')
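
As the second hunk shows, get_releaserUrls_from_es accumulates (releaserUrl, releaser) tuples, so callers iterate pairs. A hedged usage sketch; the function's parameters beyond platform are not visible in this hunk, so calling it with only platform assumes the rest have defaults:

# Sketch of consuming the (releaserUrl, releaser) tuples built above;
# assumes get_releaserUrls_from_es's remaining parameters have defaults.
from crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es

pairs = get_releaserUrls_from_es(platform='weibo')
for releaserUrl, releaser in pairs:
    print(releaser, releaserUrl)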
crawler_sys/proxy_pool/__init__.py (new file, mode 100644)

+# -*- coding:UTF-8 -*-
+# @Time : 2020/7/24 10:51
+# @File : __init__.py
+# @email : litao@igengmei.com
+# @author : litao
\ No newline at end of file
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py

@@ -8,8 +8,8 @@
 Currently supported auth methods are "simple" and "hmacsha1"; "simple" is the default.
 Every method accepts the keyword argument sign_type to change the auth method.
 """
-import redis,random
-import kdl,requests
+import redis, random
+import kdl, requests
 # from redis.sentinel import Sentinel

@@ -25,6 +25,7 @@ import kdl,requests
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
 rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
+
 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()

@@ -32,14 +33,15 @@ def get_proxy_from_redis():
         password = "i9mmu0a3"
-        proxies = {"http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
-                   "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
+        proxies = {
+            "http": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy},
+            "https": "http://%(user)s:%(pwd)s@%(ip)s/" % {'user': username, 'pwd': password, 'ip': one_proxy}
         }
         return proxies
     except Exception as e:
         print(e)
         return None
+
 def func_get_proxy_to_redis():
     # chance = random.random()
     auth = kdl.Auth("990866563045611", "quxguz4hwm9cxnx6wpjhkokx04klpr8v")

@@ -68,14 +70,13 @@ def func_get_proxy_to_redis():
     # ips = client.get_dps(1, sign_type='simple', format='json', pt=2, area='北京,上海,广东')
     # print("dps proxy: ", ips)
     # Check private proxies' validity: returns a dict of ip: true/false
-    #ips = client.get_dps(1, sign_type='simple', format='json')
+    # ips = client.get_dps(1, sign_type='simple', format='json')
     # valids = client.check_dps_valid(ips)
     # print("valids: ", valids)
     # Get private proxies' remaining time: returns a dict of ip: seconds
-    ips = client.get_dps(1, format='json', dedup=1)
+    ips = client.get_dps(1, format='json', dedup=1)
     seconds = client.get_dps_valid_time(ips)
     # print("seconds: ", seconds)
     for key in seconds:

@@ -84,10 +85,12 @@ def func_get_proxy_to_redis():
     # Get metered IP balance (metered private proxies only)
     # balance = client.get_ip_balance(sign_type='hmacsha1')
     # print("balance: ", balance)
+
 def proxy_test(proxies):
     page_url = "http://dev.kdlapi.com/testproxy/"
     headers = {
-        "Accept-Encoding": "Gzip",  # gzip-compress the transfer for faster access
+        "Accept-Encoding": "Gzip",  # gzip-compress the transfer for faster access
     }
     res = requests.get(url=page_url, proxies=proxies, headers=headers)

@@ -95,6 +98,7 @@ def proxy_test(proxies):
     if res.status_code == 200:
         print(res.content.decode('utf-8'))  # print the page body
+
 def get_proxy_dic(max_proxies=None):
     if not max_proxies:
         max_proxies = 8

@@ -111,6 +115,7 @@ def get_proxy_dic(max_proxies=None):
     else:
         return get_proxy_from_redis()
+
 def get_proxy(proxies_num=None):
     if proxies_num:
         proxies = get_proxy_dic(max_proxies=proxies_num)

@@ -119,8 +124,9 @@ def get_proxy(proxies_num=None):
     else:
         return None
+
 if __name__ == "__main__":
     proxy_pool_dic = get_proxy(11)
     print(proxy_pool_dic)
     proxy_test(proxy_pool_dic)
-    print(get_proxy_from_redis())
\ No newline at end of file
+    print(get_proxy_from_redis())
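
Taken together: get_proxy(proxies_num) returns a requests-style proxies mapping (or None), and proxy_test() issues a request through it. A minimal end-to-end sketch, reusing the test URL from proxy_test(); the timeout value is an assumption:

# Sketch: fetch a page through a proxy obtained from the pool.
# get_proxy() may return None, so guard before passing it to requests.
import requests
from crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy

proxies = get_proxy(1)   # e.g. {'http': 'http://user:pwd@ip/', 'https': ...}
if proxies is not None:
    resp = requests.get("http://dev.kdlapi.com/testproxy/",
                        proxies=proxies, timeout=10)
    print(resp.status_code)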
crawler_sys/site_crawler_test/crawler_weibo.py

@@ -227,17 +227,17 @@ class Crawler_weibo():
                     pass
                 data_lis.append(res)
-                # if len(data_lis) >= 100:
-                #     output_result(result_Lst=data_lis,
-                #                   platform=self.platform,
-                #                   output_to_file=output_to_file,
-                #                   filepath=filepath,
-                #                   push_to_redis=push_to_redis,
-                #                   output_to_es_register=output_to_es_register,
-                #                   output_to_es_raw=output_to_es_raw,
-                #                   es_index=es_index,
-                #                   )
-                #     data_lis.clear()
+                if len(data_lis) >= 100:
+                    output_result(result_Lst=data_lis,
+                                  platform=self.platform,
+                                  output_to_file=output_to_file,
+                                  filepath=filepath,
+                                  push_to_redis=push_to_redis,
+                                  output_to_es_register=output_to_es_register,
+                                  output_to_es_raw=output_to_es_raw,
+                                  es_index=es_index,
+                                  )
+                    data_lis.clear()
             else:
                 count_false += 1
                 if count_false > 10:

@@ -297,7 +297,7 @@ if __name__ == '__main__':
     # for r in res:
    #     print(r)
     for u in url_list:
-        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
+        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
                                    es_index='crawler-data-raw', doc_type='doc', releaser_page_num_max=4000)
     # test.get_single_page(4524055937468233)
\ No newline at end of file
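
The re-enabled block in the first hunk is a standard batch-and-flush pattern: accumulate parsed posts and flush every 100 to cap memory. A minimal standalone sketch of the same idea; fetch_items and sink are hypothetical stand-ins for the crawler loop and output_result():

# Minimal batch-and-flush sketch; `fetch_items` and `sink` are hypothetical.
BATCH_SIZE = 100

def flush(batch, sink):
    if batch:
        sink(list(batch))  # e.g. bulk-write to Elasticsearch
        batch.clear()      # reuse the same list, as crawler_weibo.py does

def crawl(fetch_items, sink):
    batch = []
    for item in fetch_items():
        batch.append(item)
        if len(batch) >= BATCH_SIZE:
            flush(batch, sink)
    flush(batch, sink)     # trailing flush for a final partial batch
                           # (not visible in the hunk above)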
crawler_sys/utils/output_results.py

@@ -17,7 +17,7 @@ from crawler_sys.framework.es_ccr_index_defination import es_framework as es_sit
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
 from crawler_sys.framework.es_ccr_index_defination import fields_url_register
-from crawler_sys.framework.es_crawler import construct_id_for_url_register
+from write_data_into_es.func_cal_doc_id import cal_doc_id
 from crawler_sys.utils.write_into_file import write_str_into_file
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy

@@ -82,11 +82,11 @@ def output_result(result_Lst, platform,
     # write data into es crawler-url-register index
     if output_to_es_register:
-        data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
-        bulk_write_into_es(data_Lst_reg,
-                           index=index_url_register,
-                           platform=platform
+        # data_Lst_reg = form_data_Lst_for_url_register(result_Lst)
+        bulk_write_into_es(result_Lst,
+                           index=es_index,
+                           construct_id=True,
+                           platform=platform,
                            )
     # feed url into redis

@@ -182,7 +182,7 @@ def bulk_write_into_es(dict_Lst,
     for line in dict_Lst:
         write_counter += 1
         if construct_id and platform is not None:
-            doc_id = construct_id_for_url_register(platform, line['url'])
+            doc_id = cal_doc_id(platform, url=line["url"], doc_id_type='all-time-url', data_dict=line)
             action_str = ('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                           % (index, doc_id))
         else:
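
With this change, the doc ID comes from cal_doc_id, a pure function of the platform, URL, and record, so re-registering the same URL is idempotent. A hedged sketch of how bulk_write_into_es assembles the newline-delimited bulk body around it; cal_doc_id's call signature is taken from the diff, the rest is simplified illustration:

# Simplified sketch of the bulk-body assembly in bulk_write_into_es().
import json
from write_data_into_es.func_cal_doc_id import cal_doc_id

def build_bulk_body(dict_Lst, index, platform):
    lines = []
    for line in dict_Lst:
        doc_id = cal_doc_id(platform, url=line["url"],
                            doc_id_type='all-time-url', data_dict=line)
        # action metadata line, then the document source line
        lines.append('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                     % (index, doc_id))
        lines.append(json.dumps(line))
    return '\n'.join(lines) + '\n'   # the ES bulk API requires a trailing newline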
write_data_into_es/func_get_releaser_id.py

@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
 def weibo(releaserUrl, **kwargs):
     try:
         containerid = ""
         if "/u/" in releaserUrl:
-            releaser_id = containerid = re.findall("/u/(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("/u/(\d+)", releaserUrl)[0]
         elif "/p/" in releaserUrl:
-            releaser_id = containerid = re.findall("/p/(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("/p/(\d+)", releaserUrl)[0]
             if len(releaser_id) >= 15:
                 releaser_id = releaser_id[6:]
         elif "/" in releaserUrl:
-            releaser_id = containerid = re.findall("(\d+)", releaserUrl)[0]
+            releaser_id = re.findall("(\d+)", releaserUrl)[0]
         else:
             try:
-                releaserid = int(releaserUrl)
+                releaser_id = int(releaserUrl)
             except:
                 return None
-        return releaser_id, containerid
+        return releaser_id
     except:
         return None
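
After this change weibo() returns just the releaser ID instead of an (id, containerid) tuple, and the bare-integer branch now assigns releaser_id rather than the unused releaserid. Callers that unpacked the tuple must be updated. Illustrative calls against the updated function; the example URLs are made up but follow the three patterns the regexes match:

# Illustrative calls; example URLs are hypothetical.
from write_data_into_es.func_get_releaser_id import weibo

print(weibo("https://weibo.com/u/1669879400"))       # '1669879400' via the /u/ branch
print(weibo("https://weibo.com/p/1005051669879400")) # '1669879400' after the [6:] trim
print(weibo("1669879400"))                           # else branch: int(releaserUrl)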
write_data_into_es/target_releaser_add.py

(diff collapsed on the original page; contents not shown)