backend / crawler · Commit 42aa9e6d

Commit 42aa9e6d authored Aug 04, 2020 by litaolemo
Parent: c95a181f

    update redis 更换地址 (switch the Redis server address)

Showing 8 changed files with 348 additions and 66 deletions (+348, -66)
crawler_sys/framework/redis_interact.py                                                       +1    -1
crawler_sys/framework/search_page_multi_process.py                                            +82   -37
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py     +1    -1
crawler_sys/framework/write_releasers_to_redis.py                                             +1    -1
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py                                       +1    -1
crawler_sys/site_crawler/crawler_zhihu.py                                                     +15   -0
crawler_sys/utils/output_results.py                                                           +246  -24
maintenance/send_email_with_file_auto_task.py                                                 +1    -1
crawler_sys/framework/redis_interact.py

@@ -8,7 +8,7 @@ import redis, json
 from crawler_sys.framework.platform_redis_register import get_redis_list_name
 from crawler_sys.framework.es_crawler import scan_crawler_url_register
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
 def feed_url_into_redis(dict_Lst, expire=0,
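The same host swap (154.8.190.251 to 172.18.51.10, port 6379) is repeated in the other touched modules below. A minimal sketch for sanity-checking the new endpoint before relying on it, assuming the host is reachable from the crawler machines and that db 19 is still the intended keyspace:

import redis

# New Redis endpoint taken from this commit; reachability is an assumption.
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

try:
    print("ping:", rds.ping())          # True if the server answers
    print("db size:", rds.dbsize())     # rough check that db 19 holds the expected keys
except redis.exceptions.ConnectionError as e:
    print("cannot reach 172.18.51.10:6379 ->", e)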
crawler_sys/framework/search_page_multi_process.py

@@ -14,7 +14,7 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
 #                     '/crawler_sys/framework/config'
 #                     '/search_keywords.ini'),
 #                     help=('config file absolute path'))
 PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
                     help=('legal platform name is required'))
 PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                     help=('key_word_legal platform name is required'))

@@ -29,8 +29,8 @@ ARGS = PARSER.parse_args()
 es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                              http_auth=('crawler', 'XBcasfo8dgfs'))
-index_target_releaser = 'search_keywords'
-doc_type_target_releaser = 'doc'
+# index_target_releaser = 'search_keywords'
+# doc_type_target_releaser = 'doc'
 # index_target_releaser = 'test2'
 # doc_type_target_releaser = 'keywrod'

@@ -52,31 +52,82 @@ OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
 def func_search_keywordlist(platform):
-    search_body = {"query": {"bool": {"filter": []}}}
-    search_resp = es_framework.search(index=index_target_releaser,
-                                      doc_type=doc_type_target_releaser,
-                                      body=search_body,
-                                      size=0,
-                                      request_timeout=100)
-    total_hit = search_resp['hits']['total']
-    releaser_dic = {}
-    if total_hit > 0:
-        print('Got %d releaser for platform %s.' % (total_hit, platform))
-        scan_resp = scan(client=es_framework, query=search_body,
-                         index=index_target_releaser,
-                         doc_type=doc_type_target_releaser,
-                         request_timeout=200)
-        for line in scan_resp:
-            try:
-                title = line['_source']['title']
-                page = line['_source']['page']
-                releaser_dic[title] = page
-            except:
-                print('error in :', line)
-                continue
-    else:
-        print('Got zero hits.')
-    return releaser_dic
+    res_dic = {}
+    res_list = ["比基尼线脱毛", "嗨体泪沟", "根据脸型选发型", "圆脸适合什么发型", "5热玛吉",
+                "耳软骨假体鼻综合", "肉毒素去法令纹", "吸脂瘦腹部", "嗨体填充泪沟", "6d小脸针",
+                "水剥离", "嗨体去颈纹", "胶原蛋白填充泪沟", "吸脂瘦全身", "肉毒素去狐臭",
+                "吸脂瘦腰部", "fotona4d", "嘴综合", "胸部下垂矫正", "5g天使光雕", "唇综合",
+                "SVF-gel脂肪胶", "嘴角上扬术", "嗨体注射", "脂肪填充修复", "比基尼脱毛",
+                "lams吸脂", "脂肪填充面部年轻化", "嗨体", "吸脂祛副乳", "m22", "胸部提升",
+                "fotona", "O型腿矫正", "肋骨鼻", "欣颜", "唯颜", "垫眉骨", "咬肌切除",
+                "背部吸脂", "m22王者之冠", "bbl", "胶原蛋白填充祛黑眼圈", ]
+    for l in res_list:
+        res_dic[l] = 10
+    return res_dic
 # def func_search_keywordlist(platform):
 #     search_body = {"query": {"bool": {"filter": []}}}
 #     search_resp = es_framework.search(index=index_target_releaser,
 #                                       doc_type=doc_type_target_releaser,
 #                                       body=search_body,
 #                                       size=0,
 #                                       request_timeout=100)
 #     total_hit = search_resp['hits']['total']
 #     releaser_dic = {}
 #     if total_hit > 0:
 #         print('Got %d releaser for platform %s.' % (total_hit, platform))
 #         scan_resp = scan(client=es_framework, query=search_body,
 #                          index=index_target_releaser,
 #                          doc_type=doc_type_target_releaser,
 #                          request_timeout=200)
 #         for line in scan_resp:
 #             try:
 #                 title = line['_source']['title']
 #                 page = line['_source']['page']
 #                 releaser_dic[title] = page
 #             except:
 #                 print('error in :', line)
 #                 continue
 #     else:
 #         print('Got zero hits.')
 #     return releaser_dic
 if OUTPUT_TO_ES_RAW is True:

@@ -99,20 +150,13 @@ def search_page_task(platform, output_to_es_raw,
         print("search keyword '%s' on platform %s" % (keyword, platform))
         search_pages = int(KEYWORD_dic[keyword])
         try:
             if platform != "腾讯新闻":
                 crawler.search_page(keyword=keyword,
                                     search_pages_max=search_pages,
                                     output_to_es_raw=output_to_es_raw,
                                     output_to_es_register=output_to_es_register,
                                     es_index=es_index,
                                     doc_type=doc_type)
             else:
                 crawler.search_video_page(keyword, None,
                                           search_pages_max=search_pages,
                                           output_to_es_raw=output_to_es_raw,
                                           output_to_es_register=output_to_es_register,
                                           es_index=es_index,
                                           doc_type=doc_type,
                                           releaser=False)
         except Exception as e:
             print(e)
             continue

@@ -128,7 +172,8 @@ kwargs_dict = {
 }
 pool = Pool(processes=4)
 for platform in PLATFORM_LIST:
     res = pool.apply_async(func=search_page_task,
                            args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
     result.append(res)
 pool.close()
 pool.join()
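After this change, func_search_keywordlist no longer queries Elasticsearch; it returns a hard-coded mapping from search keyword to a page count of 10, which search_page_task later consumes via int(KEYWORD_dic[keyword]). A small sketch of that data flow, using a few keywords from the committed list (the platform argument is unused, as in the committed version):

# Sketch only: mirrors the new return shape of func_search_keywordlist.
KEYWORD_dic = {"嗨体去颈纹": 10, "fotona4d": 10, "m22": 10}   # keyword -> max search pages to crawl

for keyword, pages in KEYWORD_dic.items():
    search_pages = int(pages)    # same coercion the task loop applies
    print("would crawl %d search pages for keyword %s" % (search_pages, keyword))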
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py

@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
 # # connect to the database
 # rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
-rds_1 = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-n', '--max_page', default=30, type=int,
crawler_sys/framework/write_releasers_to_redis.py

@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
 # connect to the database
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-p', '--platform', default=[], action='append',
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py

@@ -23,7 +23,7 @@ import kdl, requests
 # slave = sentinel.discover_slaves('ida_redis_master')
 # # connect to the database
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=18, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
 def get_proxy_from_redis():
crawler_sys/site_crawler/crawler_zhihu.py

@@ -107,6 +107,21 @@ class Crawler_zhihu():
         print(requests_res.cookies.get_dict())
         return requests_res.cookies.get_dict()

+    def parse_sigle_page(self, aid, data_dict, article_type):
+        if article_type == "knowledge_ad":
+            pass
+        elif article_type == "zvideo":
+            pass
+        elif article_type == "search_result":
+            article_type == data_dict["object"]["type"]
+            url = data_dict["object"]["type"]
+        elif article_type == "search_club":
+            pass
+        elif article_type == "relevant_query":
+            pass
+        else:
+            pass
+
     def search_article_page(self, keyword, search_pages_max=12,
                             output_to_es_raw=False,
                             output_to_es_register=False,
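For context, the new parse_sigle_page dispatch assumes each zhihu search hit carries an "object" dict with a "type" field; the payload shape below is a guess for illustration only, not taken from the zhihu API. Note that in the committed "search_result" branch the first statement uses == rather than =, so its result is discarded:

# Hypothetical search hit, only to illustrate the "search_result" branch.
data_dict = {"object": {"type": "answer"}}

article_type = "search_result"
if article_type == "search_result":
    # The commit writes `article_type == data_dict["object"]["type"]`, a comparison
    # whose value is thrown away; an assignment would look like this instead:
    article_type = data_dict["object"]["type"]
    url = data_dict["object"]["type"]
print(article_type, url)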
crawler_sys/utils/output_results.py

@@ -7,8 +7,11 @@ Created on Tue May 15 13:59:43 2018
 import json
 import datetime
 import random
 import time
 import re
 from typing import Dict, List
 import pymysql
 import requests
 from elasticsearch.exceptions import TransportError
 from crawler_sys.framework.redis_interact import feed_url_into_redis

@@ -20,12 +23,30 @@ from crawler_sys.framework.es_ccr_index_defination import fields_url_register
 from write_data_into_es.func_cal_doc_id import cal_doc_id
 from crawler_sys.utils.write_into_file import write_str_into_file
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
 from lxml import html
 from lxml.html.clean import Cleaner
 from crawler.gm_upload.gm_upload import upload, upload_file

 index_site_crawler = 'crawler-data-raw'
 doc_type_site_crawler = 'doc'

 # instantiate the MySQL connection object
 class mysql_conn():
     def __init__(self, mysql_name):
         if mysql_name == "test":
             self.conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work',
                                         passwd='Gengmei1', db='mimas_test', charset='utf8')
         elif mysql_name == "mimas":
             self.conn = pymysql.connect(host='172.16.30.138', port=3306, user='mimas',
                                         passwd='GJL3UJe1Ck9ggL6aKnZCq4cRvM', db='mimas_prod', charset='utf8mb4')
         self.cur = self.conn.cursor()

 def form_data_Lst_for_url_register(data_Lst_ori):
     ts = int(datetime.datetime.now().timestamp() * 1e3)
     data_Lst_reg = []
     for line in data_Lst_ori:
         try:

@@ -41,15 +62,15 @@ def form_data_Lst_for_url_register(data_Lst_ori):
     return data_Lst_reg

 def hot_words_output_result(result_Lst, output_index="short-video-hotwords"):
     bulk_all_body = ""
     for count, result in enumerate(result_Lst):
         doc_id = result["platform"] + "_" + result["title"]
         bulk_head = '{"index": {"_id":"%s"}}' % doc_id
         data_str = json.dumps(result, ensure_ascii=False)
         bulk_one_body = bulk_head + '\n' + data_str + '\n'
         bulk_all_body += bulk_one_body
         if count % 500 == 0 and count > 0:
             eror_dic = es_site_crawler.bulk(index=output_index,
                                             body=bulk_all_body,
                                             request_timeout=200)

@@ -59,7 +80,6 @@ def hot_words_output_result(result_Lst,output_index="short-video-hotwords"):
             print(bulk_all_body)
             print(count)
     if bulk_all_body != '':
         eror_dic = es_site_crawler.bulk(body=bulk_all_body,
                                         index=output_index,

@@ -69,14 +89,214 @@ def hot_words_output_result(result_Lst,output_index="short-video-hotwords"):
         print(eror_dic)

 WHITE_TAGS = {
     "basic": ["div", "p", "span", "img", "br", "video", 'a'],  # tentatively used for the mini-program and crawled data
     "all": ["div", "p", "span", "img", "br", "video", "audio", "a", "b", "strong", "i", "ul", "ol", "li",
             "em", "h1", "h2", "h3", "h4", "h5", "h6", "iframe", ]  # every whitelisted tag that may be displayed
 }

 img_type = {
     "OTHER": 1,  # other image
     "GIF": 2,    # GIF animation
     "JPG": 3,    # JPG image
     "JPEG": 4,   # JPEG image
     "PNG": 5,    # PNG image
     "BMP": 6,    # BMP bitmap
     "WEBP": 7,   # WEBP image
     "TIFF": 8,   # TIFF image
 }

 def gm_convert_html_tags(rich_text, all_tags=False, remove_tags=None):
     """
     Re-clean rich-text content and strip unwanted markup.
     :param rich_text: the rich text
     :param all_tags: whether to allow every tag on the whitelist
     :param remove_tags: whitelist tags that should still be stripped []
     :return:
     """
     if not rich_text:
         return ""
     # rich_text = _get_rich_text(rich_text)
     # tag cleaning + completion parameters
     tags = WHITE_TAGS["all"] if all_tags else WHITE_TAGS["basic"]
     if remove_tags:
         tags = [tag for tag in tags if tag not in remove_tags]
     kw = {
         "remove_unknown_tags": False,
         "allow_tags": tags,
         "safe_attrs": ["src", ],
     }
     if "a" in tags:
         kw["safe_attrs"].append("href")
     elif all_tags:
         kw["safe_attrs"].extend(["class", "style"])
     if "iframe" in kw["allow_tags"]:
         kw["embedded"] = False
     clear = Cleaner(**kw)
     rich_text = clear.clean_html(rich_text)
     # add styling
     element_obj = html.fromstring(rich_text)
     for element in element_obj.xpath(u"//img|//video"):
         if not all_tags:
             # mini-program, ordinary users, crawled data
             element.attrib["width"] = "100%"  # give images and videos 100% width
         if element.tag == "video" and all_tags:
             element.attrib["class"] = "js_richtext_video"
     # remove <a> tags whose href does not start with gengmei://
     for item in element_obj.xpath("//a[not(starts-with(@href, 'gengmei://'))]"):
         item.getparent().remove(item)
     # append a style to <a> tags
     for item in element_obj.xpath("//a"):
         item.attrib["style"] = 'color:#3FB5AF'  # <a> tag colour
     rich_text = html.tostring(element_obj, encoding="unicode")
     return rich_text

 def push_data_to_user(res_data: Dict) -> Dict:
     """
     Massage the data into a format that can be written to the database.
     :param res_data:
     :return:
     """
     qiniu_img_list = []
     if res_data["img_list"]:
         for img_url in res_data["img_list"]:
             try:
                 img_wb = retry_get_url(img_url).content
                 res = upload(img_wb)
                 print(res)
                 img_info = retry_get_url(res + "-imageinfo")
                 img_info_json = img_info.json()
                 qiniu_img_list.append((res + "-w", img_info_json))
             except Exception as e:
                 print("down load img error %s" % e)
                 return {}
     # replace the images
     if res_data["platform"] == "weibo":
         res_data["qiniu_img_list"] = qiniu_img_list
         if "http://t.cn/" in res_data["title"]:
             res_data["title"] = res_data["title"].split("http://t.cn/")[0]
         res_data["content"] = res_data["title"]
     elif res_data["platform"] == "douban":
         content = res_data.get("content")
         if content:
             for count, img_url in enumerate(res_data["img_list"]):
                 # print(qiniu_img_list[count][0])
                 content = content.replace(img_url, qiniu_img_list[count][0])
         res_data["qiniu_img_list"] = qiniu_img_list
         res_data["content"] = content
     if res_data["platform"] == "weibo":
         res_data["content"] = gm_convert_html_tags(res_data["title"], all_tags=True)
         res_data["title"] = ""
     elif res_data["platform"] == "douban":
         res_data["content"] = gm_convert_html_tags(res_data["content"], all_tags=True)
     return res_data

 def write_data_into_mysql(res_data, mysql):
     now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     # clean the data into a storable format
     data = push_data_to_user(res_data)
     if not data.get("content"):
         return None
     if not data.get("qiniu_img_list"):
         return None
     tractate_id = 0
     try:
         sql_query = """insert into api_tractate
         (user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
         values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');""".format(
             user_id=random.choice(user_id_list), content=data["content"], is_online=0, status=2, platform=15,
             content_level=data["level"], is_excellent=0, create_time=now_str, last_modified=now_str,
             user_del=0, low_quality=0, low_quality_deal=0, platform_id=data["doc_id"], pgc_type=0,
             title=data["title"])
         res = mysql.cur.execute(sql_query)
         tractate_id = int(mysql.conn.insert_id())
         if res:
             mysql.conn.commit()
     except Exception as e:
         print("commit error %s" % e)
         print(data)
         mysql.conn.rollback()
     if data.get("qiniu_img_list"):
         for img_info in data.get("qiniu_img_list"):
             if img_info[0] in data.get("content"):
                 image_url_source = 2
             else:
                 image_url_source = 3
             try:
                 image_type = img_type.get(img_info[1]["format"].upper())
             except:
                 image_type = 1
             try:
                 width = img_info[1]["width"]
                 height = img_info[1]["height"]
             except:
                 width = 0
                 height = 0
             try:
                 if img_type == 7:
                     sql_query = """
                     insert into api_tractate_images (tractate_id,image_url,width,image_webp,height,image_url_source,image_type,image_webp,create_time,update_time)
                     values ({tractate_id},'{image_url}',{width},{height},{image_webp},{image_url_source},{image_type},{image_webp},'{create_time}','{update_time}')
                     """.format(tractate_id=tractate_id, image_url=img_info[0], width=width, height=height,
                                image_url_source=image_url_source, image_type=image_type, image_webp=img_info[0],
                                create_time=now_str, update_time=now_str)
                 else:
                     sql_query = """
                     insert into api_tractate_images (tractate_id,image_url,width,height,image_url_source,image_type,create_time,update_time)
                     values ({tractate_id},'{image_url}',{width},{height},{image_url_source},{image_type},'{create_time}','{update_time}')
                     """.format(tractate_id=tractate_id, image_url=img_info[0], width=width, height=height,
                                image_url_source=image_url_source, image_type=image_type,
                                create_time=now_str, update_time=now_str)
                 res = mysql.cur.execute(sql_query)
                 if res:
                     mysql.conn.commit()
             except Exception as e:
                 print("commit error %s" % e)
                 mysql.conn.rollback()
     if tractate_id:
         return tractate_id
     else:
         return None

 def output_result(result_Lst, platform,
                   output_to_file=False, filepath=None,
                   output_to_es_raw=False,
                   output_to_es_register=False,
                   push_to_redis=False,
                   batch_str=None,
                   release_time_lower_bdr=None,
                   output_to_test_mysql=False,
                   output_to_mimas_mysql=False,
                   es_index=index_site_crawler, **kwargs):
     # write data into es crawler-raw index
     if output_to_es_raw:
         bulk_write_into_es(result_Lst, es_index)
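The new helpers above form a small pipeline: gm_convert_html_tags() whitelists and restyles crawled rich text, push_data_to_user() rehosts images and normalises the record, and write_data_into_mysql() inserts it into api_tractate. A minimal, hedged sketch of exercising just the HTML-cleaning idea on its own (lxml is the only dependency; the input snippet is invented for illustration and the whitelist mirrors WHITE_TAGS["basic"]):

from lxml import html
from lxml.html.clean import Cleaner

# Same idea as gm_convert_html_tags with all_tags=False: keep a small tag whitelist
# plus src/href attributes, then force media elements to 100% width.
allow_tags = ["div", "p", "span", "img", "br", "video", "a"]
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=allow_tags, safe_attrs=["src", "href"])

raw = '<div onclick="x()"><script>alert(1)</script><p>hi</p><img src="a.jpg"></div>'
cleaned = cleaner.clean_html(raw)

tree = html.fromstring(cleaned)
for el in tree.xpath("//img|//video"):
    el.attrib["width"] = "100%"
print(html.tostring(tree, encoding="unicode"))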
@@ -90,34 +310,36 @@ def output_result(result_Lst, platform,
                            platform=platform,
                            )
     if output_to_test_mysql:
         pass
     # feed url into redis
     if push_to_redis:
         feed_url_into_redis(result_Lst,
                             expire=kwargs.get("expire"))
     # output into file according to passed in parameters
     if output_to_file is True and filepath is not None:
         output_fn = ('crawler_%s_on_%s_json'
                      % (platform, datetime.datetime.now().isoformat()[:10]))
         output_f = open(filepath + '/' + output_fn, 'a', encoding='utf-8')
         write_into_file(result_Lst, output_f)
         output_f.close()
     else:
         return result_Lst

 def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
     retry_c = 0
     while retry_c < retrys:
         try:
             if proxies:
                 proxies_dic = get_proxy(proxies)
                 if not proxies_dic:
                     get_resp = requests.get(url, timeout=timeout, **kwargs)
                 else:
                     get_resp = requests.get(url, proxies=proxies_dic, timeout=timeout, **kwargs)
             else:
                 get_resp = requests.get(url, timeout=timeout, **kwargs)
             return get_resp
         except Exception as e:
             retry_c += 1

@@ -151,7 +373,7 @@ def bulk_write_into_es(dict_Lst,
     bulk_write_body = ''
     write_counter = 0

     def bulk_write_with_retry_UnicodeEncodeError(index, bulk_write_body,
                                                  retry_counter_for_UnicodeEncodeError):
         if bulk_write_body != '':
             try:

@@ -159,7 +381,7 @@ def bulk_write_into_es(dict_Lst,
                                       body=bulk_write_body,
                                       request_timeout=100)
                 bulk_write_body = ''
                 # print(bulk_write_resp)
                 print('Writing into es done')
             except UnicodeEncodeError as ue:
                 print('Got UnicodeEncodeError, will remove ill formed string and retry.')

@@ -167,9 +389,9 @@ def bulk_write_into_es(dict_Lst,
                 UnicodeEncodeError_msg = ue.__str__()
                 ill_str_idxs = get_ill_encoded_str_posi(UnicodeEncodeError_msg)
                 if len(ill_str_idxs) == 2:
                     ill_str = bulk_write_body[ill_str_idxs[0]: ill_str_idxs[1] + 1]
                     bulk_write_body = bulk_write_body.replace(ill_str, '')
                     bulk_write_with_retry_UnicodeEncodeError(index, bulk_write_body,
                                                              retry_counter_for_UnicodeEncodeError)
             except TransportError:

@@ -182,7 +404,7 @@ def bulk_write_into_es(dict_Lst,
     for line in dict_Lst:
         write_counter += 1
         if construct_id and platform is not None:
             doc_id = cal_doc_id(platform, url=line["url"], doc_id_type='all-time-url', data_dict=line)
             action_str = ('{ "index" : { "_index" : "%s", "_id" : "%s" } }'
                           % (index, doc_id))
         else:

@@ -191,7 +413,7 @@ def bulk_write_into_es(dict_Lst,
         data_str = json.dumps(line, ensure_ascii=False)
         line_body = action_str + '\n' + data_str + '\n'
         bulk_write_body += line_body
         if write_counter % 1000 == 0 or write_counter == len(dict_Lst):
             print('Writing into es %s %d/%d' % (index, write_counter, len(dict_Lst)))

@@ -215,7 +437,7 @@ def load_json_file_into_dict_Lst(filename, path):
     if path[-1] != '/':
         path += '/'
     data_Lst = []
     with open(path + filename, 'r', encoding='utf-8') as f:
         for line in f:
             line_d = json.loads(line)
             if 'data_provider' not in line_d:

@@ -234,7 +456,7 @@ def crawl_a_url_and_update_redis(url, platform, urlhash, processID=-1):
     # perform crawling, get the data
     # write es or output to files
     # update redis
     ts = int(datetime.datetime.now().timestamp() * 1e3)
     redis_hmset_dict = {'push_time': ts, 'is_fetched': 1, 'url': url, 'platform': platform}
     rds.hmset(urlhash, redis_hmset_dict)

@@ -254,7 +476,7 @@ def scan_redis_to_crawl():
     scan_counter = 0
     while True:
         scan_counter += 1
         if scan_counter % 5 == 0:
             print(scan_counter, 'cur:', cur, datetime.datetime.now())
         cur, hash_keys = rds.scan(cur)
         for urlhash in hash_keys:
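output_result() now also accepts output_to_test_mysql and output_to_mimas_mysql alongside the existing Elasticsearch, Redis, and file switches. A hedged sketch of how a crawler might call it after this commit; the sample records are invented, and which flags real call sites set is not visible in this diff:

from crawler_sys.utils.output_results import output_result

# Invented sample records; real ones come from the site crawlers.
result_Lst = [
    {"platform": "zhihu", "url": "https://www.zhihu.com/question/1", "title": "demo", "page": 1},
]

output_result(result_Lst, platform="zhihu",
              output_to_es_raw=True,           # bulk-write into the crawler-data-raw index
              output_to_es_register=False,
              push_to_redis=False,
              output_to_test_mysql=False,      # new in this commit; its branch is currently a no-op
              output_to_mimas_mysql=False)     # new in this commit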
maintenance/send_email_with_file_auto_task.py

@@ -4,7 +4,7 @@
 import redis, time, json, datetime, sys
 from maintenance.func_send_email_with_file import send_file_email
-rds = redis.StrictRedis(host='154.8.190.251', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
 def write_email_task_to_redis(task_name=None, file_path=None, data_str=None, email_group=[],