Chengyang Zhong / crawler · Commits

Commit 4362749f, authored 4 years ago by litaolemo
Commit message: update
Parent: fee41916
Branches: master, xiangwan
Showing 11 changed files with 166 additions and 242 deletions (+166 −242)
Changed files:
  crawler_sys/framework/redis_interact.py                          +8   −3
  crawler_sys/framework/search_page_single_process.py              +89  −45
  crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py          +16  −2
  crawler_sys/scheduler/generate_redis_url_batch.py                +2   −2
  crawler_sys/site_crawler/crawler_toutiao.py                      +7   −7
  crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py          +12  −2
  crawler_sys/site_crawler/crawler_zhihu.py                        +17  −171
  crawler_sys/utils/output_results.py                              +11  −7
  requirements.txt                                                 +1   −1
  write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py   +1   −1
  write_data_into_es/func_cal_doc_id.py                            +2   −1
crawler_sys/framework/redis_interact.py

@@ -8,11 +8,16 @@ import redis, json
 from crawler_sys.framework.platform_redis_register import get_redis_list_name
 from crawler_sys.framework.es_crawler import scan_crawler_url_register

-rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
+def redis_path(redis_type=""):
+    if redis_type == "on_line":
+        rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
+    else:
+        rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
+    return rds

-def feed_url_into_redis(dict_Lst, expire=0,):
+def feed_url_into_redis(dict_Lst, expire=0, rds=redis_path):
     """
     release_time_lower_bdr must be an int value represent
     timestamp in milliseconds if given.
     ...
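Note on the change above: the module-level StrictRedis connection is replaced by redis_path(), which returns the 172.16.40.164 instance for "on_line" and the 172.18.51.10 instance otherwise, and feed_url_into_redis now takes the connection as an rds argument (its default is the redis_path function object itself, so callers are expected to pass a resolved connection). A minimal usage sketch, assuming only the names shown in the hunk; the URL dict below is a hypothetical payload:

    from crawler_sys.framework.redis_interact import redis_path, feed_url_into_redis

    rds = redis_path("on_line")                                            # production Redis; any other value selects 172.18.51.10
    url_dicts = [{"url": "https://example.com/a", "platform": "toutiao"}]  # hypothetical payload
    feed_url_into_redis(url_dicts, expire=3600, rds=rds)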
crawler_sys/framework/search_page_single_process.py

@@ -16,20 +16,18 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
 #                         '/crawler_sys/framework/config'
 #                         '/search_keywords.ini'),
 #                     help=('config file absolute path'))
-PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
+PARSER.add_argument('-p', '--platform', default=["toutiao", "weibo", "zhihu"], action='append',
                     help=('legal platform name is required'))
 PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                     help=('key_word_legal platform name is required'))
-PARSER.add_argument('-w', '--output_to_es_raw', default=True,
+PARSER.add_argument('-w', '--output_to_es_raw', default=False,
                     help=('output to es raw'))
-PARSER.add_argument('-g', '--output_to_es_register', default=False,
+PARSER.add_argument('-g', '--output_to_es_register', default=True,
                     help=('output to es register'))
+PARSER.add_argument('-n', '--maxpage', default=20,
+                    help=('maxpage'))
 ARGS = PARSER.parse_args()
 es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                              http_auth=('crawler', 'XBcasfo8dgfs'))

@@ -41,45 +39,92 @@ if ARGS.platform != []:
 #             "program will exit" % platform)
 #         sys.exit(0)
 # CONFIG = configparser.ConfigParser()
 # CONFIG.read(ARGS.conf, encoding='utf-8')
 OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
 OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
 #
 # def func_search_keywordlist(platform):
 #     search_body = {"query": {"bool": {"filter": []}}}
 #     search_resp = es_framework.search(index=index_target_releaser,
 #                                       doc_type=doc_type_target_releaser,
 #                                       body=search_body,
 #                                       size=0,
 #                                       request_timeout=100)
 #     total_hit = search_resp['hits']['total']
 #     releaser_dic = {}
 #     if total_hit > 0:
 #         print('Got %d releaser for platform %s.' % (total_hit, platform))
 #         scan_resp = scan(client=es_framework, query=search_body,
 #                          index=index_target_releaser,
 #                          doc_type=doc_type_target_releaser,
 #                          request_timeout=200)
 #         for line in scan_resp:
 #             try:
 #                 title = line['_source']['title']
 #                 page = line['_source']['page']
 #                 releaser_dic[title] = page
 #             except:
 #                 print('error in :', line)
 #                 continue
 #     else:
 #         print('Got zero hits.')
 #     return releaser_dic
 def func_search_keywordlist(platform):
-    search_body = {"query": {"bool": {"filter": []}}}
-    search_resp = es_framework.search(index=index_target_releaser,
-                                      doc_type=doc_type_target_releaser,
-                                      body=search_body,
-                                      size=0,
-                                      request_timeout=100)
-    total_hit = search_resp['hits']['total']
-    releaser_dic = {}
-    if total_hit > 0:
-        print('Got %d releaser for platform %s.' % (total_hit, platform))
-        scan_resp = scan(client=es_framework, query=search_body,
-                         index=index_target_releaser,
-                         doc_type=doc_type_target_releaser,
-                         request_timeout=200)
-        for line in scan_resp:
-            try:
-                title = line['_source']['title']
-                page = line['_source']['page']
-                releaser_dic[title] = page
-            except:
-                print('error in :', line)
-                continue
-    else:
-        print('Got zero hits.')
-    return releaser_dic
-if OUTPUT_TO_ES_RAW is True:
-    ES_INDEX = 'test2'
-    DOC_TYPE = 'doc'
-    print(ES_INDEX, DOC_TYPE)
+    res_dic = {}
+    res_list = ["比基尼线脱毛", "嗨体泪沟", "根据脸型选发型", "圆脸适合什么发型", "5热玛吉",
+                "耳软骨假体鼻综合", "肉毒素去法令纹", "吸脂瘦腹部", "嗨体填充泪沟", "6d小脸针",
+                "水剥离", "嗨体去颈纹", "胶原蛋白填充泪沟", "吸脂瘦全身", "肉毒素去狐臭",
+                "吸脂瘦腰部", "fotona4d", "嘴综合", "胸部下垂矫正", "5g天使光雕",
+                "唇综合", "SVF-gel脂肪胶", "嘴角上扬术", "嗨体注射", "脂肪填充修复",
+                "比基尼脱毛", "lams吸脂", "脂肪填充面部年轻化", "嗨体", "吸脂祛副乳",
+                "m22", "胸部提升", "fotona", "O型腿矫正", "肋骨鼻",
+                "欣颜", "唯颜", "垫眉骨", "咬肌切除", "背部吸脂",
+                "m22王者之冠", "bbl", "胶原蛋白填充祛黑眼圈", "热玛吉", "热玛吉5代",
+                ]
+    for l in res_list:
+        res_dic[l] = 1
+    return res_dic
+
+ES_INDEX = 'crawler-data-raw'
+print(ES_INDEX)
+pages = ARGS.maxpage
 for platform in PLATFORM_LIST:

@@ -92,12 +137,11 @@ for platform in PLATFORM_LIST:
         print("search keyword '%s' on platform %s" % (keyword, platform))
         search_pages = int(KEYWORD_dic[keyword])
         try:
-            if platform != "腾讯新闻":
-                crawler.search_page(keyword=keyword,
-                                    search_pages_max=search_pages,
-                                    output_to_es_raw=OUTPUT_TO_ES_RAW,
-                                    output_to_es_register=OUTPUT_TO_ES_REGISTER,
-                                    es_index=ES_INDEX,)
+            crawler.search_page(keyword=keyword,
+                                search_pages_max=search_pages,
+                                output_to_es_raw=OUTPUT_TO_ES_RAW,
+                                output_to_es_register=OUTPUT_TO_ES_REGISTER,
+                                es_index=ES_INDEX,)
         except Exception as e:
             print(e)
     ...
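With the defaults flipped above (raw output off, register output on) and the keyword source replaced by a fixed dictionary, a quick check of the new keyword function would behave roughly as below; this sketch assumes the hard-coded variant shown above is the one that ends up live in this module:

    KEYWORD_dic = func_search_keywordlist("toutiao")   # the platform argument is not used by the hard-coded variant
    print(KEYWORD_dic["热玛吉"])                        # every keyword maps to a page count of 1
    for keyword in KEYWORD_dic:
        search_pages = int(KEYWORD_dic[keyword])       # the crawl loop reads the page count like this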
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py

@@ -10,7 +10,7 @@
 """
 import redis, random
 import kdl, requests
+import sys
 # from redis.sentinel import Sentinel
 # sentinel = Sentinel([('192.168.17.65', 26379),

@@ -23,9 +23,23 @@ import kdl, requests
 # slave = sentinel.discover_slaves('ida_redis_master')
 # # 连接数据库
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True, password='ReDis!GmTx*0aN12')
+def func_get_redis():
+    sys_path = sys.path
+    for p in sys_path:
+        if "C:\\" in p:
+            stats = "test"
+            break
+    if stats == "on_line":
+        rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True,
+                                password='ReDis!GmTx*0aN12')
+    else:
+        rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
+    return rds
+
+rds = func_get_redis()

 def get_proxy_from_redis():
     try:
         one_proxy = rds.randomkey()
     ...
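The new func_get_redis() inspects sys.path and treats a path containing "C:\\" as a test machine; note that stats is only assigned inside that check, so the "on_line" comparison below it relies on stats being set elsewhere. A minimal sketch of how the module is consumed, using only names visible in the hunk:

    rds = func_get_redis()          # resolved once at import time, as shown above
    one_proxy = rds.randomkey()     # get_proxy_from_redis() draws a random proxy key like this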
crawler_sys/scheduler/generate_redis_url_batch.py

@@ -20,11 +20,11 @@ parser.add_argument('-d', '--days_from_now', default=30, type=int,
                           'default 30.'))
 args = parser.parse_args()

 def redis_url_batch_gen(platform, batch_str, release_time_lower_bdr):
     url_Lst = pull_url_from_es(platform, release_time_lower_bdr)
     if url_Lst != []:
-        redis_list_name, push_counter = feed_url_into_redis(
-            url_Lst, platform, batch_str=batch_str)
+        redis_list_name, push_counter = feed_url_into_redis(
+            url_Lst, platform,)
         return (redis_list_name, push_counter)
     else:
         return (None, None)
 ...
crawler_sys/site_crawler/crawler_toutiao.py

@@ -343,8 +343,7 @@ class Crawler_toutiao():
     def search_page_old(self, keyword, search_pages_max=12,
                         output_to_es_raw=False,
                         output_to_es_register=False,
-                        es_index=None, doc_type=None,
-                        proxies_num=0):
+                        es_index=None, proxies_num=0):
         headers_search = {
             "accept": "application/json, text/javascript",
             "accept-encoding": "gzip, deflate",

@@ -428,9 +427,10 @@ class Crawler_toutiao():
                             print("method get_web_article_info error %s" % e)
                         print(D0)
                         toutiao_Lst.append(D0)
-                except KeyError:
+                except Exception as e:
                     # It's totally ok to drop the last return data value.
                     # The search api just return something seems related to search
+                    print(e)
                     continue
             else:
                 break

@@ -440,7 +440,7 @@ class Crawler_toutiao():
                         output_to_es_raw=output_to_es_raw,
                         output_to_es_register=output_to_es_register,
                         es_index=es_index,
-                        doc_type=doc_type)
+                        )
                 toutiao_Lst.clear()
         if toutiao_Lst != []:

@@ -449,7 +449,7 @@ class Crawler_toutiao():
                     output_to_es_raw=output_to_es_raw,
                     output_to_es_register=output_to_es_register,
                     es_index=es_index,
-                    doc_type=doc_type)
+                    )
         return toutiao_Lst

@@ -461,7 +461,7 @@ class Crawler_toutiao():
             self.search_page_old(keyword, search_pages_max=search_pages_max,
                                  output_to_es_raw=output_to_es_raw,
                                  output_to_es_register=output_to_es_register,
                                  es_index=es_index,
-                                 doc_type=doc_type, proxies_num=proxies_num)
+                                 proxies_num=proxies_num)

     def find_releaser_id(self, releaserUrl):
         return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

@@ -1799,4 +1799,4 @@ if __name__ == '__main__':
     #                           doc_type='doc',
     #                           releaser_page_num_max=3, proxies_num=1))
     # test.releaser_page(u)
-    test.search_page("热玛吉五代")
+    test.search_page("比基尼线脱毛")
crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py

@@ -432,6 +432,7 @@ class Crawler_weibo():
             video_dic["releaserUrl"] = data["userinfo"].get('url')
             video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
             video_dic["img_list"] = re.findall('img src="(.*?)"', data["content"])
+            video_dic["mid"] = article_id
             return video_dic
         except Exception as e:
             print("single data row formate error %s" % e)

@@ -442,6 +443,7 @@ class Crawler_weibo():
                     output_to_es_register=False,
                     es_index=None, doc_type=None,
                     proxies_num=0):
+        count_false = 0
         headers_search = {
             "Accept": "application/json, text/plain, */*",
             "MWeibo-Pwa": "1",

@@ -463,6 +465,13 @@ class Crawler_weibo():
             if get_page.status_code != 200:
                 continue
             page_dict = get_page.json()
+            while page_dict['data'].get("msg") == '这里还没有内容':
+                get_page = retry_get_url(search_page_url, headers=headers_search)
+                page_dict = get_page.json()
+                count_false += 1
+                if count_false >= 3:
+                    continue
             if page_dict['data'].get("cards")[0].get("card_group"):
                 for one_line in page_dict['data'].get("cards")[0].get("card_group"):
                     try:

@@ -488,7 +497,7 @@ class Crawler_weibo():
                         # D0['play_count'] = play_count
                         # D0['comment_count'] = comment_count
                         # D0['favorite_count'] = favorite_count
-                        D0['article_id'] = article_id
+                        D0['mid'] = article_id
                         # D0['releaser'] = releaser
                         # D0['releaserUrl'] = releaserUrl
                         # D0['release_time'] = release_time

@@ -501,6 +510,7 @@ class Crawler_weibo():
                             D0.update(article_info)
                         except Exception as e:
                             print("method get_web_article_info error %s" % e)
+                            continue
                         print(D0)
                         weibo_Lst.append(D0)
                 except KeyError:

@@ -850,5 +860,5 @@ if __name__ == '__main__':
     # test_search2 = weibo.search_page(keyword, user_name, password)
     # test_repost = weibo.repost_page(weibo_id, user_name, password)
     # user_page = weibo.user_page(user_id, user_name, password)
-    weibo.search_page("迪丽热巴")
+    weibo.search_page("迪丽热巴", output_to_es_register=True, es_index="crawler-data-raw", search_pages_max=1)
     # print(user_page)
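The added while loop re-requests the search page while Weibo returns the '这里还没有内容' ("nothing here yet") placeholder, counting attempts in count_false. A condensed sketch of the same retry pattern, using only names from the hunks above; note that the committed code uses continue on the third failure, which re-evaluates the while condition rather than leaving the loop:

    count_false = 0
    page_dict = get_page.json()
    while page_dict['data'].get("msg") == '这里还没有内容':           # placeholder meaning "nothing here yet"
        get_page = retry_get_url(search_page_url, headers=headers_search)
        page_dict = get_page.json()
        count_false += 1
        if count_false >= 3:
            break                                                    # sketch only; the hunk above uses continue here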
crawler_sys/site_crawler/crawler_zhihu.py

@@ -24,8 +24,8 @@ import requests
 # import execjs
 import hashlib
 import requests
-from bs4 import BeautifulSoup
+import execjs
+# from bs4 import BeautifulSoup
 from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
 from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
 from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count

@@ -48,6 +48,9 @@ class Crawler_zhihu():
         self.video_data['platform'] = self.platform
         # remove fields that crawled data don't have
         pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
+        with open('./zhihu.js', 'r', encoding='utf-8') as f:
+            js = f.read()
+        self.exec_js = execjs.compile(js)
         for popk in pop_key_Lst:
             self.video_data.pop(popk)

@@ -71,7 +74,7 @@ class Crawler_zhihu():
         requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
         tres_json_test = requests_res.text
         res_json = json.loads(re.findall('<script id="js-initialData" type="text/json">(.*?)</script>', tres_json_test)[0])
-        print(res_json)
+        # print(res_json)
         data = res_json["initialState"]
         video_dic = {}
         video_dic["url"] = url

@@ -131,7 +134,7 @@ class Crawler_zhihu():
             pass
         return res_dict

-    def search_article_page(self, keyword, search_pages_max=12,
+    def search_article_page(self, keyword, search_pages_max=10,
                             output_to_es_raw=False,
                             output_to_es_register=False,
                             es_index=None,

@@ -151,7 +154,7 @@ class Crawler_zhihu():
             "x-app-za": "OS=Web",
             "x-requested-with": "fetch",
             "x-zse-83": "3_2.0",
-            "x-zse-86": "1.0_a_Yy6euBS_xfbM28ZhtycHU8gG2XoHtyGTxqHve8rXtY",
+            "x-zse-86": None,
             "referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)),
         }

@@ -163,6 +166,10 @@ class Crawler_zhihu():
         url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(urllib.parse.quote(keyword))
         offset = 0
+        f = "+".join(["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
+        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
+        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
+        res_list = []
         while offset <= search_pages_max * 20:
             offset += 20
             get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies_num)

@@ -176,7 +183,6 @@ class Crawler_zhihu():
             # print(get_page.cookies.get_dict())
             cookies_dict.update(get_page.cookies.get_dict())
             headers_search.pop("x-zse-86", 0)
-            res_list = []
             if page_dict.get("data"):
                 for one_line in page_dict['data']:
                     try:

@@ -191,7 +197,7 @@ class Crawler_zhihu():
                         D0.update(res_data)
                     except Exception as e:
                         print("method get_web_article_info error %s" % e)
-                    print(D0)
+                    # print(D0)
                     res_list.append(D0)
                 except KeyError:
                     # It's totally ok to drop the last return data value.

@@ -231,137 +237,6 @@ class Crawler_zhihu():
                 es_index=es_index,
                 doc_type=doc_type,
                 proxies_num=proxies_num)

-    def repost_page(self, weibo_id, user_name, password):
-        total_page = 0
-        result_lst = []
-        cookie = self.manipulate_login(user_name=user_name, password=password)
-        # cookie = self.test_cookie(get_cookie)
-        if cookie is not None:
-            current_time = int(time.time() * 1000)
-            repost_url = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id + '&max_id=0&page=1&__rnd=' + str(current_time)
-            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
-            get_page.encoding = 'utf-8'
-            try:
-                page_dic = get_page.json()
-                total_page = page_dic['data']['page']['totalpage']
-                repost_info = page_dic['data']['html']
-                repost_soup = BeautifulSoup(repost_info, 'html.parser')
-                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
-                for line in repost_agg:
-                    try:
-                        one_repost = self.get_repost_info(line)
-                        result_lst.append(one_repost)
-                        print('get one repost')
-                    except:
-                        print('one repost data error')
-                        print(one_repost)
-            except:
-                print("can't get repost data")
-            time.sleep(6)
-        if cookie is not None and total_page != 0:
-            for page_num in range(1, total_page + 1):
-                current_time = int(time.time() * 1000)
-                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id
-                              + '&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
-                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
-                time.sleep(3)
-                get_page.encoding = 'utf-8'
-                try:
-                    page_dic = get_page.json()
-                    total_page = page_dic['data']['page']['totalpage']
-                    repost_info = page_dic['data']['html']
-                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
-                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
-                    for line in repost_agg:
-                        one_repost = self.get_repost_info(line)
-                        result_lst.append(one_repost)
-                        print('get one repost at %s' % page_num)
-                        print(one_repost)
-                except:
-                    print("can't get repost data")
-        if result_lst != []:
-            return result_lst
-        else:
-            print("can't get repost data")
-            return None

-    def user_page(self, user_id, user_name, password):
-        result_lst = []
-        cookie_pool = open('cookie_pool', 'r', encoding='utf-8')
-        for coo in cookie_pool:
-            print(coo)
-            cookie = json.loads(coo)
-        # cookie = self.manipulate_login(user_name=user_name,password=password)
-        # cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
-        if cookie is not None:
-            for page_num in range(1, 3):
-                first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
-                             '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
-                get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
-                get_page.encoding = 'utf-8'
-                page = get_page.text
-                soup = BeautifulSoup(page, 'html.parser')
-                sfa = soup.find_all('script')
-                find_content = ''
-                for line in sfa:
-                    if 'Pl_Official_MyProfileFeed__' in str(line):
-                        find_content = str(line)
-                        find_content = find_content.replace('<script>FM.view(', '').replace(')</script>', '')
-                # print(find_content)
-                find_content_dic = json.loads(find_content)
-                content_for_soup = find_content_dic['html']
-                soup_content = BeautifulSoup(content_for_soup, 'html.parser')
-                weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
-                # time.sleep(15)
-                for line_count, line in enumerate(weibo_lst):
-                    weibo_info = self.get_user_weibo_info(line, cookie)
-                    weibo_info['user_id'] = user_id
-                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
-                    result_lst.append(weibo_info)
-                    print('get data at element page: %s pagebar: %s' % (page_num, line_count))
-                get_parameter = soup.find_all('script', {'type': 'text/javascript'})
-                for line in get_parameter:
-                    if 'pid' in str(line) and 'oid' in str(line):
-                        parameter_str = str(line)
-                        parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
-                        domain = re.findall('\d+', ''.join(re.findall("pid]=\d+", parameter_str)))[0]
-                        special_id = re.findall('\d+', ''.join(re.findall("page_id]=\d+", parameter_str)))[0]
-                current_time = int(time.time() * 1000)
-                for pagebar in [0, 1]:
-                    user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=' + domain
-                                + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar)
-                                + '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id
-                                + '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num)
-                                + '&pre_page=1' '&domain_op=' + domain + '&__rnd=' + str(current_time))
-                    get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
-                    get_page.encoding = 'utf-8'
-                    try:
-                        page_dic = get_page.json()
-                        user_weibo_str = page_dic['data']
-                        user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
-                        user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
-                        # time.sleep(15)
-                        for line in user_weibo_agg:
-                            try:
-                                weibo_info = self.get_user_weibo_info(line, cookie)
-                                weibo_info['user_id'] = user_id
-                                weibo_info['user_url'] = 'https://weibo.com/' + user_id
-                                result_lst.append(weibo_info)
-                                print('get data at ajax page page_num: %s pagebar: %s' % (page_num, pagebar))
-                            except:
-                                print('one weibo_info error')
-                    except:
-                        print('page error at page_num: %s pagebar: %s' % (page_num, pagebar))
-        if result_lst != []:
-            return result_lst
-        else:
-            print("can't get repost data")
-            return None

     @staticmethod
     def get_single_page(mid):

@@ -530,40 +405,11 @@ class Crawler_zhihu():
 if __name__ == '__main__':
     zhihu = Crawler_zhihu()
     import os
     # import PyV8
     import execjs
     os.environ["EXECJS_RUNTIME"] = 'Node'
     # print(execjs.get().name )
     # os.environ["EXECJS_RUNTIME"] = 'Node'
     # print(execjs.get().name )
     # zhihu.get_serach_page_cookies("热玛吉")
     # zhihu.search_page("比基尼线脱毛")
+    zhihu.search_page("双眼皮", search_pages_max=1, output_to_es_register=True)
     # zhihu.get_single_answer_page("325099876","1209953121")
     # print(user_page)
-    if True:
-        # with PyV8.JSContext() as ctx:
-        url = "api/v4/search_v3?t=general&q=%E7%83%AD%E7%8E%9B%E5%90%89&correction=1&offset=20&limit=20&lc_idx=25&show_all_topics=0&search_hash_id=12d60c255d0be17b9830355a0d04de5b&vertical_info=0%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1"
-        referer = "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89"
-        f = "+".join(["3_2.0", url, referer, '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"'])
-        fmd5 = hashlib.new('md5', f.encode()).hexdigest()
-        # with open('./zhihu_js.js', 'r') as f:
-        #     # print(f.read())
-        #     ctx.eval(f.read())
-        #     encrypt_str = ctx.locals.add('b',fmd5)
-        with open('./zhihu.js', 'r', encoding='utf-8') as f:
-            js = f.read()
-        encrypt_str = execjs.compile(js).call('b', fmd5)
-        headers = {
-            "referer": referer,
-            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
-            "cookie": 'd_c0="AACSLMY7lBGPTo9fXdy2pmiGQ4ZVVUcqzC4=|1594785557";',
-            "x-api-version": "3.0.91",
-            "x-zse-83": "3_2.0",
-            "x-zse-86": "1.0_%s" % encrypt_str,
-        }
-        print(headers)
-        r = requests.get("https://www.zhihu.com" + url, headers=headers)
-        print(r.text)
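The zhihu search request is now signed: the x-zse-86 header is "1.0_" plus the value returned by function b in zhihu.js, called on the md5 of "3_2.0", the request path, the referer, and the d_c0 cookie joined with "+". A minimal sketch of that signing flow under the same assumptions as the hunks above (zhihu.js sits next to the script, execjs has a Node runtime available, and the path/referer/d_c0 values below are placeholders):

    import hashlib
    import execjs

    path = "/api/v4/search_v3?t=general&q=..."                   # url with the https://www.zhihu.com prefix stripped
    referer = "https://www.zhihu.com/search?type=content&q=..."
    d_c0 = '"<d_c0 cookie value>"'                               # taken from the zhihu cookies

    f = "+".join(["3_2.0", path, referer, d_c0])
    fmd5 = hashlib.new('md5', f.encode()).hexdigest()
    with open('./zhihu.js', 'r', encoding='utf-8') as fh:
        exec_js = execjs.compile(fh.read())
    x_zse_86 = "1.0_" + exec_js.call("b", fmd5)                  # sent as the x-zse-86 request header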
crawler_sys/utils/output_results.py

@@ -15,7 +15,7 @@ import pymysql
 import requests
 from elasticsearch.exceptions import TransportError
 from crawler_sys.framework.redis_interact import feed_url_into_redis
-from crawler_sys.framework.redis_interact import rds
+from crawler_sys.framework.redis_interact import redis_path
 from crawler_sys.framework.es_ccr_index_defination import es_framework as es_site_crawler
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register

@@ -296,7 +296,9 @@ def output_result(result_Lst, platform,
                   push_to_redis=False,
                   output_to_test_mysql=False,
                   output_to_mimas_mysql=False,
-                  es_index=index_site_crawler, **kwargs):
+                  es_index=index_site_crawler,
+                  rds_path="on_line",
+                  **kwargs):
     # write data into es crawler-raw index
     if output_to_es_raw:
         bulk_write_into_es(result_Lst, es_index)

@@ -308,14 +310,16 @@ def output_result(result_Lst, platform,
                            index=es_index,
                            construct_id=True,
                            platform=platform,
                            )
     if output_to_test_mysql:
         pass

     # feed url into redis
     if push_to_redis:
+        rds = redis_path(rds_path)
         feed_url_into_redis(
-            result_Lst, expire=kwargs.get("expire"))
+            result_Lst, expire=kwargs.get("expire"), rds=rds)

     # output into file according to passed in parameters
     if output_to_file is True and filepath is not None:

@@ -451,7 +455,7 @@ def load_json_file_into_dict_Lst(filename, path):
     return data_Lst

-def crawl_a_url_and_update_redis(url, platform, urlhash, processID=-1):
+def crawl_a_url_and_update_redis(url, platform, urlhash, rds, processID=-1,):
     # find crawler
     # perform crawling, get the data
     # write es or output to files

@@ -469,7 +473,7 @@ def crawl_batch_task(url_Lst):
                            url_info['urlhash'])

-def scan_redis_to_crawl():
+def scan_redis_to_crawl(rds):
     batch_size = 1000
     cur = 0
     task_batchs = []

@@ -491,13 +495,13 @@ def scan_redis_to_crawl():
                                 'urlhash': urlhash})
             if len(task_batchs) == batch_size:
                 # multi-processing here
-                crawl_batch_task(task_batchs)
+                crawl_batch_task(rds, task_batchs)
                 task_batchs.clear()
         if cur == 0:
             break

-def remove_fetched_url_from_redis(remove_interval=10):
+def remove_fetched_url_from_redis(rds, remove_interval=10):
     time.sleep(remove_interval)
     cur = 0
     delete_counter = 0
     ...
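output_result no longer imports a module-level Redis connection; it now takes rds_path, resolves it through redis_path(), and hands the connection to feed_url_into_redis. A hypothetical call site under those assumptions:

    output_result(result_Lst, platform,
                  push_to_redis=True,
                  rds_path="on_line",        # resolved internally via redis_path(rds_path)
                  expire=3600)               # forwarded to feed_url_into_redis through **kwargs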
requirements.txt

-git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
+# git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
 lxml==4.5.1
 requests==2.23.0
 tqdm==4.46.1
 ...
write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py

@@ -5,7 +5,7 @@
 # @author : litao

-def calculate_douban_id(data_dic):
+def calculate_zhihu_id(data_dic):
     if "answer" in data_dic["url"]:
         return data_dic["_id"].replace("zhihu_", "")
     else:
     ...
write_data_into_es/func_cal_doc_id.py

@@ -17,6 +17,7 @@ from write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id import ca
 from write_data_into_es.calculate_doc_id.func_calculate_douyin_id import calculate_douyin_id
 from write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id import calculate_haokan_id
 from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id
+from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_zhihu_id
 from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id

@@ -32,7 +33,7 @@ def vid_cal_func(platform):
         "haokan": calculate_haokan_id,
         "weibo": calculate_weibo_id,
         "douban": calculate_douban_id,
-        "zhihu": calculate_douban_id,
+        "zhihu": calculate_zhihu_id,
     }

 def general_vid_cal_func(url):
 ...
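With the mapping fixed, the "zhihu" platform dispatches to calculate_zhihu_id instead of calculate_douban_id. A one-line usage sketch, assuming vid_cal_func returns the per-platform callable from the table above:

    doc_id = vid_cal_func("zhihu")({"url": "https://www.zhihu.com/question/1/answer/2",
                                    "_id": "zhihu_123"})    # answer URLs strip the "zhihu_" prefix -> "123"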