Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
b480d5d9
Commit
b480d5d9
authored
Jan 30, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
应用用户搜索词爬取
parent
1e32f0fc
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
8 deletions
+29
-8
zhihu_login.py
zhihu_login.py
+29
-8
No files found.
zhihu_login.py
View file @
b480d5d9
...
@@ -57,6 +57,8 @@ majia_user_list = [
...
@@ -57,6 +57,8 @@ majia_user_list = [
"32269952"
,
"32269956"
,
"32269962"
,
"32269966"
,
"32269973"
,
"32269978"
,
"32269980"
,
"32269982"
,
"32269987"
,
"32269989"
,
"32270003"
,
"32270004"
,
"32270007"
,
"32270012"
,
"32270015"
,
"32270017"
,
"32270020"
,
"32270024"
,
"32270027"
,
"32270031"
,
"32270041"
,
"32270044"
,
"32270047"
,
"32270050"
,
"32270054"
,
"32270055"
,
"32270057"
,
"32270059"
,
"32270063"
,
"32270066"
,
"32269913"
,
"32269918"
,
"32269920"
,
"32269927"
,
"32269933"
,
"32269939"
,
"32269943"
,
"32269948"
,
"32269957"
,
"32269965"
,
"32269972"
,
"32269979"
,
"32269983"
,
"32269988"
,
"32269995"
,
"32270002"
,
"32270005"
,
"32270011"
,
"32270016"
,
"32270022"
,
"32270029"
,
"32270036"
,
"32270040"
,
"32270051"
,
"32270061"
,
"32270065"
,
"32270071"
,
"32270075"
,
"32270081"
,
"32270085"
,
"32270094"
,
"32270096"
,
"32270110"
,
"32270116"
,
"32270121"
,
"32270141"
,
"32270147"
,
"32270152"
,
"32270156"
,
"32270161"
,
"32270114"
,
"32270119"
,
"32270122"
,
"32270125"
,
"32270129"
,
"32270131"
,
"32270133"
,
"32270134"
,
"32270137"
,
"32270167"
,
"32270068"
,
"32270070"
,
"32270076"
,
"32270078"
,
"32270083"
,
"32270087"
,
"32270093"
,
"32270095"
,
"32270099"
,
"32270105"
,
"32269992"
,
"32270018"
,
"32270023"
,
"32270030"
,
"32270034"
,
"32270043"
,
"32270048"
,
"32270052"
,
"32270056"
,
"32270060"
"32269952"
,
"32269956"
,
"32269962"
,
"32269966"
,
"32269973"
,
"32269978"
,
"32269980"
,
"32269982"
,
"32269987"
,
"32269989"
,
"32270003"
,
"32270004"
,
"32270007"
,
"32270012"
,
"32270015"
,
"32270017"
,
"32270020"
,
"32270024"
,
"32270027"
,
"32270031"
,
"32270041"
,
"32270044"
,
"32270047"
,
"32270050"
,
"32270054"
,
"32270055"
,
"32270057"
,
"32270059"
,
"32270063"
,
"32270066"
,
"32269913"
,
"32269918"
,
"32269920"
,
"32269927"
,
"32269933"
,
"32269939"
,
"32269943"
,
"32269948"
,
"32269957"
,
"32269965"
,
"32269972"
,
"32269979"
,
"32269983"
,
"32269988"
,
"32269995"
,
"32270002"
,
"32270005"
,
"32270011"
,
"32270016"
,
"32270022"
,
"32270029"
,
"32270036"
,
"32270040"
,
"32270051"
,
"32270061"
,
"32270065"
,
"32270071"
,
"32270075"
,
"32270081"
,
"32270085"
,
"32270094"
,
"32270096"
,
"32270110"
,
"32270116"
,
"32270121"
,
"32270141"
,
"32270147"
,
"32270152"
,
"32270156"
,
"32270161"
,
"32270114"
,
"32270119"
,
"32270122"
,
"32270125"
,
"32270129"
,
"32270131"
,
"32270133"
,
"32270134"
,
"32270137"
,
"32270167"
,
"32270068"
,
"32270070"
,
"32270076"
,
"32270078"
,
"32270083"
,
"32270087"
,
"32270093"
,
"32270095"
,
"32270099"
,
"32270105"
,
"32269992"
,
"32270018"
,
"32270023"
,
"32270030"
,
"32270034"
,
"32270043"
,
"32270048"
,
"32270052"
,
"32270056"
,
"32270060"
]
]
g_query_word_set
=
set
()
g_if_get_query_word
=
False
ZHENGXING_HOST
=
"172.16.30.141"
ZHENGXING_HOST
=
"172.16.30.141"
ZHENGXING_USER
=
"work"
ZHENGXING_USER
=
"work"
...
@@ -363,7 +365,7 @@ class ZhihuAccount(object):
...
@@ -363,7 +365,7 @@ class ZhihuAccount(object):
def
zhihu_query_by_word
(
self
,
query_word
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
,
cur_word_index
):
def
zhihu_query_by_word
(
self
,
query_word
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
,
cur_word_index
):
cur_image_index
=
0
cur_image_index
=
0
for
begin_index
in
range
(
0
,
2
00
,
10
):
for
begin_index
in
range
(
0
,
1
00
,
10
):
# query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
# query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
# "show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
# "show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
# "vertical_info=0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1" + "&q=" + str(query_word) + "&offset=" + str(begin_index) + "&limit=10"
# "vertical_info=0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1" + "&q=" + str(query_word) + "&offset=" + str(begin_index) + "&limit=10"
...
@@ -468,7 +470,7 @@ class ZhihuAccount(object):
...
@@ -468,7 +470,7 @@ class ZhihuAccount(object):
# 知乎问题对应的回答列表
# 知乎问题对应的回答列表
def
zhihu_answers_list_by_question
(
self
,
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
):
def
zhihu_answers_list_by_question
(
self
,
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
):
for
begin_index
in
range
(
0
,
2
00
,
10
):
for
begin_index
in
range
(
0
,
1
00
,
10
):
# answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + \
# answers_list_by_question_url = "https://www.zhihu.com/api/v4/questions/" + str(question_id) + \
# "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&" \
# "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&" \
# "sort_by=default&platform=desktop" + "&offset=" + str(begin_index) + "&limit=10"
# "sort_by=default&platform=desktop" + "&offset=" + str(begin_index) + "&limit=10"
...
@@ -551,16 +553,35 @@ def get_query_word():
...
@@ -551,16 +553,35 @@ def get_query_word():
global
g_cur_word_index
global
g_cur_word_index
query
_word
=
""
ret
_word
=
""
g_cur_word_index
+=
1
g_cur_word_index
+=
1
try
:
try
:
query_word
=
top_query_list
.
pop
()
# query_word = top_query_list.pop()
global
g_query_word_set
global
g_if_get_query_word
if
len
(
g_query_word_set
)
==
0
and
not
g_if_get_query_word
:
g_if_get_query_word
=
True
offi_query_word_fd
=
open
(
"/data/log/spider/test_service/offi_query_word_from_20190101_20200115.txt"
,
"r"
)
for
line
in
offi_query_word_fd
:
line
=
line
.
strip
()
line
=
line
.
strip
(
"
\r
"
)
line
=
line
.
strip
(
"
\t
"
)
line
=
line
.
strip
(
" "
)
query_word
,
query_counts
=
line
.
split
(
"
\t
"
)
query_word
=
query_word
.
strip
()
g_query_word_set
.
add
(
query_word
)
offi_query_word_fd
.
close
()
ret_word
=
g_query_word_set
.
pop
()
except
:
except
:
print
(
traceback
.
format_exc
())
print
(
traceback
.
format_exc
())
mutex_for_get_query_word
.
release
()
mutex_for_get_query_word
.
release
()
return
query
_word
,
g_cur_word_index
return
ret
_word
,
g_cur_word_index
def
concurrence_dispose_query_word
(
account_obj
):
def
concurrence_dispose_query_word
(
account_obj
):
...
@@ -571,10 +592,10 @@ def concurrence_dispose_query_word(account_obj):
...
@@ -571,10 +592,10 @@ def concurrence_dispose_query_word(account_obj):
query_word
,
g_cur_word_index
=
get_query_word
()
query_word
,
g_cur_word_index
=
get_query_word
()
print
(
"query_word:
%
s"
%
query_word
)
print
(
"query_word:
%
s"
%
query_word
)
zhihu_spider_data_file
=
"./zhihu_spider_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_data_file
=
"./
data/
zhihu_spider_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_fd
=
open
(
zhihu_spider_data_file
,
"w"
)
zhihu_spider_fd
=
open
(
zhihu_spider_data_file
,
"w"
)
zhihu_spider_question_data_file
=
"./zhihu_spider_question_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_question_data_file
=
"./
data/
zhihu_spider_question_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_question_fd
=
open
(
zhihu_spider_question_data_file
,
"w"
)
zhihu_spider_question_fd
=
open
(
zhihu_spider_question_data_file
,
"w"
)
# 问题回答映射词典
# 问题回答映射词典
...
@@ -612,7 +633,7 @@ if __name__ == '__main__':
...
@@ -612,7 +633,7 @@ if __name__ == '__main__':
gevent_spawn_obj_list
=
list
()
gevent_spawn_obj_list
=
list
()
for
cur_index
in
range
(
0
,
50
,
1
):
for
cur_index
in
range
(
0
,
50
0
,
1
):
g_obj
=
gevent
.
spawn
(
concurrence_dispose_query_word
,
account
)
g_obj
=
gevent
.
spawn
(
concurrence_dispose_query_word
,
account
)
gevent_spawn_obj_list
.
append
(
g_obj
)
gevent_spawn_obj_list
.
append
(
g_obj
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment