Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
5512a01d
Commit
5512a01d
authored
Jan 15, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
获取回答列表
parent
66e74343
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
109 additions
and
19 deletions
+109
-19
zhihu_login.py
zhihu_login.py
+109
-19
No files found.
zhihu_login.py
View file @
5512a01d
...
...
@@ -264,7 +264,7 @@ class ZhihuAccount(object):
return
content
,
cur_image_index
# 知乎搜索词搜索
def
zhihu_query_by_word
(
self
,
query_word
,
zhihu_spider_fd
,
cur_image_index
):
def
zhihu_query_by_word
(
self
,
query_word
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
cur_image_index
,
question_answer_dict
):
for
begin_index
in
range
(
0
,
200
,
10
):
query_by_word_url
=
"https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&"
\
...
...
@@ -285,9 +285,10 @@ class ZhihuAccount(object):
data_type
=
data_item
[
"object"
][
"type"
]
content
=
data_item
[
"object"
][
"content"
]
# content = copy.deepcopy(tmp_content)
platform_id
=
data_item
[
"object"
][
"id"
]
platform_id
=
str
(
data_item
[
"object"
][
"id"
])
user_id
=
random
.
choice
(
majia_user_list
)
question_id
=
""
have_saved_this_answer
=
False
img_url_list
=
re
.
findall
(
'src="(.*?)"'
,
content
)
content
,
cur_image_index
=
self
.
_dispose_content_url
(
content
=
content
,
img_url_list
=
img_url_list
,
cur_image_index
=
cur_image_index
)
...
...
@@ -298,23 +299,47 @@ class ZhihuAccount(object):
if
data_type
==
"article"
:
title
=
data_item
[
"object"
][
"title"
]
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"</em>"
,
""
)
elif
data_type
==
"answer"
:
title
=
data_item
[
"object"
][
"question"
][
"name"
]
question_id
=
data_item
[
"object"
][
"question"
][
"id"
]
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"</em>"
,
""
)
question_id
=
str
(
data_item
[
"object"
][
"question"
][
"id"
])
if
question_id
not
in
question_answer_dict
:
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
self
.
zhihu_answers_list_by_question
(
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
)
question_item_dict
=
{
"user_id"
:
user_id
,
"platform_id"
:
question_id
,
"title"
:
title
,
"content"
:
content
,
"type"
:
data_type
,
"question_id"
:
""
}
zhihu_spider_question_fd
.
write
(
json
.
dumps
(
question_item_dict
)
+
"
\n
"
)
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
else
:
have_saved_this_answer
=
True
else
:
print
(
"type is:
%
s"
%
data_type
)
title
=
""
item_dict
=
{
"user_id"
:
user_id
,
"platform_id"
:
platform_id
,
"title"
:
title
,
"content"
:
content
,
"type"
:
data_type
,
"question_id"
:
question_id
}
if
not
have_saved_this_answer
:
item_dict
=
{
"user_id"
:
user_id
,
"platform_id"
:
platform_id
,
"title"
:
title
,
"content"
:
content
,
"type"
:
data_type
,
"question_id"
:
question_id
}
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
...
...
@@ -323,9 +348,66 @@ class ZhihuAccount(object):
# 知乎问题对应的回答列表
def
zhihu_answers_list_by_question
(
self
,
question_id
):
answers_list_by_question_url
=
"https://www.zhihu.com/api/v4/questions/"
+
str
(
question_id
)
+
"/answers?include=data
%5
B*
%5
D.is_normal
%2
Cadmin_closed_comment
%2
Creward_info
%2
Cis_collapsed
%2
Cannotation_action
%2
Cannotation_detail
%2
Ccollapse_reason
%2
Cis_sticky
%2
Ccollapsed_by
%2
Csuggest_edit
%2
Ccomment_count
%2
Ccan_comment
%2
Ccontent
%2
Ceditable_content
%2
Cvoteup_count
%2
Creshipment_settings
%2
Ccomment_permission
%2
Ccreated_time
%2
Cupdated_time
%2
Creview_info
%2
Crelevant_info
%2
Cquestion
%2
Cexcerpt
%2
Crelationship.is_authorized
%2
Cis_author
%2
Cvoting
%2
Cis_thanked
%2
Cis_nothelp
%2
Cis_labeled
%2
Cis_recognized
%2
Cpaid_info
%2
Cpaid_info_content
%3
Bdata
%5
B*
%5
D.mark_infos
%5
B*
%5
D.url
%3
Bdata
%5
B*
%5
D.author.follower_count
%2
Cbadge
%5
B*
%5
D.topics&offset=0&limit=10&sort_by=default&platform=desktop"
def zhihu_answers_list_by_question(self, question_id, question_answer_dict, zhihu_spider_fd, cur_image_index):
    """Crawl the answers of one Zhihu question and write the unseen ones as JSON lines.

    Pages through the question's answers endpoint 10 items at a time (offsets
    0..190), rewrites the image URLs found in each answer via
    ``self._dispose_content_url`` and appends one JSON object per new answer
    to ``zhihu_spider_fd``.

    Args:
        question_id: Id of the question whose answers are fetched; converted
            with ``str()`` when the URL is built.
        question_answer_dict: Mapping of question id (str) to the set of
            answer ids (str) already recorded. Updated in place and used to
            skip answers that were written before.
        zhihu_spider_fd: Writable text file object receiving the JSON lines.
        cur_image_index: Running image counter handed to, and returned by,
            ``self._dispose_content_url``.

    Returns:
        The image counter after all processed answers. The original code
        returned ``None``; callers that ignore the result are unaffected.
    """
    # The URL prefix does not depend on the page, so build it once. It is
    # derived from the *parameter* before any per-item variable is touched.
    answers_url_prefix = (
        "https://www.zhihu.com/api/v4/questions/" + str(question_id) +
        "/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info"
        "%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason"
        "%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment"
        "%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings"
        "%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info"
        "%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized"
        "%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized"
        "%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url"
        "%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&"
        "sort_by=default&platform=desktop"
    )

    for begin_index in range(0, 200, 10):
        answers_list_by_question_url = answers_url_prefix + "&offset=" + str(begin_index) + "&limit=10"
        res = self.session.get(answers_list_by_question_url, allow_redirects=False)
        print(10 * "*")
        # NOTE(review): assumes the body is always brotli-compressed and not
        # already decoded by the HTTP client - confirm against the session setup.
        raw_content = brotli.decompress(res.content)
        print(type(raw_content))
        raw_content_dict = json.loads(raw_content.decode("utf-8"))

        # A missing or empty "data" list means there is nothing more to read;
        # requesting the remaining offsets would only produce useless traffic.
        if "data" not in raw_content_dict or not raw_content_dict["data"]:
            break

        for data_item in raw_content_dict["data"]:
            try:
                user_id = random.choice(majia_user_list)
                data_type = data_item["type"]
                platform_id = str(data_item["id"])
                data_content = data_item["content"]
                # Deliberately NOT named ``question_id``: reusing the parameter
                # name here made every later page request a wrong/empty id.
                item_question_id = ""
                question_title = ""
                have_saved_this_answer = False

                # Rewrite plain ``src`` images first, then lazy-loaded
                # ``data-original`` ones found in the rewritten content.
                img_url_list = re.findall('src="(.*?)"', data_content)
                data_content, cur_image_index = self._dispose_content_url(
                    content=data_content, img_url_list=img_url_list, cur_image_index=cur_image_index)
                img_url_list = re.findall('data-original="(.*?)"', data_content)
                data_content, cur_image_index = self._dispose_content_url(
                    content=data_content, img_url_list=img_url_list, cur_image_index=cur_image_index)

                if data_type == "answer" and "question" in data_item:
                    item_question_id = str(data_item["question"]["id"])
                    question_title = data_item["question"]["title"]
                    saved_answer_ids = question_answer_dict.setdefault(item_question_id, set())
                    if platform_id in saved_answer_ids:
                        have_saved_this_answer = True
                    else:
                        saved_answer_ids.add(platform_id)

                if not have_saved_this_answer:
                    item_dict = {
                        "user_id": user_id,
                        "platform_id": platform_id,
                        "title": question_title,
                        "content": data_content,
                        "type": data_type,
                        "question_id": item_question_id,
                    }
                    zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
            except Exception:
                # Best effort per item: log and continue with the next answer,
                # but let KeyboardInterrupt / SystemExit propagate.
                print(traceback.format_exc())
                print(str(data_item))

    return cur_image_index
if
__name__
==
'__main__'
:
...
...
@@ -336,10 +418,18 @@ if __name__ == '__main__':
account
.
login
(
captcha_lang
=
'en'
,
load_cookies
=
True
)
#account.test_member_article()
zhihu_spider_data
=
"./zhihu_spider_data.txt"
zhihu_spider_fd
=
open
(
zhihu_spider_data
,
"w"
)
zhihu_spider_data
_file
=
"./zhihu_spider_data.txt"
zhihu_spider_fd
=
open
(
zhihu_spider_data
_file
,
"w"
)
zhihu_spider_question_data_file
=
"./zhihu_spider_question_data.txt"
zhihu_spider_question_fd
=
open
(
zhihu_spider_question_data_file
,
"w"
)
# 问题回答映射词典
question_answer_dict
=
dict
()
cur_image_index
=
0
account
.
zhihu_query_by_word
(
top_query_list
[
0
],
zhihu_spider_fd
,
cur_image_index
)
account
.
zhihu_query_by_word
(
top_query_list
[
0
],
zhihu_spider_fd
,
zhihu_spider_question_fd
,
cur_image_index
,
question_answer_dict
)
zhihu_spider_fd
.
close
()
\ No newline at end of file
zhihu_spider_fd
.
close
()
zhihu_spider_question_fd
.
close
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment