Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
b7b19ad0
Commit
b7b19ad0
authored
Jan 30, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
应用用户搜索词爬取
parent
b480d5d9
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
22 additions
and
13 deletions
+22
-13
zhihu_login.py
zhihu_login.py
+22
-13
No files found.
zhihu_login.py
View file @
b7b19ad0
...
...
@@ -363,9 +363,9 @@ class ZhihuAccount(object):
# 知乎搜索词搜索
def
zhihu_query_by_word
(
self
,
query_word
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
,
cur_word_index
):
try
:
cur_image_index
=
0
for
begin_index
in
range
(
0
,
100
,
10
):
for
begin_index
in
range
(
0
,
100
,
10
):
# query_by_word_url = "https://www.zhihu.com/api/v4/search_v3?t=general&correction=1&lc_idx=62&" \
# "show_all_topics=0&search_hash_id=1dbb1e923a17f147356177932d1236e1&" \
# "vertical_info=0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1" + "&q=" + str(query_word) + "&offset=" + str(begin_index) + "&limit=10"
...
...
@@ -384,14 +384,14 @@ class ZhihuAccount(object):
}
query_by_word_url
+=
urllib
.
parse
.
urlencode
(
query_params_dict
)
res
=
self
.
session
.
get
(
query_by_word_url
,
allow_redirects
=
False
)
print
(
10
*
"*"
)
res
=
self
.
session
.
get
(
query_by_word_url
,
allow_redirects
=
False
)
print
(
10
*
"*"
)
print
(
query_by_word_url
)
print
(
res
)
raw_content
=
brotli
.
decompress
(
res
.
content
)
print
(
type
(
raw_content
))
raw_content_dict
=
json
.
loads
(
str
(
raw_content
,
encoding
=
"utf-8"
))
raw_content_dict
=
json
.
loads
(
str
(
raw_content
,
encoding
=
"utf-8"
))
if
"data"
in
raw_content_dict
:
for
data_item
in
raw_content_dict
[
"data"
]:
...
...
@@ -406,19 +406,24 @@ class ZhihuAccount(object):
have_saved_this_answer
=
False
img_url_list
=
re
.
findall
(
'src="(.*?)"'
,
content
)
content
,
cur_image_index
=
self
.
_dispose_content_url
(
content
=
content
,
img_url_list
=
img_url_list
,
cur_image_index
=
cur_image_index
,
cur_word_index
=
cur_word_index
)
content
,
cur_image_index
=
self
.
_dispose_content_url
(
content
=
content
,
img_url_list
=
img_url_list
,
cur_image_index
=
cur_image_index
,
cur_word_index
=
cur_word_index
)
img_url_list
=
re
.
findall
(
'data-original="(.*?)"'
,
content
)
content
,
cur_image_index
=
self
.
_dispose_content_url
(
content
=
content
,
img_url_list
=
img_url_list
,
cur_image_index
=
cur_image_index
,
cur_word_index
=
cur_word_index
)
content
,
cur_image_index
=
self
.
_dispose_content_url
(
content
=
content
,
img_url_list
=
img_url_list
,
cur_image_index
=
cur_image_index
,
cur_word_index
=
cur_word_index
)
if
data_type
==
"article"
:
title
=
data_item
[
"object"
][
"title"
]
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"</em>"
,
""
)
elif
data_type
==
"answer"
:
title
=
data_item
[
"object"
][
"question"
][
"name"
]
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"<em>"
,
""
)
title
=
title
.
replace
(
"</em>"
,
""
)
question_id
=
str
(
data_item
[
"object"
][
"question"
][
"id"
])
...
...
@@ -426,7 +431,11 @@ class ZhihuAccount(object):
# self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
cur_image_index
=
self
.
zhihu_answers_list_by_question
(
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
)
cur_image_index
=
self
.
zhihu_answers_list_by_question
(
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
)
question_item_dict
=
{
"user_id"
:
user_id
,
...
...
@@ -463,8 +472,8 @@ class ZhihuAccount(object):
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
# time.sleep(2)
except
:
print
(
traceback
.
format_exc
()
)
# 知乎问题对应的回答列表
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment