Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
b7b19ad0
Commit
b7b19ad0
authored
Jan 30, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
应用用户搜索词爬取
parent
b480d5d9
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
104 additions
and
95 deletions
+104
-95
zhihu_login.py
zhihu_login.py
+104
-95
No files found.
zhihu_login.py
View file @
b7b19ad0
...
...
@@ -363,108 +363,117 @@ class ZhihuAccount(object):
# 知乎搜索词搜索
def zhihu_query_by_word(self, query_word, zhihu_spider_fd, zhihu_spider_question_fd,
                        question_answer_dict, cur_word_index):
    """Search Zhihu for ``query_word`` and dump the hits as JSON lines.

    The ``search_v3`` API is requested at offsets 0, 10, ..., 90 with a page
    size of 10. For every item of type ``search_result``:

    * image URLs found in the content (``src="..."`` and then
      ``data-original="..."``) are handed to ``self._dispose_content_url``,
      which returns the rewritten content and the updated image counter;
    * if the item is an answer whose question is not yet a key of
      ``question_answer_dict``, the question is registered,
      ``self.zhihu_answers_list_by_question`` is called for it and one JSON
      line describing the question is written to ``zhihu_spider_question_fd``;
    * unless the answer id was already recorded for its question, one JSON
      line describing the item is written to ``zhihu_spider_fd``.

    Args:
        query_word: Search term, sent as the ``q`` query parameter.
        zhihu_spider_fd: Object with a ``write(str)`` method receiving one
            JSON line per article / answer / other result.
        zhihu_spider_question_fd: Object with a ``write(str)`` method
            receiving one JSON line per newly seen question.
        question_answer_dict: Mapping of question id (str) to the set of
            answer ids (str) already seen. Mutated in place.
        cur_word_index: Passed through unchanged to
            ``self._dispose_content_url`` and
            ``self.zhihu_answers_list_by_question``.

    Returns:
        None. Failures are printed with their traceback instead of being
        raised: a failure inside one result item skips only that item, any
        other failure aborts the remaining pages.
    """
    try:
        cur_image_index = 0
        for begin_index in range(0, 100, 10):
            query_params_dict = {
                "q": query_word,
                "offset": begin_index,
                "limit": 10,
                "lc_idx": 22,
                "show_all_topics": 0,
                "search_hash_id": "dc4a11848e2540981cf28634ff3609c0",
                "vertical_info": "0,0,0,0,0,0,0,0,0,1",
                "correction": 1,
                "t": "general",
            }
            # urlencode takes care of escaping the query word and the commas.
            query_by_word_url = ("https://www.zhihu.com/api/v4/search_v3?"
                                 + urllib.parse.urlencode(query_params_dict))

            res = self.session.get(query_by_word_url, allow_redirects=False)
            print(10 * "*")
            print(query_by_word_url)
            print(res)

            # NOTE(review): assumes the body is brotli-compressed and was not
            # already decoded by the HTTP library -- confirm against the
            # session's Accept-Encoding header and installed urllib3.
            raw_content = brotli.decompress(res.content)
            print(type(raw_content))
            raw_content_dict = json.loads(raw_content.decode("utf-8"))

            if "data" not in raw_content_dict:
                continue

            for data_item in raw_content_dict["data"]:
                if data_item["type"] != "search_result":
                    continue

                try:
                    result_object = data_item["object"]
                    data_type = result_object["type"]
                    content = result_object["content"] if "content" in result_object else ""
                    platform_id = str(result_object["id"])
                    # majia_user_list is defined elsewhere in this module;
                    # each stored item is attributed to a random entry.
                    user_id = random.choice(majia_user_list)
                    question_id = ""
                    have_saved_this_answer = False

                    # Two passes: the second pattern is matched against the
                    # content already rewritten by the first pass.
                    for img_pattern in ('src="(.*?)"', 'data-original="(.*?)"'):
                        img_url_list = re.findall(img_pattern, content)
                        content, cur_image_index = self._dispose_content_url(
                            content=content,
                            img_url_list=img_url_list,
                            cur_image_index=cur_image_index,
                            cur_word_index=cur_word_index)

                    if data_type == "article":
                        # Search hits wrap the matched words in <em> tags.
                        title = result_object["title"]
                        title = title.replace("<em>", "").replace("</em>", "")
                    elif data_type == "answer":
                        title = result_object["question"]["name"]
                        title = title.replace("<em>", "").replace("</em>", "")
                        question_id = str(result_object["question"]["id"])

                        if question_id not in question_answer_dict:
                            # Register this answer before handing the question
                            # to the helper, which receives the same dict.
                            question_answer_dict[question_id] = set()
                            question_answer_dict[question_id].add(platform_id)

                            cur_image_index = self.zhihu_answers_list_by_question(
                                question_id, question_answer_dict, zhihu_spider_fd,
                                cur_image_index, cur_word_index)

                            # The question record reuses the answer's title and
                            # content but is keyed by the question id.
                            question_item_dict = {
                                "user_id": user_id,
                                "platform_id": question_id,
                                "title": title,
                                "content": content,
                                "type": data_type,
                                "question_id": "",
                                "tags": self.get_tfidf_words_from_content(content),
                            }
                            zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
                        elif platform_id not in question_answer_dict[question_id]:
                            question_answer_dict[question_id].add(platform_id)
                        else:
                            # Known question and known answer: do not store twice.
                            have_saved_this_answer = True
                    else:
                        print("type is:%s" % data_type)
                        title = ""

                    if not have_saved_this_answer:
                        item_dict = {
                            "user_id": user_id,
                            "platform_id": platform_id,
                            "title": title,
                            "content": content,
                            "type": data_type,
                            "question_id": question_id,
                            "tags": self.get_tfidf_words_from_content(content),
                        }
                        zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
                except Exception:
                    # Best effort per item: report it and move on to the next
                    # one. KeyboardInterrupt / SystemExit are not swallowed.
                    print(traceback.format_exc())
                    print(str(data_item))
    except Exception:
        # Request, decompression or JSON failures end the crawl for this word.
        print(traceback.format_exc())
# 知乎问题对应的回答列表
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment