Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
3a5d56a3
Commit
3a5d56a3
authored
Jan 15, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
加入并发
parent
1212dc7d
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
33 deletions
+34
-33
zhihu_login.py
zhihu_login.py
+34
-33
No files found.
zhihu_login.py
View file @
3a5d56a3
...
...
@@ -405,9 +405,9 @@ class ZhihuAccount(object):
question_id
=
str
(
data_item
[
"object"
][
"question"
][
"id"
])
if
question_id
not
in
question_answer_dict
:
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
#
question_answer_dict[question_id] = set()
#
question_answer_dict[question_id].add(platform_id)
#
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
cur_image_index
=
self
.
zhihu_answers_list_by_question
(
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
)
question_item_dict
=
{
...
...
@@ -419,11 +419,11 @@ class ZhihuAccount(object):
"question_id"
:
""
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_question_fd
,
item_dict
=
question_item_dict
)
#
zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
#
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_question_fd,item_dict=question_item_dict)
zhihu_spider_question_fd
.
write
(
json
.
dumps
(
question_item_dict
)
+
"
\n
"
)
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
#
question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
#
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,question_id=question_id,platform_id=platform_id)
else
:
have_saved_this_answer
=
True
else
:
...
...
@@ -440,8 +440,8 @@ class ZhihuAccount(object):
"question_id"
:
question_id
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_fd
,
item_dict
=
item_dict
)
#
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
#
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
...
...
@@ -488,14 +488,14 @@ class ZhihuAccount(object):
question_title
=
data_item
[
"question"
][
"title"
]
if
question_id
not
in
question_answer_dict
:
#
question_answer_dict[question_id] = set()
#
question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
#
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
#
question_id=question_id, platform_id=platform_id)
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
#
question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
#
self.mutex_for_zhihu_save_question_info(question_answer_dict=question_answer_dict,
#
question_id=question_id, platform_id=platform_id)
else
:
have_saved_this_answer
=
True
...
...
@@ -510,8 +510,8 @@ class ZhihuAccount(object):
"tags"
:
self
.
get_tfidf_words_from_content
(
data_content
)
}
#
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_fd
,
item_dict
=
item_dict
)
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
#
self.mutex_for_zhihu_save_file_info(file_fd=zhihu_spider_fd, item_dict=item_dict)
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
...
...
@@ -537,7 +537,7 @@ def get_query_word():
return
query_word
,
g_cur_word_index
def
concurrence_dispose_query_word
(
account_obj
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
):
def
concurrence_dispose_query_word
(
account_obj
):
try
:
is_run
=
True
...
...
@@ -545,12 +545,26 @@ def concurrence_dispose_query_word(account_obj,zhihu_spider_fd,zhihu_spider_ques
query_word
,
g_cur_word_index
=
get_query_word
()
print
(
"query_word:
%
s"
%
query_word
)
zhihu_spider_data_file
=
"./zhihu_spider_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_fd
=
open
(
zhihu_spider_data_file
,
"w"
)
zhihu_spider_question_data_file
=
"./zhihu_spider_question_data_for_query_word_"
+
str
(
query_word
)
+
".txt"
zhihu_spider_question_fd
=
open
(
zhihu_spider_question_data_file
,
"w"
)
# 问题回答映射词典
question_answer_dict
=
dict
()
if
query_word
and
len
(
query_word
)
>
0
:
account_obj
.
zhihu_query_by_word
(
query_word
=
query_word
,
zhihu_spider_fd
=
zhihu_spider_fd
,
zhihu_spider_question_fd
=
zhihu_spider_question_fd
,
question_answer_dict
=
question_answer_dict
,
cur_word_index
=
g_cur_word_index
)
else
:
is_run
=
False
zhihu_spider_fd
.
close
()
zhihu_spider_question_fd
.
close
()
except
:
print
(
traceback
.
format_exc
())
...
...
@@ -564,15 +578,6 @@ if __name__ == '__main__':
account
.
add_jieba_tag_word
()
#account.test_member_article()
zhihu_spider_data_file
=
"./zhihu_spider_data.txt"
zhihu_spider_fd
=
open
(
zhihu_spider_data_file
,
"w"
)
zhihu_spider_question_data_file
=
"./zhihu_spider_question_data.txt"
zhihu_spider_question_fd
=
open
(
zhihu_spider_question_data_file
,
"w"
)
# 问题回答映射词典
question_answer_dict
=
dict
()
# cur_word_index = 0
# for query_word in top_query_list:
# cur_word_index += 1
...
...
@@ -582,12 +587,9 @@ if __name__ == '__main__':
gevent_spawn_obj_list
=
list
()
for
cur_index
in
range
(
0
,
50
,
1
):
g_obj
=
gevent
.
spawn
(
concurrence_dispose_query_word
,
account
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
)
g_obj
=
gevent
.
spawn
(
concurrence_dispose_query_word
,
account
)
gevent_spawn_obj_list
.
append
(
g_obj
)
for
g_obj
in
gevent_spawn_obj_list
:
g_obj
.
join
()
zhihu_spider_fd
.
close
()
zhihu_spider_question_fd
.
close
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment