Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
462cd16e
Commit
462cd16e
authored
Jan 15, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加图片存储目录
parent
36b96254
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
107 additions
and
17 deletions
+107
-17
zhihu_login.py
zhihu_login.py
+107
-17
No files found.
zhihu_login.py
View file @
462cd16e
...
...
@@ -36,9 +36,18 @@ import pymysql
import
jieba
import
jieba.analyse
import
os
from
gevent
import
monkey
;
monkey
.
patch_socket
()
import
gevent
from
threading
import
Thread
,
Lock
# Serialises the hand-out of query words in get_query_word().
mutex_for_get_query_word = Lock()
# Guards updates to the shared question -> answer-id mapping.
mutex_for_zhihu_save_question_info = Lock()
# Guards line-by-line writes to the shared output files.
mutex_for_zhihu_save_file = Lock()
# NOTE(review): only monkey.patch_socket() is applied above, so these appear
# to be plain (non-cooperative) threading locks. Presumably safe as long as
# no network I/O happens while one is held -- confirm before widening any
# critical section.

# Running count of query words handed out so far (updated by get_query_word()).
g_cur_word_index = 0

# Search terms to crawl. Each term appears exactly once: a duplicated entry
# ("鼻翼缩小" was listed twice) makes the spider repeat the same search.
top_query_list = [
    "瘦脸针", "双眼皮", "水光针", "手术面部提升", "鼻翼缩小",
    "玻尿酸", "吸脂", "线雕", "鼻综合", "光子嫩肤",
    "瘦腿针", "美白针", "热玛吉", "隆鼻", "超声刀",
    "脱毛", "祛斑", "果酸焕肤", "面部吸脂", "皮秒",
    "瘦肩针", "自体脂肪填充面部", "牙齿矫正", "微针", "热拉提",
    "瘦脸", "下巴", "植发", "溶脂针", "点阵激光",
    "双眼皮修复", "小气泡", "鼻基底", "祛眼袋", "隆胸",
    "祛痘", "开眼角", "除皱", "牙齿美白", "埋线双眼皮",
    "颧骨", "下颌角", "纹眉", "激光脱毛", "玻尿酸丰下巴",
    "法令纹", "玻尿酸隆鼻", "洗牙", "吸脂瘦大腿", "溶脂",
    "保妥适", "黄金微针", "自体脂肪填充", "美白", "黑眼圈",
    "白瓷娃娃", "祛疤", "切开双眼皮", "泪沟", "光纤溶脂",
    "磨骨", "嗨体", "肉毒素", "丰胸(隆胸)", "微针祛痘坑",
    "激光祛斑", "假体下巴", "植发际线", "面部提升", "肋骨鼻",
    "蜂巢皮秒", "祛痘祛痘印", "腰腹吸脂", "瘦腿", "面部填充",
    "厚唇改薄术", "下眼睑下至", "溶解酶", "私密", "点痣",
    "酒窝", "女性私密紧致", "艾莉薇", "伊婉V", "无针水光",
    "自体脂肪", "人中缩短", "m22", "激光点痣", "丰唇",
    "脸型", "埋线隆鼻", "埋线", "收缩毛孔", "黑脸娃娃",
    "伊婉C", "开外眼角", "童颜针", "妊娠纹",
]
...
...
@@ -277,6 +286,32 @@ class ZhihuAccount(object):
print
(
traceback
.
format_exc
())
return
[]
def mutex_for_zhihu_save_question_info(self, question_answer_dict, question_id, platform_id):
    """Record ``platform_id`` under ``question_id`` in the shared mapping.

    Args:
        question_answer_dict: Mapping of question id to a set of ids; it is
            mutated in place.
        question_id: Key to update; its set is created on first use.
        platform_id: Id added to that question's set.

    Failures are reported with a printed traceback instead of being raised,
    so one bad item does not abort the calling crawl loop.
    """
    # This function is invoked as ``self.mutex_for_zhihu_save_question_info``,
    # i.e. as a method; inside a method body the bare name below resolves to
    # the module-level Lock that shares this name, not to the method itself.
    # ``with`` guarantees the lock is released even if the handler fails.
    with mutex_for_zhihu_save_question_info:
        try:
            question_answer_dict.setdefault(question_id, set()).add(platform_id)
        except Exception:
            # Not a bare ``except:`` so KeyboardInterrupt / greenlet kills
            # still propagate.
            print(traceback.format_exc())
def mutex_for_zhihu_save_file_info(self, file_fd, item_dict):
    """Append ``item_dict`` to ``file_fd`` as a single JSON line.

    Args:
        file_fd: Writable text file object shared between workers.
        item_dict: JSON-serialisable record to write.

    Failures (serialisation or write errors) are reported with a printed
    traceback instead of being raised.
    """
    try:
        # Serialise before taking the lock so it is held only for the write.
        line = json.dumps(item_dict) + "\n"
        # ``with`` releases the lock even if the write raises.
        with mutex_for_zhihu_save_file:
            file_fd.write(line)
    except Exception:
        # Not a bare ``except:`` so KeyboardInterrupt / greenlet kills
        # still propagate.
        print(traceback.format_exc())
# 知乎个人文章列表
def
test_member_article
(
self
):
member_article_url
=
"https://www.zhihu.com/api/v4/members/li-pei-rong-96/articles?include=data
%5
B*
%5
D.comment_count
%2
Csuggest_edit
%2
Cis_normal
%2
Cthumbnail_extra_info
%2
Cthumbnail
%2
Ccan_comment
%2
Ccomment_permission
%2
Cadmin_closed_comment
%2
Ccontent
%2
Cvoteup_count
%2
Ccreated
%2
Cupdated
%2
Cupvoted_followees
%2
Cvoting
%2
Creview_info
%2
Cis_labeled
%2
Clabel_info
%3
Bdata
%5
B*
%5
D.author.badge
%5
B
%3
F(type
%3
Dbest_answerer)
%5
D.topics&offset=40&limit=20&sort_by=created"
...
...
@@ -316,6 +351,9 @@ class ZhihuAccount(object):
qiniu_url
=
upload_file
(
local_cropped_img_url_path
)
content
=
content
.
replace
(
ori_img_url
,
qiniu_url
)
os
.
remove
(
local_img_url_path
)
os
.
remove
(
local_cropped_img_url_path
)
return
content
,
cur_image_index
except
:
print
(
traceback
.
format_exc
())
...
...
@@ -367,8 +405,9 @@ class ZhihuAccount(object):
question_id
=
str
(
data_item
[
"object"
][
"question"
][
"id"
])
if
question_id
not
in
question_answer_dict
:
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
# question_answer_dict[question_id] = set()
# question_answer_dict[question_id].add(platform_id)
cur_image_index
=
self
.
zhihu_answers_list_by_question
(
question_id
,
question_answer_dict
,
zhihu_spider_fd
,
cur_image_index
,
cur_word_index
)
question_item_dict
=
{
...
...
@@ -380,9 +419,11 @@ class ZhihuAccount(object):
"question_id"
:
""
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
zhihu_spider_question_fd
.
write
(
json
.
dumps
(
question_item_dict
)
+
"
\n
"
)
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_question_fd
,
item_dict
=
question_item_dict
)
# zhihu_spider_question_fd.write(json.dumps(question_item_dict) + "\n")
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
# question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
else
:
have_saved_this_answer
=
True
else
:
...
...
@@ -399,13 +440,13 @@ class ZhihuAccount(object):
"question_id"
:
question_id
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_fd
,
item_dict
=
item_dict
)
# zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
time
.
sleep
(
2
)
#
time.sleep(2)
# 知乎问题对应的回答列表
...
...
@@ -447,10 +488,14 @@ class ZhihuAccount(object):
question_title
=
data_item
[
"question"
][
"title"
]
if
question_id
not
in
question_answer_dict
:
question_answer_dict
[
question_id
]
=
set
()
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
# question_answer_dict[question_id] = set()
# question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
question_answer_dict
[
question_id
]
.
add
(
platform_id
)
# question_answer_dict[question_id].add(platform_id)
self
.
mutex_for_zhihu_save_question_info
(
question_answer_dict
=
question_answer_dict
,
question_id
=
question_id
,
platform_id
=
platform_id
)
else
:
have_saved_this_answer
=
True
...
...
@@ -465,8 +510,8 @@ class ZhihuAccount(object):
"tags"
:
self
.
get_tfidf_words_from_content
(
data_content
)
}
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
#
zhihu_spider_fd.write(json.dumps(item_dict) + "\n")
self
.
mutex_for_zhihu_save_file_info
(
file_fd
=
zhihu_spider_fd
,
item_dict
=
item_dict
)
except
:
print
(
traceback
.
format_exc
())
print
(
str
(
data_item
))
...
...
@@ -474,6 +519,42 @@ class ZhihuAccount(object):
return
cur_image_index
def get_query_word():
    """Hand out the next query word to a worker.

    Returns:
        A ``(query_word, index)`` tuple. ``query_word`` is taken from the end
        of ``top_query_list`` and ``index`` is the running count of words
        handed out so far. When the list is exhausted, ``query_word`` is the
        empty string and the counter is left unchanged.
    """
    global g_cur_word_index
    # ``with`` releases the lock on every exit path.
    with mutex_for_get_query_word:
        if not top_query_list:
            # Exhaustion is the normal way for workers to stop, so signal it
            # quietly instead of raising/printing an IndexError traceback.
            return "", g_cur_word_index
        g_cur_word_index += 1
        return top_query_list.pop(), g_cur_word_index
def concurrence_dispose_query_word(account_obj, zhihu_spider_fd, zhihu_spider_question_fd, question_answer_dict):
    """Worker loop: crawl query words until none are left.

    Args:
        account_obj: Object exposing ``zhihu_query_by_word``.
        zhihu_spider_fd: Output file object passed through to the crawl call.
        zhihu_spider_question_fd: Second output file object passed through.
        question_answer_dict: Shared mapping passed through to the crawl call.

    Repeatedly fetches a word via ``get_query_word()`` and stops at the first
    empty result. A failure while crawling one word is printed and the worker
    moves on to the next word instead of terminating.
    """
    while True:
        # Local name deliberately differs from the global counter so the
        # module-level ``g_cur_word_index`` is not shadowed.
        query_word, word_index = get_query_word()
        if not query_word:
            break
        print("query_word:%s" % query_word)
        try:
            account_obj.zhihu_query_by_word(
                query_word=query_word,
                zhihu_spider_fd=zhihu_spider_fd,
                zhihu_spider_question_fd=zhihu_spider_question_fd,
                question_answer_dict=question_answer_dict,
                cur_word_index=word_index,
            )
        except Exception:
            # Not a bare ``except:`` so KeyboardInterrupt / greenlet kills
            # still propagate and can stop the worker.
            print(traceback.format_exc())
if
__name__
==
'__main__'
:
os
.
environ
.
setdefault
(
"DJANGO_SETTINGS_MODULE"
,
"settings.settings"
)
...
...
@@ -492,13 +573,21 @@ if __name__ == '__main__':
# 问题回答映射词典
question_answer_dict
=
dict
()
cur_word_index
=
0
for
query_word
in
top_query_list
:
cur_word_index
+=
1
print
(
"query_word:
%
s"
%
query_word
,
flush
=
True
)
account
.
zhihu_query_by_word
(
query_word
=
query_word
,
zhihu_spider_fd
=
zhihu_spider_fd
,
zhihu_spider_question_fd
=
zhihu_spider_question_fd
,
question_answer_dict
=
question_answer_dict
,
cur_word_index
=
cur_word_index
)
# cur_word_index = 0
# for query_word in top_query_list:
# cur_word_index += 1
# print("query_word:%s" % query_word,flush=True)
# account.zhihu_query_by_word(query_word=query_word,zhihu_spider_fd=zhihu_spider_fd,zhihu_spider_question_fd=zhihu_spider_question_fd,question_answer_dict=question_answer_dict,cur_word_index=cur_word_index)
gevent_spawn_obj_list
=
list
()
for
cur_index
in
range
(
0
,
50
,
1
):
g_obj
=
gevent
.
spawn
(
concurrence_dispose_query_word
,
account
,
zhihu_spider_fd
,
zhihu_spider_question_fd
,
question_answer_dict
)
gevent_spawn_obj_list
.
append
(
g_obj
)
for
g_obj
in
gevent_spawn_obj_list
:
g_obj
.
join
()
zhihu_spider_fd
.
close
()
zhihu_spider_question_fd
.
close
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment