Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_spider
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_spider
Commits
7a03edf5
Commit
7a03edf5
authored
Jan 15, 2020
by
段英荣
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加切词内容
parent
0096ff6e
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
3 deletions
+58
-3
zhihu_login.py
zhihu_login.py
+58
-3
No files found.
zhihu_login.py
View file @
7a03edf5
...
...
@@ -32,6 +32,9 @@ import os
import
cv2
import
copy
import
traceback
import
pymysql
import
jieba
import
jieba.analyse
...
...
@@ -44,6 +47,13 @@ majia_user_list = [
"32269952"
,
"32269956"
,
"32269962"
,
"32269966"
,
"32269973"
,
"32269978"
,
"32269980"
,
"32269982"
,
"32269987"
,
"32269989"
,
"32270003"
,
"32270004"
,
"32270007"
,
"32270012"
,
"32270015"
,
"32270017"
,
"32270020"
,
"32270024"
,
"32270027"
,
"32270031"
,
"32270041"
,
"32270044"
,
"32270047"
,
"32270050"
,
"32270054"
,
"32270055"
,
"32270057"
,
"32270059"
,
"32270063"
,
"32270066"
,
"32269913"
,
"32269918"
,
"32269920"
,
"32269927"
,
"32269933"
,
"32269939"
,
"32269943"
,
"32269948"
,
"32269957"
,
"32269965"
,
"32269972"
,
"32269979"
,
"32269983"
,
"32269988"
,
"32269995"
,
"32270002"
,
"32270005"
,
"32270011"
,
"32270016"
,
"32270022"
,
"32270029"
,
"32270036"
,
"32270040"
,
"32270051"
,
"32270061"
,
"32270065"
,
"32270071"
,
"32270075"
,
"32270081"
,
"32270085"
,
"32270094"
,
"32270096"
,
"32270110"
,
"32270116"
,
"32270121"
,
"32270141"
,
"32270147"
,
"32270152"
,
"32270156"
,
"32270161"
,
"32270114"
,
"32270119"
,
"32270122"
,
"32270125"
,
"32270129"
,
"32270131"
,
"32270133"
,
"32270134"
,
"32270137"
,
"32270167"
,
"32270068"
,
"32270070"
,
"32270076"
,
"32270078"
,
"32270083"
,
"32270087"
,
"32270093"
,
"32270095"
,
"32270099"
,
"32270105"
,
"32269992"
,
"32270018"
,
"32270023"
,
"32270030"
,
"32270034"
,
"32270043"
,
"32270048"
,
"32270052"
,
"32270056"
,
"32270060"
]
# Connection settings for the "zhengxing" MySQL database.
#
# Each value may be overridden through an environment variable of the same
# name; the literals are only fallbacks that keep the previous hard-coded
# behaviour when nothing is set.  (`os` is already imported at the top of
# this module.)
#
# NOTE(review): a database password committed to the repository should be
# rotated and then supplied through ZHENGXING_PWD only; remove the literal
# fallback once deployments export the variable.
ZHENGXING_HOST = os.environ.get("ZHENGXING_HOST", "172.16.30.141")
ZHENGXING_USER = os.environ.get("ZHENGXING_USER", "work")
ZHENGXING_PWD = os.environ.get("ZHENGXING_PWD", "BJQaT9VzDcuPBqkd")
ZHENGXING_DATABASE = os.environ.get("ZHENGXING_DATABASE", "zhengxing")
class
ZhihuAccount
(
object
):
def
__init__
(
self
,
username
:
str
=
None
,
password
:
str
=
None
):
...
...
@@ -70,6 +80,15 @@ class ZhihuAccount(object):
}
self
.
session
.
cookies
=
cookiejar
.
LWPCookieJar
(
filename
=
'./cookies.txt'
)
self
.
zhengxing_conn
=
pymysql
.
connect
(
host
=
ZHENGXING_HOST
,
user
=
ZHENGXING_USER
,
password
=
ZHENGXING_PWD
,
database
=
ZHENGXING_DATABASE
,
charset
=
"utf8"
)
self
.
zhengxing_cursor
=
self
.
zhengxing_conn
.
cursor
()
self
.
tag_words_set
=
set
()
def
login
(
self
,
captcha_lang
:
str
=
'en'
,
load_cookies
:
bool
=
True
):
"""
模拟登录知乎
...
...
@@ -222,6 +241,38 @@ class ZhihuAccount(object):
js
=
execjs
.
compile
(
f
.
read
())
return
js
.
call
(
'Q'
,
urlencode
(
form_data
))
def add_jieba_tag_word(self):
    """Register every online tag name with jieba and remember it locally.

    Queries ``api_tag`` through ``self.zhengxing_cursor`` for rows where
    ``is_online`` is true, adds each tag name to jieba's dictionary via
    ``jieba.add_word`` and records it in ``self.tag_words_set``.

    This is best-effort: any error is printed as a traceback and swallowed,
    and the method returns ``None`` in every case.
    """
    tag_sql = """
        select tag_type,name from api_tag where is_online=true;
    """
    try:
        self.zhengxing_cursor.execute(tag_sql)
        # Rows are (tag_type, name); only the name is used here.
        for tag_row in self.zhengxing_cursor.fetchall():
            tag_name = tag_row[1]
            if not tag_name:
                # The try wraps the whole loop, so a NULL/empty name that
                # made jieba.add_word raise would stop every remaining tag
                # from loading. Skip such rows instead.
                continue
            jieba.add_word(tag_name)
            self.tag_words_set.add(tag_name)
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit still propagate.
        print(traceback.format_exc())
def get_tfidf_words_from_content(self, content):
    """Return the top TF-IDF keywords of *content* that are known tag words.

    Asks ``jieba.analyse.extract_tags`` for up to 20 weighted keywords and
    keeps, in the order returned, only the words that are present in
    ``self.tag_words_set``.

    Args:
        content: Text to analyse. Empty or ``None`` content yields ``[]``.

    Returns:
        A list of keyword strings; ``[]`` when nothing matches or when
        extraction fails (the traceback is printed, never raised).
    """
    if not content:
        # Nothing to analyse; return early rather than sending None or ""
        # through the extractor and the error path below.
        return []
    try:
        weighted_keywords = jieba.analyse.extract_tags(
            content, topK=20, withWeight=True
        )
        # With withWeight=True each item is a (word, weight) pair; the
        # weight is not used.
        return [
            word
            for word, _weight in weighted_keywords
            if word in self.tag_words_set
        ]
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit still propagate.
        print(traceback.format_exc())
        return []
# 知乎个人文章列表
def
test_member_article
(
self
):
...
...
@@ -318,7 +369,8 @@ class ZhihuAccount(object):
"title"
:
title
,
"content"
:
content
,
"type"
:
data_type
,
"question_id"
:
""
"question_id"
:
""
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
zhihu_spider_question_fd
.
write
(
json
.
dumps
(
question_item_dict
)
+
"
\n
"
)
elif
platform_id
not
in
question_answer_dict
[
question_id
]:
...
...
@@ -336,7 +388,8 @@ class ZhihuAccount(object):
"title"
:
title
,
"content"
:
content
,
"type"
:
data_type
,
"question_id"
:
question_id
"question_id"
:
question_id
,
"tags"
:
self
.
get_tfidf_words_from_content
(
content
)
}
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
...
...
@@ -400,7 +453,8 @@ class ZhihuAccount(object):
"title"
:
question_title
,
"content"
:
data_content
,
"type"
:
data_type
,
"question_id"
:
question_id
"question_id"
:
question_id
,
"tags"
:
self
.
get_tfidf_words_from_content
(
data_content
)
}
zhihu_spider_fd
.
write
(
json
.
dumps
(
item_dict
)
+
"
\n
"
)
...
...
@@ -416,6 +470,7 @@ if __name__ == '__main__':
account
=
ZhihuAccount
(
''
,
''
)
account
.
login
(
captcha_lang
=
'en'
,
load_cookies
=
True
)
account
.
add_jieba_tag_word
()
#account.test_member_article()
zhihu_spider_data_file
=
"./zhihu_spider_data.txt"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment