Chengyang Zhong / crawler · Commits

Commit 8a17245b authored Nov 26, 2020 by haowang

    modify

Parent: aff367ce

Showing 1 changed file with 172 additions and 119 deletions:

crawler_sys/site_crawler/crawler_zhihu_test.py (+172 / -119)
...
@@ -24,94 +24,96 @@ from pymysql import escape_string
class Crawler_zhihu():
    # Initialize the database and adjust the JS rules
    def __init__(self):
        '''
        Initialize the database and adjust the JS rules
        '''
        self.conn = pymysql.connect(host='bj-cdb-b8oeejac.sql.tencentcdb.com', port=62118, user='work',
                                    passwd='Gengmei1', db='mimas_dev', charset='utf8')
        self.cur = self.conn.cursor()

        self.cur.execute("drop table if exists zhihu_answer")
        sql = """create table zhihu_answer(title char(40),
            content text(59999),
            id int,
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article")
        sql = """create table zhihu_article(title char(40),
            content text(59999),
            id int,
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_answer_root_comment")
        sql = """create table zhihu_answer_root_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            answerid int,
            child_comment_count int,
            featured char(5),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_child_comment")
        sql = """create table zhihu_child_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            reply_name char(40),
            child_comment_id int,
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article_root_comment")
        sql = """create table zhihu_article_root_comment(root_comment_id int,
            author_name char(40),
            content text(59999),
            answerid int,
            child_comment_count int,
            featured char(5),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_answer_picture_url")
        sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_article_picture_url")
        sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought")
        sql = """create table zhihu_thought(id char(50),
            content text(59999),
            created_time int,
            comment_count int)"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought_comment")
        sql = """create table zhihu_thought_comment(thought_comment_id int,
            author_name char(40),
            content text(59999),
            answerid char(50),
            created_time int,
            author_id char(50))"""
        self.cur.execute(sql)
        self.conn.commit()

        self.cur.execute("drop table if exists zhihu_thought_picture_url")
        sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        self.cur.execute(sql)
        self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_answer")
        # sql = """create table zhihu_answer(title char(40),
        #     content text(59999),
        #     id int,
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_article")
        # sql = """create table zhihu_article(title char(40),
        #     content text(59999),
        #     id int,
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_answer_root_comment")
        # sql = """create table zhihu_answer_root_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid int,
        #     child_comment_count int,
        #     featured char(5),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        # self.cur.execute("drop table if exists zhihu_child_comment")
        # sql = """create table zhihu_child_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     reply_name char(40),
        #     child_comment_id int,
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_article_root_comment")
        # sql = """create table zhihu_article_root_comment(root_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid int,
        #     child_comment_count int,
        #     featured char(5),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_answer_picture_url")
        # sql = """create table zhihu_answer_picture_url(answer_id int, url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_article_picture_url")
        # sql = """create table zhihu_article_picture_url(article_id int, url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought")
        # sql = """create table zhihu_thought(id char(50),
        #     content text(59999),
        #     created_time int,
        #     comment_count int)"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought_comment")
        # sql = """create table zhihu_thought_comment(thought_comment_id int,
        #     author_name char(40),
        #     content text(59999),
        #     answerid char(50),
        #     created_time int,
        #     author_id char(50))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        #
        # self.cur.execute("drop table if exists zhihu_thought_picture_url")
        # sql = """create table zhihu_thought_picture_url(thought_id char(50), url text(59999), new_url text(59999))"""
        # self.cur.execute(sql)
        # self.conn.commit()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        try:
            ...
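The __init__ above repeats an identical drop/create/commit sequence for every table. A minimal sketch of how that pattern could be factored into a helper; the _recreate_table name and the explicit cur/conn arguments are assumptions for illustration, not part of this commit:

    import pymysql

    def _recreate_table(cur, conn, name, columns_sql):
        # Drop the table if it already exists, recreate it, and commit the DDL.
        cur.execute("drop table if exists {}".format(name))
        cur.execute("create table {}({})".format(name, columns_sql))
        conn.commit()

    # Hypothetical usage with the same schema as zhihu_answer above:
    # conn = pymysql.connect(host='...', port=3306, user='...', passwd='...', db='...', charset='utf8')
    # cur = conn.cursor()
    # _recreate_table(cur, conn, "zhihu_answer",
    #                 "title char(40), content text(59999), id int, created_time int, comment_count int")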
@@ -123,8 +125,11 @@ class Crawler_zhihu():
            # print(js)
            self.exec_js = execjs.compile(js, cwd=r'/usr/local/lib/node_modules')

    # Refresh the cookies
    def get_serach_page_cookies(self):
        '''
        Refresh the cookies
        '''
        url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            ...
@@ -143,8 +148,10 @@ class Crawler_zhihu():
        requests_res = retry_get_url(url, headers=headers)
        return requests_res.cookies.get_dict()

    # Insert the main content data and picture urls, then look for comments
    def parse_sigle_page(self, data_dict, mark):
        '''
        Insert the main content data and picture urls, then look for comments
        '''
        if mark == 0:
            into = "insert into zhihu_answer(title, content, id, created_time, comment_count) value(%s, %s, %s, %s, %s)"
            ...
@@ -175,8 +182,10 @@ class Crawler_zhihu():
        return

    # Main entry point
    def search_page(self, answer_page_max, article_page_max, thought_page_max):
        '''
        Main entry point
        '''
        offset = 0
        for i in range(answer_page_max):
            ...
@@ -193,18 +202,18 @@ class Crawler_zhihu():
            self.search_thought_page(offset)
            offset = offset + 20
        self.answer_picture_doneload_and_cut()
        self.answer_refresh_content()
        self.article_picture_doneload_and_cut()
        self.article_refresh_content()
        self.answer_data_complex()
        # self.answer_picture_doneload_and_cut()
        # self.answer_refresh_content()
        # self.article_picture_doneload_and_cut()
        # self.article_refresh_content()
        # self.answer_data_complex()
        self.conn.close()
        return

    # Request the answer and article data packages
    def search_answer_article_page(self, offset, mark, proxies_num=0):
        '''
        Request the answer and article data packages
        '''
        offset = str(offset)
        if mark == 0:
            url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created".format(offset)
            ...
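The include parameter in the url above is stored already percent-encoded, which makes it hard to read or edit. A small sketch of assembling an equivalent request url from readable values with the standard library; the field list shown is only a subset and this approach is an assumption, not what the committed code does:

    from urllib.parse import urlencode

    # Subset of the fields requested from the answers endpoint, in readable form.
    include = ("data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,"
               "comment_count,can_comment,content,voteup_count,created_time,updated_time")
    params = {"include": include, "offset": 0, "limit": 20, "sort_by": "created"}
    url = ("https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?"
           + urlencode(params))
    print(url)  # brackets and commas come out percent-encoded (%5B, %5D, %2C)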
@@ -236,8 +245,10 @@ class Crawler_zhihu():
        return

    # Request the root comment data packages
    def search_root_comment(self, answerid, offset, mark, proxies_num=0):
        '''
        Request the root comment data packages
        '''
        offset = str(offset)
        answerid = str(answerid)
        if mark == 0:
            ...
@@ -270,8 +281,10 @@ class Crawler_zhihu():
        return next

    # Insert the root comment data and link its child comments
    def root_comment_data(self, data_dict, answerid, mark):
        '''
        Insert the root comment data and link its child comments
        '''
        if mark == 0:
            into = "insert into zhihu_answer_root_comment(root_comment_id, author_name, content, answerid, child_comment_count, featured, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s, %s)"
        elif mark == 1:
            ...
@@ -289,8 +302,10 @@ class Crawler_zhihu():
        return

    # Request the child comment data packages
    def search_child_comment(self, root_comment_id, offset, proxies_num=0):
        '''
        Request the child comment data packages
        '''
        root_comment_id = str(root_comment_id)
        offsets = offset
        offset = str(offset)
        ...
@@ -321,8 +336,10 @@ class Crawler_zhihu():
            next = 1
        return next

    # Insert the child comment data
    def child_comment_data(self, data_dict, root_comment_id):
        '''
        Insert the child comment data
        '''
        into = "insert into zhihu_child_comment(root_comment_id, author_name, content, reply_name, child_comment_id, created_time, author_id) value(%s, %s, %s, %s, %s, %s, %s)"
        values = (root_comment_id, data_dict["author"]["member"]["name"], data_dict["content"],
                  data_dict["reply_to_author"]["member"]["name"], data_dict["id"],
                  data_dict["created_time"], data_dict["author"]["member"]["name"])
        ...
@@ -331,8 +348,10 @@ class Crawler_zhihu():
        return

    # Disguise the request headers
    def headers_handle(self, url):
        '''
        Disguise the request headers
        '''
        res_cookies_dict = self.get_serach_page_cookies()
        headers_search = {
            ...
@@ -364,8 +383,10 @@ class Crawler_zhihu():
        headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
        return headers_search, cookies_dict

    # Download and crop the answer pictures
    def answer_picture_doneload_and_cut(self):
        '''
        Download and crop the answer pictures
        '''
        sql = """select answer_id, url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
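headers_handle above signs each request by calling a compiled JS function ("b") through execjs and prefixing the result with "1.0_" for the x-zse-86 header. A self-contained sketch of the same compile-and-call flow with a stand-in script; the real signing JavaScript is not reproduced here and the toy function below is only an assumption:

    import execjs  # PyExecJS; needs a JS runtime such as Node on the PATH

    # Stand-in for the site's signing script; the real "b" implements Zhihu's hashing.
    js_source = 'function b(fmd5) { return "signed-" + fmd5; }'

    ctx = execjs.compile(js_source)
    fmd5 = "d41d8cd98f00b204e9800998ecf8427e"   # md5 of the string being signed
    x_zse_86 = "1.0_" + ctx.call("b", fmd5)     # same header format the crawler builds
    print(x_zse_86)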
@@ -407,7 +428,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -427,8 +447,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Upload the picture and get its new url
    def upload_image_with_path(self, path, img_type=IMG_TYPE.TOPICIMAGE):
        '''
        Upload the picture and get its new url
        '''
        '''Handle pictures hosted off-site'''
        try:
            # with open(path, 'rb') as f:
            ...
@@ -443,8 +465,10 @@ class Crawler_zhihu():
            print('upload ..... error')
        return None

    # Replace the urls and refresh the answer content
    def answer_refresh_content(self):
        '''
        Replace the urls and refresh the answer content
        '''
        sql = """select answer_id, url, new_url from zhihu_answer_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -464,12 +488,14 @@ class Crawler_zhihu():
                temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_answer set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            sql = """update zhihu_answer set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()

    # Download and crop the article pictures
    def article_picture_doneload_and_cut(self):
        '''
        Download and crop the article pictures
        '''
        sql = """select article_id, url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
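The update statements in the hunk above build SQL with str.format after passing the content through escape_string. For comparison, a sketch of the same update written with pymysql's parameter binding, which handles the escaping itself; the helper name and arguments are assumptions for illustration:

    def update_answer_content(conn, answer_id, new_content):
        # Bind the values as parameters; pymysql escapes them, so escape_string()
        # and manual string formatting are not needed.
        with conn.cursor() as cur:
            cur.execute("update zhihu_answer set content = %s where id = %s",
                        (new_content, answer_id))
        conn.commit()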
@@ -511,7 +537,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -531,8 +556,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Replace the urls and refresh the article content
    def article_refresh_content(self):
        '''
        Replace the urls and refresh the article content
        '''
        sql = """select article_id, url, new_url from zhihu_article_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -552,13 +579,14 @@ class Crawler_zhihu():
                temp_tuples)
            new_content = r'%s' % (new_content)
            new_content = escape_string(new_content)
            sql = """update zhihu_article set content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            sql = """update zhihu_article set new_content = '{}' WHERE id = '{}' """.format(new_content, tuple[i][0])
            self.cur.execute(sql)
            self.conn.commit()

    # Request the thought data packages
    def search_thought_page(self, offset, proxies_num=0):
        '''
        Request the thought data packages
        '''
        offset = str(offset)
        url = "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment".format(offset)
        [headers_search, cookies_dict] = self.headers_handle(url)
        ...
@@ -584,9 +612,10 @@ class Crawler_zhihu():
        return

    # Insert the thought content
    def parse_thought_sigle_page(self, data_dict):
        '''
        Insert the thought content
        '''
        for one_dict in data_dict["content"]:
            if one_dict["type"] == "text":
                into = "insert into zhihu_thought(content, id, created_time, comment_count) value(%s, %s, %s, %s)"
                ...
@@ -607,8 +636,10 @@ class Crawler_zhihu():
        return

    # Request the thought comment data packages
    def search_thought_comment(self, answerid, offset, proxies_num=0):
        '''
        Request the thought comment data packages
        '''
        offset = str(offset)
        answerid = str(answerid)
        url = "https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open".format(answerid, offset)
        ...
@@ -638,8 +669,10 @@ class Crawler_zhihu():
        return next

    # Insert the thought comment data
    def thought_comment_data(self, data_dict, answerid):
        '''
        Insert the thought comment data
        '''
        into = "insert into zhihu_thought_comment(thought_comment_id, author_name, content, answerid, created_time, author_id) value(%s, %s, %s, %s, %s, %s)"
        values = (data_dict["id"], data_dict["author"]["member"]["name"], data_dict["content"],
                  answerid, data_dict["created_time"], data_dict["author"]["member"]["id"])
        self.cur.execute(into, values)
        ...
@@ -647,8 +680,10 @@ class Crawler_zhihu():
        return

    # Download and crop the thought pictures
    def thought_picture_doneload_and_cut(self):
        '''
        Download and crop the thought pictures
        '''
        sql = """select thought_id, url from zhihu_thought_picture_url"""
        self.cur.execute(sql)
        tuple = self.cur.fetchall()
        ...
@@ -690,7 +725,6 @@ class Crawler_zhihu():
            self.cur.execute(sql)
            self.conn.commit()
        # for picture_deals in picture_deal:
        #     result = str(list[i])
        #     result = pattern.findall(result)
        ...
@@ -710,8 +744,10 @@ class Crawler_zhihu():
        #     paths = "/Users/xuwei/Desktop/picture_cut/num" + str(i) + ".jpg"
        #     cv2.imwrite(paths, cropped)

    # Assemble the final answer data format
    def answer_data_complex(self):
        '''
        Assemble the final answer data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_answer"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -754,8 +790,10 @@ class Crawler_zhihu():
            )
        return topics

    # Assemble the final article data format
    def article_data_complex(self):
        '''
        Assemble the final article data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_article"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -798,8 +836,10 @@ class Crawler_zhihu():
            )
        return topics

    # Assemble the final thought data format
    def thought_data_complex(self):
        '''
        Assemble the final thought data format
        '''
        sql = """select id, content, created_time, comment_count from zhihu_thought"""
        self.cur.execute(sql)
        topics = []
        ...
@@ -834,6 +874,19 @@ class Crawler_zhihu():
                }
            )
        return topics

    def clean_data(self):
        self.answer_refresh_content()
        self.article_picture_doneload_and_cut()
        self.article_refresh_content()
        self.conn.close()
        return

    def complex_data(self):
        self.answer_data_complex()
        self.article_data_complex()
        self.thought_data_complex()
        self.conn.close()

if __name__ == '__main__':
    ...