Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chengyang Zhong
crawler
Commits
5761803e
Commit
5761803e
authored
Nov 26, 2020
by
向万
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
code
parent
aff367ce
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
25 additions
and
20 deletions
+25
-20
crawler_zhihu_test.py
crawler_sys/site_crawler/crawler_zhihu_test.py
+25
-20
No files found.
crawler_sys/site_crawler/crawler_zhihu_test.py
View file @
5761803e
...
...
@@ -33,17 +33,19 @@ class Crawler_zhihu():
self
.
cur
.
execute
(
"drop table if exists zhihu_answer"
)
sql
=
"""create table zhihu_answer(title char(40),
content text(59999),
id int,
answer_
id int,
created_time int,
comment_count int)"""
comment_count int,
new_content text(59999))"""
self
.
cur
.
execute
(
sql
)
self
.
conn
.
commit
()
self
.
cur
.
execute
(
"drop table if exists zhihu_article"
)
sql
=
"""create table zhihu_article(title char(40),
content text(59999),
id int,
article_
id int,
created_time int,
comment_count int)"""
comment_count int,
new_content text(59999))"""
self
.
cur
.
execute
(
sql
)
self
.
conn
.
commit
()
self
.
cur
.
execute
(
"drop table if exists zhihu_answer_root_comment"
)
...
...
@@ -91,7 +93,7 @@ class Crawler_zhihu():
self
.
conn
.
commit
()
self
.
cur
.
execute
(
"drop table if exists zhihu_thought"
)
sql
=
"""create table zhihu_thought(id char(50),
sql
=
"""create table zhihu_thought(
thought_
id char(50),
content text(59999),
created_time int,
comment_count int)"""
...
...
@@ -147,11 +149,11 @@ class Crawler_zhihu():
def
parse_sigle_page
(
self
,
data_dict
,
mark
):
if
mark
==
0
:
into
=
"insert into zhihu_answer(title, content,
id, created_time, comment_count) value(
%
s,
%
s,
%
s,
%
s,
%
s)"
values
=
(
data_dict
[
"question"
][
"title"
],
data_dict
[
"content"
],
data_dict
[
"id"
],
data_dict
[
"created_time"
],
data_dict
[
"comment_count"
])
into
=
"insert into zhihu_answer(title, content,
answer_id, created_time, comment_count, new_content) value(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s)"
values
=
(
data_dict
[
"question"
][
"title"
],
data_dict
[
"content"
],
data_dict
[
"id"
],
data_dict
[
"created_time"
],
data_dict
[
"comment_count"
]
,
data_dict
[
"content"
]
)
elif
mark
==
1
:
into
=
"insert into zhihu_article(title, content,
id, created_time, comment_count) value(
%
s,
%
s,
%
s,
%
s,
%
s)"
values
=
(
data_dict
[
"title"
],
data_dict
[
"content"
],
data_dict
[
"id"
],
data_dict
[
"created"
],
data_dict
[
"comment_count"
])
into
=
"insert into zhihu_article(title, content,
article_id, created_time, comment_count, new_content) value(
%
s,
%
s,
%
s,
%
s,
%
s,
%
s)"
values
=
(
data_dict
[
"title"
],
data_dict
[
"content"
],
data_dict
[
"id"
],
data_dict
[
"created"
],
data_dict
[
"comment_count"
]
,
data_dict
[
"content"
]
)
self
.
cur
.
execute
(
into
,
values
)
self
.
conn
.
commit
()
offset
=
0
...
...
@@ -198,6 +200,7 @@ class Crawler_zhihu():
self
.
answer_refresh_content
()
self
.
article_picture_doneload_and_cut
()
self
.
article_refresh_content
()
self
.
thought_picture_doneload_and_cut
()
self
.
answer_data_complex
()
self
.
conn
.
close
()
return
...
...
@@ -220,7 +223,7 @@ class Crawler_zhihu():
print
(
"article_error"
)
page_dict
=
get_page
.
json
()
if
page_dict
.
get
(
"data"
):
for
one_line
in
page_dict
[
'data'
]:
for
one_line
in
page_dict
[
'data'
]
[:
1
]
:
try
:
if
one_line
[
"content"
]
!=
None
:
self
.
parse_sigle_page
(
one_line
,
mark
)
...
...
@@ -450,10 +453,12 @@ class Crawler_zhihu():
tuple
=
self
.
cur
.
fetchall
()
self
.
conn
.
commit
()
for
i
in
range
(
len
(
tuple
)):
if
tuple
[
i
][
2
]
==
None
:
continue
find_id
=
tuple
[
i
][
0
]
temp
=
str
(
tuple
[
i
][
1
])
temp1
=
temp
.
replace
(
"?"
,
"#"
)
sql
=
"""select
content from zhihu_answer where zhihu_answer.
id = '{}' """
.
format
(
find_id
)
sql
=
"""select
new_content from zhihu_answer where zhihu_answer.answer_
id = '{}' """
.
format
(
find_id
)
self
.
cur
.
execute
(
sql
)
tuples
=
self
.
cur
.
fetchall
()
# tuples = str(tuples)
...
...
@@ -464,7 +469,7 @@ class Crawler_zhihu():
temp_tuples
)
new_content
=
r'
%
s'
%
(
new_content
)
new_content
=
escape_string
(
new_content
)
sql
=
"""update zhihu_answer set
content = '{}' WHERE
id = '{}' """
.
format
(
new_content
,
tuple
[
i
][
0
])
sql
=
"""update zhihu_answer set
new_content = '{}' WHERE answer_
id = '{}' """
.
format
(
new_content
,
tuple
[
i
][
0
])
self
.
cur
.
execute
(
sql
)
self
.
conn
.
commit
()
...
...
@@ -541,7 +546,7 @@ class Crawler_zhihu():
find_id
=
tuple
[
i
][
0
]
temp
=
str
(
tuple
[
i
][
1
])
temp1
=
temp
.
replace
(
"?"
,
"#"
)
sql
=
"""select
content from zhihu_article where zhihu_article.
id = '{}' """
.
format
(
find_id
)
sql
=
"""select
new_content from zhihu_article where zhihu_article.article_
id = '{}' """
.
format
(
find_id
)
self
.
cur
.
execute
(
sql
)
tuples
=
self
.
cur
.
fetchall
()
# tuples = str(tuples)
...
...
@@ -552,7 +557,7 @@ class Crawler_zhihu():
temp_tuples
)
new_content
=
r'
%
s'
%
(
new_content
)
new_content
=
escape_string
(
new_content
)
sql
=
"""update zhihu_article set
content = '{}' WHERE
id = '{}' """
.
format
(
new_content
,
tuple
[
i
][
0
])
sql
=
"""update zhihu_article set
new_content = '{}' WHERE article_
id = '{}' """
.
format
(
new_content
,
tuple
[
i
][
0
])
self
.
cur
.
execute
(
sql
)
self
.
conn
.
commit
()
...
...
@@ -571,7 +576,7 @@ class Crawler_zhihu():
print
(
"article_error"
)
page_dict
=
get_page
.
json
()
if
page_dict
.
get
(
"data"
):
for
one_line
in
page_dict
[
'data'
]:
for
one_line
in
page_dict
[
'data'
]
[:
1
]
:
try
:
self
.
parse_thought_sigle_page
(
one_line
)
print
(
"finshed_article"
+
offset
)
...
...
@@ -589,7 +594,7 @@ class Crawler_zhihu():
for
one_dict
in
data_dict
[
"content"
]:
if
one_dict
[
"type"
]
==
"text"
:
into
=
"insert into zhihu_thought(content, id, created_time, comment_count) value(
%
s,
%
s,
%
s,
%
s)"
into
=
"insert into zhihu_thought(content,
thought_
id, created_time, comment_count) value(
%
s,
%
s,
%
s,
%
s)"
values
=
(
one_dict
[
"content"
],
data_dict
[
"id"
],
data_dict
[
"created"
],
data_dict
[
"comment_count"
])
self
.
cur
.
execute
(
into
,
values
)
self
.
conn
.
commit
()
...
...
@@ -712,7 +717,7 @@ class Crawler_zhihu():
#封装回答最终数据结果格式
def
answer_data_complex
(
self
):
sql
=
"""select id, content, created_time, comment_count from zhihu_answer"""
sql
=
"""select
anwser_
id, content, created_time, comment_count from zhihu_answer"""
self
.
cur
.
execute
(
sql
)
topics
=
[]
...
...
@@ -756,7 +761,7 @@ class Crawler_zhihu():
#封装文章最终数据结果格式
def
article_data_complex
(
self
):
sql
=
"""select id, content, created_time, comment_count from zhihu_article"""
sql
=
"""select
article_
id, content, created_time, comment_count from zhihu_article"""
self
.
cur
.
execute
(
sql
)
topics
=
[]
...
...
@@ -800,7 +805,7 @@ class Crawler_zhihu():
#封装回答最终数据结果格式
def
thought_data_complex
(
self
):
sql
=
"""select id, content, created_time, comment_count from zhihu_thought"""
sql
=
"""select
thought_
id, content, created_time, comment_count from zhihu_thought"""
self
.
cur
.
execute
(
sql
)
topics
=
[]
...
...
@@ -845,5 +850,5 @@ if __name__ == '__main__':
print
(
datetime
.
now
())
zhihu
=
Crawler_zhihu
()
zhihu
.
search_page
(
1
,
0
,
0
)
zhihu
.
search_page
(
1
,
1
,
1
)
print
(
datetime
.
now
())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment