Chengyang Zhong / crawler · Commits · f74bbb7f

Commit f74bbb7f authored 5 years ago by litaolemo

    update

Parent: 42f394f9
Branches: master, xiangwan
No related merge requests found

Showing 1 changed file with 15 additions and 10 deletions:

    crawler_sys/site_crawler_by_redis/crawler_douban.py  (+15, -10)
@@ -62,7 +62,7 @@ class CrawlerDouban():
     def get_single_page(self, mid, proxies):
         count_true = 0
-        while count_true <= 3:
+        while count_true <= 5:
             try:
                 count_true += 1
                 url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(mid, random.randint(10000, 99999), random.choice(self.sig_list))
@@ -93,6 +93,8 @@ class CrawlerDouban():
             except Exception as e:
+                print("single page error %s" % e)
                 continue
+        print("single page error")
         return None

     def get_releaser_id(self, releaserUrl):
         return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
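Note on the two hunks above: the retry cap in get_single_page moves from 3 to 5, each failed attempt now logs the exception before retrying, and a final message is printed once every attempt has failed. A minimal sketch of the same retry pattern, assuming a hypothetical fetch() callable in place of the real Douban request:

    # Hypothetical stand-in for the request/parse step inside get_single_page.
    def get_with_retries(fetch, cap=5):
        count_true = 0
        while count_true <= cap:          # as in the diff, this allows cap + 1 tries
            try:
                count_true += 1
                return fetch()            # success: hand back the parsed page
            except Exception as e:
                print("single page error %s" % e)   # log and retry
                continue
        print("single page error")        # every attempt failed
        return None

    # Example: get_with_retries(lambda: {"ok": True}) returns the dict on the first try.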
@@ -159,8 +161,8 @@ class CrawlerDouban():
             for one in page_dic:
                 releaser_id = one["author"]["id"]
                 mid = one["id"]
-                try:
+                if True:
+                # try:
                     res_dic = {
                         "release_time": trans_strtime_to_timestamp(one["create_time"]),
                         "url": one["url"],
@@ -180,13 +182,13 @@ class CrawlerDouban():
                                           doc_id_type="all-time-url")
                     res_dic["doc_id"] = doc_id
                     res_dic.update(self.get_single_page(mid, proxies_num))
-                    print(res_dic)
+                    # print(res_dic)
                     yield res_dic
-                except Exception as e:
-                    print(one)
-                    print("row formate error %s" % e)
-                    continue
+                # except Exception as e:
+                #     print(one)
+                #     print("row formate error %s" % e)
+                #     continue

     # @logged
     def releaser_page(self, releaserUrl,
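These two hunks swap the broad try/except around row parsing for an `if True:` block and comment the handler out, so a malformed row now raises instead of being printed and skipped. A small sketch of that debug toggle, with hypothetical rows/parse_row names standing in for the real parsing code:

    DEBUG_RAISE = True   # mirrors the "if True:" edit in the diff

    def iter_parsed(rows, parse_row):
        for one in rows:
            if DEBUG_RAISE:
                # let exceptions propagate so bad rows are easy to spot
                yield parse_row(one)
            else:
                try:
                    yield parse_row(one)
                except Exception as e:
                    print(one)
                    print("row formate error %s" % e)   # original log text kept verbatim
                    continue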
@@ -244,7 +246,9 @@ if __name__ == '__main__':
     # for r in res:
     #     print(r)
     for u in url_list:
-        test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
-                                   es_index='crawler-data-raw',
-                                   doc_type='doc', releaser_page_num_max=4000)
+        ttt = test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True,
+                                         es_index='crawler-data-raw',
+                                         doc_type='doc', releaser_page_num_max=4000, allow=20)
+        for t in ttt:
+            print(t)
     # test.get_single_page(4524055937468233)
\ No newline at end of file
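Note on the __main__ change: the call result is now bound to ttt and iterated, which suggests releaser_page_by_time returns a generator (an earlier hunk shows a yield res_dic in the crawling code), so the for-loop is what actually drives the crawl and prints each result. A minimal illustration, with a hypothetical crawl() in place of CrawlerDouban.releaser_page_by_time:

    # Generator bodies run lazily: nothing is fetched until the caller iterates.
    def crawl(pages):
        for p in range(pages):
            print("fetching page %d" % p)   # side effect happens only during iteration
            yield {"page": p}

    results = crawl(3)      # no pages fetched yet
    for r in results:       # iteration drives the crawl, as the new loop does with ttt
        print(r)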