Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
d540d564
Commit
d540d564
authored
4 years ago
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
fea8a745
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
17 additions
and
3 deletions
+17
-3
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+4
-2
crawler_douban.py
crawler_sys/site_crawler_by_redis/crawler_douban.py
+1
-0
crawler_weibo.py
crawler_sys/site_crawler_by_redis/crawler_weibo.py
+6
-0
crawler_weibo.py
crawler_sys/site_crawler_test/crawler_weibo.py
+6
-1
No files found.
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
d540d564
...
...
@@ -133,6 +133,8 @@ def scan_from_redis(push_rule_class_list) -> Dict:
comment_count
=
one_data
.
get
(
"comment_count"
)
favorite_count
=
one_data
.
get
(
"favorite_count"
)
continue
if
one_data
.
get
(
"article_type"
)
!=
"article"
:
continue
for
push_bool
in
push_rule_class_list
:
bool_res
=
push_bool
.
parse_data
(
fetch_time_last
=
fetch_time
,
repost_count_last
=
repost_count
,
comment_count_last
=
comment_count
,
...
...
@@ -315,11 +317,11 @@ def write_data_into_mysql(res_data):
sql_query
=
"""insert into api_tractate
(user_id,content,is_online,status,platform,content_level,is_excellent,create_time,last_modified,user_del,low_quality,low_quality_deal,platform_id,pgc_type,title)
values ({user_id},'{content}',{is_online},{status},{platform},{content_level},{is_excellent},'{create_time}','{last_modified}',{user_del},{low_quality},{low_quality_deal},'{platform_id}',{pgc_type},'{title}');"""
.
format
(
user_id
=
random
.
choice
(
user_id_list
),
content
=
data
[
"content"
],
is_online
=
1
,
status
=
2
,
platform
=
15
,
user_id
=
random
.
choice
(
user_id_list
),
content
=
data
[
"content"
],
is_online
=
0
,
status
=
2
,
platform
=
15
,
content_level
=
data
[
"level"
],
is_excellent
=
0
,
create_time
=
now_str
,
last_modified
=
now_str
,
user_del
=
0
,
low_quality
=
0
,
low_quality_deal
=
0
,
platform_id
=
data
[
"doc_id"
],
pgc_type
=
1
,
title
=
data
[
"title"
])
low_quality
=
0
,
low_quality_deal
=
0
,
platform_id
=
data
[
"doc_id"
],
pgc_type
=
0
,
title
=
data
[
"title"
])
res
=
cur
.
execute
(
sql_query
)
tractate_id
=
int
(
conn
.
insert_id
())
if
res
:
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_by_redis/crawler_douban.py
View file @
d540d564
...
...
@@ -178,6 +178,7 @@ class CrawlerDouban():
'video_img'
:
one
[
"cover_url"
],
"mid"
:
mid
,
"platform"
:
"douban"
,
"article_type"
:
"article"
# "doc_id":doc_id
}
doc_id
=
cal_doc_id
(
platform
=
res_dic
[
"platform"
],
url
=
res_dic
[
"url"
],
data_dict
=
res_dic
,
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_by_redis/crawler_weibo.py
View file @
d540d564
...
...
@@ -159,6 +159,11 @@ class Crawler_weibo():
text
,
repost_count
,
comment_count
,
favorite_count
=
self
.
get_single_page
(
mid
)
else
:
text
=
mblog
[
"raw_text"
]
if
mblog
.
get
(
"page_info"
):
article_type
=
mblog
.
get
(
"page_info"
)
.
get
(
"type"
)
else
:
article_type
=
"article"
res_dic
=
{
"release_time"
:
trans_strtime_to_timestamp
(
mblog
[
"created_at"
]),
"fetch_time"
:
int
(
datetime
.
datetime
.
now
()
.
timestamp
()
*
1e3
),
...
...
@@ -176,6 +181,7 @@ class Crawler_weibo():
"releaser_id_str"
:
"weibo_
%
s"
%
releaser_id
,
"img_list"
:
self
.
get_img
(
mblog
),
"platform"
:
"weibo"
,
"article_type"
:
article_type
# "doc_id":doc_id
}
res_dic
[
"doc_id"
]
=
cal_doc_id
(
platform
=
"weibo"
,
url
=
one
[
"scheme"
],
data_dict
=
res_dic
,
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_test/crawler_weibo.py
View file @
d540d564
...
...
@@ -179,6 +179,10 @@ class Crawler_weibo():
text
,
repost_count
,
comment_count
,
favorite_count
=
self
.
get_single_page
(
mid
)
else
:
text
=
mblog
[
"raw_text"
]
if
mblog
.
get
(
"page_info"
):
article_type
=
mblog
.
get
(
"page_info"
)
.
get
(
"type"
)
res_dic
=
{
"release_time"
:
trans_strtime_to_timestamp
(
mblog
[
"created_at"
]),
"url"
:
one
[
"scheme"
],
...
...
@@ -193,7 +197,8 @@ class Crawler_weibo():
"mid"
:
mid
,
"releaserUrl"
:
"https://www.weibo.com/u/
%
s"
%
releaser_id
,
"releaser_id_str"
:
"weibo_
%
s"
%
releaser_id
,
"platform"
:
"weibo"
"platform"
:
"weibo"
,
"article_type"
:
article_type
}
# from write_data_into_es.func_cal_doc_id import cal_doc_id
# id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment