Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
61e83a10
Commit
61e83a10
authored
Aug 11, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
ec314915
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
4 additions
and
59 deletions
+4
-59
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+4
-4
yujiang-zhou-03492e71-21ea-4f52-9fdd-b9dbe21cc8da-v5.json
...yujiang-zhou-03492e71-21ea-4f52-9fdd-b9dbe21cc8da-v5.json
+0
-2
crontab
tasks/crontab/192.168.18.11/etc/crontab
+0
-53
No files found.
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
61e83a10
...
...
@@ -112,11 +112,11 @@ class push_rule(object):
def
scan_from_redis
(
push_rule_class_list
)
->
Dict
:
# len_id_list = rds.llen("doc_id")
set_name
=
"exists_doc_id_set_
%
s"
%
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d"
)
rds
.
sadd
(
set_name
,
"test"
)
rds
.
expire
(
set_name
,
259200
)
out_ts
=
datetime
.
datetime
.
now
()
.
timestamp
()
*
1e3
-
86400000
while
True
:
set_name
=
"exists_doc_id_set_
%
s"
%
datetime
.
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d"
)
rds
.
sadd
(
set_name
,
"test"
)
rds
.
expire
(
set_name
,
259200
)
out_ts
=
datetime
.
datetime
.
now
()
.
timestamp
()
*
1e3
-
86400000
doc_id
=
rds
.
lpop
(
"doc_id"
)
if
doc_id
:
res
=
rds
.
llen
(
doc_id
)
...
...
maintenance/EsLicense/yujiang-zhou-03492e71-21ea-4f52-9fdd-b9dbe21cc8da-v5.json
deleted
100644 → 0
View file @
ec314915
{
"license"
:{
"uid"
:
"03492e71-21ea-4f52-9fdd-b9dbe21cc8da"
,
"type"
:
"basic"
,
"issue_date_in_millis"
:
1571961600000
,
"expiry_date_in_millis"
:
1603670399999
,
"max_nodes"
:
100
,
"issued_to"
:
"YuJiang Zhou (csm)"
,
"issuer"
:
"Web Form"
,
"signature"
:
"AAAAAwAAAA1+zC+3Nc7L82v4y8hvAAABmC9ZN0hjZDBGYnVyRXpCOW5Bb3FjZDAxOWpSbTVoMVZwUzRxVk1PSmkxaktJRVl5MUYvUWh3bHZVUTllbXNPbzBUemtnbWpBbmlWRmRZb25KNFlBR2x0TXc2K2p1Y1VtMG1UQU9TRGZVSGRwaEJGUjE3bXd3LzRqZ05iLzRteWFNekdxRGpIYlFwYkJiNUs0U1hTVlJKNVlXekMrSlVUdFIvV0FNeWdOYnlESDc3MWhlY3hSQmdKSjJ2ZTcvYlBFOHhPQlV3ZHdDQ0tHcG5uOElCaDJ4K1hob29xSG85N0kvTWV3THhlQk9NL01VMFRjNDZpZEVXeUtUMXIyMlIveFpJUkk2WUdveEZaME9XWitGUi9WNTZVQW1FMG1DenhZU0ZmeXlZakVEMjZFT2NvOWxpZGlqVmlHNC8rWVVUYzMwRGVySHpIdURzKzFiRDl4TmM1TUp2VTBOUlJZUlAyV0ZVL2kvVk10L0NsbXNFYVZwT3NSU082dFNNa2prQ0ZsclZ4NTltbU1CVE5lR09Bck93V2J1Y3c9PQAAAQCO/MqlprtquUMe/M6sXV7TdP8yFjvFAkIi7yMSQemhy3ORqgjk+jFLu0LtKtD051cy6PjKGP8qvbrYQTFIIU0PiMW5dVfHbGA75EbXOExhW1tSyiKvFNBb0ewCXdQVL+CwQFxtJ5oRmgzyKlYXxuS3gyb2fNgbRTnM6anLExA8WiJJTpAZ77xiHlN/rXSk9+VqdpEtSHai6/2KtgF+ENFMIgOcX5yXB3tWdMq2R6toPtEk1Mdg82XR5e1NVFWnBxARqym0rikaarkdARrliQpzoVZGsFUlgL27hGoRNXEdKydsE3aBla5yUoiwHqQeMc/cfsLnMdp71Tg08XfwnGNA"
,
"start_date_in_millis"
:
1571961600000
}}
\ No newline at end of file
tasks/crontab/192.168.18.11/etc/crontab
deleted
100644 → 0
View file @
ec314915
SHELL=/bin/bash
PATH=/sbin:/bin:/usr/sbin:/usr/bin
MAILTO=hanye
HOME=/
# For details see man 4 crontabs
# Example of job definition:
# .---------------- minute (0 - 59)
# | .------------- hour (0 - 23)
# | | .---------- day of month (1 - 31)
# | | | .------- month (1 - 12) OR jan,feb,mar,apr ...
# | | | | .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat
# | | | | |
# * * * * * user-name command to be executed
# 1 update video data in target release index daily
0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -n 10 -s 10
0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -n 30 -s 10
0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p iqiyi -n 30 -s 10
0 0 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -n 10 -s 10
# 2 write crawled data into short-video-production index daily
0 05 * * * hanye python3 /home/hanye/crawlers/tasks/update_DU_ATU_from_crawler_raw.py
# 3 create redis url batch
0 18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30
0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/scheduler/generate_redis_url_batch.py -p iqiyi -p 腾讯视频 -b 02 -d 30
# 4 scrap redis url list
0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p 腾讯视频 -b 02
0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_redis_urls.py -p iqiyi -b 02
0 03 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages.py -p youku -n 30
# 5 scrap list pages
0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p iqiyi -n 30
0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p youku -n 30
0 10 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p 腾讯视频 -n 30
0 7,10,15 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/scrap_list_pages_multi_process.py -p toutiao -n 160 -s 10
# update haokan target releaser
0 0,8,12,18 * * * hanye python3 /home/hanye/crawlers/crawler_sys/framework/update_data_in_target_releasers_single_thread.py -p haokan -n 20
# get tencent news search page
0 0,4,8,10,14,16,20 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/search_page_single_process.py -p 腾讯新闻
# high-frequency releasers: crawler runs hourly at 00:00 and from 07:00 to 22:00; currently only for haokan
0 0,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/high_fre_releasers.py -p haokan
# high-frequency releasers: run at midnight, 1 pm and 6 pm
0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p toutiao -fre 3 -n 20 -s 15
0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p 腾讯视频 -fre 3 -n 20 -s 15
0 0,13,18 * * * hanye python3 /home/hanye/crawlersNew/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process.py -p new_tudou -fre 3 -n 20 -s 15
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment