backend / crawler · Commits · 81cd3b51

Commit 81cd3b51
authored 4 years ago by litaolemo
update

parent 233c49d3
Branches containing this commit: master, litao, mr/develop/xiaohongshu, soyang
No related merge requests found
Showing 16 changed files with 62 additions and 121 deletions
README.md  (+2 -2)
crawler_sys/framework/craw_image_to_oss_daily_task.py  (+0 -0)
crawler_sys/framework/high_fre_releasers.py  (+0 -57)
crawler_sys/framework/platform_crawler_register.py  (+18 -14)
crawler_sys/framework/redis_interact.py  (+1 -1)
crawler_sys/framework/search_page_multi_process.py  (+17 -29)
crawler_sys/framework/search_page_single_process.py  (+3 -11)
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py  (+1 -1)
crawler_sys/framework/write_releasers_to_redis.py  (+1 -1)
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py  (+1 -1)
crawler_sys/scheduler/cal_ni_and_put_to_backend.py  (+1 -1)
crawler_sys/site_crawler/crawler_tudou.py  (+2 -2)
crawler_sys/site_crawler/crawler_zhihu.py  (+0 -0)
maintenance/send_email_with_file_auto_task.py  (+1 -1)
write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py  (+13 -0)
write_data_into_es/func_cal_doc_id.py  (+1 -0)
README.md
# crawler

## Releaser-page crawler (发布者页爬虫)

1. Deployed on BJ-GM-Prod-Cos-faiss001 under /srv/apps/ (cron entries are managed with crontab -e).
2. Switch user: sudo su - gmuser
3. source /root/anaconda3/bin/activate
4. Virtual environment: conda activate crawler_env (conda deactivate to leave it)
5. Fetch program: nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &
6. Program that writes crawl URLs to Redis: python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2

## Search-page crawler (搜索页爬虫)

pass
...
...
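As context for steps 5 and 6 above, the sketch below shows one way a crawl task could flow through Redis between write_releasers_to_redis.py (producer) and update_data_in_target_releasers_multi_process_by_date_from_redis.py (consumer). This is a minimal sketch, assuming a hypothetical list name and task fields; only the Redis host, port and db mirror values that appear in this commit.

# Hedged sketch: "releaser_task" and the task fields are illustrative assumptions.
import json
import redis

rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

# Producer side (roughly what write_releasers_to_redis.py would enqueue).
task = {"platform": "weibo", "releaserUrl": "https://weibo.com/u/1234567890"}  # made-up example
rds.rpush("releaser_task", json.dumps(task))

# Consumer side (roughly what the multi-process fetcher would pop).
raw = rds.lpop("releaser_task")
if raw is not None:
    print(json.loads(raw)["releaserUrl"])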
crawler_sys/framework/craw_image_to_oss_daily_task.py
deleted (100644 → 0)
crawler_sys/framework/high_fre_releasers.py
deleted (100644 → 0)
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 10 17:07:09 2018
@author: fangyucheng
"""
import sys
import argparse
import configparser
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg

parser = argparse.ArgumentParser(description='a special crawler framework for key customer')
parser.add_argument('-p', '--platform', default=[], action='append',
                    help=('legal platform name is required'))
parser.add_argument('-c', '--conf', default='/home/hanye/crawlersNew/crawler/crawler_sys/framework/config/high_fre.ini',
                    help=('absolute path of config file'))
parser.add_argument('-num', '--page_num', default=20, type=int,
                    help=('the number of scrolling page'))
args = parser.parse_args()

if args.platform != []:
    platform_list = args.platform
    for platform in platform_list:
        if platform not in platform_crawler_reg:
            print("%s is not a legal platform name" % platform)
            sys.exit(0)

config_file_path = args.conf
config = configparser.ConfigParser()
config.sections()
config.read(config_file_path)

releaser_page_num_max = args.page_num

ARGS_DICT = {"releaser_page_num_max": releaser_page_num_max,
             "output_to_es_raw": True,
             "output_es_index": "crawler-data-raw",
             "output_doc_type": "doc",
             "output_to_es_register": True}

for platform in platform_list:
    crawler_initialization = get_crawler(platform)
    crawler = crawler_initialization().releaser_page
    get_task_list = config[platform]
    TASK_LIST = []
    for key, value in get_task_list.items():
        TASK_LIST.append(value)
    pool = Pool(processes=20)
    for releaserUrl in TASK_LIST:
        pool.apply_async(func=crawler, args=(releaserUrl,), kwds=ARGS_DICT)
    pool.close()
    pool.join()
    print('Multiprocessing done for platform %s' % platform)
crawler_sys/framework/platform_crawler_register.py
...
...
@@ -18,24 +18,28 @@ from crawler.crawler_sys.site_crawler import (crawler_toutiao,
                                               crawler_mango,
                                               crawler_wangyi_news,
                                               crawler_kwai,
-                                              crawler_douyin
+                                              crawler_douyin,
+                                              crawler_zhihu,
                                               )
 from crawler.crawler_sys.site_crawler.crawler_weibo.crawler_weibo import Crawler_weibo

 platform_crawler_reg = {'toutiao': crawler_toutiao.Crawler_toutiao,
-                        '腾讯视频': crawler_v_qq.Crawler_v_qq,
-                        'iqiyi': crawler_iqiyi.Crawler_iqiyi,
-                        'youku': crawler_youku.Crawler_youku,
-                        'new_tudou': crawler_tudou.Crawler_tudou,
-                        'haokan': crawler_haokan.Crawler_haokan,
-                        '腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
-                        'miaopai': crawler_miaopai.Crawler_miaopai,
-                        'pearvideo': crawler_pear.Crawler_pear,
-                        'bilibili': crawler_bilibili.Crawler_bilibili,
-                        'Mango': crawler_mango,
-                        "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
-                        "kwai": crawler_kwai.Crawler_kwai,
-                        '抖音': crawler_douyin.Crawler_douyin,
+                        # '腾讯视频': crawler_v_qq.Crawler_v_qq,
+                        # 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
+                        # 'youku': crawler_youku.Crawler_youku,
+                        # 'new_tudou': crawler_tudou.Crawler_tudou,
+                        # 'haokan': crawler_haokan.Crawler_haokan,
+                        # '腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
+                        # 'miaopai': crawler_miaopai.Crawler_miaopai,
+                        # 'pearvideo': crawler_pear.Crawler_pear,
+                        # 'bilibili': crawler_bilibili.Crawler_bilibili,
+                        # 'Mango': crawler_mango,
+                        # "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
+                        # "kwai": crawler_kwai.Crawler_kwai,
+                        # '抖音': crawler_douyin.Crawler_douyin,
+                        "zhihu": crawler_zhihu.Crawler_zhihu,
+                        "weibo": Crawler_weibo
                         }
...
...
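The registry above is consumed via get_crawler elsewhere in this commit (for example in the deleted high_fre_releasers.py reproduced earlier). A minimal lookup sketch follows, assuming the keyword arguments accepted by releaser_page match the ARGS_DICT used there; the releaser URL is a placeholder.

# Sketch only: the URL and kwargs are assumptions based on high_fre_releasers.py.
from crawler.crawler_sys.framework.platform_crawler_register import (get_crawler,
                                                                     platform_crawler_reg)

platform = "zhihu"                              # must be a key of platform_crawler_reg
if platform in platform_crawler_reg:
    crawler_cls = get_crawler(platform)         # resolves the crawler class for the platform
    crawler = crawler_cls()
    crawler.releaser_page("https://www.zhihu.com/people/example",  # placeholder URL
                          releaser_page_num_max=20,
                          output_to_es_raw=True,
                          output_es_index="crawler-data-raw")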
crawler_sys/framework/redis_interact.py
...
...
@@ -8,7 +8,7 @@ import redis, json
 from crawler_sys.framework.platform_redis_register import get_redis_list_name
 from crawler_sys.framework.es_crawler import scan_crawler_url_register

-rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)


 def feed_url_into_redis(dict_Lst, expire=0,
...
...
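The same endpoint change (192.144.194.190 → 172.18.51.10) is repeated in several files below. Here is a hedged way to sanity-check the new endpoint from the deployment host; host, port and db come from the diff, everything else is illustrative.

# Connectivity smoke test for the Redis instance this commit switches to.
import redis

rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
try:
    print("redis reachable:", rds.ping())      # True if the server answers PONG
    print("keys in db 19:", rds.dbsize())      # rough sanity check on the selected db
except redis.exceptions.ConnectionError as err:
    print("cannot reach redis:", err)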
crawler_sys/framework/search_page_multi_process.py
...
...
@@ -10,17 +10,14 @@ from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
 from multiprocessing import Pool

 PARSER = argparse.ArgumentParser(description='video platform search page crawler')
 # PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
 #                                              '/crawler_sys/framework/config'
 #                                              '/search_keywords.ini'),
 #                     help=('config file absolute path'))
-PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
+PARSER.add_argument('-p', '--platform', default=["zhihu", "weibo", "toutiao"], action='append',
                     help=('legal platform name is required'))
 PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                     help=('key_word_legal platform name is required'))
-PARSER.add_argument('-w', '--output_to_es_raw', default=True,
+PARSER.add_argument('-w', '--output_to_es_raw', default=False,
                     help=('output to es raw'))
-PARSER.add_argument('-g', '--output_to_es_register', default=False,
+PARSER.add_argument('-g', '--output_to_es_register', default=True,
                     help=('output to es register'))
 PARSER.add_argument('-n', '--maxpage', default=20,
                     help=('maxpage'))
...
...
@@ -38,14 +35,8 @@ es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
 if ARGS.platform != []:
     PLATFORM_LIST = ARGS.platform
     # for platform in PLATFORM_LIST:
     #     if platform not in legal_platform_name:
     #         print("%s is not a legal platform name, "
     #               "program will exit" % platform)
     #         sys.exit(0)
 # CONFIG = configparser.ConfigParser()
 # CONFIG.read(ARGS.conf, encoding='utf-8')

 OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
 OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
...
...
@@ -96,6 +87,8 @@ def func_search_keywordlist(platform):
                 "m22王者之冠",
                 "bbl",
                 "胶原蛋白填充祛黑眼圈",
+                "热玛吉",
+                "热玛吉5代",
                 ]
     for l in res_list:
         res_dic[l] = 10
...
...
@@ -132,7 +125,6 @@ def func_search_keywordlist(platform):
 if OUTPUT_TO_ES_RAW is True:
     ES_INDEX = 'crawler-data-raw'
     # ES_INDEX = 'test2'
     DOC_TYPE = 'doc'
     print(ES_INDEX, DOC_TYPE)
 pages = ARGS.maxpage
...
...
@@ -140,8 +132,7 @@ pages = ARGS.maxpage
 def search_page_task(platform, output_to_es_raw,
                      output_to_es_register,
-                     es_index,
-                     doc_type):
+                     es_index):
     search_pages = []
     initialize_crawler = get_crawler(platform)
     crawler = initialize_crawler()
...
...
@@ -154,33 +145,30 @@ def search_page_task(platform, output_to_es_raw,
                                          search_pages_max=search_pages,
                                          output_to_es_raw=output_to_es_raw,
                                          output_to_es_register=output_to_es_register,
-                                         es_index=es_index,
-                                         doc_type=doc_type)
+                                         es_index=es_index)
         except Exception as e:
             print(e)
             continue

 ES_INDEX = "crawler-data-raw"
 result = []
 kwargs_dict = {
     'output_to_es_raw': OUTPUT_TO_ES_RAW,
     'output_to_es_register': OUTPUT_TO_ES_REGISTER,
     'es_index': ES_INDEX,
     'doc_type': DOC_TYPE,
 }
-pool = Pool(processes=4)
+# pool = Pool(processes=4)
 for platform in PLATFORM_LIST:
-    res = pool.apply_async(func=search_page_task,
-                           args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
-    result.append(res)
-pool.close()
-pool.join()
+    search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
+    # res = pool.apply_async(func=search_page_task,
+    #                        args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX))
+    # result.append(res)
+    # pool.close()
+    # pool.join()
 print('=================')
 for i in result:
     print(i.get())

 # config file absolute path in serve
 # '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
crawler_sys/framework/search_page_single_process.py
...
...
@@ -31,8 +31,7 @@ ARGS = PARSER.parse_args()
 es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
                              http_auth=('crawler', 'XBcasfo8dgfs'))
 index_target_releaser = 'search_keywords'
 doc_type_target_releaser = 'doc'

 if ARGS.platform != []:
     PLATFORM_LIST = ARGS.platform
...
...
@@ -98,15 +97,8 @@ for platform in PLATFORM_LIST:
                                          search_pages_max=search_pages,
                                          output_to_es_raw=OUTPUT_TO_ES_RAW,
                                          output_to_es_register=OUTPUT_TO_ES_REGISTER,
-                                         es_index=ES_INDEX,
-                                         doc_type=DOC_TYPE)
-            else:
-                crawler.search_video_page(keyword, None,
-                                          search_pages_max=search_pages,
-                                          output_to_es_raw=OUTPUT_TO_ES_RAW,
-                                          output_to_es_register=OUTPUT_TO_ES_REGISTER,
-                                          es_index=ES_INDEX,
-                                          doc_type=DOC_TYPE)
+                                         es_index=ES_INDEX,)
         except Exception as e:
             print(e)
             continue
...
...
crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py
...
...
@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
 # # 连接数据库
 # rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
-rds_1 = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
+rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-n', '--max_page', default=30, type=int,
...
...
crawler_sys/framework/write_releasers_to_redis.py
...
...
@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
 # 连接数据库
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
-rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

 parser = argparse.ArgumentParser(description='Specify a platform name.')
 parser.add_argument('-p', '--platform', default=[], action='append',
...
...
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py
...
...
@@ -23,7 +23,7 @@ import kdl, requests
 # slave = sentinel.discover_slaves('ida_redis_master')
 # # 连接数据库
 # rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
-rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=18, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)


 def get_proxy_from_redis():
...
...
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
...
...
@@ -26,7 +26,7 @@ from lxml.html.clean import Cleaner
 import random
 # from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
-rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
 # conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
 #                        db='mimas_test', charset='utf8')
...
...
crawler_sys/site_crawler/crawler_tudou.py
...
...
@@ -12,13 +12,13 @@ import time
 import copy
 import requests
 import datetime
-import aiohttp
+# import aiohttp
 import urllib
 try:
     from crawler_sys.framework.func_get_releaser_id import *
 except:
-    from func_get_releaser_id import *
+    from write_data_into_es.func_get_releaser_id import *
 from bs4 import BeautifulSoup
 from multiprocessing import Pool
 from multiprocessing import Process
...
...
crawler_sys/site_crawler/crawler_zhihu.py
maintenance/send_email_with_file_auto_task.py
...
...
@@ -4,7 +4,7 @@
 import redis, time, json, datetime, sys
 from maintenance.func_send_email_with_file import send_file_email

-rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
+rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)


 def write_email_task_to_redis(task_name=None, file_path=None, data_str=None, email_group=[],
...
...
write_data_into_es/calculate_doc_id/func_calculate_zhihu_id.py
new file (0 → 100644)
# -*- coding:UTF-8 -*-
# @Time : 2020/8/5 14:29
# @File : func_calculate_zhihu_id.py
# @email : litao@igengmei.com
# @author : litao
def calculate_douban_id(data_dic):
    if "answer" in data_dic["url"]:
        return data_dic["_id"].replace("zhihu_", "")
    else:
        return data_dic["url"]
\ No newline at end of file
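A hedged usage sketch for the new helper above: the sample records are invented, only the field names (_id, url) and the zhihu_ prefix come from the function itself. Note the helper keeps its douban-flavoured name even though func_cal_doc_id.py below registers it for zhihu.

# Illustrative only: the documents below are made up.
from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_douban_id

answer_doc = {"_id": "zhihu_123456789",
              "url": "https://www.zhihu.com/question/42/answer/123456789"}
article_doc = {"_id": "zhihu_zhuanlan_98765",
               "url": "https://zhuanlan.zhihu.com/p/98765"}

print(calculate_douban_id(answer_doc))    # "123456789" -- the "zhihu_" prefix is stripped
print(calculate_douban_id(article_doc))   # non-answer docs fall back to the raw url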
write_data_into_es/func_cal_doc_id.py
...
...
@@ -32,6 +32,7 @@ def vid_cal_func(platform):
         "haokan": calculate_haokan_id,
         "weibo": calculate_weibo_id,
         "douban": calculate_douban_id,
+        "zhihu": calculate_douban_id,
     }

     def general_vid_cal_func(url):
...
...