Commit 81cd3b51 authored by litaolemo

update

parent 233c49d3
# crawler
## Releaser page crawler
1. Deployed on BJ-GM-Prod-Cos-faiss001 under /srv/apps/, scheduled via crontab -e
2. Switch to the service user: sudo su - gmuser
3. source /root/anaconda3/bin/activate
4. Activate the virtual environment: conda activate crawler_env (leave it with conda deactivate)
5. Start the fetch process: nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &
6. Feed releaser URLs into Redis (a minimal sketch follows this list): python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
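Step 6 queues releaser page URLs in Redis for the fetch process from step 5 to consume. Below is a minimal sketch of that hand-off, assuming the Redis instance configured elsewhere in this commit (172.18.51.10, db 19) and a hypothetical per-platform list name; the real logic lives in write_releasers_to_redis.py.

```python
import json
import redis

# Redis instance taken from the rds definitions later in this commit.
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)

def push_releaser_urls(platform, releaser_urls):
    """Queue releaser page URLs so the fetch process (step 5) can consume them."""
    list_name = "%s_releaser_urls" % platform  # hypothetical list name
    for url in releaser_urls:
        rds.rpush(list_name, json.dumps({"platform": platform, "releaserUrl": url}))

push_releaser_urls("weibo", ["https://weibo.com/u/1234567890"])
```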
## Search page crawler
pass
......
This diff is collapsed.
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 10 17:07:09 2018

@author: fangyucheng
"""
import sys
import argparse
import configparser
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg

parser = argparse.ArgumentParser(description='a special crawler framework for key customers')
parser.add_argument('-p', '--platform', default=[], action='append',
                    help='legal platform name is required')
parser.add_argument('-c', '--conf', default='/home/hanye/crawlersNew/crawler/crawler_sys/framework/config/high_fre.ini',
                    help='absolute path of the config file')
parser.add_argument('-num', '--page_num', default=20, type=int,
                    help='the number of releaser pages to scroll')
args = parser.parse_args()

if args.platform != []:
    platform_list = args.platform
    for platform in platform_list:
        if platform not in platform_crawler_reg:
            print("%s is not a legal platform name" % platform)
            sys.exit(0)
else:
    print("at least one platform must be given with -p")
    sys.exit(0)

config_file_path = args.conf
config = configparser.ConfigParser()
config.read(config_file_path)

releaser_page_num_max = args.page_num
ARGS_DICT = {"releaser_page_num_max": releaser_page_num_max,
             "output_to_es_raw": True,
             "output_es_index": "crawler-data-raw",
             "output_doc_type": "doc",
             "output_to_es_register": True}

for platform in platform_list:
    crawler_initialization = get_crawler(platform)
    crawler = crawler_initialization().releaser_page
    get_task_list = config[platform]
    TASK_LIST = []
    for key, value in get_task_list.items():
        TASK_LIST.append(value)
    pool = Pool(processes=20)
    for releaserUrl in TASK_LIST:
        pool.apply_async(func=crawler, args=(releaserUrl,), kwds=ARGS_DICT)
    pool.close()
    pool.join()
    print('Multiprocessing done for platform %s' % platform)
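# For context: the loop above treats every value in the platform's section of
# high_fre.ini as a releaser page URL. A minimal sketch of that layout, using a
# hypothetical section content (the real file sits at the -c default path):
import configparser

SAMPLE_INI = """
[toutiao]
releaser_1 = https://www.toutiao.com/c/user/token/EXAMPLE1/
releaser_2 = https://www.toutiao.com/c/user/token/EXAMPLE2/
"""

sample_config = configparser.ConfigParser()
sample_config.read_string(SAMPLE_INI)
# Mirrors the framework loop: each value becomes one crawl task.
sample_task_list = [value for _key, value in sample_config["toutiao"].items()]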
......@@ -18,24 +18,28 @@ from crawler.crawler_sys.site_crawler import (crawler_toutiao,
crawler_mango,
crawler_wangyi_news,
crawler_kwai,
crawler_douyin
crawler_douyin,
crawler_zhihu,
)
from crawler.crawler_sys.site_crawler.crawler_weibo.crawler_weibo import Crawler_weibo
platform_crawler_reg = {
    'toutiao': crawler_toutiao.Crawler_toutiao,
    '腾讯视频': crawler_v_qq.Crawler_v_qq,
    'iqiyi': crawler_iqiyi.Crawler_iqiyi,
    'youku': crawler_youku.Crawler_youku,
    'new_tudou': crawler_tudou.Crawler_tudou,
    'haokan': crawler_haokan.Crawler_haokan,
    '腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
    'miaopai': crawler_miaopai.Crawler_miaopai,
    'pearvideo': crawler_pear.Crawler_pear,
    'bilibili': crawler_bilibili.Crawler_bilibili,
    'Mango': crawler_mango,
    "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
    "kwai": crawler_kwai.Crawler_kwai,
    '抖音': crawler_douyin.Crawler_douyin,
    # '腾讯视频': crawler_v_qq.Crawler_v_qq,
    # 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
    # 'youku': crawler_youku.Crawler_youku,
    # 'new_tudou': crawler_tudou.Crawler_tudou,
    # 'haokan': crawler_haokan.Crawler_haokan,
    # '腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
    # 'miaopai': crawler_miaopai.Crawler_miaopai,
    # 'pearvideo': crawler_pear.Crawler_pear,
    # 'bilibili': crawler_bilibili.Crawler_bilibili,
    # 'Mango': crawler_mango,
    # "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
    # "kwai": crawler_kwai.Crawler_kwai,
    # '抖音': crawler_douyin.Crawler_douyin,
    "zhihu": crawler_zhihu.Crawler_zhihu,
    "weibo": Crawler_weibo,
}
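# A hypothetical sketch of the lookup that get_crawler performs against the
# register above; the real implementation may differ, but the framework code in
# this commit only relies on it returning the platform's crawler class.
def get_crawler(platform_name):
    try:
        return platform_crawler_reg[platform_name]
    except KeyError:
        raise ValueError("%s is not a registered platform" % platform_name)

# Usage as in the releaser-page framework above:
#   crawler_cls = get_crawler("weibo")
#   crawler_cls().releaser_page(releaserUrl, **ARGS_DICT)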
......
......@@ -8,7 +8,7 @@ import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0,
......
......@@ -10,17 +10,14 @@ from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from multiprocessing import Pool
PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
PARSER.add_argument('-p', '--platform', default=["zhihu","weibo", "toutiao"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
PARSER.add_argument('-w', '--output_to_es_raw', default=False,
help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
PARSER.add_argument('-g', '--output_to_es_register', default=True,
help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
help=('maxpage'))
......@@ -38,14 +35,8 @@ es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
if ARGS.platform != []:
    PLATFORM_LIST = ARGS.platform
# for platform in PLATFORM_LIST:
# if platform not in legal_platform_name:
# print("%s is not a legal platform name, "
# "program will exit" % platform)
# sys.exit(0)
# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
......@@ -96,6 +87,8 @@ def func_search_keywordlist(platform):
"m22王者之冠",
"bbl",
"胶原蛋白填充祛黑眼圈",
"热玛吉",
"热玛吉5代",
]
for l in res_list:
res_dic[l] = 10
......@@ -132,7 +125,6 @@ def func_search_keywordlist(platform):
if OUTPUT_TO_ES_RAW is True:
ES_INDEX = 'crawler-data-raw'
# ES_INDEX = 'test2'
DOC_TYPE = 'doc'
print(ES_INDEX, DOC_TYPE)
pages = ARGS.maxpage
......@@ -140,8 +132,7 @@ pages = ARGS.maxpage
def search_page_task(platform, output_to_es_raw,
output_to_es_register,
es_index,
doc_type):
es_index):
search_pages = []
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
......@@ -154,33 +145,30 @@ def search_page_task(platform, output_to_es_raw,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
es_index=es_index)
except Exception as e:
print(e)
continue
ES_INDEX = "crawler-data-raw"
result = []
kwargs_dict = {
'output_to_es_raw': OUTPUT_TO_ES_RAW,
'output_to_es_register': OUTPUT_TO_ES_REGISTER,
'es_index': ES_INDEX,
'doc_type': DOC_TYPE,
}
pool = Pool(processes=4)
# pool = Pool(processes=4)
for platform in PLATFORM_LIST:
res = pool.apply_async(func=search_page_task,
args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
result.append(res)
pool.close()
pool.join()
search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
# res = pool.apply_async(func=search_page_task,
# args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX))
# result.append(res)
# pool.close()
# pool.join()
print('=================')
for i in result:
print(i.get())
# config file absolute path in serve
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
......@@ -31,8 +31,7 @@ ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'
if ARGS.platform != []:
    PLATFORM_LIST = ARGS.platform
......@@ -98,15 +97,8 @@ for platform in PLATFORM_LIST:
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
else:
crawler.search_video_page(keyword, None,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
es_index=ES_INDEX,)
except Exception as e:
print(e)
continue
......
......@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
# # 连接数据库
# rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
rds_1 = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-n', '--max_page', default=30, type=int,
......
......@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
......
......@@ -23,7 +23,7 @@ import kdl, requests
# slave = sentinel.discover_slaves('ida_redis_master')
# # 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=18, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
def get_proxy_from_redis():
......
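# The body of get_proxy_from_redis is omitted in this diff. A hypothetical
# sketch of what such a helper could look like, assuming proxies are stored as
# plain "ip:port" strings under a made-up key in db 18 and are returned in the
# dict shape that requests expects:
import random
import redis

proxy_rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)

def get_proxy_from_redis_sketch():
    proxies = proxy_rds.smembers("proxy_pool")  # hypothetical key name
    if not proxies:
        return None
    proxy = random.choice(list(proxies))
    return {"http": "http://" + proxy, "https": "https://" + proxy}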
......@@ -26,7 +26,7 @@ from lxml.html.clean import Cleaner
import random
# from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
# conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
# db='mimas_test', charset='utf8')
......
......@@ -12,13 +12,13 @@ import time
import copy
import requests
import datetime
import aiohttp
# import aiohttp
import urllib
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing import Process
......
This diff is collapsed.
......@@ -4,7 +4,7 @@
import redis,time,json,datetime,sys
from maintenance.func_send_email_with_file import send_file_email
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19,decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19,decode_responses=True)
def write_email_task_to_redis(task_name=None,file_path=None, data_str=None, email_group=[],
......
# -*- coding:UTF-8 -*-
# @Time : 2020/8/5 14:29
# @File : func_calculate_zhihu_id.py
# @email : litao@igengmei.com
# @author : litao
def calculate_douban_id(data_dic):
    if "answer" in data_dic["url"]:
        return data_dic["_id"].replace("zhihu_", "")
    else:
        return data_dic["url"]
\ No newline at end of file
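# Illustration of the two URL shapes the helper above distinguishes; the sample
# documents below are made up.
answer_doc = {"_id": "zhihu_123456789",
              "url": "https://www.zhihu.com/question/100/answer/123456789"}
article_doc = {"_id": "zhihu_987654321",
               "url": "https://zhuanlan.zhihu.com/p/987654321"}

calculate_douban_id(answer_doc)   # -> "123456789" (the _id without its "zhihu_" prefix)
calculate_douban_id(article_doc)  # -> the url itself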
......@@ -32,6 +32,7 @@ def vid_cal_func(platform):
"haokan":calculate_haokan_id,
"weibo":calculate_weibo_id,
"douban":calculate_douban_id,
"zhihu":calculate_douban_id,
}
def general_vid_cal_func(url):
......
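# With the "zhihu" entry added above, zhihu documents are routed through
# calculate_douban_id (defined in func_calculate_zhihu_id.py earlier in this
# commit), assuming vid_cal_func returns the function mapped to the platform:
#   vid_cal_func("zhihu")(answer_doc)  # -> "123456789"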