Commit d83f79a7 authored by litaolemo

update

parent 408c928f
/crawler_sys/framework/check_wrong_url_daily.py
*.pyc
# -*- coding:UTF-8 -*-
# @Time : 2020/7/21 9:51
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
# -*- coding: utf-8 -*-
"""
Created on Tue May 15 13:59:44 2018
@author: hanye
"""
[haokan]
看看新闻knews = https://haokan.baidu.com/haokan/wiseauthor?app_id=1565285080839434
一手video = https://haokan.baidu.com/haokan/wiseauthor?app_id=1546617034936582
生活欢乐汇 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462926329612
消息直升机 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022360094300
万物杂谈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022359827373
横漂一姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611821585605765
吃瓜少女萌 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822155649253
人人视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1563996947927117
追剧小师妹 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822302181315
新娱乐萌主 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611553722549281
探剧全能王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1610928916930156
青春影剧场 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611822943891552
肥仔电影圈 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1601813907799169
芒果tv = https://haokan.baidu.com/haokan/wiseauthor?app_id=1549963812551792
科技观察猿 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1567462927568986
撩剧大师 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1607132398047134
欧阳华北 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1547961620896856
澎湃新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1574072008111128
娱乐不晚点 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022346878545
看剧小资姐 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594361771699213
热剧宅急送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353698233619
毒舌影视街 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195426989529
江西网络广播电视台 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1577139229517527
热剧乐翻天 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594362729953997
全球视频大魔王 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1597149386529756
精彩剧集大放送 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593533634618523
影视水煮鱼 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594257498968349
财经新鲜事 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1555591483551002
酷哥撩大剧 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593525911983865
咸鱼说片 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594356024003023
安徽海豚播报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611201539330357
看剧大球球 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1593542564661281
长沙政法频道 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1552122607183011
体坛先锋报 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1589022347820284
综艺杂货铺 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195671591267
视频展览馆 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1611195775370668
钱江视频 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1602058994708441
动漫铲屎官 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1594353045050780
荔枝新闻 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1558731555412280
武侠超新世界 = https://haokan.baidu.com/haokan/wiseauthor?app_id=1613833957799396
[haokan]
经视大直播 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610033451422491
湖北卫视资讯站 = https://haokan.hao123.com/haokan/wiseauthor?app_id=3513
湖北经视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1573243419235544
湖北卫视长江新闻号 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1588754674509461
湖北卫视非正式会谈 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609570087894225
非正式课堂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1596696077980564
大王小湖北卫视 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609563211193403
长江云 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1604613511192710
调解面对面life = https://haokan.hao123.com/haokan/wiseauthor?app_id=1578861535741379
我为喜剧疯狂 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1610026230407548
湖北调解现场 = https://haokan.hao123.com/haokan/wiseauthor?app_id=1609575047119026
[legal_platforms_to_update_production]
legal_platforms = 腾讯新闻,kwai,miaopai,new_tudou,toutiao,haokan,腾讯视频,网易新闻,pearvideo,央视新闻+,人民日报,看了吗,youtube,facebook,新华社,youku,iqiyi,bilibili,抖音,toutiao_microheadlines,toutiao_article
[new_tudou]
推荐 = http://www.tudou.com/api/getfeeds?secCateId=10016&utdid=T8v9EQPOimUCAXL%2FAz0YrDOB&page_size=24
乐活 = http://www.tudou.com/api/getfeeds?secCateId=10195&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
搞笑 = http://www.tudou.com/api/getfeeds?secCateId=622736331&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
影视 = http://www.tudou.com/api/getfeeds?secCateId=622769673&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
动漫 = http://www.tudou.com/api/getfeeds?secCateId=10116&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
身边 = http://www.tudou.com/api/getfeeds?secCateId=622621940&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
综娱 = http://www.tudou.com/api/getfeeds?secCateId=10198&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
音乐 = http://www.tudou.com/api/getfeeds?secCateId=622336449&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
游戏 = http://www.tudou.com/api/getfeeds?secCateId=10051&utdid=XA2EFIGslWoCAWp4y3KXcZh7&page_size=24
[腾讯视频]
音乐 = http://v.qq.com/x/list/music
新闻 = http://v.qq.com/x/list/news
军事 = http://v.qq.com/x/list/military
娱乐 = http://v.qq.com/x/list/ent
体育 = http://v.qq.com/x/list/sports
游戏 = http://v.qq.com/x/list/games
搞笑 = http://v.qq.com/x/list/fun
时尚 = http://v.qq.com/x/list/fashion
生活 = http://v.qq.com/x/list/life
母婴 = http://v.qq.com/x/list/baby
汽车 = http://v.qq.com/x/list/auto
科技 = http://v.qq.com/x/list/tech
教育 = http://v.qq.com/x/list/education
财经 = http://v.qq.com/x/list/finance
房产 = http://v.qq.com/x/list/house
旅游 = http://v.qq.com/x/list/travel
王者荣耀 = http://v.qq.com/x/list/kings
[toutiao]
# only one list page; don't rely on the list page url
list_url = toutiao
[iqiyi]
纪录片 = http://list.iqiyi.com/www/3/-------------4-fangyucheng-2-iqiyi--.html
游戏 = http://list.iqiyi.com/www/8/-------------4-fangyucheng-2-iqiyi--.html
资讯 = http://list.iqiyi.com/www/25/-------------4-fangyucheng-2-iqiyi-1-.html
娱乐 = http://list.iqiyi.com/www/7/-------------4-fangyucheng-2-iqiyi-1-.html
财经 = http://list.iqiyi.com/www/24/-------------4-fangyucheng-2-iqiyi--.html
片花-电影 = http://list.iqiyi.com/www/10/1007-------------4-fangyucheng-2--1-.html
片花-电视剧 = http://list.iqiyi.com/www/10/1006-------------4-fangyucheng-2--1-.html
音乐 = http://list.iqiyi.com/www/5/-------------4-fangyucheng-2-iqiyi--.html
军事 = http://list.iqiyi.com/www/28/-------------4-fangyucheng-2-iqiyi-1-.html
教育 = http://list.iqiyi.com/www/12/-------------4-fangyucheng-2-iqiyi-1-.html
体育 = http://list.iqiyi.com/www/17/-------------4-fangyucheng-2-iqiyi--.html
[youku]
全部 = http://list.youku.com/category/video/c_0_d_1_s_2_p_fangyucheng.html
音乐 = http://list.youku.com/category/show/c_95_s_5_d_1_p_fangyucheng.html
资讯 = http://list.youku.com/category/video/c_91_d_1_s_2_p_fangyucheng.html
搞笑 = http://list.youku.com/category/video/c_94_d_1_s_2_p_fangyucheng.html
生活 = http://list.youku.com/category/video/c_103_d_1_s_2_p_fangyucheng.html
汽车 = http://list.youku.com/category/video/c_104_d_1_s_2_p_fangyucheng.html
科技 = http://list.youku.com/category/video/c_105_d_1_s_2_p_fangyucheng.html
时尚 = http://list.youku.com/category/video/c_89_d_1_s_2_p_fangyucheng.html
亲子 = http://list.youku.com/category/video/c_90_d_1_s_2_p_fangyucheng.html
旅游 = http://list.youku.com/category/video/c_88_d_1_s_2_p_fangyucheng.html
微电影 = http://list.youku.com/category/video/c_171_d_1_s_2_p_2_fangyucheng.html
网剧 = http://list.youku.com/category/video/c_172_d_1_s_2_p_2_fangyucheng.html
拍客 = http://list.youku.com/category/video/c_174_d_1_s_2_p_2_fangyucheng.html
创意视频 = http://list.youku.com/category/video/c_175_d_1_s_2_p_2_fangyucheng.html
自拍 = http://list.youku.com/category/video/c_176_d_1_s_2_p_2_fangyucheng.html
广告 = http://list.youku.com/category/video/c_102_d_1_s_2_p_2_fangyucheng.html
[v_qq]
list_page_html = v_qq_list_page_html
# For every keyword there should be a search_pages value, in the same order.
# There should be NO SPACE around the commas, because a keyword may itself contain a space,
# so spaces cannot simply be stripped out.
[腾讯新闻]
keywords = 致敬中国英雄
search_pages = 20
[腾讯视频]
keywords = 致敬中国英雄
search_pages = 20
[new_tudou]
keywords = 致敬中国英雄
search_pages = 20
[toutiao]
keywords = 致敬中国英雄
search_pages = 20
[youku]
keywords = 致敬中国英雄
search_pages = 2
[pearvideo]
keywords = 任正非 BBC
search_pages = 2
[bilibili]
keywords = 任正非 BBC
search_pages = 2
# For every keyword there should be a search_pages value, in the same order.
# There should be NO SPACE around the commas, because a keyword may itself contain a space,
# so spaces cannot simply be stripped out.
[腾讯新闻]
keywords = 2019东方卫视春晚,2019东方卫视跨年演唱会,BesTV百视通,SMG摄界,SMG阳阳STUDIO,第一财经,第一财经《财经早班车》,东方卫视,东方卫视-东方新闻,东方卫视-看东方,东方午新闻,动感101TV,话匣子,酱紫娱乐,交叉点看,究竟视频,剧说有毒,看东方,看看新闻Knews,可凡倾听,青春旅社,上海电视台新闻综合频道,新闻坊,游戏风云gamefy,最美公路,北京时间,时间财经,时间测评,时间国际视频,时间新闻,时间新闻视频,时间直播,大揭秘,大王小王湖北卫视,非正式课堂,湖北电视台《钓鱼频道》,湖北广电纪录片部,湖北经视,湖北调解面对面,经视大直播,经视好吃佬,经视乐生活,经视人家,调解现场,问新闻,笑啦,长江新闻号,长江新闻号湖北卫视,长江云,《奇兵神犬》,《我是大侦探》,风影车都,湖南电视剧频道,湖南电视台《风影车都》,湖南电影频道,湖南都市频道,湖南广播电视台茶频道,湖南经视,湖南卫视,湖南卫视芒果云,湖南卫视天天向上,湖南娱乐,幻乐之城,金鹰卡通卫视,快乐垂钓频道,芒果V直播,芒果都市,平民英雄,亲爱的客栈,亲爱的客栈第2季,我家那小子,我是未来,我想和你唱,欲望都市show,中餐厅,中餐厅第二季,江苏卫视官方帐号,江苏卫视幸福剧场,江苏新闻,江苏新闻广播,金曲捞,荔枝新闻,南京零距离,无限歌谣季,新闻眼,缘来非诚勿扰,动历史,老板联播,梨北京,梨青岛,梨视频,梨视频ING直播,梨视频微视,梨视频游戏,一手Video,澎湃视频,澎湃新闻,第一现场,深圳卫视,深圳卫视《军情直播间》,深圳卫视《正午30分》,深圳卫视军情直播间,深圳卫视正午30分,深圳卫视直播港澳台,正午30分,直播港澳台,新京报,新京报动新闻,新京报经济新闻,新京报书评周刊,1818黄金眼,2019浙江卫视领跑演唱会,FM988浙江新闻广播,奔跑吧兄弟 第4季,大冰小将,范大姐帮忙,钱江视频,熟悉的味道 第2季,喜剧总动员,喜剧总动员 第2季,小强实验室,异口同声,浙江广播电视台,浙江经视新闻,浙江台车行天下,浙江卫视,浙江卫视《新闻深一度》,浙江新闻频道,浙江之声,中国蓝TV,中国蓝新闻,中国蓝新闻蓝媒视频,看看新闻Knews,任正非 BBC
search_pages = 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,30,30,2
[腾讯视频]
keywords = 任正非 BBC
search_pages = 2
[new_tudou]
keywords = 任正非 BBC
search_pages = 2
[toutiao]
keywords = 任正非 BBC
search_pages = 2
[youku]
keywords = 任正非 BBC
search_pages = 2
[pearvideo]
keywords = 任正非 BBC
search_pages = 2
[bilibili]
keywords = 任正非 BBC
search_pages = 2
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 5 18:53:49 2018
@author: hanye
"""
#import redis
from crawler_sys.framework.platform_crawler_register import get_crawler
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.utils.output_results import output_result
import time
from crawler_sys.framework.redis_interact import rds
#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
seconds_to_sleep_between_waitings_for_redis_list = 60
def crawle_platform(platform,
write_into_file=False,
will_write_into_es=True):
Platform_crawler = get_crawler(platform)
if Platform_crawler == None:
print('Failed to get crawler for platform %s' % platform)
else:
crawler_instant = Platform_crawler()
redis_list = get_redis_list_name(platform)
video_Lst = []
crawler_counter = 0
if redis_list!=None:
while True:
url = rds.rpop(redis_list)
if url != None: # which means the url was fetched from redis successfully
video_dict = crawler_instant.video_page(url.decode())
if video_dict!=None:
video_Lst.append(video_dict)
crawler_counter += 1
else:
print('Empty redis list, wait...')
time.sleep(seconds_to_sleep_between_waitings_for_redis_list)
if crawler_counter%1000==0:
print('crawle_server: writing 1000 lines into es, '
'platform %s crawler_counter: %d'
%(platform, crawler_counter))
output_result(video_Lst, platform,
output_to_es=will_write_into_es)
video_Lst.clear()
if video_Lst!=[]:
output_result(video_Lst, platform,
output_to_es=will_write_into_es)
video_Lst.clear()
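# Hedged usage sketch: 'toutiao' is only an example platform name; it must be registered in
# platform_crawler_register and have a redis set name in platform_redis_register, otherwise
# nothing is crawled. Note that crawle_platform loops forever, sleeping while the redis
# list is empty.
if __name__ == '__main__':
    crawle_platform('toutiao', will_write_into_es=True)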
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 17:09:09 2018
@author: hanye
"""
from elasticsearch import Elasticsearch
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
index_short_video = 'short-video-production'
doc_type_short_video_DU = 'daily-url'
doc_type_short_video_ATU = 'all-time-url'
index_crawler_raw = 'crawler-data-raw'
doc_type_crawler_raw = 'doc'
index_url_register = 'crawler-url-register'
doc_type_url_register = 'doc'
fields_url_register = ['platform', 'url', 'video_id',
'release_time', 'timestamp']
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 17:20:57 2018
@author: hanye
"""
import time
from elasticsearch.helpers import scan
from crawler_sys.framework.es_ccr_index_defination import es_framework
from crawler_sys.framework.es_ccr_index_defination import index_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import doc_type_crawler_raw
from crawler_sys.framework.es_ccr_index_defination import index_url_register
from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
from crawler_sys.framework.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from crawler_sys.framework.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from crawler_sys.framework.func_calculate_wangyi_news_id import calculate_wangyi_news_id
def scan_crawler_raw_index(search_body):
total_hit, scan_resp = scan_index(index=index_crawler_raw,
doc_type=doc_type_crawler_raw,
search_body=search_body)
# search_resp = es_framework.search(index=index_crawler_raw,
# doc_type=doc_type_crawler_raw,
# body=search_body,
# size=0, request_timeout=100)
# total_hit = search_resp['hits']['total']
# print('Index: %s total hit: %d'
# % (index_crawler_raw, total_hit))
# if total_hit>0:
# scan_resp = scan(client=es_framework,
# query=search_body,
# index=index_crawler_raw,
# doc_type=doc_type_crawler_raw,
# request_timeout=300)
# else:
# print('Zero hit.')
# scan_resp = None
return (total_hit, scan_resp)
def scan_crawler_url_register(search_body):
total_hit, scan_resp = scan_index(index=index_url_register,
doc_type=doc_type_url_register,
search_body=search_body)
return (total_hit, scan_resp)
def scan_index(index, doc_type, search_body):
search_resp = es_framework.search(index=index,
doc_type=doc_type,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
print('Index: %s total hit: %d'
% (index, total_hit))
if total_hit > 0:
scan_resp = scan(client=es_framework,
query=search_body,
index=index,
doc_type=doc_type,
request_timeout=300)
else:
print('Zero hit.')
scan_resp = None
return (total_hit, scan_resp)
def construct_id_for_url_register(platform, url):
if platform == 'new_tudou':
vid_bare = calculate_newTudou_video_id(url)
vid = 'new_tudou_%s' % vid_bare
elif platform == 'toutiao':
vid_bare = calculate_toutiao_video_id(url)
vid = 'toutiao_%s' % vid_bare
elif platform == '腾讯新闻':
c_time = str(int(time.time()))
vid = "tencent_news_%s_%s" % (url, c_time)
elif platform == '网易新闻':
vid = "163_news_%s" % calculate_wangyi_news_id(url)
else:
vid_bare = url
vid = vid_bare
return vid
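# Hedged illustration: the urls below are made up and only show the doc-id shapes produced
# by each branch (a platform-prefixed id for registered platforms, the raw url otherwise).
if __name__ == '__main__':
    print(construct_id_for_url_register('new_tudou', 'http://new-play.tudou.com/v/abcdefg==.html'))
    print(construct_id_for_url_register('haokan', 'https://haokan.baidu.com/v?vid=1234567890'))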
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:14:32 2018
@author: hanye
"""
import datetime
import json
from elasticsearch import Elasticsearch
# from crawler_sys.framework.short_video_vid_cal_func import vid_cal_func
from write_data_into_es.func_cal_doc_id import cal_doc_id
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
index_short_video = 'short-video-production'
index_short_video_all_time_url = 'short-video-all-time-url'
doc_type_short_video_DU = 'daily-url'
doc_type_short_video_ATU = 'all-time-url'
def cal_vid(platform, url):
pass
def bulk_write_short_video(dict_Lst,
index=index_short_video,
index_all_time_url=index_short_video_all_time_url,
doc_type_daily=doc_type_short_video_DU,
doc_type_ATU=doc_type_short_video_ATU,
write_daily=True,
write_ATU=True):
"""
If not explicitly specified, both daily-url and all-time-url will be written.
"""
if write_daily is False and write_ATU is False:
return None
else:
bulk_write_SV_bd_daily = ''
bulk_write_SV_bd_ATU = ''
write_counter = 0
for line in dict_Lst:
try:
url = line['url']
platform = line['platform']
fetch_time_ts = line['fetch_time']
fetch_time_T = datetime.datetime.fromtimestamp(fetch_time_ts / 1e3)
fetch_time_day_str = fetch_time_T.isoformat()[:10]
id_daily = cal_doc_id(platform, url=url, doc_id_type='daily-url',data_dict=line,fetch_day_str=fetch_time_day_str)
id_ATU = cal_doc_id(platform, url=url, doc_id_type='all-time-url',data_dict=line)
data_str = json.dumps(line, ensure_ascii=False)
if write_daily is True:
action_str_daily = '{"index": {"_id":"%s"}}' % id_daily
line_body_for_daily = action_str_daily + '\n' + data_str + '\n'
bulk_write_SV_bd_daily += line_body_for_daily
if write_ATU is True:
action_str_ATU = '{"index": {"_id":"%s"}}' % id_ATU
line_body_for_ATU = action_str_ATU + '\n' + data_str + '\n'
bulk_write_SV_bd_ATU += line_body_for_ATU
write_counter += 1
except:
pass
t1 = datetime.datetime.now()
if write_daily is True and bulk_write_SV_bd_daily != '':
bulk_resp = es_framework.bulk(index=index, doc_type=doc_type_daily,
body=bulk_write_SV_bd_daily,
request_timeout=200)
if bulk_resp['errors'] is True:
print(bulk_resp)
t2 = datetime.datetime.now()
td = t2 - t1
# print(bulk_resp)
print('written %d lines into %s, costs %s,'
% (write_counter, doc_type_daily, td),
datetime.datetime.now())
bulk_write_SV_bd_daily = ''
t3 = datetime.datetime.now()
if write_ATU is True and bulk_write_SV_bd_ATU != '':
es_framework.bulk(index=index_all_time_url, doc_type=doc_type_ATU,
body=bulk_write_SV_bd_ATU, request_timeout=200)
t4 = datetime.datetime.now()
tdd = t4 - t3
print('written %d lines into %s, costs %s,'
% (write_counter, doc_type_ATU, tdd),
datetime.datetime.now())
bulk_write_SV_bd_ATU = ''
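# Hedged usage sketch: the data dict is made up, and this call performs a real bulk request
# against the configured es cluster. The function itself only reads url, platform and
# fetch_time (a millisecond timestamp); cal_doc_id may require additional fields depending
# on the platform.
if __name__ == '__main__':
    demo_line = {'platform': 'toutiao',
                 'url': 'https://www.toutiao.com/group/1234567890123456789/',
                 'fetch_time': 1595300000000}
    bulk_write_short_video([demo_line], write_daily=True, write_ATU=False)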
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:13:14 2018
@author: hanye
"""
import json #, redis
import random
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
#rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
def bulk_write_target_releasers(dict_Lst,
index=index_target_releaser,
doc_type=doc_type_target_releaser):
bulk_write_body=''
write_counter=0
for line in dict_Lst:
write_counter+=1
try:
releaser=line['releaser']
platform=line['platform']
doc_id_releaser='%s_%s' % (platform, releaser)
action_str=('{ "index" : { "_index" : "%s", "_type" : "%s","_id" : "%s" } }'
% (index_target_releaser, doc_type_target_releaser, doc_id_releaser) )
data_str=json.dumps(line, ensure_ascii=False)
line_body = action_str + '\n' + data_str + '\n'
bulk_write_body += line_body
except:
print('ill-formed data', line)
if write_counter%1000==0 or write_counter==len(dict_Lst):
print('Writing into es %d/%d' % (write_counter, len(dict_Lst)))
if bulk_write_body!='':
es_framework.bulk(body=bulk_write_body, request_timeout=100)
def get_releaserUrls_from_es(platform,
releaser=None,
frequency=None,
target_index=None,
project_tags=[]):
search_body = {"query": {"bool": {"filter": [{"term": {"platform.keyword": platform}}]}}}
if releaser is not None:
releaser_dict = {"term": {"releaser.keyword": releaser}}
search_body['query']['bool']['filter'].append(releaser_dict)
if frequency is not None:
frequency_dict = {"range": {"frequency": {"gte": frequency}}}
search_body['query']['bool']['filter'].append(frequency_dict)
if project_tags:
frequency_dict = {"terms":{"project_tags.keyword":project_tags}}
search_body['query']['bool']['filter'].append(frequency_dict)
# print(target_index,doc_type_target_releaser,search_body)
search_resp=es_framework.search(index=target_index,
doc_type=doc_type_target_releaser,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
releaserUrl_Lst = []
if total_hit > 0:
print('Got %d releaserUrls for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body,
index=target_index,
doc_type=doc_type_target_releaser,
request_timeout=200)
for line in scan_resp:
try:
releaserUrl = line['_source']['releaserUrl']
releaser = line['_source']['releaser']
releaserUrl_Lst.append((releaserUrl,releaser))
except:
print('error in :' ,line)
continue
else:
print('Got zero hits.')
return releaserUrl_Lst
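# Hedged usage sketch: returns a list of (releaserUrl, releaser) tuples for the platform;
# target_index defaults to None, so the index name defined above is passed in explicitly.
if __name__ == '__main__':
    url_tuples = get_releaserUrls_from_es(platform='toutiao',
                                          frequency=3,
                                          target_index=index_target_releaser)
    print(len(url_tuples))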
# -*- coding:utf-8 -*-
# @Time : 2019/7/16 16:08
# @Author : litao
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import re
def calculate_douyin_id(url):
if "?" in url:
find_vid = url.split("?")
elif "video" in url:
find_vid = re.findall('/video/(.*?)/', url)
if find_vid:
find_vid = ["https://www.iesdouyin.com/share/video/%s/" % find_vid[0]]
else:
return url
if find_vid != []:
vid = find_vid[0]
else:
vid = url
return vid
if __name__=='__main__':
print(calculate_douyin_id("https://www.iesdouyin.com/share/vido/6688242923181591821/?mid=6688519042262665996"))
print(calculate_douyin_id("https://www.iesdouyin.com/share/video/6689249077596671245/?mid=6689052145968450308"))
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 6 15:53:10 2018
@author: zhouyujiang
"""
import re
def calculate_kwai_video_id_by_data_by_url(kwai_url):
doc_id_str = re.findall(r"/u/(.+)?",kwai_url)
if doc_id_str!=[]:
vid = doc_id_str[0].replace('/','_')
return vid
else:
return None
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 16:40:20 2017
@author: hanye
"""
import re
def calculate_newTudou_video_id(newTudou_url):
try:
d_url_s_Lst = newTudou_url.split('.html')
d_videoID = d_url_s_Lst[0]
newTudou_video_id = re.findall(r"/\w/(.+)?", d_videoID)[0]
except:
newTudou_video_id = None
return newTudou_video_id
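# Hedged example with a made-up url: the segment after the single-letter path element
# (e.g. /v/ or /i/) and before .html is taken as the video id.
if __name__ == '__main__':
    print(calculate_newTudou_video_id('http://new-play.tudou.com/v/abcdefg==.html'))  # abcdefg==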
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 6 09:54:09 2017
@author: hanye
"""
import re
def calculate_toutiao_video_id(toutiao_url):
if toutiao_url[-1] != '/':
toutiao_url = toutiao_url + '/'
find_vid = re.findall('[0-9]+/', toutiao_url)
if find_vid!=[]:
vid = find_vid[0].replace('/', '')
return vid
else:
return None
from crawler_sys.framework.func_get_releaser_id import get_releaser_id
def calculate_txxw_video_id(data_dict):
try:
releaser_id = get_releaser_id(platform="腾讯新闻", releaserUrl=data_dict["releaserUrl"])
video_id = data_dict['video_id']
if releaser_id:
return video_id + "_" +releaser_id
else:
return video_id
except:
print('error in :', data_dict)
return None
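# Hedged example: the toutiao url is made up; the first digit run followed by a slash is
# taken as the video id. calculate_txxw_video_id additionally needs a dict carrying the
# releaserUrl and video_id fields, so only the url-based helper is demonstrated here.
if __name__ == '__main__':
    print(calculate_toutiao_video_id('https://www.toutiao.com/group/6543210987654321/'))  # 6543210987654321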
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 6 09:54:09 2017
@author: hanye
"""
import re
def calculate_v_qq_video_id(v_qq_page_url):
find_vid = re.findall('/[0-9a-zA-Z]+.html', v_qq_page_url)
if find_vid != []:
vid = find_vid[0].split('/')[1].split('.')[0]
else:
vid = v_qq_page_url
return vid
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import re
def calculate_wangyi_news_id(url):
if "/sub/" in url:
find_vid = re.findall('/sub/(.+)\.html', url)
elif "/v/" in url:
find_vid = re.findall('/v/(.+)\.html', url)
else:
return url
if find_vid != []:
vid = find_vid[0]
else:
vid = url
return vid
if __name__=='__main__':
print(calculate_wangyi_news_id("https://c.m.163.com/news/v/VA9LBOJ7S.html"))
print(calculate_wangyi_news_id("https://c.m.163.com/news/sub/T1539761239294.html"))
# -*- coding:utf-8 -*-
# @Time : 2019/9/26 14:50
# @Author : litao
import datetime
def week_num(year=None, cycle=None, cycle_num=None, compare_type=None):
now = datetime.datetime.now()
now_canlendar = now.isocalendar()
if not cycle_num:
week_canlendar = now_canlendar
else:
week_canlendar = (now.year, cycle_num + 1, 0)
year = week_canlendar[0]
this_week = week_canlendar[1] - 1
if this_week == 0:
last_year = year - 1
this_week = 1
else:
last_year = year
if this_week == 1:
last_week = "52"
else:
last_week = this_week - 1
today = datetime.datetime(datetime.datetime.now().year, datetime.datetime.now().month, datetime.datetime.now().day)
# today = datetime.datetime(year=2018, month=12, day=25)
first_day_in_week = today - datetime.timedelta(
days=now_canlendar[2] + 7 * (now_canlendar[1] - week_canlendar[1] + 1))
fisrt_day_ts = int(first_day_in_week.timestamp() * 1e3)
last_day_in_week = first_day_in_week + datetime.timedelta(days=7)
last_day_ts = int(last_day_in_week.timestamp() * 1e3)
this_week_index = 'short-video-weekly'
this_week_doc = 'daily-url-' + str(year) + '_w' + format(this_week, '>02d') + '_s1'
last_week_index = 'releaser-weekly-short-video'
last_week_doc = 'doc'
if compare_type == "new_released":
this_week_index = last_week_index
this_week_doc = last_week_doc
return this_week_index, this_week_doc, last_week_index, last_week_doc, fisrt_day_ts, last_day_ts, this_week, last_week, last_year
def month_num(year=None, cycle=None, cycle_num=None, compare_type=None):
now = datetime.datetime.now()
if not year:
year = now.year
if not cycle_num:
this_mon = now.month - 1 if now.month != 1 else 12
last_mon = this_mon - 1 if this_mon > 1 else this_mon - 1 + 12
if this_mon == 12:
last_year = year - 1
else:
last_year = year
else:
this_mon = cycle_num
last_mon = cycle_num - 1 if this_mon > 1 else this_mon - 1 + 12
if this_mon == 1:
last_year = year - 1
else:
last_year = year
first_day_ts = int(datetime.datetime(year=last_year, month=this_mon, day=1).timestamp() * 1e3)
if this_mon == 12:
next_year = last_year + 1
next_month = 1
else:
next_year = year
next_month = this_mon + 1
last_day_ts = int(datetime.datetime(year=next_year, month=next_month, day=1).timestamp() * 1e3)
this_mon_index = "short-video-production-%s" % last_year
this_mon_doc = "daily-url-%s" % (
datetime.datetime(year=last_year, month=next_month, day=1) + datetime.timedelta(days=-1)).strftime(
"%Y-%m-%d")
last_mon_index = "releaser"
last_mon_doc = "releasers"
if compare_type == "new_released":
this_mon_index = last_mon_index
this_mon_doc = last_mon_doc
return this_mon_index, this_mon_doc, last_mon_index, last_mon_doc, first_day_ts, last_day_ts, this_mon, last_mon, last_year
def quarter_num(year=None, cycle=None, cycle_num=None, compare_type=None):
now = datetime.datetime.now()
if not cycle_num:
this_quarter = int(now.month / 3) + 1
else:
this_quarter = cycle_num
last_quarter = this_quarter - 1 if this_quarter > 1 else 4
if last_quarter == 4:
last_year = year - 1
else:
last_year = year
first_day_ts = int(datetime.datetime(year=year, month=(this_quarter - 1) * 3 + 1, day=1).timestamp() * 1e3)
last_day_ts = int(datetime.datetime(year=year, month=this_quarter * 3 + 1, day=1).timestamp() * 1e3)
this_quarter_index = "short-video-quarter-%s" % year
this_quarter_doc = "daily-url-2019-Q%s" % this_quarter
last_quarter_index = "releaser"
last_quarter_doc = "releasers-%s-Q%s" % (last_year, last_quarter)
if compare_type == "new_released":
this_quarter_index = last_quarter_index
this_quarter_doc = last_quarter_doc
return this_quarter_index, this_quarter_doc, last_quarter_index, last_quarter_doc, first_day_ts, last_day_ts, this_quarter, last_quarter, last_year
def func_get_doc_and_timestmp(year=None,cycle="week",cycle_num=None,compare_type=None):
if cycle == "week":
this_cycle_index, this_cycle_doc, last_cycle_index, last_cycle_doc, fisrt_day_ts, last_day_ts, this_cycle, last_cycle, last_year = week_num(
year, cycle, cycle_num, compare_type)
elif cycle == "month":
this_cycle_index, this_cycle_doc, last_cycle_index, last_cycle_doc, fisrt_day_ts, last_day_ts, this_cycle, last_cycle, last_year = month_num(
year, cycle, cycle_num, compare_type)
elif cycle == "quarter":
this_cycle_index, this_cycle_doc, last_cycle_index, last_cycle_doc, fisrt_day_ts, last_day_ts, this_cycle, last_cycle, last_year = quarter_num(
year, cycle, cycle_num, compare_type)
elif cycle == "year":
pass
elif cycle == "all-time":
return "short-video-all-time-url","all-time-url",None,None
else:
return None,None,None,None
return this_cycle_index,this_cycle_doc,fisrt_day_ts,last_day_ts
if __name__ == "__main__":
print(func_get_doc_and_timestmp())
# -*- coding:utf-8 -*-
# @Time : 2019/5/30 11:01
# @Author : litao
import re, requests
try:
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
except:
pass
def toutiao(releaserUrl,**kwargs):
if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
pattern = 'user/[0-9]+'
re_find = re.findall(pattern, releaserUrl)
if re_find != []:
releaser_id = re_find[0].split('/')[1]
else:
pattern = 'to_user_id=[0-9]+'
re_find = re.findall(pattern, releaserUrl)
if re_find != []:
releaser_id = re_find[0].split('=')[1]
else:
re_find = re.findall("/m(\d+)", releaserUrl)
if re_find:
return re_find[0]
else:
releaser_id = None
return releaser_id
elif 'm.toutiao.com' in releaserUrl:
pattern = 'profile/[0-9]+'
re_find = re.findall(pattern, releaserUrl)
if re_find != []:
releaser_id = re_find[0].split('/')[1]
return releaser_id
elif 'm.365yg.com' in releaserUrl:
pattern = 'to_user_id=[0-9]+'
re_find = re.findall(pattern, releaserUrl)
if re_find != []:
releaser_id = re_find[0].split('=')[1]
else:
releaser_id = None
return releaser_id
elif "user_id" in releaserUrl:
re_find = re.findall("user_id=(\d+)",releaserUrl)
if re_find:
return re_find[0]
else:
return None
else:
re_find = re.findall("(\d+)", releaserUrl)
if re_find:
return re_find[0]
else:
return None
def haokan(releaserUrl,**kwargs):
if "app_id=" in releaserUrl:
releaser_id_str = ' '.join(re.findall('app_id=.*', releaserUrl))
releaser_id = ' '.join(re.findall('\d+', releaser_id_str))
return releaser_id
elif "app_id" in releaserUrl:
try:
releaser_id_str = re.findall("%22(\d+)%22", releaserUrl)[0]
if releaser_id_str:
return releaser_id_str
except:
releaser_id_str = re.findall('"(\d+)"', releaserUrl)[0]
if releaser_id_str:
return releaser_id_str
else:
releaser_id_str = re.findall('(\d+)', releaserUrl)[0]
if releaser_id_str:
return releaser_id_str
def tengxunshipin(releaserUrl,is_qq=False,**kwargs):
if not is_qq:
try:
releaser_id = re.findall("vplus/(.*)", releaserUrl)[0]
if len(releaser_id) == 32:
return releaser_id
else:
if "#" in releaser_id:
releaser_id = releaser_id.split("#")[0]
if len(releaser_id) == 32 or len(releaser_id) == 16:
return releaser_id
if "/videos" in releaser_id:
releaser_id = releaser_id.split("/videos")[0]
if len(releaser_id) == 32 or len(releaser_id) == 16:
return releaser_id
proxies = get_proxy(1)
get_page = requests.get(releaserUrl, timeout=5,proxies=proxies)
get_page.encoding = 'utf-8'
page = get_page.text
try:
USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# releaser = re.findall("name: '(.*)',", USER_INFO)[0]
releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# number_id = re.findall("number: '(.*)',", USER_INFO)[0]
except:
return None
return releaser_id
except:
return None
else:
proxies = get_proxy(1)
get_page = requests.get(releaserUrl, timeout=2,proxies=proxies)
get_page.encoding = 'utf-8'
page = get_page.text
try:
USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
releaser = re.findall("name: '(.*)',", USER_INFO)[0]
releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
number_id = re.findall("number: '(.*)',", USER_INFO)[0]
except:
return None
D0 = {'releaser': releaser,
'releaser_id': releaser_id,
"number_id": number_id}
return D0
def new_tudou(releaserUrl,**kwargs):
if "?" in releaserUrl:
releaserUrl = releaserUrl.split("?")[0]
if "=" in releaserUrl:
releaserUrl = releaserUrl.replace("=","")
try:
if 'videos' in releaserUrl:
releaser_id_str = ' '.join(re.findall('i/.*/videos', releaserUrl))
releaser_id = releaser_id_str.split('/')[1]
return releaser_id
elif releaserUrl[-1] == "/":
releaserUrl = releaserUrl[0:-1]
releaser_id_str = ''.join(re.findall('i/(.*)', releaserUrl))
releaser_id = releaser_id_str
return releaser_id
else:
releaser_id = releaserUrl.split("/")[-1]
return releaser_id
except:
return None
def douyin(releaserUrl,**kwargs):
try:
releaser_id = re.findall("user/(\d+)",releaserUrl)[0]
except:
print(releaserUrl)
return None
return releaser_id
def tencent_news(releaserUrl,**kwargs):
releaserUrl = str(releaserUrl)
try:
if "media/" in releaserUrl:
res = re.findall(r"media/(\d+)", releaserUrl)
if res:
return res[0]
else:
pattern = 'media/[0-9]+'
re_find = re.findall(pattern, releaserUrl)
if re_find != []:
releaser_id = re_find[0].split('/')[1]
else:
releaser_id = False
return releaser_id
else:
res = re.findall(r"chlid=(\d+)", releaserUrl)
if res:
return res[0]
except:
return False
def miaopai(releaserUrl,**kwargs):
if 'n.miaopai.com' in releaserUrl:
releaser_id_str = releaserUrl.split('/')[-1]
releaser_id = releaser_id_str.replace('.html', '')
releaser_id = releaser_id.replace('.htm', '')
return releaser_id
else:
print("input illegal releaserUrl %s" % releaserUrl)
return None
def kwai(releaserUrl,**kwargs):
if "profile" in releaserUrl:
res = re.findall(r"/profile/(.+)", releaserUrl)
if res:
return res[0]
else:
return ""
elif "/u/" in releaserUrl:
res = re.findall(r"/u/(.+)/", releaserUrl)
if res:
return res[0]
else:
return ""
def wangyi_news(releaserUrl,**kwargs):
if "/sub/" in releaserUrl:
res = re.findall(r"/sub/(.+)\.html", releaserUrl)
if res:
return res[0]
else:
return None
elif "video" in releaserUrl:
res = re.findall(r"/list/(.+)/video", releaserUrl)
if res:
return res[0]
else:
return None
elif "all" in releaserUrl:
res = re.findall(r"/list/(.+)/all", releaserUrl)
if res:
return res[0]
else:
return None
plantform_func = {
"toutiao": toutiao,
"haokan": haokan,
"腾讯视频": tengxunshipin,
"new_tudou": new_tudou,
"腾讯新闻": tencent_news,
"miaopai": miaopai,
"kwai": kwai,
"网易新闻": wangyi_news,
"抖音":douyin,
}
def get_releaser_id(platform=None, releaserUrl=None,is_qq=False):
if platform and releaserUrl:
if platform in plantform_func:
func = plantform_func[platform]
res = func(releaserUrl,is_qq=is_qq)
try:
if res:
return res
else:
print(platform, releaserUrl, "can't get releaser_id")
return None
except:
return None
else:
# print(plantform," not in target list")
return None
if __name__ == "__main__":
# file = r'D:\work_file\发布者账号\SMG.csv'
# with open(file, 'r')as f:
# head = f.readline()
# head_list = head.strip().split(',')
# for i in f:
# line_list = i.strip().split(',')
# line_dict = dict(zip(head_list, line_list))
# platform = line_dict['platform']
# releaser = line_dict['releaser']
# try:
# releaserUrl = line_dict['releaserUrl']
# if platform == 'new_tudou':
# if releaserUrl[-2:] == '==':
# releaserUrl = releaserUrl + '/videos'
# line_dict['releaserUrl'] = releaserUrl
# except:
# pass
# releaser_id = get_releaser_id(platform=platform, releaserUrl=releaserUrl)
# print(platform, releaserUrl, releaser_id)
releaser_id= get_releaser_id("腾讯新闻","https://r.inews.qq.com/getUserVideoList?chlid=5362294&page_time=&coral_uin=ec8bb1459b9d84100312bf035bb43cd4d0&coral_uid=&type=om&uid=7313ae71df9e5367&omgid=&trueVersion=5.8.00&qimei=287801615436009&devid=008796749793280&appver=23_android_5.8.00&qn-rid=9ec6d3f9-d341-4138-b4e2-6b2ed4b98b5b&qn-sig=891289f9217ec9623723c024dd00eaf5")
print(releaser_id)
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 10:52:58 2018
tested on toutiao
@author: hanye
"""
import re
from crawler_sys.utils.output_results import retry_get_url
def get_redirected_resp(ori_url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,en-US;q=0.7,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
get_host_str = re.findall('://.+?/', ori_url)
if get_host_str!=[]:
host_str = get_host_str[0].replace(':', '').replace('/', '')
headers['Host'] = host_str
else:
pass
# must use allow_redirects=False to avoid automatic redirects by the requests package
r_1st = retry_get_url(ori_url, headers=headers, allow_redirects=False)
print('Respond from direct get: %s' % r_1st)
if r_1st!=None:
if 'location' in r_1st.headers:
print('location in response headers: %s' % r_1st.headers['location'])
redirect_url = r_1st.headers['location']
else:
print('There is no location field in response headers, '
'will check the response.history attribute.')
if len(r_1st.history)>0:
history_headers = r_1st.history[0].headers
if 'Location' in history_headers:
redirect_url = history_headers['Location']
print('Original url %s redirected to %s' % (ori_url, redirect_url))
print('response history: %s\n' % r_1st.history)
else:
redirect_url = None
else:
print('No further redirects.')
redirect_url = ori_url
return r_1st
if redirect_url!=None and redirect_url!=ori_url:
redirect_url = get_redirected_resp(redirect_url)
return redirect_url
else:
return None
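# Hedged usage sketch: the url is made up; the function follows Location headers (or the
# response.history fallback) recursively and returns the final response object, or None.
if __name__ == '__main__':
    final_resp = get_redirected_resp('https://www.toutiao.com/a1234567890123456789/')
    print(final_resp)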
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 10 17:07:09 2018
@author: fangyucheng
"""
import sys
import argparse
import configparser
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
parser = argparse.ArgumentParser(description='a special crawler framework for key customer')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('legal platform name is required'))
parser.add_argument('-c', '--conf', default='/home/hanye/crawlersNew/crawler/crawler_sys/framework/config/high_fre.ini',
help=('absolute path of config file'))
parser.add_argument('-num', '--page_num', default=20, type=int,
help=('the number of scrolling page'))
args = parser.parse_args()
if args.platform != []:
platform_list = args.platform
for platform in platform_list:
if platform not in platform_crawler_reg:
print("%s is not a legal platform name" % platform)
sys.exit(0)
config_file_path = args.conf
config = configparser.ConfigParser()
config.sections()
config.read(config_file_path)
releaser_page_num_max = args.page_num
ARGS_DICT = {"releaser_page_num_max": releaser_page_num_max,
"output_to_es_raw": True,
"output_es_index": "crawler-data-raw",
"output_doc_type": "doc",
"output_to_es_register": True}
for platform in platform_list:
crawler_initialization = get_crawler(platform)
crawler = crawler_initialization().releaser_page
get_task_list = config[platform]
TASK_LIST = []
for key, value in get_task_list.items():
TASK_LIST.append(value)
pool = Pool(processes=20)
for releaserUrl in TASK_LIST:
pool.apply_async(func=crawler, args=(releaserUrl,), kwds=ARGS_DICT)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 12 19:03:43 2018
@author: fangyucheng
"""
#import os
import sys
import argparse
import configparser
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description="crawler for video platform list page")
parser.add_argument('-p', '--platform', default=[], type=str, action='append',
help=('legal platform name is required'))
parser.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
'/crawler_sys/framework/config'
'/list_page_urls.ini'),
type=str, help=('absolute path'))
parser.add_argument('-ch', '--channel', default=[], action='append', type=str,
help=('input all of the channels you want to scrape,'
' while no input means all channels'))
parser.add_argument('-fp', '--file_path', default='', type=str,
help=('Output data to file, default to None'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write url register data into es or not, default to True'))
args = parser.parse_args()
PLATFORM_LIST = []
if args.platform != []:
PLATFORM_LIST = args.platform
for platform in PLATFORM_LIST:
if platform not in platform_crawler_reg:
print("%s is not a legal platform name,"
"program will exit" % platform)
sys.exit(0)
else:
for key, value in platform_crawler_reg.items():
PLATFORM_LIST.append(key)
PLATFORM_LIST.remove('haokan')
PLATFORM_LIST.remove('腾讯新闻')
PLATFORM_LIST.remove('miaopai')
if args.channel != []:
CHANNEL_LIST = args.channel
else:
CHANNEL_LIST = []
config = configparser.RawConfigParser()
config.sections()
config.read(filenames=args.conf, encoding='utf-8')
TASK_DICT = {}
for platform in PLATFORM_LIST:
if CHANNEL_LIST == []:
TASK_DICT[platform] = [value for key, value in config[platform].items()]
else:
LIST_URL_LIST = []
for channel in CHANNEL_LIST:
try:
LIST_URL_LIST.append(config[platform][channel])
except:
print("There is no channel named %s in platform %s"
% (channel, platform))
if LIST_URL_LIST != []:
TASK_DICT[platform] = LIST_URL_LIST
FILE_PATH = args.file_path
if FILE_PATH == '':
FILE_PATH = None
OUTPUT_TO_FILE = False
else:
OUTPUT_TO_FILE = True
PUSH_TO_REDIS = parse_bool_for_args(args.push_to_redis)
OUTPUT_TO_ES_RAW = parse_bool_for_args(args.output_to_es_raw)
OUTPUT_TO_ES_REGISTER = parse_bool_for_args(args.output_to_es_register)
ES_INDEX = 'crawler-data-raw'
DOC_TYPE = 'doc'
#KWARGS_DICT = {'output_to_file': OUTPUT_TO_FILE,
# 'filepath': FILE_PATH,
# 'push_to_redis': PUSH_TO_REDIS,
# 'output_to_es_raw': args.output_to_es_raw,
# 'es_index': ES_INDEX,
# 'doc_type': DOC_TYPE,
# 'output_to_es_register': args.output_to_es_register}
for platform in PLATFORM_LIST:
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
TASK_LIST = TASK_DICT[platform]
print('processing %s list page' % platform)
crawler.list_page(task_list=TASK_LIST,
output_to_file=OUTPUT_TO_FILE,
filepath=FILE_PATH,
push_to_redis=PUSH_TO_REDIS,
output_to_es_raw=OUTPUT_TO_ES_RAW,
es_index=ES_INDEX,
doc_type=DOC_TYPE,
output_to_es_register=OUTPUT_TO_ES_REGISTER)
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 17:52:02 2018
Find urls on the given releaser pages, and write the first batch of data into es.
Every time this program runs, two things happen:
1 All video urls on the given releaser pages are fetched and put into the redis url pool,
2 All data related to 1 is fetched and stored into es.
Data in es is updated each time this program runs.
@author: hanye
"""
import sys
import argparse
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max page numbers to be scroll for each releaser url, '
'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write url register data into es or not, default to True'))
parser.add_argument('-l', '--releasers', default=[], action='append',
help=('Pass releaser names, they will be assembled in a python list.'))
parser.add_argument('-fre', '--frequency', default=0, type=int,
help=('choose a frequency to retrieve releaserUrl,'
'1, 3 or 9 is legal number, default 0'))
parser.add_argument('-s', '--processes_num', default=30, type=int,
help=('Processes number to be used in multiprocessing'))
parser.add_argument('-v', '--video', default="False", type=str,
help=('Whether to run the video_page crawler, default to False'))
args = parser.parse_args()
if args.platform != []:
platforms = args.platform
else:
print('platform must be input')
sys.exit(0)
for platform in platforms:
if platform not in platform_crawler_reg:
print("illegal platform name %s" % platform)
sys.exit(0)
releaser_page_num_max = args.max_page
output_f_path = args.output_file_path
frequency = args.frequency
if output_f_path == '':
output_f_path = None
if frequency == '':
frequency = None
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
releasers = args.releasers
processes_num = args.processes_num
frequency = args.frequency
if frequency == 0:
frequency = None
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
def get_TTTTTreleaserUrls_from_es(platform):
search_body = {"query": {"bool": {"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"融合传播.keyword": "True"}}
]}}}
search_resp=es_framework.search(index=index_target_releaser,
doc_type=doc_type_target_releaser,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
releaserUrl_Lst = []
if total_hit > 0:
print('Got %d releaserUrls for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body,
index=index_target_releaser,
doc_type=doc_type_target_releaser,
request_timeout=200)
for line in scan_resp:
try:
releaserUrl = line['_source']['releaserUrl']
releaserUrl_Lst.append(releaserUrl)
except:
print('error in :' ,line)
continue
else:
print('Got zero hits.')
return releaserUrl_Lst
es_index = args.es_index
doc_type = args.doc_type
kwargs_dict = {
'output_to_file': output_to_file,
'filepath': output_f_path,
'releaser_page_num_max': releaser_page_num_max,
'output_to_es_raw': output_to_es_raw,
'es_index': es_index,
'doc_type': doc_type,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis,
}
for platform in platforms:
# 2 get releaserUrl list on each platform from target-releasers index
if releasers == []:
releaserUrl_Lst = get_TTTTTreleaserUrls_from_es(platform=platform)
else:
releaserUrl_Lst = []
for releaser in releasers:
releaserUrl_Lst.extend(get_releaserUrls_from_es(platform, releaser, frequency))
if releaserUrl_Lst == []:
print('Get empty releaserUrl_Lst for platform %s' % platform)
continue
# 3 get crawler for this platform
Platform_crawler = get_crawler(platform)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
if args.video == "True":
crawler = crawler_instant.video_page
else:
crawler = crawler_instant.releaser_page
else:
print('Failed to get crawler for platform %s' % platform)
continue
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool = Pool(processes=processes_num)
for url in releaserUrl_Lst:
pool.apply_async(func=crawler, args=(url,), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
# -*- coding: utf-8 -*-
"""
Created on Wed May 16 12:00:37 2018
@author: hanye
"""
from crawler.crawler_sys.site_crawler import (crawler_toutiao,
crawler_v_qq,
crawler_iqiyi,
crawler_youku,
crawler_tudou,
crawler_haokan,
crawler_tencent_news,
crawler_miaopai,
crawler_pear,
crawler_bilibili,
crawler_mango,
crawler_wangyi_news,
crawler_kwai,
crawler_douyin
)
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
'iqiyi': crawler_iqiyi.Crawler_iqiyi,
'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
'miaopai': crawler_miaopai.Crawler_miaopai,
'pearvideo': crawler_pear.Crawler_pear,
'bilibili': crawler_bilibili.Crawler_bilibili,
'Mango': crawler_mango,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai,
'抖音': crawler_douyin.Crawler_douyin,
}
def get_crawler(platform):
if platform in platform_crawler_reg:
platform_crawler = platform_crawler_reg[platform]
else:
platform_crawler = None
print("can't get crawler for platform %s, "
"do we have the crawler for that platform?" % platform)
return platform_crawler
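# Hedged usage sketch: look up a crawler class in the registry and instantiate it; the
# framework scripts then call its list_page / releaser_page / video_page methods.
if __name__ == '__main__':
    crawler_cls = get_crawler('toutiao')
    if crawler_cls is not None:
        toutiao_crawler = crawler_cls()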
# -*- coding: utf-8 -*-
"""
Created on Wed May 16 12:00:37 2018
@author: hanye
"""
"""
The data type used in redis is a set, so the naming here uses 'set' instead of 'list'.
"""
platform_redis_set_reg = {
'toutiao': 'toutiao_url_set',
'腾讯视频': 'v_qq_url_set',
'youku': 'youku_url_set',
'iqiyi': 'iqiyi_url_set',
}
def get_redis_list_name(platform, batch_str=None):
if platform in platform_redis_set_reg:
platform_redis_set_name = platform_redis_set_reg[platform]
else:
platform_redis_set_name = None
if batch_str is not None:
platform_redis_set_name += '_%s' % batch_str
return platform_redis_set_name
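# Hedged examples: 'toutiao' maps to 'toutiao_url_set' (with an optional batch suffix);
# platforms without a registered set name return None.
if __name__ == '__main__':
    print(get_redis_list_name('toutiao'))                        # toutiao_url_set
    print(get_redis_list_name('toutiao', batch_str='20180606'))  # toutiao_url_set_20180606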
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 18:18:09 2018
@author: hanye
"""
import redis
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=0)
def url_reformer(platform, url):
"""
to reform url according to platform, in the future.
Say, a url of http://www.toutiao.com/group/1234567890123456789
as a string is different from http://www.365yg.com/u/1234567890123456789,
but they point to the same resource. They should be reformed
to one unique url before pushing into redis for further crawling.
"""
reformed_url = url
return reformed_url
def feed_url_into_redis(dict_Lst, platform,
release_time_lower_bdr=None,
batch_str=None):
"""
release_time_lower_bdr must be an int value represent
timestamp in milliseconds if given.
All url that is released before release_time_lower_bdr
will not be pushed into redis. If argument release_time_lower_bdr
is not given when call this function, all urls will be
pushed into redis.
"""
redis_list_name = get_redis_list_name(platform, batch_str)
if redis_list_name is None:
print('Failed to get correct redis list name '
'in platform_redis_register for platform: %s'
% platform)
return (None, None)
else:
print('Feeding url into redis list %s ...' % redis_list_name)
url_counter = 0
for data_dict in dict_Lst:
try:
url = data_dict['url']
url_reformed = url_reformer(platform, url)
if release_time_lower_bdr is None:
sadd_c = rds.sadd(redis_list_name, url_reformed)
url_counter += sadd_c
else:
url_release_time = data_dict['release_time']
if url_release_time >= release_time_lower_bdr:
sadd_c = rds.sadd(redis_list_name, url_reformed)
url_counter += sadd_c
except:
print('Failed to push url into redis, '
'might because of lack of url field '
'or lack of release_time field, or '
'has wrong typed release_time value. '
'The failed data dict is: \n %s' % data_dict)
print('Pushed %d urls into redis' % url_counter)
return (redis_list_name, url_counter)
def pull_url_from_es(platform, release_time_lower_bdr=None):
"""
Just pull urls from es index crawler-url-register.
Url reforming things will be done in the method who
is responsible for pushing urls into redis.
"""
if release_time_lower_bdr is None:
release_time_lower_bdr = 0
else:
pass
search_body = {"query": {"bool": {"filter": [{"range": {"release_time":
{"gte": release_time_lower_bdr}}},
{"term": {"platform.keyword": platform}}]}}}
total_hit, scan_resp = scan_crawler_url_register(search_body)
batch_url_Lst = []
if total_hit > 0:
line_counter = 0
for line in scan_resp:
line_counter += 1
line_d = line['_source']
batch_url_Lst.append(line_d)
else:
pass
return batch_url_Lst
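# Hedged usage sketch: the url dict below is made up. feed_url_into_redis pushes reformed
# urls into the platform's redis set, and pull_url_from_es reads registered urls back from
# the crawler-url-register index (a reachable redis/es is required).
if __name__ == '__main__':
    demo_dicts = [{'url': 'https://www.toutiao.com/group/1234567890123456789/',
                   'release_time': 1528300000000}]
    feed_url_into_redis(demo_dicts, 'toutiao',
                        release_time_lower_bdr=1527800000000)
    registered = pull_url_from_es('toutiao', release_time_lower_bdr=1527800000000)
    print(len(registered))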
# -*- coding:utf-8 -*-
# @Time : 2020/5/19 17:31
# @Author : litao
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 19:06:38 2018
input platform and releaserUrl, write the fetched releaser page data into redis
@author: fangyucheng
"""
import redis , json
from crawler.crawler_sys.utils import trans_format
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
rds3 = redis.StrictRedis(host='192.168.17.60', port=6379, db=3, decode_responses=True)
input_file = r"D:\work_file\发布者账号\一次性需求附件\体育部.csv"
tast_list = trans_format.csv_to_lst_with_headline(input_file)
for line in tast_list:
releaserUrl = line['releaserUrl']
platform = line['platform']
if platform == "抖音":
crawler_initialization = get_crawler(platform)
try:
crawler = crawler_initialization().get_releaser_page
dic = crawler(releaserUrl)
if dic:
dic.update(line)
rds3.hset(platform, releaserUrl, json.dumps(dic))
except:
continue
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 27 19:06:38 2018
input platform and releaserUrl, output follower number to file
@author: fangyucheng
"""
import argparse
#from multiprocessing import Pool
from crawler.crawler_sys.utils import trans_format
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
parser = argparse.ArgumentParser(description='get releaser follow number')
parser.add_argument('-i', '--input', default='/home/hanye/crawlersNew/crawler/tasks/follower_num.csv',
help=('absolute path of input csv'))
parser.add_argument('-o', '--output', default='/home/hanye/crawlersNew/crawler/tasks/follower_num_result.csv',
help=('absolute path of output csv'))
parser.add_argument('-p', '--process_num', default=10, help=('process num'))
args = parser.parse_args()
input_file = args.input
output_file = args.output
input_file = r"D:\work_file\4月补数据1.csv"
tast_list = trans_format.csv_to_lst_with_headline(input_file)
for line in tast_list:
releaserUrl = line['releaserUrl']
platform = line['platform']
follower_num = line['follower_num']
if not follower_num:
crawler_initialization = get_crawler(platform)
try:
crawler = crawler_initialization().get_releaser_follower_num
line['follower_num'] = crawler(releaserUrl)
except:
line['follower_num'] = ""
output_file = "./follower_num_result1.csv"
trans_format.lst_to_csv(listname=tast_list, csvname=output_file)
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 26 14:25:37 2018
@author: hanye
"""
import argparse
import configparser
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
'/crawler_sys/framework/config'
'/list_page_urls.ini'), type=str,
help=('absolute path of config file'))
parser.add_argument('-ch', '--channels', default=[], action='append',
help=('Specify channel names, illegal channel names will be ignored, '
'default to be all.'))
args = parser.parse_args()
if args.platform != []:
platforms = args.platform
for platform in platforms:
if platform not in platform_crawler_reg:
print("%s is not a legal platform name" % platform)
else:
platforms = [
'iqiyi',
'youku',
'腾讯视频',
'new_tudou',
'toutiao'
]
config_file = args.conf
config = configparser.RawConfigParser()
config.sections()
config.read(config_file)
channel_Lst = args.channels
for platform in platforms:
print('working on Platform %s' % platform)
Platform_crawler = get_crawler(platform)
crawler = Platform_crawler()
task_list = []
if channel_Lst == []:
for key, value in config[platform].items():
task_list.append(value)
else:
for channel in channel_Lst:
try:
task_url = config[platform][channel]
task_list.append(task_url)
except:
print("there is no channel %s in platform %s" % (channel, platform))
crawler.start_list_page(task_list)
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 26 14:25:37 2018
@author: hanye
"""
import argparse
import time
import datetime
from multiprocessing import Pool
from crawler_sys.framework.platform_crawler_register import get_crawler
from crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max number of pages to scroll for each releaser url, '
      'must be an int value, default to 30.'))
parser.add_argument('-c', '--channels', default=[], action='append',
help=('Specify channel names, illegal channel names will be ignored, '
'default to be all.'))
parser.add_argument('-f', '--output_file_path', default=None, type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-s', '--processes_num', default=20, type=int,
help=('Processes number to be used in multiprocessing'))
args = parser.parse_args()
if args.platform != []:
platforms = args.platform
else:
platforms = [
'iqiyi',
'youku',
'腾讯视频',
]
max_page = args.max_page
channel_Lst = args.channels
output_f_path = args.output_file_path
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
processes_num = args.processes_num
# multi-processing
workers = Pool(processes=processes_num)
# program exits if any of logical arguments is not parsed correctly
if (push_to_redis is None
or output_to_es_raw is None
or output_to_es_register is None):
print('Program exits. %s' % datetime.datetime.now())
else:
for platform in platforms:
print('Platform %s' % platform)
# get crawler for this platform
Platform_crawler = get_crawler(platform)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
else:
print('Failed to get crawler for platform %s' % platform)
continue
if channel_Lst == []:
channel_Lst = crawler_instant.legal_channels
else:
pass
for ch in channel_Lst:
if ch in crawler_instant.legal_channels:
print('platform: %s, channel: %s' % (platform, ch))
ch_url = crawler_instant.list_page_url_dict[ch]
crawler_args = (ch_url, ch)
kwargs_dict = {
'page_num_max': max_page,
'output_to_file': output_to_file,
'filepath': output_f_path,
'output_to_es_raw': output_to_es_raw,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis
}
async_res = workers.apply_async(
    crawler_instant.list_page, args=crawler_args, kwds=kwargs_dict)
while not async_res.ready():
print('*** scrap_list_pages_multi_process wait for workers complete, %s'
% datetime.datetime.now())
time.sleep(60)
workers.close()
workers.join()
print('scrap_list_pages_multi_process multiprocessing done, %s'
% datetime.datetime.now())
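# --- illustrative sketch, not part of the original script ---
# parse_bool_for_args is imported from crawler_sys.utils; judging from how its
# result is used above ('True'/'False' strings in, None treated as a parse
# failure that makes the program exit), it presumably behaves roughly like:
def parse_bool_for_args_sketch(arg_value):
    if arg_value == 'True':
        return True
    if arg_value == 'False':
        return False
    # anything else is reported as None so the caller can bail out
    return None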
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 28 15:27:39 2018
@author: hanye
"""
import argparse
import datetime
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.redis_interact import rds
from crawler_sys.framework.platform_crawler_register import get_crawler
from crawler_sys.utils.output_results import output_result
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platforms', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-b', '--batch_str_Lst', default=[], action='append',
help=('Pass batch_str names, they will be assembled in python list.'))
args = parser.parse_args()
platform_Lst = args.platforms
batch_str_Lst = args.batch_str_Lst
def get_video_dict_by_url(platform, url):
Platform_crawler = get_crawler(platform)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
else:
print('Failed to get crawler for platform %s' % platform)
return None
video_dict = crawler_instant.video_page(url)
return video_dict
def scrap_redis_urls(platform, batch_str):
redis_list_name = get_redis_list_name(platform, batch_str)
video_dict_Lst = []
if redis_list_name is None:
print('Failed to get correct redis list name '
      'in platform_redis_register for platform: %s'
      % platform)
return None
else:
urls_total = rds.scard(redis_list_name)
if urls_total == 0:
print('Got %d urls to be processed for %s, program exits, %s'
% (urls_total, redis_list_name, datetime.datetime.now()))
return None
print('Got %d urls to be processed for %s, %s'
% (urls_total, redis_list_name,
datetime.datetime.now()))
url_bin = rds.spop(redis_list_name)
url_counter = 1
while url_bin is not None:
url = url_bin.decode('utf-8')
video_dict = get_video_dict_by_url(platform, url)
if video_dict is not None:
video_dict_Lst.append(video_dict)
url_bin = rds.spop(redis_list_name)
url_counter += 1
if url_counter%100 == 0 or url_counter == urls_total:
print('%s: %d/%d, %s' % (redis_list_name,
url_counter,
urls_total,
datetime.datetime.now()))
if len(video_dict_Lst) >= 100:
output_result(video_dict_Lst, platform,
output_to_es_raw=True,
output_to_es_register=False,
push_to_redis=False,
output_to_file=False)
video_dict_Lst.clear()
if video_dict_Lst != []:
output_result(video_dict_Lst, platform,
output_to_es_raw=True,
output_to_es_register=False,
push_to_redis=False,
output_to_file=False)
video_dict_Lst.clear()
if platform_Lst == []:
print('No platform is given, program exits.')
else:
for platform in platform_Lst:
print('Scraping platform: %s' % platform)
if batch_str_Lst == []:
batch_str = ''
scrap_redis_urls(platform, batch_str)
else:
for batch_str in batch_str_Lst:
print('platform: %s batch: %s' % (platform, batch_str))
scrap_redis_urls(platform, batch_str)
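# --- illustrative sketch, not part of the original script ---
# The loop above drains a redis SET via scard/spop; a producer would fill the
# same set with sadd. Assuming rds is a redis-py client (the scard/spop calls
# above suggest it is), seeding urls for a batch could look like this:
def push_urls_to_redis_sketch(platform, batch_str, urls):
    redis_list_name = get_redis_list_name(platform, batch_str)
    if redis_list_name is None or not urls:
        return 0
    # sadd ignores duplicates, matching the "url pool" semantics used here
    return rds.sadd(redis_list_name, *urls)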
# -*- coding:utf-8 -*-
# @Time : 2019/7/19 17:09
# @Author : litao
# -*- coding: utf-8 -*-
import argparse
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from multiprocessing import Pool
PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
help=('maxpage'))
ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'
# index_target_releaser = 'test2'
# doc_type_target_releaser = 'keywrod'
if ARGS.platform != []:
PLATFORM_LIST = ARGS.platform
# for platform in PLATFORM_LIST:
# if platform not in legal_platform_name:
# print("%s is not a legal platform name, "
# "program will exit" % platform)
# sys.exit(0)
# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
def func_search_keywordlist(platform):
search_body = {"query": {"bool": {"filter": []}}}
search_resp = es_framework.search(index=index_target_releaser,
doc_type=doc_type_target_releaser,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
releaser_dic = {}
if total_hit > 0:
print('Got %d releaser for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body,
index=index_target_releaser,
doc_type=doc_type_target_releaser,
request_timeout=200)
for line in scan_resp:
try:
title = line['_source']['title']
page = line['_source']['page']
releaser_dic[title] = page
except:
print('error in :', line)
continue
else:
print('Got zero hits.')
return releaser_dic
if OUTPUT_TO_ES_RAW is True:
ES_INDEX = 'crawler-data-raw'
# ES_INDEX = 'test2'
DOC_TYPE = 'doc'
print(ES_INDEX, DOC_TYPE)
pages = ARGS.maxpage
def search_page_task(platform, output_to_es_raw,
output_to_es_register,
es_index,
doc_type):
search_pages = []
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform)
for keyword in KEYWORD_dic:
print("search keyword '%s' on platform %s" % (keyword, platform))
search_pages = int(KEYWORD_dic[keyword])
try:
if platform != "腾讯新闻":
crawler.search_page(keyword=keyword,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
else:
crawler.search_video_page(keyword, None,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type,releaser=False)
except Exception as e:
print(e)
continue
result = []
kwargs_dict = {
'output_to_es_raw': OUTPUT_TO_ES_RAW,
'output_to_es_register': OUTPUT_TO_ES_REGISTER,
'es_index': ES_INDEX,
'doc_type': DOC_TYPE,
}
pool = Pool(processes=4)
for platform in PLATFORM_LIST:
res = pool.apply_async(func=search_page_task, args=(platform,OUTPUT_TO_ES_RAW,OUTPUT_TO_ES_REGISTER,ES_INDEX,DOC_TYPE))
result.append(res)
pool.close()
pool.join()
print('=================')
for i in result:
print(i.get())
# config file absolute path on the server
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
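# --- illustrative sketch, not part of the original script ---
# func_search_keywordlist above relies on exactly two fields per document in
# the 'search_keywords' index: 'title' (the keyword itself) and 'page' (how
# many result pages to crawl). Under that assumption, a keyword could be
# registered with a helper like this (field values are made-up examples):
def register_search_keyword_sketch(keyword, page_num):
    es_framework.index(index=index_target_releaser,
                       doc_type=doc_type_target_releaser,
                       body={"title": keyword, "page": page_num})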
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 4 14:00:03 2018
@author: fangyucheng
"""
import argparse
import configparser
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
help=('maxpage'))
ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'
if ARGS.platform != []:
PLATFORM_LIST = ARGS.platform
# for platform in PLATFORM_LIST:
# if platform not in legal_platform_name:
# print("%s is not a legal platform name, "
# "program will exit" % platform)
# sys.exit(0)
# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
def func_search_keywordlist(platform):
search_body = {"query": {"bool": {"filter": []}}}
search_resp = es_framework.search(index=index_target_releaser,
doc_type=doc_type_target_releaser,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
releaser_dic = {}
if total_hit > 0:
print('Got %d releaser for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body,
index=index_target_releaser,
doc_type=doc_type_target_releaser,
request_timeout=200)
for line in scan_resp:
try:
title = line['_source']['title']
page = line['_source']['page']
releaser_dic[title] = page
except:
print('error in :', line)
continue
else:
print('Got zero hits.')
return releaser_dic
if OUTPUT_TO_ES_RAW is True:
ES_INDEX = 'test2'
DOC_TYPE = 'doc'
print(ES_INDEX, DOC_TYPE)
pages = ARGS.maxpage
for platform in PLATFORM_LIST:
search_pages = []
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
KEYWORD_dic = func_search_keywordlist(platform)
for keyword in KEYWORD_dic:
print("search keyword '%s' on platform %s" % (keyword, platform))
search_pages = int(KEYWORD_dic[keyword])
try:
if platform != "腾讯新闻":
crawler.search_page(keyword=keyword,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
else:
crawler.search_video_page(keyword, None,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
except Exception as e:
print(e)
continue
# config file absolute path on the server
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 3 09:56:46 2018
@author: hanye
"""
from crawler_sys.framework.func_calculate_newTudou_video_id import calculate_newTudou_video_id
from crawler_sys.framework.func_calculate_toutiao_video_id import calculate_toutiao_video_id
from crawler_sys.framework.func_calculate_v_qq_video_id import calculate_v_qq_video_id
from crawler_sys.framework.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url as calculate_kwai_video_id
from crawler_sys.framework.func_calculate_txxw_video_id import calculate_txxw_video_id
from crawler_sys.framework.func_calculate_wangyi_news_id import calculate_wangyi_news_id
from crawler_sys.framework.func_calculate_douyin_id import calculate_douyin_id
from crawler_sys.framework.func_get_releaser_id import get_releaser_id
def vid_cal_func(platform):
vid_cal_func_dict = {
'toutiao': calculate_toutiao_video_id,
'new_tudou': calculate_newTudou_video_id,
'腾讯视频': calculate_v_qq_video_id,
'kwai': calculate_kwai_video_id,
'腾讯新闻':calculate_txxw_video_id,
"网易新闻":calculate_wangyi_news_id,
"抖音":calculate_douyin_id
}
def general_vid_cal_func(url):
return url
if platform in vid_cal_func_dict:
return vid_cal_func_dict[platform]
else:
return general_vid_cal_func
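# --- illustrative usage sketch, not part of the original module ---
# vid_cal_func returns the platform-specific video-id extractor registered
# above, or a passthrough that returns the url unchanged for any other platform.
def demo_vid_cal(url, platform='toutiao'):
    # for a registered platform this dispatches to e.g. calculate_toutiao_video_id;
    # for an unregistered one the url itself is returned as the id
    cal_vid = vid_cal_func(platform)
    return cal_vid(url)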
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 16:56:45 2018
@author: hanye
Because the play count scraped from a v_qq video page may be the album play
count rather than the per-video play count, such records are filtered out of
short-video-production and written into a separate index named album-play-count.
"""
import datetime
import argparse
from elasticsearch import Elasticsearch
from crawler_sys.framework.es_short_video import bulk_write_short_video
#from crawler_sys.framework.es_short_video import func_search_is_gaopin
from crawler_sys.framework.es_crawler import scan_crawler_raw_index
from crawler_sys.utils.output_results import bulk_write_into_es
parser = argparse.ArgumentParser(description='You can specify a date to process.')
parser.add_argument('-d', '--file_date',
help=('Must in isoformat, similar to "2018-06-07". Other '
'format will just be ignored.'))
args = parser.parse_args()
if args.file_date is not None:
    try:
        dayT = datetime.datetime.strptime(args.file_date, '%Y-%m-%d')
    except:
        print('Ill format for parameter -d: %s, should be in isoformat, '
              'similar to "2018-06-07". The input parameter is ignored, '
              'will continue to run with default parameters. Ctrl-C to '
              'interrupt or just kill -9 pid.' % args.file_date)
        dayT = datetime.datetime.now()
else:
    dayT = datetime.datetime.now()
#dayT = datetime.datetime.today()
fetch_time_start_T = datetime.datetime(dayT.year, dayT.month, dayT.day) - datetime.timedelta(days=1)
# the fetch window covers the whole previous calendar day (yesterday 00:00 up
# to, but not including, today 00:00), so records around the day boundary are
# not missed even when task times overlap
fetch_time_end_T = fetch_time_start_T + datetime.timedelta(days=1)
fetch_time_start_ts = int(fetch_time_start_T.timestamp()*1e3)
fetch_time_end_ts = int(fetch_time_end_T.timestamp()*1e3)
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
def func_search_is_gaopin(releaser, platform):
search_body_gaopin = {
"query": {
"bool": {
"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"releaser.keyword": releaser}}
]
}
}
}
search_gaopin_re = es_framework.search(index=index_target_releaser,
doc_type=doc_type_target_releaser,
body=search_body_gaopin)
if search_gaopin_re['hits']['total'] > 0:
if search_gaopin_re['hits']['hits'][0]['_source']['frequency'] >= 3:
return True
else:
return False
else:
return False
true_set = set()
false_set = set()
find_data_from_crawler_raw_bd = {
"query": {
"bool": {
"filter": [
{
"range": {
"fetch_time": {
"gte": fetch_time_start_ts,
"lt": fetch_time_end_ts
}
}
}
],
"must_not": [
{"term":{"data_source": "interactioncount"}}
]
}
}
}
total_hit, scan_resp = scan_crawler_raw_index(find_data_from_crawler_raw_bd)
if total_hit > 0:
line_counter = 0
data_Lst = []
for line in scan_resp:
line_counter += 1
line_d = line['_source']
releaser = line_d['releaser']
platform = line_d['platform']
if releaser == None:
continue
if platform+releaser in true_set:
data_Lst.append(line_d)
else:
if func_search_is_gaopin(releaser, platform):
true_set.add(platform+releaser)
data_Lst.append(line_d)
else:
false_set.add(platform+releaser)
if line_counter%1000==0 or line_counter==total_hit:
print('Writing lines %d/%d into short video index, %s'
% (line_counter, total_hit, datetime.datetime.now()))
bulk_write_short_video(data_Lst,
#index='test_write6', # test
)
data_Lst.clear()
if data_Lst != []:
print('Writing lines %d/%d into short video index, %s'
% (line_counter, total_hit, datetime.datetime.now()))
bulk_write_short_video(data_Lst,
#index='test_write6', # test
)
data_Lst.clear()
print('All done. %s' % datetime.datetime.now())
else:
print('Zero hit, program exits. %s' % datetime.datetime.now())
#for those video info with album play count, write them into another es index
find_album_play_count_data = {
"query": {
"bool": {
"filter": [
{
"term": {
"data_source": "interactioncount"
}
},
{
"range": {
"fetch_time": {
"gte": fetch_time_start_ts,
"lt": fetch_time_end_ts
}
}
}
]
}
}
}
total_hit, scan_resp = scan_crawler_raw_index(find_album_play_count_data)
if total_hit > 0:
line_counter = 0
album_play_count_lst = []
for line in scan_resp:
line_counter += 1
line_d = line['_source']
album_play_count_lst.append(line_d)
if line_counter%1000==0 or line_counter==total_hit:
print('Writing lines %d/%d into index album-play-count, %s'
% (line_counter, total_hit, datetime.datetime.now()))
bulk_write_into_es(dict_Lst=album_play_count_lst,
index='album-play-count',
doc_type='doc')
album_play_count_lst.clear()
if album_play_count_lst != []:
print('Writing lines %d/%d into index album-play-count, %s'
% (line_counter, total_hit, datetime.datetime.now()))
bulk_write_into_es(dict_Lst=album_play_count_lst,
index='album-play-count',
doc_type='doc')
album_play_count_lst.clear()
print('write album play count into another index. %s' % datetime.datetime.now())
else:
print('Zero hit, program exits. %s' % datetime.datetime.now())
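# --- illustrative sketch, not part of the original script ---
# func_search_is_gaopin above only needs three fields from a 'target_releasers'
# document: platform, releaser and a numeric frequency (>= 3 marks the releaser
# as high-frequency). A minimal document satisfying that query (example values):
example_target_releaser_doc = {
    "platform": "toutiao",            # matched via platform.keyword
    "releaser": "example_releaser",   # matched via releaser.keyword
    "frequency": 3,                   # >= 3 makes func_search_is_gaopin return True
}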
# -*- coding:utf-8 -*-
# @Time : 2019/9/10 15:44
# @Author : litao
import redis,time,json
from crawler.crawler_sys.site_crawler_test import (crawler_toutiao,crawler_v_qq,crawler_tudou,crawler_haokan,
crawler_tencent_news,
crawler_wangyi_news,crawler_kwai,crawler_douyin)
from maintenance.send_email_with_file_auto_task import write_email_task_to_redis
import sys
import argparse, copy,datetime
from multiprocessing import Pool
from qingboAPI.reback_data_api import reback_data,get_biz_from_url
# from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from write_data_into_es.func_get_releaser_id import get_releaser_id
rds = redis.StrictRedis(host='192.168.17.60', port=6379, db=2,decode_responses=True)
# crawler data backfill (re-crawl) system
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
# 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
# 'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
# 'miaopai': crawler_miaopai.Crawler_miaopai,
# 'pearvideo': crawler_pear.Crawler_pear,
# 'bilibili': crawler_bilibili.Crawler_bilibili,
# 'Mango': crawler_mango,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
# "kwai": crawler_kwai.Crawler_kwai,
# "抖音": crawler_douyin.Crawler_douyin
}
def get_crawler(platform):
if platform in platform_crawler_reg:
platform_crawler = platform_crawler_reg[platform]
else:
platform_crawler = None
print("can't get crawler for platform %s, "
"do we have the crawler for that platform?" % platform)
return platform_crawler
def get_crawler_list_from_redis(project):
data_lis = []
one_project_dic = rds.hgetall(project)
crawler_dur = int(one_project_dic.get("duration"))
data = json.loads(one_project_dic.get("data"))
try:
email_dic = json.loads(one_project_dic.get("email"))
except:
email_dic = None
for d in data:
platform,releaserUrl = d.split("&",1)
data_lis.append({"platform":platform,"releaserUrl":releaserUrl})
return data_lis,crawler_dur,email_dic
def get_project_name_from_redis():
res = rds.llen("project")
if res != 0:
one_project_name = rds.lpop("project")
crawler_lis,crawler_duration,email_dic = get_crawler_list_from_redis(one_project_name)
return crawler_lis,crawler_duration,one_project_name,email_dic
else:
return None,None,None,None
def delete_project_from_redis(one_project_name, email_dic):
    rds.delete(one_project_name)
    rds.hset("task_down", one_project_name, int(datetime.datetime.now().timestamp() * 1e3))
    if email_dic:
        write_email_task_to_redis(task_name=one_project_name,
                                  email_group=email_dic.get("email_group"),
                                  email_msg_body_str=email_dic.get("email_msg_body_str"),
                                  title_str=email_dic.get("title_str"),
                                  cc_group=email_dic.get("cc_group"),
                                  sender=email_dic.get("sender"))
def start_crawler(crawler_lis, crawler_duration,email_dic,processes=15):
pool = Pool(processes=processes)
start_time = int(crawler_duration)
end_time = int(datetime.datetime.now().timestamp() * 1e3)
kwargs_dict = {
'output_to_file': False,
'filepath': "",
'releaser_page_num_max': 10000,
'output_to_es_raw': True,
'es_index': "crawler-data-raw",
'doc_type': "doc",
'output_to_es_register': False,
'push_to_redis': False,
"proxies_num":3
}
for one_data in crawler_lis:
platform = one_data.get("platform")
releaserUrl = one_data.get("releaserUrl")
# 3 get crawler for this platform
Platform_crawler = get_crawler(platform)
# print(releaserUrl_Lst)
if platform in ["weixin"]:
res = reback_data(platform=platform,releaser_id=get_biz_from_url(releaserUrl),start_time=datetime.datetime.fromtimestamp(start_time/1e3).strftime("%Y-%m-%d %H:%M:%S"),
end_time=datetime.datetime.fromtimestamp(end_time/1e3).strftime("%Y-%m-%d %H:%M:%S"))
email_dic["email_msg_body_str"] += "\n" + platform + " " +releaserUrl + " 提交清博接口回溯" + "成功" if res else "失败"
elif platform in ["miaopai", "kwai", "抖音"]:
res = reback_data(platform=platform, releaser_id=get_releaser_id(releaserUrl),
start_time=datetime.datetime.fromtimestamp(start_time / 1e3).strftime("%Y-%m-%d %H:%M:%S"),
end_time=datetime.datetime.fromtimestamp(end_time / 1e3).strftime("%Y-%m-%d %H:%M:%S"))
email_dic["email_msg_body_str"] += "\n" + platform + " " + releaserUrl + " 提交清博接口回溯" + "成功" if res else "失败"
elif Platform_crawler != None:
crawler_instant = Platform_crawler()
crawler = crawler_instant.releaser_page_by_time
else:
print('Failed to get crawler for platform %s' % platform)
continue
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool.apply_async(func=crawler, args=(start_time, end_time, releaserUrl), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done')
return email_dic
if __name__ == "__main__":
# rds.flushdb()
now = datetime.datetime.now()
while now.hour >= 5:  # run only between 05:00 and midnight; exit during the 00:00-05:00 window
crawler_lis, crawler_duration,one_project_name,email_dic = get_project_name_from_redis()
if crawler_lis and crawler_duration:
try:
new_process = start_crawler
print(crawler_lis)
email_dic = new_process(crawler_lis, crawler_duration,email_dic)
except:
pass
delete_project_from_redis(one_project_name, email_dic)
else:
print("wait for 5s")
now = datetime.datetime.now()
time.sleep(5)
sys.exit(0)
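# --- illustrative sketch, not part of the original script ---
# The main loop above pops project names from the redis list "project" and then
# reads a hash of the same name with the fields "duration", "data" (a JSON list
# of "platform&releaserUrl" strings) and an optional "email" dict. Under those
# assumptions, a backfill project could be enqueued like this:
def enqueue_backfill_project_sketch(project_name, platform_url_pairs, duration_ms, email_dic=None):
    # platform_url_pairs: iterable of (platform, releaserUrl) tuples
    data = ["%s&%s" % (platform, releaserUrl) for platform, releaserUrl in platform_url_pairs]
    rds.hset(project_name, "duration", duration_ms)
    rds.hset(project_name, "data", json.dumps(data))
    if email_dic:
        rds.hset(project_name, "email", json.dumps(email_dic))
    # make the project visible to the worker loop
    rds.rpush("project", project_name)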
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 17:52:02 2018
Find urls on the given releaser page, and write the first batch of data into es.
Every time this program runs, two things happen:
1 All video urls on the given releaser page are fetched and put into the redis url pool,
2 All data related to 1 is fetched and stored into es.
Data in es is updated each time this program runs.
@author: hanye
"""
import sys
import argparse,copy
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max number of pages to scroll for each releaser url, '
      'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-l', '--releasers', default=[], action='append',
help=('Pass releaser names; only their releaserUrls are fetched from es, default to all'))
parser.add_argument('-fre', '--frequency', default=1, type=int,
help=('choose a frequency to retrieve releaserUrl,'
'1, 3 or 9 is legal number, default 1'))
parser.add_argument('-s', '--processes_num', default=30, type=int,
help=('Processes number to be used in multiprocessing'))
parser.add_argument('-v', '--video', default="False", type=str,
help=('Whether to run the video page crawler, "True" or "False"'))
parser.add_argument('-t', '--target_index', default="target_releasers", type=str,
help=('target_releasers_org or target_releasers'))
parser.add_argument('-file', '--file', default="", type=str,
help=('Absolute path of a csv task file'))
parser.add_argument('-proxies', '--proxies', default=0, type=int,
help=('Crawler proxies_num'))
args = parser.parse_args()
if args.platform != []:
platforms = args.platform
else:
print('platform must be input')
sys.exit(0)
for platform in platforms:
if platform not in platform_crawler_reg:
print("illegal platform name %s" % platform)
sys.exit(0)
releaser_page_num_max = args.max_page
output_f_path = args.output_file_path
frequency = args.frequency
if output_f_path == '':
output_f_path = None
if frequency == '':
frequency = None
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
releasers = args.releasers
processes_num = args.processes_num
frequency = args.frequency
print(frequency)
if frequency == 0:
frequency = None
es_index = args.es_index
doc_type = args.doc_type
kwargs_dict = {
'output_to_file': output_to_file,
'filepath': output_f_path,
'releaser_page_num_max': releaser_page_num_max,
'output_to_es_raw': output_to_es_raw,
'es_index': es_index,
'doc_type': doc_type,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis,
"proxies_num":0
}
if frequency:
if frequency >= 3:
kwargs_dict["proxies_num"] = 10
if args.proxies:
kwargs_dict["proxies_num"] = args.proxies
for platform in platforms:
# 2 get releaserUrl list on each platform from target-releasers index
if releasers == []:
releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,target_index=args.target_index)
else:
releaserUrl_Lst = []
for releaser in releasers:
releaserUrl_Lst.extend(get_releaserUrls_from_es(platform=platform, releaser=releaser, frequency=frequency,target_index=args.target_index))
if releaserUrl_Lst == []:
print('Get empty releaserUrl_Lst for platform %s' % platform)
continue
# 3 get crawler for this platform
Platform_crawler = get_crawler(platform)
# print(releaserUrl_Lst)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
if args.video == "True":
try:
crawler = crawler_instant.video_page
except:
crawler = crawler_instant.search_video_page
else:
crawler = crawler_instant.releaser_page
else:
print('Failed to get crawler for platform %s' % platform)
continue
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool = Pool(processes=processes_num)
executor = ProcessPoolExecutor(processes_num)
res_list = []
if platform == "腾讯新闻" and args.video == "True":
crawler = crawler_instant.search_video_page
for url, releaser in releaserUrl_Lst:
print(releaser, url)
pool.apply_async(func=crawler, args=(releaser, url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
else:
for url, releaser in releaserUrl_Lst:
# print(url)
# crawler(url,kwargs_dict)
try:
pool.apply_async(func=crawler, args=(url,), kwds=kwargs_dict)
except Exception as e:
print(e)
# res = executor.submit(crawler,url,kwargs_dict)
# res_list.append(res)
# executor.shutdown(True)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
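# --- illustrative sketch, not part of the original script ---
# get_releaserUrls_from_es is consumed above as an iterable of
# (releaserUrl, releaser) pairs. A stub with that shape, handy for dry runs
# without Elasticsearch (urls and names below are made-up examples):
def fake_get_releaserUrls_sketch(platform):
    return [
        ("https://www.toutiao.com/c/user/0000001/", "example_releaser_1"),  # hypothetical
        ("https://www.toutiao.com/c/user/0000002/", "example_releaser_2"),  # hypothetical
    ]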
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 17:52:02 2018
Find urls on the given releaser page, and write the first batch of data into es.
Every time this program runs, two things happen:
1 All video urls on the given releaser page are fetched and put into the redis url pool,
2 All data related to 1 is fetched and stored into es.
Data in es is updated each time this program runs.
@author: hanye
"""
from crawler.crawler_sys.site_crawler_test import (crawler_toutiao,crawler_v_qq,crawler_tudou,crawler_haokan,
crawler_tencent_news,
crawler_wangyi_news,crawler_kwai,crawler_douyin)
import sys
import argparse, copy,datetime
from multiprocessing import Pool
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
# from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max number of pages to scroll for each releaser url, '
      'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-l', '--releasers', default=[], action='append',
help=('Pass releaser names; only their releaserUrls are fetched from es, default to all'))
parser.add_argument('-fre', '--frequency', default=1, type=int,
help=('choose a frequency to retrieve releaserUrl,'
'1, 3 or 9 is legal number, default 1'))
parser.add_argument('-s', '--processes_num', default=30, type=int,
help=('Processes number to be used in multiprocessing'))
parser.add_argument('-v', '--video', default="False", type=str,
help=('Whether to run the video page crawler, "True" or "False"'))
parser.add_argument('-t', '--target_index', default="target_releasers", type=str,
help=('target_releasers_org or target_releasers'))
parser.add_argument('-d', '--date', default=3, type=int,
help=('How many days to look back when backtracking data'))
parser.add_argument('-proxies', '--proxies', default=0, type=int,
help=('Crawler proxies_num'))
args = parser.parse_args()
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
# 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
# 'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
# 'miaopai': crawler_miaopai.Crawler_miaopai,
# 'pearvideo': crawler_pear.Crawler_pear,
# 'bilibili': crawler_bilibili.Crawler_bilibili,
# 'Mango': crawler_mango,
'抖音': crawler_douyin.Crawler_douyin,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai
}
def get_crawler(platform):
if platform in platform_crawler_reg:
platform_crawler = platform_crawler_reg[platform]
else:
platform_crawler = None
print("can't get crawler for platform %s, "
"do we have the crawler for that platform?" % platform)
return platform_crawler
if args.platform != []:
platforms = args.platform
else:
print('platform must be input')
sys.exit(0)
for platform in platforms:
if platform not in platform_crawler_reg:
print("illegal platform name %s" % platform)
sys.exit(0)
releaser_page_num_max = args.max_page
output_f_path = args.output_file_path
frequency = args.frequency
if output_f_path == '':
output_f_path = None
if frequency == '':
frequency = None
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
releasers = args.releasers
processes_num = args.processes_num
frequency = args.frequency
print(frequency)
if frequency == 0:
frequency = None
es_index = args.es_index
doc_type = args.doc_type
start_time = int((datetime.datetime.now() + datetime.timedelta(days=-args.date)).timestamp()*1e3)
end_time = int(datetime.datetime.now().timestamp()*1e3)
kwargs_dict = {
'output_to_file': output_to_file,
'filepath': output_f_path,
'releaser_page_num_max': releaser_page_num_max,
'output_to_es_raw': output_to_es_raw,
'es_index': es_index,
'doc_type': doc_type,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis,
"proxies_num":0
}
if frequency:
if frequency >= 3:
kwargs_dict["proxies_num"] = 10
if args.proxies:
kwargs_dict["proxies_num"] = args.proxies
for platform in platforms:
# 2 get releaserUrl list on each platform from target-releasers index
if releasers == []:
releaserUrl_Lst = get_releaserUrls_from_es(platform=platform, frequency=frequency,
target_index=args.target_index)
else:
releaserUrl_Lst = []
for releaser in releasers:
releaserUrl_Lst.extend(get_releaserUrls_from_es(platform=platform, releaser=releaser, frequency=frequency,
target_index=args.target_index))
if releaserUrl_Lst == []:
print('Get empty releaserUrl_Lst for platform %s' % platform)
continue
# 3 get crawler for this platform
Platform_crawler = get_crawler(platform)
# print(releaserUrl_Lst)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
if args.video == "True":
try:
crawler = crawler_instant.video_page
except:
crawler = crawler_instant.search_video_page
else:
crawler = crawler_instant.releaser_page_by_time
else:
print('Failed to get crawler for platform %s' % platform)
continue
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool = Pool(processes=processes_num)
if platform == "腾讯新闻" and args.video == "True":
crawler = crawler_instant.search_video_page
for url, releaser in releaserUrl_Lst:
print(releaser, url)
pool.apply_async(func=crawler, args=(releaser, url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
else:
for url, releaser in releaserUrl_Lst:
# print(kwargs_dict)
pool.apply_async(func=crawler, args=(start_time,end_time,url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 17:52:02 2018
Find urls on the given releaser page, and write the first batch of data into es.
Every time this program runs, two things happen:
1 All video urls on the given releaser page are fetched and put into the redis url pool,
2 All data related to 1 is fetched and stored into es.
Data in es is updated each time this program runs.
@author: hanye
"""
from crawler.crawler_sys.site_crawler_test import (crawler_toutiao,crawler_v_qq,crawler_tudou,crawler_haokan,
crawler_tencent_news,
crawler_wangyi_news,crawler_kwai)
import sys
import argparse, copy,datetime
from multiprocessing import Pool
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
# from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max number of pages to scroll for each releaser url, '
      'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-l', '--releasers', default=[], action='append',
help=('Pass releaser names; only their releaserUrls are fetched from es, default to all'))
parser.add_argument('-fre', '--frequency', default=0, type=int,
help=('choose a frequency to retrieve releaserUrl,'
'1, 3 or 9 is legal number, default 0'))
parser.add_argument('-s', '--processes_num', default=30, type=int,
help=('Processes number to be used in multiprocessing'))
parser.add_argument('-v', '--video', default="False", type=str,
help=('Whether to run the video page crawler, "True" or "False"'))
parser.add_argument('-t', '--target_index', default="target_releasers", type=str,
help=('target_releasers_org or target_releasers'))
parser.add_argument('-d', '--date', default=3, type=int,
help=('How many days to look back when backtracking data'))
parser.add_argument('-file', '--file', default="", type=str,
help=('Absolute path of a csv task file with platform, releaser and releaserUrl columns'))
args = parser.parse_args()
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
# 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
# 'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
# 'miaopai': crawler_miaopai.Crawler_miaopai,
# 'pearvideo': crawler_pear.Crawler_pear,
# 'bilibili': crawler_bilibili.Crawler_bilibili,
# 'Mango': crawler_mango,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai
}
def get_crawler(platform):
if platform in platform_crawler_reg:
platform_crawler = platform_crawler_reg[platform]
else:
platform_crawler = None
print("can't get crawler for platform %s, "
"do we have the crawler for that platform?" % platform)
return platform_crawler
if args.platform != []:
platforms = args.platform
else:
print('platform must be input')
sys.exit(0)
for platform in platforms:
if platform not in platform_crawler_reg:
print("illegal platform name %s" % platform)
sys.exit(0)
releaser_page_num_max = args.max_page
output_f_path = args.output_file_path
frequency = args.frequency
if output_f_path == '':
output_f_path = None
if frequency == '':
frequency = None
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
releasers = args.releasers
processes_num = args.processes_num
frequency = args.frequency
print(frequency)
if frequency == 0:
frequency = None
es_index = args.es_index
doc_type = args.doc_type
start_time = int((datetime.datetime.now() + datetime.timedelta(days=-args.date)).timestamp()*1e3)
end_time = int(datetime.datetime.now().timestamp()*1e3)
kwargs_dict = {
'output_to_file': output_to_file,
'filepath': output_f_path,
'releaser_page_num_max': releaser_page_num_max,
'output_to_es_raw': output_to_es_raw,
'es_index': es_index,
'doc_type': doc_type,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis,
}
with open(args.file, "r", encoding="gb18030") as f:
header_Lst = f.readline().strip().split(',')
releaserUrl_Lst = []
print("open_file_%s" % args.file)
for line in f:
line_Lst = line.strip().split(',')
line_dict = dict(zip(header_Lst, line_Lst))
releaser = line_dict['releaser']
releaserUrl = line_dict['releaserUrl']
platform = line_dict['platform']
if platform in platforms:
releaserUrl_Lst.append((platform,releaserUrl,releaser))
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
print(args.platform)
Platform_crawler = get_crawler(args.platform[0])
# print(releaserUrl_Lst)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
if args.video == "True":
try:
crawler = crawler_instant.video_page
except:
crawler = crawler_instant.search_video_page
else:
crawler = crawler_instant.releaser_page_by_time
else:
    print('Failed to get crawler for platform %s' % platform)
    sys.exit(0)
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool = Pool(processes=processes_num)
if platform == "腾讯新闻" and args.video == "True":
crawler = crawler_instant.search_video_page
for platform,url, releaser in releaserUrl_Lst:
print(releaser, url)
pool.apply_async(func=crawler, args=(releaser, url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
else:
for platform,url, releaser in releaserUrl_Lst:
pool.apply_async(func=crawler, args=(start_time,end_time,url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 17:52:02 2018
Find urls on the given releaser page, and write the first batch of data into es.
Every time this program runs, two things happen:
1 All video urls on the given releaser page are fetched and put into the redis url pool,
2 All data related to 1 is fetched and stored into es.
Data in es is updated each time this program runs.
@author: hanye
"""
from crawler.crawler_sys.site_crawler_test import (crawler_toutiao, crawler_v_qq, crawler_tudou, crawler_haokan,
crawler_tencent_news,
crawler_wangyi_news, crawler_kwai)
import sys
import argparse, copy, datetime
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool
from crawler.crawler_sys.framework.es_target_releasers import get_releaserUrls_from_es
# from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
from crawler.crawler_sys.utils.parse_bool_for_args import parse_bool_for_args
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
parser.add_argument('-n', '--max_page', default=30, type=int,
help=('The max number of pages to scroll for each releaser url, '
      'must be an int value, default to 30.'))
parser.add_argument('-f', '--output_file_path', default='', type=str,
help=('Specify output file path, default None.'))
parser.add_argument('-r', '--push_to_redis', default='False', type=str,
help=('Write urls to redis or not, default to False'))
parser.add_argument('-w', '--output_to_es_raw', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-index', '--es_index', default='crawler-data-raw', type=str,
help=('assign a es_index to write into, default to crawler-data-raw'))
parser.add_argument('-doc', '--doc_type', default='doc', type=str,
help=('assign a doc to write into, default to doc'))
parser.add_argument('-g', '--output_to_es_register', default='True', type=str,
help=('Write data into es or not, default to True'))
parser.add_argument('-l', '--releasers', default=[], action='append',
help=('Pass releaser names; only their releaserUrls are fetched from es, default to all'))
parser.add_argument('-fre', '--frequency', default=0, type=int,
help=('choose a frequency to retrieve releaserUrl,'
'1, 3 or 9 is legal number, default 0'))
parser.add_argument('-s', '--processes_num', default=30, type=int,
help=('Processes number to be used in multiprocessing'))
parser.add_argument('-v', '--video', default="False", type=str,
help=('Whether to run the video page crawler, "True" or "False"'))
parser.add_argument('-t', '--target_index', default="target_releasers", type=str,
help=('target_releasers_org or target_releasers'))
parser.add_argument('-d', '--date', default=3, type=int,
help=('How many days to look back when backtracking data'))
parser.add_argument('-file', '--file', default="", type=str,
help=('Absolute path of a csv task file with platform, releaser and releaserUrl columns'))
args = parser.parse_args()
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
# 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
# 'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
# 'miaopai': crawler_miaopai.Crawler_miaopai,
# 'pearvideo': crawler_pear.Crawler_pear,
# 'bilibili': crawler_bilibili.Crawler_bilibili,
# 'Mango': crawler_mango,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai
}
def get_crawler(platform):
if platform in platform_crawler_reg:
platform_crawler = platform_crawler_reg[platform]
else:
platform_crawler = None
print("can't get crawler for platform %s, "
"do we have the crawler for that platform?" % platform)
return platform_crawler
# if args.platform != []:
# platforms = args.platform
# else:
# print('platform must be input')
# sys.exit(0)
#
# for platform in platforms:
# if platform not in platform_crawler_reg:
# print("illegal platform name %s" % platform)
# sys.exit(0)
releaser_page_num_max = args.max_page
output_f_path = args.output_file_path
frequency = args.frequency
if output_f_path == '':
output_f_path = None
if frequency == '':
frequency = None
if output_f_path is None:
output_to_file = False
else:
output_to_file = True
push_to_redis = parse_bool_for_args(args.push_to_redis)
output_to_es_raw = parse_bool_for_args(args.output_to_es_raw)
output_to_es_register = parse_bool_for_args(args.output_to_es_register)
releasers = args.releasers
processes_num = args.processes_num
frequency = args.frequency
print(frequency)
if frequency == 0:
frequency = None
if __name__ == "__main__":
args.file = r"D:\work_file\发布者账号\SMG.csv"
args.file = r"D:\work_file\发布者账号\anhui.csv"
args.file = r"D:\work_file\5月补数据 - 副本.csv"
args.date = 1
es_index = args.es_index
doc_type = args.doc_type
start_time = int((datetime.datetime.now() + datetime.timedelta(days=-args.date)).timestamp() * 1e3)
end_time = int(datetime.datetime.now().timestamp() * 1e3)
kwargs_dict = {
'output_to_file': output_to_file,
'filepath': output_f_path,
'releaser_page_num_max': releaser_page_num_max,
'output_to_es_raw': output_to_es_raw,
'es_index': es_index,
'doc_type': doc_type,
'output_to_es_register': output_to_es_register,
'push_to_redis': push_to_redis,
}
platforms = [
"toutiao",
"new_tudou",
"haokan",
"腾讯视频",
"网易新闻",
"腾讯新闻",
"kwai"
]
for target_platform in platforms:
args.platform = [target_platform]
with open(args.file, "r", encoding="gb18030") as f:
header_Lst = f.readline().strip().split(',')
releaserUrl_Lst = []
print("open_file_%s" % args.file)
for line in f:
line_Lst = line.strip().split(',')
line_dict = dict(zip(header_Lst, line_Lst))
releaser = line_dict['releaser']
releaserUrl = line_dict['releaserUrl']
platform = line_dict['platform']
if platform in [target_platform]:
releaserUrl_Lst.append((platform, releaserUrl, releaser))
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
print(args.platform)
Platform_crawler = get_crawler(args.platform[0])
# print(releaserUrl_Lst)
if Platform_crawler != None:
crawler_instant = Platform_crawler()
if args.video == "True":
try:
crawler = crawler_instant.video_page
except:
crawler = crawler_instant.search_video_page
else:
crawler = crawler_instant.releaser_page_by_time
else:
    print('Failed to get crawler for platform %s' % platform)
    continue
# 4 for each releaserUrl, get data on the releaser page identified by this
# releaserUrl, with multiprocesses
pool = Pool(processes=processes_num)
executor = ProcessPoolExecutor(max_workers=1)
futures = []
if platform == "腾讯新闻" and args.video == "True":
crawler = crawler_instant.search_video_page
for platform, url, releaser in releaserUrl_Lst:
print(releaser, url)
pool.apply_async(func=crawler, args=(releaser, url), kwds=kwargs_dict)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
else:
for platform, url, releaser in releaserUrl_Lst:
# pool.apply_async(func=crawler, args=(start_time,end_time,url), kwds=kwargs_dict)
future = executor.submit(crawler, start_time, end_time, url
, output_to_es_raw=True, es_index='crawler-data-raw', doc_type='doc',
)
futures.append(future)
executor.shutdown(True)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
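# --- illustrative sketch, not part of the original script ---
# The csv parsed above must provide at least the columns 'platform', 'releaser'
# and 'releaserUrl', in the gb18030 encoding used by the open() call. A tiny
# helper to produce such a file (values are made-up examples):
def write_example_task_csv_sketch(csv_path):
    import csv
    with open(csv_path, "w", encoding="gb18030", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["platform", "releaser", "releaserUrl"])
        writer.writeheader()
        writer.writerow({"platform": "toutiao",
                         "releaser": "example_releaser",                        # hypothetical
                         "releaserUrl": "https://www.toutiao.com/c/user/1/"})   # hypothetical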
# -*- coding:utf-8 -*-
# @Time : 2020/2/25 15:20
# @Author : litao
# -*- coding: utf-8 -*-
"""
Created on Tue May 15 13:59:43 2018
@author: hanye
"""