Commit f1e00560 authored by litaolemo

update: update doc_id generation logic

parent 8ebe168b
@@ -23,10 +23,10 @@ from crawler.crawler_sys.utils.output_results import output_result
 # from crawler.crawler_sys.utils import output_log
 from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
 from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
-from crawler.crawler_sys.utils import connect_with_redis
-from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
-from crawler.crawler_sys.utils.util_logging import logged
-from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
+# from crawler.crawler_sys.utils import connect_with_redis
+# from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
+# from crawler.crawler_sys.utils.util_logging import logged
+# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
 from crawler.crawler_sys.utils.html_to_str import dehtml
 from write_data_into_es.func_get_releaser_id import *
@@ -112,7 +112,7 @@ class Crawler_weibo():
                          doc_type=None,proxies_num=None):
         print('Processing releaserUrl %s' % releaserUrl)
         result_Lst = []
-        releaser_id,containerid = self.get_releaser_id(releaserUrl)
+        releaser_id = self.get_releaser_id(releaserUrl)
         # xsrf_token,url_extr = self.get_weibo_info(releaser_id)
         headers = {
             "accept": "application/json, text/plain, */*",
......
@@ -17,7 +17,7 @@ from crawler_sys.framework.es_ccr_index_defination import es_framework as es_sit
 from crawler_sys.framework.es_ccr_index_defination import index_url_register
 from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
 from crawler_sys.framework.es_ccr_index_defination import fields_url_register
 from write_data_into_es.func_cal_doc_id import cal_doc_id
 from crawler_sys.utils.write_into_file import write_str_into_file
 from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
......
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:56
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:59
# @File : func_calculate_douban_id.py
# @email : litao@igengmei.com
# @author : litao
import re


def calculate_douban_id(data_dic):
    # Prefer the explicit mid field; otherwise fall back to the first digit run in the URL.
    if data_dic.get("mid"):
        return data_dic.get("mid")
    else:
        try:
            find_mid = re.findall(r'(\d+)', data_dic["url"])[0]
            return find_mid
        except:
            # No digits found in the URL: use the raw URL as the id.
            return data_dic["url"]
\ No newline at end of file
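A minimal usage sketch of the new douban calculator; the inputs below are assumed examples, not data taken from the crawler:

from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id

# assumed inputs for illustration only
print(calculate_douban_id({"mid": "4529066559471234"}))                        # -> "4529066559471234"
print(calculate_douban_id({"url": "https://www.douban.com/note/789012345/"}))  # -> "789012345" (first digit run)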
@@ -6,10 +6,11 @@ Created on Mon Nov 6 09:54:09 2017
 """
 import re
-def calculate_v_qq_video_id(v_qq_page_url):
-    find_vid = re.findall('/[0-9a-zA-Z]+.html', v_qq_page_url)
+def calculate_v_qq_video_id(data_dic):
+    url = data_dic.get("url")
+    find_vid = re.findall('/[0-9a-zA-Z]+.html', url)
     if find_vid != []:
         vid = find_vid[0].split('/')[1].split('.')[0]
     else:
-        vid = v_qq_page_url
+        vid = url
     return vid
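Callers now pass the whole data dict instead of a bare URL; a hedged example with a made-up page URL:

from write_data_into_es.calculate_doc_id.func_calculate_v_qq_video_id import calculate_v_qq_video_id

# assumed input for illustration only
print(calculate_v_qq_video_id({"url": "https://v.qq.com/x/page/a0123abcd45.html"}))  # -> "a0123abcd45"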
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:59
# @File : func_calculate_weibo_id.py
# @email : litao@igengmei.com
# @author : litao


def calculate_weibo_id(data_dic):
    # The weibo mid is the stable post id; fall back to the URL when it is missing.
    if data_dic.get("mid"):
        return data_dic.get("mid")
    else:
        return data_dic.get("url")
\ No newline at end of file
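A minimal usage sketch of the weibo calculator (assumed inputs; mid wins over the URL):

from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id

# assumed inputs for illustration only
print(calculate_weibo_id({"mid": "4529066559471234", "url": "https://weibo.com/1764615662/J1AbCdEfG"}))  # -> "4529066559471234"
print(calculate_weibo_id({"url": "https://weibo.com/1764615662/J1AbCdEfG"}))                             # -> the url itself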
@@ -7,15 +7,17 @@ Created on Wed Jun 20 09:19:12 2018
 import hashlib
-from write_data_into_es.func_calculate_toutiao_video_id import calculate_toutiao_video_id
-from write_data_into_es.func_calculate_newTudou_video_id import calculate_newTudou_video_id
-from write_data_into_es.func_calculate_v_qq_video_id import calculate_v_qq_video_id
+from write_data_into_es.calculate_doc_id.func_calculate_toutiao_video_id import calculate_toutiao_video_id
+from write_data_into_es.calculate_doc_id.func_calculate_newTudou_video_id import calculate_newTudou_video_id
+from write_data_into_es.calculate_doc_id.func_calculate_v_qq_video_id import calculate_v_qq_video_id
 #from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
-from write_data_into_es.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
-from write_data_into_es.func_calculate_txxw_video_id import calculate_txxw_video_id
-from write_data_into_es.func_calculate_wangyi_news_id import calculate_wangyi_news_id
-from write_data_into_es.func_calculate_douyin_id import calculate_douyin_id
-from write_data_into_es.func_calculate_haokan_video_id import calculate_haokan_id
+from write_data_into_es.calculate_doc_id.func_calculate_kwai_video_id_by_url import calculate_kwai_video_id_by_data_by_url
+from write_data_into_es.calculate_doc_id.func_calculate_txxw_video_id import calculate_txxw_video_id
+from write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id import calculate_wangyi_news_id
+from write_data_into_es.calculate_doc_id.func_calculate_douyin_id import calculate_douyin_id
+from write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id import calculate_haokan_id
+from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id
+from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id


 def vid_cal_func(platform):
@@ -27,7 +29,9 @@ def vid_cal_func(platform):
         '腾讯新闻':calculate_txxw_video_id,
         "网易新闻":calculate_wangyi_news_id,
         "抖音":calculate_douyin_id,
-        "haokan":calculate_haokan_id
+        "haokan":calculate_haokan_id,
+        "weibo":calculate_weibo_id,
+        "douban":calculate_douban_id,
     }

     def general_vid_cal_func(url):
@@ -69,10 +73,7 @@ def cal_doc_id(platform, url=None,
     except:
         url = None
-    if platform == '腾讯新闻' or platform == 'haokan':
-        vid_bare = vid_cal_func(platform)(data_dict)
-    else:
-        vid_bare = vid_cal_func(platform)(url)
+    vid_bare = vid_cal_func(platform)(data_dict)
     if doc_id_type == 'daily-url':
         if fetch_day_str != None:
             if platform == 'toutiao':
@@ -118,7 +119,7 @@ def cal_doc_id(platform, url=None,
         elif platform == '网易新闻':
             vid = 'wyxw_%s' % (vid_bare)
         else:
-            vid = '%s' % (vid_bare)
+            vid = '%s_%s' % (platform,vid_bare)
     elif doc_id_type == 'time-track':
         if fetch_time_ts != None:
             if platform == 'toutiao':
......
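Taken together, these hunks make every platform calculator receive the full data dict and prefix the fallback doc id with the platform name. A condensed sketch of the new flow, assuming vid_cal_func returns the calculator mapped above and using made-up weibo values:

from write_data_into_es.func_cal_doc_id import vid_cal_func

# made-up weibo record for illustration only
data_dict = {"mid": "4529066559471234", "url": "https://weibo.com/1764615662/J1AbCdEfG"}
vid_bare = vid_cal_func("weibo")(data_dict)  # calculate_weibo_id -> "4529066559471234"
vid = '%s_%s' % ("weibo", vid_bare)          # fallback branch above -> "weibo_4529066559471234"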
# -*- coding:utf-8 -*-
# @Time : 2019/7/16 16:08
# @Author : litao
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import re


def calculate_douyin_id(url):
    if "?" in url:
        # Strip the query string and use the bare share URL as the id.
        find_vid = url.split("?")
    elif "video" in url:
        # Rebuild the canonical share URL from the numeric video id.
        find_vid = re.findall('/video/(.*?)/', url)
        if find_vid:
            find_vid = ["https://www.iesdouyin.com/share/video/%s/" % find_vid[0]]
        else:
            return url
    else:
        # Neither pattern matched: fall back to the raw URL (avoids an unbound find_vid below).
        return url
    if find_vid != []:
        vid = find_vid[0]
    else:
        vid = url
    return vid


if __name__ == '__main__':
    print(calculate_douyin_id("https://www.iesdouyin.com/share/vido/6688242923181591821/?mid=6688519042262665996"))
    print(calculate_douyin_id("https://www.iesdouyin.com/share/video/6689249077596671245/?mid=6689052145968450308"))
\ No newline at end of file
# -*- coding:utf-8 -*-
# @Time : 2019/8/27 16:24
# @Author : litao
import re


def calculate_haokan_id(data_dic):
    url = data_dic.get("url")
    # if data_dic.get("video_id"):
    #     return data_dic["video_id"]
    if "id=" in url:
        # Plain query-string form, e.g. .../v?vid=<digits>
        find_vid = re.findall(r'id=(\d+)', url)
        return find_vid[0]
    elif "context=%7B%22nid%22%3A%22sv_" in url:
        # URL-encoded JSON context form: {"nid":"sv_..."}
        find_vid = re.findall(r'context=%7B%22nid%22%3A%22sv_(.+)%22%7D', url)
        return find_vid[0]
    else:
        return url


if __name__ == '__main__':
    print(calculate_haokan_id({"url": "https://sv.baidu.com/videoui/page/videoland?context=%7B%22nid%22%3A%22sv_5091548046938576131%22%7D"}))
    print(calculate_haokan_id({"url": "https://haokan.baidu.com/v?vid=4596161678511752193"}))
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 3 15:12:09 2018

@author: hanye
"""
import hashlib


def calculate_kwai_video_id_by_data(kwai_video_dict):
    try:
        # Build a stable key from title + releaser + release timestamp and hash it.
        title = kwai_video_dict['title']
        title_c = title.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
        releaser = kwai_video_dict['releaser']
        release_time_ts = kwai_video_dict['release_time']
        kwai_key = title_c + '_' + releaser + '_' + str(release_time_ts)
        key_hash = hashlib.md5(kwai_key.encode('utf-8')).hexdigest()
        vid = key_hash
    except:
        # Fall back to the page URL, or None if even that is missing.
        try:
            kwai_key = kwai_video_dict['url']
            vid = kwai_key
        except:
            vid = None
    return vid
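A hedged usage sketch of the hash-based id; the record is invented and the import path assumes this module also moved under calculate_doc_id like its siblings:

from write_data_into_es.calculate_doc_id.func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data  # assumed path

# assumed inputs for illustration only
record = {"title": "搞笑 日常", "releaser": "某快手号", "release_time": 1563264000000}
print(calculate_kwai_video_id_by_data(record))  # md5 of "搞笑日常_某快手号_1563264000000"
print(calculate_kwai_video_id_by_data({"url": "https://www.kuaishou.com/u/143139353/5601747480"}))  # falls back to the url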
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 6 15:53:10 2018

@author: zhouyujiang
"""
import re


def calculate_kwai_video_id_by_data_by_url(kwai_url):
    # The id lives either under /u/<uid>/<photo_id> or under /photo/<photo_id>.
    doc_id_str = re.findall(r"/u/(.+)?|/photo/(.+)?", kwai_url)
    if doc_id_str != []:
        for i in doc_id_str[0]:
            if i != '':
                vid = str(i).replace('/', '_')
                return vid
    else:
        return None


if __name__ == '__main__':
    print(calculate_kwai_video_id_by_data_by_url('https://www.kuaishou.com/u/143139353/5601747480'))
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 16:40:20 2017

@author: hanye
"""
import re


def calculate_newTudou_video_id(newTudou_url):
    try:
        # Drop the .html suffix, then take everything after the single-letter path segment.
        d_url_s_Lst = newTudou_url.split('.html')
        d_videoID = d_url_s_Lst[0]
        newTudou_video_id = re.findall(r"/\w/(.+)?", d_videoID)[0]
    except:
        newTudou_video_id = None
    return newTudou_video_id
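A hedged usage sketch; the .html URL below is an assumed example of the new-Tudou form, not taken from the crawler:

from write_data_into_es.calculate_doc_id.func_calculate_newTudou_video_id import calculate_newTudou_video_id

# assumed inputs for illustration only
print(calculate_newTudou_video_id("https://video.tudou.com/v/XMzg0NTM5NDY0MA==.html"))  # -> "XMzg0NTM5NDY0MA=="
print(calculate_newTudou_video_id("not a tudou url"))                                   # -> None (regex finds nothing)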
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 6 09:54:09 2017

@author: hanye
"""
import re


def calculate_toutiao_video_id(toutiao_url):
    if toutiao_url[-1] != '/':
        toutiao_url = toutiao_url + '/'
    # Use the first digit run that is followed by a slash as the video id.
    find_vid = re.findall('[0-9]+/', toutiao_url)
    if find_vid != []:
        vid = find_vid[0].replace('/', '')
        return vid
    else:
        return None
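A hedged usage sketch; the URLs are made up, and only the digit run before a slash matters:

from write_data_into_es.calculate_doc_id.func_calculate_toutiao_video_id import calculate_toutiao_video_id

# assumed inputs for illustration only
print(calculate_toutiao_video_id("https://www.toutiao.com/i6751234567890123456/"))  # -> "6751234567890123456"
print(calculate_toutiao_video_id("https://www.toutiao.com/no-digits"))              # -> None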
from write_data_into_es.func_get_releaser_id import get_releaser_id


def calculate_txxw_video_id(data_dict):
    try:
        # Combine the video id with the releaser id so ids stay unique across releasers.
        releaser_id = get_releaser_id(platform="腾讯新闻", releaserUrl=data_dict["releaserUrl"])
        video_id = data_dict['video_id']
        if releaser_id:
            return video_id + "_" + releaser_id
        else:
            return video_id
    except:
        print('error in :', data_dict)
        return None
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import re


def calculate_wangyi_news_id(url):
    # The id sits between /sub/ or /v/ and .html; otherwise keep the raw URL.
    if "/sub/" in url:
        find_vid = re.findall(r'/sub/(.+)\.html', url)
    elif "/v/" in url:
        find_vid = re.findall(r'/v/(.+)\.html', url)
    else:
        return url
    if find_vid != []:
        vid = find_vid[0]
    else:
        vid = url
    return vid


if __name__ == '__main__':
    print(calculate_wangyi_news_id("https://c.m.163.com/news/v/VA9LBOJ7S.html"))
    print(calculate_wangyi_news_id("https://c.m.163.com/news/sub/T1539761239294.html"))
\ No newline at end of file
@@ -251,7 +251,8 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
 if __name__ == "__main__":
-    data_list = [{"releaserUrl": "https://weibo.com/u/1764615662", "releaser": "娱乐圈贵妃", "platform": "weibo"},
+    data_list = [
+        {"releaserUrl": "https://weibo.com/u/1764615662", "releaser": "娱乐圈贵妃", "platform": "weibo"},
         {"releaserUrl": "https://weibo.com/u/3662247177", "releaser": "捞娱君", "platform": "weibo"},
         {"releaserUrl": "https://weibo.com/u/2378564111", "releaser": "娱乐扒皮", "platform": "weibo"},
         {"releaserUrl": "https://weibo.com/u/2983578965", "releaser": "娱乐圈小青年", "platform": "weibo"},
@@ -273,7 +274,8 @@ if __name__ == "__main__":
          "platform": "weibo"},
         {"releaserUrl": "https://weibo.com/u/6511173721", "releaser": "圈内课代表", "platform": "weibo"},
         {"releaserUrl": "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
-         "releaser": "娱闻少女", "platform": "weibo"}]
+         "releaser": "娱闻少女", "platform": "weibo"}
+    ]
     extra_dic = {
         "department_tags":["策略组"],
         'key_releaser': True,
......