Commit 667bc377 authored by litaolemo's avatar litaolemo

update

parent b81bb38b
......@@ -10,16 +10,17 @@ from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
hosts = '172.18.52.14'
port = 9200
es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
HTTP_AUTH = ("elastic", "gm_test")
es_framework = Elasticsearch(hosts=hosts, port=port, http_auth=HTTP_AUTH)
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
def bulk_write_target_releasers(dict_Lst,
index=index_target_releaser,
doc_type=doc_type_target_releaser):
index=index_target_releaser,):
bulk_write_body=''
write_counter=0
for line in dict_Lst:
......@@ -28,8 +29,8 @@ def bulk_write_target_releasers(dict_Lst,
releaser=line['releaser']
platform=line['platform']
doc_id_releaser='%s_%s' % (platform, releaser)
action_str=('{ "index" : { "_index" : "%s", "_type" : "%s","_id" : "%s" } }'
% (index_target_releaser, doc_type_target_releaser, doc_id_releaser) )
action_str=('{ "index" : { "_index" : "%s","_id" : "%s" } }'
% (index_target_releaser, doc_id_releaser))
data_str=json.dumps(line, ensure_ascii=False)
line_body = action_str + '\n' + data_str + '\n'
bulk_write_body += line_body
......@@ -58,7 +59,7 @@ def get_releaserUrls_from_es(platform,
# search_body['query']['bool']['filter'].append(frequency_dict)
# print(target_index,doc_type_target_releaser,search_body)
print(search_body)
search_resp=es_framework.search(index=target_index,
search_resp= es_framework.search(index=target_index,
body=search_body,
size=0,
request_timeout=100)
......
# -*- coding:utf-8 -*-
# @Time : 2019/5/30 11:01
# @Author : litao
import re, requests
try:
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
except:
pass
def toutiao(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Toutiao / 365yg URL.

    The URL's host decides which patterns are tried, in order:

    * ``www.toutiao.com`` / ``www.365yg.com``: ``user/<digits>``, then
      ``to_user_id=<digits>``, then ``/m<digits>``.
    * ``m.toutiao.com``: ``profile/<digits>``.
    * ``m.365yg.com``: ``to_user_id=<digits>``.
    * any other URL containing ``user_id``: ``user_id=<digits>``.
    * anything else: the first run of digits in the URL.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored (keeps all extractors call-compatible).

    Returns:
        The id as a string of digits, or None when no pattern matches.
    """
    if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
        patterns = (r'user/([0-9]+)', r'to_user_id=([0-9]+)', r'/m(\d+)')
    elif 'm.toutiao.com' in releaserUrl:
        # The original returned an unbound local here when nothing matched;
        # falling through to the shared "return None" fixes that.
        patterns = (r'profile/([0-9]+)',)
    elif 'm.365yg.com' in releaserUrl:
        patterns = (r'to_user_id=([0-9]+)',)
    elif "user_id" in releaserUrl:
        patterns = (r'user_id=(\d+)',)
    else:
        patterns = (r'(\d+)',)

    for pattern in patterns:
        match = re.search(pattern, releaserUrl)
        if match:
            return match.group(1)
    return None
def haokan(releaserUrl, **kwargs):
    """Extract the releaser id from a Haokan URL.

    Args:
        releaserUrl: URL (or URL-like text) of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        * If the URL contains ``app_id=``: every run of digits found from the
          first ``app_id=`` to the end of that line, joined by single spaces
          (behaviour kept as-is from the original).
        * If it contains ``app_id`` without ``=``: the digits quoted by
          ``%22...%22``, or failing that by ``"..."``.
        * Otherwise: the first run of digits in the URL.
        * None when no digits can be found (the original raised IndexError).
    """
    if "app_id=" in releaserUrl:
        tail = ' '.join(re.findall('app_id=.*', releaserUrl))
        return ' '.join(re.findall(r'\d+', tail))

    if "app_id" in releaserUrl:
        # URL-encoded quotes first, literal quotes as the fallback.
        patterns = (r"%22(\d+)%22", r'"(\d+)"')
    else:
        patterns = (r'(\d+)',)

    for pattern in patterns:
        match = re.search(pattern, releaserUrl)
        if match:
            return match.group(1)
    return None
# Resolve the releaser id of a Tencent Video page (registered under "腾讯视频"
# in plantform_func).
#
# Parameters:
#   releaserUrl -- URL of the releaser's page.
#   is_qq       -- when false (default) the function returns just the id string;
#                  when true the page is downloaded and a dict with the keys
#                  'releaser', 'releaser_id' and 'number_id' is returned.
#   **kwargs    -- accepted and ignored.
# Returns the id string, the dict described above, or None when extraction fails.
#
# NOTE(review): leading indentation is lost in this view, so the exact nesting of
# the checks below cannot be confirmed from here -- verify against the real file.
def tengxunshipin(releaserUrl,is_qq=False,**kwargs):
if not is_qq:
try:
# Candidate id: everything that follows "vplus/" in the URL.
releaser_id = re.findall("vplus/(.*)", releaserUrl)[0]
# A 32-character candidate is returned unchanged.
if len(releaser_id) == 32:
return releaser_id
else:
# Drop a "#..." fragment, then accept a 32- or 16-character id.
if "#" in releaser_id:
releaser_id = releaser_id.split("#")[0]
if len(releaser_id) == 32 or len(releaser_id) == 16:
return releaser_id
# Drop a "/videos..." suffix, then apply the same length check.
if "/videos" in releaser_id:
releaser_id = releaser_id.split("/videos")[0]
if len(releaser_id) == 32 or len(releaser_id) == 16:
return releaser_id
# No usable id in the URL itself: download the page through a proxy and
# read the id from the page's "var USER_INFO = {...}" object.
# get_proxy comes from an import guarded by try/except-pass at the top of the
# file, so it may be undefined; presumably the bare except below then turns
# the resulting NameError into None -- confirm once the nesting is known.
proxies = get_proxy(1)
get_page = requests.get(releaserUrl, timeout=5,proxies=proxies)
get_page.encoding = 'utf-8'
page = get_page.text
try:
USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# releaser = re.findall("name: '(.*)',", USER_INFO)[0]
releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# number_id = re.findall("number: '(.*)',", USER_INFO)[0]
except:
return None
return releaser_id
except:
return None
else:
# is_qq branch: always download the page (2 second timeout here, 5 above).
# NOTE(review): this request appears to sit outside any try block, so proxy or
# network errors would propagate to the caller -- confirm against the real file.
proxies = get_proxy(1)
get_page = requests.get(releaserUrl, timeout=2,proxies=proxies)
get_page.encoding = 'utf-8'
page = get_page.text
try:
# All three fields must be present in USER_INFO; a missing one yields None.
USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
releaser = re.findall("name: '(.*)',", USER_INFO)[0]
releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
number_id = re.findall("number: '(.*)',", USER_INFO)[0]
except:
return None
D0 = {'releaser': releaser,
'releaser_id': releaser_id,
"number_id": number_id}
return D0
def new_tudou(releaserUrl, **kwargs):
    """Extract the releaser id from a Tudou releaser URL.

    The query string is dropped and every ``=`` is removed first, so ids
    lose any ``=`` characters they contained.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        * URL containing ``videos``: the path segment between ``i/`` and
          ``/videos``.
        * URL ending in ``/``: everything after ``i/`` (empty string when
          ``i/`` is absent).
        * Otherwise: the last ``/``-separated segment.
        * None when the expected segment is missing or the URL is empty.
    """
    releaserUrl = releaserUrl.split("?")[0].replace("=", "")
    try:
        if 'videos' in releaserUrl:
            matched = ' '.join(re.findall('i/.*/videos', releaserUrl))
            return matched.split('/')[1]
        if releaserUrl[-1] == "/":
            return ''.join(re.findall('i/(.*)', releaserUrl[:-1]))
        return releaserUrl.split("/")[-1]
    except IndexError:
        # Empty URL, or "videos" present without the i/<id>/videos shape.
        # Narrowed from a bare except: IndexError is the only failure the
        # expressions above produce for string input.
        return None
def douyin(releaserUrl, **kwargs):
    """Extract the numeric releaser id that follows ``user/`` in a Douyin URL.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string of digits. When the URL has no ``user/<digits>``
        part (or is not a string) the URL is printed and None is returned.
    """
    try:
        return re.findall(r"user/(\d+)", releaserUrl)[0]
    except (IndexError, TypeError):
        # IndexError: no match; TypeError: non-string input. The original
        # bare except also hid unrelated errors such as KeyboardInterrupt.
        print(releaserUrl)
        return None
def tencent_news(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Tencent News URL.

    Args:
        releaserUrl: URL of the releaser's page; converted with ``str()``
            first, so non-string values are accepted.
        **kwargs: Accepted and ignored.

    Returns:
        * URL containing ``media/``: the digits after ``media/``, or False
          when none follow.
        * Otherwise: the digits after ``chlid=``, or None when absent.

        The differing "not found" values (False / None) are kept from the
        original for backward compatibility; both are falsy.
    """
    url = str(releaserUrl)
    if "media/" in url:
        # The original retried with 'media/[0-9]+' after this pattern failed;
        # that retry could never succeed, so it has been removed.
        match = re.search(r"media/(\d+)", url)
        return match.group(1) if match else False
    match = re.search(r"chlid=(\d+)", url)
    return match.group(1) if match else None
def miaopai(releaserUrl, **kwargs):
    """Extract the releaser id from an ``n.miaopai.com`` URL.

    The id is the last ``/``-separated segment of the URL with any
    ``.html`` / ``.htm`` removed.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        The id string, or None (after printing a message) when the URL is
        not an ``n.miaopai.com`` URL.
    """
    if 'n.miaopai.com' not in releaserUrl:
        print("input illegal releaserUrl %s" % releaserUrl)
        return None
    last_segment = releaserUrl.split('/')[-1]
    # Strip '.html' before '.htm'. The original applied the second replace to
    # the unstripped string, so 'abc.html' came back as 'abcl'.
    return last_segment.replace('.html', '').replace('.htm', '')
def kwai(releaserUrl, **kwargs):
    """Extract the releaser id from a Kwai/Kuaishou URL.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        * URL containing ``profile``: everything after ``/profile/``.
        * URL containing ``/u/``: the text between ``/u/`` and the last
          following ``/``.
        * ``""`` when the chosen pattern does not match.
        * None when the URL contains neither marker.
    """
    if "profile" in releaserUrl:
        pattern = r"/profile/(.+)"
    elif "/u/" in releaserUrl:
        pattern = r"/u/(.+)/"
    else:
        return None
    found = re.search(pattern, releaserUrl)
    return found.group(1) if found else ""
def wangyi_news(releaserUrl, **kwargs):
    """Extract the releaser id from a NetEase News URL.

    Only the first marker found in the URL decides which pattern is used:
    ``/sub/`` -> ``/sub/<id>.html``, ``video`` -> ``/list/<id>/video``,
    ``all`` -> ``/list/<id>/all``.

    Args:
        releaserUrl: URL of the releaser's page.
        **kwargs: Accepted and ignored.

    Returns:
        The id string, or None when no marker is present or the pattern
        chosen by the marker does not match.
    """
    rules = (
        ("/sub/", r"/sub/(.+)\.html"),
        ("video", r"/list/(.+)/video"),
        ("all", r"/list/(.+)/all"),
    )
    for marker, pattern in rules:
        if marker in releaserUrl:
            found = re.findall(pattern, releaserUrl)
            # No fall-through to later rules once a marker has matched.
            return found[0] if found else None
    return None
def xiaohongshu(releaserUrl, **kwargs):
    """Extract the releaser id from a Xiaohongshu profile URL.

    Args:
        releaserUrl: URL of the releaser's page; anything from the first
            ``?`` onwards is ignored.
        **kwargs: Accepted and ignored.

    Returns:
        Everything after ``user/profile/`` (possibly an empty string), or
        None when the URL has no ``user/profile/`` part.
    """
    path = releaserUrl.split("?")[0]
    found = re.search(r"user/profile/(.*)", path)
    if found is None:
        return None
    return found.group(1)
# Registry used by get_releaser_id: platform name -> function that extracts the
# releaser id from a releaser URL.
plantform_func = {
    "toutiao": toutiao,
    "haokan": haokan,
    "腾讯视频": tengxunshipin,
    "new_tudou": new_tudou,
    "腾讯新闻": tencent_news,
    "miaopai": miaopai,
    "kwai": kwai,
    "网易新闻": wangyi_news,
    "抖音": douyin,
    "xiaohongshu": xiaohongshu,
}
def get_releaser_id(platform=None, releaserUrl=None, is_qq=False):
    """Look up the extractor registered for *platform* and run it on the URL.

    Args:
        platform: Key into ``plantform_func``.
        releaserUrl: URL of the releaser's page.
        is_qq: Forwarded to the extractor as a keyword argument.

    Returns:
        Whatever the extractor returns when that value is truthy; otherwise
        None. None is also returned when either argument is empty, the
        platform is not registered, or the extractor raises.
    """
    if not (platform and releaserUrl):
        return None
    func = plantform_func.get(platform)
    if func is None:
        # print(plantform," not in target list")
        return None
    try:
        # The original try block only wrapped the truthiness test below, which
        # cannot raise; the extractor call is what actually needs guarding.
        res = func(releaserUrl, is_qq=is_qq)
    except Exception as err:
        print(platform, releaserUrl, "can't git releaser_id", repr(err))
        return None
    if res:
        return res
    print(platform, releaserUrl, "can't git releaser_id")
    return None
if __name__ == "__main__":
    # Manual smoke check: resolve the id for one Tencent News URL and print it.
    # (A commented-out CSV-driven batch check that used to live here was dropped.)
    sample_url = "https://r.inews.qq.com/getUserVideoList?chlid=5362294&page_time=&coral_uin=ec8bb1459b9d84100312bf035bb43cd4d0&coral_uid=&type=om&uid=7313ae71df9e5367&omgid=&trueVersion=5.8.00&qimei=287801615436009&devid=008796749793280&appver=23_android_5.8.00&qn-rid=9ec6d3f9-d341-4138-b4e2-6b2ed4b98b5b&qn-sig=891289f9217ec9623723c024dd00eaf5"
    releaser_id = get_releaser_id("腾讯新闻", sample_url)
    print(releaser_id)
\ No newline at end of file
# # -*- coding:utf-8 -*-
# # @Time : 2019/5/30 11:01
# # @Author : litao
# import re, requests
# try:
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# except:
# pass
#
#
# def toutiao(releaserUrl,**kwargs):
# if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
# pattern = 'user/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# else:
# pattern = 'to_user_id=[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('=')[1]
# else:
# re_find = re.findall("/m(\d+)", releaserUrl)
# if re_find:
# return re_find[0]
# else:
# releaser_id = None
# return releaser_id
#
# elif 'm.toutiao.com' in releaserUrl:
# pattern = 'profile/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# return releaser_id
#
# elif 'm.365yg.com' in releaserUrl:
# pattern = 'to_user_id=[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('=')[1]
# else:
# releaser_id = None
# return releaser_id
# elif "user_id" in releaserUrl:
# re_find = re.findall("user_id=(\d+)",releaserUrl)
# if re_find:
# return re_find[0]
# else:
# return None
# else:
# re_find = re.findall("(\d+)", releaserUrl)
# if re_find:
# return re_find[0]
# else:
# return None
# def haokan(releaserUrl,**kwargs):
# if "app_id=" in releaserUrl:
# releaser_id_str = ' '.join(re.findall('app_id=.*', releaserUrl))
# releaser_id = ' '.join(re.findall('\d+', releaser_id_str))
# return releaser_id
# elif "app_id" in releaserUrl:
# try:
# releaser_id_str = re.findall("%22(\d+)%22", releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
# except:
# releaser_id_str = re.findall('"(\d+)"', releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
# else:
# releaser_id_str = re.findall('(\d+)', releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
#
#
# def tengxunshipin(releaserUrl,is_qq=False,**kwargs):
# if not is_qq:
# try:
# releaser_id = re.findall("vplus/(.*)", releaserUrl)[0]
# if len(releaser_id) == 32:
# return releaser_id
# else:
# if "#" in releaser_id:
# releaser_id = releaser_id.split("#")[0]
# if len(releaser_id) == 32 or len(releaser_id) == 16:
# return releaser_id
# if "/videos" in releaser_id:
# releaser_id = releaser_id.split("/videos")[0]
# if len(releaser_id) == 32 or len(releaser_id) == 16:
# return releaser_id
# proxies = get_proxy(1)
# get_page = requests.get(releaserUrl, timeout=5,proxies=proxies)
# get_page.encoding = 'utf-8'
# page = get_page.text
# try:
# USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# # releaser = re.findall("name: '(.*)',", USER_INFO)[0]
# releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# # number_id = re.findall("number: '(.*)',", USER_INFO)[0]
# except:
# return None
# return releaser_id
# except:
# return None
# else:
# proxies = get_proxy(1)
# get_page = requests.get(releaserUrl, timeout=2,proxies=proxies)
# get_page.encoding = 'utf-8'
# page = get_page.text
# try:
# USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# releaser = re.findall("name: '(.*)',", USER_INFO)[0]
# releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# number_id = re.findall("number: '(.*)',", USER_INFO)[0]
# except:
# return None
# D0 = {'releaser': releaser,
# 'releaser_id': releaser_id,
# "number_id": number_id}
# return D0
#
#
#
# def new_tudou(releaserUrl,**kwargs):
# if "?" in releaserUrl:
# releaserUrl = releaserUrl.split("?")[0]
# if "=" in releaserUrl:
# releaserUrl = releaserUrl.replace("=","")
# try:
# if 'videos' in releaserUrl:
# releaser_id_str = ' '.join(re.findall('i/.*/videos', releaserUrl))
# releaser_id = releaser_id_str.split('/')[1]
# return releaser_id
# elif releaserUrl[-1] == "/":
# releaserUrl = releaserUrl[0:-1]
# releaser_id_str = ''.join(re.findall('i/(.*)', releaserUrl))
# releaser_id = releaser_id_str
# return releaser_id
# else:
# releaser_id = releaserUrl.split("/")[-1]
# return releaser_id
# except:
# return None
#
# def douyin(releaserUrl,**kwargs):
# try:
# releaser_id = re.findall("user/(\d+)",releaserUrl)[0]
# except:
# print(releaserUrl)
# return None
#
# return releaser_id
#
#
# def tencent_news(releaserUrl,**kwargs):
# releaserUrl = str(releaserUrl)
# try:
# if "media/" in releaserUrl:
# res = re.findall(r"media/(\d+)", releaserUrl)
# if res:
# return res[0]
# else:
# pattern = 'media/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# else:
# releaser_id = False
# return releaser_id
# else:
# res = re.findall(r"chlid=(\d+)", releaserUrl)
# if res:
# return res[0]
# except:
# return False
#
#
# def miaopai(releaserUrl,**kwargs):
# if 'n.miaopai.com' in releaserUrl:
# releaser_id_str = releaserUrl.split('/')[-1]
# releaser_id = releaser_id_str.replace('.html', '')
# releaser_id = releaser_id_str.replace('.htm', '')
# return releaser_id
# else:
# print("input illegal releaserUrl %s" % releaserUrl)
# return None
#
#
# def kwai(releaserUrl,**kwargs):
# if "profile" in releaserUrl:
# res = re.findall(r"/profile/(.+)", releaserUrl)
# if res:
# return res[0]
# else:
# return ""
# elif "/u/" in releaserUrl:
# res = re.findall(r"/u/(.+)/", releaserUrl)
# if res:
# return res[0]
# else:
# return ""
#
#
# def wangyi_news(releaserUrl,**kwargs):
# if "/sub/" in releaserUrl:
# res = re.findall(r"/sub/(.+)\.html", releaserUrl)
# if res:
# return res[0]
# else:
# return None
# elif "video" in releaserUrl:
# res = re.findall(r"/list/(.+)/video", releaserUrl)
# if res:
# return res[0]
# else:
# return None
# elif "all" in releaserUrl:
# res = re.findall(r"/list/(.+)/all", releaserUrl)
# if res:
# return res[0]
# else:
# return None
#
# def xiaohongshu(releaserUrl,**kwargs):
# releaserUrl = releaserUrl.split("?")[0]
# res = re.findall(r"user/profile/(.*)", releaserUrl)
# if res:
# return res[0]
# else:
# return None
#
# plantform_func = {
# "toutiao": toutiao,
# "haokan": haokan,
# "腾讯视频": tengxunshipin,
# "new_tudou": new_tudou,
# "腾讯新闻": tencent_news,
# "miaopai": miaopai,
# "kwai": kwai,
# "网易新闻": wangyi_news,
# "抖音":douyin,
# "xiaohongshu":xiaohongshu
# }
#
#
# def get_releaser_id(platform=None, releaserUrl=None,is_qq=False):
# if platform and releaserUrl:
# if platform in plantform_func:
# func = plantform_func[platform]
# res = func(releaserUrl,is_qq=is_qq)
# try:
# if res:
# return res
# else:
# print(platform, releaserUrl, "can't git releaser_id")
# return None
# except:
# return None
# else:
# # print(plantform," not in target list")
# return None
#
#
# if __name__ == "__main__":
# # file = r'D:\work_file\发布者账号\SMG.csv'
# # with open(file, 'r')as f:
# # head = f.readline()
# # head_list = head.strip().split(',')
# # for i in f:
# # line_list = i.strip().split(',')
# # line_dict = dict(zip(head_list, line_list))
# # platform = line_dict['platform']
# # releaser = line_dict['releaser']
# # try:
# # releaserUrl = line_dict['releaserUrl']
# # if platform == 'new_tudou':
# # if releaserUrl[-2:] == '==':
# # releaserUrl = releaserUrl + '/videos'
# # line_dict['releaserUrl'] = releaserUrl
# # except:
# # pass
# # releaser_id = get_releaser_id(platform=platform, releaserUrl=releaserUrl)
# # print(platform, releaserUrl, releaser_id)
# releaser_id= get_releaser_id("腾讯新闻","https://r.inews.qq.com/getUserVideoList?chlid=5362294&page_time=&coral_uin=ec8bb1459b9d84100312bf035bb43cd4d0&coral_uid=&type=om&uid=7313ae71df9e5367&omgid=&trueVersion=5.8.00&qimei=287801615436009&devid=008796749793280&appver=23_android_5.8.00&qn-rid=9ec6d3f9-d341-4138-b4e2-6b2ed4b98b5b&qn-sig=891289f9217ec9623723c024dd00eaf5")
# print(releaser_id)
\ No newline at end of file
......@@ -34,8 +34,8 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
# rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
......
......@@ -21,7 +21,7 @@ from crawler.gm_upload.gm_upload import upload, upload_file
from selenium.webdriver import ActionChains
from selenium import webdriver
try:
from crawler_sys.framework.func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
......
#!/bin/bash
#sudo su - gmuser
#source /root/anaconda3/bin/activate
crawler-ops
#conda activate crawler_env
#/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5 > /data/log/crawler/write_task.log &
/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p douban -d 1 -proxies 5 > /data/log/crawler/write_task.log &
python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5 > /data/log/crawler/write_task.log &
#/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p douban -d 1 -proxies 5 > /data/log/crawler/write_task.log &
......@@ -281,6 +281,20 @@ def douban(releaserUrl,**kwargs):
releaser_id = re.findall(r"people/(.*)", releaserUrl)[0]
return releaser_id
def xiaohongshu(releaserUrl, **kwargs):
    """Return the part of a Xiaohongshu URL that follows ``user/profile/``.

    Args:
        releaserUrl: URL of the releaser's page; the query string (from the
            first ``?``) is discarded before matching.
        **kwargs: Accepted and ignored.

    Returns:
        The text after ``user/profile/`` (may be empty), or None when that
        marker is absent.
    """
    base_url, _, _ = releaserUrl.partition("?")
    hit = re.search(r"user/profile/(.*)", base_url)
    return hit.group(1) if hit else None
def zhihu(releaserUrl, **kwargs):
    """Extract the releaser id from a Zhihu ``people/<id>`` URL.

    Args:
        releaserUrl: URL of the releaser's page; the query string (from the
            first ``?``) is discarded before matching.
        **kwargs: Accepted and ignored.

    Returns:
        The path segment that directly follows ``people/``, or None when
        the URL has no ``people/<something>`` part (the original raised
        IndexError in that case).
    """
    path = releaserUrl.split("?")[0]
    match = re.search(r"people/(.+)", path)
    if match is None:
        return None
    # Keep only the first segment, e.g. "kokokou/posts" -> "kokokou".
    return match.group(1).split("/")[0]
plantform_func = {
"toutiao": toutiao,
......@@ -297,7 +311,9 @@ plantform_func = {
"weixin":weixin,
"weibo":weibo,
"pearvideo":pearvideo,
"douban":douban
"douban":douban,
"zhihu":zhihu,
"xiaohongshu":xiaohongshu
}
......@@ -335,4 +351,4 @@ if __name__ == "__main__":
# print(get_releaser_id(platform=platform,releaserUrl=releaserUrl))
# print(releaser_id)
# print(weibo("https://weibo.com/1656058115"))
\ No newline at end of file
print(zhihu("https://www.zhihu.com/people/kokokou/jkh?!23"))
\ No newline at end of file
......@@ -18,9 +18,11 @@ from write_data_into_es.func_get_releaser_id import get_releaser_id
import redis
import hashlib
hosts = '172.16.32.37'
hosts = '172.18.52.14'
port = 9200
es = Elasticsearch(hosts=hosts, port=port)
HTTP_AUTH = ("elastic", "gm_test")
es = Elasticsearch(hosts=hosts, port=port, http_auth=HTTP_AUTH)
# pool = redis.ConnectionPool(host='192.168.17.60', port=6379, db=2, decode_responses=True)
# rds = redis.Redis(connection_pool=pool)
......@@ -113,9 +115,6 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
if extra_dic:
line_dict.update(extra_dic)
# import pdb;
# pdb.set_trace()
# print(str(get_releaser_id(platform=platform, releaserUrl=releaserUrl)))
line_dict["releaser_id"] = get_releaser_id(platform=platform, releaserUrl=releaserUrl)
if line_dict["releaser_id"]:
......@@ -123,46 +122,12 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
else:
doc_id = platform + '_' + line_dict['releaser']
err_id_line += str(line + 2) + ","
find_exist = {
"query": {
"bool": {
"filter": [
{"term": {"_id": doc_id}}
]
}
}
}
if not extra_dic.get("project_tags"):
extra_dic.pop("project_tags", 0)
if not extra_dic.get("department_tags"):
extra_dic.pop("department_tags", 0)
# search_re = es.search(index='target_releasers', doc_type='doc', body=find_exist)
# if search_re['hits']['total'] > 0:
# search_source = search_re['hits']['hits'][0]['_source']
# # print(search_source)
# if search_source.get("project_tags"):
# try:
# # print(kwargs.get("extra_dic"))
# line_dict["project_tags"].extend(search_source.get("project_tags"))
# line_dict["project_tags"] = list(set(line_dict["project_tags"]))
# search_source.pop("project_tags", 0)
# except Exception as e:
# pass
# # print("project_tags error", e)
# if search_source.get("department_tags"):
# try:
# # print(kwargs.get("extra_dic"))
# line_dict["department_tags"].extend(search_source.get("department_tags"))
# line_dict["department_tags"] = list(set(line_dict["department_tags"]))
# search_source.pop("department_tags", 0)
# except Exception as e:
# pass
# # print("project_tags error", e)
# if update:
# line_dict.update(search_source)
# line_dict["post_time"] = search_source.get("post_time")
if line_dict.get("post_time"):
pass
else:
......@@ -182,24 +147,7 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
line_dict["project_tags"] = []
if not line_dict.get("department_tags"):
line_dict["department_tags"] = []
if line_dict.get("add_departments"):
line_dict["department_tags"].extend(line_dict.get("add_departments"))
line_dict["department_tags"] = list(set(line_dict["department_tags"]))
if line_dict.get("del_departments"):
for key in line_dict.get("del_departments"):
try:
line_dict["department_tags"].remove(key)
except:
continue
if line_dict.get("add_project_tags"):
line_dict["project_tags"].extend(line_dict.get("add_project_tags"))
line_dict["project_tags"] = list(set(line_dict["project_tags"]))
if line_dict.get("del_project_tags"):
for key in line_dict.get("del_project_tags"):
try:
line_dict["project_tags"].remove(key)
except:
continue
bulk_dic = {
"releaser": line_dict.get("releaser"),
"releaserUrl": line_dict.get("releaserUrl"),
......@@ -211,12 +159,11 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
"frequency": 3 if line_dict.get("project_tags") else 1,
"key_releaser": line_dict.get("key_releaser"),
"is_valid": line_dict.get("is_valid"),
"has_data": line_dict.get("has_data") if line_dict.get("has_data") else 0,
# "has_data": line_dict.get("has_data") if line_dict.get("has_data") else 0,
"project_tags": line_dict.get("project_tags"),
"department_tags": line_dict.get("department_tags"),
'timestamp': int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000),
'media_type': line_dict.get("media_type") if line_dict.get("media_type") else "",
'releaser_type': line_dict.get("releaser_type") if line_dict.get("releaser_type") else "",
}
......@@ -251,127 +198,13 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
if __name__ == "__main__":
data_list = [
{"releaserUrl": "https://weibo.com/u/1764615662", "releaser": "娱乐圈贵妃", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3662247177", "releaser": "捞娱君", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2378564111", "releaser": "娱乐扒皮", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2983578965", "releaser": "娱乐圈小青年", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3938976579", "releaser": "娱乐捞饭", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6511177474", "releaser": "小组吃瓜蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6343916471", "releaser": "圈内老顽童", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6511177474", "releaser": "八组吃瓜蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2921603920", "releaser": "娱乐圈新鲜事", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6470919752", "releaser": "伊丽莎白骨精啊", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1", "releaser": "娱乐榜姐",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3115996363?is_hot=1", "releaser": "娱乐星事", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place", "releaser": "星探扒皮",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3926129482", "releaser": "星闻追踪", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5509337969?is_hot=1", "releaser": "卦哥娱乐", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5477320351", "releaser": "圈内扒爷", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place", "releaser": "圈八戒 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6511173721", "releaser": "圈内课代表", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB#place", "releaser": "娱闻少女",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3193443435", "releaser": "圈太妹", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2022990945", "releaser": "圈内狙击手", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1809782810?is_all=1", "releaser": "全娱乐爆料", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5157190426?is_all=1", "releaser": "娱乐扒少", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2125613987?is_all=1", "releaser": "圈内一把手 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051948622644/home?from=page_100505&mod=TAB#place",
"releaser": "影视圈扒姐 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2611791490", "releaser": "娱评八公", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1652840683", "releaser": "追星", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5086098727?is_hot=1", "releaser": "闻娱教主", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5101787982?is_all=1", "releaser": "扒婆说", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5101844765?is_hot=1", "releaser": "星娱客 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052115034114/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐明星团 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6473952993?is_hot=1", "releaser": "偶像日报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5106602573?is_hot=1", "releaser": "八哥", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5909342713?", "releaser": "圈内教父", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3200673035?", "releaser": "扒圈老鬼", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005055965621313/home?from=page_100505&mod=TAB#place", "releaser": "圈内师爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1915749764?is_hot=1", "releaser": "迷妹速报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1002061836328652/home?from=page_100206&mod=TAB#place", "releaser": "前线娱乐",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5896207859?is_hot=1", "releaser": "娱记者", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5717515328?is_hot=1", "releaser": "娱老汉", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051795994180/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐News", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5978818414?is_hot=1", "releaser": "娱圈蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2489917511?is_hot=1", "releaser": "芒果捞扒婆 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5279487569?is_hot=1", "releaser": "娱姐速报 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5106602573?is_hot=1", "releaser": "八哥 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5323541229?profile_ftype=1&is_all=1#_0", "releaser": "国内外白富美揭秘 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1003062512591982/home?from=page_100306&mod=TAB#place", "releaser": "圈少爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2821843050?profile_ftype=1&is_all=1#_0", "releaser": "圈内老鬼",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3028215832?profile_ftype=1&is_all=1#_0", "releaser": "娱扒爷",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5336756846?profile_ftype=1&is_all=1#_0", "releaser": "兔兔热议",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005051844235935/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈外汉", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052586409491/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈吃瓜指南 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5255814135", "releaser": "八组兔区爆料", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2871033210?is_hot=1", "releaser": "八组兔区热议 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052813285937/home?from=page_100505&mod=TAB#place",
"releaser": "八组兔区娱乐圈", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052831749482/home?from=page_100505&mod=TAB#place",
"releaser": "八组兔区揭秘", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/2709814831", "releaser": "娱大蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5634795408", "releaser": "圈八戒", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5176743404", "releaser": "瓜瓜搬运机", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5039775130", "releaser": "娱乐揭秘蜀黍", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/7123521074", "releaser": "饭圈日报", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1746658980", "releaser": "饭圈阿姨", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052453653365/home?from=page_100505&mod=TAB#place", "releaser": "圈内星探",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/6311417880?profile_ftype=1&is_all=1#_0", "releaser": "星扒婆 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1420816495?profile_ftype=1&is_all=1#_0", "releaser": "娱尾纹",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1974754790", "releaser": "教父娱乐", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1818950785?refer_flag=1028035010_&is_hot=1", "releaser": "扒圈有鱼",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1893711543", "releaser": "娱乐有饭", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1002061653255165/home?from=page_100206&mod=TAB#place",
"releaser": "娱乐日爆社", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052391322817/home?from=page_100505&mod=TAB#place", "releaser": "小娱乐家",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1003061994712500/home?from=page_100306&mod=TAB#place",
"releaser": "星扒客push", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5700087877", "releaser": "毒舌八卦", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/3779202361", "releaser": "西皮娱乐", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1632619962", "releaser": "瓜组新鲜事", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052103460752/home?from=page_100505&mod=TAB#place", "releaser": "娱嬷嬷 ",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/5874584452", "releaser": "吃瓜鹅每日搬", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005052397961280/home?from=page_100505&mod=TAB#place", "releaser": "娱大白",
"platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005053246379064/home?from=page_100505&mod=TAB#place",
"releaser": "娱乐圈扒姐 ", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/u/1830483711", "releaser": "娱乐女记", "platform": "weibo"},
{"releaserUrl": "https://weibo.com/p/1005053847401640/home?from=page_100505&mod=TAB#place",
"releaser": "吃瓜爆料每日搬 ", "platform": "weibo"},
{"releaserUrl": "https://www.douban.com/people/hot_tag",
"releaser": "hot_tag", "platform": "douban"},
{"releaserUrl": "https://www.douban.com/people/new_tag",
"releaser": "new_tag", "platform": "douban"}
]
file=r"D:\work_file\gengmei\litao\temp.csv"
extra_dic = {
"department_tags":["策略组"],
"department_tags":["运营"],
'key_releaser': True,
'frequency': 3,
}
# csv_type = {"SMG": [], "an_hui": [], "ronghe": [], "su_zhou": []}
#ronghe_releaser_write_es(file, post_by="litao")
write_to_es(data_list, post_by="litao", extra_dic=extra_dic, push_to_redis=False)
write_to_es(file, post_by="litao", extra_dic=extra_dic, push_to_redis=False)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment