Commit 56c55b5a authored by litaolemo

update

parent b9ce66aa
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:47
# @File : revise_table_data_to_table.py
# @email : litao@igengmei.com
# @author : litao
import pymysql
def con_sql(sql):
    # Fetch data from a table in the database
"""
:type sql : str
:rtype : tuple
"""
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
db.close()
return result
sql = "slect * from xxx"
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 17:52:53 2018
@author: fangyucheng
"""
from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq
from crawler_sys.utils.output_results import output_result
from crawler_sys.utils import Metaorphosis as meta
from crawler_sys.utils.output_log import output_log
logging = output_log(page_category='video_page',
program_info='tencent')
def tran_input_data_to_lst(file_name, file_category='csv'):
if file_category == 'csv':
video_info_lst = meta.csv_to_lst_whth_headline(file_name)
url_lst = []
for line in video_info_lst:
try:
if line['data_provider'] == 'CCR':
url_lst.append(line['url'])
except:
pass
return url_lst
elif file_category == 'file':
url_lst = meta.str_file_to_lst(file_name)
return url_lst
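# Expected input, as a sketch based on the code above: in 'csv' mode each row
# must carry 'data_provider' and 'url' columns and only rows whose
# data_provider equals 'CCR' are kept; in 'file' mode the file is read as one
# URL per line via str_file_to_lst.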
url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file')
crawler = Crawler_v_qq()
get_video_page = crawler.video_page
def get_data_source(url_lst=url_lst,
output_to_file=False,
filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
output_es_index=None,
output_doc_type=None):
result_lst = []
for url in url_lst:
video_info = get_video_page(url=url)
result_lst.append(video_info)
logging.info('get_data at page %s' % url)
if len(result_lst) >= 100:
if output_es_index is not None and output_doc_type is not None:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
es_index=output_es_index,
doc_type=output_doc_type)
result_lst.clear()
else:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis)
result_lst.clear()
    if len(result_lst) != 0:
if output_es_index is not None and output_doc_type is not None:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
es_index=output_es_index,
doc_type=output_doc_type)
result_lst.clear()
else:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis)
result_lst.clear()
if __name__ == '__main__':
get_data_source(output_to_es_raw=True,
output_es_index='test2',
output_doc_type='fyc')
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:53
# @File : from_sparksql_to_mysql.py
# @email : litao@igengmei.com
# @author : litao
import hashlib
import json
import pymysql
import xlwt, datetime
import redis
# from pyhive import hive
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from elasticsearch_7 import Elasticsearch
from elasticsearch_7.helpers import scan
import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit
import pytispark.pytispark as pti
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
def con_sql(sql):
    # Fetch data from a table in the database
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
db.close()
return result
startTime = time.time()
sparkConf = SparkConf()
sparkConf.set("spark.sql.crossJoin.enabled", True)
sparkConf.set("spark.debug.maxToStringFields", "100")
sparkConf.set("spark.tispark.plan.allow_index_double_read", False)
sparkConf.set("spark.tispark.plan.allow_index_read", True)
sparkConf.set("spark.hive.mapred.supports.subdirectories", True)
sparkConf.set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True)
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.set("mapreduce.output.fileoutputformat.compress", False)
sparkConf.set("mapreduce.map.output.compress", False)
sparkConf.set("prod.gold.jdbcuri",
"jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true")
sparkConf.set("prod.mimas.jdbcuri",
"jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true")
sparkConf.set("prod.gaia.jdbcuri",
"jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true")
sparkConf.set("prod.tidb.jdbcuri",
"jdbc:mysql://172.16.40.158:4000/eagle?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.jerry.jdbcuri",
"jdbc:mysql://172.16.40.158:4000/jerry_prod?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.158:2379")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.170:4000")
sparkConf.set("prod.tidb.database", "jerry_prod")
spark = (SparkSession.builder.config(conf=sparkConf).config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
.config("spark.tispark.pd.addresses", "172.16.40.170:2379").appName(
"LR PYSPARK TEST").enableHiveSupport().getOrCreate())
spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
select_sql = """SELECT * FROM pm.tl_pm_contentpage_ctr"""
device_df = spark.sql(select_sql)
device_df.show(1, False)
sql_res = device_df.collect()
print("-----------------------------------------------------------------------------")
for res in sql_res:
day_id = res.day_id
device_os_type = res.device_os_type
active_type = res.active_type
grey_type = res.grey_type
page_name = res.page_name
content_pv = res.content_pv
content_uv = res.content_uv
wel_exp_pv = res.wel_exp_pv
content_exp_pv = res.content_exp_pv
wel_click_pv = res.wel_click_pv
content_click_pv = res.content_click_pv
slide_wel_click_pv = res.slide_wel_click_pv
self_wel_click_pv = res.self_wel_click_pv
partition_day = res.partition_day
    pid = hashlib.md5((day_id + device_os_type + active_type + grey_type + page_name).encode("utf-8")).hexdigest()
sql = """INSERT INTO conent_detail_page_grayscale_ctr(day_id,device_os_type,
active_type,grey_type,page_name,content_pv,content_uv,wel_exp_pv,content_exp_pv,wel_click_pv,content_click_pv,
wel_click_pv,content_click_pv,slide_wel_click_pv,partition_day,pid
) VALUES('{day_id}','{device_os_type}',
'{active_type}','{grey_type}','{page_name}',{content_pv},{content_uv},{wel_exp_pv},{content_exp_pv},{wel_click_pv},{content_click_pv},
{wel_click_pv},{content_click_pv},{slide_wel_click_pv},'{partition_day}','{pid})'""".format(
day_id=day_id,device_os_type=device_os_type,active_type=active_type,grey_type=grey_type,page_name=page_name,
content_pv=content_pv,content_uv=content_uv,wel_exp_pv=wel_exp_pv,content_exp_pv=content_exp_pv,wel_click_pv=wel_click_pv,
content_click_pv=content_click_pv,slide_wel_click_pv=slide_wel_click_pv,self_wel_click_pv=self_wel_click_pv,
partition_day=partition_day, pid=pid
)
    cursor.execute(sql)
    # cursor.executemany()
db.commit()
db.close()
# -*- coding:utf-8 -*-
# @Time : 2019/8/14 18:01
# @Author : litao
import json
# import argparse
import datetime
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from func_find_week_num import find_week_belongs_to
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format
from write_data_into_es.func_cal_doc_id import cal_doc_id
hosts = '192.168.17.11'
port = 80
user = 'zhouyujiang'
passwd = '8tM9JDN2LVxM'
http_auth = (user, passwd)
es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
# parser = argparse.ArgumentParser()
# parser.add_argument('-w', '--week_str', type=str, default=None)
def week_start_day(week_year, week_no, week_day, week_day_start=1):
year_week_start = find_first_day_for_given_start_weekday(week_year, week_day_start)
week_start = year_week_start + datetime.timedelta(days=(week_no - 1) * 7)
return week_start
def define_doc_type(week_year, week_no, week_day_start):
"""
    doc_type = 'daily-url-2018_w24_s2' means that Tuesday is selected as the
    first day of each week and the string refers to the 24th week of 2018.
    In the isocalendar definition, Monday is weekday 1, Tuesday is weekday 2,
    ..., Saturday is weekday 6 and Sunday is weekday 7.
"""
doc_type_str = 'daily-url-%d_w%02d_s%d' % (week_year, week_no, week_day_start)
return doc_type_str
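# Worked example of the naming scheme described in the docstring:
# define_doc_type(2018, 24, week_day_start=2) -> 'daily-url-2018_w24_s2',
# i.e. the 24th week of 2018 with Tuesday (weekday 2) as the first day of the week.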
def find_first_day_for_given_start_weekday(year, start_weekday):
i = 0
while i < 7:
dayDi = datetime.date(year, 1, 1) + datetime.timedelta(days=i)
if dayDi.weekday() == start_weekday:
cal_day1D = dayDi - datetime.timedelta(days=1)
break
else:
cal_day1D = None
i += 1
return cal_day1D
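# Behaviour sketch: the loop scans the first seven days of the year for the
# first date whose datetime.weekday() equals start_weekday (Monday == 0) and
# returns the day before it, e.g. find_first_day_for_given_start_weekday(2019, 1)
# matches Tuesday 2019-01-01 and returns Monday 2018-12-31.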
def get_target_releaser_video_info(platform,
releaserUrl,
log_file=None,
output_to_es_raw=True,
es_index=None,
doc_type=None,
releaser_page_num_max=100):
    if log_file is None:
log_file = open('error.log', 'w')
crawler = get_crawler(platform=platform)
crawler_initialization = crawler()
if platform == 'haokan':
try:
crawler_initialization.releaser_page(releaserUrl=releaserUrl,
releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=True,
es_index=es_index,
doc_type=doc_type,
fetchFavoriteCommnt=True)
except:
print(releaserUrl, platform, file=log_file)
else:
try:
crawler_initialization.releaser_page(releaserUrl=releaserUrl,
releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=True,
es_index=es_index,
doc_type=doc_type)
except:
print(releaserUrl, platform, file=log_file)
def func_search_reUrl_from_target_index(platform, releaser):
search_body = {
"query": {
"bool": {
"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"releaser.keyword": releaser}}
]
}
}
}
search_re = es.search(index='target_releasers', doc_type='doc', body=search_body)
if search_re['hits']['total'] > 0:
return search_re['hits']['hits'][0]['_source']['releaserUrl']
else:
        print('Cannot find:', platform, releaser)
return None
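# Usage sketch (the releaser name below is only a placeholder):
# releaser_url = func_search_reUrl_from_target_index('haokan', 'some_releaser')
# returns the releaserUrl stored in the target_releasers index for that
# platform/releaser pair, or None when no document matches.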
def func_write_into_weekly_index_new_released(line_list, doc_type, index='short-video-weekly'):
count = 0
bulk_all_body = ''
re_list = []
for line in line_list:
count = count + 1
weekly_net_inc_play_count = line['play_count']
weekly_net_inc_comment_count = line['comment_count']
weekly_net_inc_favorite_count = line['favorite_count']
weekly_cal_base = 'accumulate'
timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
line.update({
'timestamp': timestamp,
'weekly_cal_base': weekly_cal_base,
'weekly_net_inc_favorite_count': weekly_net_inc_favorite_count,
'weekly_net_inc_comment_count': weekly_net_inc_comment_count,
'weekly_net_inc_play_count': weekly_net_inc_play_count
})
re_list.append(line)
url = line['url']
platform = line['platform']
doc_id = cal_doc_id(platform, url=url, doc_id_type='all-time-url',data_dict=line)
bulk_head = '{"index": {"_id":"%s"}}' % doc_id
data_str = json.dumps(line, ensure_ascii=False)
bulk_one_body = bulk_head + '\n' + data_str + '\n'
#
bulk_all_body += bulk_one_body
if count % 500 == 0:
eror_dic = es.bulk(index=index, doc_type=doc_type,
body=bulk_all_body, request_timeout=200)
bulk_all_body = ''
if eror_dic['errors'] is True:
print(eror_dic['items'])
print(bulk_all_body)
print(count)
if bulk_all_body != '':
eror_dic = es.bulk(body=bulk_all_body,
index=index,
doc_type=doc_type,
request_timeout=200)
if eror_dic['errors'] is True:
print(eror_dic)
todayT = datetime.datetime.now()
# todayT=datetime.datetime(2019,2,5)
week_day_start = 1
# if args.week_str is None:
seven_days_ago_T = todayT - datetime.timedelta(days=7)
week_year, week_no, week_day = find_week_belongs_to(seven_days_ago_T,
week_day_start)
week_start = week_start_day(week_year, week_no, week_day)
re_s = week_start - datetime.timedelta(1)
re_s_dt = datetime.datetime.strptime(str(re_s), '%Y-%m-%d')
re_s_t = int(datetime.datetime.timestamp(re_s_dt) * 1000)
re_e = week_start + datetime.timedelta(6)
re_e_dt = datetime.datetime.strptime(str(re_e), '%Y-%m-%d')
re_e_t = int(datetime.datetime.timestamp(re_e_dt) * 1000)
# nowT_feihua = week_start + datetime.timedelta(days=6)
weekly_doc_type_name = define_doc_type(week_year, week_no,
week_day_start=week_day_start)
key_releaser_body = {
"query": {
"bool": {
"filter": [
{"term": {"key_releaser.keyword": "True"}}
]
}
}
}
releaser_re = scan(client=es, index='target_releasers', doc_type='doc',
query=key_releaser_body, scroll='3m')
for re in releaser_re:
releaser = re["_source"]['releaser']
platform = re["_source"]['platform']
    if releaser is not None:
re_list = []
search_body = {
"query": {
"bool": {
"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"releaser.keyword": releaser}},
{"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
{"range": {"fetch_time": {"gte": re_s_t}}}
]
}
}
}
scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url',
query=search_body, scroll='3m')
for one_scan in scan_re:
re_list.append(one_scan['_source'])
func_write_into_weekly_index_new_released(re_list, doc_type=weekly_doc_type_name)