Commit 1bbca574 authored by 张彦钊

delete feature file

parents 00b27814 c5834874
@@ -18,39 +18,65 @@ def con_sql(db,sql):
    return df
def multi_hot(df, column, n):
    df[column] = df[column].fillna("lost_na")
    app_list_value = [i.split(",") for i in df[column].unique()]
    app_list_unique = []
    for i in app_list_value:
        app_list_unique.extend(i)
    app_list_unique = list(set(app_list_unique))
    number = len(app_list_unique)
    app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
    df[column] = df[column].apply(app_list_func, args=(app_list_map,))
    return number, app_list_map
def get_data():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from {}".format(train_data_set)
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
    print(start)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,cut.time,dl.app_list " \
          "from {} e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id " \
          "where e.stat_date >= '{}'".format(train_data_set, start)
    df = con_sql(db, sql)
    # print(df.shape)
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                            11: "time", 12: "app_list"})
    print("esmm data ok")
    # print(df.head(2)
    print("before")
    print(df.shape)
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "time", "stat_date", "app_list"])
    print("after")
    print(df.shape)
    app_list_number, app_list_map = multi_hot(df, "app_list", 1)
    level2_number, level2_map = multi_hot(df, "clevel2_id", 1 + app_list_number)

    # df["app_list"] = df["app_list"].fillna("lost_na")
    # app_list_value = [i.split(",") for i in df["app_list"].unique()]
    # app_list_unique = []
    # for i in app_list_value:
    #     app_list_unique.extend(i)
    # app_list_unique = list(set(app_list_unique))
    # app_list_map = dict(zip(app_list_unique, list(range(1, len(app_list_unique) + 1))))
    # df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    unique_values = []
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
@@ -58,25 +84,15 @@ def get_data():
        # the line below is to distinguish identical values that appear in different columns
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))
for i in ["l1","l2"]:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
# the values in l1 and l2 belong to the same category space as top
df[i] = df[i]+"top"
unique_values.extend(list(df[i].unique()))
print("features:")
print(len(unique_values))
print(df.head(2))
    temp = list(range(1 + app_list_number + level2_number, 1 + app_list_number + level2_number + len(unique_values)))
    value_map = dict(zip(unique_values, temp))
    df = df.drop("device_id", axis=1)
    train = df[df["stat_date"] != validate_date + "stat_date"]
    test = df[df["stat_date"] == validate_date + "stat_date"]
    for i in ["ucity_id", "ccity_name", "device_type", "manufacturer",
              "channel", "top", "time", "stat_date"]:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)

@@ -88,7 +104,18 @@ def get_data():
    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)
    return validate_date, value_map, app_list_map, level2_map
def app_list_func(x, l):
    b = x.split(",")
    e = []
    for i in b:
        if i in l.keys():
            e.append(l[i])
        else:
            e.append(0)
    return ",".join([str(j) for j in e])
def write_csv(df, name, n):
@@ -102,44 +129,45 @@ def write_csv(df,name,n):
        temp.to_csv(path + name + "/{}_{}.csv".format(name, i), index=False)
def get_predict(date, value_map, app_list_map, level2_map):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time,dl.app_list " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top",
                            10: "device_id", 11: "cid_id", 12: "time", 13: "app_list"})
    df["stat_date"] = date
    print(df.head(6))

    df["app_list"] = df["app_list"].fillna("lost_na")
    df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    df["clevel2_id"] = df["clevel2_id"].fillna("lost_na")
    df["clevel2_id"] = df["clevel2_id"].apply(app_list_func, args=(level2_map,))

    # print("predict shape")
    # print(df.shape)
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i
for i in ["l1","l2"]:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
# the values in l1 and l2 belong to the same category space as top
df[i] = df[i]+"top"
    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)

    for i in ["ucity_id", "ccity_name", "device_type", "manufacturer",
              "channel", "top", "time", "stat_date"]:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO: categories not covered by value_map become NaN; fill with 0 for now and improve later
        native_pre[i] = native_pre[i].fillna(0)
@@ -151,19 +179,20 @@ def get_predict(date,value_map):
    print("native")
    print(native_pre.shape)
    print(native_pre.head())
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)
    print("nearby")
    print(nearby_pre.shape)
    print(nearby_pre.head())
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    train_data_set = "esmm_train_data"
    path = "/data/esmm/"
    date, value, app_list, level2 = get_data()
    get_predict(date, value, app_list, level2)
#coding=utf-8
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
import datetime
my_sender='gaoyazhe@igengmei.com'
my_pass = 'VCrKTui99a7ALhiK'
my_user1='gaoyazhe@igengmei.com'
my_user2='zhangyanzhao@igengmei.com'
def mail():
ret=True
try:
with open('/home/gmuser/esmm_data/submit.log') as f:
stat_data = f.read()
msg=MIMEText(stat_data,'plain','utf-8')
msg['From']=formataddr(["高雅喆",my_sender])
msg['To']=my_user1 + ',' + my_user2
msg['Subject']= str(datetime.date.today())+"-esmm多目标模型训练指标统计"
server=smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
server.login(my_sender, my_pass)
server.sendmail(my_sender,[my_user1,my_user2],msg.as_string())
server.quit()
except Exception:
ret=False
return ret
ret=mail()
if ret:
print("邮件发送成功")
else:
print("邮件发送失败")
\ No newline at end of file
#! /bin/bash
cd /srv/apps/ffm-baseline/eda/esmm
git checkout master
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm/Model_pipline
DATA_PATH=/data/esmm
echo "start time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "rm leave tfrecord" echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/* rm ${DATA_PATH}/tr/*
rm ${DATA_PATH}/va/* rm ${DATA_PATH}/va/*
rm ${DATA_PATH}/native/* rm ${DATA_PATH}/native/*
rm ${DATA_PATH}/nearby/* rm ${DATA_PATH}/nearby/*
rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/201* rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/20*
echo "data2ffm"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
echo "data"
${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/feature.log
all_sample=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$2$3$4}' | sort | uniq | wc -l`))
uniq_feat=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$4}' | sort | uniq -u | wc -l`))
repe_feat=$((all_sample-uniq_feat))
echo "Bayes Error Rate": $((repe_feat*100/all_sample))%
echo "split data"
split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/va.csv`/5)) ${DATA_PATH}/va.csv -d -a 4 ${DATA_PATH}/va/va_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/native.csv`/15)) ${DATA_PATH}/native.csv -d -a 4 ${DATA_PATH}/native/native_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/nearby.csv`/5)) ${DATA_PATH}/nearby.csv -d -a 4 ${DATA_PATH}/nearby/nearby_ --additional-suffix=.csv
echo "csv to tfrecord" echo "csv to tfrecord"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/va/ --output_dir=${DATA_PATH}/va/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/va/ --output_dir=${DATA_PATH}/va/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/native/ --output_dir=${DATA_PATH}/native/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/native/ --output_dir=${DATA_PATH}/native/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/nearby/ --output_dir=${DATA_PATH}/nearby/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/nearby/ --output_dir=${DATA_PATH}/nearby/
cat ${DATA_PATH}/tr/*.tfrecord > ${DATA_PATH}/tr/tr.tfrecord cat ${DATA_PATH}/tr/*.tfrecord > ${DATA_PATH}/tr/tr.tfrecord
cat ${DATA_PATH}/va/*.tfrecord > ${DATA_PATH}/va/va.tfrecord cat ${DATA_PATH}/va/*.tfrecord > ${DATA_PATH}/va/va.tfrecord
...@@ -49,35 +30,17 @@ rm ${DATA_PATH}/va/va_* ...@@ -49,35 +30,17 @@ rm ${DATA_PATH}/va/va_*
rm ${DATA_PATH}/native/native_* rm ${DATA_PATH}/native/native_*
rm ${DATA_PATH}/nearby/nearby_* rm ${DATA_PATH}/nearby/nearby_*
echo "data transform time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "train..." echo "train..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "train time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "infer native..." echo "infer native..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/native_infer.log
echo "infer nearby..." echo "infer nearby..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/nearby_infer.log
echo "sort and 2sql" echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/sort_and_2sql.py ${PYTHON_PATH} ${MODEL_PATH}/to_database.py > ${DATA_PATH}/insert_database.log
echo "infer and sort and 2sql time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/send_mail.py
\ No newline at end of file
@@ -3,7 +3,6 @@
from sqlalchemy import create_engine
import pandas as pd
import pymysql
import MySQLdb
import time


def con_sql(sql):
@@ -19,31 +18,41 @@ def con_sql(sql):
    return result
def nearby_set_join(lst):
    # return ','.join([str(i) for i in list(lst)])
    return ','.join([str(i) for i in lst.unique().tolist()])


def native_set_join(lst):
    l = lst.unique().tolist()
    d = int(len(l) / 2)
    if d == 0:
        d = 1
    r = [str(i) for i in l]
    r = r[:d]
    return ','.join(r)
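# Illustrative difference between the two helpers above (values made up):
#   cids = pd.Series([16215222, 16204965, 15361235, 16204965])
#   nearby_set_join(cids)  -> "16215222,16204965,15361235"   (all unique ids, in order)
#   native_set_join(cids)  -> "16215222"                     (first int(3/2) == 1 unique id)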
def main():
    # native queue
    df2 = pd.read_csv('/data/esmm/native.csv')
    df2['cid_id'] = df2['cid_id'].astype(str)
    df1 = pd.read_csv("/data/esmm/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
    df3.columns = ["device_id", "city_id", "native_queue"]
    print("native_device_count", df3.shape)

    # nearby queue
    df2 = pd.read_csv('/data/esmm/nearby.csv')
    df2['cid_id'] = df2['cid_id'].astype(str)
    df1 = pd.read_csv("/data/esmm/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
    df4.columns = ["device_id", "city_id", "nearby_queue"]
    print("nearby_device_count", df4.shape)
@@ -55,8 +64,6 @@ def main():
    df_all["time"] = ctime
    print("union_device_count", df_all.shape)

    host='10.66.157.22'
    port=4000
    user='root'
@@ -65,11 +72,11 @@ def main():
    charset='utf8'
    engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db))
    try:
        # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
        df_merge = df_all['device_id'] + df_all['city_id']
        df_merge_str = (str(list(df_merge.values))).strip('[]')
        delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
        con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        cur = con.cursor()
@@ -79,7 +86,7 @@ def main():
    except Exception as e:
        print(e)
    print("done")


if __name__ == '__main__':
    main()
\ No newline at end of file
#coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import sys
import os
import glob
import tensorflow as tf
import numpy as np
import re
from multiprocessing import Pool as ThreadPool
flags = tf.app.flags
FLAGS = flags.FLAGS
LOG = tf.logging
tf.app.flags.DEFINE_string("input_dir", "./", "input dir")
tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
tf.app.flags.DEFINE_integer("threads", 16, "threads num")
def gen_tfrecords(in_file):
    basename = os.path.basename(in_file) + ".tfrecord"
    out_file = os.path.join(FLAGS.output_dir, basename)
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
    df = pd.read_csv(in_file)
    for i in range(df.shape[0]):
        feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                 "channel", "top", "time", "stat_date"]
        id = np.array([])
        for j in feats:
            id = np.append(id, df[j][i])
        app_list = np.array(str(df["app_list"][i]).split(","))
        level2_list = np.array(str(df["clevel2_id"][i]).split(","))
        features = tf.train.Features(feature={
            "y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
            "z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
            "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
            "app_list": tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
            "level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int)))
        })
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        tfrecord_out.write(serialized)
    tfrecord_out.close()
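# Spot-check sketch (assumes the same TF 1.x API used above; the file name is hypothetical):
#   for serialized in tf.python_io.tf_record_iterator("tr_0000.csv.tfrecord"):
#       example = tf.train.Example.FromString(serialized)
#       print(example.features.feature["ids"].int64_list.value)       # 8 fixed-length ids
#       print(example.features.feature["app_list"].int64_list.value)  # variable-length tag ids
#       break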
def main(_):
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
    print("total files: %d" % len(file_list))
    pool = ThreadPool(FLAGS.threads)  # Sets the pool size
    pool.map(gen_tfrecords, file_list)
    pool.close()
    pool.join()
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
\ No newline at end of file
@@ -53,9 +53,11 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
        features = {
            "y": tf.FixedLenFeature([], tf.float32),
            "z": tf.FixedLenFeature([], tf.float32),
            "ids": tf.FixedLenFeature([FLAGS.field_size], tf.int64),
            "app_list": tf.VarLenFeature(tf.int64),
            "level2_list": tf.VarLenFeature(tf.int64)
        }
        parsed = tf.parse_single_example(record, features)
        y = parsed.pop('y')
        z = parsed.pop('z')
@@ -98,15 +100,9 @@ def model_fn(features, labels, mode, params):
    # ------ build weights ------
    Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer())

    feat_ids = features['ids']
    app_list = features['app_list']
    level2_list = features['level2_list']
#feat_vals = features['feat_vals']
#{User multi-hot}
#{Ad}
#{X multi-hot}
#x_intids = features['x_intids']
#x_intvals = features['x_intvals']
    if FLAGS.task_type != "infer":
        y = labels['y']
@@ -114,10 +110,13 @@ def model_fn(features, labels, mode, params):
    # ------ build f(x) ------
    with tf.variable_scope("Shared-Embedding-layer"):
        embedding_id = tf.nn.embedding_lookup(Feat_Emb, feat_ids)
        app_id = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=app_list, sp_weights=None, combiner="sum")
        level2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level2_list, sp_weights=None, combiner="sum")
        # x_concat = tf.reshape(embedding_id, shape=[-1, common_dims])  # None * (F * K)
        x_concat = tf.concat([tf.reshape(embedding_id, shape=[-1, common_dims]), app_id, level2], axis=1)

    with tf.name_scope("CVR_Task"):
        if mode == tf.estimator.ModeKeys.TRAIN:
@@ -348,20 +347,6 @@ def main(_):
                fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr']))
    elif FLAGS.task_type == 'export':
        print("Not Implemented, Do It Yourself!")
#feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
#feature_spec = {
# 'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]),
# 'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size])
#}
#serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
#feature_spec = {
# 'feat_ids': tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.field_size], name='feat_ids'),
# 'feat_vals': tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.field_size], name='feat_vals')
#}
#serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
#Estimator.export_savedmodel(FLAGS.servable_model_dir, serving_input_receiver_fn)
if __name__ == "__main__": if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
......
@@ -2,8 +2,9 @@ package com.gmei

import java.io.Serializable
import java.time.LocalDate

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
@@ -30,8 +31,8 @@ object EsmmData {
      .text(s"the databases environment you used")
      .action((x, c) => c.copy(env = x))
    opt[String]("date")
      .text(s"the date you used")
      .action((x, c) => c.copy(date = x))
    note(
      """
        |For example, the following command runs this app on a tidb dataset:
@@ -69,15 +70,15 @@ object EsmmData {
    if (max_stat_date_str != param.date){
      val stat_date = param.date
      println(stat_date)
      // val imp_data = sc.sql(
      //   s"""
      //     |select distinct stat_date,device_id,city_id as ucity_id,
      //     |  cid_id,diary_service_id
      //     |from data_feed_exposure
      //     |where cid_type = 'diary'
      //     |and stat_date ='${stat_date}'
      //   """.stripMargin
      // )
      val imp_data = sc.sql(
        s"""
@@ -90,8 +91,8 @@ object EsmmData {
        """.stripMargin
      )
      // imp_data.show()
      println("imp_data.count()")
      println(imp_data.count())
      val clk_data = sc.sql(
@@ -104,8 +105,8 @@ object EsmmData {
        """.stripMargin
      )
      // clk_data.show()
      println("clk_data.count()")
      println(clk_data.count())
@@ -138,8 +139,9 @@ object EsmmData {
      // println(cvr_data_filter.count())
      val other_click = get_other_click(sc, stat_date_not)
      val all_click = clk_data.union(other_click)
      val clk_data_filter = all_click.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
      // clk_data_filter.createOrReplaceTempView("clk_data_filter")
      // clk_data_filter.show()
      // println("clk_data_filter.count()")
@@ -220,10 +222,10 @@ object EsmmData {
          |group by device_id,cid_id
        """.stripMargin
      )
      union_data_scity_id2.persist()
      GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_train_data",SaveMode.Append)
      GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_train_data",SaveMode.Append)
      union_data_scity_id2.unpersist()

    } else {
      println("esmm_train_data already have param.date data")
    }
@@ -232,6 +234,103 @@ object EsmmData {
    }
  }
def get_other_click(spark:SparkSession,yesterday:String): DataFrame ={
var result01 = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params['business_id'] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['tab_name'] != '精选'
|and params['page_name'] = 'home'
""".stripMargin
)
// println(result01.count())
// result01.show(6)
val recommend = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params["business_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'diarybook_detail_click_recommend_block' and params["business_type"] = "diary"
""".stripMargin
)
// println("详情页推荐日记:")
// println(recommend.count())
// recommend.show(6)
val search_zonghe = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,city_id,params["business_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'search_result_click_infomation_item' and params["business_type"] = "diary"
""".stripMargin
)
// println("搜索综合:")
// println(search_zonghe.count())
// search_zonghe.show(6)
val non_home = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,city_id,params["diary_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['page_name'] != 'home'
""".stripMargin
)
// println("non home:")
// println(non_home.count())
// non_home.show(6)
result01 = result01.union(recommend).union(search_zonghe).union(non_home)
// println(result01.count())
result01.createOrReplaceTempView("temp_result")
val result02 = spark.sql(
s"""
|select * from temp_result
|where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
| and device_id not in
| (SELECT cl_id
| FROM online.ml_hospital_spam_pv_day
| WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| UNION ALL
| SELECT cl_id
| FROM online.ml_hospital_spam_pv_month
| WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| )
""".stripMargin
)
result02.createOrReplaceTempView("temp_result02")
val result_dairy = spark.sql(
s"""
|select
| re.stat_date as stat_date,
| re.device_id as device_id,
| re.city_id as ucity_id,
| re.cid as cid_id,
| da.service_id as diary_service_id
|from temp_result02 re
|left join online.ml_community_diary_updates da
|on re.cid = da.diary_id
|where da.partition_date='${yesterday}'
""".stripMargin
).distinct()
result_dairy
}
}

@@ -279,11 +378,20 @@ object EsmmPredData {
    ti.tidbMapTable("eagle","search_queue")
    ti.tidbMapTable(dbName = "jerry_test",tableName = "esmm_train_data")
    ti.tidbMapTable("eagle","biz_feed_diary_queue")
    ti.tidbMapTable("jerry_prod","data_feed_exposure_precise")

    import sc.implicits._
    val yesteday_have_seq = GmeiConfig.getMinusNDate(1)
val target_user = sc.sql(
s"""
|select concat(t.device_id,",",t.city_id) from
|(select distinct device_id,city_id
|from data_feed_exposure where stat_date='${yesteday_have_seq}') t
""".stripMargin).collect().map(x => x(0).toString)
println("target_user",target_user.length)
    //nearby_data
    val raw_data = sc.sql(
      s"""
@@ -293,21 +401,50 @@ object EsmmPredData {
        |select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from ffm_diary_queue
        |union
        |select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1
      """.stripMargin)
    // raw_data.show()
    val raw_data1 = raw_data.rdd.groupBy(_.getAs[String]("device_city"))
      .filter(x => target_user.indexOf(x._1) != -1)
      .map {
        case (device_city, cid_data) =>
          val device_id = Try(device_city.split(",")(0)).getOrElse("")
          val city_id = Try(device_city.split(",")(1)).getOrElse("")
          val cids = Try(cid_data.toSeq.map(_.getAs[String]("merge_queue").split(",")).flatMap(_.zipWithIndex).sortBy(_._2).map(_._1).distinct.take(500).mkString(",")).getOrElse("")
          (device_id, city_id, s"$cids")
      }.filter(_._3 != "").toDF("device_id","city_id","merge_queue")
    // println("nearby_device_count",raw_data1.count())
val start= LocalDate.now().minusDays(14).toString
import sc.implicits._
val sql =
s"""
|select distinct device_id,cid_id from data_feed_exposure_precise
|where stat_date >= "$start" and cid_type = "diary"
""".stripMargin
val history = sc.sql(sql).repartition(200).rdd
.map(x =>(x(0).toString,x(1).toString)).groupByKey().map(x => (x._1,x._2.mkString(",")))
.toDF("device_id","cid_set")
history.persist()
history.createOrReplaceTempView("history")
if (history.take(1).nonEmpty){
raw_data1.createOrReplaceTempView("r")
val sql_nearby_filter =
s"""
|select r.device_id,r.city_id,r.merge_queue,history.cid_set from r
|left join history on r.device_id = history.device_id
""".stripMargin
val df = sc.sql(sql_nearby_filter).na.fill("").rdd
.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
.map(x => (x._1,x._2,x._3.split(",").diff(x._4.split(",")).mkString(",")))
.toDF("device_id","city_id","merge_queue")
df.createOrReplaceTempView("raw_data1")
}else{
raw_data1.createOrReplaceTempView("raw_data1")
}
    val raw_data2 = sc.sql(
      s"""
@@ -315,7 +452,7 @@ object EsmmPredData {
      """.stripMargin
    ).withColumn("label",lit(1))
    raw_data2.createOrReplaceTempView("raw_data2")
    // println("nearby_explode_count",raw_data2.count())

    // native_data
@@ -327,8 +464,26 @@ object EsmmPredData {
        |where a.stat_date='${yesteday_have_seq}' and b.native_queue != ""
      """.stripMargin
    )
    // println("native_device_count",native_data.count())
if (history.take(1).nonEmpty){
native_data.createOrReplaceTempView("temp")
val sql_native_filter =
s"""
|select t.device_id,t.city_id,t.native_queue,history.cid_set from temp t
|left join history on t.device_id = history.device_id
""".stripMargin
val df = sc.sql(sql_native_filter).na.fill("").rdd
.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
.map(x => (x._1,x._2,x._3.split(",").diff(x._4.split(",")).mkString(",")))
.toDF("device_id","city_id","native_queue")
df.createOrReplaceTempView("native_data")
}else{
native_data.createOrReplaceTempView("native_data")
}
history.unpersist()
val native_data1 = sc.sql( val native_data1 = sc.sql(
s""" s"""
...@@ -336,9 +491,7 @@ object EsmmPredData { ...@@ -336,9 +491,7 @@ object EsmmPredData {
""".stripMargin """.stripMargin
).withColumn("label",lit(0)) ).withColumn("label",lit(0))
native_data1.createOrReplaceTempView("native_data1") native_data1.createOrReplaceTempView("native_data1")
println("native_explode_count",native_data1.count()) // println("native_explode_count",native_data1.count())
//union //union
val union_data = sc.sql( val union_data = sc.sql(
...@@ -349,7 +502,7 @@ object EsmmPredData { ...@@ -349,7 +502,7 @@ object EsmmPredData {
""".stripMargin """.stripMargin
) )
union_data.createOrReplaceTempView("raw_data") union_data.createOrReplaceTempView("raw_data")
println("union_count",union_data.count()) // println("union_count",union_data.count())
//join feat //join feat
...@@ -364,8 +517,8 @@ object EsmmPredData { ...@@ -364,8 +517,8 @@ object EsmmPredData {
|where b.partition_date = '${yesteday}' |where b.partition_date = '${yesteday}'
""".stripMargin """.stripMargin
) )
// sid_data.show() // sid_data.show()
println(sid_data.count()) // println(sid_data.count())
val sid_data_label = sid_data.withColumn("y",lit(0)).withColumn("z",lit(0)) val sid_data_label = sid_data.withColumn("y",lit(0)).withColumn("z",lit(0))
sid_data_label.createOrReplaceTempView("union_data") sid_data_label.createOrReplaceTempView("union_data")
...@@ -413,10 +566,29 @@ object EsmmPredData { ...@@ -413,10 +566,29 @@ object EsmmPredData {
union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name") union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name")
// union_data_ccity_name.show() // union_data_ccity_name.show()
val jdbcDF = sc.read
.format("jdbc")
.option("driver", "com.mysql.jdbc.Driver")
.option("url", "jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com:3306/zhengxing")
.option("dbtable", "api_punishment")
.option("user", "work")
.option("password", "BJQaT9VzDcuPBqkd")
.load()
jdbcDF.createOrReplaceTempView("api_punishment")
val now = LocalDate.now().toString
val punish_doctor = sc.sql(
s"""
|select doctor_id from api_punishment
|where end_time > '$now'
""".stripMargin).collect().map(x => x(0).toString).distinct
println("punish_doctor")
println(punish_doctor.length)
    val union_data_scity_id = sc.sql(
      s"""
        |select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.label,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,a.ccity_name,
        | d.city_id as scity_id,b.doctor_id,c.hospital_id
        |from union_data_ccity_name a
        |left join online.tl_meigou_service_view b on a.diary_service_id=b.id
        |left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id
@@ -424,24 +596,28 @@ object EsmmPredData {
        |where b.partition_date='${yesteday}'
        |and c.partition_date='${yesteday}'
        |and d.partition_date='${yesteday}'
        |and b.doctor_id not in (${punish_doctor.map(x => s"'$x'").mkString(",")})
      """.stripMargin
    )
    union_data_scity_id.createOrReplaceTempView("union_data_scity_id")

    val union_data_scity_id2 = sc.sql(
      s"""
        |select device_id,cid_id,first(stat_date) stat_date,ucity_id,first(label) label,first(diary_service_id) diary_service_id,first(y) y,
        |first(z) z,first(clevel1_id) clevel1_id,first(slevel1_id) slevel1_id,first(ccity_name) ccity_name,
        |first(scity_id) scity_id,first(hospital_id) hospital_id
        |from union_data_scity_id
        |group by device_id,ucity_id,cid_id
      """.stripMargin
    )
    // union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
    // println(union_data_scity_id2.count())
    union_data_scity_id2.persist()
    GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_pre_data",SaveMode.Overwrite)
    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_pre_data",SaveMode.Overwrite)
    union_data_scity_id2.unpersist()
@@ -497,9 +673,9 @@ object GetDiaryPortrait {
    val diary_tag = sc.sql(
      s"""
        |select c.diary_id,
        |   concat_ws(',',collect_set(cast(c.level1_id as string))) as level1_ids,
        |   concat_ws(',',collect_set(cast(c.level2_id as string))) as level2_ids,
        |   concat_ws(',',collect_set(cast(c.level3_id as string))) as level3_ids from
        |   (select a.diary_id,b.level1_id,b.level2_id,b.level3_id
        |   from online.tl_hdfs_diary_tags_view a
        |   left join online.bl_tag_hierarchy_detail b
@@ -509,10 +685,17 @@ object GetDiaryPortrait {
        |   group by c.diary_id
      """.stripMargin
    )
    // diary_tag.show()
    // println(diary_tag.count())
diary_tag.createOrReplaceTempView("t")
val result = sc.sql(
s"""
|select diary_id,level1_ids,level2_ids,level3_ids,split(level2_ids,",")[0] as level2 from t
""".stripMargin
)
val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
    GmeiConfig.writeToJDBCTable(jdbc, result, "diary_feat", SaveMode.Overwrite)

    sc.stop()
@@ -589,19 +772,19 @@ object GetDevicePortrait {
    device_search_tag.createOrReplaceTempView("tag_count")

    val max_count_tag = sc.sql(
      s"""
        |select a.device_id,a.stat_date,a.level1_id as max_level1_id,a.level1_count as max_level1_count
        |from tag_count a
        |inner join
        |(select device_id,max(level1_count) as max_count from tag_count group by device_id) b
        |on a.level1_count = b.max_count and a.device_id = b.device_id
      """.stripMargin
    )
    //  .rdd.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
    // max_count_tag.foreachPartition(GmeiConfig.updateDeviceFeat)
    //
    // max_count_tag.take(10).foreach(println)
    // println(max_count_tag.count())

    //drop duplicates
    val max_count_tag_rdd = max_count_tag.rdd.groupBy(_.getAs[String]("device_id")).map {
@@ -669,7 +852,7 @@ object GetLevelCount {
    import sc.implicits._
    val stat_date = GmeiConfig.getMinusNDate(1).replace("-","")
    // val diary_queue = sc.read.json(param.path).rdd.map(x => x(0).toString).distinct().collect().toList.mkString(",")
    val diary_queue = "16215222,16204965,15361235,16121397,16277565,15491159,16299587,16296887,15294642,16204934,15649199,16122580,16122580,16122580,16122580,16122580,16122580"
    val diary_level1 = sc.sql(
      s"""
@@ -804,4 +987,298 @@ object GetDeviceDuration {
    }
  }
}
\ No newline at end of file
object EsmmDataTest {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = GmeiConfig.getMinusNDate(1)
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("EsmmData")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String]("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.EsmmData ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "eagle",tableName = "src_mimas_prod_api_diary_tags")
ti.tidbMapTable(dbName = "eagle",tableName = "src_zhengxing_api_tag")
ti.tidbMapTable(dbName = "jerry_test",tableName = "esmm_click")
ti.tidbMapTable(dbName = "jerry_prod",tableName = "data_feed_exposure_precise")
ti.tidbMapTable(dbName = "jerry_test", tableName = "train_data")
click(sc)
val max_stat_date = sc.sql(
s"""
|select max(stat_date) from train_data
""".stripMargin
)
val max_stat_date_str = max_stat_date.collect().map(s => s(0).toString).head
println("max_stat_date_str",max_stat_date_str)
println("param.date",param.date)
if (max_stat_date_str != param.date || max_stat_date_str == null){
val stat_date = param.date
println(stat_date)
// val imp_data = sc.sql(
// s"""
// |select distinct stat_date,device_id,city_id as ucity_id,
// | cid_id,diary_service_id
// |from data_feed_exposure
// |where cid_type = 'diary'
// |and stat_date ='${stat_date}'
// """.stripMargin
// )
val imp_data = sc.sql(
s"""
|select * from
|(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from data_feed_exposure_precise
|where cid_type = 'diary'
|and stat_date ='${stat_date}'
|group by stat_date,device_id,city_id,cid_id,diary_service_id) a
""".stripMargin
)
// imp_data.show()
// println("imp_data.count()")
// println(imp_data.count())
val clk_data = sc.sql(
s"""
|select distinct stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from esmm_click
|where stat_date ='${stat_date}'
""".stripMargin
)
// clk_data.show()
// println("clk_data.count()")
// println(clk_data.count())
val imp_data_filter = imp_data.except(clk_data).withColumn("y",lit(0)).withColumn("z",lit(0))
// imp_data_filter.createOrReplaceTempView("imp_data_filter")
// imp_data_filter.show()
// println("imp_data_filter.count()")
// println(imp_data_filter.count())
val stat_date_not = stat_date.replace("-","")
val cvr_data = sc.sql(
s"""
|select distinct
| from_unixtime(unix_timestamp(partition_date ,'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
| cl_id as device_id,city_id as ucity_id,
| params["referrer_id"] as cid_id,params["business_id"] as diary_service_id
|from online.tl_hdfs_maidian_view
|where action='page_view'
|and partition_date ='${stat_date_not}'
|and params['page_name'] = 'welfare_detail'
|and params['referrer'] = 'diary_detail'
""".stripMargin
)
val cvr_data_filter = cvr_data.withColumn("y",lit(1)).withColumn("z",lit(1))
// cvr_data_filter.createOrReplaceTempView("cvr_data_filter")
// cvr_data_filter.show()
// println("cvr_data_filter.count()")
// println(cvr_data_filter.count())
val clk_data_filter =clk_data.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
// clk_data_filter.createOrReplaceTempView("clk_data_filter")
// clk_data_filter.show()
// println("clk_data_filter.count()")
// println(clk_data_filter.count())
val union_data = imp_data_filter.union(clk_data_filter).union(cvr_data_filter)
union_data.createOrReplaceTempView("union_data")
// union_data.show()
// println("union_data.count()")
// println(union_data.count())
val union_data_clabel = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,
| c.level1_id as clevel1_id
|from union_data a
|left join online.tl_hdfs_diary_tags_view b on a.cid_id=b.diary_id
|left join online.bl_tag_hierarchy_detail c on b.tag_id=c.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_clabel.createOrReplaceTempView("union_data_clabel")
// union_data_clabel.show()
val union_data_slabel = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,
| c.level1_id as slevel1_id
|from union_data_clabel a
|left join online.tl_meigou_servicetag_view b on a.diary_service_id=b.service_id
|left join online.bl_tag_hierarchy_detail c on b.tag_id=c.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_slabel.createOrReplaceTempView("union_data_slabel")
// union_data_slabel.show()
val union_data_ccity_name = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,
| c.name as ccity_name
|from union_data_slabel a
|left join src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
|left join src_zhengxing_api_tag c on b.tag_id=c.id
| where c.tag_type=4
""".stripMargin
)
union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name")
// union_data_ccity_name.show()
val union_data_scity_id = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,a.ccity_name,
| d.city_id as scity_id
|from union_data_ccity_name a
|left join online.tl_meigou_service_view b on a.diary_service_id=b.id
|left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id
|left join online.tl_hdfs_hospital_view d on c.hospital_id=d.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
|and d.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
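    // scity_id (the service's city) is resolved through the chain service -> doctor -> hospital -> hospital.city_id.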
union_data_scity_id.show()
val union_data_scity_id2 = sc.sql(
s"""
|select device_id,cid_id,first(stat_date) stat_date,first(ucity_id) ucity_id,first(diary_service_id) diary_service_id,first(y) y,
|first(z) z,first(clevel1_id) clevel1_id,first(slevel1_id) slevel1_id,first(ccity_name) ccity_name,first(scity_id) scity_id
|from union_data_scity_id
|group by device_id,cid_id
""".stripMargin
)
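    // Grouping by (device_id, cid_id) and taking first(...) of the remaining columns keeps one row per
    // device/diary pair, collapsing the duplicates introduced by the multi-valued tag joins above.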
GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="train_data",SaveMode.Append)
} else {
println("train_data already have param.date data")
}
sc.stop()
}
}
def click(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val stat_yesterday = LocalDate.now().minusDays(1).toString
val max_stat_date = spark.sql(
s"""
|select max(stat_date) from esmm_click
""".stripMargin
)
val max = max_stat_date.collect().map(s => s(0).toString).head
println("max_stat_date",max)
if (max != stat_yesterday || max == null){
val result01 = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params['diary_id'] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['tab_name'] = '精选'
|and params['page_name'] = 'home'
""".stripMargin
)
result01.createOrReplaceTempView("temp_result")
val result02 = spark.sql(
s"""
|select * from temp_result
|where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
| and device_id not in
| (SELECT cl_id
| FROM online.ml_hospital_spam_pv_day
| WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| UNION ALL
| SELECT cl_id
| FROM online.ml_hospital_spam_pv_month
| WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| )
""".stripMargin
)
result02.createOrReplaceTempView("temp_result02")
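    // temp_result02 drops clicks coming from the channel blacklist above and from devices flagged as spam in
    // ml_hospital_spam_pv_day / ml_hospital_spam_pv_month (pv_ratio >= 0.95).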
val result_dairy = spark.sql(
s"""
|select
| re.stat_date as stat_date,
| re.device_id as device_id,
| re.device_type as device_type,
| re.cid as cid_id,
| re.city_id as city_id,
| da.service_id as diary_service_id
|from temp_result02 re
|left join online.ml_community_diary_updates da
|on re.cid = da.diary_id
|where da.partition_date='${yesterday}'
""".stripMargin
)
val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbcuri,result_dairy, table="esmm_click",SaveMode.Append)
println("data insert")
}else{
println("data already exists")
}
}
}
...@@ -53,6 +53,11 @@ object GmeiConfig extends Serializable {
      .enableHiveSupport()
      .getOrCreate()
    spark.sql("use online")
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
    spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
    spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
    val context = SparkContext.getOrCreate(sparkConf)
    (context, spark)
  }
...@@ -65,10 +70,15 @@ object GmeiConfig extends Serializable {
    prop.put("isolationLevel", "NONE")
    prop.put("truncate", "true")
    // save to mysql/tidb
    try {
      df.repartition(128).write.mode(saveModel)
        .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, 300)
        .jdbc(jdbcuri, table, prop)
      print("写入成功")
    } catch {
      case _ => println("没有写入成功")
    }
  }
...@@ -109,3 +119,4 @@ object GmeiConfig extends Serializable {
  }
}
...@@ -68,9 +68,9 @@ object Search_keywords_count {
    // extract search keywords
    // (previously this selected count(test_udf(params)) and only covered do_search / search_result_click_search)
    val search_keywords = sc.sql(
      s"""
         |select params['query'] as search_keywords
         |from online.tl_hdfs_maidian_view
         |where (action = 'do_search' or action = 'search_result_click_search' or action ='on_click_jumping_hot_word')
         |and partition_date ='20190108'
      """.stripMargin
    ).show(20)/*.rdd.map(x=>{
......
...@@ -56,6 +56,11 @@ object data_feed_exposure_precise {
    //println(param.date)
    val partition_date = stat_date.replace("-","")
    // sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
    // sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
    // sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
    // sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
    val result01=sc.sql(
      s"""
         |select
......
package com.gmei
import java.io.Serializable
import java.time.LocalDate
import com.gmei.lib.AbstractParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
import scopt.OptionParser
import scala.util.parsing.json.JSON
object esmm_feature {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_test",tableName = "device_app_list")
ti.tidbMapTable(dbName = "jerry_test",tableName = "user_feature")
user_feature(sc)
get_applist(sc)
sc.stop()
}}
def get_applist(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val df = spark.sql(
s"""
|select device["device_id"] as device_id,cl_type,params["installed_app_info"]
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'user_installed_all_app_info'
""".stripMargin).dropDuplicates("device_id")
df.persist()
val old = spark.sql("select device_id from device_app_list").collect().map(x => x(0).toString)
import spark.implicits._
val android = df.rdd.map(x => (x(0).toString,x(1).toString,x(2).toString))
.filter(x => x._2 == "android").map(x => (x._1,x._2,parse_json(x._3),yesterday))
val ios = df.rdd.map(x => (x(0).toString,x(1).toString,x(2).toString))
.filter(x => x._2 == "ios").map(x => (x._1,x._2,x._3,yesterday))
val rdd = android.union(ios)
val new_user = rdd.filter(x => old.indexOf(x._1)== -1)
.toDF("device_id","os","app_list","update_date")
if (new_user.take(1).nonEmpty){
val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbc, new_user,"device_app_list", SaveMode.Append)
val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(tecent_jdbc, new_user,"device_app_list", SaveMode.Append)
}else{
println("没有新用户需要写入")
}
df.unpersist()
}
def parse_json(str:String): String ={
var t = List[Map[String, Any]]()
val result = JSON.parseFull(str)
result match {
case Some(b: List[Map[String, Any]]) => t = t ++ b
case None => println("Parsing failed")
case other => println("Unknown data structure: " + other)
}
var x = List[String]()
if (t.nonEmpty){
for (i <- t){
x = x:+i("appName").toString
}
}
x.mkString(",")
}
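  // Illustrative example (made-up input) of what parse_json returns:
  //   parse_json("""[{"appName":"WeChat"},{"appName":"Taobao"}]""")  ==> "WeChat,Taobao"
  // and any string that JSON.parseFull cannot parse yields "".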
def user_feature(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val sql_exist = "select device_id from user_feature"
val old = spark.sql(sql_exist)
.collect().map(x => x(0).toString)
val sql_yesterday =
s"""
|select device["device_id"] as id,device["device_type"],device["manufacturer"],city_id,channel,
|partition_date from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
""".stripMargin
val rdd = spark.sql(sql_yesterday).repartition(200).na.drop().dropDuplicates("id").rdd
.map(x =>(x(0).toString,x(1).toString,x(2).toString,x(3).toString,
x(4).toString,x(5).toString))
import spark.implicits._
val df_new = rdd.filter(x => old.indexOf(x._1)== -1)
.toDF("device_id","device_type","manufacturer","city_id","channel","date")
if (df_new.take(1).nonEmpty){
df_new.persist()
val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbcuri, df_new, "user_feature", SaveMode.Append)
val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(tecent_jdbc, df_new, "user_feature", SaveMode.Append)
df_new.unpersist()
}else {
println("no need to insert into user feature")
}
}
}
...@@ -499,4 +499,5 @@ object CTR_precise {
  }
}
\ No newline at end of file
...@@ -8,6 +8,8 @@ import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
import java.io._
import scala.util.parsing.json._
object temp_analysis {
...@@ -160,22 +162,6 @@ object temp_analysis {
  }
    //5.登录人数 (count of logged-in devices; this block is removed by this commit)
    val log_device_temp = sc.sql(
      s"""
         |select oe.stat_date,count(distinct(oe.device_id)) as log_num
         |from data_feed_exposure oe left join final_id
         |on oe.device_id = final_id.device_id
         |and oe.stat_date >='2018-11-01'
         |and final_id.device_id is null
         |group by oe.stat_date
         |order by oe.stat_date
      """.stripMargin
    )
    println("登录人数统计:")
    log_device_temp.show(80)
  }
...@@ -427,7 +413,8 @@ object meigou_xiaofei_renshu {
    import sc.implicits._
    // val stat_date = GmeiConfig.getMinusNDate(1)
    val stat_date=param.date
    //println(param.date)
    val partition_date = stat_date.replace("-","")
...@@ -521,7 +508,7 @@ object meigou_xiaofei_renshu {
object alpha_ctr {   // renamed from smart_rank_count
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
...@@ -567,158 +554,108 @@ object smart_rank_count {
    import sc.implicits._
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = param.date
    //println(param.date)
    val partition_date = stat_date.replace("-","")
val agency_id = sc.sql( val click_count_recommend = sc.sql(
s""" s"""
|SELECT DISTINCT(cl_id) as device_id |select '${stat_date}' as stat_date,count(*) as click_count_recommend
|FROM online.ml_hospital_spam_pv_day |from bl.bl_alpha_et_mg_maidianlog_inc_d
|WHERE partition_date >= '20180402' |where params['tab_name']='recommend'
|AND partition_date <= '${partition_date}' |and params['page_name']='home'
|AND pv_ratio >= 0.95 |and type='on_click_feed_topic_card'
|UNION ALL |and partition_day='${partition_date}'
|SELECT DISTINCT(cl_id) as device_id """.stripMargin
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
) )
agency_id.createOrReplaceTempView("agency_id") click_count_recommend.show()
val blacklist_id = sc.sql( val click_count_focus = sc.sql(
s""" s"""
|SELECT device_id |select '${stat_date}' as stat_date,count(*) as click_count_focus
|from blacklist |from bl.bl_alpha_et_mg_maidianlog_inc_d
""".stripMargin |where params['tab_name']='focus'
|and params['page_name']='home'
|and type='on_click_feed_topic_card'
|and partition_day='${partition_date}'
""".stripMargin
) )
blacklist_id.createOrReplaceTempView("blacklist_id") click_count_focus.show()
val final_id = sc.sql(
s"""
|select device_id
|from agency_id
|UNION ALL
|select device_id
|from blacklist_id
""".stripMargin
)
final_id.createOrReplaceTempView("final_id")
def parse_json(str:String): Int ={
var t = List[Map[String, Any]]()
val result = JSON.parseFull(str)
result match {
case Some(b: List[Map[String, Any]]) => t = t ++ b
case None => println("Parsing failed")
case other => println("Unknown data structure: " + other)
}
t.size
val user_city_meigou_view = sc.sql( }
s"""
|select ov.cl_id as device_id,ov.city_id as device_city,ov.params['business_id'] as meigou_id
|from online.tl_hdfs_maidian_view ov left join final_id
|on ov.cl_id = final_id.device_id
|where ov.action = "page_view"
|and ov.params['page_name']="welfare_detail"
|and ov.partition_date >='20181101'
|and ov.partition_date <'20181201'
|and ov.city_id is not null
|and final_id.device_id is null
""".stripMargin
)
user_city_meigou_view.createOrReplaceTempView("user_city_meigou_view")
val meigou_city = sc.sql( val expoure_cards=sc.sql(
s""" s"""
|select b.id as meigou_id,d.city_id as meigou_city |select params['exposure_cards'] as exposure_cards
|from online.tl_meigou_service_view b |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id |where params['tab_name'] = 'recommend'
|left join online.tl_hdfs_hospital_view d on c.hospital_id=d.id |and params['page_name'] = 'home'
|where b.partition_date='20181228' |and type = 'page_precise_exposure'
|and c.partition_date='20181228' |and partition_day='${partition_date}'
|and d.partition_date='20181228'
""".stripMargin """.stripMargin
) )
meigou_city.createOrReplaceTempView("meigou_city") val a =expoure_cards.rdd.map(row => row(0).toString).map(row=>parse_json(row)).collect().sum
val result1=List((stat_date,a))
val df1 = sc.createDataFrame(result1).toDF("stat_date","expoure_count_recommend")
val meigou_pv_tongcheng = sc.sql( val expoure_cards2=sc.sql(
s""" s"""
|select a.device_id,a.device_city,a.meigou_id,b.meigou_city |select params['exposure_cards'] as exposure_cards
|from user_city_meigou_view a |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join meigou_city b |where params['tab_name'] = 'focus'
|on a.meigou_id = b.meigou_id |and params['page_name'] = 'home'
|and type = 'page_precise_exposure'
|and partition_day='${partition_date}'
""".stripMargin """.stripMargin
) )
meigou_pv_tongcheng.createOrReplaceTempView("meigou_pv_tongcheng") val b =expoure_cards2.rdd.map(row => row(0).toString).map(row=>parse_json(row)).collect().sum
val result2=List((stat_date,b))
val df2 = sc.createDataFrame(result2).toDF("stat_date","expoure_count_focus")
val meigou_pv_count = sc.sql(
s"""
|select '2018-11' as stat_date,meigou_city,count(device_id) as meigou_pv,count(distinct(device_id)) as meigou_device_num
|from meigou_pv_tongcheng
|where device_city = meigou_city
|group by meigou_city
""".stripMargin
)
meigou_pv_count.createOrReplaceTempView("meigou_pv_count")
//开始计算咨询 val result=click_count_recommend.join(click_count_focus,"stat_date")
val zixun_meigou_view = sc.sql( .join(df1,"stat_date")
s""" .join(df2,"stat_date")
|select ov.cl_id as device_id,ov.city_id as device_city,ov.params['service_id'] as meigou_id
|from online.tl_hdfs_maidian_view ov left join final_id
|on ov.cl_id = final_id.device_id
|where ov.partition_date >= '20181101'
|and ov.partition_date < '20181201'
|and ov.action = 'welfare_detail_click_message'
|and final_id.device_id is null
""".stripMargin
)
zixun_meigou_view.createOrReplaceTempView("zixun_meigou_view")
val zixun_meigou_tongcheng = sc.sql(
s"""
|select a.device_id,a.device_city,a.meigou_id,b.meigou_city
|from zixun_meigou_view a
|left join meigou_city b
|on a.meigou_id=b.meigou_id
""".stripMargin
)
zixun_meigou_tongcheng.createOrReplaceTempView("zixun_meigou_tongcheng")
val zixun_pv_count = sc.sql(
s"""
|select '2018-11' as stat_date,meigou_city,count(device_id) as meigou_zixun,count(distinct(device_id)) as meigou_zixun_device_num
|from zixun_meigou_tongcheng
|where device_city=meigou_city
|group by meigou_city
""".stripMargin
)
zixun_pv_count.createOrReplaceTempView("zixun_pv_count")
GmeiConfig.writeToJDBCTable(result, "alpha_ctr", SaveMode.Append)
//开始计算每个地区每月新增设备
val device_new_count = sc.sql( val device_duration_avge = sc.sql(
s""" s"""
|select first_city,count(distinct(device_id)) as new_device_month |SELECT '${stat_date}' as stat_date,sum(a.time_all)/count(a.device_id) as device_duration_avge,count(distinct(a.device_id)) as device_num from (select device_id,sum(params['duration']) as time_all
|from online.ml_device_day_active_status |from bl.bl_alpha_et_mg_maidianlog_inc_d
|where active_type != '4' |where type='on_app_session_over'
|and partition_date >='20181101' |and partition_day='${partition_date}'
|and partition_date <'20181201' |GROUP BY device_id) a
|group by first_city
""".stripMargin """.stripMargin
) )
device_new_count.createOrReplaceTempView("device_new_count") device_duration_avge.show()
val duration_everytime_avge=sc.sql(
//将所有的数据综合一起
val all_count = sc.sql(
s""" s"""
|select mc.stat_date,mc.meigou_city,mc.meigou_pv,mc.meigou_device_num,zc.meigou_zixun,zc.meigou_zixun_device_num,dc.new_device_month |SELECT '${stat_date}' as stat_date,sum(a.time_duration)/count(a.device_id) as duration_everytime_avge from (select device_id,params['duration'] as time_duration
|from meigou_pv_count mc |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join zixun_pv_count zc on mc.meigou_city = zc.meigou_city |where type='on_app_session_over'
|left join device_new_count dc on dc.first_city=mc.meigou_city |and partition_day='${partition_date}') a
""".stripMargin """.stripMargin
) )
all_count.show()
GmeiConfig.writeToJDBCTable(all_count, "smart_rank_count", SaveMode.Append) val result3=device_duration_avge.join(duration_everytime_avge,"stat_date")
GmeiConfig.writeToJDBCTable(result3, "alpha_duration", SaveMode.Append)
} }
...@@ -733,7 +670,7 @@ object smart_rank_count { ...@@ -733,7 +670,7 @@ object smart_rank_count {
//话题相关问题统计 //话题相关问题统计
object question_count { object copy_database {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN) Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF) Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
...@@ -772,50 +709,32 @@ object question_count { ...@@ -772,50 +709,32 @@ object question_count {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video") ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click") ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist") ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list") ti.tidbMapTable(dbName = "jerry_test", tableName = "tl_hdfs_wiki_item_tag_view")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure") ti.tidbMapTable(dbName = "jerry_test", tableName = "Knowledge_network")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table") ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary")
import sc.implicits._ import sc.implicits._
val stat_date = GmeiConfig.getMinusNDate(1) val stat_date = GmeiConfig.getMinusNDate(1)
//println(param.date) // val stat_date=param.date
val partition_date = stat_date.replace("-","") val partition_date = stat_date.replace("-","")
val agency_id = sc.sql( val new_data = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '20190117'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '20190117'
|AND pv_ratio >= 0.95
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val question_count = sc.sql(
s""" s"""
|SELECT partition_date,count(cl_id) |select d.level2_id,d.level2_name,c.item_id,c.tag_id,c.id,c.name,c.treatment_method,c.price_min,c.price_max,c.treatment_time,c.maintain_time,c.recover_time
|FROM online.tl_hdfs_maidian_view ov left join agency_id |from online.bl_tag_hierarchy_detail d
|on ov.cl_id = agency_id.device_id |inner join
|WHERE ov.partition_date >= '20190101' |(select a.item_id,a.tag_id,b.id,b.name,b.treatment_method,b.price_min,b.price_max,b.treatment_time,b.maintain_time,b.recover_time
|and ov.action='community_home_click_feed_card' |from online.tl_hdfs_wiki_item_tag_view a
|and ov.params["card_type"]="问题" |inner join Knowledge_network b
|and ov.cl_id not in (select device_id from blacklist) |on a.item_id=b.id
|and agency_id.device_id is null |where a.partition_date='${partition_date}') c
|GROUP BY ov.partition_date |on d.id=c.tag_id
|order by ov.partition_date |where d.partition_date='${partition_date}'
""".stripMargin """.stripMargin
) )
question_count.show(30) GmeiConfig.writeToJDBCTable(new_data, "train_Knowledge_network_data", SaveMode.Overwrite)
} }
......
...@@ -218,34 +218,145 @@ object Repeated_content_recommendation { ...@@ -218,34 +218,145 @@ object Repeated_content_recommendation {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table") ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
val stat_date = GmeiConfig.getMinusNDate(1) // val stat_date = GmeiConfig.getMinusNDate(1)
// val stat_date = param.date val stat_date = param.date
val partition_date = stat_date.replace("-","") val partition_date = stat_date.replace("-","")
val exp_diary = sc.sql(
val agency_id = sc.sql(
s""" s"""
|select concat_ws('|',device_id,cid_id) |SELECT DISTINCT(cl_id) as device_id
|from data_feed_exposure |FROM online.ml_hospital_spam_pv_day
|where cid_type = 'diary' |WHERE partition_date >= '20180402'
|and device_id not in (select device_id from blacklist) |AND partition_date <= '${partition_date}'
|and stat_date ='${stat_date}' |AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|select distinct(device_id)
|from blacklist
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val device_id_oldUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join agency_id
|on om.device_id = agency_id.device_id
|where om.active_type = '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and agency_id.device_id is null
""".stripMargin """.stripMargin
) )
exp_diary.show() device_id_oldUser.createOrReplaceTempView("device_id_old")
val get_result =exp_diary.rdd.map((_, 1)).reduceByKey(_ + _) device_id_oldUser.show()
val device_id_newUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join agency_id
|on om.device_id = agency_id.device_id
|where om.active_type != '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and agency_id.device_id is null
""".stripMargin
)
device_id_newUser.createOrReplaceTempView("device_id_new")
device_id_newUser.show()
val exp_diary_new = sc.sql(
s"""
|select concat_ws('|',de.device_id,de.cid_id)
|from data_feed_exposure de inner join device_id_new
|on de.device_id=device_id_new.device_id
|where de.cid_type = 'diary'
|and de.stat_date ='${stat_date}'
""".stripMargin
)
val get_result_new =exp_diary_new.rdd.map((_, 1)).reduceByKey(_ + _)
.sortBy(_._2,false) .sortBy(_._2,false)
val more_than2=get_result.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y) val more_than2_new=get_result_new.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
println(more_than2) println(more_than2_new)
val all =get_result.map(_._2).reduce((x,y)=>x+y) val all_new =get_result_new.map(_._2).reduce((x,y)=>x+y)
println(all) println(all_new)
val repeated_rate= more_than2 / all.toDouble val repeated_rate_new= more_than2_new / all_new.toDouble
println(repeated_rate) println(repeated_rate_new)
val test=List((stat_date,repeated_rate))
val df = sc.createDataFrame(test)
val exp_diary_old = sc.sql(
s"""
|select concat_ws('|',de.device_id,de.cid_id)
|from data_feed_exposure de inner join device_id_old
|on de.device_id=device_id_old.device_id
|where de.cid_type = 'diary'
|and de.stat_date ='${stat_date}'
""".stripMargin
)
val get_result_old =exp_diary_old.rdd.map((_, 1)).reduceByKey(_ + _)
.sortBy(_._2,false)
val more_than2_old=get_result_old.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
println(more_than2_old)
val all_old =get_result_old.map(_._2).reduce((x,y)=>x+y)
println(all_old)
val repeated_rate_old= more_than2_old / all_old.toDouble
println(repeated_rate_old)
val result2=List((stat_date,more_than2_old,all_old,more_than2_new,all_new))
val df2 = sc.createDataFrame(result2).toDF("stat_date","old_rep_count","old_imp_all","new_rep_count","new_imp_all")
GmeiConfig.writeToJDBCTable(df, table = "Repeated_evaluation_indicator", SaveMode.Append) GmeiConfig.writeToJDBCTable(df2, table = "Repeated_evaluation_indicator", SaveMode.Append)
// val exp_diary_old = sc.sql(
// s"""
// |select concat_ws('|',de.device_id,de.cid_id)
// |from data_feed_exposure de inner join device_id_old
// |where de.cid_type = 'diary'
// |and de.stat_date ='${stat_date}'
// """.stripMargin
// )
// val get_result_old =exp_diary_old.rdd.map((_, 1)).reduceByKey(_ + _)
// .sortBy(_._2,false)
//
// val more_than2_old=get_result_old.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
// println(more_than2_old)
// val all_old =get_result_old.map(_._2).reduce((x,y)=>x+y)
// println(all_old)
// val repeated_rate_old= more_than2_old / all_old.toDouble
// println(repeated_rate_old)
//
//
// val result2=List((stat_date,more_than2_old,all_old))
// val df2 = sc.createDataFrame(result2).toDF("stat_date","old_rep_count","old_imp_all")
//
// GmeiConfig.writeToJDBCTable(df2, table = "Repeated_evaluation_indicator_old", SaveMode.Append)
// val temp=get_result.collect() // val temp=get_result.collect()
...@@ -260,8 +371,6 @@ object Repeated_content_recommendation {
  }
object Repeated_content_recommendation_moreday {
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
...@@ -305,14 +414,16 @@ object Repeated_content_recommendation_moreday {
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = "2019-01-16"
    // val partition_date = stat_date.replace("-","")
    val now= new Date()
    // val stat_date=param.date
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
    val date = dateFormat.format(now.getTime - 86400000L * 8)   // look-back window shortened from 15 days to 8
    val yesterday=dateFormat.format(now.getTime- 86400000L)
...@@ -342,6 +453,7 @@ object Repeated_content_recommendation_moreday {
    val repeated_rate= fenmu / fenzi.toDouble
    val result=List((yesterday,repeated_rate))
    println(result)
    val df_result = sc.createDataFrame(result)
    GmeiConfig.writeToJDBCTable(df_result, table = "Repeated_content_recommendation_moreday", SaveMode.Append)
...@@ -351,10 +463,7 @@ object Repeated_content_recommendation_moreday {
    // GmeiConfig.writeToJDBCTable(df, table = "Repeated_evaluation_indicator_moreday", SaveMode.Append)
  }
}
}
...@@ -407,6 +516,7 @@ object GetHiveSearchData {
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = param.date
    val partition_date = stat_date.replace("-","")
...@@ -640,3 +750,246 @@ object GetHiveSearchData {
  }
object find_reason {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
// val stat_date = GmeiConfig.getMinusNDate(1)
val stat_date=param.date
val partition_date = stat_date.replace("-","")
// agency (spam) device ids to exclude
val blacklist = sc.sql(
s"""
|select device_id from blacklist
""".stripMargin
)
blacklist.createOrReplaceTempView("blacklist")
val agency_id = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
)
// agency_id.show()
agency_id.createOrReplaceTempView("agency_id")
// new users for the day
val device_id_newUser = sc.sql(
s"""
|select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist
|on os.device_id = blacklist.device_id
|where os.active_type != '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin
)
// device_id_newUser.show()
device_id_newUser.createOrReplaceTempView("device_id_new")
// returning (old) users for the day
val device_id_oldUser = sc.sql(
s"""
|select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist
|on os.device_id=blacklist.device_id
|where os.active_type = '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin
)
// device_id_oldUser.show()
device_id_oldUser.createOrReplaceTempView("device_id_old")
val all_clk = sc.sql(
s"""
|select ov.cl_id as device_id
|from online.tl_hdfs_maidian_view ov left join agency_id
|on ov.cl_id = agency_id.device_id
|where ov.action = 'on_click_diary_card'
|and ov.cl_id != "NULL"
|and ov.params['tab_name'] = '精选'
|and ov.params['page_name'] = 'home'
|and ov.partition_date='${partition_date}'
|and agency_id.device_id is null
""".stripMargin
)
// all_clk.show()
all_clk.createOrReplaceTempView("all_clk_diary_card")
// 1. number of old users who clicked that day
val old_clk_count = sc.sql(
s"""
|select '${stat_date}' as stat_date,count(distinct(oc.device_id)) as old_clk_count
|from all_clk_diary_card oc inner join device_id_old
|on oc.device_id = device_id_old.device_id
""".stripMargin
)
// old_clk_count.show()
// 1.1 old users with clicks
val old_clk_device = sc.sql(
s"""
|select distinct(oc.device_id) as device_id
|from all_clk_diary_card oc inner join device_id_old
|on oc.device_id = device_id_old.device_id
""".stripMargin
)
old_clk_device.createOrReplaceTempView("old_clk_device")
// 1.2 old users without clicks
val old_noclk_device = sc.sql(
s"""
|select device_id
|from device_id_old
|except
|select device_id
|from old_clk_device
""".stripMargin
)
old_noclk_device.show()
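      // EXCEPT leaves the returning (old) users who were active that day but never clicked a diary card on the
      // home 精选 feed.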
// 2. number of new users who clicked that day
// val new_clk_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(oc.device_id)) as new_clk_count
// |from all_clk_diary_card oc inner join device_id_new
// |on oc.device_id = device_id_new.device_id
// """.stripMargin
// )
////2.1 有点击的新用户
// val new_clk_device = sc.sql(
// s"""
// |select distinct(oc.device_id) as device_id
// |from all_clk_diary_card oc inner join device_id_new
// |on oc.device_id = device_id_new.device_id
// """.stripMargin
// )
// new_clk_device.createOrReplaceTempView("new_clk_device")
//
//
// //3.当天老用户数
//
// val old_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(dio.device_id)) as old_count
// |from device_id_old dio left join agency_id
// |on dio.device_id = agency_id.device_id
// |where agency_id.device_id is null
// """.stripMargin
// )
//
// //4.当天新用户数
// val new_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(din.device_id)) as new_count
// |from device_id_new din left join agency_id
// |on din.device_id = agency_id.device_id
// |where agency_id.device_id is null
// """.stripMargin
// )
//
// //5.有点击老用户的曝光数
// val exp_clkold_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(dp.device_id) as imp_clkold_count
// |from data_feed_exposure_precise dp inner join old_clk_device
// |on dp.device_id = old_clk_device.device_id
// |where stat_date='${stat_date}'
// |group by stat_date
// """.stripMargin
// )
//
// //6.有点击新用户的曝光数
// val exp_clknew_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(dp.device_id) as imp_clknew_count
// |from data_feed_exposure_precise dp inner join new_clk_device
// |on dp.device_id = new_clk_device.device_id
// |where stat_date='${stat_date}'
// |group by stat_date
// """.stripMargin
// )
//
// val result = old_clk_count.join(new_clk_count,"stat_date")
// .join(old_count,"stat_date")
// .join(new_count,"stat_date")
// .join(exp_clkold_count,"stat_date")
// .join(exp_clknew_count,"stat_date")
//
// GmeiConfig.writeToJDBCTable(result, "device_clk_imp_reason", SaveMode.Append)
}
}
}
...@@ -61,6 +61,12 @@ object testt { ...@@ -61,6 +61,12 @@ object testt {
) )
blacklist.createOrReplaceTempView("blacklist") blacklist.createOrReplaceTempView("blacklist")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
// sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
// sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
val agency_id = sc.sql( val agency_id = sc.sql(
s""" s"""
|SELECT DISTINCT(cl_id) as device_id |SELECT DISTINCT(cl_id) as device_id
...@@ -76,22 +82,24 @@ object testt { ...@@ -76,22 +82,24 @@ object testt {
|AND pv_ratio >= 0.95 |AND pv_ratio >= 0.95
""".stripMargin """.stripMargin
) )
agency_id.show() // agency_id.show()
agency_id.createOrReplaceTempView("agency_id") agency_id.createOrReplaceTempView("agency_id")
//每日新用户 //每日新用户
val device_id_newUser = sc.sql( val device_id_newUser = sc.sql(
s""" s"""
|select distinct(device_id) as device_id |select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status |from online.ml_device_day_active_status os left join blacklist
|where active_type != '4' |on os.device_id=blacklist.device_id
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' |where os.active_type != '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' | ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' | ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' | ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' | ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' | ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown') | ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}' |and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin """.stripMargin
) )
device_id_newUser.show() device_id_newUser.show()
...@@ -103,19 +111,19 @@ object testt { ...@@ -103,19 +111,19 @@ object testt {
|select distinct(os.device_id) as device_id |select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist |from online.ml_device_day_active_status os left join blacklist
|on os.device_id=blacklist.device_id |on os.device_id=blacklist.device_id
|where active_type = '4' |where os.active_type = '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' |and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' | ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' | ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' | ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' | ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' | ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown') | ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}' |and os.partition_date ='${partition_date}'
|and blacklist.device_id is null |and blacklist.device_id is null
""".stripMargin """.stripMargin
) )
device_id_oldUser.show() // device_id_oldUser.show()
device_id_oldUser.createOrReplaceTempView("device_id_old") device_id_oldUser.createOrReplaceTempView("device_id_old")
...@@ -402,3 +410,366 @@ object testt { ...@@ -402,3 +410,366 @@ object testt {
} }
object diary_clk_card {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
val stat_date = GmeiConfig.getMinusNDate(1)
// val stat_date=param.date
val partition_date = stat_date.replace("-","")
// agency (spam) device ids
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
// sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
// sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
val blacklist = sc.sql(
s"""
|select device_id from blacklist
""".stripMargin
)
blacklist.createOrReplaceTempView("blacklist")
val agency_id = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val blacklist_all=sc.sql(
s"""
|SELECT device_id
|FROM blacklist
|UNION ALL
|SELECT device_id
|FROM agency_id
""".stripMargin
)
blacklist_all.createOrReplaceTempView("blacklist_all")
val device_id_oldUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join blacklist_all
|on om.device_id = blacklist_all.device_id
|where om.active_type = '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and blacklist_all.device_id is null
""".stripMargin
)
device_id_oldUser.createOrReplaceTempView("device_id_old")
device_id_oldUser.show()
val clk_count_oldUser_Contrast_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_Contrast_a
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
clk_count_oldUser_Contrast_a.show()
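      // Note: '$$' in the s-interpolated SQL escapes to a literal '$', so the filter is regexp '1$', i.e. device
      // ids ending in "1"; presumably the contrast (A/B) bucket, though the bucketing rule is not defined here.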
val clk_count_oldUser_Contrast_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_Contrast_b
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_oldUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_Contrast
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_Contrast_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_Contrast_precise
|from data_feed_exposure_precise je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val clk_count_oldUser_all_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_all_a
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_oldUser_all_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_all_b
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_oldUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_all
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_all_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_all_precise
|from data_feed_exposure_precise je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
// CTR statistics for new users
val device_id_newUser = sc.sql(
s"""
|select distinct(device_id) as device_id
|from online.ml_device_day_active_status
|where active_type != '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
device_id_newUser.createOrReplaceTempView("device_id_new")
val clk_count_newUser_Contrast_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_Contrast_a
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_newUser_Contrast_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_Contrast_b
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_newUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_Contrast
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_newUser_Contrast_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_Contrast_precise
|from data_feed_exposure_precise je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val clk_count_newUser_all_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_all_a
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_newUser_all_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_all_b
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_newUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_all
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_newUser_all_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_all_precise
|from data_feed_exposure_precise je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val result1 = clk_count_oldUser_Contrast_a.join(clk_count_oldUser_Contrast_b,"stat_date")
.join(imp_count_oldUser_Contrast,"stat_date")
.join(clk_count_oldUser_all_a,"stat_date")
.join(clk_count_oldUser_all_b,"stat_date")
.join(imp_count_oldUser_all,"stat_date")
.join(clk_count_newUser_Contrast_a,"stat_date")
.join(clk_count_newUser_Contrast_b,"stat_date")
.join(imp_count_newUser_Contrast,"stat_date")
.join(clk_count_newUser_all_a,"stat_date")
.join(clk_count_newUser_all_b,"stat_date")
.join(imp_count_newUser_all,"stat_date")
.join(imp_count_oldUser_Contrast_precise,"stat_date")
.join(imp_count_oldUser_all_precise,"stat_date")
.join(imp_count_newUser_Contrast_precise,"stat_date")
.join(imp_count_newUser_all_precise,"stat_date")
result1.show()
GmeiConfig.writeToJDBCTable(result1, "on_click_diary_card", SaveMode.Append)
}
}
}
# -*- coding: utf-8 -*-
# import required modules
import pandas as pd
from sqlalchemy import create_engine
data=pd.read_excel('wiki_item.xls')
print(data.head())
# # initialize the database connection (SQLAlchemy create_engine over the MySQLdb driver)
engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s%s") % ("root","3SYz54LS9#^9sBvC",'10.66.157.22', "4000", "jerry_test","?charset=utf8"))
# engine = create_engine('mysql+pymysql://root:147369@localhost:3306/mydb')
data.to_sql('Knowledge_network',con=engine,if_exists='append',index=False)
print("Write to MySQL successfully!")
\ No newline at end of file
# coding: utf-8
import json
import requests
def dingding_robot(data):
    # DingTalk robot webhook; see https://open-doc.dingtalk.com/microapp/serverapi2/qf2nxq for how to obtain the address
webhook = "https://oapi.dingtalk.com/robot/send?access_token=5131b887f6b022150f903e9d690e08c0d481fba844545034aaf48906ee026fa0"
    headers = {'content-type': 'application/json'}  # request headers
r = requests.post(webhook, headers=headers, data=json.dumps(data))
r.encoding = 'utf-8'
return (r.text)
if __name__ == "__main__":
import linecache
str = ""
for i in range(35,64):
s=linecache.getline('/srv/apps/ffm-baseline/eda/recommended_indexs/hypothesis_test.txt', i).strip("\n").split(",")
if s[0] != "":
str += s[0]+"\n"
str +="【同样重要】如下有变更,请提醒相关的人:\n" \
"1.推荐模型变更优化,影响CTR或者CVR(王志伟);\n" \
"2.任何涉及到数据库schame变更(王志伟)"
print(str)
    # request payload; could be moved into a config file
data = {
"msgtype": "text",
"text": {
"content": str,
"title": "自定义机器人"
# "picUrl": "",
# "messageUrl": "https://www.baidu.com/"
},
"at": {
"atMobiles":["17310453926"]
}
}
res = dingding_robot(data)
    print(res)  # print the response
\ No newline at end of file
#! -*- coding: utf8 -*-
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import levene
import datetime
from utils import con_sql
from decimal import *
import numpy as np
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
######### hypothesis tests (t-test) on metrics before vs. after the recommendation-strategy change ###############
# get yesterday's date automatically
def get_yesterday_date():
    # yesterday's date as a string, e.g. "2018-08-08"
"""
:rtype : str
"""
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
yesterday = yesterday.strftime("%Y-%m-%d")
return yesterday
yesterday=get_yesterday_date()
print("监测数据日期:{}".format(yesterday))
# get the date 10 days ago automatically
def get_somedate():
    # the date 10 days ago as a string, e.g. "2018-07-28"
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=10)
someday = someday.strftime("%Y-%m-%d")
return someday
ten_days=get_somedate()
print("===========分割线,T检验最近10日指标与策略前10日指标是否获得显著提升============")
#获取最近10天的数据
def DATA_recently(x,y,z,q,t):
ten_days = get_somedate()
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='{4}' ".format(x,y,z,q,t)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
#
# # ratios for a fixed 10-day window (pre-strategy baseline)
def DATA_fixed(x,y,z,q):
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='2018-11-17' and stat_date<='2018-11-26' group by stat_date".format(x,y,z,q)
CVR_DATA_fixed = con_sql(sql_cid)
return CVR_DATA_fixed
def DATA_recently_all(x,y,z,q,m,t):
ten_days = get_somedate()
sql_cid = "select ({0}+{1})/{2} as {3} from {4} \
where stat_date >='{5}' ".format(x,y,z,q,m,t)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
#
# # ratios for a fixed 10-day window (pre-strategy baseline)
def DATA_fixed_all(x,y,z,q,m):
sql_cid = "select ({0}+{1})/{2} as {3} from {4} \
where stat_date >='2018-11-17' and stat_date<='2018-11-26' group by stat_date".format(x,y,z,q,m)
CVR_DATA_fixed = con_sql(sql_cid)
return CVR_DATA_fixed
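# note: every block below repeats the same rounding idiom on the Decimal ratios coming back
# from MySQL; it is equivalent to this small helper (a hypothetical name, shown only for
# illustration and not called anywhere below):
def _round4(value):
    # quantize the Decimal to 4 decimal places, then convert to a plain float
    return float(str(Decimal(value).quantize(Decimal('0.0000'))))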
#
# # new-user CVR
x_crv_new_temp=DATA_recently("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv",ten_days)
x_crv_new=[float(str(Decimal(x_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_new_temp))]
y_crv_new_temp=DATA_fixed("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv")
y_crv_new=[float(str(Decimal(y_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_new_temp))]
# # old-user CVR
x_crv_old_temp=DATA_recently("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv",ten_days)
x_crv_old=[float(str(Decimal(x_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_old_temp))]
y_crv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv")
y_crv_old=[float(str(Decimal(y_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_old_temp))]
#
# # new-user CT-CVR
x_ctcrv_new_temp=DATA_recently("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv",ten_days)
x_ctcrv_new=[float(str(Decimal(x_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_new_temp))]
y_ctcrv_new_temp=DATA_fixed("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv")
y_ctcrv_new=[float(str(Decimal(y_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_new_temp))]
#
# # old-user CT-CVR
x_ctcrv_old_temp=DATA_recently("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv",ten_days)
x_ctcrv_old =[float(str(Decimal(x_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_old_temp))]
y_ctcrv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv")
y_ctcrv_old=[float(str(Decimal(y_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_old_temp))]
#
# # new-user CTR (page_view)
x_ctr_new_temp=DATA_recently("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser",ten_days)
x_ctr_new=[float(str(Decimal(x_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_temp))]
y_ctr_new_temp=DATA_fixed("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser")
y_ctr_new=[float(str(Decimal(y_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_temp))]
# #
# # old-user CTR (page_view)
x_ctr_old_temp=DATA_recently("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp",ten_days)
x_ctr_old=[float(str(Decimal(x_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_temp))]
y_ctr_old_temp=DATA_fixed("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp")
y_ctr_old=[float(str(Decimal(y_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_temp))]
#
# # new-user CTR (on_click_diary_card)
x_ctr_new_o_temp=DATA_recently_all("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","ctr_new","on_click_diary_card",ten_days)
x_ctr_new_o=[float(str(Decimal(x_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_o_temp))]
y_ctr_new_o_temp=DATA_fixed_all("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","ctr_new","on_click_diary_card")
y_ctr_new_o=[float(str(Decimal(y_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_o_temp))]
#
# # old-user CTR (on_click_diary_card)
x_ctr_old_o_temp=DATA_recently_all("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","ctr_old","on_click_diary_card",ten_days)
x_ctr_old_o=[float(str(Decimal(x_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_o_temp))]
y_ctr_old_o_temp=DATA_fixed_all("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","ctr_old","on_click_diary_card")
y_ctr_old_o=[float(str(Decimal(y_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_o_temp))]
# #
#
#
def t_test(x,y): # two-sample t-test
    # compares two 10-day samples of a metric, one from before and one from after the strategy change
    # first check homogeneity of variance with Levene's test
    a=levene(x,y)
    p_value=a[1] # if p_value>0.05 the two samples are treated as having equal variances, otherwise as unequal
    # Student's t-test when variances are equal, Welch's t-test otherwise
    t_result=ttest_ind(x,y,equal_var=p_value>0.05)
    t_p_value=t_result[1]
    # print(t_p_value)
    if t_p_value>0.05:
        print("At 95% confidence the two samples show [no significant difference], i.e. the metric did not change significantly, p_value:{}".format(t_p_value))
        print("\n")
    else:
        print("At 95% confidence the two samples show [a significant difference], i.e. the metric changed significantly, p_value:{}".format(t_p_value))
        print("\n")
#
# ### hypothesis tests: check whether each metric changed significantly
#
# new-user CVR t-test
print("[1] New-user CVR t-test result:")
crv_new_ttest=t_test(x_crv_new,y_crv_new)
# old-user CVR t-test
print("[2] Old-user CVR t-test result:")
crv_old_ttest=t_test(x_crv_old,y_crv_old)
#
# new-user CT-CVR t-test
print("[3] New-user CT-CVR t-test result:")
ctcrv_new_ttest=t_test(x_ctcrv_new,y_ctcrv_new)
# # old-user CT-CVR t-test
print("[4] Old-user CT-CVR t-test result:")
ctcrv_old_ttest=t_test(x_ctcrv_old,y_ctcrv_old)
#
#
# new-user CTR t-test (page_view)
print("[5] New-user CTR t-test result:")
ctr_new_ttest=t_test(x_ctr_new,y_ctr_new)
# old-user CTR t-test (page_view)
print("[6] Old-user CTR t-test result:")
ctr_old_ttest=t_test(x_ctr_old,y_ctr_old)
# new-user CTR (on_click_diary_card) t-test
print("[7] New-user CTR t-test (diary-list CTR, on_click_diary_card) result:")
ctr_new_o_ttest=t_test(x_ctr_new_o,y_ctr_new_o)
# old-user CTR (on_click_diary_card) t-test
print("[8] Old-user CTR t-test (diary-list CTR, on_click_diary_card) result:")
ctr_old_o_ttest=t_test(x_ctr_old_o,y_ctr_old_o)
#
# ############### hypothesis tests on daily fluctuations while the recommendation strategy is unchanged ##############
print("=========== Divider: chi-square test of whether yesterday's metrics changed significantly vs. the previous 5-day averages ============")
# #1 daily chi-square test for each metric
#
# # get the date 5 days ago automatically
def get_fivedate():
    # the date 5 days ago as a string
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=5)
someday = someday.strftime("%Y-%m-%d")
return someday
five_days=get_fivedate()
# averages over the previous 5 days; this helper only fits the on_click_diary_card table (see the data-generation code for the reason)
def chi_DATA_recently(x,y,z,q,t1,t2):
sql_cid = "select AVG({0}+{1}),AVG({2}) from {3} \
where stat_date >= '{4}' and stat_date < '{5}' ".format(x,y,z,q,t1,t2)
CVR_DATA_recently = con_sql(sql_cid)[0]
return CVR_DATA_recently
def chi_DATA_yesterday(x,y,z,q,t1):
sql_cid = "select {0}+{1},{2} from {3} where stat_date='{4}' ".format(x,y,z,q,t1)
CVR_DATA_yesterday = con_sql(sql_cid)[0]
return CVR_DATA_yesterday
# averages over the previous 5 days
def chi_DATA_recently_all(x,y,z,t1,t2):
sql_cid = "select AVG({0}),AVG({1}) from {2} \
where stat_date >= '{3}' and stat_date < '{4}' ".format(x,y,z,t1,t2)
CVR_DATA_recently = con_sql(sql_cid)[0]
return CVR_DATA_recently
def chi_DATA_yesterday_all(x,y,z,t1):
sql_cid = "select {0},{1} from {2} where stat_date='{3}' ".format(x,y,z,t1)
CVR_DATA_yesterday = con_sql(sql_cid)[0]
return CVR_DATA_yesterday
# arrange the [clicks, exposures] pairs into a 2x2 contingency table
def data_cal(x,y):
x_a = [x[0], x[1] - x[0]]
y_a=[y[0], y[1] - y[0]]
a_df=pd.DataFrame({'原':x_a,'测':y_a})
return a_df
def chi_cal(data):
data['共计'] = data.apply(lambda x: x.sum(), axis=1)
data.loc['共计'] = data.apply(lambda x: x.sum())
t1=data.iloc[0]
t2=data.iloc[1]
t11_count=t1[0]
t12_count=t1[1]
t21_count=t2[0]
t22_count=t2[1]
    ### expected (theoretical) counts under independence
temp1=data['共计']
rate1=temp1[0]/temp1[2]
rate2=temp1[1]/temp1[2]
temp2=data.iloc[2]
t11_theory=temp2[0]*rate1
t12_theory=temp2[1]*rate1
t21_theory = temp2[0]*rate2
t22_theory = temp2[1]*rate2
    # chi-square statistic
    X=(((t11_count-t11_theory)**2)/t11_theory)+(((t12_count-t12_theory)**2)/t12_theory)+(((t21_count-t21_theory)**2)/t21_theory)+(((t22_count-t22_theory)**2)/t22_theory)
    print("chi-square value: {}".format(X))
    # degrees of freedom of the original 2x2 table (the totals row and column added above are excluded); equals 1 and is not used below
    v=(len(data)-2)*(data.columns.size-2)
    # the critical value from the chi-square table at df=1, alpha=0.05 is 3.84
    if X>3.84:
        print("The fluctuation is large and outside the normal range; with 95% confidence this is a [significant change], please investigate")
        print("\n")
    else:
        print("The fluctuation is small; with 95% confidence it is within the [normal] range")
        print("\n")
# old-user clicks vs. precise exposures (home featured-tab diary list, on_click_diary_card)
print("[1] (precise exposure) old-user CTR fluctuation test for the home featured diary list:")
chi_ctr_precise_old_recently=chi_DATA_recently("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",five_days,yesterday)
temp1_old=[float(str(Decimal(chi_ctr_precise_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctr_precise_old_recently))]
chi_ctr_precise_old_yesterday=chi_DATA_yesterday("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",yesterday)
temp2_old=[float(chi_ctr_precise_old_yesterday[i]) for i in range(len(chi_ctr_precise_old_yesterday))]
ctr_tst_old=data_cal(temp1_old,temp2_old)
chi_cal(ctr_tst_old)
# new-user clicks vs. precise exposures (home featured-tab diary list, on_click_diary_card)
print("[2] (precise exposure) new-user CTR fluctuation test for the home featured diary list:")
chi_ctr_precise_new_recently=chi_DATA_recently("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",five_days,yesterday)
temp1_new=[float(str(Decimal(chi_ctr_precise_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctr_precise_new_recently))]
chi_ctr_precise_new_yesterday=chi_DATA_yesterday("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",yesterday)
temp2_new=[float(chi_ctr_precise_new_yesterday[i]) for i in range(len(chi_ctr_precise_new_yesterday))]
ctr_tst_new=data_cal(temp1_new,temp2_new)
chi_cal(ctr_tst_new)
# old-user service-purchase (美购) conversion data
print("[3] Old-user CVR fluctuation test result:")
chi_cvr_old_recently=chi_DATA_recently_all("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",five_days,yesterday)
cvr_old=[float(str(Decimal(chi_cvr_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_cvr_old_recently))]
chi_cvr_old_yesterday=chi_DATA_yesterday_all("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",yesterday)
cvr_old2=[float(chi_cvr_old_yesterday[i]) for i in range(len(chi_cvr_old_yesterday))]
cvr_tst_old=data_cal(cvr_old,cvr_old2)
chi_cal(cvr_tst_old)
# new-user service-purchase (美购) conversion data
print("[4] New-user CVR fluctuation test result:")
chi_cvr_new_recently=chi_DATA_recently_all("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",five_days,yesterday)
cvr_new=[float(str(Decimal(chi_cvr_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_cvr_new_recently))]
chi_cvr_new_yesterday=chi_DATA_yesterday_all("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",yesterday)
cvr_new2=[float(chi_cvr_new_yesterday[i]) for i in range(len(chi_cvr_new_yesterday))]
cvr_tst_new=data_cal(cvr_new,cvr_new2)
chi_cal(cvr_tst_new)
# old-user CT-CVR data
print("[5] Old-user CT-CVR fluctuation test result:")
chi_ctcvr_old_recently=chi_DATA_recently_all("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",five_days,yesterday)
ctcvr_old=[float(str(Decimal(chi_ctcvr_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctcvr_old_recently))]
chi_ctcvr_old_yesterday=chi_DATA_yesterday_all("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",yesterday)
ctcvr_old2=[float(chi_ctcvr_old_yesterday[i]) for i in range(len(chi_ctcvr_old_yesterday))]
ctcvr_tst_old=data_cal(ctcvr_old,ctcvr_old2)
chi_cal(ctcvr_tst_old)
# new-user CT-CVR data
print("[6] New-user CT-CVR fluctuation test result:")
chi_ctcvr_new_recently=chi_DATA_recently_all("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",five_days,yesterday)
ctcvr_new=[float(str(Decimal(chi_ctcvr_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctcvr_new_recently))]
chi_ctcvr_new_yesterday=chi_DATA_yesterday_all("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",yesterday)
ctcvr_new2=[float(chi_ctcvr_new_yesterday[i]) for i in range(len(chi_ctcvr_new_yesterday))]
ctcvr_tst_new=data_cal(ctcvr_new,ctcvr_new2)
chi_cal(ctcvr_tst_new)
# ############### variance and mean of recent day-to-day fluctuations ##############
print("=============== Divider: checking the 5-day variance and mean of each metric ==================")
def get_var_data1(x,y,z,t1):
sql_cid = "select {0}/{1} from {2} \
where stat_date >= '{3}' ".format(x,y,z,t1)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
def get_var_data2(x,y,z,q,t1):
sql_cid = "select ({0}+{1})/{2} from {3} \
where stat_date >= '{4}' ".format(x,y,z,q,t1)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
def collect_data(data):
tt = [float(data[i][0])*100 for i in range(len(data))]
return tt
var_ctcvr_old_data=get_var_data1("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",five_days)
var_ctcvr_old_D=collect_data(var_ctcvr_old_data)
var_ctcvr_old=np.var(var_ctcvr_old_D)
mean_var_ctcvr_old=np.mean(var_ctcvr_old_D)
print("【1-1】老用户CT-CVR数据波动5日内方差检验结果:{}".format(var_ctcvr_old))
print("【1-2】老用户CT-CVR数据波动5日内均值:{}%".format(mean_var_ctcvr_old))
print("\n")
var_ctcvr_new_data=get_var_data1("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",five_days)
var_ctcvr_new_D=collect_data(var_ctcvr_new_data)
var_ctcvr_new=np.var(var_ctcvr_new_D)
mean_var_ctcvr_new=np.mean(var_ctcvr_new_D)
print("【2-1】新用户CT-CVR数据波动5日内方差检验结果:{}".format(var_ctcvr_new))
print("【2-2】新用户CT-CVR数据波动5日内均值:{}%".format(mean_var_ctcvr_new))
print("\n")
var_cvr_old_data=get_var_data1("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",five_days)
var_cvr_old_D=collect_data(var_cvr_old_data)
var_cvr_old=np.var(var_cvr_old_D)
mean_var_cvr_old=np.mean(var_cvr_old_D)
print("【3-1】老用户CVR数据波动5日内方差检验结果:{}".format(var_cvr_old))
print("【3-2】老用户CVR数据波动5日内均值:{}%".format(mean_var_cvr_old))
print("\n")
#
var_cvr_new_data=get_var_data1("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",five_days)
var_cvr_new_D=collect_data(var_cvr_new_data)
var_cvr_new=np.var(var_cvr_new_D)
mean_var_cvr_new=np.mean(var_cvr_new_D)
print("【4-1】新用户CVR数据波动5日内方差检验结果:{}".format(var_cvr_new))
print("【4-2】新用户CVR数据波动5日内均值:{}%".format(mean_var_cvr_new))
print("\n")
var_ctr_old_data=get_var_data2("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","on_click_diary_card",five_days)
var_ctr_old_D=collect_data(var_ctr_old_data)
var_ctr_old=np.var(var_ctr_old_D)
mean_var_ctr_old=np.mean(var_ctr_old_D)
print("[5-1] Old-user CTR 5-day variance: {}".format(var_ctr_old))
print("[5-2] Old-user CTR 5-day mean: {}%".format(mean_var_ctr_old))
print("\n")
var_ctr_new_data=get_var_data2("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","on_click_diary_card",five_days)
var_ctr_new_D=collect_data(var_ctr_new_data)
var_ctr_new=np.var(var_ctr_new_D)
mean_var_ctr_new=np.mean(var_ctr_new_D)
print("【6-1】新用户CTR数据波动5日内方差检验结果:{}".format(var_ctr_new))
print("【6-2】新用户CTR数据波动5日内均值:{}%".format(mean_var_ctr_new))
print("\n")
var_ctr_new_precise_data=get_var_data2("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",five_days)
var_ctr_new_precise_D=collect_data(var_ctr_new_precise_data)
var_ctr_new_precise=np.var(var_ctr_new_precise_D)
mean_var_ctr_new_precise=np.mean(var_ctr_new_precise_D)
print("【7-1】新用户精准曝光CTR数据波动5日内方差检验结果:{}".format(var_ctr_new_precise))
print("【7-2】新用户精准曝光CTR数据波动5日内均值:{}%".format(mean_var_ctr_new_precise))
print("\n")
var_ctr_old_precise_data=get_var_data2("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",five_days)
var_ctr_old_precise_D=collect_data(var_ctr_old_precise_data)
var_ctr_old_precise=np.var(var_ctr_old_precise_D)
mean_var_ctr_old_precise=np.mean(var_ctr_old_precise_D)
print("【8-1】老用户精准曝光CTR数据波动5日内方差检验结果:{}".format(var_ctr_old_precise))
print("【8-2】老用户精准曝光CTR数据波动5日内均值:{}%".format(mean_var_ctr_old_precise))
print("\n")
# print("============================分割线===================================")
#根据新老用户进行区分
# print("============================新用户各指标假设检验结果分析===================================")
# #新用户cvr假设检验
# print("【1】新用户CVR假设检验结果:")
# crv_new_ttest1=t_test(x_crv_new,y_crv_new)
# #新用户ct_cvr假设检验
# print("【3】新用户CT-CVR假设检验结果:")
# ctcrv_new_ttest1=t_test(x_ctcrv_new,y_ctcrv_new)
# #新用户ctr假设检验
# print("【5】新用户CTR假设检验结果:")
# ctr_new_ttest1=t_test(x_ctr_new,y_ctr_new)
# #新用户ctr(on_click_diary_card)假设检验
# print("【7】新用户CTR假设检验(日记本列表ctr)(on_click_diary_card)结果:")
# ctr_new_o_ttest1=t_test(x_ctr_new_o,y_ctr_new_o)
#
#
#
#
#
# print("============================老用户各指标假设检验结果分析===================================")
# #老用户cvr假设检验
# print("【2】老用户CVR假设检验结果:")
# crv_old_ttest1=t_test(x_crv_old,y_crv_old)
# # #老用户ct_cvr假设检验
# print("【4】老用户CT-CVR假设检验结果:")
# ctcrv_old_ttest1=t_test(x_ctcrv_old,y_ctcrv_old)
# #老用户ctr假设检验
# print("【6】老用户CTR假设检验结果:")
# ctr_old_ttest1=t_test(x_ctr_old,y_ctr_old)
# #老用户ctr(on_click_diary_card)假设检验
# print("【8】老用户CTR假设检验(日记本列表ctr)(on_click_diary_card)结果:")
# ctr_old_o_ttest1=t_test(x_ctr_old_o,y_ctr_old_o)
##发送邮件
# my_sender='gaoyazhe@igengmei.com'
# my_pass = 'VCrKTui99a7ALhiK'
# my_user1='wangzhiwei@igengmei.com'
# def mail():
# ret = True
# try:
# text = "Hi!\nHow are you?\nHere is the link you wanted:\nhttp://www.baidu.com"
# msg = MIMEText(text, 'plain', 'utf-8')
# msg['From'] = formataddr(["王志伟", my_sender])
# msg['To'] = my_user1
# msg['Subject'] = str(datetime.date.today()) + "-esmm多目标模型训练指标统计"
# server = smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
# server.login(my_sender, my_pass)
# server.sendmail(my_sender, [my_user1], msg.as_string())
# server.quit()
# except Exception:
# ret=False
# return ret
#
# ret=mail()
# if ret:
# print("邮件发送成功")
# else:
# print("邮件发送失败")
# chi_cvr_new=
# chi_cvr_old=
#
# chi_ctcvr_new=
# chi_ctcvr_old=
#
#
#
# def chi_cal(data):
# ##发送邮件
#
# #coding=utf-8
#
# import smtplib
# from email.mime.text import MIMEText
# from email.utils import formataddr
# from email.mime.application import MIMEApplication
# import datetime
#
# from email.mime.multipart import MIMEMultipart
#
# my_sender='wangzhiwei@igengmei.com'
# my_pass = 'RiKEcsHAgesCZ7yd'
# my_user1='wangzhiwei@igengmei.com'
# my_user2='gaoyazhe@igengmei.com'
# my_user3='huangkai@igengmei.com'
# def mail():
# ret = True
# pdfFile = 'hypothesis.txt'
# pdfApart = MIMEApplication(open(pdfFile, 'rb').read())
# pdfApart.add_header('Content-Disposition', 'attachment', filename=pdfFile)
# m = MIMEMultipart()
# m.attach(pdfApart)
# m['Subject'] = '数据指标监控数据(假设检验)'
# m['From'] = '王志伟<wangzhiwei@igengmei.com>'
#
#
# try:
# # text = "Hi!\nHow are you?\nHere is the link you wanted:\nhttp://www.baidu.com"
# # msg = MIMEText(text, 'plain', 'utf-8')
# # msg['From'] = formataddr(["王志伟", my_sender])
# # msg['To'] = my_user1
# # msg['Subject'] = str(datetime.date.today()) + "-esmm多目标模型训练指标统计"
# server = smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
# server.login(my_sender, my_pass)
# server.sendmail(my_sender, [my_user1,my_user2,my_user3], m.as_string())
# server.quit()
# except Exception:
# ret=False
# return ret
#
# ret=mail()
# if ret:
# print("邮件发送成功")
# else:
# print("邮件发送失败")
##### send the report email (plain text, no attachment)
#coding=utf-8
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
import datetime
my_sender='wangzhiwei@igengmei.com'
my_pass = 'RiKEcsHAgesCZ7yd'
my_user1='wangzhiwei@igengmei.com'
# my_user2='zhangyanzhao@igengmei.com'
# my_user3='zhaochen@igengmei.com'
# my_user4='huangkai@igengmei.com'
# my_user5='lixiaofang@igengmei.com'
# my_user6='duanyingrong@igengmei.com'
# my_user7='liuxiao@igengmei.com'
# my_user8='gaoyazhe@igengmei.com'
def mail():
ret=True
try:
with open('hypothesis.txt') as f:
stat_data = f.read()
msg=MIMEText(stat_data,'plain','utf-8')
msg['From']=formataddr(["王志伟",my_sender])
msg['To']=my_user1
msg['Subject']= str(datetime.date.today())+"-数据指标监控数据(假设检验)"
server=smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
server.login(my_sender, my_pass)
server.sendmail(my_sender,[my_user1],msg.as_string())
server.quit()
except Exception:
ret=False
return ret
ret=mail()
if ret:
    print("email sent successfully")
else:
    print("failed to send email")
\ No newline at end of file
import pymysql
import pandas as pd
from multiprocessing import Pool
import numpy as np
import datetime
import time
def con_sql(db, sql):
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchone()[0]
return result
# def test(days):
# start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
# print(start)
# sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
# "from train_data where stat_date = '{}' and z = 1)".format(start,start)
# db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# exp = con_sql(db, sql)
# print(exp)
# sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
# "from train_data where stat_date = '{}' and z = 1)".format(start,start)
# click = con_sql(db, sql)
# return start,exp,click
if __name__ == "__main__":
# temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
# DIRECTORY_PATH = "/home/gmuser/"
# output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
# for i in range(1,41):
# a,b,c = test(i)
# with open(output_path, 'a+') as f:
# line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
# f.write(line)
#! /bin/bash #! /bin/bash
git checkout master
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/tensnsorflow/es MODEL_PATH=/srv/apps/ffm-baseline/tensnsorflow/es
DATA_PATH=/home/gmuser/esmm_data DATA_PATH=/data/esmm
echo "rm leave tfrecord" echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/* rm ${DATA_PATH}/tr/*
rm ${DATA_PATH}/va/* rm ${DATA_PATH}/va/*
rm ${DATA_PATH}/native/* rm ${DATA_PATH}/native/*
rm ${DATA_PATH}/nearby/* rm ${DATA_PATH}/nearby/*
rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/201* rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/20*
echo "data" echo "data"
${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/feature.log
echo "csv to tfrecord" echo "csv to tfrecord"
${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/
...@@ -32,15 +32,15 @@ rm ${DATA_PATH}/nearby/nearby_* ...@@ -32,15 +32,15 @@ rm ${DATA_PATH}/nearby/nearby_*
echo "train..." echo "train..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=2 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "infer native..." echo "infer native..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/native_infer.log
echo "infer nearby..." echo "infer nearby..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/nearby_infer.log
echo "sort and 2sql" echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/to_database.py ${PYTHON_PATH} ${MODEL_PATH}/to_database.py > ${DATA_PATH}/insert_database.log
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
from sqlalchemy import create_engine from sqlalchemy import create_engine
import pandas as pd import pandas as pd
import pymysql import pymysql
import MySQLdb
import time import time
def con_sql(sql): def con_sql(sql):
...@@ -37,10 +36,10 @@ def native_set_join(lst): ...@@ -37,10 +36,10 @@ def native_set_join(lst):
def main(): def main():
# native queue # native queue
df2 = pd.read_csv('/home/gmuser/esmm_data/native.csv') df2 = pd.read_csv('/data/esmm/native.csv')
df2['cid_id'] = df2['cid_id'].astype(str) df2['cid_id'] = df2['cid_id'].astype(str)
df1 = pd.read_csv("/home/gmuser/esmm_data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"]) df1 = pd.read_csv("/data/esmm/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"] df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False) df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
df3.columns = ["device_id","city_id","native_queue"] df3.columns = ["device_id","city_id","native_queue"]
...@@ -48,10 +47,10 @@ def main(): ...@@ -48,10 +47,10 @@ def main():
# nearby queue # nearby queue
df2 = pd.read_csv('/home/gmuser/esmm_data/nearby.csv') df2 = pd.read_csv('/data/esmm/nearby.csv')
df2['cid_id'] = df2['cid_id'].astype(str) df2['cid_id'] = df2['cid_id'].astype(str)
df1 = pd.read_csv("/home/gmuser/esmm_data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"]) df1 = pd.read_csv("/data/esmm/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"] df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False) df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
df4.columns = ["device_id","city_id","nearby_queue"] df4.columns = ["device_id","city_id","nearby_queue"]
...@@ -65,8 +64,6 @@ def main(): ...@@ -65,8 +64,6 @@ def main():
df_all["time"] = ctime df_all["time"] = ctime
print("union_device_count",df_all.shape) print("union_device_count",df_all.shape)
host='10.66.157.22' host='10.66.157.22'
port=4000 port=4000
user='root' user='root'
...@@ -75,21 +72,21 @@ def main(): ...@@ -75,21 +72,21 @@ def main():
charset='utf8' charset='utf8'
engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db)) engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db))
df_merge = df_all['device_id'] + df_all['city_id']
df_merge_str = (str(list(df_merge.values))).strip('[]')
try: try:
# df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1) # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
df_merge = df_all['device_id'] + df_all['city_id']
df_merge_str = (str(list(df_merge.values))).strip('[]')
delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str) delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
cur = con.cursor() cur = con.cursor()
cur.execute(delete_str) cur.execute(delete_str)
con.commit() con.commit()
df_all.to_sql('esmm_device_diary_queue',con=engine,if_exists='append',index=False) df_all.to_sql('esmm_device_diary_queue',con=engine,if_exists='append',index=False,chunksize=8000)
except Exception as e: except Exception as e:
print(e) print(e)
print("done")
if __name__ == '__main__': if __name__ == '__main__':
main() main()
\ No newline at end of file
...@@ -28,16 +28,23 @@ def gen_tfrecords(in_file): ...@@ -28,16 +28,23 @@ def gen_tfrecords(in_file):
df = pd.read_csv(in_file) df = pd.read_csv(in_file)
for i in range(df.shape[0]): for i in range(df.shape[0]):
feats = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"] "channel", "top", "time", "stat_date","hospital_id",
"method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
id = np.array([]) id = np.array([])
for j in feats: for j in feats:
id = np.append(id,df[j][i]) id = np.append(id,df[j][i])
app_list = np.array(str(df["app_list"][i]).split(","))
level2_list = np.array(str(df["clevel2_id"][i]).split(","))
level3_list = np.array(str(df["level3_ids"][i]).split(","))
features = tf.train.Features(feature={ features = tf.train.Features(feature={
"y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])), "y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
"z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])), "z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))) "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
}) "app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
"level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
"level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
})
example = tf.train.Example(features = features) example = tf.train.Example(features = features)
serialized = example.SerializeToString() serialized = example.SerializeToString()
......
...@@ -53,7 +53,10 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False): ...@@ -53,7 +53,10 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
features = { features = {
"y": tf.FixedLenFeature([], tf.float32), "y": tf.FixedLenFeature([], tf.float32),
"z": tf.FixedLenFeature([], tf.float32), "z": tf.FixedLenFeature([], tf.float32),
"ids": tf.FixedLenFeature([11], tf.int64) "ids": tf.FixedLenFeature([FLAGS.field_size], tf.int64),
"app_list": tf.VarLenFeature(tf.int64),
"level2_list": tf.VarLenFeature(tf.int64),
"level3_list": tf.VarLenFeature(tf.int64)
} }
parsed = tf.parse_single_example(record, features) parsed = tf.parse_single_example(record, features)
...@@ -99,6 +102,8 @@ def model_fn(features, labels, mode, params): ...@@ -99,6 +102,8 @@ def model_fn(features, labels, mode, params):
Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer()) Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer())
feat_ids = features['ids'] feat_ids = features['ids']
app_list = features['app_list']
level2_list = features['level2_list']
if FLAGS.task_type != "infer": if FLAGS.task_type != "infer":
y = labels['y'] y = labels['y']
...@@ -107,8 +112,12 @@ def model_fn(features, labels, mode, params): ...@@ -107,8 +112,12 @@ def model_fn(features, labels, mode, params):
#------build f(x)------ #------build f(x)------
with tf.variable_scope("Shared-Embedding-layer"): with tf.variable_scope("Shared-Embedding-layer"):
embedding_id = tf.nn.embedding_lookup(Feat_Emb,feat_ids) embedding_id = tf.nn.embedding_lookup(Feat_Emb,feat_ids)
app_id = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=app_list, sp_weights=None, combiner="sum")
level2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level2_list, sp_weights=None, combiner="sum")
x_concat = tf.reshape(embedding_id,shape=[-1, common_dims]) # None * (F * K)
# x_concat = tf.reshape(embedding_id,shape=[-1, common_dims]) # None * (F * K)
x_concat = tf.concat([tf.reshape(embedding_id,shape=[-1,common_dims]),app_id,level2], axis=1)
with tf.name_scope("CVR_Task"): with tf.name_scope("CVR_Task"):
if mode == tf.estimator.ModeKeys.TRAIN: if mode == tf.estimator.ModeKeys.TRAIN:
......
...@@ -9,18 +9,7 @@ import time ...@@ -9,18 +9,7 @@ import time
from sqlalchemy import create_engine from sqlalchemy import create_engine
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
except Exception:
print("发生异常", Exception)
df = pd.DataFrame()
finally:
db.close()
return df
# def test(): # def test():
# sql = "select max(update_time) from ffm_diary_queue" # sql = "select max(update_time) from ffm_diary_queue"
...@@ -285,6 +274,35 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel): ...@@ -285,6 +274,35 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
# print("nearby_pre shape") # print("nearby_pre shape")
# print(nearby_pre.shape) # print(nearby_pre.shape)
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
except Exception:
print("发生异常", Exception)
df = pd.DataFrame()
finally:
db.close()
return df
def test(days):
start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
print(start)
sql = "select (select count(*) from train_data where stat_date = '{}' and y = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)".format(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
exp = con_sql(db, sql)[0].values.tolist()[0]
sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)".format(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
click = con_sql(db, sql)[0].values.tolist()[0]
return start,exp,click
if __name__ == "__main__": if __name__ == "__main__":
......
import datetime
from pyspark.sql import HiveContext
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
# from py4j.java_gateway import java_import
# import pytispark.pytispark as pti
import pandas as pd
import pymysql
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred", e)
df = pd.DataFrame()
finally:
db.close()
return df
# def test():
conf = SparkConf().setAppName("My App").set("spark.io.compression.codec", "lzf")
sc = SparkContext(conf = conf)
hive_context = HiveContext(sc)
hive_context.sql(''' select device["device_type"] from online.tl_hdfs_maidian_view
where partition_date = '20181012' and action = "page_view"
and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 ''').show(6)
# def esmm_pre():
# yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
# print(yesterday)
#
# spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# # gw = SparkContext._gateway
# #
# # # Import TiExtensions
# # java_import(gw.jvm, "org.apache.spark.sql.TiContext")
#
# # Inject TiExtensions, and get a TiContext
# # ti = gw.jvm.TiExtensions.getInstance(spark._jsparkSession).getOrCreateTiContext(spark._jsparkSession)
# ti = pti.TiContext(spark)
#
# ti.tidbMapDatabase("jerry_test")
#
# # sql("use tpch_test")
# spark.sql("select count(*) from esmm_pre_data").show(6)
#
# # conf = SparkConf().setAppName("esmm_pre").set("spark.io.compression.codec", "lzf")
#
# spark.sql("""
# select concat(tmp1.device_id,",",tmp1.city_id) as device_city, tmp1.merge_queue from (select device_id,if(city_id='world','worldwide',city_id) city_id,similarity_cid as merge_queue from nd_device_cid_similarity_matrix
# union select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from ffm_diary_queue
# union select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1 where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date='{}'
# """.format(yesterday)).show(6)
if __name__ == '__main__':