Commit 1bbca574 authored by 张彦钊

delete feature file

parents 00b27814 c5834874
@@ -18,39 +18,65 @@ def con_sql(db,sql):
    return df
def multi_hot(df, column, n):
    df[column] = df[column].fillna("lost_na")
    app_list_value = [i.split(",") for i in df[column].unique()]
    app_list_unique = []
    for i in app_list_value:
        app_list_unique.extend(i)
    app_list_unique = list(set(app_list_unique))
    number = len(app_list_unique)
    app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
    df[column] = df[column].apply(app_list_func, args=(app_list_map,))
    return number, app_list_map
def get_data():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from {}".format(train_data_set)
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
    print(start)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,cut.time,dl.app_list " \
          "from {} e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id " \
          "where e.stat_date >= '{}'".format(train_data_set, start)
    df = con_sql(db, sql)
    # print(df.shape)
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                            11: "time", 12: "app_list"})
    print("esmm data ok")
    # print(df.head(2)
    print("before")
    print(df.shape)
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "time", "stat_date", "app_list"])
    print("after")
    print(df.shape)
    app_list_number, app_list_map = multi_hot(df, "app_list", 1)
    level2_number, level2_map = multi_hot(df, "clevel2_id", 1 + app_list_number)

    # df["app_list"] = df["app_list"].fillna("lost_na")
    # app_list_value = [i.split(",") for i in df["app_list"].unique()]
    # app_list_unique = []
    # for i in app_list_value:
    #     app_list_unique.extend(i)
    # app_list_unique = list(set(app_list_unique))
    # app_list_map = dict(zip(app_list_unique, list(range(1, len(app_list_unique) + 1))))
    # df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    unique_values = []
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
@@ -58,25 +84,15 @@ def get_data():
        # the line below is to distinguish identical values that appear in different columns
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))
for i in ["l1","l2"]:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
# the values in l1 and l2 belong to the same category space as top
df[i] = df[i]+"top"
unique_values.extend(list(df[i].unique()))
print("features:")
print(len(unique_values))
print(df.head(2))
    temp = list(range(1 + app_list_number + level2_number, 1 + app_list_number + level2_number + len(unique_values)))
    value_map = dict(zip(unique_values, temp))
    df = df.drop("device_id", axis=1)
    train = df[df["stat_date"] != validate_date + "stat_date"]
    test = df[df["stat_date"] == validate_date + "stat_date"]
    for i in ["ucity_id", "ccity_name", "device_type", "manufacturer",
              "channel", "top", "time", "stat_date"]:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)

@@ -88,7 +104,18 @@ def get_data():
    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)
    return validate_date, value_map, app_list_map, level2_map
def app_list_func(x, l):
    b = x.split(",")
    e = []
    for i in b:
        if i in l.keys():
            e.append(l[i])
        else:
            e.append(0)
    return ",".join([str(j) for j in e])
def write_csv(df, name, n):
@@ -102,44 +129,45 @@ def write_csv(df,name,n):
        temp.to_csv(path + name + "/{}_{}.csv".format(name, i), index=False)
def get_predict(date, value_map, app_list_map, level2_map):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time,dl.app_list " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top",
                            10: "device_id", 11: "cid_id", 12: "time", 13: "app_list"})
    df["stat_date"] = date
    print(df.head(6))

    df["app_list"] = df["app_list"].fillna("lost_na")
    df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    df["clevel2_id"] = df["clevel2_id"].fillna("lost_na")
    df["clevel2_id"] = df["clevel2_id"].apply(app_list_func, args=(level2_map,))

    # print("predict shape")
    # print(df.shape)
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
        df[i] = df[i] + i
for i in ["l1","l2"]:
df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost")
# the values in l1 and l2 belong to the same category space as top
df[i] = df[i]+"top"
    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)

    for i in ["ucity_id", "ccity_name", "device_type", "manufacturer",
              "channel", "top", "time", "stat_date"]:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO: categories not covered by value_map become NaN; fill with 0 for now and improve later
        native_pre[i] = native_pre[i].fillna(0)
@@ -151,19 +179,20 @@ def get_predict(date,value_map):
    print("native")
    print(native_pre.shape)
    print(native_pre.head())
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)
    print("nearby")
    print(nearby_pre.shape)
    print(nearby_pre.head())
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    train_data_set = "esmm_train_data"
    path = "/data/esmm/"
    date, value, app_list, level2 = get_data()
    get_predict(date, value, app_list, level2)
#coding=utf-8
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
import datetime
my_sender='gaoyazhe@igengmei.com'
my_pass = 'VCrKTui99a7ALhiK'
my_user1='gaoyazhe@igengmei.com'
my_user2='zhangyanzhao@igengmei.com'
def mail():
ret=True
try:
with open('/home/gmuser/esmm_data/submit.log') as f:
stat_data = f.read()
msg=MIMEText(stat_data,'plain','utf-8')
msg['From']=formataddr(["高雅喆",my_sender])
msg['To']=my_user1 + ',' + my_user2
msg['Subject']= str(datetime.date.today())+"-esmm多目标模型训练指标统计"
server=smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
server.login(my_sender, my_pass)
server.sendmail(my_sender,[my_user1,my_user2],msg.as_string())
server.quit()
except Exception:
ret=False
return ret
ret=mail()
if ret:
print("邮件发送成功")
else:
print("邮件发送失败")
\ No newline at end of file
#! /bin/bash
cd /srv/apps/ffm-baseline/eda/esmm
git checkout master
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm/Model_pipline
DATA_PATH=/data/esmm
echo "start time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "rm leave tfrecord" echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/* rm ${DATA_PATH}/tr/*
rm ${DATA_PATH}/va/* rm ${DATA_PATH}/va/*
rm ${DATA_PATH}/native/* rm ${DATA_PATH}/native/*
rm ${DATA_PATH}/nearby/* rm ${DATA_PATH}/nearby/*
rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/201* rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/20*
echo "data2ffm"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
echo "data"
${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/feature.log
all_sample=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$2$3$4}' | sort | uniq | wc -l`))
uniq_feat=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$4}' | sort | uniq -u | wc -l`))
repe_feat=$((all_sample-uniq_feat))
echo "Bayes Error Rate": $((repe_feat*100/all_sample))%
echo "split data"
split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/va.csv`/5)) ${DATA_PATH}/va.csv -d -a 4 ${DATA_PATH}/va/va_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/native.csv`/15)) ${DATA_PATH}/native.csv -d -a 4 ${DATA_PATH}/native/native_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/nearby.csv`/5)) ${DATA_PATH}/nearby.csv -d -a 4 ${DATA_PATH}/nearby/nearby_ --additional-suffix=.csv
echo "csv to tfrecord" echo "csv to tfrecord"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/va/ --output_dir=${DATA_PATH}/va/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/va/ --output_dir=${DATA_PATH}/va/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/native/ --output_dir=${DATA_PATH}/native/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/native/ --output_dir=${DATA_PATH}/native/
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/get_tfrecord.py --input_dir=${DATA_PATH}/nearby/ --output_dir=${DATA_PATH}/nearby/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/nearby/ --output_dir=${DATA_PATH}/nearby/
cat ${DATA_PATH}/tr/*.tfrecord > ${DATA_PATH}/tr/tr.tfrecord cat ${DATA_PATH}/tr/*.tfrecord > ${DATA_PATH}/tr/tr.tfrecord
cat ${DATA_PATH}/va/*.tfrecord > ${DATA_PATH}/va/va.tfrecord cat ${DATA_PATH}/va/*.tfrecord > ${DATA_PATH}/va/va.tfrecord
...@@ -49,35 +30,17 @@ rm ${DATA_PATH}/va/va_* ...@@ -49,35 +30,17 @@ rm ${DATA_PATH}/va/va_*
rm ${DATA_PATH}/native/native_* rm ${DATA_PATH}/native/native_*
rm ${DATA_PATH}/nearby/nearby_* rm ${DATA_PATH}/nearby/nearby_*
echo "data transform time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "train..." echo "train..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "train time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "infer native..." echo "infer native..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/native_infer.log
echo "infer nearby..." echo "infer nearby..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=2000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=8 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/nearby_infer.log
echo "sort and 2sql" echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/sort_and_2sql.py ${PYTHON_PATH} ${MODEL_PATH}/to_database.py > ${DATA_PATH}/insert_database.log
echo "infer and sort and 2sql time"
current=$(date "+%Y-%m-%d %H:%M:%S")
timeStamp=$(date -d "$current" +%s)
currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/send_mail.py
\ No newline at end of file
@@ -3,7 +3,6 @@
from sqlalchemy import create_engine
import pandas as pd
import pymysql
import MySQLdb
import time


def con_sql(sql):
@@ -19,31 +18,41 @@ def con_sql(sql):
    return result
def nearby_set_join(lst):
    # return ','.join([str(i) for i in list(lst)])
    return ','.join([str(i) for i in lst.unique().tolist()])


def native_set_join(lst):
    l = lst.unique().tolist()
    d = int(len(l) / 2)
    if d == 0:
        d = 1
    r = [str(i) for i in l]
    r = r[:d]
    return ','.join(r)
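# Illustrative difference between the two helpers above (values made up):
#   cids = pd.Series([16215222, 16204965, 15361235, 16204965])
#   nearby_set_join(cids)  -> "16215222,16204965,15361235"   (all unique ids, in order)
#   native_set_join(cids)  -> "16215222"                     (first int(3/2) == 1 unique id)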
def main():
    # native queue
    df2 = pd.read_csv('/data/esmm/native.csv')
    df2['cid_id'] = df2['cid_id'].astype(str)
    df1 = pd.read_csv("/data/esmm/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
    df3.columns = ["device_id", "city_id", "native_queue"]
    print("native_device_count", df3.shape)

    # nearby queue
    df2 = pd.read_csv('/data/esmm/nearby.csv')
    df2['cid_id'] = df2['cid_id'].astype(str)
    df1 = pd.read_csv("/data/esmm/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
    df4.columns = ["device_id", "city_id", "nearby_queue"]
    print("nearby_device_count", df4.shape)
@@ -55,8 +64,6 @@ def main():
    df_all["time"] = ctime
    print("union_device_count", df_all.shape)

    host='10.66.157.22'
    port=4000
    user='root'
@@ -65,11 +72,11 @@ def main():
    charset='utf8'
    engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db))
    try:
        # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
        df_merge = df_all['device_id'] + df_all['city_id']
        df_merge_str = (str(list(df_merge.values))).strip('[]')
        delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
        con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
        cur = con.cursor()
@@ -79,7 +86,7 @@ def main():
    except Exception as e:
        print(e)
    print("done")


if __name__ == '__main__':
    main()
\ No newline at end of file
#coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import sys
import os
import glob
import tensorflow as tf
import numpy as np
import re
from multiprocessing import Pool as ThreadPool
flags = tf.app.flags
FLAGS = flags.FLAGS
LOG = tf.logging
tf.app.flags.DEFINE_string("input_dir", "./", "input dir")
tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
tf.app.flags.DEFINE_integer("threads", 16, "threads num")
def gen_tfrecords(in_file):
    basename = os.path.basename(in_file) + ".tfrecord"
    out_file = os.path.join(FLAGS.output_dir, basename)
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
    df = pd.read_csv(in_file)
    for i in range(df.shape[0]):
        feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                 "channel", "top", "time", "stat_date"]
        id = np.array([])
        for j in feats:
            id = np.append(id, df[j][i])
        app_list = np.array(str(df["app_list"][i]).split(","))
        level2_list = np.array(str(df["clevel2_id"][i]).split(","))
        features = tf.train.Features(feature={
            "y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
            "z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
            "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
            "app_list": tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
            "level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int)))
        })
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        tfrecord_out.write(serialized)
    tfrecord_out.close()
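# Spot-check sketch (assumes the same TF 1.x API used above; the file name is hypothetical):
#   for serialized in tf.python_io.tf_record_iterator("tr_0000.csv.tfrecord"):
#       example = tf.train.Example.FromString(serialized)
#       print(example.features.feature["ids"].int64_list.value)       # 8 fixed-length ids
#       print(example.features.feature["app_list"].int64_list.value)  # variable-length tag ids
#       break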
def main(_):
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
    print("total files: %d" % len(file_list))
    pool = ThreadPool(FLAGS.threads)  # Sets the pool size
    pool.map(gen_tfrecords, file_list)
    pool.close()
    pool.join()
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run()
\ No newline at end of file
@@ -53,9 +53,11 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
        features = {
            "y": tf.FixedLenFeature([], tf.float32),
            "z": tf.FixedLenFeature([], tf.float32),
            "ids": tf.FixedLenFeature([FLAGS.field_size], tf.int64),
            "app_list": tf.VarLenFeature(tf.int64),
            "level2_list": tf.VarLenFeature(tf.int64)
        }
        parsed = tf.parse_single_example(record, features)
        y = parsed.pop('y')
        z = parsed.pop('z')
@@ -98,15 +100,9 @@ def model_fn(features, labels, mode, params):
    # ------ build weights ------
    Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer())

    feat_ids = features['ids']
    app_list = features['app_list']
    level2_list = features['level2_list']
#feat_vals = features['feat_vals']
#{User multi-hot}
#{Ad}
#{X multi-hot}
#x_intids = features['x_intids']
#x_intvals = features['x_intvals']
    if FLAGS.task_type != "infer":
        y = labels['y']
@@ -114,10 +110,13 @@ def model_fn(features, labels, mode, params):
    # ------ build f(x) ------
    with tf.variable_scope("Shared-Embedding-layer"):
        embedding_id = tf.nn.embedding_lookup(Feat_Emb, feat_ids)
        app_id = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=app_list, sp_weights=None, combiner="sum")
        level2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level2_list, sp_weights=None, combiner="sum")
        # x_concat = tf.reshape(embedding_id, shape=[-1, common_dims])  # None * (F * K)
        x_concat = tf.concat([tf.reshape(embedding_id, shape=[-1, common_dims]), app_id, level2], axis=1)

    with tf.name_scope("CVR_Task"):
        if mode == tf.estimator.ModeKeys.TRAIN:
@@ -348,20 +347,6 @@ def main(_):
                fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr']))
    elif FLAGS.task_type == 'export':
        print("Not Implemented, Do It Yourself!")
#feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
#feature_spec = {
# 'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]),
# 'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size])
#}
#serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
#feature_spec = {
# 'feat_ids': tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.field_size], name='feat_ids'),
# 'feat_vals': tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.field_size], name='feat_vals')
#}
#serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
#Estimator.export_savedmodel(FLAGS.servable_model_dir, serving_input_receiver_fn)
if __name__ == "__main__": if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
......
@@ -2,8 +2,9 @@ package com.gmei

import java.io.Serializable
import java.time.LocalDate

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
@@ -30,8 +31,8 @@ object EsmmData {
      .text(s"the databases environment you used")
      .action((x, c) => c.copy(env = x))
    opt[String]("date")
      .text(s"the date you used")
      .action((x, c) => c.copy(date = x))
    note(
      """
        |For example, the following command runs this app on a tidb dataset:
@@ -69,15 +70,15 @@ object EsmmData {
    if (max_stat_date_str != param.date){
      val stat_date = param.date
      println(stat_date)
      // val imp_data = sc.sql(
      //   s"""
      //     |select distinct stat_date,device_id,city_id as ucity_id,
      //     |  cid_id,diary_service_id
      //     |from data_feed_exposure
      //     |where cid_type = 'diary'
      //     |and stat_date ='${stat_date}'
      //   """.stripMargin
      // )
      val imp_data = sc.sql(
        s"""
@@ -90,8 +91,8 @@ object EsmmData {
        """.stripMargin
      )
      // imp_data.show()
      println("imp_data.count()")
      println(imp_data.count())
      val clk_data = sc.sql(
@@ -104,8 +105,8 @@ object EsmmData {
        """.stripMargin
      )
      // clk_data.show()
      println("clk_data.count()")
      println(clk_data.count())
@@ -138,8 +139,9 @@ object EsmmData {
      // println(cvr_data_filter.count())
      val other_click = get_other_click(sc, stat_date_not)
      val all_click = clk_data.union(other_click)
      val clk_data_filter = all_click.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
      // clk_data_filter.createOrReplaceTempView("clk_data_filter")
      // clk_data_filter.show()
      // println("clk_data_filter.count()")
@@ -220,10 +222,10 @@ object EsmmData {
          |group by device_id,cid_id
        """.stripMargin
      )
      union_data_scity_id2.persist()
      GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_train_data",SaveMode.Append)
      GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_train_data",SaveMode.Append)
      union_data_scity_id2.unpersist()

    } else {
      println("esmm_train_data already have param.date data")
    }
@@ -232,6 +234,103 @@ object EsmmData {
    }
  }
def get_other_click(spark:SparkSession,yesterday:String): DataFrame ={
var result01 = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params['business_id'] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['tab_name'] != '精选'
|and params['page_name'] = 'home'
""".stripMargin
)
// println(result01.count())
// result01.show(6)
val recommend = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params["business_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'diarybook_detail_click_recommend_block' and params["business_type"] = "diary"
""".stripMargin
)
// println("详情页推荐日记:")
// println(recommend.count())
// recommend.show(6)
val search_zonghe = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,city_id,params["business_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'search_result_click_infomation_item' and params["business_type"] = "diary"
""".stripMargin
)
// println("搜索综合:")
// println(search_zonghe.count())
// search_zonghe.show(6)
val non_home = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,city_id,params["diary_id"] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['page_name'] != 'home'
""".stripMargin
)
// println("non home:")
// println(non_home.count())
// non_home.show(6)
result01 = result01.union(recommend).union(search_zonghe).union(non_home)
// println(result01.count())
result01.createOrReplaceTempView("temp_result")
val result02 = spark.sql(
s"""
|select * from temp_result
|where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
| and device_id not in
| (SELECT cl_id
| FROM online.ml_hospital_spam_pv_day
| WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| UNION ALL
| SELECT cl_id
| FROM online.ml_hospital_spam_pv_month
| WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| )
""".stripMargin
)
result02.createOrReplaceTempView("temp_result02")
val result_dairy = spark.sql(
s"""
|select
| re.stat_date as stat_date,
| re.device_id as device_id,
| re.city_id as ucity_id,
| re.cid as cid_id,
| da.service_id as diary_service_id
|from temp_result02 re
|left join online.ml_community_diary_updates da
|on re.cid = da.diary_id
|where da.partition_date='${yesterday}'
""".stripMargin
).distinct()
result_dairy
}
}

@@ -279,11 +378,20 @@ object EsmmPredData {
    ti.tidbMapTable("eagle","search_queue")
    ti.tidbMapTable(dbName = "jerry_test",tableName = "esmm_train_data")
    ti.tidbMapTable("eagle","biz_feed_diary_queue")
    ti.tidbMapTable("jerry_prod","data_feed_exposure_precise")

    import sc.implicits._
    val yesteday_have_seq = GmeiConfig.getMinusNDate(1)
val target_user = sc.sql(
s"""
|select concat(t.device_id,",",t.city_id) from
|(select distinct device_id,city_id
|from data_feed_exposure where stat_date='${yesteday_have_seq}') t
""".stripMargin).collect().map(x => x(0).toString)
println("target_user",target_user.length)
    //nearby_data
    val raw_data = sc.sql(
      s"""
@@ -293,21 +401,50 @@ object EsmmPredData {
        |select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from ffm_diary_queue
        |union
        |select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1
      """.stripMargin)
    // raw_data.show()
    val raw_data1 = raw_data.rdd.groupBy(_.getAs[String]("device_city"))
      .filter(x => target_user.indexOf(x._1) != -1)
      .map {
        case (device_city, cid_data) =>
          val device_id = Try(device_city.split(",")(0)).getOrElse("")
          val city_id = Try(device_city.split(",")(1)).getOrElse("")
          val cids = Try(cid_data.toSeq.map(_.getAs[String]("merge_queue").split(",")).flatMap(_.zipWithIndex).sortBy(_._2).map(_._1).distinct.take(500).mkString(",")).getOrElse("")
          (device_id, city_id, s"$cids")
      }.filter(_._3 != "").toDF("device_id","city_id","merge_queue")
    // println("nearby_device_count",raw_data1.count())
val start= LocalDate.now().minusDays(14).toString
import sc.implicits._
val sql =
s"""
|select distinct device_id,cid_id from data_feed_exposure_precise
|where stat_date >= "$start" and cid_type = "diary"
""".stripMargin
val history = sc.sql(sql).repartition(200).rdd
.map(x =>(x(0).toString,x(1).toString)).groupByKey().map(x => (x._1,x._2.mkString(",")))
.toDF("device_id","cid_set")
history.persist()
history.createOrReplaceTempView("history")
if (history.take(1).nonEmpty){
raw_data1.createOrReplaceTempView("r")
val sql_nearby_filter =
s"""
|select r.device_id,r.city_id,r.merge_queue,history.cid_set from r
|left join history on r.device_id = history.device_id
""".stripMargin
val df = sc.sql(sql_nearby_filter).na.fill("").rdd
.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
.map(x => (x._1,x._2,x._3.split(",").diff(x._4.split(",")).mkString(",")))
.toDF("device_id","city_id","merge_queue")
df.createOrReplaceTempView("raw_data1")
}else{
raw_data1.createOrReplaceTempView("raw_data1")
}
    val raw_data2 = sc.sql(
      s"""
@@ -315,7 +452,7 @@ object EsmmPredData {
      """.stripMargin
    ).withColumn("label",lit(1))
    raw_data2.createOrReplaceTempView("raw_data2")
    // println("nearby_explode_count",raw_data2.count())

    // native_data
@@ -327,8 +464,26 @@ object EsmmPredData {
        |where a.stat_date='${yesteday_have_seq}' and b.native_queue != ""
      """.stripMargin
    )
    // println("native_device_count",native_data.count())
if (history.take(1).nonEmpty){
native_data.createOrReplaceTempView("temp")
val sql_native_filter =
s"""
|select t.device_id,t.city_id,t.native_queue,history.cid_set from temp t
|left join history on t.device_id = history.device_id
""".stripMargin
val df = sc.sql(sql_native_filter).na.fill("").rdd
.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
.map(x => (x._1,x._2,x._3.split(",").diff(x._4.split(",")).mkString(",")))
.toDF("device_id","city_id","native_queue")
df.createOrReplaceTempView("native_data")
}else{
native_data.createOrReplaceTempView("native_data")
}
history.unpersist()
val native_data1 = sc.sql( val native_data1 = sc.sql(
s""" s"""
...@@ -336,9 +491,7 @@ object EsmmPredData { ...@@ -336,9 +491,7 @@ object EsmmPredData {
""".stripMargin """.stripMargin
).withColumn("label",lit(0)) ).withColumn("label",lit(0))
native_data1.createOrReplaceTempView("native_data1") native_data1.createOrReplaceTempView("native_data1")
println("native_explode_count",native_data1.count()) // println("native_explode_count",native_data1.count())
//union //union
val union_data = sc.sql( val union_data = sc.sql(
...@@ -349,7 +502,7 @@ object EsmmPredData { ...@@ -349,7 +502,7 @@ object EsmmPredData {
""".stripMargin """.stripMargin
) )
union_data.createOrReplaceTempView("raw_data") union_data.createOrReplaceTempView("raw_data")
println("union_count",union_data.count()) // println("union_count",union_data.count())
//join feat //join feat
...@@ -364,8 +517,8 @@ object EsmmPredData { ...@@ -364,8 +517,8 @@ object EsmmPredData {
|where b.partition_date = '${yesteday}' |where b.partition_date = '${yesteday}'
""".stripMargin """.stripMargin
) )
// sid_data.show() // sid_data.show()
println(sid_data.count()) // println(sid_data.count())
val sid_data_label = sid_data.withColumn("y",lit(0)).withColumn("z",lit(0)) val sid_data_label = sid_data.withColumn("y",lit(0)).withColumn("z",lit(0))
sid_data_label.createOrReplaceTempView("union_data") sid_data_label.createOrReplaceTempView("union_data")
...@@ -413,10 +566,29 @@ object EsmmPredData { ...@@ -413,10 +566,29 @@ object EsmmPredData {
union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name") union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name")
// union_data_ccity_name.show() // union_data_ccity_name.show()
val jdbcDF = sc.read
.format("jdbc")
.option("driver", "com.mysql.jdbc.Driver")
.option("url", "jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com:3306/zhengxing")
.option("dbtable", "api_punishment")
.option("user", "work")
.option("password", "BJQaT9VzDcuPBqkd")
.load()
jdbcDF.createOrReplaceTempView("api_punishment")
val now = LocalDate.now().toString
val punish_doctor = sc.sql(
s"""
|select doctor_id from api_punishment
|where end_time > '$now'
""".stripMargin).collect().map(x => x(0).toString).distinct
println("punish_doctor")
println(punish_doctor.length)
    val union_data_scity_id = sc.sql(
      s"""
        |select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.label,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,a.ccity_name,
        | d.city_id as scity_id,b.doctor_id,c.hospital_id
        |from union_data_ccity_name a
        |left join online.tl_meigou_service_view b on a.diary_service_id=b.id
        |left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id
@@ -424,24 +596,28 @@ object EsmmPredData {
        |where b.partition_date='${yesteday}'
        |and c.partition_date='${yesteday}'
        |and d.partition_date='${yesteday}'
        |and b.doctor_id not in (${punish_doctor.map(x => s"'$x'").mkString(",")})
      """.stripMargin
    )
    union_data_scity_id.createOrReplaceTempView("union_data_scity_id")

    val union_data_scity_id2 = sc.sql(
      s"""
        |select device_id,cid_id,first(stat_date) stat_date,ucity_id,first(label) label,first(diary_service_id) diary_service_id,first(y) y,
        |first(z) z,first(clevel1_id) clevel1_id,first(slevel1_id) slevel1_id,first(ccity_name) ccity_name,
        |first(scity_id) scity_id,first(hospital_id) hospital_id
        |from union_data_scity_id
        |group by device_id,ucity_id,cid_id
      """.stripMargin
    )
    // union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
    // println(union_data_scity_id2.count())
    union_data_scity_id2.persist()
    GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_pre_data",SaveMode.Overwrite)
    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_pre_data",SaveMode.Overwrite)
    union_data_scity_id2.unpersist()
@@ -497,9 +673,9 @@ object GetDiaryPortrait {
    val diary_tag = sc.sql(
      s"""
        |select c.diary_id,
        |   concat_ws(',',collect_set(cast(c.level1_id as string))) as level1_ids,
        |   concat_ws(',',collect_set(cast(c.level2_id as string))) as level2_ids,
        |   concat_ws(',',collect_set(cast(c.level3_id as string))) as level3_ids from
        |   (select a.diary_id,b.level1_id,b.level2_id,b.level3_id
        |   from online.tl_hdfs_diary_tags_view a
        |   left join online.bl_tag_hierarchy_detail b
@@ -509,10 +685,17 @@ object GetDiaryPortrait {
        |   group by c.diary_id
      """.stripMargin
    )
    // diary_tag.show()
    // println(diary_tag.count())
diary_tag.createOrReplaceTempView("t")
val result = sc.sql(
s"""
|select diary_id,level1_ids,level2_ids,level3_ids,split(level2_ids,",")[0] as level2 from t
""".stripMargin
)
val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
    GmeiConfig.writeToJDBCTable(jdbc, result, "diary_feat", SaveMode.Overwrite)

    sc.stop()
@@ -589,19 +772,19 @@ object GetDevicePortrait {
    device_search_tag.createOrReplaceTempView("tag_count")

    val max_count_tag = sc.sql(
      s"""
        |select a.device_id,a.stat_date,a.level1_id as max_level1_id,a.level1_count as max_level1_count
        |from tag_count a
        |inner join
        |(select device_id,max(level1_count) as max_count from tag_count group by device_id) b
        |on a.level1_count = b.max_count and a.device_id = b.device_id
      """.stripMargin
    )
    //  .rdd.map(x => (x(0).toString,x(1).toString,x(2).toString,x(3).toString))
    // max_count_tag.foreachPartition(GmeiConfig.updateDeviceFeat)
    //
    // max_count_tag.take(10).foreach(println)
    // println(max_count_tag.count())

    //drop duplicates
    val max_count_tag_rdd = max_count_tag.rdd.groupBy(_.getAs[String]("device_id")).map {
@@ -669,7 +852,7 @@ object GetLevelCount {
    import sc.implicits._
    val stat_date = GmeiConfig.getMinusNDate(1).replace("-","")
    // val diary_queue = sc.read.json(param.path).rdd.map(x => x(0).toString).distinct().collect().toList.mkString(",")
    val diary_queue = "16215222,16204965,15361235,16121397,16277565,15491159,16299587,16296887,15294642,16204934,15649199,16122580,16122580,16122580,16122580,16122580,16122580"
    val diary_level1 = sc.sql(
      s"""
@@ -804,4 +987,298 @@ object GetDeviceDuration {
    }
  }
}
\ No newline at end of file
object EsmmDataTest {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = GmeiConfig.getMinusNDate(1)
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("EsmmData")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String]("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.EsmmData ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "eagle",tableName = "src_mimas_prod_api_diary_tags")
ti.tidbMapTable(dbName = "eagle",tableName = "src_zhengxing_api_tag")
ti.tidbMapTable(dbName = "jerry_test",tableName = "esmm_click")
ti.tidbMapTable(dbName = "jerry_prod",tableName = "data_feed_exposure_precise")
ti.tidbMapTable(dbName = "jerry_test", tableName = "train_data")
click(sc)
val max_stat_date = sc.sql(
s"""
|select max(stat_date) from train_data
""".stripMargin
)
val max_stat_date_str = max_stat_date.collect().map(s => s(0).toString).head
println("max_stat_date_str",max_stat_date_str)
println("param.date",param.date)
if (max_stat_date_str != param.date || max_stat_date_str == null){
val stat_date = param.date
println(stat_date)
// val imp_data = sc.sql(
// s"""
// |select distinct stat_date,device_id,city_id as ucity_id,
// | cid_id,diary_service_id
// |from data_feed_exposure
// |where cid_type = 'diary'
// |and stat_date ='${stat_date}'
// """.stripMargin
// )
val imp_data = sc.sql(
s"""
|select * from
|(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from data_feed_exposure_precise
|where cid_type = 'diary'
|and stat_date ='${stat_date}'
|group by stat_date,device_id,city_id,cid_id,diary_service_id) a
""".stripMargin
)
// imp_data.show()
// println("imp_data.count()")
// println(imp_data.count())
val clk_data = sc.sql(
s"""
|select distinct stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from esmm_click
|where stat_date ='${stat_date}'
""".stripMargin
)
// clk_data.show()
// println("clk_data.count()")
// println(clk_data.count())
val imp_data_filter = imp_data.except(clk_data).withColumn("y",lit(0)).withColumn("z",lit(0))
// imp_data_filter.createOrReplaceTempView("imp_data_filter")
// imp_data_filter.show()
// println("imp_data_filter.count()")
// println(imp_data_filter.count())
val stat_date_not = stat_date.replace("-","")
val cvr_data = sc.sql(
s"""
|select distinct
| from_unixtime(unix_timestamp(partition_date ,'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
| cl_id as device_id,city_id as ucity_id,
| params["referrer_id"] as cid_id,params["business_id"] as diary_service_id
|from online.tl_hdfs_maidian_view
|where action='page_view'
|and partition_date ='${stat_date_not}'
|and params['page_name'] = 'welfare_detail'
|and params['referrer'] = 'diary_detail'
""".stripMargin
)
val cvr_data_filter = cvr_data.withColumn("y",lit(1)).withColumn("z",lit(1))
// cvr_data_filter.createOrReplaceTempView("cvr_data_filter")
// cvr_data_filter.show()
// println("cvr_data_filter.count()")
// println(cvr_data_filter.count())
val clk_data_filter =clk_data.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
// clk_data_filter.createOrReplaceTempView("clk_data_filter")
// clk_data_filter.show()
// println("clk_data_filter.count()")
// println(clk_data_filter.count())
val union_data = imp_data_filter.union(clk_data_filter).union(cvr_data_filter)
union_data.createOrReplaceTempView("union_data")
// union_data.show()
// println("union_data.count()")
// println(union_data.count())
val union_data_clabel = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,
| c.level1_id as clevel1_id
|from union_data a
|left join online.tl_hdfs_diary_tags_view b on a.cid_id=b.diary_id
|left join online.bl_tag_hierarchy_detail c on b.tag_id=c.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_clabel.createOrReplaceTempView("union_data_clabel")
// union_data_clabel.show()
val union_data_slabel = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,
| c.level1_id as slevel1_id
|from union_data_clabel a
|left join online.tl_meigou_servicetag_view b on a.diary_service_id=b.service_id
|left join online.bl_tag_hierarchy_detail c on b.tag_id=c.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_slabel.createOrReplaceTempView("union_data_slabel")
// union_data_slabel.show()
val union_data_ccity_name = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,
| c.name as ccity_name
|from union_data_slabel a
|left join src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
|left join src_zhengxing_api_tag c on b.tag_id=c.id
| where c.tag_type=4
""".stripMargin
)
union_data_ccity_name.createOrReplaceTempView("union_data_ccity_name")
// union_data_ccity_name.show()
val union_data_scity_id = sc.sql(
s"""
|select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,a.ccity_name,
| d.city_id as scity_id
|from union_data_ccity_name a
|left join online.tl_meigou_service_view b on a.diary_service_id=b.id
|left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id
|left join online.tl_hdfs_hospital_view d on c.hospital_id=d.id
|where b.partition_date='${stat_date_not}'
|and c.partition_date='${stat_date_not}'
|and d.partition_date='${stat_date_not}'
""".stripMargin
)
union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
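    // scity_id (the service's city) is resolved through the chain service -> doctor -> hospital -> hospital.city_id.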
union_data_scity_id.show()
val union_data_scity_id2 = sc.sql(
s"""
|select device_id,cid_id,first(stat_date) stat_date,first(ucity_id) ucity_id,first(diary_service_id) diary_service_id,first(y) y,
|first(z) z,first(clevel1_id) clevel1_id,first(slevel1_id) slevel1_id,first(ccity_name) ccity_name,first(scity_id) scity_id
|from union_data_scity_id
|group by device_id,cid_id
""".stripMargin
)
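    // Grouping by (device_id, cid_id) and taking first(...) of the remaining columns keeps one row per
    // device/diary pair, collapsing the duplicates introduced by the multi-valued tag joins above.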
GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="train_data",SaveMode.Append)
} else {
println("train_data already have param.date data")
}
sc.stop()
}
}
def click(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val stat_yesterday = LocalDate.now().minusDays(1).toString
val max_stat_date = spark.sql(
s"""
|select max(stat_date) from esmm_click
""".stripMargin
)
val max = max_stat_date.collect().map(s => s(0).toString).head
println("max_stat_date",max)
if (max != stat_yesterday || max == null){
val result01 = spark.sql(
s"""
|select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
|device["device_id"] as device_id,channel as device_type,
|city_id,params['diary_id'] as cid
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'on_click_diary_card' and params['tab_name'] = '精选'
|and params['page_name'] = 'home'
""".stripMargin
)
result01.createOrReplaceTempView("temp_result")
val result02 = spark.sql(
s"""
|select * from temp_result
|where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
| and device_id not in
| (SELECT cl_id
| FROM online.ml_hospital_spam_pv_day
| WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| UNION ALL
| SELECT cl_id
| FROM online.ml_hospital_spam_pv_month
| WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
| AND pv_ratio>=0.95
| )
""".stripMargin
)
result02.createOrReplaceTempView("temp_result02")
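    // temp_result02 drops clicks coming from the channel blacklist above and from devices flagged as spam in
    // ml_hospital_spam_pv_day / ml_hospital_spam_pv_month (pv_ratio >= 0.95).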
val result_dairy = spark.sql(
s"""
|select
| re.stat_date as stat_date,
| re.device_id as device_id,
| re.device_type as device_type,
| re.cid as cid_id,
| re.city_id as city_id,
| da.service_id as diary_service_id
|from temp_result02 re
|left join online.ml_community_diary_updates da
|on re.cid = da.diary_id
|where da.partition_date='${yesterday}'
""".stripMargin
)
val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbcuri,result_dairy, table="esmm_click",SaveMode.Append)
println("data insert")
}else{
println("data already exists")
}
}
}
...@@ -53,6 +53,11 @@ object GmeiConfig extends Serializable {
      .enableHiveSupport()
      .getOrCreate()
    spark.sql("use online")
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
    spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
    spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
    val context = SparkContext.getOrCreate(sparkConf)
    (context, spark)
  }
...@@ -65,10 +70,15 @@ object GmeiConfig extends Serializable {
    prop.put("isolationLevel", "NONE")
    prop.put("truncate", "true")
    // save to mysql/tidb
    try {
      df.repartition(128).write.mode(saveModel)
        .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, 300)
        .jdbc(jdbcuri, table, prop)
      print("写入成功")
    } catch {
      case _ => println("没有写入成功")
    }
  }
...@@ -109,3 +119,4 @@ object GmeiConfig extends Serializable {
  }
}
...@@ -68,9 +68,9 @@ object Search_keywords_count {
    // extract search keywords
    // (previously this selected count(test_udf(params)) and only covered do_search / search_result_click_search)
    val search_keywords = sc.sql(
      s"""
         |select params['query'] as search_keywords
         |from online.tl_hdfs_maidian_view
         |where (action = 'do_search' or action = 'search_result_click_search' or action ='on_click_jumping_hot_word')
         |and partition_date ='20190108'
      """.stripMargin
    ).show(20)/*.rdd.map(x=>{
......
...@@ -56,6 +56,11 @@ object data_feed_exposure_precise {
    //println(param.date)
    val partition_date = stat_date.replace("-","")
    // sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
    // sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
    // sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
    // sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
    val result01=sc.sql(
      s"""
         |select
......
package com.gmei
import java.io.Serializable
import java.time.LocalDate
import com.gmei.lib.AbstractParams
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
import scopt.OptionParser
import scala.util.parsing.json.JSON
object esmm_feature {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_test",tableName = "device_app_list")
ti.tidbMapTable(dbName = "jerry_test",tableName = "user_feature")
user_feature(sc)
get_applist(sc)
sc.stop()
}}
def get_applist(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val df = spark.sql(
s"""
|select device["device_id"] as device_id,cl_type,params["installed_app_info"]
|from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
|and action = 'user_installed_all_app_info'
""".stripMargin).dropDuplicates("device_id")
df.persist()
val old = spark.sql("select device_id from device_app_list").collect().map(x => x(0).toString)
import spark.implicits._
val android = df.rdd.map(x => (x(0).toString,x(1).toString,x(2).toString))
.filter(x => x._2 == "android").map(x => (x._1,x._2,parse_json(x._3),yesterday))
val ios = df.rdd.map(x => (x(0).toString,x(1).toString,x(2).toString))
.filter(x => x._2 == "ios").map(x => (x._1,x._2,x._3,yesterday))
val rdd = android.union(ios)
val new_user = rdd.filter(x => old.indexOf(x._1)== -1)
.toDF("device_id","os","app_list","update_date")
if (new_user.take(1).nonEmpty){
val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbc, new_user,"device_app_list", SaveMode.Append)
val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(tecent_jdbc, new_user,"device_app_list", SaveMode.Append)
}else{
println("没有新用户需要写入")
}
df.unpersist()
}
def parse_json(str:String): String ={
var t = List[Map[String, Any]]()
val result = JSON.parseFull(str)
result match {
case Some(b: List[Map[String, Any]]) => t = t ++ b
case None => println("Parsing failed")
case other => println("Unknown data structure: " + other)
}
var x = List[String]()
if (t.nonEmpty){
for (i <- t){
x = x:+i("appName").toString
}
}
x.mkString(",")
}
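  // Illustrative example (made-up input) of what parse_json returns:
  //   parse_json("""[{"appName":"WeChat"},{"appName":"Taobao"}]""")  ==> "WeChat,Taobao"
  // and any string that JSON.parseFull cannot parse yields "".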
def user_feature(spark:SparkSession): Unit ={
val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")
println(yesterday)
val sql_exist = "select device_id from user_feature"
val old = spark.sql(sql_exist)
.collect().map(x => x(0).toString)
val sql_yesterday =
s"""
|select device["device_id"] as id,device["device_type"],device["manufacturer"],city_id,channel,
|partition_date from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
""".stripMargin
val rdd = spark.sql(sql_yesterday).repartition(200).na.drop().dropDuplicates("id").rdd
.map(x =>(x(0).toString,x(1).toString,x(2).toString,x(3).toString,
x(4).toString,x(5).toString))
import spark.implicits._
val df_new = rdd.filter(x => old.indexOf(x._1)== -1)
.toDF("device_id","device_type","manufacturer","city_id","channel","date")
if (df_new.take(1).nonEmpty){
df_new.persist()
val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(jdbcuri, df_new, "user_feature", SaveMode.Append)
val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
GmeiConfig.writeToJDBCTable(tecent_jdbc, df_new, "user_feature", SaveMode.Append)
df_new.unpersist()
}else {
println("no need to insert into user feature")
}
}
}
...@@ -499,4 +499,5 @@ object CTR_precise {
  }
}
\ No newline at end of file
...@@ -8,6 +8,8 @@ import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
import java.io._
import scala.util.parsing.json._
object temp_analysis {
...@@ -160,22 +162,6 @@ object temp_analysis {
  }
    //5.登录人数 (count of logged-in devices; this block is removed by this commit)
    val log_device_temp = sc.sql(
      s"""
         |select oe.stat_date,count(distinct(oe.device_id)) as log_num
         |from data_feed_exposure oe left join final_id
         |on oe.device_id = final_id.device_id
         |and oe.stat_date >='2018-11-01'
         |and final_id.device_id is null
         |group by oe.stat_date
         |order by oe.stat_date
      """.stripMargin
    )
    println("登录人数统计:")
    log_device_temp.show(80)
  }
...@@ -427,7 +413,8 @@ object meigou_xiaofei_renshu {
    import sc.implicits._
    // val stat_date = GmeiConfig.getMinusNDate(1)
    val stat_date=param.date
    //println(param.date)
    val partition_date = stat_date.replace("-","")
...@@ -521,7 +508,7 @@ object meigou_xiaofei_renshu {
object alpha_ctr {   // renamed from smart_rank_count
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
...@@ -567,158 +554,108 @@ object smart_rank_count {
    import sc.implicits._
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = param.date
    //println(param.date)
    val partition_date = stat_date.replace("-","")
val agency_id = sc.sql( val click_count_recommend = sc.sql(
s""" s"""
|SELECT DISTINCT(cl_id) as device_id |select '${stat_date}' as stat_date,count(*) as click_count_recommend
|FROM online.ml_hospital_spam_pv_day |from bl.bl_alpha_et_mg_maidianlog_inc_d
|WHERE partition_date >= '20180402' |where params['tab_name']='recommend'
|AND partition_date <= '${partition_date}' |and params['page_name']='home'
|AND pv_ratio >= 0.95 |and type='on_click_feed_topic_card'
|UNION ALL |and partition_day='${partition_date}'
|SELECT DISTINCT(cl_id) as device_id """.stripMargin
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
) )
agency_id.createOrReplaceTempView("agency_id") click_count_recommend.show()
val blacklist_id = sc.sql( val click_count_focus = sc.sql(
s""" s"""
|SELECT device_id |select '${stat_date}' as stat_date,count(*) as click_count_focus
|from blacklist |from bl.bl_alpha_et_mg_maidianlog_inc_d
""".stripMargin |where params['tab_name']='focus'
|and params['page_name']='home'
|and type='on_click_feed_topic_card'
|and partition_day='${partition_date}'
""".stripMargin
) )
blacklist_id.createOrReplaceTempView("blacklist_id") click_count_focus.show()
val final_id = sc.sql(
s"""
|select device_id
|from agency_id
|UNION ALL
|select device_id
|from blacklist_id
""".stripMargin
)
final_id.createOrReplaceTempView("final_id")
def parse_json(str:String): Int ={
var t = List[Map[String, Any]]()
val result = JSON.parseFull(str)
result match {
case Some(b: List[Map[String, Any]]) => t = t ++ b
case None => println("Parsing failed")
case other => println("Unknown data structure: " + other)
}
t.size
val user_city_meigou_view = sc.sql( }
s"""
|select ov.cl_id as device_id,ov.city_id as device_city,ov.params['business_id'] as meigou_id
|from online.tl_hdfs_maidian_view ov left join final_id
|on ov.cl_id = final_id.device_id
|where ov.action = "page_view"
|and ov.params['page_name']="welfare_detail"
|and ov.partition_date >='20181101'
|and ov.partition_date <'20181201'
|and ov.city_id is not null
|and final_id.device_id is null
""".stripMargin
)
user_city_meigou_view.createOrReplaceTempView("user_city_meigou_view")
val meigou_city = sc.sql( val expoure_cards=sc.sql(
s""" s"""
|select b.id as meigou_id,d.city_id as meigou_city |select params['exposure_cards'] as exposure_cards
|from online.tl_meigou_service_view b |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join online.tl_hdfs_doctor_view c on b.doctor_id=c.id |where params['tab_name'] = 'recommend'
|left join online.tl_hdfs_hospital_view d on c.hospital_id=d.id |and params['page_name'] = 'home'
|where b.partition_date='20181228' |and type = 'page_precise_exposure'
|and c.partition_date='20181228' |and partition_day='${partition_date}'
|and d.partition_date='20181228'
""".stripMargin """.stripMargin
) )
meigou_city.createOrReplaceTempView("meigou_city") val a =expoure_cards.rdd.map(row => row(0).toString).map(row=>parse_json(row)).collect().sum
val result1=List((stat_date,a))
val df1 = sc.createDataFrame(result1).toDF("stat_date","expoure_count_recommend")
val meigou_pv_tongcheng = sc.sql( val expoure_cards2=sc.sql(
s""" s"""
|select a.device_id,a.device_city,a.meigou_id,b.meigou_city |select params['exposure_cards'] as exposure_cards
|from user_city_meigou_view a |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join meigou_city b |where params['tab_name'] = 'focus'
|on a.meigou_id = b.meigou_id |and params['page_name'] = 'home'
|and type = 'page_precise_exposure'
|and partition_day='${partition_date}'
""".stripMargin """.stripMargin
) )
meigou_pv_tongcheng.createOrReplaceTempView("meigou_pv_tongcheng") val b =expoure_cards2.rdd.map(row => row(0).toString).map(row=>parse_json(row)).collect().sum
val result2=List((stat_date,b))
val df2 = sc.createDataFrame(result2).toDF("stat_date","expoure_count_focus")
val meigou_pv_count = sc.sql(
s"""
|select '2018-11' as stat_date,meigou_city,count(device_id) as meigou_pv,count(distinct(device_id)) as meigou_device_num
|from meigou_pv_tongcheng
|where device_city = meigou_city
|group by meigou_city
""".stripMargin
)
meigou_pv_count.createOrReplaceTempView("meigou_pv_count")
//开始计算咨询 val result=click_count_recommend.join(click_count_focus,"stat_date")
val zixun_meigou_view = sc.sql( .join(df1,"stat_date")
s""" .join(df2,"stat_date")
|select ov.cl_id as device_id,ov.city_id as device_city,ov.params['service_id'] as meigou_id
|from online.tl_hdfs_maidian_view ov left join final_id
|on ov.cl_id = final_id.device_id
|where ov.partition_date >= '20181101'
|and ov.partition_date < '20181201'
|and ov.action = 'welfare_detail_click_message'
|and final_id.device_id is null
""".stripMargin
)
zixun_meigou_view.createOrReplaceTempView("zixun_meigou_view")
val zixun_meigou_tongcheng = sc.sql(
s"""
|select a.device_id,a.device_city,a.meigou_id,b.meigou_city
|from zixun_meigou_view a
|left join meigou_city b
|on a.meigou_id=b.meigou_id
""".stripMargin
)
zixun_meigou_tongcheng.createOrReplaceTempView("zixun_meigou_tongcheng")
val zixun_pv_count = sc.sql(
s"""
|select '2018-11' as stat_date,meigou_city,count(device_id) as meigou_zixun,count(distinct(device_id)) as meigou_zixun_device_num
|from zixun_meigou_tongcheng
|where device_city=meigou_city
|group by meigou_city
""".stripMargin
)
zixun_pv_count.createOrReplaceTempView("zixun_pv_count")
GmeiConfig.writeToJDBCTable(result, "alpha_ctr", SaveMode.Append)
//开始计算每个地区每月新增设备
val device_new_count = sc.sql( val device_duration_avge = sc.sql(
s""" s"""
|select first_city,count(distinct(device_id)) as new_device_month |SELECT '${stat_date}' as stat_date,sum(a.time_all)/count(a.device_id) as device_duration_avge,count(distinct(a.device_id)) as device_num from (select device_id,sum(params['duration']) as time_all
|from online.ml_device_day_active_status |from bl.bl_alpha_et_mg_maidianlog_inc_d
|where active_type != '4' |where type='on_app_session_over'
|and partition_date >='20181101' |and partition_day='${partition_date}'
|and partition_date <'20181201' |GROUP BY device_id) a
|group by first_city
""".stripMargin """.stripMargin
) )
device_new_count.createOrReplaceTempView("device_new_count") device_duration_avge.show()
val duration_everytime_avge=sc.sql(
//将所有的数据综合一起
val all_count = sc.sql(
s""" s"""
|select mc.stat_date,mc.meigou_city,mc.meigou_pv,mc.meigou_device_num,zc.meigou_zixun,zc.meigou_zixun_device_num,dc.new_device_month |SELECT '${stat_date}' as stat_date,sum(a.time_duration)/count(a.device_id) as duration_everytime_avge from (select device_id,params['duration'] as time_duration
|from meigou_pv_count mc |from bl.bl_alpha_et_mg_maidianlog_inc_d
|left join zixun_pv_count zc on mc.meigou_city = zc.meigou_city |where type='on_app_session_over'
|left join device_new_count dc on dc.first_city=mc.meigou_city |and partition_day='${partition_date}') a
""".stripMargin """.stripMargin
) )
all_count.show()
GmeiConfig.writeToJDBCTable(all_count, "smart_rank_count", SaveMode.Append) val result3=device_duration_avge.join(duration_everytime_avge,"stat_date")
GmeiConfig.writeToJDBCTable(result3, "alpha_duration", SaveMode.Append)
} }
...@@ -733,7 +670,7 @@ object smart_rank_count { ...@@ -733,7 +670,7 @@ object smart_rank_count {
//话题相关问题统计 //话题相关问题统计
object question_count { object copy_database {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN) Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF) Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
...@@ -772,50 +709,32 @@ object question_count { ...@@ -772,50 +709,32 @@ object question_count {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video") ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click") ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist") ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list") ti.tidbMapTable(dbName = "jerry_test", tableName = "tl_hdfs_wiki_item_tag_view")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure") ti.tidbMapTable(dbName = "jerry_test", tableName = "Knowledge_network")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table") ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary")
import sc.implicits._ import sc.implicits._
val stat_date = GmeiConfig.getMinusNDate(1) val stat_date = GmeiConfig.getMinusNDate(1)
//println(param.date) // val stat_date=param.date
val partition_date = stat_date.replace("-","") val partition_date = stat_date.replace("-","")
val agency_id = sc.sql( val new_data = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '20190117'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '20190117'
|AND pv_ratio >= 0.95
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val question_count = sc.sql(
s""" s"""
|SELECT partition_date,count(cl_id) |select d.level2_id,d.level2_name,c.item_id,c.tag_id,c.id,c.name,c.treatment_method,c.price_min,c.price_max,c.treatment_time,c.maintain_time,c.recover_time
|FROM online.tl_hdfs_maidian_view ov left join agency_id |from online.bl_tag_hierarchy_detail d
|on ov.cl_id = agency_id.device_id |inner join
|WHERE ov.partition_date >= '20190101' |(select a.item_id,a.tag_id,b.id,b.name,b.treatment_method,b.price_min,b.price_max,b.treatment_time,b.maintain_time,b.recover_time
|and ov.action='community_home_click_feed_card' |from online.tl_hdfs_wiki_item_tag_view a
|and ov.params["card_type"]="问题" |inner join Knowledge_network b
|and ov.cl_id not in (select device_id from blacklist) |on a.item_id=b.id
|and agency_id.device_id is null |where a.partition_date='${partition_date}') c
|GROUP BY ov.partition_date |on d.id=c.tag_id
|order by ov.partition_date |where d.partition_date='${partition_date}'
""".stripMargin """.stripMargin
) )
question_count.show(30) GmeiConfig.writeToJDBCTable(new_data, "train_Knowledge_network_data", SaveMode.Overwrite)
} }
......
...@@ -218,34 +218,145 @@ object Repeated_content_recommendation { ...@@ -218,34 +218,145 @@ object Repeated_content_recommendation {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table") ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
val stat_date = GmeiConfig.getMinusNDate(1) // val stat_date = GmeiConfig.getMinusNDate(1)
// val stat_date = param.date val stat_date = param.date
val partition_date = stat_date.replace("-","") val partition_date = stat_date.replace("-","")
val exp_diary = sc.sql(
val agency_id = sc.sql(
s""" s"""
|select concat_ws('|',device_id,cid_id) |SELECT DISTINCT(cl_id) as device_id
|from data_feed_exposure |FROM online.ml_hospital_spam_pv_day
|where cid_type = 'diary' |WHERE partition_date >= '20180402'
|and device_id not in (select device_id from blacklist) |AND partition_date <= '${partition_date}'
|and stat_date ='${stat_date}' |AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|select distinct(device_id)
|from blacklist
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val device_id_oldUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join agency_id
|on om.device_id = agency_id.device_id
|where om.active_type = '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and agency_id.device_id is null
""".stripMargin """.stripMargin
) )
exp_diary.show() device_id_oldUser.createOrReplaceTempView("device_id_old")
val get_result =exp_diary.rdd.map((_, 1)).reduceByKey(_ + _) device_id_oldUser.show()
val device_id_newUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join agency_id
|on om.device_id = agency_id.device_id
|where om.active_type != '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and agency_id.device_id is null
""".stripMargin
)
device_id_newUser.createOrReplaceTempView("device_id_new")
device_id_newUser.show()
val exp_diary_new = sc.sql(
s"""
|select concat_ws('|',de.device_id,de.cid_id)
|from data_feed_exposure de inner join device_id_new
|on de.device_id=device_id_new.device_id
|where de.cid_type = 'diary'
|and de.stat_date ='${stat_date}'
""".stripMargin
)
val get_result_new =exp_diary_new.rdd.map((_, 1)).reduceByKey(_ + _)
.sortBy(_._2,false) .sortBy(_._2,false)
val more_than2=get_result.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y) val more_than2_new=get_result_new.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
println(more_than2) println(more_than2_new)
val all =get_result.map(_._2).reduce((x,y)=>x+y) val all_new =get_result_new.map(_._2).reduce((x,y)=>x+y)
println(all) println(all_new)
val repeated_rate= more_than2 / all.toDouble val repeated_rate_new= more_than2_new / all_new.toDouble
println(repeated_rate) println(repeated_rate_new)
val test=List((stat_date,repeated_rate))
val df = sc.createDataFrame(test)
val exp_diary_old = sc.sql(
s"""
|select concat_ws('|',de.device_id,de.cid_id)
|from data_feed_exposure de inner join device_id_old
|on de.device_id=device_id_old.device_id
|where de.cid_type = 'diary'
|and de.stat_date ='${stat_date}'
""".stripMargin
)
val get_result_old =exp_diary_old.rdd.map((_, 1)).reduceByKey(_ + _)
.sortBy(_._2,false)
val more_than2_old=get_result_old.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
println(more_than2_old)
val all_old =get_result_old.map(_._2).reduce((x,y)=>x+y)
println(all_old)
val repeated_rate_old= more_than2_old / all_old.toDouble
println(repeated_rate_old)
val result2=List((stat_date,more_than2_old,all_old,more_than2_new,all_new))
val df2 = sc.createDataFrame(result2).toDF("stat_date","old_rep_count","old_imp_all","new_rep_count","new_imp_all")
GmeiConfig.writeToJDBCTable(df, table = "Repeated_evaluation_indicator", SaveMode.Append) GmeiConfig.writeToJDBCTable(df2, table = "Repeated_evaluation_indicator", SaveMode.Append)
// val exp_diary_old = sc.sql(
// s"""
// |select concat_ws('|',de.device_id,de.cid_id)
// |from data_feed_exposure de inner join device_id_old
// |where de.cid_type = 'diary'
// |and de.stat_date ='${stat_date}'
// """.stripMargin
// )
// val get_result_old =exp_diary_old.rdd.map((_, 1)).reduceByKey(_ + _)
// .sortBy(_._2,false)
//
// val more_than2_old=get_result_old.filter(_._2 >=2).map(_._2).reduce((x,y)=>x+y)
// println(more_than2_old)
// val all_old =get_result_old.map(_._2).reduce((x,y)=>x+y)
// println(all_old)
// val repeated_rate_old= more_than2_old / all_old.toDouble
// println(repeated_rate_old)
//
//
// val result2=List((stat_date,more_than2_old,all_old))
// val df2 = sc.createDataFrame(result2).toDF("stat_date","old_rep_count","old_imp_all")
//
// GmeiConfig.writeToJDBCTable(df2, table = "Repeated_evaluation_indicator_old", SaveMode.Append)
// val temp=get_result.collect() // val temp=get_result.collect()
...@@ -260,8 +371,6 @@ object Repeated_content_recommendation {
  }
object Repeated_content_recommendation_moreday {
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
...@@ -305,14 +414,16 @@ object Repeated_content_recommendation_moreday {
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = "2019-01-16"
    // val partition_date = stat_date.replace("-","")
    val now= new Date()
    // val stat_date=param.date
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
    val date = dateFormat.format(now.getTime - 86400000L * 8)   // look-back window shortened from 15 days to 8
    val yesterday=dateFormat.format(now.getTime- 86400000L)
...@@ -342,6 +453,7 @@ object Repeated_content_recommendation_moreday {
    val repeated_rate= fenmu / fenzi.toDouble
    val result=List((yesterday,repeated_rate))
    println(result)
    val df_result = sc.createDataFrame(result)
    GmeiConfig.writeToJDBCTable(df_result, table = "Repeated_content_recommendation_moreday", SaveMode.Append)
...@@ -351,10 +463,7 @@ object Repeated_content_recommendation_moreday {
    // GmeiConfig.writeToJDBCTable(df, table = "Repeated_evaluation_indicator_moreday", SaveMode.Append)
  }
}
}
...@@ -407,6 +516,7 @@ object GetHiveSearchData {
    val stat_date = GmeiConfig.getMinusNDate(1)
    // val stat_date = param.date
    val partition_date = stat_date.replace("-","")
...@@ -640,3 +750,246 @@ object GetHiveSearchData {
  }
object find_reason {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
// val stat_date = GmeiConfig.getMinusNDate(1)
val stat_date=param.date
val partition_date = stat_date.replace("-","")
// agency (spam) device ids to exclude
val blacklist = sc.sql(
s"""
|select device_id from blacklist
""".stripMargin
)
blacklist.createOrReplaceTempView("blacklist")
val agency_id = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
)
// agency_id.show()
agency_id.createOrReplaceTempView("agency_id")
// new users for the day
val device_id_newUser = sc.sql(
s"""
|select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist
|on os.device_id = blacklist.device_id
|where os.active_type != '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin
)
// device_id_newUser.show()
device_id_newUser.createOrReplaceTempView("device_id_new")
// returning (old) users for the day
val device_id_oldUser = sc.sql(
s"""
|select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist
|on os.device_id=blacklist.device_id
|where os.active_type = '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin
)
// device_id_oldUser.show()
device_id_oldUser.createOrReplaceTempView("device_id_old")
val all_clk = sc.sql(
s"""
|select ov.cl_id as device_id
|from online.tl_hdfs_maidian_view ov left join agency_id
|on ov.cl_id = agency_id.device_id
|where ov.action = 'on_click_diary_card'
|and ov.cl_id != "NULL"
|and ov.params['tab_name'] = '精选'
|and ov.params['page_name'] = 'home'
|and ov.partition_date='${partition_date}'
|and agency_id.device_id is null
""".stripMargin
)
// all_clk.show()
all_clk.createOrReplaceTempView("all_clk_diary_card")
// 1. number of old users who clicked that day
val old_clk_count = sc.sql(
s"""
|select '${stat_date}' as stat_date,count(distinct(oc.device_id)) as old_clk_count
|from all_clk_diary_card oc inner join device_id_old
|on oc.device_id = device_id_old.device_id
""".stripMargin
)
// old_clk_count.show()
// 1.1 old users with clicks
val old_clk_device = sc.sql(
s"""
|select distinct(oc.device_id) as device_id
|from all_clk_diary_card oc inner join device_id_old
|on oc.device_id = device_id_old.device_id
""".stripMargin
)
old_clk_device.createOrReplaceTempView("old_clk_device")
// 1.2 old users without clicks
val old_noclk_device = sc.sql(
s"""
|select device_id
|from device_id_old
|except
|select device_id
|from old_clk_device
""".stripMargin
)
old_noclk_device.show()
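      // EXCEPT leaves the returning (old) users who were active that day but never clicked a diary card on the
      // home 精选 feed.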
// 2. number of new users who clicked that day
// val new_clk_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(oc.device_id)) as new_clk_count
// |from all_clk_diary_card oc inner join device_id_new
// |on oc.device_id = device_id_new.device_id
// """.stripMargin
// )
////2.1 有点击的新用户
// val new_clk_device = sc.sql(
// s"""
// |select distinct(oc.device_id) as device_id
// |from all_clk_diary_card oc inner join device_id_new
// |on oc.device_id = device_id_new.device_id
// """.stripMargin
// )
// new_clk_device.createOrReplaceTempView("new_clk_device")
//
//
// //3.当天老用户数
//
// val old_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(dio.device_id)) as old_count
// |from device_id_old dio left join agency_id
// |on dio.device_id = agency_id.device_id
// |where agency_id.device_id is null
// """.stripMargin
// )
//
// //4.当天新用户数
// val new_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(distinct(din.device_id)) as new_count
// |from device_id_new din left join agency_id
// |on din.device_id = agency_id.device_id
// |where agency_id.device_id is null
// """.stripMargin
// )
//
// //5.有点击老用户的曝光数
// val exp_clkold_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(dp.device_id) as imp_clkold_count
// |from data_feed_exposure_precise dp inner join old_clk_device
// |on dp.device_id = old_clk_device.device_id
// |where stat_date='${stat_date}'
// |group by stat_date
// """.stripMargin
// )
//
// //6.有点击新用户的曝光数
// val exp_clknew_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(dp.device_id) as imp_clknew_count
// |from data_feed_exposure_precise dp inner join new_clk_device
// |on dp.device_id = new_clk_device.device_id
// |where stat_date='${stat_date}'
// |group by stat_date
// """.stripMargin
// )
//
// val result = old_clk_count.join(new_clk_count,"stat_date")
// .join(old_count,"stat_date")
// .join(new_count,"stat_date")
// .join(exp_clkold_count,"stat_date")
// .join(exp_clknew_count,"stat_date")
//
// GmeiConfig.writeToJDBCTable(result, "device_clk_imp_reason", SaveMode.Append)
}
}
}
...@@ -61,6 +61,12 @@ object testt { ...@@ -61,6 +61,12 @@ object testt {
) )
blacklist.createOrReplaceTempView("blacklist") blacklist.createOrReplaceTempView("blacklist")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
// sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
// sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
val agency_id = sc.sql( val agency_id = sc.sql(
s""" s"""
|SELECT DISTINCT(cl_id) as device_id |SELECT DISTINCT(cl_id) as device_id
...@@ -76,22 +82,24 @@ object testt { ...@@ -76,22 +82,24 @@ object testt {
|AND pv_ratio >= 0.95 |AND pv_ratio >= 0.95
""".stripMargin """.stripMargin
) )
agency_id.show() // agency_id.show()
agency_id.createOrReplaceTempView("agency_id") agency_id.createOrReplaceTempView("agency_id")
//每日新用户 //每日新用户
val device_id_newUser = sc.sql( val device_id_newUser = sc.sql(
s""" s"""
|select distinct(device_id) as device_id |select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status |from online.ml_device_day_active_status os left join blacklist
|where active_type != '4' |on os.device_id=blacklist.device_id
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' |where os.active_type != '4'
|and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' | ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' | ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' | ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' | ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' | ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown') | ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}' |and os.partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin """.stripMargin
) )
device_id_newUser.show() device_id_newUser.show()
...@@ -103,19 +111,19 @@ object testt { ...@@ -103,19 +111,19 @@ object testt {
|select distinct(os.device_id) as device_id |select distinct(os.device_id) as device_id
|from online.ml_device_day_active_status os left join blacklist |from online.ml_device_day_active_status os left join blacklist
|on os.device_id=blacklist.device_id |on os.device_id=blacklist.device_id
|where active_type = '4' |where os.active_type = '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' |and os.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' | ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' | ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' | ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' | ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' | ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown') | ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}' |and os.partition_date ='${partition_date}'
|and blacklist.device_id is null |and blacklist.device_id is null
""".stripMargin """.stripMargin
) )
device_id_oldUser.show() // device_id_oldUser.show()
device_id_oldUser.createOrReplaceTempView("device_id_old") device_id_oldUser.createOrReplaceTempView("device_id_old")
...@@ -402,3 +410,366 @@ object testt { ...@@ -402,3 +410,366 @@ object testt {
} }
object diary_clk_card {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
val stat_date = GmeiConfig.getMinusNDate(1)
// val stat_date=param.date
val partition_date = stat_date.replace("-","")
// agency (spam) device ids
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
// sc.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
// sc.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
// sc.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
val blacklist = sc.sql(
s"""
|select device_id from blacklist
""".stripMargin
)
blacklist.createOrReplaceTempView("blacklist")
val agency_id = sc.sql(
s"""
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_day
|WHERE partition_date >= '20180402'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
|UNION ALL
|SELECT DISTINCT(cl_id) as device_id
|FROM online.ml_hospital_spam_pv_month
|WHERE partition_date >= '20171101'
|AND partition_date <= '${partition_date}'
|AND pv_ratio >= 0.95
""".stripMargin
)
agency_id.createOrReplaceTempView("agency_id")
val blacklist_all=sc.sql(
s"""
|SELECT device_id
|FROM blacklist
|UNION ALL
|SELECT device_id
|FROM agency_id
""".stripMargin
)
blacklist_all.createOrReplaceTempView("blacklist_all")
val device_id_oldUser = sc.sql(
s"""
|select distinct(om.device_id) as device_id
|from online.ml_device_day_active_status om left join blacklist_all
|on om.device_id = blacklist_all.device_id
|where om.active_type = '4'
|and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and om.partition_date ='${partition_date}'
|and blacklist_all.device_id is null
""".stripMargin
)
device_id_oldUser.createOrReplaceTempView("device_id_old")
device_id_oldUser.show()
val clk_count_oldUser_Contrast_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_Contrast_a
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
clk_count_oldUser_Contrast_a.show()
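      // Note: '$$' in the s-interpolated SQL escapes to a literal '$', so the filter is regexp '1$', i.e. device
      // ids ending in "1"; presumably the contrast (A/B) bucket, though the bucketing rule is not defined here.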
val clk_count_oldUser_Contrast_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_Contrast_b
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_oldUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_Contrast
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_Contrast_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_Contrast_precise
|from data_feed_exposure_precise je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val clk_count_oldUser_all_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_all_a
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_oldUser_all_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_oldUser_all_b
|from online.tl_hdfs_maidian_view ot inner join device_id_old
|on ot.cl_id = device_id_old.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_oldUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_all
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_all_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_all_precise
|from data_feed_exposure_precise je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
// CTR statistics for new users
val device_id_newUser = sc.sql(
s"""
|select distinct(device_id) as device_id
|from online.ml_device_day_active_status
|where active_type != '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
device_id_newUser.createOrReplaceTempView("device_id_new")
val clk_count_newUser_Contrast_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_Contrast_a
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_newUser_Contrast_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_Contrast_b
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.cl_id regexp'1$$'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_newUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_Contrast
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_newUser_Contrast_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_Contrast_precise
|from data_feed_exposure_precise je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val clk_count_newUser_all_a = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_all_a
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='on_click_diary_card'
|and ot.params['tab_name'] = '精选'
|and ot.params['page_name'] = 'home'
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val clk_count_newUser_all_b = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(ot.cl_id) as clk_count_newUser_all_b
|from online.tl_hdfs_maidian_view ot inner join device_id_new
|on ot.cl_id = device_id_new.device_id
|where ot.action='full_stack_click_video_card_full_screen_play'
|and ot.params['tab_name'] = '精选'
|and ot.params["card_type"]="diary"
|and ot.partition_date ='${partition_date}'
""".stripMargin
)
val imp_count_newUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_all
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_newUser_all_precise = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_all_precise
|from data_feed_exposure_precise je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val result1 = clk_count_oldUser_Contrast_a.join(clk_count_oldUser_Contrast_b,"stat_date")
.join(imp_count_oldUser_Contrast,"stat_date")
.join(clk_count_oldUser_all_a,"stat_date")
.join(clk_count_oldUser_all_b,"stat_date")
.join(imp_count_oldUser_all,"stat_date")
.join(clk_count_newUser_Contrast_a,"stat_date")
.join(clk_count_newUser_Contrast_b,"stat_date")
.join(imp_count_newUser_Contrast,"stat_date")
.join(clk_count_newUser_all_a,"stat_date")
.join(clk_count_newUser_all_b,"stat_date")
.join(imp_count_newUser_all,"stat_date")
.join(imp_count_oldUser_Contrast_precise,"stat_date")
.join(imp_count_oldUser_all_precise,"stat_date")
.join(imp_count_newUser_Contrast_precise,"stat_date")
.join(imp_count_newUser_all_precise,"stat_date")
result1.show()
GmeiConfig.writeToJDBCTable(result1, "on_click_diary_card", SaveMode.Append)
}
}
}
# -*- coding: utf-8 -*-
# import required modules
import pandas as pd
from sqlalchemy import create_engine
data=pd.read_excel('wiki_item.xls')
print(data.head())
# # initialize the database connection (SQLAlchemy create_engine over the MySQLdb driver)
engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s%s") % ("root","3SYz54LS9#^9sBvC",'10.66.157.22', "4000", "jerry_test","?charset=utf8"))
# engine = create_engine('mysql+pymysql://root:147369@localhost:3306/mydb')
data.to_sql('Knowledge_network',con=engine,if_exists='append',index=False)
print("Write to MySQL successfully!")
\ No newline at end of file
# coding: utf-8
import json
import requests
def dingding_robot(data):
    # DingTalk robot webhook; see https://open-doc.dingtalk.com/microapp/serverapi2/qf2nxq for how to obtain the address
webhook = "https://oapi.dingtalk.com/robot/send?access_token=5131b887f6b022150f903e9d690e08c0d481fba844545034aaf48906ee026fa0"
    headers = {'content-type': 'application/json'}  # request headers
r = requests.post(webhook, headers=headers, data=json.dumps(data))
r.encoding = 'utf-8'
return (r.text)
if __name__ == "__main__":
import linecache
str = ""
for i in range(35,64):
s=linecache.getline('/srv/apps/ffm-baseline/eda/recommended_indexs/hypothesis_test.txt', i).strip("\n").split(",")
if s[0] != "":
str += s[0]+"\n"
str +="【同样重要】如下有变更,请提醒相关的人:\n" \
"1.推荐模型变更优化,影响CTR或者CVR(王志伟);\n" \
"2.任何涉及到数据库schame变更(王志伟)"
print(str)
    # request payload; could be moved into a config file
data = {
"msgtype": "text",
"text": {
"content": str,
"title": "自定义机器人"
# "picUrl": "",
# "messageUrl": "https://www.baidu.com/"
},
"at": {
"atMobiles":["17310453926"]
}
}
res = dingding_robot(data)
    print(res)  # print the response
\ No newline at end of file
#! -*- coding: utf8 -*-
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import levene
import datetime
from utils import con_sql
from decimal import *
import numpy as np
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
######### hypothesis tests (t-test) on metrics before vs. after the recommendation-strategy change ###############
# get yesterday's date automatically
def get_yesterday_date():
    # yesterday's date as a string, e.g. "2018-08-08"
"""
:rtype : str
"""
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
yesterday = yesterday.strftime("%Y-%m-%d")
return yesterday
yesterday=get_yesterday_date()
print("监测数据日期:{}".format(yesterday))
# get the date 10 days ago automatically
def get_somedate():
    # the date 10 days ago as a string, e.g. "2018-07-28"
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=10)
someday = someday.strftime("%Y-%m-%d")
return someday
ten_days=get_somedate()
print("===========分割线,T检验最近10日指标与策略前10日指标是否获得显著提升============")
#获取最近10天的数据
def DATA_recently(x,y,z,q,t):
ten_days = get_somedate()
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='{4}' ".format(x,y,z,q,t)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
#
# # ratios for a fixed 10-day window (pre-strategy baseline)
def DATA_fixed(x,y,z,q):
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='2018-11-17' and stat_date<='2018-11-26' group by stat_date".format(x,y,z,q)
CVR_DATA_fixed = con_sql(sql_cid)
return CVR_DATA_fixed
def DATA_recently_all(x,y,z,q,m,t):
ten_days = get_somedate()
sql_cid = "select ({0}+{1})/{2} as {3} from {4} \
where stat_date >='{5}' ".format(x,y,z,q,m,t)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
#
# # ratios for a fixed 10-day window (pre-strategy baseline)
def DATA_fixed_all(x,y,z,q,m):
sql_cid = "select ({0}+{1})/{2} as {3} from {4} \
where stat_date >='2018-11-17' and stat_date<='2018-11-26' group by stat_date".format(x,y,z,q,m)
CVR_DATA_fixed = con_sql(sql_cid)
return CVR_DATA_fixed
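# note: every block below repeats the same rounding idiom on the Decimal ratios coming back
# from MySQL; it is equivalent to this small helper (a hypothetical name, shown only for
# illustration and not called anywhere below):
def _round4(value):
    # quantize the Decimal to 4 decimal places, then convert to a plain float
    return float(str(Decimal(value).quantize(Decimal('0.0000'))))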
#
# # new-user CVR
x_crv_new_temp=DATA_recently("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv",ten_days)
x_crv_new=[float(str(Decimal(x_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_new_temp))]
y_crv_new_temp=DATA_fixed("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv")
y_crv_new=[float(str(Decimal(y_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_new_temp))]
# # old-user CVR
x_crv_old_temp=DATA_recently("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv",ten_days)
x_crv_old=[float(str(Decimal(x_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_old_temp))]
y_crv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv")
y_crv_old=[float(str(Decimal(y_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_old_temp))]
#
# # new-user CT-CVR
x_ctcrv_new_temp=DATA_recently("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv",ten_days)
x_ctcrv_new=[float(str(Decimal(x_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_new_temp))]
y_ctcrv_new_temp=DATA_fixed("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv")
y_ctcrv_new=[float(str(Decimal(y_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_new_temp))]
#
# # old-user CT-CVR
x_ctcrv_old_temp=DATA_recently("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv",ten_days)
x_ctcrv_old =[float(str(Decimal(x_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_old_temp))]
y_ctcrv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv")
y_ctcrv_old=[float(str(Decimal(y_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_old_temp))]
#
# # new-user CTR (page_view)
x_ctr_new_temp=DATA_recently("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser",ten_days)
x_ctr_new=[float(str(Decimal(x_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_temp))]
y_ctr_new_temp=DATA_fixed("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser")
y_ctr_new=[float(str(Decimal(y_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_temp))]
# #
# # old-user CTR (page_view)
x_ctr_old_temp=DATA_recently("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp",ten_days)
x_ctr_old=[float(str(Decimal(x_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_temp))]
y_ctr_old_temp=DATA_fixed("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp")
y_ctr_old=[float(str(Decimal(y_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_temp))]
#
# # new-user CTR (on_click_diary_card)
x_ctr_new_o_temp=DATA_recently_all("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","ctr_new","on_click_diary_card",ten_days)
x_ctr_new_o=[float(str(Decimal(x_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_o_temp))]
y_ctr_new_o_temp=DATA_fixed_all("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","ctr_new","on_click_diary_card")
y_ctr_new_o=[float(str(Decimal(y_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_o_temp))]
#
# # old-user CTR (on_click_diary_card)
x_ctr_old_o_temp=DATA_recently_all("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","ctr_old","on_click_diary_card",ten_days)
x_ctr_old_o=[float(str(Decimal(x_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_o_temp))]
y_ctr_old_o_temp=DATA_fixed_all("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","ctr_old","on_click_diary_card")
y_ctr_old_o=[float(str(Decimal(y_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_o_temp))]
# #
#
#
def t_test(x,y): # two-sample t-test
    # compares two 10-day samples of a metric, one from before and one from after the strategy change
    # first check homogeneity of variance with Levene's test
    a=levene(x,y)
    p_value=a[1] # if p_value>0.05 the two samples are treated as having equal variances, otherwise as unequal
    # Student's t-test when variances are equal, Welch's t-test otherwise
    t_result=ttest_ind(x,y,equal_var=p_value>0.05)
    t_p_value=t_result[1]
    # print(t_p_value)
    if t_p_value>0.05:
        print("At 95% confidence the two samples show [no significant difference], i.e. the metric did not change significantly, p_value:{}".format(t_p_value))
        print("\n")
    else:
        print("At 95% confidence the two samples show [a significant difference], i.e. the metric changed significantly, p_value:{}".format(t_p_value))
        print("\n")
#
# ### hypothesis tests: check whether each metric changed significantly
#
# new-user CVR t-test
print("[1] New-user CVR t-test result:")
crv_new_ttest=t_test(x_crv_new,y_crv_new)
# old-user CVR t-test
print("[2] Old-user CVR t-test result:")
crv_old_ttest=t_test(x_crv_old,y_crv_old)
#
# new-user CT-CVR t-test
print("[3] New-user CT-CVR t-test result:")
ctcrv_new_ttest=t_test(x_ctcrv_new,y_ctcrv_new)
# # old-user CT-CVR t-test
print("[4] Old-user CT-CVR t-test result:")
ctcrv_old_ttest=t_test(x_ctcrv_old,y_ctcrv_old)
#
#
# new-user CTR t-test (page_view)
print("[5] New-user CTR t-test result:")
ctr_new_ttest=t_test(x_ctr_new,y_ctr_new)
# old-user CTR t-test (page_view)
print("[6] Old-user CTR t-test result:")
ctr_old_ttest=t_test(x_ctr_old,y_ctr_old)
# new-user CTR (on_click_diary_card) t-test
print("[7] New-user CTR t-test (diary-list CTR, on_click_diary_card) result:")
ctr_new_o_ttest=t_test(x_ctr_new_o,y_ctr_new_o)
# old-user CTR (on_click_diary_card) t-test
print("[8] Old-user CTR t-test (diary-list CTR, on_click_diary_card) result:")
ctr_old_o_ttest=t_test(x_ctr_old_o,y_ctr_old_o)
#
# ############### hypothesis tests on daily fluctuations while the recommendation strategy is unchanged ##############
print("=========== Divider: chi-square test of whether yesterday's metrics changed significantly vs. the previous 5-day averages ============")
# #1 daily chi-square test for each metric
#
# # get the date 5 days ago automatically
def get_fivedate():
    # the date 5 days ago as a string
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=5)
someday = someday.strftime("%Y-%m-%d")
return someday
five_days=get_fivedate()
# averages over the previous 5 days; this helper only fits the on_click_diary_card table (see the data-generation code for the reason)
def chi_DATA_recently(x,y,z,q,t1,t2):
sql_cid = "select AVG({0}+{1}),AVG({2}) from {3} \
where stat_date >= '{4}' and stat_date < '{5}' ".format(x,y,z,q,t1,t2)
CVR_DATA_recently = con_sql(sql_cid)[0]
return CVR_DATA_recently
def chi_DATA_yesterday(x,y,z,q,t1):
sql_cid = "select {0}+{1},{2} from {3} where stat_date='{4}' ".format(x,y,z,q,t1)
CVR_DATA_yesterday = con_sql(sql_cid)[0]
return CVR_DATA_yesterday
# averages over the previous 5 days
def chi_DATA_recently_all(x,y,z,t1,t2):
sql_cid = "select AVG({0}),AVG({1}) from {2} \
where stat_date >= '{3}' and stat_date < '{4}' ".format(x,y,z,t1,t2)
CVR_DATA_recently = con_sql(sql_cid)[0]
return CVR_DATA_recently
def chi_DATA_yesterday_all(x,y,z,t1):
sql_cid = "select {0},{1} from {2} where stat_date='{3}' ".format(x,y,z,t1)
CVR_DATA_yesterday = con_sql(sql_cid)[0]
return CVR_DATA_yesterday
# arrange the [clicks, exposures] pairs into a 2x2 contingency table
def data_cal(x,y):
x_a = [x[0], x[1] - x[0]]
y_a=[y[0], y[1] - y[0]]
a_df=pd.DataFrame({'原':x_a,'测':y_a})
return a_df
def chi_cal(data):
data['共计'] = data.apply(lambda x: x.sum(), axis=1)
data.loc['共计'] = data.apply(lambda x: x.sum())
t1=data.iloc[0]
t2=data.iloc[1]
t11_count=t1[0]
t12_count=t1[1]
t21_count=t2[0]
t22_count=t2[1]
    ### expected (theoretical) counts under independence
temp1=data['共计']
rate1=temp1[0]/temp1[2]
rate2=temp1[1]/temp1[2]
temp2=data.iloc[2]
t11_theory=temp2[0]*rate1
t12_theory=temp2[1]*rate1
t21_theory = temp2[0]*rate2
t22_theory = temp2[1]*rate2
    # chi-square statistic
    X=(((t11_count-t11_theory)**2)/t11_theory)+(((t12_count-t12_theory)**2)/t12_theory)+(((t21_count-t21_theory)**2)/t21_theory)+(((t22_count-t22_theory)**2)/t22_theory)
    print("chi-square value: {}".format(X))
    # degrees of freedom of the original 2x2 table (the totals row and column added above are excluded); equals 1 and is not used below
    v=(len(data)-2)*(data.columns.size-2)
    # the critical value from the chi-square table at df=1, alpha=0.05 is 3.84
    if X>3.84:
        print("The fluctuation is large and outside the normal range; with 95% confidence this is a [significant change], please investigate")
        print("\n")
    else:
        print("The fluctuation is small; with 95% confidence it is within the [normal] range")
        print("\n")
# old-user clicks vs. precise exposures (home featured-tab diary list, on_click_diary_card)
print("[1] (precise exposure) old-user CTR fluctuation test for the home featured diary list:")
chi_ctr_precise_old_recently=chi_DATA_recently("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",five_days,yesterday)
temp1_old=[float(str(Decimal(chi_ctr_precise_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctr_precise_old_recently))]
chi_ctr_precise_old_yesterday=chi_DATA_yesterday("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",yesterday)
temp2_old=[float(chi_ctr_precise_old_yesterday[i]) for i in range(len(chi_ctr_precise_old_yesterday))]
ctr_tst_old=data_cal(temp1_old,temp2_old)
chi_cal(ctr_tst_old)
# new-user clicks vs. precise exposures (home featured-tab diary list, on_click_diary_card)
print("[2] (precise exposure) new-user CTR fluctuation test for the home featured diary list:")
chi_ctr_precise_new_recently=chi_DATA_recently("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",five_days,yesterday)
temp1_new=[float(str(Decimal(chi_ctr_precise_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctr_precise_new_recently))]
chi_ctr_precise_new_yesterday=chi_DATA_yesterday("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",yesterday)
temp2_new=[float(chi_ctr_precise_new_yesterday[i]) for i in range(len(chi_ctr_precise_new_yesterday))]
ctr_tst_new=data_cal(temp1_new,temp2_new)
chi_cal(ctr_tst_new)
# old-user service-purchase (美购) conversion data
print("[3] Old-user CVR fluctuation test result:")
chi_cvr_old_recently=chi_DATA_recently_all("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",five_days,yesterday)
cvr_old=[float(str(Decimal(chi_cvr_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_cvr_old_recently))]
chi_cvr_old_yesterday=chi_DATA_yesterday_all("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",yesterday)
cvr_old2=[float(chi_cvr_old_yesterday[i]) for i in range(len(chi_cvr_old_yesterday))]
cvr_tst_old=data_cal(cvr_old,cvr_old2)
chi_cal(cvr_tst_old)
# new-user service-purchase (美购) conversion data
print("[4] New-user CVR fluctuation test result:")
chi_cvr_new_recently=chi_DATA_recently_all("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",five_days,yesterday)
cvr_new=[float(str(Decimal(chi_cvr_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_cvr_new_recently))]
chi_cvr_new_yesterday=chi_DATA_yesterday_all("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",yesterday)
cvr_new2=[float(chi_cvr_new_yesterday[i]) for i in range(len(chi_cvr_new_yesterday))]
cvr_tst_new=data_cal(cvr_new,cvr_new2)
chi_cal(cvr_tst_new)
# old-user CT-CVR data
print("[5] Old-user CT-CVR fluctuation test result:")
chi_ctcvr_old_recently=chi_DATA_recently_all("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",five_days,yesterday)
ctcvr_old=[float(str(Decimal(chi_ctcvr_old_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctcvr_old_recently))]
chi_ctcvr_old_yesterday=chi_DATA_yesterday_all("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",yesterday)
ctcvr_old2=[float(chi_ctcvr_old_yesterday[i]) for i in range(len(chi_ctcvr_old_yesterday))]
ctcvr_tst_old=data_cal(ctcvr_old,ctcvr_old2)
chi_cal(ctcvr_tst_old)
# new-user CT-CVR data
print("[6] New-user CT-CVR fluctuation test result:")
chi_ctcvr_new_recently=chi_DATA_recently_all("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",five_days,yesterday)
ctcvr_new=[float(str(Decimal(chi_ctcvr_new_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctcvr_new_recently))]
chi_ctcvr_new_yesterday=chi_DATA_yesterday_all("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",yesterday)
ctcvr_new2=[float(chi_ctcvr_new_yesterday[i]) for i in range(len(chi_ctcvr_new_yesterday))]
ctcvr_tst_new=data_cal(ctcvr_new,ctcvr_new2)
chi_cal(ctcvr_tst_new)
# ############### variance and mean of recent day-to-day fluctuations ##############
print("=============== Divider: checking the 5-day variance and mean of each metric ==================")
def get_var_data1(x,y,z,t1):
sql_cid = "select {0}/{1} from {2} \
where stat_date >= '{3}' ".format(x,y,z,t1)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
def get_var_data2(x,y,z,q,t1):
sql_cid = "select ({0}+{1})/{2} from {3} \
where stat_date >= '{4}' ".format(x,y,z,q,t1)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
def collect_data(data):
tt = [float(data[i][0])*100 for i in range(len(data))]
return tt
var_ctcvr_old_data=get_var_data1("diary_meigou_oldUser","diary_exp_oldUser","diary_meigou_crv",five_days)
var_ctcvr_old_D=collect_data(var_ctcvr_old_data)
var_ctcvr_old=np.var(var_ctcvr_old_D)
mean_var_ctcvr_old=np.mean(var_ctcvr_old_D)
print("【1-1】老用户CT-CVR数据波动5日内方差检验结果:{}".format(var_ctcvr_old))
print("【1-2】老用户CT-CVR数据波动5日内均值:{}%".format(mean_var_ctcvr_old))
print("\n")
var_ctcvr_new_data=get_var_data1("diary_meigou_newUser","diary_exp_newUser","diary_meigou_crv",five_days)
var_ctcvr_new_D=collect_data(var_ctcvr_new_data)
var_ctcvr_new=np.var(var_ctcvr_new_D)
mean_var_ctcvr_new=np.mean(var_ctcvr_new_D)
print("【2-1】新用户CT-CVR数据波动5日内方差检验结果:{}".format(var_ctcvr_new))
print("【2-2】新用户CT-CVR数据波动5日内均值:{}%".format(mean_var_ctcvr_new))
print("\n")
var_cvr_old_data=get_var_data1("diary_meigou_oldUser","diary_clk_oldUser","diary_meigou_crv",five_days)
var_cvr_old_D=collect_data(var_cvr_old_data)
var_cvr_old=np.var(var_cvr_old_D)
mean_var_cvr_old=np.mean(var_cvr_old_D)
print("【3-1】老用户CVR数据波动5日内方差检验结果:{}".format(var_cvr_old))
print("【3-2】老用户CVR数据波动5日内均值:{}%".format(mean_var_cvr_old))
print("\n")
#
var_cvr_new_data=get_var_data1("diary_meigou_newUser","diary_clk_newUser","diary_meigou_crv",five_days)
var_cvr_new_D=collect_data(var_cvr_new_data)
var_cvr_new=np.var(var_cvr_new_D)
mean_var_cvr_new=np.mean(var_cvr_new_D)
print("【4-1】新用户CVR数据波动5日内方差检验结果:{}".format(var_cvr_new))
print("【4-2】新用户CVR数据波动5日内均值:{}%".format(mean_var_cvr_new))
print("\n")
var_ctr_old_data=get_var_data2("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all","on_click_diary_card",five_days)
var_ctr_old_D=collect_data(var_ctr_old_data)
var_ctr_old=np.var(var_ctr_old_D)
mean_var_ctr_old=np.mean(var_ctr_old_D)
print("[5-1] Old-user CTR 5-day variance: {}".format(var_ctr_old))
print("[5-2] Old-user CTR 5-day mean: {}%".format(mean_var_ctr_old))
print("\n")
var_ctr_new_data=get_var_data2("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all","on_click_diary_card",five_days)
var_ctr_new_D=collect_data(var_ctr_new_data)
var_ctr_new=np.var(var_ctr_new_D)
mean_var_ctr_new=np.mean(var_ctr_new_D)
print("【6-1】新用户CTR数据波动5日内方差检验结果:{}".format(var_ctr_new))
print("【6-2】新用户CTR数据波动5日内均值:{}%".format(mean_var_ctr_new))
print("\n")
var_ctr_new_precise_data=get_var_data2("clk_count_newUser_all_a","clk_count_newUser_all_b","imp_count_newUser_all_precise","on_click_diary_card",five_days)
var_ctr_new_precise_D=collect_data(var_ctr_new_precise_data)
var_ctr_new_precise=np.var(var_ctr_new_precise_D)
mean_var_ctr_new_precise=np.mean(var_ctr_new_precise_D)
print("【7-1】新用户精准曝光CTR数据波动5日内方差检验结果:{}".format(var_ctr_new_precise))
print("【7-2】新用户精准曝光CTR数据波动5日内均值:{}%".format(mean_var_ctr_new_precise))
print("\n")
var_ctr_old_precise_data=get_var_data2("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",five_days)
var_ctr_old_precise_D=collect_data(var_ctr_old_precise_data)
var_ctr_old_precise=np.var(var_ctr_old_precise_D)
mean_var_ctr_old_precise=np.mean(var_ctr_old_precise_D)
print("【8-1】老用户精准曝光CTR数据波动5日内方差检验结果:{}".format(var_ctr_old_precise))
print("【8-2】老用户精准曝光CTR数据波动5日内均值:{}%".format(mean_var_ctr_old_precise))
print("\n")
# print("============================分割线===================================")
#根据新老用户进行区分
# print("============================新用户各指标假设检验结果分析===================================")
# #新用户cvr假设检验
# print("【1】新用户CVR假设检验结果:")
# crv_new_ttest1=t_test(x_crv_new,y_crv_new)
# #新用户ct_cvr假设检验
# print("【3】新用户CT-CVR假设检验结果:")
# ctcrv_new_ttest1=t_test(x_ctcrv_new,y_ctcrv_new)
# #新用户ctr假设检验
# print("【5】新用户CTR假设检验结果:")
# ctr_new_ttest1=t_test(x_ctr_new,y_ctr_new)
# #新用户ctr(on_click_diary_card)假设检验
# print("【7】新用户CTR假设检验(日记本列表ctr)(on_click_diary_card)结果:")
# ctr_new_o_ttest1=t_test(x_ctr_new_o,y_ctr_new_o)
#
#
#
#
#
# print("============================老用户各指标假设检验结果分析===================================")
# #老用户cvr假设检验
# print("【2】老用户CVR假设检验结果:")
# crv_old_ttest1=t_test(x_crv_old,y_crv_old)
# # #老用户ct_cvr假设检验
# print("【4】老用户CT-CVR假设检验结果:")
# ctcrv_old_ttest1=t_test(x_ctcrv_old,y_ctcrv_old)
# #老用户ctr假设检验
# print("【6】老用户CTR假设检验结果:")
# ctr_old_ttest1=t_test(x_ctr_old,y_ctr_old)
# #老用户ctr(on_click_diary_card)假设检验
# print("【8】老用户CTR假设检验(日记本列表ctr)(on_click_diary_card)结果:")
# ctr_old_o_ttest1=t_test(x_ctr_old_o,y_ctr_old_o)
##发送邮件
# my_sender='gaoyazhe@igengmei.com'
# my_pass = 'VCrKTui99a7ALhiK'
# my_user1='wangzhiwei@igengmei.com'
# def mail():
# ret = True
# try:
# text = "Hi!\nHow are you?\nHere is the link you wanted:\nhttp://www.baidu.com"
# msg = MIMEText(text, 'plain', 'utf-8')
# msg['From'] = formataddr(["王志伟", my_sender])
# msg['To'] = my_user1
# msg['Subject'] = str(datetime.date.today()) + "-esmm多目标模型训练指标统计"
# server = smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
# server.login(my_sender, my_pass)
# server.sendmail(my_sender, [my_user1], msg.as_string())
# server.quit()
# except Exception:
# ret=False
# return ret
#
# ret=mail()
# if ret:
# print("邮件发送成功")
# else:
# print("邮件发送失败")
# chi_cvr_new=
# chi_cvr_old=
#
# chi_ctcvr_new=
# chi_ctcvr_old=
#
#
#
# def chi_cal(data):
# ##发送邮件
#
# #coding=utf-8
#
# import smtplib
# from email.mime.text import MIMEText
# from email.utils import formataddr
# from email.mime.application import MIMEApplication
# import datetime
#
# from email.mime.multipart import MIMEMultipart
#
# my_sender='wangzhiwei@igengmei.com'
# my_pass = 'RiKEcsHAgesCZ7yd'
# my_user1='wangzhiwei@igengmei.com'
# my_user2='gaoyazhe@igengmei.com'
# my_user3='huangkai@igengmei.com'
# def mail():
# ret = True
# pdfFile = 'hypothesis.txt'
# pdfApart = MIMEApplication(open(pdfFile, 'rb').read())
# pdfApart.add_header('Content-Disposition', 'attachment', filename=pdfFile)
# m = MIMEMultipart()
# m.attach(pdfApart)
# m['Subject'] = '数据指标监控数据(假设检验)'
# m['From'] = '王志伟<wangzhiwei@igengmei.com>'
#
#
# try:
# # text = "Hi!\nHow are you?\nHere is the link you wanted:\nhttp://www.baidu.com"
# # msg = MIMEText(text, 'plain', 'utf-8')
# # msg['From'] = formataddr(["王志伟", my_sender])
# # msg['To'] = my_user1
# # msg['Subject'] = str(datetime.date.today()) + "-esmm多目标模型训练指标统计"
# server = smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
# server.login(my_sender, my_pass)
# server.sendmail(my_sender, [my_user1,my_user2,my_user3], m.as_string())
# server.quit()
# except Exception:
# ret=False
# return ret
#
# ret=mail()
# if ret:
# print("邮件发送成功")
# else:
# print("邮件发送失败")
##### send the report email (plain text, no attachment)
#coding=utf-8
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr
import datetime
my_sender='wangzhiwei@igengmei.com'
my_pass = 'RiKEcsHAgesCZ7yd'
my_user1='wangzhiwei@igengmei.com'
# my_user2='zhangyanzhao@igengmei.com'
# my_user3='zhaochen@igengmei.com'
# my_user4='huangkai@igengmei.com'
# my_user5='lixiaofang@igengmei.com'
# my_user6='duanyingrong@igengmei.com'
# my_user7='liuxiao@igengmei.com'
# my_user8='gaoyazhe@igengmei.com'
def mail():
ret=True
try:
with open('hypothesis.txt') as f:
stat_data = f.read()
msg=MIMEText(stat_data,'plain','utf-8')
msg['From']=formataddr(["王志伟",my_sender])
msg['To']=my_user1
msg['Subject']= str(datetime.date.today())+"-数据指标监控数据(假设检验)"
server=smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
server.login(my_sender, my_pass)
server.sendmail(my_sender,[my_user1],msg.as_string())
server.quit()
except Exception:
ret=False
return ret
ret=mail()
if ret:
    print("email sent successfully")
else:
    print("failed to send email")
\ No newline at end of file
import pymysql
import pandas as pd
from multiprocessing import Pool
import numpy as np
import datetime
import time
def con_sql(db, sql):
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchone()[0]
return result
# def test(days):
# start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
# print(start)
# sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
# "from train_data where stat_date = '{}' and z = 1)".format(start,start)
# db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# exp = con_sql(db, sql)
# print(exp)
# sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
# "from train_data where stat_date = '{}' and z = 1)".format(start,start)
# click = con_sql(db, sql)
# return start,exp,click
if __name__ == "__main__":
# temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
# DIRECTORY_PATH = "/home/gmuser/"
# output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
# for i in range(1,41):
# a,b,c = test(i)
# with open(output_path, 'a+') as f:
# line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
# f.write(line)
#! /bin/bash #! /bin/bash
git checkout master
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/tensnsorflow/es MODEL_PATH=/srv/apps/ffm-baseline/tensnsorflow/es
DATA_PATH=/home/gmuser/esmm_data DATA_PATH=/data/esmm
echo "rm leave tfrecord" echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/* rm ${DATA_PATH}/tr/*
rm ${DATA_PATH}/va/* rm ${DATA_PATH}/va/*
rm ${DATA_PATH}/native/* rm ${DATA_PATH}/native/*
rm ${DATA_PATH}/nearby/* rm ${DATA_PATH}/nearby/*
rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/201* rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/20*
echo "data" echo "data"
${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/feature.py > ${DATA_PATH}/feature.log
echo "csv to tfrecord" echo "csv to tfrecord"
${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/ ${PYTHON_PATH} ${MODEL_PATH}/to_tfrecord.py --input_dir=${DATA_PATH}/tr/ --output_dir=${DATA_PATH}/tr/
...@@ -32,15 +32,15 @@ rm ${DATA_PATH}/nearby/nearby_* ...@@ -32,15 +32,15 @@ rm ${DATA_PATH}/nearby/nearby_*
echo "train..." echo "train..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=2 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "infer native..." echo "infer native..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/native_infer.log
echo "infer nearby..." echo "infer nearby..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.5 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=15 --feature_size=300000 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/nearby_infer.log
echo "sort and 2sql" echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/to_database.py ${PYTHON_PATH} ${MODEL_PATH}/to_database.py > ${DATA_PATH}/insert_database.log
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
from sqlalchemy import create_engine from sqlalchemy import create_engine
import pandas as pd import pandas as pd
import pymysql import pymysql
import MySQLdb
import time import time
def con_sql(sql): def con_sql(sql):
...@@ -37,10 +36,10 @@ def native_set_join(lst): ...@@ -37,10 +36,10 @@ def native_set_join(lst):
def main(): def main():
# native queue # native queue
df2 = pd.read_csv('/home/gmuser/esmm_data/native.csv') df2 = pd.read_csv('/data/esmm/native.csv')
df2['cid_id'] = df2['cid_id'].astype(str) df2['cid_id'] = df2['cid_id'].astype(str)
df1 = pd.read_csv("/home/gmuser/esmm_data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"]) df1 = pd.read_csv("/data/esmm/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"] df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False) df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
df3.columns = ["device_id","city_id","native_queue"] df3.columns = ["device_id","city_id","native_queue"]
...@@ -48,10 +47,10 @@ def main(): ...@@ -48,10 +47,10 @@ def main():
# nearby queue # nearby queue
df2 = pd.read_csv('/home/gmuser/esmm_data/nearby.csv') df2 = pd.read_csv('/data/esmm/nearby.csv')
df2['cid_id'] = df2['cid_id'].astype(str) df2['cid_id'] = df2['cid_id'].astype(str)
df1 = pd.read_csv("/home/gmuser/esmm_data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"]) df1 = pd.read_csv("/data/esmm/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"] df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False) df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
df4.columns = ["device_id","city_id","nearby_queue"] df4.columns = ["device_id","city_id","nearby_queue"]
...@@ -65,8 +64,6 @@ def main(): ...@@ -65,8 +64,6 @@ def main():
df_all["time"] = ctime df_all["time"] = ctime
print("union_device_count",df_all.shape) print("union_device_count",df_all.shape)
host='10.66.157.22' host='10.66.157.22'
port=4000 port=4000
user='root' user='root'
...@@ -75,21 +72,21 @@ def main(): ...@@ -75,21 +72,21 @@ def main():
charset='utf8' charset='utf8'
engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db)) engine = create_engine(str(r"mysql+mysqldb://%s:" + '%s' + "@%s:%s/%s") % (user, password, host, port, db))
df_merge = df_all['device_id'] + df_all['city_id']
df_merge_str = (str(list(df_merge.values))).strip('[]')
try: try:
# df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1) # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
df_merge = df_all['device_id'] + df_all['city_id']
df_merge_str = (str(list(df_merge.values))).strip('[]')
delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str) delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
cur = con.cursor() cur = con.cursor()
cur.execute(delete_str) cur.execute(delete_str)
con.commit() con.commit()
df_all.to_sql('esmm_device_diary_queue',con=engine,if_exists='append',index=False) df_all.to_sql('esmm_device_diary_queue',con=engine,if_exists='append',index=False,chunksize=8000)
except Exception as e: except Exception as e:
print(e) print(e)
print("done")
if __name__ == '__main__': if __name__ == '__main__':
main() main()
\ No newline at end of file
...@@ -28,16 +28,23 @@ def gen_tfrecords(in_file): ...@@ -28,16 +28,23 @@ def gen_tfrecords(in_file):
df = pd.read_csv(in_file) df = pd.read_csv(in_file)
for i in range(df.shape[0]): for i in range(df.shape[0]):
feats = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", feats = ["ucity_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2"] "channel", "top", "time", "stat_date","hospital_id",
"method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
id = np.array([]) id = np.array([])
for j in feats: for j in feats:
id = np.append(id,df[j][i]) id = np.append(id,df[j][i])
app_list = np.array(str(df["app_list"][i]).split(","))
level2_list = np.array(str(df["clevel2_id"][i]).split(","))
level3_list = np.array(str(df["level3_ids"][i]).split(","))
features = tf.train.Features(feature={ features = tf.train.Features(feature={
"y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])), "y": tf.train.Feature(float_list=tf.train.FloatList(value=[df["y"][i]])),
"z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])), "z": tf.train.Feature(float_list=tf.train.FloatList(value=[df["z"][i]])),
"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))) "ids": tf.train.Feature(int64_list=tf.train.Int64List(value=id.astype(np.int))),
}) "app_list":tf.train.Feature(int64_list=tf.train.Int64List(value=app_list.astype(np.int))),
"level2_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level2_list.astype(np.int))),
"level3_list": tf.train.Feature(int64_list=tf.train.Int64List(value=level3_list.astype(np.int)))
})
example = tf.train.Example(features = features) example = tf.train.Example(features = features)
serialized = example.SerializeToString() serialized = example.SerializeToString()
......
...@@ -53,7 +53,10 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False): ...@@ -53,7 +53,10 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
features = { features = {
"y": tf.FixedLenFeature([], tf.float32), "y": tf.FixedLenFeature([], tf.float32),
"z": tf.FixedLenFeature([], tf.float32), "z": tf.FixedLenFeature([], tf.float32),
"ids": tf.FixedLenFeature([11], tf.int64) "ids": tf.FixedLenFeature([FLAGS.field_size], tf.int64),
"app_list": tf.VarLenFeature(tf.int64),
"level2_list": tf.VarLenFeature(tf.int64),
"level3_list": tf.VarLenFeature(tf.int64)
} }
parsed = tf.parse_single_example(record, features) parsed = tf.parse_single_example(record, features)
...@@ -99,6 +102,8 @@ def model_fn(features, labels, mode, params): ...@@ -99,6 +102,8 @@ def model_fn(features, labels, mode, params):
Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer()) Feat_Emb = tf.get_variable(name='embeddings', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer())
feat_ids = features['ids'] feat_ids = features['ids']
app_list = features['app_list']
level2_list = features['level2_list']
if FLAGS.task_type != "infer": if FLAGS.task_type != "infer":
y = labels['y'] y = labels['y']
...@@ -107,8 +112,12 @@ def model_fn(features, labels, mode, params): ...@@ -107,8 +112,12 @@ def model_fn(features, labels, mode, params):
#------build f(x)------ #------build f(x)------
with tf.variable_scope("Shared-Embedding-layer"): with tf.variable_scope("Shared-Embedding-layer"):
embedding_id = tf.nn.embedding_lookup(Feat_Emb,feat_ids) embedding_id = tf.nn.embedding_lookup(Feat_Emb,feat_ids)
app_id = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=app_list, sp_weights=None, combiner="sum")
level2 = tf.nn.embedding_lookup_sparse(Feat_Emb, sp_ids=level2_list, sp_weights=None, combiner="sum")
x_concat = tf.reshape(embedding_id,shape=[-1, common_dims]) # None * (F * K)
# x_concat = tf.reshape(embedding_id,shape=[-1, common_dims]) # None * (F * K)
x_concat = tf.concat([tf.reshape(embedding_id,shape=[-1,common_dims]),app_id,level2], axis=1)
with tf.name_scope("CVR_Task"): with tf.name_scope("CVR_Task"):
if mode == tf.estimator.ModeKeys.TRAIN: if mode == tf.estimator.ModeKeys.TRAIN:
......
...@@ -9,18 +9,7 @@ import time ...@@ -9,18 +9,7 @@ import time
from sqlalchemy import create_engine from sqlalchemy import create_engine
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
except Exception:
print("发生异常", Exception)
df = pd.DataFrame()
finally:
db.close()
return df
# def test(): # def test():
# sql = "select max(update_time) from ffm_diary_queue" # sql = "select max(update_time) from ffm_diary_queue"
...@@ -285,6 +274,35 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel): ...@@ -285,6 +274,35 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
# print("nearby_pre shape") # print("nearby_pre shape")
# print(nearby_pre.shape) # print(nearby_pre.shape)
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
except Exception:
print("发生异常", Exception)
df = pd.DataFrame()
finally:
db.close()
return df
def test(days):
start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
print(start)
sql = "select (select count(*) from train_data where stat_date = '{}' and y = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)".format(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
exp = con_sql(db, sql)[0].values.tolist()[0]
sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)".format(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
click = con_sql(db, sql)[0].values.tolist()[0]
return start,exp,click
if __name__ == "__main__": if __name__ == "__main__":
......
import datetime
from pyspark.sql import HiveContext
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
# from py4j.java_gateway import java_import
# import pytispark.pytispark as pti
import pandas as pd
import pymysql
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred", e)
df = pd.DataFrame()
finally:
db.close()
return df
# def test():
conf = SparkConf().setAppName("My App").set("spark.io.compression.codec", "lzf")
sc = SparkContext(conf = conf)
hive_context = HiveContext(sc)
hive_context.sql(''' select device["device_type"] from online.tl_hdfs_maidian_view
where partition_date = '20181012' and action = "page_view"
and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 ''').show(6)
# def esmm_pre():
# yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
# print(yesterday)
#
# spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# # gw = SparkContext._gateway
# #
# # # Import TiExtensions
# # java_import(gw.jvm, "org.apache.spark.sql.TiContext")
#
# # Inject TiExtensions, and get a TiContext
# # ti = gw.jvm.TiExtensions.getInstance(spark._jsparkSession).getOrCreateTiContext(spark._jsparkSession)
# ti = pti.TiContext(spark)
#
# ti.tidbMapDatabase("jerry_test")
#
# # sql("use tpch_test")
# spark.sql("select count(*) from esmm_pre_data").show(6)
#
# # conf = SparkConf().setAppName("esmm_pre").set("spark.io.compression.codec", "lzf")
#
# spark.sql("""
# select concat(tmp1.device_id,",",tmp1.city_id) as device_city, tmp1.merge_queue from (select device_id,if(city_id='world','worldwide',city_id) city_id,similarity_cid as merge_queue from nd_device_cid_similarity_matrix
# union select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from ffm_diary_queue
# union select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1 where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date='{}'
# """.format(yesterday)).show(6)
if __name__ == '__main__':