ESMM train data: first level1_id (also moves the ESMM data paths from /home/gaoyazhe/data to /home/gmuser/esmm_data)

7c856a21 · 高雅喆 · a0d42a6e
Commit 7c856a21 authored Jan 03, 2019 by 高雅喆
5 changed files
--- a/eda/esmm/Feature_pipline/data2ffm.py
+++ b/eda/esmm/Feature_pipline/data2ffm.py
@@ -264,7 +264,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):


 if __name__ == "__main__":
-    path = "/home/gaoyazhe/data/"
+    path = "/home/gmuser/esmm_data/"
    a = time.time()
    df, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
    model = transform(df, validate_date)

--- a/eda/esmm/Model_pipline/send_mail.py
+++ b/eda/esmm/Model_pipline/send_mail.py
@@ -11,7 +11,7 @@ my_user='gaoyazhe@igengmei.com'
 def mail():
    ret=True
    try:
-        with open('/home/gaoyazhe/data/submit.log') as f:
+        with open('/home/gmuser/esmm_data/submit.log') as f:
            stat_data = f.read()
            msg=MIMEText(stat_data,'plain','utf-8')
            msg['From']=formataddr(["高雅喆",my_sender])

--- a/eda/esmm/Model_pipline/sort_and_2sql.py
+++ b/eda/esmm/Model_pipline/sort_and_2sql.py
@@ -25,10 +25,10 @@ def set_join(lst):
 def main():

    # native queue
-    df2 = pd.read_csv('/home/gaoyazhe/data/native.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
+    df2 = pd.read_csv('/home/gmuser/esmm_data/native.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
    df2['cid_id'] = df2['cid_id'].astype(str)

-    df1 = pd.read_csv("/home/gaoyazhe/data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
+    df1 = pd.read_csv("/home/gmuser/esmm_data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
    df3.columns = ["device_id","city_id","native_queue"]
@@ -36,10 +36,10 @@ def main():


    # nearby queue
-    df2 = pd.read_csv('/home/gaoyazhe/data/nearby.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
+    df2 = pd.read_csv('/home/gmuser/esmm_data/nearby.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
    df2['cid_id'] = df2['cid_id'].astype(str)

-    df1 = pd.read_csv("/home/gaoyazhe/data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
+    df1 = pd.read_csv("/home/gmuser/esmm_data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
    df4.columns = ["device_id","city_id","nearby_queue"]

--- a/eda/esmm/Model_pipline/submit.sh
+++ b/eda/esmm/Model_pipline/submit.sh
 #! /bin/bash
 PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
 MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm
-DATA_PATH=/home/gaoyazhe/data
+DATA_PATH=/home/gmuser/esmm_data

 echo "start time"
 current=$(date "+%Y-%m-%d %H:%M:%S")

--- a/eda/feededa/src/main/scala/com/gmei/EsmmData.scala
+++ b/eda/feededa/src/main/scala/com/gmei/EsmmData.scala
@@ -209,7 +209,7 @@ object EsmmData {
             |and d.partition_date='${stat_date_not}'
         """.stripMargin
        )
-        //      union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
+        union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
        union_data_scity_id.show()

        val union_data_scity_id2 = sc.sql(