Merge branch 'master' of http://git.wanmeizhensuo.com/ML/ffm-baseline

e916ae5c · 王志伟 · 9cde0cf4 · eecd0fdd · e916ae5c · e916ae5c
Commit e916ae5c authored Jun 03, 2019 by 王志伟
4 changed files
--- a/eda/esmm/Model_pipline/feature_engineering.py
+++ b/eda/esmm/Model_pipline/feature_engineering.py
@@ -75,6 +75,7 @@ def con_sql(db,sql):
    db.close()
    return df

+
 def feature_engineer():
    apps_number, app_list_map, level2_number, leve2_map, level3_number, leve3_map = get_map()
    unique_values = []

--- a/eda/esmm/Model_pipline/submit.sh
+++ b/eda/esmm/Model_pipline/submit.sh
@@ -8,7 +8,7 @@ export CLASSPATH="/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/api
 echo $CLASSPATH
 export LD_LIBRARY_PATH="/usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server:/opt/hadoop/lib/native"
 echo $LD_LIBRARY_PATH
-export PATH=$PATH:/usr/local/hadoop/bin/
+

 echo "rm model file"
 rm -r ${LOCAL_PATH}/model_ckpt/DeepCvrMTL/20*
@@ -29,4 +29,6 @@ echo "sort and 2sql"
 ${PYTHON_PATH} ${MODEL_PATH}/to_database.py > "/home/gmuser/esmm/log/insert_$b.log"

 echo "delete files"
-
+rm /home/gmuser/esmm/*.csv
+rm /home/gmuser/esmm/native/*
+rm /home/gmuser/esmm/nearby/*
\ No newline at end of file
--- a/eda/esmm/Model_pipline/to_database.py
+++ b/eda/esmm/Model_pipline/to_database.py
@@ -19,12 +19,7 @@ def con_sql(sql):
    return result


-def nearby_set_join(lst):
-    # return ','.join([str(i) for i in list(lst)])
-    return ','.join([str(i) for i in lst.unique().tolist()])
-
-
-def native_set_join(lst):
+def set_join(lst):
    l = lst.unique().tolist()
    r = [str(i) for i in l]
    r =r[:500]
@@ -39,7 +34,8 @@ def main():

    df1 = pd.read_csv(path+"/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
-    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
+    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False))\
+        .reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
    df3.columns = ["device_id","city_id","native_queue"]
    print("native_device_count",df3.shape)

@@ -50,7 +46,8 @@ def main():

    df1 = pd.read_csv(path+"/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
-    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
+    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False))\
+        .reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
    df4.columns = ["device_id","city_id","nearby_queue"]
    print("nearby_device_count",df4.shape)


--- a/eda/esmm/Model_pipline/train.py
+++ b/eda/esmm/Model_pipline/train.py