Commit de8f872a authored by 王志伟
parents f27d8217 971a0d0e
......@@ -114,10 +114,10 @@ class multiFFMFormatPandas:
x = 0
while True:
if x + step < data.__len__():
data_list.append(data.loc[x:x + step])
x = x + step + 1
data_list.append(data.iloc[x:x + step])
x = x + step
else:
data_list.append(data.loc[x:data.__len__()])
data_list.append(data.iloc[x:data.__len__()])
break
return data_list
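
# --- standalone sketch (not part of the commit) of the chunking change above ---
# pandas .loc slicing is label-based and end-inclusive, so data.loc[x:x + step] yields
# step + 1 rows and the old code had to advance x by step + 1; .iloc is position-based
# and end-exclusive, so data.iloc[x:x + step] yields exactly step rows and also works
# when the index is not a clean 0..N-1 RangeIndex. A minimal version:
import pandas as pd

def split_data(data, step):
    # cut a DataFrame into consecutive, non-overlapping chunks of `step` rows
    data_list = []
    x = 0
    while x < len(data):
        data_list.append(data.iloc[x:x + step])
        x += step
    return data_list

chunks = split_data(pd.DataFrame({"a": range(10)}), 3)
print([len(c) for c in chunks])  # [3, 3, 3, 1]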
......@@ -147,7 +147,7 @@ def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
"where e.stat_date >= '{}'".format(start)
df = con_sql(db, sql)
......@@ -208,7 +208,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
......
......@@ -19,6 +19,11 @@ rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/2018*
echo "data2ffm"
${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
# rough duplicate-feature check on column 5 of tr.csv, echoed below as a "Bayes Error Rate"
all_sample=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$2$3$4}' | sort | uniq | wc -l`))
uniq_feat=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$4}' | sort | uniq -u | wc -l`))
repe_feat=$((all_sample-uniq_feat))
echo "Bayes Error Rate: $((repe_feat*100/all_sample))%"
echo "split data"
split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
split -l $((`wc -l < ${DATA_PATH}/va.csv`/5)) ${DATA_PATH}/va.csv -d -a 4 ${DATA_PATH}/va/va_ --additional-suffix=.csv
......
......@@ -68,12 +68,20 @@ def sort_app():
"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
}
df["app_list"] = df["app_list"].apply(json_format)
n = df.shape[0]
df["sum"] = 0
for i in category.keys():
df[i] = df["app_list"].apply(lambda x: 1 if len(x & category[i]) > 0 else 0)
print(i)
print(df[i].value_counts())
df["sum"] = df["sum"]+df[i]
# print(i)
# print(df.loc[df[i]==1].shape[0]/n)
df = df.drop("app_list",axis=1)
# for i in df["sum"].unique():
# print(i)
# a = df.loc[df["sum"] == i].shape[0]/n
# print(a)
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
print(df.shape)
n = 200000
......
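
# --- standalone sketch (not from the commit) of the set-intersection flags built in sort_app() above ---
# each category column is 1 when the user's installed-app set overlaps that category's app set;
# "微信" below is just a placeholder app name invented for the example.
import pandas as pd

category = {"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道"}}
df = pd.DataFrame({"app_list": [{"拉勾", "微信"}, {"微信"}]})

for name, apps in category.items():
    # flag is 1 when the intersection of the two sets is non-empty
    df[name] = df["app_list"].apply(lambda x, apps=apps: int(len(x & apps) > 0))

print(df["job"].tolist())  # [1, 0]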
import pandas as pd
import pymysql
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select manufacturer,channel from user_feature"
    df = con_sql(db, sql)
    n = df.shape[0]
    manufacturer = df[0].unique()
    manufacturer_map = {}
    print("manufacturer unique")
    print(len(manufacturer))
    for i in manufacturer:
        manufacturer_map[i] = df.loc[df[0] == i].shape[0] / n
    print(sorted(manufacturer_map.items(), key=lambda x: x[1]))
    channel = df[1].unique()
    channel_map = {}
    print("channel unique")
    print(len(channel))
    for i in channel:
        channel_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(channel_map.items(), key=lambda x: x[1]))
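
# --- standalone sketch (not from the commit): the frequency loops in exp() above can be
# written with value_counts(normalize=True); shown for the same positional columns
# (0 = manufacturer, 1 = channel) that con_sql() returns. Relies on the pandas import
# at the top of this file; the commented usage values are invented.
def exp_value_counts(df):
    print(df[0].value_counts(normalize=True).sort_values())  # manufacturer share, ascending
    print(df[1].value_counts(normalize=True).sort_values())  # channel share, ascending

# exp_value_counts(pd.DataFrame({0: ["xiaomi", "huawei", "xiaomi"], 1: ["AppStore", "store_a", "AppStore"]}))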
def clean():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "device_id", 1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"})
    n = df.shape[0]
    manufacturer = df["manufacturer"].unique()
    for i in manufacturer:
        if df.loc[df["manufacturer"] == i].shape[0] / n < 0.0005:
            df.loc[df["manufacturer"] == i, ["manufacturer"]] = "other"
    channel = df["channel"].unique()
    for i in channel:
        if df.loc[df["channel"] == i].shape[0] / n < 0.0001:
            df.loc[df["channel"] == i, ["channel"]] = "other"
    from sqlalchemy import create_engine
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    n = 200000
    for i in range(0, df.shape[0], n):
        print(i)
        if i == 0:
            temp = df.loc[0:n]
        elif i + n > df.shape[0]:
            temp = df.loc[i + 1:]
        else:
            temp = df.loc[i + 1:i + n]
        pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False)
    print("insert done")

if __name__ == "__main__":
    clean()
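
# --- standalone sketch (not from the commit) of the chunked insert used in clean() above ---
# df.loc[i + 1:i + n] is end-inclusive and only matches row positions while the frame keeps
# its default RangeIndex; an equivalent, index-independent version uses end-exclusive iloc.
# The engine URL and table name in the usage comment are placeholders.
def insert_in_chunks(df, table, engine, chunk=200000):
    # write `chunk` rows per statement so a large frame is not pushed in one go
    for start in range(0, df.shape[0], chunk):
        df.iloc[start:start + chunk].to_sql(table, engine, if_exists="append", index=False)

# usage:
# engine = create_engine("mysql+pymysql://user:password@host:4000/some_db?charset=utf8")
# insert_in_chunks(df, "user_feature_clean", engine)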
......@@ -99,6 +99,7 @@ class multiFFMFormatPandas:
        result_map = {}
        for i in data_list:
            result_map.update(i.get())
        pool.close()
        pool.join()
......@@ -114,10 +115,10 @@ class multiFFMFormatPandas:
x = 0
while True:
if x + step < data.__len__():
data_list.append(data.loc[x:x + step])
x = x + step + 1
data_list.append(data.iloc[x:x + step])
x = x + step
else:
data_list.append(data.loc[x:data.__len__()])
data_list.append(data.iloc[x:data.__len__()])
break
return data_list
......@@ -147,7 +148,7 @@ def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
"where e.stat_date >= '{}'".format(start)
df = con_sql(db, sql)
......@@ -174,6 +175,7 @@ def get_data():
manufacturer = list(set(df["manufacturer"].values.tolist()))
channel = list(set(df["channel"].values.tolist()))
return df,validate_date,ucity_id,ccity_name,manufacturer,channel
......@@ -197,8 +199,8 @@ def transform(a,validate_date):
train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
print("train shape")
print(train.shape)
train.to_csv(path + "tr.csv", sep="\t", index=False)
test.to_csv(path + "va.csv", sep="\t", index=False)
......@@ -209,7 +211,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
......@@ -218,23 +220,23 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
print("before filter:")
print(df.shape)
print(df.loc[df["device_id"]=="358035085192742"].shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
......@@ -245,9 +247,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
[df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
df = df.drop(["z","label","device_id","cid_id"], axis=1).fillna(0.0)
print(df.head(2))
df = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(df)
print("before transform")
print(df.shape)
temp_series = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(temp_series)
print("after transform")
print(df.shape)
df["label"] = df[0].apply(lambda x: x.split(",")[0])
......@@ -286,8 +289,8 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
if __name__ == "__main__":
path = "/home/gmuser/ffm/"
a = time.time()
df, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(df, validate_date)
temp, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel)
b = time.time()
print("cost(分钟)")
......
import time
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
conf = SparkConf().setMaster("spark://10.30.181.88:7077").setAppName("My app")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
# print a counter every 5 seconds; the Spark application stays registered with the master while this runs
for i in range(1, 100):
    print(i)
    time.sleep(5)
\ No newline at end of file
import pandas as pd
import pymysql
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db, sql)
    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1]))
    two = df[2].unique()
    two_map = {}
    print("dividing line")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))
def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)
    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1], reverse=True))
    two = df[2].unique()
    two_map = {}
    print("dividing line")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1], reverse=True))

if __name__ == "__main__":
    click()