native_queue 取top50%日记队列

914b42d9 · 张彦钊 · 1500979f · 914b42d9 · 1500979f
Commit 914b42d9 authored Jan 28, 2019 by 张彦钊
Show whitespace changes
Inline Side-by-side

Showing with 13 additions and 97 deletions

sort_and_2sql.py eda/esmm/Model_pipline/sort_and_2sql.py +13 -3

test.py tensnsorflow/test.py +0 -94

No files found.
--- a/eda/esmm/Model_pipline/sort_and_2sql.py
+++ b/eda/esmm/Model_pipline/sort_and_2sql.py
@@ -18,10 +18,20 @@ def con_sql(sql):
    db.close()
    return result

-def set_join(lst):
+def nearby_set_join(lst):
    # return ','.join([str(i) for i in list(lst)])
    return ','.join([str(i) for i in lst.unique().tolist()])

+
+def native_set_join(lst):
+    l = lst.unique().tolist()
+    d = int(len(l)/2)
+    if d == 0:
+        d = 1
+    r = [str(i) for i in l]
+    r =r[:d]
+    return ','.join(r)
+
 def main():

    # native queue
@@ -30,7 +40,7 @@ def main():

    df1 = pd.read_csv("/home/gmuser/esmm_data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
-    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
+    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
    df3.columns = ["device_id","city_id","native_queue"]
    print("native_device_count",df3.shape)

@@ -41,7 +51,7 @@ def main():

    df1 = pd.read_csv("/home/gmuser/esmm_data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
-    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
+    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
    df4.columns = ["device_id","city_id","nearby_queue"]
    print("nearby_device_count",df4.shape)


--- a/tensnsorflow/test.py
+++ b/tensnsorflow/test.py
-import pandas as pd
-import pymysql
-
-
-def con_sql(db,sql):
-    cursor = db.cursor()
-    try:
-        cursor.execute(sql)
-        result = cursor.fetchall()
-        df = pd.DataFrame(list(result))
-    except Exception:
-        print("发生异常", Exception)
-        df = pd.DataFrame()
-    finally:
-        db.close()
-    return df
-
-
-
-def exp():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
-    cursor = db.cursor()
-    cursor.execute(sql)
-    result = cursor.fetchone()[0]
-    native = tuple(result.split(","))
-    print("total")
-    print(len(native))
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
-    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
-    df = con_sql(db,sql)
-
-    n = df.shape[0]
-    one = df[1].unique()
-    one_map = {}
-    for i in one:
-        one_map[i] = df.loc[df[1]==i].shape[0]/n
-    print(sorted(one_map.items(),key = lambda x:x[1]))
-    two = df[2].unique()
-    two_map = {}
-    print("分界线")
-    for i in two:
-        two_map[i] = df.loc[df[2] == i].shape[0] / n
-    print(sorted(two_map.items(), key=lambda x: x[1]))
-
-
-def click():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
-    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
-          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
-          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
-    df = con_sql(db, sql)
-
-    n = df.shape[0]
-    print(n)
-    one = df[1].unique()
-    one_map = {}
-    for i in one:
-        one_map[i] = df.loc[df[1] == i].shape[0] / n
-    print(sorted(one_map.items(), key=lambda x: x[1],reverse=True))
-    two = df[2].unique()
-    two_map = {}
-    print("分界线")
-    for i in two:
-        two_map[i] = df.loc[df[2] == i].shape[0] / n
-    print(sorted(two_map.items(), key=lambda x: x[1],reverse=True))
-
-def get_cid():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    sql = "select cid_id from esmm_train_data where device_id = '358035085192742' " \
-          "and stat_date >= '2018-12-03'"
-    df = con_sql(db, sql)[0].values.tolist()
-    print(",".join(df))
-
-
-
-if __name__ == "__main__":
-
-    import pandas as pd
-    from sklearn.preprocessing import MinMaxScaler
-
-
-    # 读取葡萄酒数据集
-    data = pd.read_csv("G:/dataset/wine.csv")
-    # 获取第二列Alcohol
-    x = data["Alcohol"]
-    # 获取数据的基本情况
-    print(x.describe())
-    minMax = MinMaxScaler()
-    # 将数据进行归一化
-    x_std = minMax.fit_transform(x)
-    pd.DataFrame()
-
-