取top一半的日记 (Take the top half of the diaries)

6013308d · 张彦钊 · f52ebb1e · 6013308d · 6013308d
Commit 6013308d authored Jan 28, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 4 deletions

feature_engineering.py tensnsorflow/feature_engineering.py +1 -1

sort_to_sql.py tensnsorflow/sort_to_sql.py +15 -3

No files found.
--- a/tensnsorflow/feature_engineering.py
+++ b/tensnsorflow/feature_engineering.py
@@ -109,7 +109,7 @@ def get_predict(date,value_map):
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
-          "left join cid_time_cut cut on e.cid_id = cut.cid where e.device_id = '358035085192742'"
+          "left join cid_time_cut cut on e.cid_id = cut.cid"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",

--- a/tensnsorflow/sort_to_sql.py
+++ b/tensnsorflow/sort_to_sql.py
@@ -18,10 +18,22 @@ def con_sql(sql):
    db.close()
    return result

-def set_join(lst):
+
+def nearby_set_join(lst):
    # return ','.join([str(i) for i in list(lst)])
    return ','.join([str(i) for i in lst.unique().tolist()])

+
+def native_set_join(lst):
+    l = lst.unique().tolist()
+    d = int(len(l)/2)
+    if d == 0:
+        d = 1
+    r = [str(i) for i in l]
+    r =r[:d]
+    return ','.join(r)
+
+
 def main():

    # native queue
@@ -30,7 +42,7 @@ def main():

    df1 = pd.read_csv("/home/gmuser/esmm_data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
-    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
+    df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':native_set_join}).reset_index(drop=False)
    df3.columns = ["device_id","city_id","native_queue"]
    print("native_device_count",df3.shape)

@@ -41,7 +53,7 @@ def main():

    df1 = pd.read_csv("/home/gmuser/esmm_data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
    df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
-    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
+    df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':nearby_set_join}).reset_index(drop=False)
    df4.columns = ["device_id","city_id","nearby_queue"]
    print("nearby_device_count",df4.shape)