Commit 0506bc69 authored by 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

Bayes Error Rate
parents e4a70ddf f6badd3c
@@ -425,6 +425,15 @@ object meigou_xiaofei_renshu {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
sc.sparkContext.addJar("hdfs:///user/hive/share/lib/udf/daybits-1.0.0-SNAPSHOT.jar")
sc.sparkContext.addJar("hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
sc.sql("CREATE TEMPORARY FUNCTION dayBitsGetW1 AS 'com.gmei.data.daybits.udf.DayBitsGetsW1'")
sc.sql("SELECT order_id FROM mining.ml_order_spam_recognize WHERE partition_date='20181212' AND self_support=0 AND dayBitsGetW1(predict_result,'20181212')=0").show()
import sc.implicits._
val stat_date = GmeiConfig.getMinusNDate(1)
//println(param.date)
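
The hunk above registers the daybits Hive UDF from the shared jars and smoke-tests it by pulling orders for one day that are neither self-support nor flagged as spam. A rough PySpark equivalent, offered only as a sketch: the SparkSession setup is illustrative (in this repo the session comes from GmeiConfig), while the jar paths, UDF class name, and query are taken from the diff.

    from pyspark.sql import SparkSession

    # Illustrative session; the repo builds its session via GmeiConfig instead.
    spark = (SparkSession.builder
             .appName("daybits-udf-smoke-test")
             .config("spark.jars",
                     "hdfs:///user/hive/share/lib/udf/daybits-1.0.0-SNAPSHOT.jar,"
                     "hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
             .enableHiveSupport()
             .getOrCreate())

    # Register the Hive UDF shipped in the daybits jar (class name from the diff).
    spark.sql("CREATE TEMPORARY FUNCTION dayBitsGetW1 AS 'com.gmei.data.daybits.udf.DayBitsGetsW1'")

    # Smoke test: orders on 20181212 that are not self-support and not flagged for that day.
    spark.sql("""
        SELECT order_id
        FROM mining.ml_order_spam_recognize
        WHERE partition_date = '20181212'
          AND self_support = 0
          AND dayBitsGetW1(predict_result, '20181212') = 0
    """).show()
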
@@ -467,23 +476,48 @@ object meigou_xiaofei_renshu {
final_id.createOrReplaceTempView("final_id")
// val meigou_price = sc.sql(
// s"""
// |select md.user_id,sum(md.gengmei_price) as pay_all
// |from online.ml_meigou_order_detail md left join final_id
// |on md.device_id = final_id.device_id
// |where md.status= 2
// |and final_id.device_id is null
// |and md.partition_date = '20181223'
// |and md.pay_time is not null
// |and md.validate_time>'2017-01-01 00:00:00.0'
// |group by md.user_id
// |order by sum(md.gengmei_price)
// """.stripMargin
// )
// meigou_price.show(80)
val meigou_price = sc.sql(
s"""
|select md.user_id,sum(md.gengmei_price) as pay_all
|from online.ml_meigou_order_detail md left join final_id
|on md.device_id = final_id.device_id
|from online.ml_meigou_order_detail md
|left join
|(
| SELECT
| order_id
| FROM mining.ml_order_spam_recognize
| WHERE partition_date='20181223' AND
| self_support=0 AND dayBitsGetW1(predict_result,'20181223')=0
|)spam
|on md.order_id = spam.order_id
|where md.status= 2
|and final_id.device_id is null
|and spam.order_id is null
|and md.partition_date = '20181223'
|and md.pay_time is not null
|and md.validate_time>'2018-01-01 00:00:00.0'
|and md.validate_time>'2017-01-01 00:00:00.0'
|group by md.user_id
|order by sum(md.gengmei_price)
""".stripMargin
)
meigou_price.show(80)
// meigou_price.show(80)
GmeiConfig.writeToJDBCTable(meigou_price, "meigou_price", SaveMode.Append)
// GmeiConfig.writeToJDBCTable(meigou_price, "meigou_price", SaveMode.Overwrite)
}
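
The rewritten meigou_price query excludes suspected spam orders with a left anti-join: it left-joins the day's mining.ml_order_spam_recognize rows (self_support=0 and dayBitsGetW1(predict_result,'20181223')=0) on order_id and keeps only rows where spam.order_id is null before summing gengmei_price per user. A minimal pandas sketch of the same exclusion pattern, with hypothetical toy frames standing in for the Hive tables:

    import pandas as pd

    # Hypothetical stand-ins for online.ml_meigou_order_detail and the spam subquery.
    orders = pd.DataFrame({
        "order_id": [1, 2, 3, 4],
        "user_id": [10, 10, 11, 12],
        "gengmei_price": [100.0, 250.0, 80.0, 40.0],
    })
    spam_orders = pd.DataFrame({"order_id": [2, 4]})  # orders flagged as spam

    # Left join plus "no match" filter = anti-join, mirroring
    # "left join (...) spam on md.order_id = spam.order_id ... and spam.order_id is null".
    merged = orders.merge(spam_orders, on="order_id", how="left", indicator=True)
    clean = merged[merged["_merge"] == "left_only"].drop(columns="_merge")

    # Aggregate per user, like "group by md.user_id ... order by sum(md.gengmei_price)".
    pay_all = clean.groupby("user_id")["gengmei_price"].sum().rename("pay_all").sort_values()
    print(pay_all)
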
@@ -65,7 +65,6 @@ def sort_app():
"child": {"小伴龙", "儿歌多多", "宝宝巴士奇妙屋", "智慧树", "贝瓦儿歌", "儿歌点点", "宝贝听听", "宝宝小厨房", "宝宝游乐园", "叽里呱啦"},
"homework": {"作业帮", "小猿搜题", "一起作业学生端", "学霸君", "互动作业", "猿题库", "纳米盒", "阿凡题", "洋葱数学"},
"work": {"钉钉", "企业微信", "移动彩云", "云之家", "今目标", "口袋助理", "推事本", "奇鱼微办公", "工作圈", "明道"},
"home": {"最美装修", "齐家网", "土巴兔装修", "装修头条", "装修管家", "窝牛装修", "丽芙家居", "酷家乐装修", "惠装装修", "房天下装修"},
"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
}
df["app_list"] = df["app_list"].apply(json_format)
@@ -98,7 +98,13 @@ class multiFFMFormatPandas:
result_map = {}
for i in data_list:
print("before:total")
print(len(result_map))
print(len(i.get()))
result_map.update(i.get())
print("result_map")
print(len(result_map))
pool.close()
pool.join()
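
The added prints around result_map.update(i.get()) compare each worker's result size with the merged total, so a merge that grows by less than a chunk's size reveals overlapping keys. A self-contained sketch of that pool-and-merge pattern, with a hypothetical encode_chunk worker standing in for the class's real encoder:

    from multiprocessing import Pool

    def encode_chunk(chunk):
        # Hypothetical worker: returns a partial index -> encoded-row mapping.
        return {idx: "encoded:" + val for idx, val in chunk}

    if __name__ == "__main__":
        chunks = [[(0, "a"), (1, "b")], [(2, "c"), (3, "d")]]
        pool = Pool(processes=2)
        data_list = [pool.apply_async(encode_chunk, (c,)) for c in chunks]

        result_map = {}
        for i in data_list:
            partial = i.get()
            print("before merge, total:", len(result_map), "chunk:", len(partial))
            result_map.update(partial)  # duplicate keys would be overwritten silently
            print("after merge, total:", len(result_map))

        pool.close()
        pool.join()
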
@@ -199,8 +205,8 @@ def transform(a,validate_date):
test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
train.to_csv(path + "tr.csv", sep="\t", index=False)
test.to_csv(path + "va.csv", sep="\t", index=False)
# train.to_csv(path + "tr.csv", sep="\t", index=False)
# test.to_csv(path + "va.csv", sep="\t", index=False)
return model
@@ -218,14 +224,23 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
print("before filter:")
print(df.shape)
print(df.loc[df["device_id"]=="358035085192742"].shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
@@ -236,9 +251,12 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
[df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
df = df.drop(["z","label","device_id","cid_id"], axis=1).fillna(0.0)
print(df.head(2))
df = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(df)
print("before transform")
print(df.shape)
temp_series = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(temp_series)
print("after transform")
print(df.shape)
df["label"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
@@ -251,14 +269,21 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
df = df.drop([0, "seq"], axis=1)
print(df.head())
print(df.loc[df["device_id"] == "358035085192742"].shape)
native_pre = df[df["label"] == "0"]
native_pre = native_pre.drop("label", axis=1)
print("native")
print(native_pre.shape)
print(native_pre.loc[native_pre["device_id"] == "358035085192742"].shape)
native_pre.to_csv(path+"native.csv",sep="\t",index=False)
# print("native_pre shape")
# print(native_pre.shape)
nearby_pre = df[df["label"] == "1"]
nearby_pre = nearby_pre.drop("label", axis=1)
print("nearby")
print(nearby_pre.shape)
print(nearby_pre.loc[nearby_pre["device_id"] == "358035085192742"].shape)
nearby_pre.to_csv(path + "nearby.csv", sep="\t", index=False)
# print("nearby_pre shape")
# print(nearby_pre.shape)
@@ -268,8 +293,8 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
if __name__ == "__main__":
path = "/home/gmuser/ffm/"
a = time.time()
df, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(df, validate_date)
temp, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel)
b = time.time()
print("cost(分钟)")