Commit 0506bc69 authored by 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

Bayes Error Rate
parents e4a70ddf f6badd3c
@@ -425,6 +425,15 @@ object meigou_xiaofei_renshu {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
sc.sparkContext.addJar("hdfs:///user/hive/share/lib/udf/daybits-1.0.0-SNAPSHOT.jar")
sc.sparkContext.addJar("hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
sc.sql("CREATE TEMPORARY FUNCTION dayBitsGetW1 AS 'com.gmei.data.daybits.udf.DayBitsGetsW1'")
sc.sql("SELECT order_id FROM mining.ml_order_spam_recognize WHERE partition_date='20181212' AND self_support=0 AND dayBitsGetW1(predict_result,'20181212')=0").show()
import sc.implicits._
val stat_date = GmeiConfig.getMinusNDate(1)
//println(param.date)
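
The hunk above registers the daybits Hive UDF from the shared jars and smoke-tests it by pulling orders for one day that are neither self-support nor flagged as spam. A rough PySpark equivalent, offered only as a sketch: the SparkSession setup is illustrative (in this repo the session comes from GmeiConfig), while the jar paths, UDF class name, and query are taken from the diff.

    from pyspark.sql import SparkSession

    # Illustrative session; the repo builds its session via GmeiConfig instead.
    spark = (SparkSession.builder
             .appName("daybits-udf-smoke-test")
             .config("spark.jars",
                     "hdfs:///user/hive/share/lib/udf/daybits-1.0.0-SNAPSHOT.jar,"
                     "hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
             .enableHiveSupport()
             .getOrCreate())

    # Register the Hive UDF shipped in the daybits jar (class name from the diff).
    spark.sql("CREATE TEMPORARY FUNCTION dayBitsGetW1 AS 'com.gmei.data.daybits.udf.DayBitsGetsW1'")

    # Smoke test: orders on 20181212 that are not self-support and not flagged for that day.
    spark.sql("""
        SELECT order_id
        FROM mining.ml_order_spam_recognize
        WHERE partition_date = '20181212'
          AND self_support = 0
          AND dayBitsGetW1(predict_result, '20181212') = 0
    """).show()
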
@@ -467,23 +476,48 @@ object meigou_xiaofei_renshu {
final_id.createOrReplaceTempView("final_id")
// val meigou_price = sc.sql(
// s"""
// |select md.user_id,sum(md.gengmei_price) as pay_all
// |from online.ml_meigou_order_detail md left join final_id
// |on md.device_id = final_id.device_id
// |where md.status= 2
// |and final_id.device_id is null
// |and md.partition_date = '20181223'
// |and md.pay_time is not null
// |and md.validate_time>'2017-01-01 00:00:00.0'
// |group by md.user_id
// |order by sum(md.gengmei_price)
// """.stripMargin
// )
// meigou_price.show(80)
val meigou_price = sc.sql(
s"""
|select md.user_id,sum(md.gengmei_price) as pay_all
|from online.ml_meigou_order_detail md left join final_id
|on md.device_id = final_id.device_id
|from online.ml_meigou_order_detail md
|left join
|(
| SELECT
| order_id
| FROM mining.ml_order_spam_recognize
| WHERE partition_date='20181223' AND
| self_support=0 AND dayBitsGetW1(predict_result,'20181223')=0
|)spam
|on md.order_id = spam.order_id
|where md.status= 2
|and final_id.device_id is null
|and spam.order_id is null
|and md.partition_date = '20181223'
|and md.pay_time is not null
|and md.validate_time>'2018-01-01 00:00:00.0'
|and md.validate_time>'2017-01-01 00:00:00.0'
|group by md.user_id
|order by sum(md.gengmei_price)
""".stripMargin
)
meigou_price.show(80)
// meigou_price.show(80)
GmeiConfig.writeToJDBCTable(meigou_price, "meigou_price", SaveMode.Append)
// GmeiConfig.writeToJDBCTable(meigou_price, "meigou_price", SaveMode.Overwrite)
}
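
The rewritten meigou_price query excludes suspected spam orders with a left anti-join: it left-joins the day's mining.ml_order_spam_recognize rows (self_support=0 and dayBitsGetW1(predict_result,'20181223')=0) on order_id and keeps only rows where spam.order_id is null before summing gengmei_price per user. A minimal pandas sketch of the same exclusion pattern, with hypothetical toy frames standing in for the Hive tables:

    import pandas as pd

    # Hypothetical stand-ins for online.ml_meigou_order_detail and the spam subquery.
    orders = pd.DataFrame({
        "order_id": [1, 2, 3, 4],
        "user_id": [10, 10, 11, 12],
        "gengmei_price": [100.0, 250.0, 80.0, 40.0],
    })
    spam_orders = pd.DataFrame({"order_id": [2, 4]})  # orders flagged as spam

    # Left join plus "no match" filter = anti-join, mirroring
    # "left join (...) spam on md.order_id = spam.order_id ... and spam.order_id is null".
    merged = orders.merge(spam_orders, on="order_id", how="left", indicator=True)
    clean = merged[merged["_merge"] == "left_only"].drop(columns="_merge")

    # Aggregate per user, like "group by md.user_id ... order by sum(md.gengmei_price)".
    pay_all = clean.groupby("user_id")["gengmei_price"].sum().rename("pay_all").sort_values()
    print(pay_all)
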
@@ -65,7 +65,6 @@ def sort_app():
"child": {"小伴龙", "儿歌多多", "宝宝巴士奇妙屋", "智慧树", "贝瓦儿歌", "儿歌点点", "宝贝听听", "宝宝小厨房", "宝宝游乐园", "叽里呱啦"},
"homework": {"作业帮", "小猿搜题", "一起作业学生端", "学霸君", "互动作业", "猿题库", "纳米盒", "阿凡题", "洋葱数学"},
"work": {"钉钉", "企业微信", "移动彩云", "云之家", "今目标", "口袋助理", "推事本", "奇鱼微办公", "工作圈", "明道"},
"home": {"最美装修", "齐家网", "土巴兔装修", "装修头条", "装修管家", "窝牛装修", "丽芙家居", "酷家乐装修", "惠装装修", "房天下装修"},
"job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
}
df["app_list"] = df["app_list"].apply(json_format)
@@ -98,7 +98,13 @@ class multiFFMFormatPandas:
result_map = {}
for i in data_list:
print("before:total")
print(len(result_map))
print(len(i.get()))
result_map.update(i.get())
print("result_map")
print(len(result_map))
pool.close()
pool.join()
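
The added prints around result_map.update(i.get()) compare each worker's result size with the merged total, so a merge that grows by less than a chunk's size reveals overlapping keys. A self-contained sketch of that pool-and-merge pattern, with a hypothetical encode_chunk worker standing in for the class's real encoder:

    from multiprocessing import Pool

    def encode_chunk(chunk):
        # Hypothetical worker: returns a partial index -> encoded-row mapping.
        return {idx: "encoded:" + val for idx, val in chunk}

    if __name__ == "__main__":
        chunks = [[(0, "a"), (1, "b")], [(2, "c"), (3, "d")]]
        pool = Pool(processes=2)
        data_list = [pool.apply_async(encode_chunk, (c,)) for c in chunks]

        result_map = {}
        for i in data_list:
            partial = i.get()
            print("before merge, total:", len(result_map), "chunk:", len(partial))
            result_map.update(partial)  # duplicate keys would be overwritten silently
            print("after merge, total:", len(result_map))

        pool.close()
        pool.join()
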
@@ -199,8 +205,8 @@ def transform(a,validate_date):
test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
train.to_csv(path + "tr.csv", sep="\t", index=False)
test.to_csv(path + "va.csv", sep="\t", index=False)
# train.to_csv(path + "tr.csv", sep="\t", index=False)
# test.to_csv(path + "va.csv", sep="\t", index=False)
return model
@@ -218,14 +224,23 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
print("before filter:")
print(df.shape)
print(df.loc[df["device_id"]=="358035085192742"].shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape)
print(df.loc[df["device_id"] == "358035085192742"].shape)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
@@ -236,9 +251,12 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
[df["device_id"].values.tolist(), df["ucity_id"].values.tolist(), df["cid_id"].values.tolist(),
df["y"].values.tolist(), df["z"].values.tolist()], sep=",")
df = df.drop(["z","label","device_id","cid_id"], axis=1).fillna(0.0)
print(df.head(2))
df = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(df)
print("before transform")
print(df.shape)
temp_series = model.transform(df,n=160000, processes=22)
df = pd.DataFrame(temp_series)
print("after transform")
print(df.shape)
df["label"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
df["city_id"] = df[0].apply(lambda x: x.split(",")[2])
@@ -251,14 +269,21 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
df = df.drop([0, "seq"], axis=1)
print(df.head())
print(df.loc[df["device_id"] == "358035085192742"].shape)
native_pre = df[df["label"] == "0"]
native_pre = native_pre.drop("label", axis=1)
print("native")
print(native_pre.shape)
print(native_pre.loc[native_pre["device_id"] == "358035085192742"].shape)
native_pre.to_csv(path+"native.csv",sep="\t",index=False)
# print("native_pre shape")
# print(native_pre.shape)
nearby_pre = df[df["label"] == "1"]
nearby_pre = nearby_pre.drop("label", axis=1)
print("nearby")
print(nearby_pre.shape)
print(nearby_pre.loc[nearby_pre["device_id"] == "358035085192742"].shape)
nearby_pre.to_csv(path + "nearby.csv", sep="\t", index=False)
# print("nearby_pre shape")
# print(nearby_pre.shape)
@@ -268,8 +293,8 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
if __name__ == "__main__":
path = "/home/gmuser/ffm/"
a = time.time()
df, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(df, validate_date)
temp, validate_date, ucity_id,ccity_name,manufacturer,channel = get_data()
model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel)
b = time.time()
print("cost(分钟)")