Commit 5d132b92 authored by 张彦钊's avatar 张彦钊

change test flow

parent cec6a113
......@@ -139,7 +139,7 @@ def feature_engineer():
validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:" + validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=100)).strftime("%Y-%m-%d")
start = (temp - datetime.timedelta(days=10)).strftime("%Y-%m-%d")
print(start)
db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC')
......@@ -266,6 +266,7 @@ def get_predict(date,value_map,app_list_map,leve2_map,leve3_map):
"treatment_method", "price_min", "price_max", "treatment_time", "maintain_time", "recover_time"]
df = spark.sql(sql)
df = df.drop_duplicates(["treatment_method","price_min","price_max","treatment_time","maintain_time","recover_time"])
df = df.na.fill(dict(zip(features, features)))
f = time.time()
rdd = df.select("label", "y", "z", "ucity_id", "device_id", "cid_id", "app_list", "level2_ids", "level3_ids",
......@@ -295,6 +296,9 @@ def get_predict(date,value_map,app_list_map,leve2_map,leve3_map):
.toDF("city","uid","cid_id")
print("native csv")
native_pre.toPandas().to_csv(local_path+"native.csv", header=True)
print("预测集总数")
print(rdd.count())
# TODO 写成csv文件改成下面这样
# native_pre.coalesce(1).write.format('com.databricks.spark.csv').save(path+"native/",header = 'true')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment