Commit d94b9aa5 authored by 张彦钊

change test file

parent ce8cfc08
@@ -53,7 +53,6 @@ def get_map():
     c = time.time()
     leve3_number, leve3_map = get_list(db, sql, 1+leve2_number+apps_number)
     print((time.time() - c) / 60)
     return apps_number, app_list_map,leve2_number, leve2_map,leve3_number, leve3_map
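A hedged sketch of what get_list is assumed to do at this call site: run the query, collect the distinct values, and number them sequentially starting from the given offset so that the app-list, level-2 and level-3 vocabularies share one id space. The body below illustrates that assumption; it is not the repository's implementation.

# Assumed behaviour of get_list(db, sql, offset) -> (count, value-to-id map)
def get_list_sketch(db, sql, offset):
    cursor = db.cursor()
    cursor.execute(sql)
    values = [row[0] for row in cursor.fetchall()]
    # ids start at `offset` so they do not collide with earlier vocabularies
    value_map = {v: i + offset for i, v in enumerate(values)}
    return len(values), value_map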
@@ -67,10 +66,16 @@ def get_unique(db,sql):
     print(len(v))
     return v
+def con_sql(db,sql):
+    cursor = db.cursor()
+    cursor.execute(sql)
+    result = cursor.fetchall()
+    df = pd.DataFrame(list(result))
+    db.close()
+    return df
 def feature_engineer():
     apps_number, app_list_map, level2_number, level2_map, level3_number, level3_map = get_map()
     unique_values = []
     db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select distinct stat_date from esmm_train_data_dur"
@@ -188,7 +193,7 @@ def feature_engineer():
             "tag1","tag2","tag3","tag4","tag5","tag6","tag7",
             "ucity_id", "ccity_name","device_type", "manufacturer", "channel", "top", "time",
             "hospital_id","treatment_method", "price_min", "price_max", "treatment_time",
-            "maintain_time","recover_time").rdd.coalesce(200)\
+            "maintain_time","recover_time").rdd.repartition(200)\
         .map(lambda x: (x[0],float(x[1]),float(x[2]),
                         app_list_func(x[3], app_list_map), app_list_func(x[4], level2_map),
                         app_list_func(x[5], level3_map), app_list_func(x[6], level2_map),
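The one functional change in this hunk swaps coalesce(200) for repartition(200): coalesce only merges existing partitions without a shuffle and can never increase the partition count, while repartition always shuffles and rebalances the rows across exactly the requested number of partitions. A minimal, self-contained illustration (the local session and toy data are assumptions):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("repartition_vs_coalesce").getOrCreate()
rdd = spark.sparkContext.parallelize(range(1000), 8)
print(rdd.coalesce(200).getNumPartitions())     # still 8: coalesce cannot grow the partition count
print(rdd.repartition(200).getNumPartitions())  # 200: full shuffle spreads rows evenly
spark.stop()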
@@ -320,14 +325,6 @@ def get_predict():
     rdd.unpersist()
-def con_sql(db,sql):
-    cursor = db.cursor()
-    cursor.execute(sql)
-    result = cursor.fetchall()
-    df = pd.DataFrame(list(result))
-    db.close()
-    return df
 if __name__ == '__main__':
     sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
......
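The __main__ block is cut off after the first SparkConf option. A hedged sketch of how such a conf is commonly turned into a Hive-enabled session (every builder option beyond the visible line is an assumption, not the repository's code):

from pyspark import SparkConf
from pyspark.sql import SparkSession

sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true")
spark = (SparkSession.builder
         .config(conf=sparkConf)
         .appName("esmm_feature_engineering")   # name assumed for illustration
         .enableHiveSupport()
         .getOrCreate())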