Commit 68d51e67 authored by 张彦钊

change test file

parent d6984584
......@@ -206,15 +206,15 @@ def feature_engineer():
df = spark.sql(sql)
print("train number")
print(df.count())
df = df.drop_duplicates(["ucity_id", "level2_ids", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids",
"tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7","search_tag2","search_tag3"])
df = df.na.fill(dict(zip(features, features)))
print("train number")
print(df.count())
rdd = df.select("stat_date", "y", "z", "app_list", "level2_ids", "level3_ids",
"tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7",
"ucity_id", "ccity_name", "device_type", "manufacturer", "channel", "top", "time",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment