Commit 03f86762 authored by 张彦钊's avatar 张彦钊

删除device id特征

parent 1342e3ec
...@@ -45,12 +45,12 @@ def get_data(): ...@@ -45,12 +45,12 @@ def get_data():
print("after") print("after")
df = df.drop_duplicates() df = df.drop_duplicates()
df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1","l2", "time", "stat_date","device_id"]) "channel", "top", "l1","l2", "time", "stat_date"])
print(df.shape) print(df.shape)
unique_values = [] unique_values = []
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date","device_id"] "channel", "top", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
...@@ -71,10 +71,11 @@ def get_data(): ...@@ -71,10 +71,11 @@ def get_data():
temp = list(range(1,len(unique_values)+1)) temp = list(range(1,len(unique_values)+1))
value_map = dict(zip(unique_values,temp)) value_map = dict(zip(unique_values,temp))
df = df.drop("device_id", axis=1)
train = df[df["stat_date"] != validate_date+"stat_date"] train = df[df["stat_date"] != validate_date+"stat_date"]
test = df[df["stat_date"] == validate_date+"stat_date"] test = df[df["stat_date"] == validate_date+"stat_date"]
for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2","device_id"]: "channel", "top", "l1", "time", "stat_date","l2"]:
train[i] = train[i].map(value_map) train[i] = train[i].map(value_map)
test[i] = test[i].map(value_map) test[i] = test[i].map(value_map)
...@@ -107,8 +108,7 @@ def get_predict(date,value_map): ...@@ -107,8 +108,7 @@ def get_predict(date,value_map):
"from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \ "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \ "left join cid_type_top c on e.device_id = c.device_id " \
"left join cid_level2 cl on e.cid_id = cl.cid " \ "left join cid_level2 cl on e.cid_id = cl.cid " \
"left join cid_time_cut cut on e.cid_id = cut.cid " \ "left join cid_time_cut cut on e.cid_id = cut.cid"
"where e.device_id = '358035085192742'"
df = con_sql(db, sql) df = con_sql(db, sql)
df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name", df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2", 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",
...@@ -118,12 +118,10 @@ def get_predict(date,value_map): ...@@ -118,12 +118,10 @@ def get_predict(date,value_map):
print("predict shape") print("predict shape")
print(df.shape) print(df.shape)
df["uid"] = df["device_id"] df["uid"] = df["device_id"]
df["city"] = df["ucity_id"] df["city"] = df["ucity_id"]
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "time", "stat_date","device_id"] "channel", "top", "time", "stat_date"]
for i in features: for i in features:
df[i] = df[i].astype("str") df[i] = df[i].astype("str")
df[i] = df[i].fillna("lost") df[i] = df[i].fillna("lost")
...@@ -139,9 +137,8 @@ def get_predict(date,value_map): ...@@ -139,9 +137,8 @@ def get_predict(date,value_map):
nearby_pre = df[df["label"] == 1] nearby_pre = df[df["label"] == 1]
nearby_pre = nearby_pre.drop("label", axis=1) nearby_pre = nearby_pre.drop("label", axis=1)
for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2","device_id"]: "channel", "top", "l1", "time", "stat_date","l2"]:
native_pre[i] = native_pre[i].map(value_map) native_pre[i] = native_pre[i].map(value_map)
# TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下 # TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下
native_pre[i] = native_pre[i].fillna(0) native_pre[i] = native_pre[i].fillna(0)
...@@ -150,6 +147,7 @@ def get_predict(date,value_map): ...@@ -150,6 +147,7 @@ def get_predict(date,value_map):
# TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下 # TODO 没有覆盖到的类别会处理成na,暂时用0填充,后续完善一下
nearby_pre[i] = nearby_pre[i].fillna(0) nearby_pre[i] = nearby_pre[i].fillna(0)
print("native") print("native")
print(native_pre.shape) print(native_pre.shape)
print(native_pre.head()) print(native_pre.head())
...@@ -170,4 +168,3 @@ if __name__ == '__main__': ...@@ -170,4 +168,3 @@ if __name__ == '__main__':
...@@ -29,7 +29,7 @@ def gen_tfrecords(in_file): ...@@ -29,7 +29,7 @@ def gen_tfrecords(in_file):
for i in range(df.shape[0]): for i in range(df.shape[0]):
feats = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", feats = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1", "time", "stat_date","l2","device_id"] "channel", "top", "l1", "time", "stat_date","l2"]
id = np.array([]) id = np.array([])
for j in feats: for j in feats:
id = np.append(id,df[j][i]) id = np.append(id,df[j][i])
......
...@@ -53,7 +53,7 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False): ...@@ -53,7 +53,7 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
features = { features = {
"y": tf.FixedLenFeature([], tf.float32), "y": tf.FixedLenFeature([], tf.float32),
"z": tf.FixedLenFeature([], tf.float32), "z": tf.FixedLenFeature([], tf.float32),
"ids": tf.FixedLenFeature([12], tf.int64) "ids": tf.FixedLenFeature([11], tf.int64)
} }
parsed = tf.parse_single_example(record, features) parsed = tf.parse_single_example(record, features)
...@@ -351,20 +351,6 @@ def main(_): ...@@ -351,20 +351,6 @@ def main(_):
fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr'])) fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr']))
elif FLAGS.task_type == 'export': elif FLAGS.task_type == 'export':
print("Not Implemented, Do It Yourself!") print("Not Implemented, Do It Yourself!")
#feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
#feature_spec = {
# 'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]),
# 'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size])
#}
#serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
#feature_spec = {
# 'feat_ids': tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.field_size], name='feat_ids'),
# 'feat_vals': tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.field_size], name='feat_vals')
#}
#serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
#Estimator.export_savedmodel(FLAGS.servable_model_dir, serving_input_receiver_fn)
if __name__ == "__main__": if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment