删除device id特征

03f86762 · 张彦钊 · 1342e3ec · 03f86762 · 03f86762 · 03f86762
Commit 03f86762 authored Jan 22, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 28 deletions

feature_engineering.py tensnsorflow/feature_engineering.py +8 -11

test.py tensnsorflow/test.py +3 -2

train.py tensnsorflow/train.py +1 -15

No files found.
--- a/tensnsorflow/feature_engineering.py
+++ b/tensnsorflow/feature_engineering.py
@@ -45,12 +45,12 @@ def get_data():
    print("after")
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                             "channel", "top", "l1","l2", "time", "stat_date","device_id"])
+                             "channel", "top", "l1","l2", "time", "stat_date"])
    print(df.shape)
    unique_values = []
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "time", "stat_date","device_id"]
+                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
@@ -71,10 +71,11 @@ def get_data():
    temp = list(range(1,len(unique_values)+1))
    value_map = dict(zip(unique_values,temp))
+    df = df.drop("device_id", axis=1)
    train = df[df["stat_date"] != validate_date+"stat_date"]
    test = df[df["stat_date"] == validate_date+"stat_date"]
    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "l1", "time", "stat_date","l2","device_id"]:
+                "channel", "top", "l1", "time", "stat_date","l2"]:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)
@@ -107,8 +108,7 @@ def get_predict(date,value_map):
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
-          "left join cid_time_cut cut on e.cid_id = cut.cid " \
+          "left join cid_time_cut cut on e.cid_id = cut.cid"
-          "where e.device_id = '358035085192742'"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1",11:"l2",
@@ -118,12 +118,10 @@ def get_predict(date,value_map):
    print("predict shape")
    print(df.shape)
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "time", "stat_date","device_id"]
+                "channel", "top", "time", "stat_date"]
    for i in features:
        df[i] = df[i].astype("str")
        df[i] = df[i].fillna("lost")
@@ -139,9 +137,8 @@ def get_predict(date,value_map):
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)
    for i in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "l1", "time", "stat_date","l2","device_id"]:
+                "channel", "top", "l1", "time", "stat_date","l2"]:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO 没有覆盖到的类别会处理成na，暂时用0填充，后续完善一下
        native_pre[i] = native_pre[i].fillna(0)
@@ -150,6 +147,7 @@ def get_predict(date,value_map):
        # TODO 没有覆盖到的类别会处理成na，暂时用0填充，后续完善一下
        nearby_pre[i] = nearby_pre[i].fillna(0)
    print("native")
    print(native_pre.shape)
    print(native_pre.head())
@@ -170,4 +168,3 @@ if __name__ == '__main__':
--- a/tensnsorflow/test.py
+++ b/tensnsorflow/test.py
@@ -29,7 +29,7 @@ def gen_tfrecords(in_file):
    for i in range(df.shape[0]):
        feats = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
-                "channel", "top", "l1", "time", "stat_date","l2","device_id"]
+                "channel", "top", "l1", "time", "stat_date","l2"]
        id = np.array([])
        for j in feats:
            id = np.append(id,df[j][i])
@@ -58,4 +58,4 @@ def main(_):
 if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()
\ No newline at end of file
--- a/tensnsorflow/train.py
+++ b/tensnsorflow/train.py
@@ -53,7 +53,7 @@ def input_fn(filenames, batch_size=32, num_epochs=1, perform_shuffle=False):
        features = {
            "y": tf.FixedLenFeature([], tf.float32),
            "z": tf.FixedLenFeature([], tf.float32),
-            "ids": tf.FixedLenFeature([12], tf.int64)
+            "ids": tf.FixedLenFeature([11], tf.int64)
        }
        parsed = tf.parse_single_example(record, features)
@@ -351,20 +351,6 @@ def main(_):
                    fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr']))
    elif FLAGS.task_type == 'export':
        print("Not Implemented, Do It Yourself!")
-        #feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
-        #feature_spec = {
-        #    'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]),
-        #    'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size])
-        #}
-        #serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
-        #feature_spec = {
-        #    'feat_ids': tf.placeholder(dtype=tf.int64, shape=[None, FLAGS.field_size], name='feat_ids'),
-        #    'feat_vals': tf.placeholder(dtype=tf.float32, shape=[None, FLAGS.field_size], name='feat_vals')
-        #}
-        #serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_spec)
-        #Estimator.export_savedmodel(FLAGS.servable_model_dir, serving_input_receiver_fn)
 if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)