Commit 4e17c892 authored by 张彦钊

pandas mapping

parent b0fa9e51
@@ -93,9 +93,9 @@ def get_predict(date,value_map):
         df[i] = df[i].fillna("lost")
         df[i] = df[i] + i
-    native_pre = df[df["label"] == "0"]
+    native_pre = df[df["label"] == 0]
     native_pre = native_pre.drop("label", axis=1)
-    nearby_pre = df[df["label"] == "1"]
+    nearby_pre = df[df["label"] == 1]
     nearby_pre = nearby_pre.drop("label", axis=1)
     for i in features:
......
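The hunk above changes the label filter from the strings "0"/"1" to the integers 0/1: after the pandas mapping, label is an integer column, and comparing an int64 column against a string matches no rows, so native_pre/nearby_pre would come back empty. A small illustration (not from the commit):

import pandas as pd

df = pd.DataFrame({"label": [0, 1, 0]})
print(df[df["label"] == "0"].shape[0])  # 0 rows: a string never equals int64
print(df[df["label"] == 0].shape[0])    # 2 rows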
import pandas as pd
import pymysql


def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        # print the exception instance, not the Exception class
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
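A minimal usage sketch for con_sql; the connection parameters are the ones used elsewhere in this commit, and the query is illustrative only. Note that con_sql closes the connection in its finally block, so each query needs a fresh pymysql.connect:

import pymysql

db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
df = con_sql(db, "select cid_id,time from cid_time limit 10")  # illustrative query
print(df.shape)  # columns are unnamed integer positions 0, 1, ...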
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db, sql)
    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1]))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))
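The two counting loops above compute, for each level1/level2 id, its share of the result set. Assuming df is the unnamed-column DataFrame returned by con_sql, value_counts can express the same thing more idiomatically:

# equivalent to one_map / two_map, sorted ascending by share
print(df[1].value_counts(normalize=True).sort_values())
print(df[2].value_counts(normalize=True).sort_values())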
# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import pymysql  # needed by click() / get_cid() below
import sys
import os
import glob
def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)
import tensorflow as tf
import numpy as np
import re
from multiprocessing import Pool as ThreadPool
    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1], reverse=True))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1], reverse=True))
flags = tf.app.flags
FLAGS = flags.FLAGS
LOG = tf.logging
def get_cid():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct cid_id from esmm_train_data where device_id = '358035085192742' " \
          "and stat_date >= '2018-12-03'"
    df = con_sql(db, sql)[0].values.tolist()
    print(",".join(df))
tf.app.flags.DEFINE_string("input_dir", "./", "input dir")
tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
tf.app.flags.DEFINE_integer("threads", 16, "threads num")
def gen_tfrecords(in_file):
    # imported inside the function so each multiprocessing worker has them
    import os
    import tensorflow as tf
    basename = os.path.basename(in_file) + ".tfrecord"
    # join the output file path
    out_file = os.path.join(FLAGS.output_dir, basename)
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
    df = pd.read_csv(in_file)
    # ["", "", "", "device_type", "manufacturer", , "level2_ids", "time", "stat_date"]
    for i in range(df.shape[0]):
        features = tf.train.Features(feature={
            "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["y"][i]])),
            "z": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["z"][i]])),
            "top": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["top"][i]])),
            "channel": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["channel"][i]])),
            "ucity_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["ucity_id"][i]])),
            "clevel1_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["clevel1_id"][i]])),
            "ccity_name": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["ccity_name"][i]])),
        })
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        tfrecord_out.write(serialized)
    tfrecord_out.close()
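A minimal sketch (TF 1.x API, matching tf.python_io above) of how one of these records could be parsed back; the feature spec mirrors the keys written in gen_tfrecords, and the file name is hypothetical:

import tensorflow as tf

def parse_fn(serialized):
    spec = {
        "y": tf.FixedLenFeature([], tf.int64),
        "z": tf.FixedLenFeature([], tf.int64),
        "top": tf.FixedLenFeature([], tf.int64),
        "channel": tf.FixedLenFeature([], tf.int64),
    }
    return tf.parse_single_example(serialized, spec)

dataset = tf.data.TFRecordDataset(["part-00000.csv.tfrecord"])  # hypothetical file
dataset = dataset.map(parse_fn)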
def get_cid_time():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select cid_id,time from cid_time"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "cid", 1: "time"})
    print(df.head(6))
    df.to_csv("/home/gmuser/cid_time.csv", index=None)
def main(_):
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
    print("total files: %d" % len(file_list))
    pool = ThreadPool(FLAGS.threads)  # sets the pool size
    pool.map(gen_tfrecords, file_list)
    pool.close()
    pool.join()


if __name__ == "__main__":
    get_cid_time()
    # pd.cut()  # stray no-argument call; pd.cut requires data and bins, so it is left disabled
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()
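The disabled pd.cut() call above suggests an intent to bucket the cid_time "time" column. A minimal sketch of how that could look, assuming the column is numeric; the bin edges and labels are purely illustrative, not from the commit:

cid_df = pd.read_csv("/home/gmuser/cid_time.csv")
# assumed bins; the commit does not specify edges or labels
cid_df["time_bin"] = pd.cut(cid_df["time"], bins=[0, 10, 100, 1000], labels=["low", "mid", "high"])
print(cid_df.head())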
\ No newline at end of file