Commit 4e17c892 authored by 张彦钊

pandas mapping

parent b0fa9e51
@@ -93,9 +93,9 @@ def get_predict(date,value_map):
         df[i] = df[i].fillna("lost")
         df[i] = df[i] + i
-    native_pre = df[df["label"] == "0"]
+    native_pre = df[df["label"] == 0]
     native_pre = native_pre.drop("label", axis=1)
-    nearby_pre = df[df["label"] == "1"]
+    nearby_pre = df[df["label"] == 1]
     nearby_pre = nearby_pre.drop("label", axis=1)
     for i in features:
......
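The hunk above changes the label filter from the strings "0"/"1" to the integers 0/1: after the pandas mapping, label is an integer column, and comparing an int64 column against a string matches no rows, so native_pre/nearby_pre would come back empty. A small illustration (not from the commit):

import pandas as pd

df = pd.DataFrame({"label": [0, 1, 0]})
print(df[df["label"] == "0"].shape[0])  # 0 rows: a string never equals int64
print(df[df["label"] == 0].shape[0])    # 2 rows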
import pandas as pd
import pymysql


def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        # print the exception instance, not the Exception class
        print("exception occurred:", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
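A minimal usage sketch for con_sql; the connection parameters are the ones used elsewhere in this commit, and the query is illustrative only. Note that con_sql closes the connection in its finally block, so each query needs a fresh pymysql.connect:

import pymysql

db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
df = con_sql(db, "select cid_id,time from cid_time limit 10")  # illustrative query
print(df.shape)  # columns are unnamed integer positions 0, 1, ...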
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db, sql)
    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1]))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))
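The two counting loops above compute, for each level1/level2 id, its share of the result set. Assuming df is the unnamed-column DataFrame returned by con_sql, value_counts can express the same thing more idiomatically:

# equivalent to one_map / two_map, sorted ascending by share
print(df[1].value_counts(normalize=True).sort_values())
print(df[2].value_counts(normalize=True).sort_values())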
# coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pandas as pd
import pymysql  # needed by click() / get_cid() below
import sys
import os
import glob
def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)
import tensorflow as tf
import numpy as np
import re
from multiprocessing import Pool as ThreadPool
    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1], reverse=True))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1], reverse=True))
flags = tf.app.flags
FLAGS = flags.FLAGS
LOG = tf.logging
def get_cid():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct cid_id from esmm_train_data where device_id = '358035085192742' " \
          "and stat_date >= '2018-12-03'"
    df = con_sql(db, sql)[0].values.tolist()
    print(",".join(df))
tf.app.flags.DEFINE_string("input_dir", "./", "input dir")
tf.app.flags.DEFINE_string("output_dir", "./", "output dir")
tf.app.flags.DEFINE_integer("threads", 16, "threads num")
def gen_tfrecords(in_file):
    # imported inside the function so each multiprocessing worker has them
    import os
    import tensorflow as tf
    basename = os.path.basename(in_file) + ".tfrecord"
    # join the output file path
    out_file = os.path.join(FLAGS.output_dir, basename)
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
    df = pd.read_csv(in_file)
    # ["", "", "", "device_type", "manufacturer", , "level2_ids", "time", "stat_date"]
    for i in range(df.shape[0]):
        features = tf.train.Features(feature={
            "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["y"][i]])),
            "z": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["z"][i]])),
            "top": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["top"][i]])),
            "channel": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["channel"][i]])),
            "ucity_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["ucity_id"][i]])),
            "clevel1_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["clevel1_id"][i]])),
            "ccity_name": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["ccity_name"][i]])),
        })
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        tfrecord_out.write(serialized)
    tfrecord_out.close()
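A minimal sketch (TF 1.x API, matching tf.python_io above) of how one of these records could be parsed back; the feature spec mirrors the keys written in gen_tfrecords, and the file name is hypothetical:

import tensorflow as tf

def parse_fn(serialized):
    spec = {
        "y": tf.FixedLenFeature([], tf.int64),
        "z": tf.FixedLenFeature([], tf.int64),
        "top": tf.FixedLenFeature([], tf.int64),
        "channel": tf.FixedLenFeature([], tf.int64),
    }
    return tf.parse_single_example(serialized, spec)

dataset = tf.data.TFRecordDataset(["part-00000.csv.tfrecord"])  # hypothetical file
dataset = dataset.map(parse_fn)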
def get_cid_time():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select cid_id,time from cid_time"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "cid", 1: "time"})
    print(df.head(6))
    df.to_csv("/home/gmuser/cid_time.csv", index=None)
def main(_):
    if not os.path.exists(FLAGS.output_dir):
        os.mkdir(FLAGS.output_dir)
    file_list = glob.glob(os.path.join(FLAGS.input_dir, "*.csv"))
    print("total files: %d" % len(file_list))
    pool = ThreadPool(FLAGS.threads)  # sets the pool size
    pool.map(gen_tfrecords, file_list)
    pool.close()
    pool.join()


if __name__ == "__main__":
    get_cid_time()
    # pd.cut()  # stray no-argument call; pd.cut requires data and bins, so it is left disabled
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()
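The disabled pd.cut() call above suggests an intent to bucket the cid_time "time" column. A minimal sketch of how that could look, assuming the column is numeric; the bin edges and labels are purely illustrative, not from the commit:

cid_df = pd.read_csv("/home/gmuser/cid_time.csv")
# assumed bins; the commit does not specify edges or labels
cid_df["time_bin"] = pd.cut(cid_df["time"], bins=[0, 10, 100, 1000], labels=["low", "mid", "high"])
print(cid_df.head())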
\ No newline at end of file