test.py 3.68 KB
import pandas as pd
import pymysql


def con_sql(db,sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("发生异常", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df



def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db,sql)


    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1]==i].shape[0]/n
    print(sorted(one_map.items(),key = lambda x:x[1]))
    two = df[2].unique()
    two_map = {}
    print("分界线")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))


def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)

    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1],reverse=True))
    two = df[2].unique()
    two_map = {}
    print("分界线")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1],reverse=True))

def get_cid():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct cid_id from esmm_train_data where device_id = '358035085192742' " \
          "and stat_date >= '2018-12-03'"
    df = con_sql(db, sql)[0].values.tolist()
    print(",".join(df))

def gen_tfrecords(in_file):
    import os
    import tensorflow as tf
    path = "/home/data/"
    basename = os.path.basename(in_file) + ".tfrecord"
    # 拼接文件路径
    out_file = os.path.join(path, basename)
    tfrecord_out = tf.python_io.TFRecordWriter(out_file)
    df = pd.read_csv(in_file)
    for i in range(df.shape[0]):
        features = tf.train.Features(feature={
            "y": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["y"][i]])),
            "z": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["z"][i]])),
            "top": tf.train.Feature(int64_list=tf.train.Int64List(value=[df["top"][i]]))
        })
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        tfrecord_out.write(serialized)
    tfrecord_out.close()

def get_cid_time():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select cid_id,time from cid_time"
    df = con_sql(db, sql)
    df = df.rename(columns = {0:"cid",1:"time"})
    print(df.head(6))
    df.to_csv("/home/gmuser/cid_time.csv",index=None)



if __name__ == "__main__":
    get_cid_time()
    pd.cut()