Commit d147e9b5 authored by 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

add if
parents 95adea27 d9a211b7
...@@ -4,6 +4,7 @@ import pymysql ...@@ -4,6 +4,7 @@ import pymysql
import pandas as pd import pandas as pd
from multiprocessing import Pool from multiprocessing import Pool
import numpy as np import numpy as np
import datetime
from sqlalchemy import create_engine from sqlalchemy import create_engine
...@@ -20,23 +21,30 @@ def con_sql(db,sql): ...@@ -20,23 +21,30 @@ def con_sql(db,sql):
db.close() db.close()
return df return df
def test(): # def test():
sql = "select max(update_time) from ffm_diary_queue" # sql = "select max(update_time) from ffm_diary_queue"
db = pymysql.connect(host='192.168.15.12', port=4000, user='root', db='eagle') # db = pymysql.connect(host='192.168.15.12', port=4000, user='root', db='eagle')
cursor = db.cursor() # cursor = db.cursor()
cursor.execute(sql) # cursor.execute(sql)
result = cursor.fetchone()[0] # result = cursor.fetchone()[0]
db.close() # db.close()
print(result) # print(result)
def get_data(): def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data" sql = "select max(stat_date) from esmm_train_data"
validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:"+validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,y,z,stat_date,ucity_id,cid_id,clevel1_id,ccity_name from esmm_train_data " \
"where stat_date >= '{}'".format(start)
df = con_sql(db,sql) df = con_sql(db,sql)
df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id", df = df.rename(columns={0:"device_id",1: "y",2:"z",3:"stat_date",4:"ucity_id",5:"cid_id",
6:"clevel1_id",7:"ccity_name"}) 6:"clevel1_id",7:"ccity_name"})
print("esmm data ok") print("esmm data ok")
print(df.head())
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
df["cid_id"] = df["cid_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str")
df["y"] = df["y"].astype("str") df["y"] = df["y"].astype("str")
...@@ -47,17 +55,19 @@ def get_data(): ...@@ -47,17 +55,19 @@ def get_data():
print(df.head()) print(df.head())
print("shape") print("shape")
print(df.shape) print(df.shape)
df = pd.merge(df,get_statistics(),on="device_id",how='left').fillna(0) df = pd.merge(df,get_statistics(),how='left').fillna(0)
print("merge") print("merge")
print(df.head()) print(df.head())
print("shape") print("shape")
print(df.shape) print(df.shape)
# transform(df) df = df.drop("device_id", axis=1)
print(df.head())
transform(df, validate_date)
def transform(df): def transform(df,validate_date):
model = multiFFMFormatPandas() model = multiFFMFormatPandas()
df = model.fit_transform(df, y="y", n=80000, processes=10) df = model.fit_transform(df, y="y", n=80000, processes=18)
df = pd.DataFrame(df) df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0]) df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1]) df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
...@@ -68,16 +78,17 @@ def transform(df): ...@@ -68,16 +78,17 @@ def transform(df):
df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:])) df["ffm"] = df[0].apply(lambda x: ",".join(x.split(",")[4:]))
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",") df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0]) df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop(0, axis=1).drop("seq",axis=1) df = df.drop([0,"seq"], axis=1)
print(df.head()) print(df.head())
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data"
df = con_sql(db, sql)[0].values.tolist()[0]
train = df[df["stat_date"] != "2018-11-25"] train = df[df["stat_date"] != validate_date]
train = train.drop("stat_date",axis=1) train = train.drop("stat_date",axis=1)
test = df[df["stat_date"] == "2018-11-25"] print("train shape")
print(train.shape)
test = df[df["stat_date"] == validate_date]
test = test.drop("stat_date",axis=1) test = test.drop("stat_date",axis=1)
print("test shape")
print(test.shape)
train.to_csv(path+"train.csv",index=None) train.to_csv(path+"train.csv",index=None)
test.to_csv(path + "test.csv", index=None) test.to_csv(path + "test.csv", index=None)
# yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
...@@ -95,15 +106,15 @@ def transform(df): ...@@ -95,15 +106,15 @@ def transform(df):
def get_statistics():
    """Load per-device home-tab click statistics as click-share ratios.

    Queries ``home_tab_click`` in the ``eagle`` database, removes exact
    duplicate rows, and divides every per-tab click count by that row's
    ``total``.  The ``total`` column itself is dropped before returning.

    Returns:
        A DataFrame containing ``device_id``, ``os`` and one ratio column
        per tab.  Only the first three result columns are renamed, so the
        tab columns presumably keep the positional integer labels that
        ``con_sql`` produces -- TODO confirm against ``con_sql``.
    """
    # NOTE(review): credentials are hard-coded, as elsewhere in this file;
    # consider moving them into configuration.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='eagle')
    sql = "select device_id,device_type,total,精选,直播,鼻部,眼部,微整,牙齿,轮廓,美肤抗衰," \
          "吸脂,脂肪填充,隆胸,私密,毛发管理,公立,韩国 from home_tab_click"

    # NOTE(review): drop_duplicates() only removes fully identical rows; a
    # device_id that appears with different values still yields several rows
    # and will multiply rows in a later merge on device_id.
    df = con_sql(db, sql).drop_duplicates()
    df = df.rename(columns={0: "device_id", 1: "os", 2: "total"})

    # Every column that is not an identifier or the total is a per-tab count.
    ratio_cols = df.columns.difference(["device_id", "os", "total"])
    if len(ratio_cols):
        # One vectorised row-wise division instead of a Python loop per column.
        ratios = df[ratio_cols].div(df["total"], axis=0)
        # A zero total with a non-zero count divides to +/-inf, which a later
        # fillna(0) would not catch; normalise it to NaN like the 0/0 case.
        infinite = [float("inf"), float("-inf")]
        df[ratio_cols] = ratios.replace(infinite, float("nan"))

    return df.drop("total", axis=1)
class multiFFMFormatPandas: class multiFFMFormatPandas:
def __init__(self): def __init__(self):
self.field_index_ = None self.field_index_ = None
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment