Commit d61a97b5 authored by 张彦钊

change directories and modules

parent 4f096b7d
import datetime
import pymysql
import pandas as pd
import numpy as np
from utils import FFMFormatPandas
import xlearn as xl
import time
from prepareData import fetch_data
# Fetch rows from a database table and convert them to a DataFrame.
def con_sql(sql):
    """Execute ``sql`` on the ``jerry_test`` database and return the rows.

    Args:
        sql: SQL statement passed unchanged to ``cursor.execute``.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows, with every row
        that contains a missing value dropped. Columns are positional
        (0, 1, 2, ...) because no column names are supplied.

    Raises:
        Whatever ``pymysql`` raises on connection or query failure; the
        connection is closed before the exception propagates.
    """
    # NOTE(review): connection credentials are hard-coded in source; they
    # should be moved to configuration / environment and rotated.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        # The cursor is a context manager, so it is closed even on error.
        with db.cursor() as cursor:
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        # Always release the connection, including when the query fails.
        db.close()
    return pd.DataFrame(list(result)).dropna()
# Fetch the distinct device_id values from the click table.
sql = "select distinct device_id from data_feed_click"
click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id")
# Fetch the click-table rows from 2018-08-03 onwards.
sql = "select cid,device_id,time,stat_date from data_feed_click where stat_date >= '2018-08-03'"
click = con_sql(sql)
# con_sql returns positional column labels; give them their real names.
click = click.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取点击表里的数据")
# Derive hour and minute from the time column (read as a Unix timestamp).
click["hour"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
click["minute"] = click["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
click = click.drop("time",axis=1)
print("点击表数据预览")
print(click.head(2))
# Fetch the exposure-table rows from 2018-08-03 onwards.
sql = "select cid,device_id,time,stat_date from data_feed_exposure where stat_date >= '2018-08-03'"
exposure = con_sql(sql)
# con_sql returns positional column labels; give them their real names.
exposure = exposure.rename(columns={0:"cid",1:"device_id",2:"time",3:"stat_date"})
print("成功获取曝光表里的数据")
# Derive hour and minute from the time column (read as a Unix timestamp).
exposure["hour"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).hour)
exposure["minute"] = exposure["time"].apply(lambda x:datetime.datetime.fromtimestamp(x).minute)
exposure = exposure.drop("time",axis=1)
print("曝光表数据预览")
print(exposure.head(2))
# NOTE(review): this rebinds exposure, click and click_device_id, so the
# results of the queries above are discarded. In the commit view the lines
# above look like the removed side of the diff and this line their
# replacement - confirm before relying on either.
exposure, click, click_device_id = fetch_data()
# Compute the set difference between the exposure table and the click table.
print("曝光表处理前的样本个数")
......@@ -86,69 +47,13 @@ print(data.head(2))
# Announce the start of the FFM conversion step.
print("start ffm transform")
# FFM format conversion class.
class FFMFormatPandas:
    """Convert a pandas DataFrame into FFM-style text rows.

    Every output row has the form ``label field:feature:value ...``.
    Object-typed columns are emitted as ``field:feature:1`` using the index
    of the ``'<column>_<value>'`` feature; signed-integer columns are emitted
    as ``field:feature:value`` using the column's own feature index. Columns
    of any other dtype kind are not emitted.
    """

    def __init__(self):
        # Column name -> field index (the label column is excluded).
        self.field_index_ = None
        # '<column>_<value>' or column name -> feature index.
        self.feature_index_ = None
        # Name of the label column, or None when there is no label.
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature index maps from ``df``.

        Args:
            df: DataFrame to index.
            y: Name of the label column, or None.

        Returns:
            ``self``, so calls can be chained.
        """
        self.y = y
        if self.field_index_ is None:
            # Fields are numbered over all columns except the label.
            df_ffm = df[df.columns.difference([self.y])]
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = dict()
        # Continue after the highest index already handed out so that a
        # refit never reuses an index that is still assigned to a feature.
        last_idx = max(self.feature_index_.values(), default=-1) + 1

        # All columns are visited, including the label column, which keeps
        # the numbering of a first fit identical to the previous behaviour.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Keep a column's index stable across refits instead of
            # renumbering it on every call.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its FFM-formatted rows."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Format one row as an FFM text line.

        Args:
            row: One DataFrame row as a Series indexed by column name.
            t: Mapping of column name -> dtype for the source DataFrame.

        Returns:
            The label followed by space-separated ``field:feature:value``
            entries. The label is ``'0'`` when no label column is set.
        """
        if self.y is not None:
            # Label-based lookup; positional ``[0]`` on a label-indexed
            # Series is deprecated in pandas.
            ffm = [str(row.loc[self.y])]
        else:
            ffm = [str(0)]

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of FFM text lines, one per row, keyed by ``df``'s index."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
# Time the FFM conversion of `data` (defined earlier, outside this view).
start = time.time()
ffm_train = FFMFormatPandas()
# 'y' is the label column; the result replaces `data` with the FFM text rows.
data = ffm_train.fit_transform(data, y='y')
print("done transform ffm")
end = time.time()
print("ffm转化数据耗时:")
print(end-start)
# Round-trip the converted rows through a CSV file on disk.
data.to_csv("/home/zhangyanzhao/data.csv",index=False)
data = pd.read_csv("/home/zhangyanzhao/data.csv",header=None)
print("数据集大小")
......
from utils import con_sql
import datetime
def _load_events(table, start_date, fetched_msg, preview_msg):
    """Load one event table from ``start_date`` on and add hour/minute columns."""
    # The date must be a quoted SQL literal: unquoted, 2018-08-03 is parsed
    # as the arithmetic expression 2018 - 08 - 03 instead of a date.
    sql = "select cid,device_id,time,stat_date from {0} where stat_date >= '{1}'".format(table, start_date)
    events = con_sql(sql)
    # con_sql returns positional column labels; give them their real names.
    events = events.rename(columns={0: "cid", 1: "device_id", 2: "time", 3: "stat_date"})
    print(fetched_msg)
    # Derive hour and minute from the time column (read as a Unix timestamp).
    events["hour"] = events["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    events["minute"] = events["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
    events = events.drop("time", axis=1)
    print(preview_msg)
    print(events.head(2))
    return events


def fetch_data(start_date='2018-08-03'):
    """Fetch click and exposure data from the database.

    Args:
        start_date: Lower bound (inclusive) for ``stat_date``, written as a
            ``YYYY-MM-DD`` string.

    Returns:
        A tuple ``(exposure, click, click_device_id)``: the exposure and
        click DataFrames (columns ``cid``, ``device_id``, ``stat_date``,
        ``hour``, ``minute``) and the list of distinct ``device_id`` values
        found in the click table (not restricted by ``start_date``).
    """
    # Distinct device_ids present in the click table.
    sql = "select distinct device_id from data_feed_click"
    click_device_id = con_sql(sql)[0].values.tolist()
    print("成功获取点击表里的device_id")

    click = _load_events("data_feed_click", start_date,
                         "成功获取点击表里的数据", "点击表数据预览")
    exposure = _load_events("data_feed_exposure", start_date,
                            "成功获取曝光表里的数据", "曝光表数据预览")
    return exposure, click, click_device_id
from prepareData import fetch_data
def fetch_test_data():
    """Fetch exposure/click data starting from 2018-08-06.

    NOTE(review): the fetched values are only bound to local names and the
    function returns None - presumably unfinished; confirm the intended
    return value.
    """
    fetched = fetch_data(start_date='2018-08-06')
    exposure, click, click_device_ids = fetched
from utils import *
import datetime
if __name__ == '__main__':
    # Ad-hoc check: load the raw exposure dump and derive the hour column.
    data = pd.read_csv("data/raw-exposure.csv")[["cid","device_id","time"]]
    # A single lambda: the previous nested `lambda x: lambda x: ...` stored
    # function objects in the column instead of the hour value.
    data["hour"] = data["time"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    #data.to_csv("data/data.csv")
    print(data.head())
# encoding = "utf-8"
import pymysql
import pandas as pd
import numpy as np
# Fetch rows from a database table and convert them to a DataFrame.
def con_sql(sql):
    """Execute ``sql`` on the ``jerry_test`` database and return the rows.

    Args:
        sql: SQL statement passed unchanged to ``cursor.execute``.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows, with every row
        that contains a missing value dropped. Columns are positional
        (0, 1, 2, ...) because no column names are supplied.

    Raises:
        Whatever ``pymysql`` raises on connection or query failure; the
        connection is closed before the exception propagates.
    """
    # NOTE(review): connection credentials are hard-coded in source; they
    # should be moved to configuration / environment and rotated.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        # The cursor is a context manager, so it is closed even on error.
        with db.cursor() as cursor:
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        # Always release the connection, including when the query fails.
        db.close()
    return pd.DataFrame(list(result)).dropna()
# FFM format conversion class.
class FFMFormatPandas:
    """Convert a pandas DataFrame into FFM-style text rows.

    Every output row has the form ``label field:feature:value ...``.
    Object-typed columns are emitted as ``field:feature:1`` using the index
    of the ``'<column>_<value>'`` feature; signed-integer columns are emitted
    as ``field:feature:value`` using the column's own feature index. Columns
    of any other dtype kind are not emitted.
    """

    def __init__(self):
        # Column name -> field index (the label column is excluded).
        self.field_index_ = None
        # '<column>_<value>' or column name -> feature index.
        self.feature_index_ = None
        # Name of the label column, or None when there is no label.
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature index maps from ``df``.

        Args:
            df: DataFrame to index.
            y: Name of the label column, or None.

        Returns:
            ``self``, so calls can be chained.
        """
        self.y = y
        if self.field_index_ is None:
            # Fields are numbered over all columns except the label.
            df_ffm = df[df.columns.difference([self.y])]
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = dict()
        # Continue after the highest index already handed out so that a
        # refit never reuses an index that is still assigned to a feature.
        last_idx = max(self.feature_index_.values(), default=-1) + 1

        # All columns are visited, including the label column, which keeps
        # the numbering of a first fit identical to the previous behaviour.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Keep a column's index stable across refits instead of
            # renumbering it on every call.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its FFM-formatted rows."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Format one row as an FFM text line.

        Args:
            row: One DataFrame row as a Series indexed by column name.
            t: Mapping of column name -> dtype for the source DataFrame.

        Returns:
            The label followed by space-separated ``field:feature:value``
            entries. The label is ``'0'`` when no label column is set.
        """
        if self.y is not None:
            # Label-based lookup; positional ``[0]`` on a label-indexed
            # Series is deprecated in pandas.
            ffm = [str(row.loc[self.y])]
        else:
            ffm = [str(0)]

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of FFM text lines, one per row, keyed by ``df``'s index."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment