Commit 83a7534f authored by 张彦钊

add predict, user profile, candidates set, filter cids

parent 724824ce
File deleted
data/
*.pyc
.DS_Store
@@ -5,6 +5,6 @@ TEST_DATE = '2018-08-06'
DATA_START_DATE = '2018-07-05'
DATA_END_DATE = '2018-08-06'
MODEL_VERSION = ''
# processData.py
# diaryTraining.py
@@ -3,12 +3,25 @@ import pandas as pd
from utils import *
from config import *
# Candidate cids may only be chosen from the cids present in the training data set
def filter_cid(df):
    data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
    print("Sample size before filtering:")
    print(df.shape)
    df = df.loc[df["cid"].isin(data_set_cid)]
    print("Sample size after filtering:")
    print(df.shape)
    return df
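Reviewer note: a quick sanity check of the filter with toy data (the inline list stands in for the persisted data_set_cid.csv):

import pandas as pd

# Toy stand-in for the persisted training-set cids
data_set_cid = ["diary_1", "diary_3"]
df = pd.DataFrame({"city_id": ["beijing"] * 3,
                   "cid": ["diary_1", "diary_2", "diary_3"]})
# Keep only rows whose cid appeared in the training data
df = df.loc[df["cid"].isin(data_set_cid)]
print(df)  # rows for diary_1 and diary_3 only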
def get_allCitiesDiaryTop2000():
    # Fetch the nationwide top-2000 diaries by click count
    sql = "select city_id,cid from data_feed_click where cid_type = 'diary' order by click_count_choice desc limit 2000"
    allCitiesTop2000 = con_sql(sql)
    allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
    allCitiesTop2000 = filter_cid(allCitiesTop2000)
    allCitiesTop2000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop2000.csv")
    print("Successfully fetched the nationwide top-2000 diaries by clicks")
    return allCitiesTop2000
@@ -17,7 +30,7 @@ def get_cityList():
    # Fetch the nationwide city list
    sql = "select distinct city_id from data_feed_click"
    cityList = con_sql(sql)
    cityList.to_csv(DIRECTORY_PATH + "diaryTestSet/cityList.csv")
    cityList = cityList[0].values.tolist()
    cityList.remove('worldwide')
    print("Successfully fetched the nationwide city list")
@@ -33,25 +46,19 @@ def get_eachCityDiaryTop2000():
"where cid_type = 'diary' and city_id = '{0}' " \
"order by click_count_choice desc limit 2000".format(i)
data = con_sql(sql)
data = data.rename(columns={0:"city_id",1:"cid"})
if data.shape[0]<2000:
n = 2000-data.shape[0]
data = data.rename(columns={0: "city_id", 1: "cid"})
data = filter_cid(data)
if data.shape[0] < 2000:
n = 2000 - data.shape[0]
# 全国点击量TOP2000日记中去除该城市的日记
temp = allCitiesTop2000[allCitiesTop2000["city_id"]!=i].loc[:n-1]
temp = allCitiesTop2000[allCitiesTop2000["city_id"] != i].loc[:n - 1]
data = data.append(temp)
else:
pass
file_name = DIRECTORY_PATH+"diaryTestSet/{0}DiaryTop2000.csv".format(i)
file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(i)
data.to_csv(file_name)
if __name__ == "__main__":
    get_eachCityDiaryTop2000()
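Reviewer note on the top-up slice above: after a boolean filter the frame keeps its original integer labels, so label-based `.loc[:n - 1]` no longer means "the first n rows". A minimal illustration:

import pandas as pd

top = pd.DataFrame({"city_id": ["bj", "sh", "bj", "gz"], "cid": list("abcd")})
others = top[top["city_id"] != "bj"]   # keeps original labels 1 and 3
n = 2
print(others.loc[:n - 1].shape[0])     # 1 row  -- label slice up to label 1
print(others.head(n).shape[0])         # 2 rows -- positional, what we want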
import xlearn as xl
from config import *
from diaryCandidateSet import get_eachCityDiaryTop2000
print("Start training")
ffm_model = xl.create_ffm()
@@ -20,3 +21,7 @@
ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE, DATA_END_DATE, "0.03", "0.002"),
                  DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(TEST_DATE, DATA_START_DATE, DATA_END_DATE, "0.03", "0.002"))
print('---------------candidates--------------')
get_eachCityDiaryTop2000()
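For context, the xlearn calls above follow the library's standard flow: `setTest` must point at a libffm-format file before `predict`, whose second argument is the output path. A minimal sketch (paths and parameters are placeholders, not the repo's real config):

import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain("train.ffm")            # libffm-format training file
ffm_model.setValidate("validation.ffm")    # used for early stopping
param = {"task": "binary", "lr": 0.03, "lambda": 0.002, "metric": "auc"}
ffm_model.fit(param, "model.out")          # writes the trained model
ffm_model.setTest("test.ffm")
ffm_model.predict("model.out", "output.txt")  # one score per test row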
@@ -5,17 +5,17 @@ import pickle
if __name__ == '__main__':
    data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
    data["y"] = 1
    test_data = data.tail(1)
    ffm = FFMFormatPandas()
    data = ffm.fit_transform(data, y='y')
    data.to_csv("../data/ffm_data.csv", index=False)
    with open("../data/ffm.object", "wb") as f:
        pickle.dump(ffm, f)
    with open("../data/ffm.object", "rb") as f:
        ffm = pickle.load(f)
    result = ffm.transform(test_data)
    print(result)
    data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
    print(data_1)
@@ -5,80 +5,102 @@ import pandas as pd
from config import *
import pickle
def feature_en():
    exposure, click, click_device_id = fetch_data(
        start_date=DATA_START_DATE, end_date=DATA_END_DATE)

    # Set difference: exposure rows minus click rows. click is appended twice
    # so that rows present only in click also occur more than once and get
    # removed by drop_duplicates(keep=False).
    print("Exposure rows before processing:")
    print(exposure.shape)
    exposure = exposure.append(click)
    exposure = exposure.append(click)
    subset = click.columns.tolist()
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("Exposure rows after set difference:")
    print(exposure.shape)
    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
    print("Exposure rows after dropping users who never clicked:")
    print(exposure.shape)

    # Label the samples
    click["y"] = 1
    exposure["y"] = 0
    print("Positive samples:")
    print(click.shape[0])
    print("Negative samples:")
    print(exposure.shape[0])

    # Merge the click and exposure tables
    data = click.append(exposure)
    data = data.sort_values(by="stat_date", ascending=False)
    print("First two rows:")
    print(data.head(2))
    print("Last two rows:")
    print(data.tail(2))
    test_number = data[data["stat_date"] == TEST_DATE].shape[0]
    validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
    data = data.drop("stat_date", axis=1)

    # The FFM format drops features whose value is 0; after the remapping
    # below no feature takes the value 0.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    print(data.head(2))

    # Persist the candidate cids (a DataFrame has no .unique(); take the column)
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df['cid'] = data_set_cid
    print("data_set_cid :")
    print(cid_df.head(2))
    cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)

    return data, test_number, validation_number
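Why `click` is appended twice before `drop_duplicates(keep=False)`: with a single append, a row that exists only in the click table would occur once in the concatenation and wrongly survive the dedup. A toy illustration:

import pandas as pd

exposure = pd.DataFrame({"device_id": [1, 2], "cid": ["a", "b"]})
click = pd.DataFrame({"device_id": [2, 3], "cid": ["b", "c"]})

once = exposure.append(click).drop_duplicates(keep=False)
print(once)   # keeps (1, a) but also the click-only row (3, c) -- wrong

twice = exposure.append(click).append(click).drop_duplicates(keep=False)
print(twice)  # keeps only (1, a): exposure minus click, as intended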
def ffm_transform(data, test_number, validation_number):
    print("Start ffm transform")
    start = time.time()
    ffm_train = FFMFormatPandas()
    data = ffm_train.fit_transform(data, y='y')
    with open(DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE), "wb") as f:
        pickle.dump(ffm_train, f)
    print("done transform ffm")
    end = time.time()
    print("FFM transform took (seconds):")
    print(end - start)

    data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
    data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None)
    print("Data set size:")
    print(data.shape)
    print(data.head(2))

    # .loc slicing is label-inclusive: rows 0 .. test_number - 1 are the test set
    test = data.loc[:test_number - 1]
    print("Test set size:")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
    # Note: the test date must be later than the validation date, otherwise
    # this positional split can cut the data incorrectly.
    validation = data.loc[test_number:(test_number + validation_number - 1)]
    print("Validation set size:")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
    train = data.loc[(test_number + validation_number):]
    print("Training set size:")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
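Background for the hour/minute remapping in feature_en: libffm's text format encodes each sample as `label field:index:value` triples, and a triple whose value is 0 contributes nothing to the FFM score, so a raw hour of 0 would silently disappear. A small demo of the remap (values are illustrative):

import pandas as pd

# One libffm-format row looks like: "1 0:12:1 1:347:1 2:25:1"
df = pd.DataFrame({"hour": [0, 9], "minute": [0, 30]})
df.loc[df["hour"] == 0, ["hour"]] = 24      # 0 -> 24
df.loc[df["minute"] == 0, ["minute"]] = 60  # 0 -> 60
print(df)  # no zero-valued feature remains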
if __name__ == "__main__":
    # feature_en returns a 3-tuple; the transform step needs all three values
    data, test_number, validation_number = feature_en()
    # ffm_transform(data, test_number, validation_number)
from config import *
import pandas as pd
import pickle
import xlearn as xl
import datetime
from userProfile import fetch_user_profile
# Receive a device_id and city_id, and join them onto the corresponding city's
# hot-diary table. Note: the feature order of the prediction set below must
# match the training set.
def device_id_merge(user_profile):
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
    data = pd.read_csv(file_name)
    data["device_id"] = user_profile['device_id']
    now = datetime.datetime.now()
    data["hour"] = now.hour
    data["minute"] = now.minute
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    data["y"] = 0
    data = data.drop("city_id", axis=1)
    print(data.head(2))
    return data
# Load the pickled FFM encoder and convert the table above to FFM format
def transform_ffm_format(ffm_format_pandas, df, device_id):
    data = ffm_format_pandas.transform(df)
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    predict_file_name = DIRECTORY_PATH + "diaryPredictSet/{0}_{1}DiaryTop2000.csv".format(device_id, now)
    data.to_csv(predict_file_name)
    # Return the path of the file just written, not an empty string
    return predict_file_name
# Load the model, predict, and store the diaries' predicted probabilities
# sorted in descending order
def predict(user_profile, ffm_format_pandas):
    device_id = user_profile['device_id']
    instance = device_id_merge(user_profile)
    user_instance_file_path = transform_ffm_format(ffm_format_pandas, instance, device_id)
    ffm_model = xl.create_ffm()
    ffm_model.setTest(user_instance_file_path)
    ffm_model.predict(DIRECTORY_PATH + MODEL_VERSION, "./{0}_output.txt".format(device_id))
def router(device_id):
    user_profile, is_exist = fetch_user_profile(device_id)
    file_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
    with open(file_path, "rb") as f:
        ffm_format_pandas = pickle.load(f)
    if is_exist:
        predict(user_profile, ffm_format_pandas)
    else:
        pass  # TODO: handle devices without a profile
if __name__ == "__main__":
    router(device_id='358035085192742')
    # TODO: predict with some real device_ids
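The comment on predict() promises a descending sort of the predicted probabilities, but that step is not implemented yet. A minimal sketch of what it could look like (rank_diaries, cid_list, and the output path are assumptions; xlearn writes one score per line in test-set row order):

import pandas as pd

def rank_diaries(output_file, cid_list):
    # Attach each xlearn score to its candidate cid, highest score first
    scores = pd.read_csv(output_file, header=None, names=["score"])
    scores["cid"] = cid_list
    return scores.sort_values(by="score", ascending=False)

# ranked = rank_diaries("./358035085192742_output.txt", candidates["cid"].tolist())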
from utils import con_sql


def fetch_user_profile(device_id):
    # TODO: the device_id in the SQL may map to multiple city_ids
    sql = "select device_id,city_id from data_feed_click limit 1"
    user_profile = con_sql(sql)
    # is_exist should be True when a profile was found
    is_exist = not user_profile.empty
    return user_profile, is_exist
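Once the device_id parameter is actually used, the lookup will need to filter on it. A hedged sketch (the real schema may differ, and the multi-city TODO above still applies):

def fetch_user_profile_by_device(device_id):
    # Parameterize on device_id; pick one row deterministically for now
    sql = ("select device_id, city_id from data_feed_click "
           "where device_id = '{0}' limit 1").format(device_id)
    user_profile = con_sql(sql)
    return user_profile, not user_profile.empty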
@@ -71,3 +71,9 @@ class FFMFormatPandas:
    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    def is_feature_index_exist(self, name):
        return name in self.feature_index_
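Usage-wise, the encoder is fit once on training data and the pickled object is reused at predict time so feature indices stay consistent; the new helper lets callers check for features unseen during fit. A sketch (the toy frames and the checked feature name "hour_24" are hypothetical, not the class's documented naming):

import pandas as pd

train_df = pd.DataFrame({"cid": ["a", "b"], "hour": [9, 24], "y": [1, 0]})
ffm = FFMFormatPandas()
train_ffm = ffm.fit_transform(train_df, y='y')   # builds field/feature indices

candidate_df = pd.DataFrame({"cid": ["a"], "hour": [9], "y": [0]})
candidate_ffm = ffm.transform(candidate_df)      # reuses the fitted indices

# Guard against a categorical level the training data never contained
if not ffm.is_feature_index_exist("hour_24"):
    print("feature unseen at fit time; its index is undefined")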