Commit 83a7534f authored by 张彦钊's avatar 张彦钊

add predict, user profile, candidates set, filter cids

parent 724824ce
File deleted
data/ data/
*.pyc *.pyc
.DS_Store
...@@ -5,6 +5,6 @@ TEST_DATE = '2018-08-06' ...@@ -5,6 +5,6 @@ TEST_DATE = '2018-08-06'
DATA_START_DATE = '2018-07-05' DATA_START_DATE = '2018-07-05'
DATA_END_DATE = '2018-08-06' DATA_END_DATE = '2018-08-06'
MODEL_VERSION = ''
# processData.py # processData.py
# diaryTraining.py # diaryTraining.py
...@@ -3,12 +3,25 @@ import pandas as pd ...@@ -3,12 +3,25 @@ import pandas as pd
from utils import * from utils import *
from config import * from config import *
# 候选集cid只能从训练数据集cid中选择
def filter_cid(df):
    """Restrict the candidate frame to cids that appear in the training set.

    Candidate cids must come from the training data, otherwise the ffm
    encoder has never seen them; rows with unknown cids are dropped.
    Prints the sample size before and after filtering.
    """
    allowed_cids = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
    print("过滤前样本大小:")
    print(df.shape)
    kept = df.loc[df["cid"].isin(allowed_cids)]
    print("过滤后样本大小:")
    print(kept.shape)
    return kept
def get_allCitiesDiaryTop2000(): def get_allCitiesDiaryTop2000():
# 获取全国点击量TOP2000日记 # 获取全国点击量TOP2000日记
sql = "select city_id,cid from data_feed_click where cid_type = 'diary' order by click_count_choice desc limit 2000" sql = "select city_id,cid from data_feed_click where cid_type = 'diary' order by click_count_choice desc limit 2000"
allCitiesTop2000 = con_sql(sql) allCitiesTop2000 = con_sql(sql)
allCitiesTop2000 = allCitiesTop2000.rename(columns={0:"city_id",1:"cid"}) allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
allCitiesTop2000.to_csv(DIRECTORY_PATH+"diaryTestSet/allCitiesDiaryTop2000.csv") allCitiesTop2000 = filter_cid(allCitiesTop2000)
allCitiesTop2000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop2000.csv")
print("成功获取全国日记点击量TOP2000") print("成功获取全国日记点击量TOP2000")
return allCitiesTop2000 return allCitiesTop2000
...@@ -17,7 +30,7 @@ def get_cityList(): ...@@ -17,7 +30,7 @@ def get_cityList():
# 获取全国城市列表 # 获取全国城市列表
sql = "select distinct city_id from data_feed_click" sql = "select distinct city_id from data_feed_click"
cityList = con_sql(sql) cityList = con_sql(sql)
cityList.to_csv(DIRECTORY_PATH+"diaryTestSet/cityList.csv") cityList.to_csv(DIRECTORY_PATH + "diaryTestSet/cityList.csv")
cityList = cityList[0].values.tolist() cityList = cityList[0].values.tolist()
cityList.remove('worldwide') cityList.remove('worldwide')
print("成功获取全国城市列表") print("成功获取全国城市列表")
...@@ -33,25 +46,19 @@ def get_eachCityDiaryTop2000(): ...@@ -33,25 +46,19 @@ def get_eachCityDiaryTop2000():
"where cid_type = 'diary' and city_id = '{0}' " \ "where cid_type = 'diary' and city_id = '{0}' " \
"order by click_count_choice desc limit 2000".format(i) "order by click_count_choice desc limit 2000".format(i)
data = con_sql(sql) data = con_sql(sql)
data = data.rename(columns={0:"city_id",1:"cid"}) data = data.rename(columns={0: "city_id", 1: "cid"})
data = filter_cid(data)
if data.shape[0]<2000: if data.shape[0] < 2000:
n = 2000-data.shape[0] n = 2000 - data.shape[0]
# 全国点击量TOP2000日记中去除该城市的日记 # 全国点击量TOP2000日记中去除该城市的日记
temp = allCitiesTop2000[allCitiesTop2000["city_id"]!=i].loc[:n-1] temp = allCitiesTop2000[allCitiesTop2000["city_id"] != i].loc[:n - 1]
data = data.append(temp) data = data.append(temp)
else: else:
pass pass
file_name = DIRECTORY_PATH+"diaryTestSet/{0}DiaryTop2000.csv".format(i) file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(i)
data.to_csv(file_name) data.to_csv(file_name)
if __name__ == "__main__": if __name__ == "__main__":
get_eachCityDiaryTop2000() get_eachCityDiaryTop2000()
import xlearn as xl import xlearn as xl
from config import * from config import *
from diaryCandidateSet import get_eachCityDiaryTop2000
print("Start training") print("Start training")
ffm_model = xl.create_ffm() ffm_model = xl.create_ffm()
...@@ -20,3 +21,7 @@ ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DA ...@@ -20,3 +21,7 @@ ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DA
DATA_END_DATE,"0.03","0.002"), DATA_END_DATE,"0.03","0.002"),
DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(TEST_DATE, DIRECTORY_PATH + "testset{0}_output_model_{1}-{2}_lr{3}_lambda{4}.txt".format(TEST_DATE,
DATA_START_DATE,DATA_END_DATE,"0.03","0.002")) DATA_START_DATE,DATA_END_DATE,"0.03","0.002"))
print('---------------candidates--------------')
get_eachCityDiaryTop2000()
...@@ -5,17 +5,17 @@ import pickle ...@@ -5,17 +5,17 @@ import pickle
if __name__ == '__main__': if __name__ == '__main__':
data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]] data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
data["y"] = 1 data["y"] = 1
test_data = data.tail(5) test_data = data.tail(1)
ffm = FFMFormatPandas() ffm = FFMFormatPandas()
data = ffm.fit_transform(data, y='y') data = ffm.fit_transform(data, y='y')
data.to_csv("ffm_data.csv", index=False) data.to_csv("../data/ffm_data.csv", index=False)
with open("ffm.object", "wb") as f: with open("../data/ffm.object", "wb") as f:
pickle.dump(ffm, f) pickle.dump(ffm, f)
with open("ffm.object", "rb") as f: with open("../data/ffm.object", "rb") as f:
ffm = pickle.load(f) ffm = pickle.load(f)
result = ffm.transform(test_data) result = ffm.transform(test_data)
print(result) print(result)
data_1 = pd.read_csv("ffm_data.csv", header=None).tail(5) data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
print(data_1) print(data_1)
...@@ -5,80 +5,102 @@ import pandas as pd ...@@ -5,80 +5,102 @@ import pandas as pd
from config import * from config import *
import pickle import pickle
exposure, click, click_device_id = fetch_data(
start_date=DATA_START_DATE, end_date=DATA_END_DATE) def feature_en():
exposure, click, click_device_id = fetch_data(
# 求曝光表和点击表的差集合 start_date=DATA_START_DATE, end_date=DATA_END_DATE)
print("曝光表处理前的样本个数")
print(exposure.shape) # 求曝光表和点击表的差集合
print("曝光表处理前的样本个数")
exposure = exposure.append(click) print(exposure.shape)
exposure = exposure.append(click)
subset = click.columns.tolist() exposure = exposure.append(click)
exposure = exposure.drop_duplicates(subset=subset, keep=False) exposure = exposure.append(click)
print("差集后曝光表个数") subset = click.columns.tolist()
print(exposure.shape) exposure = exposure.drop_duplicates(subset=subset, keep=False)
print("差集后曝光表个数")
exposure = exposure.loc[exposure["device_id"].isin(click_device_id)] print(exposure.shape)
print("去除未点击用户后曝光表个数")
print(exposure.shape) exposure = exposure.loc[exposure["device_id"].isin(click_device_id)]
print("去除未点击用户后曝光表个数")
# 打标签 print(exposure.shape)
click["y"] = 1
exposure["y"] = 0 # 打标签
click["y"] = 1
print("正样本个数") exposure["y"] = 0
print(click.shape[0])
print("负样本个数") print("正样本个数")
print(exposure.shape[0]) print(click.shape[0])
print("负样本个数")
# 合并点击表和曝光表 print(exposure.shape[0])
data = click.append(exposure)
data = data.sort_values(by="stat_date", ascending=False) # 合并点击表和曝光表
print("前两行数据") data = click.append(exposure)
print(data.head(2)) data = data.sort_values(by="stat_date", ascending=False)
print("后两行数据") print("前两行数据")
print(data.tail(2)) print(data.head(2))
test_number = data[data["stat_date"] == TEST_DATE].shape[0] print("后两行数据")
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0] print(data.tail(2))
data = data.drop("stat_date", axis=1) test_number = data[data["stat_date"] == TEST_DATE].shape[0]
validation_number = data[data["stat_date"] == VALIDATION_DATE].shape[0]
# 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征 data = data.drop("stat_date", axis=1)
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60 # 数值是0的特征会被ffm格式删除,经过下面的处理后,没有数值是0的特征
data["hour"] = data["hour"].astype("category") data.loc[data["hour"] == 0, ["hour"]] = 24
data["minute"] = data["minute"].astype("category") data.loc[data["minute"] == 0, ["minute"]] = 60
print(data.head(2)) data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print("Start ffm transform") print(data.head(2))
start = time.time()
ffm_train = FFMFormatPandas() # 持久化候选cid
data = ffm_train.fit_transform(data, y='y') data_set_cid = data[["cid"]].unique()
with open(DIRECTORY_PATH+"ffm_{0}_{1}.pkl".format(DATA_START_DATE,DATA_END_DATE), "wb") as f: cid_df = pd.DataFrame()
pickle.dump(ffm_train, f) cid_df['cid'] = data_set_cid
print("data_set_cid :")
print("done transform ffm") print(cid_df.head(2))
end = time.time() cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)
print("ffm转化数据耗时:")
print(end - start) return data, test_number, validation_number
data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None) def ffm_transform(data, test_number, validation_number):
print("数据集大小")
print(data.shape) print("Start ffm transform")
print(data.head(2)) start = time.time()
ffm_train = FFMFormatPandas()
test = data.loc[:test_number] data = ffm_train.fit_transform(data, y='y')
print("测试集大小") with open(DIRECTORY_PATH+"ffm_{0}_{1}.pkl".format(DATA_START_DATE,DATA_END_DATE), "wb") as f:
print(test.shape[0]) pickle.dump(ffm_train, f)
test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
# 注意:测试集的日期一定要大于验证集,否则数据切割可能会出现错误 print("done transform ffm")
validation = data.loc[(test_number + 1):(test_number + validation_number)] end = time.time()
print("验证集大小") print("ffm转化数据耗时:")
print(validation.shape[0]) print(end - start)
validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
train = data.loc[(test_number + validation_number + 1):] data.to_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), index=False)
print("训练集大小") data = pd.read_csv(DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE), header=None)
print(train.shape[0]) print("数据集大小")
# TODO validation date is not the end of train date print(data.shape)
train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None) print(data.head(2))
test = data.loc[:test_number]
print("测试集大小")
print(test.shape[0])
test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=None)
# 注意:测试集的日期一定要大于验证集,否则数据切割可能会出现错误
validation = data.loc[(test_number + 1):(test_number + validation_number)]
print("验证集大小")
print(validation.shape[0])
validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=None)
train = data.loc[(test_number + validation_number + 1):]
print("训练集大小")
print(train.shape[0])
# TODO validation date is not the end of train date
train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
if __name__ == "__main__":
    # feature_en() returns (data, test_number, validation_number); the tuple is
    # kept whole here because the ffm step below is still disabled.
    data_fe = feature_en()
    # NOTE(review): ffm_transform expects the tuple unpacked into three
    # arguments — re-enable as ffm_transform(*data_fe) when ready.
    # ffm_transform(data_fe)
from config import *
import pandas as pd
import pickle
import xlearn as xl
import datetime
from userProfile import fetch_user_profile
# 接收device_id、city_id # 接收device_id、city_id
# 将device_id、city_id拼接到对应的城市热门日记表
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
def device_id_merge(user_profile):
    """Attach one user's device_id and the current time to that user's
    city-level Top-2000 diary candidate table.

    The resulting columns must stay in the same order as the training
    features.  ``user_profile`` is expected to expose ``city_id`` and
    ``device_id`` by key.
    """
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
    candidates = pd.read_csv(file_name)
    candidates["device_id"] = user_profile['device_id']

    moment = datetime.datetime.now()
    candidates["hour"] = moment.hour
    candidates["minute"] = moment.minute
    # ffm format drops zero-valued features, so map 0 -> 24 / 0 -> 60
    # exactly as the training pipeline does.
    candidates.loc[candidates["hour"] == 0, ["hour"]] = 24
    candidates.loc[candidates["minute"] == 0, ["minute"]] = 60
    candidates["hour"] = candidates["hour"].astype("category")
    candidates["minute"] = candidates["minute"].astype("category")
    # Dummy label column required by the ffm encoder.
    candidates["y"] = 0
    candidates = candidates.drop("city_id", axis=1)
    print(candidates.head(2))
    return candidates
# 把ffm.pkl load进来,将上面的表转化为ffm格式 # 把ffm.pkl load进来,将上面的表转化为ffm格式
def transform_ffm_format(ffm_format_pandas, df, device_id):
    """Encode *df* with the fitted ffm encoder and persist it for prediction.

    :param ffm_format_pandas: fitted FFMFormatPandas instance (its
        ``transform`` is applied to *df*).
    :param df: candidate DataFrame to encode.
    :param device_id: used (with a timestamp) to name the output file.
    :return: path of the csv file that was written.
    """
    data = ffm_format_pandas.transform(df)
    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    predict_file_name = DIRECTORY_PATH + "diaryPredictSet/{0}_{1}DiaryTop2000.csv".format(device_id, now)
    data.to_csv(predict_file_name)
    # Bug fix: previously returned a hard-coded '' — callers feeding the
    # result into ffm_model.setTest got an unusable empty path.  Return the
    # file that was actually written.
    return predict_file_name
# 将模型加载,预测,把预测日记的概率值按照降序排序,存到一个表里 # 将模型加载,预测,把预测日记的概率值按照降序排序,存到一个表里
def predict(user_profile):
    # Load the trained FFM model and score the candidate diaries for one user.
    ffm_model = xl.create_ffm()
    # NOTE(review): `device_id` is undefined in this scope — the parameter is
    # `user_profile` (likely intended: user_profile['device_id']); this raises
    # NameError as written.
    # NOTE(review): device_id_merge returns a DataFrame, not a file path, but
    # setTest expects a path; presumably transform_ffm_format was meant to be
    # called in between — confirm the intended pipeline.
    user_instance_file_path = device_id_merge(device_id)
    ffm_model.setTest(user_instance_file_path)
    ffm_model.predict(DIRECTORY_PATH + MODEL_VERSION, "./{0}_output.txt".format(device_id))
def router(device_id):
    """Entry point for scoring one device: fetch its profile, load the fitted
    ffm encoder, and run prediction when the profile exists.

    :param device_id: device identifier to look up and score.
    """
    user_profile, is_exist = fetch_user_profile(device_id)

    # Load the ffm encoder fitted on the training date range.
    # NOTE(review): ffm_format_pandas is loaded but not used yet; presumably
    # it should be passed to transform_ffm_format — confirm.
    file_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
    with open(file_path, "rb") as f:
        ffm_format_pandas = pickle.load(f)

    if is_exist:
        # Bug fix: predict() was called without its required user_profile
        # argument, which always raised TypeError.
        predict(user_profile)
    else:
        pass  # do something
if __name__ == "__main__":
    # Smoke-test the routing pipeline with one known device id.
    router(device_id='358035085192742')
# 预测一些真实的device_id # 预测一些真实的device_id
\ No newline at end of file
from utils import con_sql
def fetch_user_profile(device_id):
    """Fetch the (device_id, city_id) profile row for a device.

    :param device_id: device identifier (currently unused — the query is a
        hard-coded ``limit 1`` placeholder, see TODO).
    :return: tuple ``(user_profile, is_exist)`` where ``is_exist`` is True
        when a profile row was found.
    """
    # TODO sql语句中的device_id可能对应多个city_id
    sql = "select device_id,city_id from data_feed_click limit 1"
    user_profile = con_sql(sql)
    # Bug fix: the flag was `user_profile.empty`, i.e. True when the profile
    # does NOT exist — callers gate prediction on `is_exist`, so invert it.
    is_exist = not user_profile.empty
    return user_profile, is_exist
...@@ -71,3 +71,9 @@ class FFMFormatPandas: ...@@ -71,3 +71,9 @@ class FFMFormatPandas:
def transform(self, df): def transform(self, df):
t = df.dtypes.to_dict() t = df.dtypes.to_dict()
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()}) return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
def is_feature_index_exist(self, name):
    """Return True if *name* is already registered in the feature index.

    Idiom: the membership test already yields a bool, so return it
    directly instead of branching on it.
    """
    return name in self.feature_index_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment