Commit 3a6de1e5 authored by 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

Add a comment and a new class ClkCidUidRate
parents 62b8bc34 11c1c9b8
...@@ -7,12 +7,8 @@ from config import * ...@@ -7,12 +7,8 @@ from config import *
# 候选集cid只能从训练数据集cid中选择 # 候选集cid只能从训练数据集cid中选择
def filter_cid(df): def filter_cid(df):
data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist() data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
print("过滤前样本大小:")
print(df.shape)
if not df.empty: if not df.empty:
df = df.loc[df["cid"].isin(data_set_cid)] df = df.loc[df["cid"].isin(data_set_cid)]
print("过滤后样本大小:")
print(df.shape)
return df return df
......
...@@ -94,17 +94,19 @@ def router(device_id): ...@@ -94,17 +94,19 @@ def router(device_id):
if __name__ == "__main__": if __name__ == "__main__":
# TODO 如果耗时小于一分钟,下一次取到的device_id和上一次相同 # TODO 如果耗时小于一分钟,下一次取到的device_id和上一次相同
while True: while True:
start = time.time() start = time.time()
empty,device_id_list = get_active_users() empty,device_id_list = get_active_users()
if empty: if empty:
time.sleep(10) time.sleep(10)
else: else:
old_device_id_list = pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist()
for device_id in device_id_list: for device_id in device_id_list:
if device_id in old_device_id_list:
router(device_id) router(device_id)
else:
print("该用户不是老用户,不能预测")
end = time.time() end = time.time()
time_cost = (end - start) time_cost = (end - start)
print("预测耗时{}秒".format(time_cost)) print("耗时{}秒".format(time_cost))
...@@ -60,6 +60,13 @@ def feature_en(): ...@@ -60,6 +60,13 @@ def feature_en():
print(cid_df.head(2)) print(cid_df.head(2))
cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False) cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)
# 将device_id 保存。目的是为了判断预测的device_id是否在这个集合里,如果不在,不需要预测
data_set_device_id = data["device_id"].unique()
device_id_df = pd.DataFrame()
device_id_df['device_id'] = data_set_device_id
print("data_set_device_id :")
print(device_id_df.head(2))
device_id_df.to_csv(DIRECTORY_PATH + "data_set_device_id.csv", index=False)
return data, test_number, validation_number return data, test_number, validation_number
...@@ -99,8 +106,5 @@ def ffm_transform(data, test_number, validation_number): ...@@ -99,8 +106,5 @@ def ffm_transform(data, test_number, validation_number):
train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None) train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=None)
if __name__ == "__main__":
data_fe = feature_en()
ffm_transform(data_fe)
...@@ -5,9 +5,10 @@ from diaryCandidateSet import get_eachCityDiaryTop3000 ...@@ -5,9 +5,10 @@ from diaryCandidateSet import get_eachCityDiaryTop3000
# 把数据获取、特征转换、模型训练的模型串联在一起 # 把数据获取、特征转换、模型训练的模型串联在一起
if __name__ == "__main__": if __name__ == "__main__":
data_fe = feature_en() data, test_number, validation_number = feature_en()
ffm_transform(data_fe) ffm_transform(data, test_number, validation_number)
train() train()
print('---------------prepare candidates--------------') print("end")
get_eachCityDiaryTop3000() # print('---------------prepare candidates--------------')
# get_eachCityDiaryTop3000()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment