Commit 9a117ced authored by 张彦钊

change process argument

parent 91605d0d
@@ -15,14 +15,11 @@ def fetch_data(start_date, end_date):
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
     click = con_sql(sql)
     click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
-    print(click.head(5))
     print("Successfully fetched data from the click table")
     # Extract hour from the time feature
     click["hour"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
     click["minute"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
     click = click.drop("time_date", axis=1)
-    print("Click table data preview")
-    print(click.head(2))
     # Fetch data from the exposure table
     sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
@@ -37,7 +34,5 @@ def fetch_data(start_date, end_date):
     exposure["hour"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
     exposure["minute"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
     exposure = exposure.drop("time_date", axis=1)
-    print("Exposure table data preview")
-    print(exposure.head(2))
     return exposure, click, click_device_id
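Side note on the hour/minute extraction kept above: the same features can be computed without a per-row lambda via pandas' datetime accessor. A minimal sketch, assuming time_date holds unix timestamps in seconds (the toy frame below is made up, not from this repo):

import pandas as pd

# Toy stand-in for the click frame; time_date is assumed to hold unix timestamps
# in seconds, which is what the fromtimestamp() calls above imply.
click = pd.DataFrame({"time_date": [1533112800, 1533156000]})

# Vectorized equivalent of the per-row fromtimestamp(x).hour / .minute mapping.
# Note: fromtimestamp() uses the machine's local timezone, while to_datetime(unit="s")
# yields naive UTC, so the two only agree on a UTC server (otherwise tz-convert first).
ts = pd.to_datetime(click["time_date"], unit="s")
click["hour"] = ts.dt.hour
click["minute"] = ts.dt.minute
click = click.drop("time_date", axis=1)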
@@ -36,11 +36,8 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
     # Merge the click table and the exposure table
     data = click.append(exposure)
-    print("Click table and exposure table merged successfully")
     data = data.sort_values(by="stat_date", ascending=False)
-    print("First two rows")
-    print(data.head(2))
-    print("Last two rows")
-    print(data.tail(2))
     test_number = data[data["stat_date"] == test_date].shape[0]
     validation_number = data[data["stat_date"] == validation_date].shape[0]
     data = data.drop("stat_date", axis=1)
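For context on test_number and validation_number: because the merged frame is sorted by stat_date in descending order, those two counts let the pipeline slice the newest day off as the test set and the next day as the validation set (the actual slicing happens later, in ffm_transform, via .loc on the re-read CSV). A minimal sketch with made-up dates:

import pandas as pd

# Toy merged click/exposure frame; dates and cids are made up for illustration.
data = pd.DataFrame({"stat_date": ["2018-08-05", "2018-08-05", "2018-08-04", "2018-08-03"],
                     "cid": [1, 2, 3, 4]})
test_date, validation_date = "2018-08-05", "2018-08-04"

data = data.sort_values(by="stat_date", ascending=False).reset_index(drop=True)
test_number = data[data["stat_date"] == test_date].shape[0]
validation_number = data[data["stat_date"] == validation_date].shape[0]

# Newest-first ordering means the first test_number rows belong to the test day,
# the next validation_number rows to the validation day, and the rest to training.
test = data.iloc[:test_number]
validation = data.iloc[test_number:test_number + validation_number]
train = data.iloc[test_number + validation_number:]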
@@ -50,23 +47,20 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
     data.loc[data["minute"] == 0, ["minute"]] = 60
     data["hour"] = data["hour"].astype("category")
     data["minute"] = data["minute"].astype("category")
-    print(data.head(2))
     # Persist the candidate cids
     data_set_cid = data["cid"].unique()
     cid_df = pd.DataFrame()
     cid_df['cid'] = data_set_cid
-    print("data_set_cid :")
-    print(cid_df.head(2))
     cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)
-    print("Successfully saved data_set_cid")
     # Save device_id so we can check whether a device_id to be predicted is in this set; if it is not, no prediction is needed
     data_set_device_id = data["device_id"].unique()
     device_id_df = pd.DataFrame()
     device_id_df['device_id'] = data_set_device_id
-    print("data_set_device_id :")
-    print(device_id_df.head(2))
     device_id_df.to_csv(DIRECTORY_PATH + "data_set_device_id.csv", index=False)
-    print("Successfully saved data_set_device_id")
     return data, test_number, validation_number
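For reference, a hedged sketch of how the two CSVs written above could be consumed at prediction time; the should_predict helper is an assumption for illustration and is not part of this file:

import pandas as pd

# Assumed helper, not present in this repo: skip devices that never appeared
# in the training window, as the comment above describes.
def should_predict(device_id, directory_path):
    known = set(pd.read_csv(directory_path + "data_set_device_id.csv")["device_id"])
    return device_id in known

# Candidate cids can be loaded the same way from data_set_cid.csv.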
@@ -75,7 +69,7 @@ def ffm_transform(data, test_number, validation_number):
     print("Start ffm transform")
     start = time.time()
     ffm_train = multiFFMFormatPandas()
-    data = ffm_train.fit_transform(data, y='y',n=100000,processes=6)
+    data = ffm_train.fit_transform(data, y='y',n=80000,processes=5)
     with open(DIRECTORY_PATH+"ffm.pkl", "wb") as f:
         pickle.dump(ffm_train, f)
@@ -88,7 +82,6 @@ def ffm_transform(data, test_number, validation_number):
     data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
     print("Dataset size")
     print(data.shape)
-    print(data.head(2))
     test = data.loc[:test_number]
     print("Test set size")
......
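The substantive change in this commit is the fit_transform call in ffm_transform above: the chunk size n drops from 100000 to 80000 rows and the worker count from 6 to 5 processes. multiFFMFormatPandas is project-internal code whose implementation is not shown here; the sketch below only illustrates the generic chunk-plus-process-pool pattern that the n and processes arguments suggest (smaller chunks and fewer workers trade throughput for lower peak memory), and is not the real class:

from multiprocessing import Pool

import pandas as pd

def transform_chunk(chunk):
    # Placeholder for the real per-row FFM string encoding.
    return chunk.astype(str)

def parallel_transform(data, n=80000, processes=5):
    # Split the frame into slices of at most n rows and encode them in a worker pool.
    chunks = [data.iloc[i:i + n] for i in range(0, len(data), n)]
    with Pool(processes) as pool:
        parts = pool.map(transform_chunk, chunks)
    return pd.concat(parts)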