Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

Add the count to the result format

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
Add the count to the result format
65dcce4a · 高雅喆 · 8cf15903 · e521d2ea · 65dcce4a · 65dcce4a
Commit 65dcce4a authored Aug 17, 2018 by 高雅喆
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 5 additions and 17 deletions

prepareData.py prepareData.py +0 -5

processData.py processData.py +4 -11

utils.py utils.py +1 -1

No files found.
--- a/prepareData.py
+++ b/prepareData.py
@@ -15,14 +15,11 @@ def fetch_data(start_date, end_date):
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    click = con_sql(sql)
    click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
-    print(click.head(5))
    print("成功获取点击表里的数据")
    # 从time特征中抽取hour
    click["hour"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    click["minute"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
    click = click.drop("time_date", axis=1)
-    print("点击表数据预览")
-    print(click.head(2))

    # 获取曝光表里的数据
    sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
@@ -37,7 +34,5 @@ def fetch_data(start_date, end_date):
    exposure["hour"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    exposure["minute"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
    exposure = exposure.drop("time_date", axis=1)
-    print("曝光表数据预览")
-    print(exposure.head(2))

    return exposure, click, click_device_id
--- a/processData.py
+++ b/processData.py
@@ -36,11 +36,8 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):

    # 合并点击表和曝光表
    data = click.append(exposure)
+    print("点击表和曝光表合并成功")
    data = data.sort_values(by="stat_date", ascending=False)
-    print("前两行数据")
-    print(data.head(2))
-    print("后两行数据")
-    print(data.tail(2))
    test_number = data[data["stat_date"] == test_date].shape[0]
    validation_number = data[data["stat_date"] == validation_date].shape[0]
    data = data.drop("stat_date", axis=1)
@@ -50,23 +47,20 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
-    print(data.head(2))

    # 持久化候选cid
    data_set_cid = data["cid"].unique()
    cid_df = pd.DataFrame()
    cid_df['cid'] = data_set_cid
-    print("data_set_cid :")
-    print(cid_df.head(2))
    cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)
+    print("成功保存data_set_cid")

    # 将device_id 保存,目的是为了判断预测的device_id是否在这个集合里,如果不在，不需要预测
    data_set_device_id = data["device_id"].unique()
    device_id_df = pd.DataFrame()
    device_id_df['device_id'] = data_set_device_id
-    print("data_set_device_id :")
-    print(device_id_df.head(2))
    device_id_df.to_csv(DIRECTORY_PATH + "data_set_device_id.csv", index=False)
+    print("成功保存data_set_device_id")
    return data, test_number, validation_number


@@ -75,7 +69,7 @@ def ffm_transform(data, test_number, validation_number):
    print("Start ffm transform")
    start = time.time()
    ffm_train = multiFFMFormatPandas()
-    data = ffm_train.fit_transform(data, y='y',n=100000,processes=6)
+    data = ffm_train.fit_transform(data, y='y',n=50000,processes=4)
    with open(DIRECTORY_PATH+"ffm.pkl", "wb") as f:
        pickle.dump(ffm_train, f)

@@ -88,7 +82,6 @@ def ffm_transform(data, test_number, validation_number):
    data = pd.read_csv(DIRECTORY_PATH + "total_ffm_data.csv", header=None)
    print("数据集大小")
    print(data.shape)
-    print(data.head(2))

    test = data.loc[:test_number]
    print("测试集大小")

--- a/utils.py
+++ b/utils.py
@@ -89,7 +89,7 @@ class multiFFMFormatPandas:

        return self

-    def fit_transform(self, df, y=None,n=200000,processes=8):
+    def fit_transform(self, df, y=None,n=50000,processes=4):
        # n是每个线程运行最大的数据条数,processes是线程数
        self.fit(df, y)
        n = n