import time
from prepareData import fetch_data
from utils import FFMFormatPandas
import pandas as pd
from config import *
import pickle


def feature_en():
    """Build the labelled click/exposure sample set and persist candidate cids.

    Fetches the exposure and click tables for
    ``DATA_START_DATE``..``DATA_END_DATE``, removes from the exposure table
    every row that also appears in the click table, keeps only exposures
    whose ``device_id`` is in ``click_device_id``, labels clicks ``y=1`` and
    the remaining exposures ``y=0``, and merges them sorted by ``stat_date``
    in descending order.

    Side effects:
        Writes the distinct ``cid`` values to
        ``DIRECTORY_PATH + "data_set_cid.csv"`` and prints progress
        information to stdout. Adds a ``y`` column to the ``click`` frame
        returned by ``fetch_data``.

    Returns:
        tuple: ``(data, test_number, validation_number)`` where ``data`` is
        the merged frame without the ``stat_date`` column, and the two
        counts are the number of rows whose ``stat_date`` equals
        ``TEST_DATE`` and ``VALIDATION_DATE`` respectively.
    """
    exposure, click, click_device_id = fetch_data(
        start_date=DATA_START_DATE, end_date=DATA_END_DATE)

    # Set difference: exposure rows minus click rows.
    print("曝光表处理前的样本个数")
    print(exposure.shape)

    # The click table is concatenated twice so that every click row occurs at
    # least twice; drop_duplicates(keep=False) then removes all of them along
    # with any matching exposure row. Exposure rows that are duplicated
    # within the exposure table itself are removed as well.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    subset = click.columns.tolist()
    exposure = pd.concat([exposure, click, click])
    exposure = exposure.drop_duplicates(subset=subset, keep=False)
    print("差集后曝光表个数")
    print(exposure.shape)

    # Keep only exposures from devices listed in click_device_id. The explicit
    # copy makes the label assignment below operate on an independent frame
    # rather than on a slice (avoids SettingWithCopyWarning).
    exposure = exposure.loc[exposure["device_id"].isin(click_device_id)].copy()
    print("去除未点击用户后曝光表个数")
    print(exposure.shape)

    # Label the samples: clicks are positives, remaining exposures negatives.
    click["y"] = 1
    exposure["y"] = 0

    print("正样本个数")
    print(click.shape[0])
    print("负样本个数")
    print(exposure.shape[0])

    # Merge the click and exposure tables, newest date first.
    # NOTE(review): the merged frame keeps the original row labels of both
    # inputs, so labels may repeat — confirm downstream code does not rely on
    # a unique index.
    data = pd.concat([click, exposure])
    data = data.sort_values(by="stat_date", ascending=False)
    print("前两行数据")
    print(data.head(2))
    print("后两行数据")
    print(data.tail(2))
    test_number = int((data["stat_date"] == TEST_DATE).sum())
    validation_number = int((data["stat_date"] == VALIDATION_DATE).sum())
    data = data.drop("stat_date", axis=1)

    # Features whose value is 0 are dropped by the FFM format, so remap the
    # zero hour/minute to 24/60; afterwards these columns never hold 0.
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    print(data.head(2))

    # Persist the candidate cids present in the data set.
    cid_df = pd.DataFrame({"cid": data["cid"].unique()})
    print("data_set_cid :")
    print(cid_df.head(2))
    cid_df.to_csv(DIRECTORY_PATH + "data_set_cid.csv", index=False)

    return data, test_number, validation_number


def ffm_transform(data, test_number, validation_number):
    """Convert the data set to FFM format and write test/validation/train files.

    Fits an ``FFMFormatPandas`` transformer on ``data`` (label column ``y``),
    pickles the fitted transformer, writes the whole transformed data set to
    CSV, then splits it positionally into test, validation and train parts
    and writes each part to its own CSV under ``DIRECTORY_PATH``.

    Args:
        data: Frame to transform; must contain the label column ``y``.
        test_number: Number of leading rows that form the test set.
        validation_number: Number of rows following the test rows that form
            the validation set. All remaining rows become the training set.

    Returns:
        None. All results are written to disk; progress is printed to stdout.
    """
    print("Start ffm transform")
    start = time.time()
    ffm_train = FFMFormatPandas()
    data = ffm_train.fit_transform(data, y='y')
    with open(DIRECTORY_PATH+"ffm_{0}_{1}.pkl".format(DATA_START_DATE,DATA_END_DATE), "wb") as f:
        pickle.dump(ffm_train, f)

    print("done transform ffm")
    end = time.time()
    print("ffm转化数据耗时：")
    print(end - start)

    # Round-trip through CSV to get a plain frame with a 0..n-1 index.
    # The file is written WITHOUT a header because it is read back with
    # header=None; otherwise the header line would be parsed as the first
    # data row and shift every split below.
    data_path = DIRECTORY_PATH + "data{0}-{1}.csv".format(DATA_START_DATE, DATA_END_DATE)
    data.to_csv(data_path, index=False, header=False)
    data = pd.read_csv(data_path, header=None)
    print("数据集大小")
    print(data.shape)
    print(data.head(2))

    # Positional, end-exclusive slicing. Label-based `.loc[:test_number]` is
    # end-inclusive and would put test_number + 1 rows into the test set.
    test_end = test_number
    validation_end = test_number + validation_number

    test = data.iloc[:test_end]
    print("测试集大小")
    print(test.shape[0])
    test.to_csv(DIRECTORY_PATH + "test{0}.csv".format(TEST_DATE), index=False, header=False)

    # NOTE: the test date must be later than the validation date, otherwise
    # this positional split may be wrong. It presumably relies on the rows
    # being ordered newest-first with the test rows at the top — verify
    # against the caller.
    validation = data.iloc[test_end:validation_end]
    print("验证集大小")
    print(validation.shape[0])
    validation.to_csv(DIRECTORY_PATH + "validation{0}.csv".format(VALIDATION_DATE), index=False, header=False)

    train = data.iloc[validation_end:]
    print("训练集大小")
    print(train.shape[0])
    # TODO validation date is not the end of train date
    train.to_csv(DIRECTORY_PATH + "train{0}-{1}.csv".format(DATA_START_DATE, VALIDATION_DATE), index=False, header=False)


if __name__ == "__main__":
    # feature_en returns (data, test_number, validation_number); unpack it so
    # ffm_transform receives all three of its required positional arguments.
    data_fe, test_number, validation_number = feature_en()
    ffm_transform(data_fe, test_number, validation_number)


