Commit 735c83c9 authored by 赵威

add size

parent daaaffce
...@@ -12,23 +12,9 @@ def read_csv_data(dataset_path): ...@@ -12,23 +12,9 @@ def read_csv_data(dataset_path):
diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|") diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("diary_click.csv"), sep="|") click_df = pd.read_csv(dataset_path.joinpath("diary_click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"), sep="|") conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"), sep="|")
# TODO remove sample
# return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
return device_df, diary_df, click_df, conversion_df return device_df, diary_df, click_df, conversion_df
# def _get_data_from_redis(key):
# column_key = key + ":column"
# d = redis_db_client.hgetall(key)
# tmp = d.values()
# lists = []
# for i in tmp:
# lists.append(str(i, "utf-8").split("|"))
# columns = str(redis_db_client.get(column_key), "utf-8").split("|")
# df = pd.DataFrame(lists, columns=columns)
# return df
def get_device_dict_from_redis(): def get_device_dict_from_redis():
db_key = "cvr:db:device" db_key = "cvr:db:device"
column_key = db_key + ":column" column_key = db_key + ":column"
...@@ -100,7 +86,7 @@ def device_feature_engineering(df): ...@@ -100,7 +86,7 @@ def device_feature_engineering(df):
nullseries = device_df.isnull().sum() nullseries = device_df.isnull().sum()
print("device:") print("device:")
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(device_df.size) print(device_df.size)
device_columns = [ device_columns = [
"device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history", "device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
...@@ -142,7 +128,7 @@ def diary_feature_engineering(df, from_redis=False): ...@@ -142,7 +128,7 @@ def diary_feature_engineering(df, from_redis=False):
print("diary:") print("diary:")
nullseries = diary_df.isnull().sum() nullseries = diary_df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(diary_df.size) print(diary_df.size)
diary_columns = [ diary_columns = [
"card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level", "topic_num", "favor_num", "vote_num",
...@@ -165,7 +151,7 @@ def click_feature_engineering(click_df, conversion_df): ...@@ -165,7 +151,7 @@ def click_feature_engineering(click_df, conversion_df):
print("click:") print("click:")
nullseries = cc_df.isnull().sum() nullseries = cc_df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(cc_df.size) print(cc_df.size)
return cc_df return cc_df
...@@ -223,7 +209,7 @@ def join_features(device_df, diary_df, cc_df): ...@@ -223,7 +209,7 @@ def join_features(device_df, diary_df, cc_df):
print("df:") print("df:")
nullseries = df.isnull().sum() nullseries = df.isnull().sum()
print(nullseries[nullseries > 0]) print(nullseries[nullseries > 0])
# print(df.size) print(df.size)
drop_columns = [ drop_columns = [
"cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands", "cl_id", "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
...@@ -238,69 +224,6 @@ def join_features(device_df, diary_df, cc_df): ...@@ -238,69 +224,6 @@ def join_features(device_df, diary_df, cc_df):
return df return df
# def join_device_diary(device_id, diary_ids, device_df, diary_df):
# a_df = device_df.loc[device_df["device_id"] == device_id]
# b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)]
# b_df["device_id"] = device_id
# df = pd.merge(a_df, b_df, how="left", on="device_id")
# df["first_demands"] = df[["first_demands_x", "first_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_demands"] = df[["second_demands_x", "second_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["first_solutions"] = df[["first_solutions_x", "first_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_solutions"] = df[["second_solutions_x", "second_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["first_positions"] = df[["first_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["second_positions"] = df[["second_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["projects"] = df[["projects_x", "projects_y"]].apply(lambda x: common_elements(*x), axis=1)
# df["device_fd"] = df["first_demands_x"].apply(lambda x: nth_element(x, 0))
# df["device_sd"] = df["second_demands_x"].apply(lambda x: nth_element(x, 0))
# df["device_fs"] = df["first_solutions_x"].apply(lambda x: nth_element(x, 0))
# df["device_ss"] = df["second_solutions_x"].apply(lambda x: nth_element(x, 0))
# df["device_fp"] = df["first_positions_x"].apply(lambda x: nth_element(x, 0))
# df["device_sp"] = df["second_positions_x"].apply(lambda x: nth_element(x, 0))
# df["device_p"] = df["projects_x"].apply(lambda x: nth_element(x, 0))
# df["content_fd"] = df["first_demands_y"].apply(lambda x: nth_element(x, 0))
# df["content_sd"] = df["second_demands_y"].apply(lambda x: nth_element(x, 0))
# df["content_fs"] = df["first_solutions_y"].apply(lambda x: nth_element(x, 0))
# df["content_ss"] = df["second_solutions_y"].apply(lambda x: nth_element(x, 0))
# df["content_fp"] = df["first_positions_y"].apply(lambda x: nth_element(x, 0))
# df["content_sp"] = df["second_positions_y"].apply(lambda x: nth_element(x, 0))
# df["content_p"] = df["projects_y"].apply(lambda x: nth_element(x, 0))
# df["fd1"] = df["first_demands"].apply(lambda x: nth_element(x, 0))
# df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
# df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
# df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
# df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
# df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
# df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
# df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
# df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
# df["ss1"] = df["second_solutions"].apply(lambda x: nth_element(x, 0))
# df["ss2"] = df["second_solutions"].apply(lambda x: nth_element(x, 1))
# df["ss3"] = df["second_solutions"].apply(lambda x: nth_element(x, 2))
# df["fp1"] = df["first_positions"].apply(lambda x: nth_element(x, 0))
# df["fp2"] = df["first_positions"].apply(lambda x: nth_element(x, 1))
# df["fp3"] = df["first_positions"].apply(lambda x: nth_element(x, 2))
# df["sp1"] = df["second_positions"].apply(lambda x: nth_element(x, 0))
# df["sp2"] = df["second_positions"].apply(lambda x: nth_element(x, 1))
# df["sp3"] = df["second_positions"].apply(lambda x: nth_element(x, 2))
# df["p1"] = df["projects"].apply(lambda x: nth_element(x, 0))
# df["p2"] = df["projects"].apply(lambda x: nth_element(x, 1))
# df["p3"] = df["projects"].apply(lambda x: nth_element(x, 2))
# drop_columns = [
# "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
# "first_solutions_x", "first_solutions_y", "first_solutions", "second_solutions_x", "second_solutions_y",
# "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
# "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
# ]
# df.drop(drop_columns, inplace=True, axis=1)
# return df
def device_diary_fe(device_id, diary_ids, device_dict, diary_dict): def device_diary_fe(device_id, diary_ids, device_dict, diary_dict):
time_1 = timeit.default_timer() time_1 = timeit.default_timer()
device_info = device_dict.get(device_id, {}).copy() device_info = device_dict.get(device_id, {}).copy()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment