Commit 89b8f476 authored by 赵威

tag numbers for diary

parent 250f1b3f
...@@ -72,6 +72,13 @@ _int_columns = [ ...@@ -72,6 +72,13 @@ _int_columns = [
"sixty_browse_user_num", "sixty_browse_user_num",
"ninety_browse_user_num", "ninety_browse_user_num",
"history_browse_user_num", "history_browse_user_num",
"first_demands_num",
"second_demands_num",
"first_solutions_num",
"second_solutions_num",
"first_positions_num",
"second_positions_num",
"projects_num",
] ]
_float_columns = [ _float_columns = [
"one_ctr", "one_ctr",
......
...@@ -101,6 +101,13 @@ DIARY_COLUMNS = [ ...@@ -101,6 +101,13 @@ DIARY_COLUMNS = [
"first_positions", "first_positions",
"second_positions", "second_positions",
"projects", "projects",
"first_demands_num",
"second_demands_num",
"first_solutions_num",
"second_solutions_num",
"first_positions_num",
"second_positions_num",
"projects_num",
] ]
INT_COLUMNS = [ INT_COLUMNS = [
"active_days", "active_days",
...@@ -163,6 +170,13 @@ INT_COLUMNS = [ ...@@ -163,6 +170,13 @@ INT_COLUMNS = [
"sixty_browse_user_num", "sixty_browse_user_num",
"ninety_browse_user_num", "ninety_browse_user_num",
"history_browse_user_num", "history_browse_user_num",
"first_demands_num",
"second_demands_num",
"first_solutions_num",
"second_solutions_num",
"first_positions_num",
"second_positions_num",
"projects_num",
] ]
FLOAT_COLUMNS = [ FLOAT_COLUMNS = [
"one_ctr", "one_ctr",
...@@ -226,6 +240,9 @@ def get_diary_dict_from_redis(): ...@@ -226,6 +240,9 @@ def get_diary_dict_from_redis():
"second_positions", "projects" "second_positions", "projects"
]: ]:
tmp[col_name] = elem.split(",") tmp[col_name] = elem.split(",")
if "" in tmp[col_name]:
tmp[col_name].remove("")
tmp[col_name + "_num"] = len(tmp[col_name])
elif col_name in ["is_pure_author", "is_have_pure_reply", "is_have_reply"]: elif col_name in ["is_pure_author", "is_have_pure_reply", "is_have_reply"]:
if elem == "true": if elem == "true":
tmp[col_name] = 1 tmp[col_name] = 1
...@@ -256,6 +273,14 @@ def diary_feature_engineering(df): ...@@ -256,6 +273,14 @@ def diary_feature_engineering(df):
diary_df["second_positions"] = diary_df["second_positions"].apply(lambda d: d if isinstance(d, list) else []) diary_df["second_positions"] = diary_df["second_positions"].apply(lambda d: d if isinstance(d, list) else [])
diary_df["projects"] = diary_df["projects"].apply(lambda d: d if isinstance(d, list) else []) diary_df["projects"] = diary_df["projects"].apply(lambda d: d if isinstance(d, list) else [])
df["first_demands_num"] = df["first_demands"].apply(lambda d: len(d))
df["second_demands_num"] = df["second_demands"].apply(lambda d: len(d))
df["first_solutions_num"] = df["first_solutions"].apply(lambda d: len(d))
df["second_solutions_num"] = df["second_solutions"].apply(lambda d: len(d))
df["first_positions_num"] = df["first_positions"].apply(lambda d: len(d))
df["second_positions_num"] = df["second_positions"].apply(lambda d: len(d))
df["projects_num"] = df["projects"].apply(lambda d: len(d))
diary_df["is_pure_author"] = diary_df["is_pure_author"].astype(int) diary_df["is_pure_author"] = diary_df["is_pure_author"].astype(int)
diary_df["is_have_pure_reply"] = diary_df["is_have_pure_reply"].astype(int) diary_df["is_have_pure_reply"] = diary_df["is_have_pure_reply"].astype(int)
diary_df["is_have_reply"] = diary_df["is_have_reply"].astype(int) diary_df["is_have_reply"] = diary_df["is_have_reply"].astype(int)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment