Commit d1b06803 authored by 赵威

rename

parent 0cd940b7
@@ -5,8 +5,8 @@ import time
import tensorflow as tf
from models.esmm import device_fe as device_fe
from models.esmm import diary_fe as diary_fe
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
from models.esmm.model import model_predict_diary
from utils.cache import redis_client2
from utils.grey import recommed_service_category_device_id_by_tail
......
@@ -3,7 +3,7 @@ import pandas as pd
from utils.cache import redis_db_client
# "channel_first", "city_first", "model_first",
DIARY_DEVICE_COLUMNS = [
DEVICE_COLUMNS = [
"device_id", "active_type", "active_days", "past_consume_ability_history", "potential_consume_ability_history",
"price_sensitive_history", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions",
"second_positions", "projects"
@@ -67,4 +67,4 @@ def device_feature_engineering(df):
print("device:")
print(nullseries[nullseries > 0])
print(device_df.shape)
return device_df[DIARY_DEVICE_COLUMNS]
return device_df[DEVICE_COLUMNS]
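
For orientation, a minimal sketch of how this module is typically driven, assuming read_csv_data and device_feature_engineering as they are called from the training scripts further down; raw_device_df is only an illustrative name and the path mirrors the local path used there:

from pathlib import Path

# Sketch only: load the raw device rows and keep the DEVICE_COLUMNS subset.
data_path = Path("~/data/cvr_data").expanduser()
raw_device_df = read_csv_data(data_path)
device_df = device_feature_engineering(raw_device_df)
# device_df now contains exactly the DEVICE_COLUMNS listed above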
import pandas as pd
def click_feature_engineering(click_df, conversion_df):
# click_df = click_df.copy()
# conversion_df = conversion_df.copy()
click_df.rename(columns={"label": "click_label"}, inplace=True)
conversion_df.rename(columns={"label": "conversion_label"}, inplace=True)
cc_df = pd.merge(click_df, conversion_df, how="left", left_on=["cl_id", "card_id"], right_on=["cl_id", "card_id"])
cc_df.drop(["partition_date_x", "partition_date_y"], axis=1, inplace=True)
cc_df["conversion_label"].fillna(0, inplace=True)
print("click:")
nullseries = cc_df.isnull().sum()
print(nullseries[nullseries > 0])
print(cc_df.shape)
return cc_df
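
As a quick illustration of the merge semantics above, a minimal sketch with hypothetical toy frames; the column names (cl_id, card_id, label, partition_date) are taken from the merge and drop calls, the values are made up:

import pandas as pd

click_df = pd.DataFrame({
    "cl_id": ["d1", "d1", "d2"],
    "card_id": [10, 11, 10],
    "label": [1, 1, 1],
    "partition_date": ["20200101", "20200101", "20200101"],
})
conversion_df = pd.DataFrame({
    "cl_id": ["d1"],
    "card_id": [10],
    "label": [1],
    "partition_date": ["20200101"],
})

cc_df = click_feature_engineering(click_df, conversion_df)
# cc_df keeps one row per click with click_label set, and conversion_label
# filled with 0 where no matching conversion row was joined.
print(cc_df)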
@@ -80,24 +80,6 @@ def diary_feature_engineering(df):
return diary_df[DIARY_COLUMNS]
def click_feature_engineering(click_df, conversion_df):
# click_df = click_df.copy()
# conversion_df = conversion_df.copy()
click_df.rename(columns={"label": "click_label"}, inplace=True)
conversion_df.rename(columns={"label": "conversion_label"}, inplace=True)
cc_df = pd.merge(click_df, conversion_df, how="left", left_on=["cl_id", "card_id"], right_on=["cl_id", "card_id"])
cc_df.drop(["partition_date_x", "partition_date_y"], axis=1, inplace=True)
cc_df["conversion_label"].fillna(0, inplace=True)
print("click:")
nullseries = cc_df.isnull().sum()
print(nullseries[nullseries > 0])
print(cc_df.shape)
return cc_df
def join_features(device_df, diary_df, cc_df):
a = pd.merge(device_df, cc_df, how="inner", left_on="device_id", right_on="cl_id")
df = pd.merge(a, diary_df, how="inner", left_on="card_id", right_on="card_id")
......
import pandas as pd
TRACTATE_COLUMNS = [
"card_id", "is_pure_author", "is_have_pure_reply", "is_have_reply", "content_level", "show_tag_id", "reply_num",
"reply_pure_num", "one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr", "thirty_ctr", "sixty_ctr", "ninety_ctr", "history_ctr",
"first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions", "projects"
]
def read_csv_data(dataset_path):
tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("tractate_click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("tractate_click_cvr.csv"), sep="|")
return tractate_df, click_df, conversion_df
def get_tractate_from_redis():
"""
return: {tractate_id: {first_demands: [], is_pure_author: 1}}
"""
pass
def tractate_feature_engineering(tractate_df):
    df = tractate_df.copy()
    # tag-style columns come in as comma-separated strings; turn each into a list
    list_columns = [
        "first_demands", "second_demands", "first_solutions", "second_solutions",
        "first_positions", "second_positions", "projects"
    ]
    for col in list_columns:
        df[col] = df[col].str.split(",")
        # str.split leaves missing values as NaN; replace them with empty lists
        df[col] = df[col].apply(lambda d: d if isinstance(d, list) else [])
    df["is_pure_author"] = df["is_pure_author"].astype(int)
    df["is_have_pure_reply"] = df["is_have_pure_reply"].astype(int)
    df["is_have_reply"] = df["is_have_reply"].astype(int)
    print("tractate:")
    nullseries = df.isnull().sum()
    print(nullseries[nullseries > 0])
    print(df.shape)
    return df[TRACTATE_COLUMNS]
def join_features(device_df, tractate_df, cc_df):
pass
def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
pass
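
A short, hypothetical usage sketch of the functions above, assuming the CSV layout that read_csv_data expects (tractate.csv, tractate_click.csv, tractate_click_cvr.csv) under the local data path used by the training script at the end of this diff:

from pathlib import Path

data_path = Path("~/data/cvr_data").expanduser()
tractate_df, click_df, conversion_df = read_csv_data(data_path)
# list-valued tag columns are parsed and missing values become empty lists
tractate_df = tractate_feature_engineering(tractate_df)
print(tractate_df.shape)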
@@ -6,7 +6,7 @@ from tensorflow import feature_column as fc
from tensorflow.python.estimator.canned import head as head_lib
from tensorflow.python.ops.losses import losses
from .diary_fe import device_diary_fe
from .fe.diary_fe import device_diary_fe
from .utils import common_elements, nth_element
......
import pandas as pd
TRACTATE_COLUMNS = []
def read_csv_data(dataset_path):
tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
click_df = pd.read_csv(dataset_path.joinpath("tractate_click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("tractate_click_cvr.csv"), sep="|")
return tractate_df, click_df, conversion_df
def get_tractate_from_redis():
"""
return: {diary_id: {first_demands: [], is_pure_author: 1}}
"""
pass
def tractate_feature_engineering(df):
tractate_df = df.copy()
def click_feature_engineering(click_df, conversion_df):
pass
def join_features(device_df, tractate_df, cc_df):
pass
def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
pass
@@ -9,14 +9,13 @@ from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm import device_fe as device_fe
from models.esmm import diary_fe as diary_fe
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import diary_fe as diary_fe
from models.esmm.fe import click_fe as click_fe
from models.esmm.diary_model import model_predict_diary
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export
# tf.compat.v1.enable_eager_execution()
def main():
time_begin = time.time()
@@ -34,7 +33,7 @@ def main():
# print(device_df.sample(1))
diary_df = diary_fe.diary_feature_engineering(diary_df)
# print(diary_df.sample(1))
cc_df = diary_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
cc_df = click_fe.click_feature_engineering(diary_click_df, diary_conversion_df)
# print(cc_df.sample(1))
df = diary_fe.join_features(device_df, diary_df, cc_df)
# print(df.sample(1))
......
import datetime
import time
from pathlib import Path
from sklearn.model_selection import train_test_split
from models.esmm.fe import device_fe as device_fe
from models.esmm.fe import tractate_fe as tractate_fe
from models.esmm.fe import click_fe as click_fe
def main():
time_begin = time.time()
data_path = Path("~/data/cvr_data").expanduser() # local
# data_path = Path("/srv/apps/node2vec_git/cvr_data/") # server
tractate_df, tractate_click_df, tractate_conversion_df = tractate_fe.read_csv_data(data_path)
device_df = device_fe.read_csv_data(data_path)
total_time = (time.time() - time_begin) / 60
print("total cost {:.2f} mins at {}".format(total_time, datetime.now()))
if __name__ == "__main__":
main()