Commit a08d32a5 authored by 赵威

get data from redis

parent 75dcbb86
......@@ -5,11 +5,12 @@ import timeit
from datetime import datetime
from pathlib import Path
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering,
get_device_df_from_redis, get_diary_df_from_redis, join_features, read_csv_data)
get_device_df_from_redis, get_diary_df_from_redis, join_device_diary, join_features, read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict
......@@ -18,51 +19,58 @@ from models.esmm.model import esmm_model_fn, model_export, model_predict
def main():
    """End-to-end ESMM pipeline driver.

    Loads CSV training data, trains and evaluates the estimator, exports the
    SavedModel, then smoke-tests prediction with feature rows rebuilt from
    Redis for one hard-coded device/diary set.

    NOTE(review): this is a diff union — the commented-out lines are leftovers
    from the previous revision and are preserved verbatim.
    """
    time_begin = time.time()
    # df = get_device_df_from_redis()
    df2 = get_diary_df_from_redis()
    # print(df2.sample(1))
    # print(df.size)
    # print(df2.size)
    # a = device_feature_engineering(df)
    # print(a.size)
    # Smoke-test: build diary features from the Redis copy of the data.
    b = diary_feature_engineering(df2, from_redis=True)
    print(b.sample(10))
    # Offline training data from local CSV files.
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/"))
    # print(diary_df.sample(1))
    # device_df = device_feature_engineering(device_df)
    device_df = device_feature_engineering(device_df)
    # print(device_df.sample(1))
    diary_df = diary_feature_engineering(diary_df)
    # print(diary_df.sample(1))
    # cc_df = click_feature_engineering(click_df, conversion_df)
    # df = join_features(device_df, diary_df, cc_df)
    cc_df = click_feature_engineering(click_df, conversion_df)
    df = join_features(device_df, diary_df, cc_df)
    # train_df, test_df = train_test_split(df, test_size=0.2)
    # train_df, val_df = train_test_split(train_df, test_size=0.2)
    # 80/20 train/test split, then a further 80/20 split for validation.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    # all_features = build_features(df)
    all_features = build_features(df)
    # params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    # model_path = str(Path("~/data/model_tmp/").expanduser())
    # if os.path.exists(model_path):
    # shutil.rmtree(model_path)
    # model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
    model_path = str(Path("~/data/model_tmp/").expanduser())
    # Start each run from a clean model directory.
    if os.path.exists(model_path):
        shutil.rmtree(model_path)
    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
    # print("train")
    # model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
    # metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
    # print("metrics: " + str(metrics))
    print("train")
    model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
    metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
    print("metrics: " + str(metrics))
    # model_export_path = str(Path("~/data/models/").expanduser())
    # save_path = model_export(model, all_features, model_export_path)
    # print("save to: " + save_path)
    # Export the trained estimator as a SavedModel for serving.
    model_export_path = str(Path("~/data/models/").expanduser())
    save_path = model_export(model, all_features, model_export_path)
    print("save to: " + save_path)
    # predict_fn = tf.contrib.predictor.from_saved_model(save_path)
    predict_fn = tf.contrib.predictor.from_saved_model(save_path)
    # for i in range(10):
    # test_300 = test_df.sample(300)
    # model_predict(test_300, predict_fn)
    print("==============================")
    # Online-style prediction check: one fixed device scored against ten diary
    # cards, with features rebuilt from Redis rather than from the CSVs.
    device_id = "861601036552944"
    diary_ids = [
        "16195283", "16838351", "17161073", "17297878", "17307484", "17396235", "16418737", "16995481", "17312201", "12237988"
    ]
    df = get_device_df_from_redis()
    df2 = get_diary_df_from_redis()
    redis_device_df = device_feature_engineering(df)
    redis_diary_df = diary_feature_engineering(df2, from_redis=True)
    res = join_device_diary(device_id, diary_ids, redis_device_df, redis_diary_df)
    print(len(res))
    model_predict(res, predict_fn)
    total_time = (time.time() - time_begin) / 60
    print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
......
......@@ -11,7 +11,8 @@ def read_csv_data(dataset_path):
click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|")
# TODO remove sample
return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
# return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
return device_df, diary_df, click_df, conversion_df
def _get_data_from_redis(key):
......@@ -192,3 +193,66 @@ def join_features(device_df, diary_df, cc_df):
# df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1)
return df
def join_device_diary(device_id, diary_ids, device_df, diary_df):
    """Build prediction feature rows by pairing one device with many diaries.

    Selects the row for ``device_id`` from ``device_df`` and the rows for
    ``diary_ids`` from ``diary_df``, left-joins them on ``device_id``, then
    derives the same intersection/positional features that ``join_features``
    produces for training data.

    Args:
        device_id: device identifier string to score.
        diary_ids: iterable of diary ``card_id`` strings to score against.
        device_df: device feature DataFrame (must contain ``device_id``).
        diary_df: diary feature DataFrame (must contain ``card_id``).

    Returns:
        DataFrame with one row per matched diary, intermediate list columns
        dropped.
    """
    # Columns present on both sides (suffixed _x/_y after the merge).
    shared_cols = [
        "first_demands", "second_demands", "first_solutions", "second_solutions",
        "first_positions", "second_positions", "projects"
    ]
    # Short names used for the derived scalar feature columns.
    short_names = {
        "first_demands": "fd", "second_demands": "sd", "first_solutions": "fs",
        "second_solutions": "ss", "first_positions": "fp", "second_positions": "sp",
        "projects": "p"
    }

    a_df = device_df.loc[device_df["device_id"] == device_id]
    # .copy() so the column assignment below cannot write through to the
    # caller's diary_df (the .loc slice may be a view).
    b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)].copy()
    b_df["device_id"] = device_id
    df = pd.merge(a_df, b_df, how="left", on="device_id")

    # Multiset intersection of device-side and diary-side lists.
    # BUG FIX: the original computed "first_positions" from first_positions_x
    # paired with second_positions_y; each column must pair with its own _y.
    for col in shared_cols:
        df[col] = df[[col + "_x", col + "_y"]].apply(lambda x: common_elements(*x), axis=1)

    # First element of each device-side list.
    for col in shared_cols:
        df["device_" + short_names[col]] = df[col + "_x"].apply(lambda x: nth_element(x, 0))
    # First element of each diary-side list.
    for col in shared_cols:
        df["content_" + short_names[col]] = df[col + "_y"].apply(lambda x: nth_element(x, 0))
    # Top-3 elements of each intersection list (fd1..fd3, sd1..sd3, ...).
    for col in shared_cols:
        short = short_names[col]
        for i in range(3):
            df["{}{}".format(short, i + 1)] = df[col].apply(lambda x: nth_element(x, i))

    # Drop the raw list columns; only the derived scalar features remain.
    drop_columns = [col + suffix for col in shared_cols for suffix in ("_x", "_y", "")]
    df.drop(drop_columns, inplace=True, axis=1)
    return df
from collections import Counter
import pandas as pd
def common_elements(lst1, lst2):
    """Return the elements of lst1 that also occur in lst2.

    Preserves lst1's order and duplicates. Builds a set of lst2 once so each
    membership test is O(1) instead of an O(len(lst2)) list scan.
    """
    lookup = set(lst2)
    return [element for element in lst1 if element in lookup]
def common_elements(lst1, lst2, n=3):
    """Return at most n elements of the multiset intersection of lst1 and lst2.

    Each element appears min(count in lst1, count in lst2) times, ordered by
    its first occurrence in lst1.
    """
    left_counts = {}
    for item in lst1:
        left_counts[item] = left_counts.get(item, 0) + 1
    right_counts = {}
    for item in lst2:
        right_counts[item] = right_counts.get(item, 0) + 1
    shared = []
    for item, count in left_counts.items():
        shared.extend([item] * min(count, right_counts.get(item, 0)))
    return shared[:n]
def nth_element(lst, n):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment