Commit a08d32a5 authored by 赵威's avatar 赵威

get data from redis

parent 75dcbb86
...@@ -5,11 +5,12 @@ import timeit ...@@ -5,11 +5,12 @@ import timeit
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
import pandas as pd
import tensorflow as tf import tensorflow as tf
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering, from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering,
get_device_df_from_redis, get_diary_df_from_redis, join_features, read_csv_data) get_device_df_from_redis, get_diary_df_from_redis, join_device_diary, join_features, read_csv_data)
from models.esmm.input_fn import build_features, esmm_input_fn from models.esmm.input_fn import build_features, esmm_input_fn
from models.esmm.model import esmm_model_fn, model_export, model_predict from models.esmm.model import esmm_model_fn, model_export, model_predict
...@@ -18,51 +19,58 @@ from models.esmm.model import esmm_model_fn, model_export, model_predict ...@@ -18,51 +19,58 @@ from models.esmm.model import esmm_model_fn, model_export, model_predict
def main(): def main():
time_begin = time.time() time_begin = time.time()
# df = get_device_df_from_redis()
df2 = get_diary_df_from_redis()
# print(df2.sample(1))
# print(df.size)
# print(df2.size)
# a = device_feature_engineering(df)
# print(a.size)
b = diary_feature_engineering(df2, from_redis=True)
print(b.sample(10))
device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/")) device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/"))
# print(diary_df.sample(1)) # print(diary_df.sample(1))
# device_df = device_feature_engineering(device_df) device_df = device_feature_engineering(device_df)
# print(device_df.sample(1))
diary_df = diary_feature_engineering(diary_df) diary_df = diary_feature_engineering(diary_df)
# print(diary_df.sample(1)) # print(diary_df.sample(1))
# cc_df = click_feature_engineering(click_df, conversion_df) cc_df = click_feature_engineering(click_df, conversion_df)
# df = join_features(device_df, diary_df, cc_df) df = join_features(device_df, diary_df, cc_df)
# train_df, test_df = train_test_split(df, test_size=0.2) train_df, test_df = train_test_split(df, test_size=0.2)
# train_df, val_df = train_test_split(train_df, test_size=0.2) train_df, val_df = train_test_split(train_df, test_size=0.2)
# all_features = build_features(df) all_features = build_features(df)
# params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1} params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
# model_path = str(Path("~/data/model_tmp/").expanduser()) model_path = str(Path("~/data/model_tmp/").expanduser())
# if os.path.exists(model_path): if os.path.exists(model_path):
# shutil.rmtree(model_path) shutil.rmtree(model_path)
# model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path) model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
# print("train") print("train")
# model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000) model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
# metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000) metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
# print("metrics: " + str(metrics)) print("metrics: " + str(metrics))
# model_export_path = str(Path("~/data/models/").expanduser()) model_export_path = str(Path("~/data/models/").expanduser())
# save_path = model_export(model, all_features, model_export_path) save_path = model_export(model, all_features, model_export_path)
# print("save to: " + save_path) print("save to: " + save_path)
# predict_fn = tf.contrib.predictor.from_saved_model(save_path) predict_fn = tf.contrib.predictor.from_saved_model(save_path)
# for i in range(10): # for i in range(10):
# test_300 = test_df.sample(300) # test_300 = test_df.sample(300)
# model_predict(test_300, predict_fn) # model_predict(test_300, predict_fn)
print("==============================")
device_id = "861601036552944"
diary_ids = [
"16195283", "16838351", "17161073", "17297878", "17307484", "17396235", "16418737", "16995481", "17312201", "12237988"
]
df = get_device_df_from_redis()
df2 = get_diary_df_from_redis()
redis_device_df = device_feature_engineering(df)
redis_diary_df = diary_feature_engineering(df2, from_redis=True)
res = join_device_diary(device_id, diary_ids, redis_device_df, redis_diary_df)
print(len(res))
model_predict(res, predict_fn)
total_time = (time.time() - time_begin) / 60 total_time = (time.time() - time_begin) / 60
print("cost {:.2f} mins at {}".format(total_time, datetime.now())) print("cost {:.2f} mins at {}".format(total_time, datetime.now()))
......
...@@ -11,7 +11,8 @@ def read_csv_data(dataset_path): ...@@ -11,7 +11,8 @@ def read_csv_data(dataset_path):
click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|") click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|")
conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|") conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|")
# TODO remove sample # TODO remove sample
return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df # return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
return device_df, diary_df, click_df, conversion_df
def _get_data_from_redis(key): def _get_data_from_redis(key):
...@@ -192,3 +193,66 @@ def join_features(device_df, diary_df, cc_df): ...@@ -192,3 +193,66 @@ def join_features(device_df, diary_df, cc_df):
# df.drop(col, inplace=True, axis=1) # df.drop(col, inplace=True, axis=1)
df.drop(drop_columns, inplace=True, axis=1) df.drop(drop_columns, inplace=True, axis=1)
return df return df
def join_device_diary(device_id, diary_ids, device_df, diary_df):
a_df = device_df.loc[device_df["device_id"] == device_id]
b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)]
b_df["device_id"] = device_id
df = pd.merge(a_df, b_df, how="left", on="device_id")
df["first_demands"] = df[["first_demands_x", "first_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
df["second_demands"] = df[["second_demands_x", "second_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
df["first_solutions"] = df[["first_solutions_x", "first_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
df["second_solutions"] = df[["second_solutions_x", "second_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
df["first_positions"] = df[["first_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
df["second_positions"] = df[["second_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
df["projects"] = df[["projects_x", "projects_y"]].apply(lambda x: common_elements(*x), axis=1)
df["device_fd"] = df["first_demands_x"].apply(lambda x: nth_element(x, 0))
df["device_sd"] = df["second_demands_x"].apply(lambda x: nth_element(x, 0))
df["device_fs"] = df["first_solutions_x"].apply(lambda x: nth_element(x, 0))
df["device_ss"] = df["second_solutions_x"].apply(lambda x: nth_element(x, 0))
df["device_fp"] = df["first_positions_x"].apply(lambda x: nth_element(x, 0))
df["device_sp"] = df["second_positions_x"].apply(lambda x: nth_element(x, 0))
df["device_p"] = df["projects_x"].apply(lambda x: nth_element(x, 0))
df["content_fd"] = df["first_demands_y"].apply(lambda x: nth_element(x, 0))
df["content_sd"] = df["second_demands_y"].apply(lambda x: nth_element(x, 0))
df["content_fs"] = df["first_solutions_y"].apply(lambda x: nth_element(x, 0))
df["content_ss"] = df["second_solutions_y"].apply(lambda x: nth_element(x, 0))
df["content_fp"] = df["first_positions_y"].apply(lambda x: nth_element(x, 0))
df["content_sp"] = df["second_positions_y"].apply(lambda x: nth_element(x, 0))
df["content_p"] = df["projects_y"].apply(lambda x: nth_element(x, 0))
df["fd1"] = df["first_demands"].apply(lambda x: nth_element(x, 0))
df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
df["ss1"] = df["second_solutions"].apply(lambda x: nth_element(x, 0))
df["ss2"] = df["second_solutions"].apply(lambda x: nth_element(x, 1))
df["ss3"] = df["second_solutions"].apply(lambda x: nth_element(x, 2))
df["fp1"] = df["first_positions"].apply(lambda x: nth_element(x, 0))
df["fp2"] = df["first_positions"].apply(lambda x: nth_element(x, 1))
df["fp3"] = df["first_positions"].apply(lambda x: nth_element(x, 2))
df["sp1"] = df["second_positions"].apply(lambda x: nth_element(x, 0))
df["sp2"] = df["second_positions"].apply(lambda x: nth_element(x, 1))
df["sp3"] = df["second_positions"].apply(lambda x: nth_element(x, 2))
df["p1"] = df["projects"].apply(lambda x: nth_element(x, 0))
df["p2"] = df["projects"].apply(lambda x: nth_element(x, 1))
df["p3"] = df["projects"].apply(lambda x: nth_element(x, 2))
drop_columns = [
"first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
"first_solutions_x", "first_solutions_y", "first_solutions", "second_solutions_x", "second_solutions_y",
"second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
"second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
]
df.drop(drop_columns, inplace=True, axis=1)
return df
from collections import Counter
import pandas as pd import pandas as pd
def common_elements(lst1, lst2): def common_elements(lst1, lst2, n=3):
return [element for element in lst1 if element in lst2] a = Counter(lst1)
b = Counter(lst2)
interactions = a & b
return list(interactions.elements())[:n]
def nth_element(lst, n): def nth_element(lst, n):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment