get data from redis

a08d32a5 · 赵威 · 75dcbb86 · a08d32a5 · a08d32a5 · a08d32a5
Commit a08d32a5 authored Jul 21, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 110 additions and 33 deletions

main.py src/main.py +38 -30

fe.py src/models/esmm/fe.py +65 -1

utils.py src/models/esmm/utils.py +7 -2

No files found.
--- a/src/main.py
+++ b/src/main.py
@@ -5,11 +5,12 @@ import timeit
 from datetime import datetime
 from pathlib import Path
+import pandas as pd
 import tensorflow as tf
 from sklearn.model_selection import train_test_split
 from models.esmm.fe import (click_feature_engineering, device_feature_engineering, diary_feature_engineering,
-                            get_device_df_from_redis, get_diary_df_from_redis, join_features, read_csv_data)
+                            get_device_df_from_redis, get_diary_df_from_redis, join_device_diary, join_features, read_csv_data)
 from models.esmm.input_fn import build_features, esmm_input_fn
 from models.esmm.model import esmm_model_fn, model_export, model_predict
@@ -18,51 +19,58 @@ from models.esmm.model import esmm_model_fn, model_export, model_predict
 def main():
    time_begin = time.time()
-    # df = get_device_df_from_redis()
-    df2 = get_diary_df_from_redis()
-    # print(df2.sample(1))
-    # print(df.size)
-    # print(df2.size)
-    # a = device_feature_engineering(df)
-    # print(a.size)
-    b = diary_feature_engineering(df2, from_redis=True)
-    print(b.sample(10))
    device_df, diary_df, click_df, conversion_df = read_csv_data(Path("~/data/cvr_data/"))
    # print(diary_df.sample(1))
-    # device_df = device_feature_engineering(device_df)
+    device_df = device_feature_engineering(device_df)
+    # print(device_df.sample(1))
    diary_df = diary_feature_engineering(diary_df)
    # print(diary_df.sample(1))
-    # cc_df = click_feature_engineering(click_df, conversion_df)
+    cc_df = click_feature_engineering(click_df, conversion_df)
-    # df = join_features(device_df, diary_df, cc_df)
+    df = join_features(device_df, diary_df, cc_df)
-    # train_df, test_df = train_test_split(df, test_size=0.2)
+    train_df, test_df = train_test_split(df, test_size=0.2)
-    # train_df, val_df = train_test_split(train_df, test_size=0.2)
+    train_df, val_df = train_test_split(train_df, test_size=0.2)
-    # all_features = build_features(df)
+    all_features = build_features(df)
-    # params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
+    params = {"feature_columns": all_features, "hidden_units": [64, 32], "learning_rate": 0.1}
-    # model_path = str(Path("~/data/model_tmp/").expanduser())
+    model_path = str(Path("~/data/model_tmp/").expanduser())
-    # if os.path.exists(model_path):
+    if os.path.exists(model_path):
-    #     shutil.rmtree(model_path)
+        shutil.rmtree(model_path)
-    # model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
+    model = tf.estimator.Estimator(model_fn=esmm_model_fn, params=params, model_dir=model_path)
-    # print("train")
+    print("train")
-    # model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
+    model.train(input_fn=lambda: esmm_input_fn(train_df, shuffle=True), steps=5000)
-    # metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
+    metrics = model.evaluate(input_fn=lambda: esmm_input_fn(val_df, False), steps=5000)
-    # print("metrics: " + str(metrics))
+    print("metrics: " + str(metrics))
-    # model_export_path = str(Path("~/data/models/").expanduser())
+    model_export_path = str(Path("~/data/models/").expanduser())
-    # save_path = model_export(model, all_features, model_export_path)
+    save_path = model_export(model, all_features, model_export_path)
-    # print("save to: " + save_path)
+    print("save to: " + save_path)
-    # predict_fn = tf.contrib.predictor.from_saved_model(save_path)
+    predict_fn = tf.contrib.predictor.from_saved_model(save_path)
    # for i in range(10):
    #     test_300 = test_df.sample(300)
    #     model_predict(test_300, predict_fn)
+    print("==============================")
+    device_id = "861601036552944"
+    diary_ids = [
+        "16195283", "16838351", "17161073", "17297878", "17307484", "17396235", "16418737", "16995481", "17312201", "12237988"
+    ]
+    df = get_device_df_from_redis()
+    df2 = get_diary_df_from_redis()
+    redis_device_df = device_feature_engineering(df)
+    redis_diary_df = diary_feature_engineering(df2, from_redis=True)
+    res = join_device_diary(device_id, diary_ids, redis_device_df, redis_diary_df)
+    print(len(res))
+    model_predict(res, predict_fn)
    total_time = (time.time() - time_begin) / 60
    print("cost {:.2f} mins at {}".format(total_time, datetime.now()))

--- a/src/models/esmm/fe.py
+++ b/src/models/esmm/fe.py
@@ -11,7 +11,8 @@ def read_csv_data(dataset_path):
    click_df = pd.read_csv(dataset_path.joinpath("click.csv"), sep="|")
    conversion_df = pd.read_csv(dataset_path.joinpath("click_cvr.csv"), sep="|")
    # TODO remove sample
-    return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
+    # return device_df.sample(10000), diary_df.sample(5000), click_df, conversion_df
+    return device_df, diary_df, click_df, conversion_df
 def _get_data_from_redis(key):
@@ -192,3 +193,66 @@ def join_features(device_df, diary_df, cc_df):
    #         df.drop(col, inplace=True, axis=1)
    df.drop(drop_columns, inplace=True, axis=1)
    return df
+def join_device_diary(device_id, diary_ids, device_df, diary_df):
+    a_df = device_df.loc[device_df["device_id"] == device_id]
+    b_df = diary_df.loc[diary_df["card_id"].isin(diary_ids)]
+    b_df["device_id"] = device_id
+    df = pd.merge(a_df, b_df, how="left", on="device_id")
+    df["first_demands"] = df[["first_demands_x", "first_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["second_demands"] = df[["second_demands_x", "second_demands_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["first_solutions"] = df[["first_solutions_x", "first_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["second_solutions"] = df[["second_solutions_x", "second_solutions_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["first_positions"] = df[["first_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["second_positions"] = df[["second_positions_x", "second_positions_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["projects"] = df[["projects_x", "projects_y"]].apply(lambda x: common_elements(*x), axis=1)
+    df["device_fd"] = df["first_demands_x"].apply(lambda x: nth_element(x, 0))
+    df["device_sd"] = df["second_demands_x"].apply(lambda x: nth_element(x, 0))
+    df["device_fs"] = df["first_solutions_x"].apply(lambda x: nth_element(x, 0))
+    df["device_ss"] = df["second_solutions_x"].apply(lambda x: nth_element(x, 0))
+    df["device_fp"] = df["first_positions_x"].apply(lambda x: nth_element(x, 0))
+    df["device_sp"] = df["second_positions_x"].apply(lambda x: nth_element(x, 0))
+    df["device_p"] = df["projects_x"].apply(lambda x: nth_element(x, 0))
+    df["content_fd"] = df["first_demands_y"].apply(lambda x: nth_element(x, 0))
+    df["content_sd"] = df["second_demands_y"].apply(lambda x: nth_element(x, 0))
+    df["content_fs"] = df["first_solutions_y"].apply(lambda x: nth_element(x, 0))
+    df["content_ss"] = df["second_solutions_y"].apply(lambda x: nth_element(x, 0))
+    df["content_fp"] = df["first_positions_y"].apply(lambda x: nth_element(x, 0))
+    df["content_sp"] = df["second_positions_y"].apply(lambda x: nth_element(x, 0))
+    df["content_p"] = df["projects_y"].apply(lambda x: nth_element(x, 0))
+    df["fd1"] = df["first_demands"].apply(lambda x: nth_element(x, 0))
+    df["fd2"] = df["first_demands"].apply(lambda x: nth_element(x, 1))
+    df["fd3"] = df["first_demands"].apply(lambda x: nth_element(x, 2))
+    df["sd1"] = df["second_demands"].apply(lambda x: nth_element(x, 0))
+    df["sd2"] = df["second_demands"].apply(lambda x: nth_element(x, 1))
+    df["sd3"] = df["second_demands"].apply(lambda x: nth_element(x, 2))
+    df["fs1"] = df["first_solutions"].apply(lambda x: nth_element(x, 0))
+    df["fs2"] = df["first_solutions"].apply(lambda x: nth_element(x, 1))
+    df["fs3"] = df["first_solutions"].apply(lambda x: nth_element(x, 2))
+    df["ss1"] = df["second_solutions"].apply(lambda x: nth_element(x, 0))
+    df["ss2"] = df["second_solutions"].apply(lambda x: nth_element(x, 1))
+    df["ss3"] = df["second_solutions"].apply(lambda x: nth_element(x, 2))
+    df["fp1"] = df["first_positions"].apply(lambda x: nth_element(x, 0))
+    df["fp2"] = df["first_positions"].apply(lambda x: nth_element(x, 1))
+    df["fp3"] = df["first_positions"].apply(lambda x: nth_element(x, 2))
+    df["sp1"] = df["second_positions"].apply(lambda x: nth_element(x, 0))
+    df["sp2"] = df["second_positions"].apply(lambda x: nth_element(x, 1))
+    df["sp3"] = df["second_positions"].apply(lambda x: nth_element(x, 2))
+    df["p1"] = df["projects"].apply(lambda x: nth_element(x, 0))
+    df["p2"] = df["projects"].apply(lambda x: nth_element(x, 1))
+    df["p3"] = df["projects"].apply(lambda x: nth_element(x, 2))
+    drop_columns = [
+        "first_demands_x", "first_demands_y", "first_demands", "second_demands_x", "second_demands_y", "second_demands",
+        "first_solutions_x", "first_solutions_y", "first_solutions", "second_solutions_x", "second_solutions_y",
+        "second_solutions", "first_positions_x", "first_positions_y", "first_positions", "second_positions_x",
+        "second_positions_y", "second_positions", "projects_x", "projects_y", "projects"
+    ]
+    df.drop(drop_columns, inplace=True, axis=1)
+    return df
--- a/src/models/esmm/utils.py
+++ b/src/models/esmm/utils.py
+from collections import Counter
 import pandas as pd
-def common_elements(lst1, lst2):
+def common_elements(lst1, lst2, n=3):
-    return [element for element in lst1 if element in lst2]
+    a = Counter(lst1)
+    b = Counter(lst2)
+    interactions = a & b
+    return list(interactions.elements())[:n]
 def nth_element(lst, n):