Commit 0521421d authored by 赵威

remove path

parent 2da5ccaa
@@ -12,8 +12,9 @@ from sklearn.model_selection import train_test_split
 from models.esmm import device_fe as device_fe
 from models.esmm import diary_fe as diary_fe
+from models.esmm.diary_model import model_predict_diary
 from models.esmm.input_fn import build_features, esmm_input_fn
-from models.esmm.model import esmm_model_fn, model_export, model_predict_diary
+from models.esmm.model import esmm_model_fn, model_export
 # tf.compat.v1.enable_eager_execution()
@@ -25,18 +26,21 @@ def main():
     # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-    # # device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("~/data/cvr_data").expanduser())
-    # device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("/srv/apps/node2vec_git/cvr_data/"))
-    # # print(diary_df.sample(1))
-    # device_df = device_fe.device_feature_engineering(device_df)
-    # # print(device_df.sample(1))
-    # diary_df = diary_fe.diary_feature_engineering(diary_df)
-    # # print(diary_df.sample(1))
-    # cc_df = diary_fe.click_feature_engineering(click_df, conversion_df)
-    # # print(cc_df.sample(1))
-    # df = diary_fe.join_features(device_df, diary_df, cc_df)
-    # # print(df.sample(1))
-    # # print(df.dtypes)
+    # device_df, diary_df, click_df, conversion_df = diary_fe.read_csv_data(Path("~/data/cvr_data").expanduser())
+    data_path = Path("/srv/apps/node2vec_git/cvr_data/")
+    diary_df, click_df, conversion_df = diary_fe.read_csv_data(data_path)
+    device_df = device_fe.read_csv_data(data_path)
+    # print(diary_df.sample(1))
+    device_df = device_fe.device_feature_engineering(device_df)
+    # print(device_df.sample(1))
+    diary_df = diary_fe.diary_feature_engineering(diary_df)
+    # print(diary_df.sample(1))
+    cc_df = diary_fe.click_feature_engineering(click_df, conversion_df)
+    # print(cc_df.sample(1))
+    df = diary_fe.join_features(device_df, diary_df, cc_df)
+    # print(df.sample(1))
+    print(df.dtypes)
     # train_df, test_df = train_test_split(df, test_size=0.2)
     # train_df, val_df = train_test_split(train_df, test_size=0.2)
@@ -84,7 +88,8 @@ def main():
     for i in range(5):
         time_1 = timeit.default_timer()
-        res = model_predict_diary(random.sample(device_ids, 1)[0], random.sample(diary_ids, 200), device_dict, diary_dict, predict_fn)
+        res = model_predict_diary(
+            random.sample(device_ids, 1)[0], random.sample(diary_ids, 200), device_dict, diary_dict, predict_fn)
         print(res[:10])
         total_1 = (timeit.default_timer() - time_1)
         print("total prediction cost {:.5f}s".format(total_1), "\n")
......
models/esmm/device_fe.py
+import pandas as pd
 from utils.cache import redis_db_client
     # "channel_first", "city_first", "model_first",
@@ -8,6 +10,12 @@ DIARY_DEVICE_COLUMNS = [
 ]
+
+def read_csv_data(dataset_path):
+    device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
+    device_df.drop_duplicates(subset=["device_id"], inplace=True)
+    return device_df
+
 def get_device_dict_from_redis():
     """
     return: {device_id: {first_demands: [], city_first: ""}}
......
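For reference, a minimal sketch of what get_device_dict_from_redis could look like, assuming redis_db_client is a standard redis-py client and that device features are stored as one JSON blob per device under a "device:<id>" key (both the key pattern and the serialization are assumptions, not confirmed by this commit):

import json

def get_device_dict_from_redis_sketch():
    device_dict = {}
    for key in redis_db_client.keys("device:*"):  # assumed key pattern
        raw = redis_db_client.get(key)
        if raw:
            device_id = key.decode("utf-8").split(":", 1)[1]
            # matches the documented shape: {device_id: {first_demands: [], city_first: ""}}
            device_dict[device_id] = json.loads(raw)
    return device_dict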
models/esmm/diary_fe.py
@@ -14,14 +14,10 @@ DIARY_COLUMNS = [
 def read_csv_data(dataset_path):
-    device_df = pd.read_csv(dataset_path.joinpath("device.csv"), sep="|")
-    device_df.drop_duplicates(subset=["device_id"], inplace=True)
     diary_df = pd.read_csv(dataset_path.joinpath("diary.csv"), sep="|")
     click_df = pd.read_csv(dataset_path.joinpath("diary_click.csv"), sep="|")
     conversion_df = pd.read_csv(dataset_path.joinpath("diary_click_cvr.csv"), sep="|")
-    return device_df, diary_df, click_df, conversion_df
+    return diary_df, click_df, conversion_df

 def get_diary_dict_from_redis():
......
models/esmm/diary_model.py (new file)

import timeit

import tensorflow as tf

from .diary_fe import device_diary_fe
from .model import _bytes_feature, _float_feature, _int64_feature


def model_predict_diary(device_id, diary_ids, device_dict, diary_dict, predict_fn):
    try:
        time_1 = timeit.default_timer()
        device_info, diary_lst, diary_ids_res = device_diary_fe(device_id, diary_ids, device_dict, diary_dict)
        print("predict check: " + str(len(diary_lst)) + " " + str(len(diary_ids_res)))
        # TODO
        int_columns = [
            "active_type", "active_days", "card_id", "is_pure_author", "is_have_reply", "is_have_pure_reply", "content_level",
            "topic_num", "favor_num", "vote_num"
        ]
        float_columns = ["one_ctr", "three_ctr", "seven_ctr", "fifteen_ctr"]
        str_columns = [
            "device_id", "past_consume_ability_history", "potential_consume_ability_history", "price_sensitive_history",
            "device_fd", "device_sd", "device_fs", "device_ss", "device_fp", "device_sp", "device_p", "content_fd", "content_sd",
            "content_fs", "content_ss", "content_fp", "content_sp", "content_p", "fd1", "fd2", "fd3", "sd1", "sd2", "sd3", "fs1",
            "fs2", "fs3", "ss1", "ss2", "ss3", "fp1", "fp2", "fp3", "sp1", "sp2", "sp3", "p1", "p2", "p3"
        ]
        # build one serialized tf.train.Example per candidate diary,
        # merging the device features into each diary's features
        examples = []
        for diary_info in diary_lst:
            tmp = {}
            tmp.update(device_info)
            tmp.update(diary_info)
            features = {}
            for col in int_columns:
                features[col] = _int64_feature(int(tmp[col]))
            for col in float_columns:
                features[col] = _float_feature(float(tmp[col]))
            for col in str_columns:
                features[col] = _bytes_feature(str(tmp[col]).encode(encoding="utf-8"))
            example = tf.train.Example(features=tf.train.Features(feature=features))
            examples.append(example.SerializeToString())
        total_1 = (timeit.default_timer() - time_1)
        print("make example cost {:.5f}s".format(total_1))

        time_1 = timeit.default_timer()
        predictions = predict_fn({"examples": examples})
        # sort candidate diary ids by predicted score, highest first
        res_tuple = sorted(zip(diary_ids_res, predictions["output"].tolist()), key=lambda x: x[1], reverse=True)
        res = [int(diary_id) for (diary_id, _) in res_tuple]
        # print(res)
        total_1 = (timeit.default_timer() - time_1)
        print("prediction cost {:.5f}s".format(total_1))
        return res
    except Exception as e:
        print(e)
        # device_info, _, _ = device_diary_fe(device_id, diary_ids, device_dict, diary_dict)
        # print(device_info)
        return []
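A usage sketch for the new module: model_predict_diary expects a predict_fn whose serving signature accepts serialized tf.Example protos under the key "examples". Assuming the model was exported with model_export to a SavedModel directory (the path below is hypothetical) and a TF 1.x runtime where tf.contrib.predictor is available, wiring it up looks roughly like:

import tensorflow as tf

from models.esmm import device_fe, diary_fe
from models.esmm.diary_model import model_predict_diary

# hypothetical export path; model_export writes a SavedModel to some such directory
predict_fn = tf.contrib.predictor.from_saved_model("/path/to/exported_model")

device_dict = device_fe.get_device_dict_from_redis()
diary_dict = diary_fe.get_diary_dict_from_redis()

# rank 200 candidate diaries for one device, highest predicted score first
res = model_predict_diary("some_device_id", list(diary_dict.keys())[:200], device_dict, diary_dict, predict_fn)
print(res[:10])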
models/esmm/tractate_fe.py (new file)

import pandas as pd

TRACTATE_COLUMNS = []


def read_csv_data(dataset_path):
    tractate_df = pd.read_csv(dataset_path.joinpath("tractate.csv"), sep="|")
    click_df = pd.read_csv(dataset_path.joinpath("tractate_click.csv"), sep="|")
    conversion_df = pd.read_csv(dataset_path.joinpath("tractate_click_cvr.csv"), sep="|")
    return tractate_df, click_df, conversion_df


def get_tractate_from_redis():
    """
    return: {tractate_id: {first_demands: [], is_pure_author: 1}}
    """
    pass


def tractate_feature_engineering(df):
    tractate_df = df.copy()
    return tractate_df


def click_feature_engineering(click_df, conversion_df):
    pass


def join_features(device_df, tractate_df, cc_df):
    pass


def device_tractate_fe(device_id, tractate_ids, device_dict, tractate_dict):
    pass
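These stubs mirror diary_fe, so once implemented the tractate pipeline should compose the same way the diary pipeline now does in main(); a sketch under that assumption (the module path and all function bodies beyond read_csv_data are not yet filled in by this commit):

from pathlib import Path

from models.esmm import device_fe
from models.esmm import tractate_fe  # assumed module path, mirroring diary_fe

data_path = Path("/srv/apps/node2vec_git/cvr_data/")
tractate_df, click_df, conversion_df = tractate_fe.read_csv_data(data_path)
device_df = device_fe.read_csv_data(data_path)

device_df = device_fe.device_feature_engineering(device_df)
tractate_df = tractate_fe.tractate_feature_engineering(tractate_df)
cc_df = tractate_fe.click_feature_engineering(click_df, conversion_df)
df = tractate_fe.join_features(device_df, tractate_df, cc_df)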