Commit fcfb12fe authored by 郭羽

特征工程优化

parent 57d4f6ea
......@@ -231,6 +231,7 @@ def getDataVocab(samples,model_columns):
# 多值特征
for c in ITEM_MULTI_COLUMN_EXTRA_MAP.keys():
print(c)
datas = samples.select(c).distinct().collect()
tagSet = set()
for d in datas:
......@@ -244,6 +245,7 @@ def getDataVocab(samples,model_columns):
# id类特征 和 类别特征
for c in ["userid","itemid"] + [ITEM_PREFIX + c for c in ITEM_CATE_COLUMNS]:
print(c)
datas = samples.select(c).distinct().collect()
vocabSet = set()
for d in datas:
......@@ -254,7 +256,6 @@ def getDataVocab(samples,model_columns):
pass
for c in model_columns:
print("col",c)
# 判断是否以Bucket结尾
if c.endswith("Bucket"):
datas = samples.select(c).distinct().collect()
......@@ -738,6 +739,7 @@ if __name__ == '__main__':
expDF = spark.sql(expSql)
# ratingDF = samplesNegAndUnion(clickDF,expDF)
ratingDF = clickDF.union(expDF)
print("pos size:"+str(clickDF.count()),"neg size:"+str(expDF.count()))
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\
.withColumnRenamed("device_id", "userid")\
......
......@@ -20,7 +20,7 @@ one_hot_columns = ["item_"+c for c in ["service_type","doctor_type","doctor_famo
# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
VERSION = configUtils.SERVICE_VERSION
trainDay = time.strftime("%Y%m%d", time.localtime())
trainDay = time.strftime("%Y%m%d%H", time.localtime())
data_path_train = "/data/files/service_feature_{}_train.csv".format(VERSION)
data_path_test = "/data/files/service_feature_{}_test.csv".format(VERSION)
model_file = configUtils.SERVICE_MODEL_PATH + "/" + trainDay
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment