Commit 95279079 authored by 宋柯

模型调试

parent 66e025dc
...@@ -41,6 +41,7 @@ FEATURE_VOCAB_KEY = "Strategy:rec:vocab:service:" + VERSION ...@@ -41,6 +41,7 @@ FEATURE_VOCAB_KEY = "Strategy:rec:vocab:service:" + VERSION
FEATURE_COLUMN_KEY = "Strategy:rec:column:service:" + VERSION FEATURE_COLUMN_KEY = "Strategy:rec:column:service:" + VERSION
ITEM_PREFIX = "ITEM_" ITEM_PREFIX = "ITEM_"
USER_PREFIX = "USER_"
CATEGORY_PREFIX = "CATEGORY_" CATEGORY_PREFIX = "CATEGORY_"
NUMERIC_PREFIX = "NUMERIC_" NUMERIC_PREFIX = "NUMERIC_"
DATA_PATH_TRAIN = "/data/files/service_feature_{}_train.csv".format(VERSION) DATA_PATH_TRAIN = "/data/files/service_feature_{}_train.csv".format(VERSION)
...@@ -292,7 +293,14 @@ def getUserProfileFeature(spark, startDay, endDay): ...@@ -292,7 +293,14 @@ def getUserProfileFeature(spark, startDay, endDay):
return (date.fromisoformat(dt) + timedelta(days = 1)).strftime('%Y%m%d') return (date.fromisoformat(dt) + timedelta(days = 1)).strftime('%Y%m%d')
addOneDay_UDF = F.udf(addOneDay, StringType()) addOneDay_UDF = F.udf(addOneDay, StringType())
userProfileFeatureDF = userProfileFeatureDF.withColumn('partition_date', addOneDay_UDF('dt')).drop('dt') userProfileFeatureDF = userProfileFeatureDF.withColumn('partition_date', addOneDay_UDF('dt'))\
.withColumn("os", USER_PREFIX + CATEGORY_PREFIX + "os")\
.withColumn("user_city_id", USER_PREFIX + CATEGORY_PREFIX + "user_city_id")\
.withColumn("second_solutions", USER_PREFIX + CATEGORY_PREFIX + "second_solutions")\
.withColumn("second_demands", USER_PREFIX + CATEGORY_PREFIX + "second_demands")\
.withColumn("second_positions", USER_PREFIX + CATEGORY_PREFIX + "second_positions")\
.withColumn("projects", USER_PREFIX + CATEGORY_PREFIX + "projects")\
.drop('dt')
userProfileFeatureDF.cache() userProfileFeatureDF.cache()
userProfileFeatureDF.show(20, False) userProfileFeatureDF.show(20, False)
...@@ -946,7 +954,47 @@ if __name__ == '__main__': ...@@ -946,7 +954,47 @@ if __name__ == '__main__':
.join(clickStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\ .join(clickStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\
.join(expStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\ .join(expStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\
.join(itemEsFeatureDF, on = ["card_id"], how = 'left') .join(itemEsFeatureDF, on = ["card_id"], how = 'left')
samples = samples.withColumn("card_id", ITEM_PREFIX + CATEGORY_PREFIX + "card_id")\
.withColumn("device_id", USER_PREFIX + CATEGORY_PREFIX + "device_id")\
.drop("partition_date", "timestamp")
# | -- card_id: string(nullable=true)
# | -- partition_date: string(nullable=true)
# | -- device_id: string(nullable=true)
# | -- timestamp: long(nullable=true)
# | -- os: string(nullable=true)
# | -- user_city_id: string(nullable=true)
# | -- label: integer(nullable=false)
# | -- second_solutions: string(nullable=true)
# | -- second_demands: string(nullable=true)
# | -- second_positions: string(nullable=true)
# | -- projects: string(nullable=true)
# | -- ITEM_NUMERIC_click_count_sum: double(nullable=true)
# | -- ITEM_NUMERIC_click_count_avg: double(nullable=true)
# | -- ITEM_NUMERIC_click_count_stddev: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_sum: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_avg: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_stddev: double(nullable=true)
# | -- ITEM_NUMERIC_discount: double(nullable=true)
# | -- ITEM_NUMERIC_case_count: long(nullable=true)
# | -- ITEM_NUMERIC_sales_count: long(nullable=true)
# | -- ITEM_CATEGORY_service_type: string(nullable=true)
# | -- ITEM_CATEGORY_merchant_id: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_type: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_id: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_famous: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_id: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_city_tag_id: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_type: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_is_high_quality: string(nullable=true)
# | -- ITEM_CATEGORY_second_demands: string(nullable=true)
# | -- ITEM_CATEGORY_second_solutions: string(nullable=true)
# | -- ITEM_CATEGORY_second_positions: string(nullable=true)
# | -- ITEM_CATEGORY_projects: string(nullable=true)
# | -- ITEM_NUMERIC_sku_price: double(nullable=true)
#
print(samples.schema.fields)
print([field.name for field in samples.schema.fields])
samples.printSchema() samples.printSchema()
samples.show(20, False) samples.show(20, False)
sys.exit() sys.exit()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment