Commit a6862152 authored by 宋柯

模型调试

parent db49c69d
...@@ -43,6 +43,7 @@ FEATURE_COLUMN_KEY = "Strategy:rec:column:service:" + VERSION ...@@ -43,6 +43,7 @@ FEATURE_COLUMN_KEY = "Strategy:rec:column:service:" + VERSION
ITEM_PREFIX = "ITEM_" ITEM_PREFIX = "ITEM_"
USER_PREFIX = "USER_" USER_PREFIX = "USER_"
CATEGORY_PREFIX = "CATEGORY_" CATEGORY_PREFIX = "CATEGORY_"
MULTI_CATEGORY_PREFIX = "MULTI_CATEGORY_"
NUMERIC_PREFIX = "NUMERIC_" NUMERIC_PREFIX = "NUMERIC_"
DATA_PATH_TRAIN = "/data/files/service_feature_{}_train.csv".format(VERSION) DATA_PATH_TRAIN = "/data/files/service_feature_{}_train.csv".format(VERSION)
...@@ -221,13 +222,17 @@ def itemEsFeaturesProcess(itemDF, spark): ...@@ -221,13 +222,17 @@ def itemEsFeaturesProcess(itemDF, spark):
item_es_feature_start_time = int(round(time.time())) item_es_feature_start_time = int(round(time.time()))
item_categoty_cols = ['id', 'service_type', 'merchant_id', 'doctor_type', 'doctor_id', item_categoty_cols = ['id', 'service_type', 'merchant_id', 'doctor_type', 'doctor_id',
'doctor_famous', 'hospital_id', 'hospital_city_tag_id', 'hospital_type', 'hospital_is_high_quality', 'doctor_famous', 'hospital_id', 'hospital_city_tag_id', 'hospital_type', 'hospital_is_high_quality']
'tags_v3', 'second_demands', 'second_solutions', 'second_positions'] item_multi_categots_cols =['tags_v3', 'second_demands', 'second_solutions', 'second_positions']
for item_categoty_col in item_categoty_cols: for item_categoty_col in item_categoty_cols:
itemDF[ITEM_PREFIX + CATEGORY_PREFIX + item_categoty_col] = itemDF[item_categoty_col] itemDF[ITEM_PREFIX + CATEGORY_PREFIX + item_categoty_col] = itemDF[item_categoty_col]
itemDF = itemDF.drop(columns = item_categoty_cols) itemDF = itemDF.drop(columns = item_categoty_cols)
for item_multi_categots_col in item_multi_categots_cols:
itemDF[ITEM_PREFIX + MULTI_CATEGORY_PREFIX + item_multi_categots_col] = itemDF[item_multi_categots_col]
itemDF = itemDF.drop(columns = item_multi_categots_cols)
item_numeric_cols = ['case_count', 'sales_count', 'discount', 'sku_price'] item_numeric_cols = ['case_count', 'sales_count', 'discount', 'sku_price']
for item_numeric_col in item_numeric_cols: for item_numeric_col in item_numeric_cols:
itemDF[ITEM_PREFIX + NUMERIC_PREFIX + item_numeric_col] = itemDF[item_numeric_col] itemDF[ITEM_PREFIX + NUMERIC_PREFIX + item_numeric_col] = itemDF[item_numeric_col]
...@@ -294,12 +299,10 @@ def getUserProfileFeature(spark, startDay, endDay): ...@@ -294,12 +299,10 @@ def getUserProfileFeature(spark, startDay, endDay):
addOneDay_UDF = F.udf(addOneDay, StringType()) addOneDay_UDF = F.udf(addOneDay, StringType())
userProfileFeatureDF = userProfileFeatureDF.withColumn('partition_date', addOneDay_UDF('dt'))\ userProfileFeatureDF = userProfileFeatureDF.withColumn('partition_date', addOneDay_UDF('dt'))\
.withColumnRenamed("os", USER_PREFIX + CATEGORY_PREFIX + "os")\ .withColumnRenamed("second_solutions", USER_PREFIX + MULTI_CATEGORY_PREFIX + "second_solutions")\
.withColumnRenamed("user_city_id", USER_PREFIX + CATEGORY_PREFIX + "user_city_id")\ .withColumnRenamed("second_demands", USER_PREFIX + MULTI_CATEGORY_PREFIX + "second_demands")\
.withColumnRenamed("second_solutions", USER_PREFIX + CATEGORY_PREFIX + "second_solutions")\ .withColumnRenamed("second_positions", USER_PREFIX + MULTI_CATEGORY_PREFIX + "second_positions")\
.withColumnRenamed("second_demands", USER_PREFIX + CATEGORY_PREFIX + "second_demands")\ .withColumnRenamed("projects", USER_PREFIX + MULTI_CATEGORY_PREFIX + "projects")\
.withColumnRenamed("second_positions", USER_PREFIX + CATEGORY_PREFIX + "second_positions")\
.withColumnRenamed("projects", USER_PREFIX + CATEGORY_PREFIX + "projects")\
.drop('dt') .drop('dt')
userProfileFeatureDF.cache() userProfileFeatureDF.cache()
userProfileFeatureDF.show(20, False) userProfileFeatureDF.show(20, False)
...@@ -751,7 +754,7 @@ def init_es_query(): ...@@ -751,7 +754,7 @@ def init_es_query():
}, },
"query": { "query": {
"bool": { "bool": {
"must": [{"term": {"is_online": True}}], "must": [],
"must_not": [], "must_not": [],
"should": [] "should": []
} }
...@@ -955,7 +958,9 @@ if __name__ == '__main__': ...@@ -955,7 +958,9 @@ if __name__ == '__main__':
.join(expStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\ .join(expStaticFeatures, on = ["card_id", "partition_date"], how = 'left')\
.join(itemEsFeatureDF, on = ["card_id"], how = 'left') .join(itemEsFeatureDF, on = ["card_id"], how = 'left')
samples = samples.withColumnRenamed("card_id", ITEM_PREFIX + CATEGORY_PREFIX + "card_id")\ samples = samples.withColumnRenamed("card_id", ITEM_PREFIX + CATEGORY_PREFIX + "card_id")\
.withColumnRenamed("device_id", USER_PREFIX + CATEGORY_PREFIX + "device_id")\ .withColumnRenamed("device_id", USER_PREFIX + CATEGORY_PREFIX + "device_id") \
.withColumnRenamed("os", USER_PREFIX + CATEGORY_PREFIX + "os") \
.withColumnRenamed("user_city_id", USER_PREFIX + CATEGORY_PREFIX + "user_city_id") \
.drop("partition_date", "timestamp") .drop("partition_date", "timestamp")
# | -- card_id: string(nullable=true) # | -- card_id: string(nullable=true)
...@@ -993,8 +998,19 @@ if __name__ == '__main__': ...@@ -993,8 +998,19 @@ if __name__ == '__main__':
# | -- ITEM_CATEGORY_projects: string(nullable=true) # | -- ITEM_CATEGORY_projects: string(nullable=true)
# | -- ITEM_NUMERIC_sku_price: double(nullable=true) # | -- ITEM_NUMERIC_sku_price: double(nullable=true)
# #
print(samples.schema.fields) fields = [field.name for field in samples.schema.fields]
print([field.name for field in samples.schema.fields])
# Choose a null replacement for every feature column from its name prefix:
#   *_CATEGORY_*        -> '-1'
#   *_MULTI_CATEGORY_*  -> '-1' (or a one-element array holding '-1' when the
#                          column is array-typed)
#   *_NUMERIC_*         -> 0
# Columns matching none of the prefixes are left untouched.
category_prefixes = (ITEM_PREFIX + CATEGORY_PREFIX, USER_PREFIX + CATEGORY_PREFIX)
multi_category_prefixes = (ITEM_PREFIX + MULTI_CATEGORY_PREFIX, USER_PREFIX + MULTI_CATEGORY_PREFIX)
numeric_prefixes = (ITEM_PREFIX + NUMERIC_PREFIX, USER_PREFIX + NUMERIC_PREFIX)
fields_na_value_dict = {}
array_fields = []
for field in fields:
    if field.startswith(category_prefixes):
        fields_na_value_dict[field] = '-1'
    elif field.startswith(multi_category_prefixes):
        # na.fill only accepts scalar replacements (int, float, string, bool);
        # a list such as ['-1'] is rejected, so array-typed columns are
        # collected here and filled separately below.
        if samples.schema[field].dataType.typeName() == 'array':
            array_fields.append(field)
        else:
            fields_na_value_dict[field] = '-1'
    elif field.startswith(numeric_prefixes):
        fields_na_value_dict[field] = 0
# DataFrames are immutable: na.fill returns a new DataFrame, so the result must
# be assigned back or the fill is silently lost.
samples = samples.na.fill(fields_na_value_dict)
for field in array_fields:
    # Cast the default to the column's own type so coalesce sees matching types.
    # NOTE(review): assumes the element type can be cast from the string '-1' - confirm.
    column_type = samples.schema[field].dataType
    samples = samples.withColumn(
        field, F.coalesce(F.col(field), F.array(F.lit('-1')).cast(column_type)))
samples.printSchema() samples.printSchema()
samples.show(20, False) samples.show(20, False)
sys.exit() sys.exit()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment