Commit eff7b4c2 authored by 宋柯

模型调试

parent f3f46616
......@@ -863,7 +863,7 @@ def addDays(n, format="%Y%m%d"):
return (date.today() + timedelta(days=n)).strftime(format)
# Build a list of date strings, one per value of trainDay in
# range(partitionDates). Each entry is produced by addDays() with its default
# format argument ("%Y%m%d" per the addDays signature shown in the hunk header).
# NOTE(review): this is a rendered diff hunk with indentation stripped, and it
# carries two return statements -- presumably the pre-change line followed by
# its replacement; confirm against the raw diff before relying on either one.
def generatePartitionDates(partitionDates):
# Offsets 0 .. -(partitionDates - 1): the first entry is today's date.
return [addDays(-trainDay) for trainDay in range(partitionDates)]
# Offsets -1 .. -partitionDates: the first entry is yesterday's date.
return [addDays(-trainDay - 1) for trainDay in range(partitionDates)]
# Show all columns
pd.set_option('display.max_columns', None)
......@@ -873,8 +873,8 @@ pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth',100)
# Compute the start and end day strings of the click/exposure window, print
# them, and return them as the tuple (startDay, endDay).
# trainDays goes through int(), so any value int() can convert is accepted.
# NOTE(review): this is a rendered diff hunk with indentation stripped; it
# holds two startDay/endDay assignment pairs -- presumably the pre-change pair
# followed by its replacement; confirm against the raw diff.
def get_click_exp_start_end_time(trainDays):
# First pair: window runs from (today - trainDays) through today.
startDay = addDays(-int(trainDays))
endDay = addDays(0)
# Second pair: the same window shifted back one day, ending yesterday.
startDay = addDays(-int(trainDays) - 1)
endDay = addDays(-1)
# flush=True writes the line out immediately rather than leaving it buffered.
print("click_exp_start_end_time: {}, {}".format(startDay, endDay), flush=True)
return startDay, endDay
......@@ -943,7 +943,7 @@ if __name__ == '__main__':
itemEsFeatureDF = get_item_es_feature_df()
# Compute item statistical features
clickStaticFeatures, expStaticFeatures = getItemStaticFeatures(itemStatisticStartDays + trainDays, startDay, endDay)
clickStaticFeatures, expStaticFeatures = getItemStaticFeatures(itemStatisticStartDays + trainDays + 1, startDay, endDay)
#user Profile Feature
userProfileFeatureDF = getUserProfileFeature(spark, addDays(-trainDays - 1, format = "%Y-%m-%d"), addDays(-1, format = "%Y-%m-%d"))
......@@ -957,42 +957,40 @@ if __name__ == '__main__':
.withColumnRenamed("device_id", USER_PREFIX + CATEGORY_PREFIX + "device_id") \
.withColumnRenamed("os", USER_PREFIX + CATEGORY_PREFIX + "os") \
.withColumnRenamed("user_city_id", USER_PREFIX + CATEGORY_PREFIX + "user_city_id") \
.drop("partition_date", "timestamp")
# | -- card_id: string(nullable=true)
# | -- partition_date: string(nullable=true)
# | -- device_id: string(nullable=true)
# | -- timestamp: long(nullable=true)
# | -- os: string(nullable=true)
# | -- user_city_id: string(nullable=true)
.drop("timestamp")
# | -- ITEM_CATEGORY_card_id: string(nullable=false)
# | -- USER_CATEGORY_device_id: string(nullable=false)
# | -- USER_CATEGORY_os: string(nullable=false)
# | -- USER_CATEGORY_user_city_id: string(nullable=false)
# | -- label: integer(nullable=false)
# | -- second_solutions: string(nullable=true)
# | -- second_demands: string(nullable=true)
# | -- second_positions: string(nullable=true)
# | -- projects: string(nullable=true)
# | -- ITEM_NUMERIC_click_count_sum: double(nullable=true)
# | -- ITEM_NUMERIC_click_count_avg: double(nullable=true)
# | -- ITEM_NUMERIC_click_count_stddev: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_sum: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_avg: double(nullable=true)
# | -- ITEM_NUMERIC_exp_count_stddev: double(nullable=true)
# | -- ITEM_NUMERIC_discount: double(nullable=true)
# | -- ITEM_NUMERIC_case_count: long(nullable=true)
# | -- ITEM_NUMERIC_sales_count: long(nullable=true)
# | -- ITEM_CATEGORY_service_type: string(nullable=true)
# | -- ITEM_CATEGORY_merchant_id: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_type: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_id: string(nullable=true)
# | -- ITEM_CATEGORY_doctor_famous: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_id: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_city_tag_id: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_type: string(nullable=true)
# | -- ITEM_CATEGORY_hospital_is_high_quality: string(nullable=true)
# | -- ITEM_CATEGORY_second_demands: string(nullable=true)
# | -- ITEM_CATEGORY_second_solutions: string(nullable=true)
# | -- ITEM_CATEGORY_second_positions: string(nullable=true)
# | -- ITEM_CATEGORY_projects: string(nullable=true)
# | -- ITEM_NUMERIC_sku_price: double(nullable=true)
# | -- USER_MULTI_CATEGORY_second_solutions: string(nullable=false)
# | -- USER_MULTI_CATEGORY_second_demands: string(nullable=false)
# | -- USER_MULTI_CATEGORY_second_positions: string(nullable=false)
# | -- USER_MULTI_CATEGORY_projects: string(nullable=false)
# | -- ITEM_NUMERIC_click_count_sum: double(nullable=false)
# | -- ITEM_NUMERIC_click_count_avg: double(nullable=false)
# | -- ITEM_NUMERIC_click_count_stddev: double(nullable=false)
# | -- ITEM_NUMERIC_exp_count_sum: double(nullable=false)
# | -- ITEM_NUMERIC_exp_count_avg: double(nullable=false)
# | -- ITEM_NUMERIC_exp_count_stddev: double(nullable=false)
# | -- ITEM_NUMERIC_discount: double(nullable=false)
# | -- ITEM_NUMERIC_case_count: long(nullable=false)
# | -- ITEM_NUMERIC_sales_count: long(nullable=false)
# | -- ITEM_CATEGORY_service_type: string(nullable=false)
# | -- ITEM_CATEGORY_merchant_id: string(nullable=false)
# | -- ITEM_CATEGORY_doctor_type: string(nullable=false)
# | -- ITEM_CATEGORY_doctor_id: string(nullable=false)
# | -- ITEM_CATEGORY_doctor_famous: string(nullable=false)
# | -- ITEM_CATEGORY_hospital_id: string(nullable=false)
# | -- ITEM_CATEGORY_hospital_city_tag_id: string(nullable=false)
# | -- ITEM_CATEGORY_hospital_type: string(nullable=false)
# | -- ITEM_CATEGORY_hospital_is_high_quality: string(nullable=false)
# | -- ITEM_CATEGORY_second_demands: string(nullable=false)
# | -- ITEM_CATEGORY_second_solutions: string(nullable=false)
# | -- ITEM_CATEGORY_second_positions: string(nullable=false)
# | -- ITEM_CATEGORY_projects: string(nullable=false)
# | -- ITEM_NUMERIC_sku_price: double(nullable=false)
#
fields = [field.name for field in samples.schema.fields]
multi_categoty_fields = []
......@@ -1010,17 +1008,29 @@ if __name__ == '__main__':
fields_na_value_dict[field] = 0
samples = samples.na.fill(fields_na_value_dict).coalesce(1)
samples.cache()
samples.printSchema()
samples.show(20, False)
test_samples = samples.where("partition_date = '{}'".format(endDay))
train_samples = samples.where("partition_date <> '{}'".format(endDay))
train_samples.cache()
train_samples.show(20, False)
write_time_start = time.time()
for categoty_field in categoty_fields:
output_file = "file:///home/gmuser/" + categoty_field + "_vocab"
samples.select(categoty_field).where(F.col(categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
train_samples.select(categoty_field).where(F.col(categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
for multi_categoty_field in multi_categoty_fields:
output_file = "file:///home/gmuser/" + multi_categoty_field + "_vocab"
samples.selectExpr("explode(split({multi_categoty_field},','))".format(multi_categoty_field = multi_categoty_field)).where(F.col(multi_categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
train_samples.selectExpr("explode(split({multi_categoty_field},','))".format(multi_categoty_field = multi_categoty_field)).where(F.col(multi_categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
output_file = "file:///home/gmuser/train_samples"
train_samples.write.mode("overwrite").options(header="false").csv(output_file)
output_file = "file:///home/gmuser/test_samples"
test_samples.write.mode("overwrite").options(header="false").csv(output_file)
print("训练数据写入 耗时s:{}".format(time.time() - write_time_start))
print("总耗时:{} mins".format((time.time() - start)/60))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment