Commit 0e345a29 authored by 郭羽

ctr 平滑

parent 6f7df6fc
...@@ -83,7 +83,7 @@ numberToBucketUdf = F.udf(numberToBucket, StringType()) ...@@ -83,7 +83,7 @@ numberToBucketUdf = F.udf(numberToBucket, StringType())
priceToBucketUdf = F.udf(priceToBucket, StringType()) priceToBucketUdf = F.udf(priceToBucket, StringType())
def addItemStaticFeatures(samples,itemDF,dataVocab): def addItemStaticFeatures(samples,itemDF,dataVocab):
ctrUdf = F.udf(wilson_ctr, ArrayType(float())) ctrUdf = F.udf(wilson_ctr, FloatType())
# item不设置over窗口,原因:item可能一直存在,统计数据按照最新即可 # item不设置over窗口,原因:item可能一直存在,统计数据按照最新即可
print("item统计特征处理...") print("item统计特征处理...")
staticFeatures = samples.groupBy('item_id').agg(F.count(F.lit(1)).alias('itemRatingCount'), staticFeatures = samples.groupBy('item_id').agg(F.count(F.lit(1)).alias('itemRatingCount'),
...@@ -244,7 +244,7 @@ def addUserFeatures(samples,dataVocab,multiVocab): ...@@ -244,7 +244,7 @@ def addUserFeatures(samples,dataVocab,multiVocab):
extractTagsUdf = F.udf(extractTags, ArrayType(StringType())) extractTagsUdf = F.udf(extractTags, ArrayType(StringType()))
arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType())) arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType()))
ctrUdf = F.udf(wilson_ctr, ArrayType(float())) ctrUdf = F.udf(wilson_ctr, FloatType())
print("user历史数据处理...") print("user历史数据处理...")
# user历史记录 # user历史记录
samples = samples.withColumn('userPositiveHistory',F.collect_list(when(F.col('label') == 1, F.col('item_id')).otherwise(F.lit(None))).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))) samples = samples.withColumn('userPositiveHistory',F.collect_list(when(F.col('label') == 1, F.col('item_id')).otherwise(F.lit(None))).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1)))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment