Commit 4dcd659a authored by 郭羽

service model 优化

parent bca8a2ed
......@@ -106,8 +106,8 @@ def addStaticsFeatures(samples,dataVocab):
.withColumn('userRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("userRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("userRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.withColumn("userClickCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("userExpCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 0, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("userClickCount", F.format_number(F.sum(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(0)).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("userExpCount", F.format_number(F.sum(when(F.col('label') == 0, F.lit(1)).otherwise(F.lit(0)).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("userCtr", F.format_number(F.col("userClickCount")/(F.col("userExpCount")+1),NUMBER_PRECISION).cast("float")) \
.filter(F.col("userRatingCount") > 1)
......@@ -116,8 +116,8 @@ def addStaticsFeatures(samples,dataVocab):
.withColumn('itemRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("itemRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("itemRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.withColumn("itemClickCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("item_id").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("itemExpCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 0, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("item_id").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("itemClickCount", F.format_number(F.sum(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(0)).over(sql.Window.partitionBy("item_id").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("itemExpCount", F.format_number(F.sum(when(F.col('label') == 0, F.lit(1)).otherwise(F.lit(0)).over(sql.Window.partitionBy("item_id").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
.withColumn("itemCtr", F.format_number(F.col("itemClickCount")/(F.col("itemExpCount")+1),NUMBER_PRECISION).cast("float")) \
# 连续特征分桶
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment