Commit 539709e3 authored by 郭羽

美购精排模型

parent 793d205a
...@@ -71,9 +71,10 @@ def addItemFeatures(samples,itemDF): ...@@ -71,9 +71,10 @@ def addItemFeatures(samples,itemDF):
# 统计特征处理 # 统计特征处理
staticFeatures = samples.groupBy('itemid').agg(F.count(F.lit(1)).alias('itemRatingCount'), staticFeatures = samples.groupBy('itemid').agg(F.count(F.lit(1)).alias('itemRatingCount'),
F.format_number(F.avg(F.col('rating')),NUMBER_PRECISION).alias('itemRatingAvg'), F.avg(F.col('rating')).alias('itemRatingAvg'),
F.stddev(F.col('rating')).alias('itemRatingStddev')).fillna(0)\ F.stddev(F.col('rating')).alias('itemRatingStddev')).fillna(0)\
.withColumn('itemRatingStddev', F.format_number(F.col('itemRatingStddev'), NUMBER_PRECISION)) .withColumn('itemRatingStddev', F.format_number(F.col('itemRatingStddev'), NUMBER_PRECISION).cast("float"))\
.withColumn('itemRatingAvg', F.format_number(F.col('itemRatingAvg'), NUMBER_PRECISION).cast("float"))
# join item rating features # join item rating features
samples = samples.join(staticFeatures, on=['itemid'], how='left') samples = samples.join(staticFeatures, on=['itemid'], how='left')
...@@ -119,8 +120,8 @@ def addUserFeatures(samples): ...@@ -119,8 +120,8 @@ def addUserFeatures(samples):
# user历史点击分值统计 # user历史点击分值统计
samples = samples\ samples = samples\
.withColumn('userRatingCount',F.count(F.lit(1)).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1))) \ .withColumn('userRatingCount',F.count(F.lit(1)).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1))) \
.withColumn("userRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION)) \ .withColumn("userRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.withColumn("userRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION)) \ .withColumn("userRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.filter(F.col("userRatingCount") > 1) .filter(F.col("userRatingCount") > 1)
# user偏好 # user偏好
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment