Commit bca8a2ed authored by 郭羽

service model 优化

parent b887c0c8
......@@ -103,7 +103,7 @@ priceToBucketUdf = F.udf(priceToBucket, FloatType())
def addStaticsFeatures(samples,dataVocab):
print("user统计特征处理...")
samples = samples \
.withColumn('userRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1))), NUMBER_PRECISION).cast("float") \
.withColumn('userRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("userRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("userRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('userid').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.withColumn("userClickCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("userid").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
......@@ -113,7 +113,7 @@ def addStaticsFeatures(samples,dataVocab):
print("item统计特征处理...")
samples = samples \
.withColumn('itemRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1))), NUMBER_PRECISION).cast("float") \
.withColumn('itemRatingCount',F.format_number(F.count(F.lit(1)).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("itemRatingAvg", F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION).cast("float")) \
.withColumn("itemRatingStddev", F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy('item_id').orderBy('timestamp').rowsBetween(-100, -1)),NUMBER_PRECISION).cast("float")) \
.withColumn("itemClickCount", F.format_number(F.count(F.collect_list(when(F.col('label') == 1, F.lit(1)).otherwise(F.lit(None))).over(sql.Window.partitionBy("item_id").orderBy(F.col("timestamp")).rowsBetween(-100, -1))),NUMBER_PRECISION).cast("float")) \
......@@ -775,7 +775,13 @@ if __name__ == '__main__':
timestmp2 = int(round(time.time()))
print("处理item特征, 耗时s:{}".format(timestmp2 - timestmp1))
print("multiVocab:")
print(multiVocab.keys())
for k,v in multiVocab.items():
print(k,len(v))
print("dataVocab:")
for k, v in dataVocab.items():
print(k, len(v))
itemDF_spark = spark.createDataFrame(itemDF)
itemDF_spark.printSchema()
itemDF_spark.show(10, truncate=False)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment