Commit 5b8a4fe3 authored by 郭羽

特征工程优化

parent 4777e78a
...@@ -764,8 +764,6 @@ if __name__ == '__main__': ...@@ -764,8 +764,6 @@ if __name__ == '__main__':
expDF = spark.sql(expSql) expDF = spark.sql(expSql)
# ratingDF = samplesNegAndUnion(clickDF,expDF) # ratingDF = samplesNegAndUnion(clickDF,expDF)
ratingDF = clickDF.union(expDF) ratingDF = clickDF.union(expDF)
print("pos size:"+str(clickDF.count()),"neg size:"+str(expDF.count()))
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\ ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\
.withColumnRenamed("device_id", "userid")\ .withColumnRenamed("device_id", "userid")\
.withColumnRenamed("card_id", "itemid")\ .withColumnRenamed("card_id", "itemid")\
...@@ -782,6 +780,10 @@ if __name__ == '__main__': ...@@ -782,6 +780,10 @@ if __name__ == '__main__':
print("添加label...") print("添加label...")
ratingSamplesWithLabel = addSampleLabel(ratingDF) ratingSamplesWithLabel = addSampleLabel(ratingDF)
posCount = ratingSamplesWithLabel.filter(F.col("label")==1).count()
negCount = ratingSamplesWithLabel.filter(F.col("label")==0).count()
print("pos size:"+str(posCount),"neg size:"+str(negCount))
# 数据字典 # 数据字典
dataVocab = {} dataVocab = {}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment