Commit ad996d28 authored by 郭羽

特征工程优化

parent 67de0eae
...@@ -93,8 +93,6 @@ def addItemFeatures(samples,itemDF,dataVocab,multiVocab): ...@@ -93,8 +93,6 @@ def addItemFeatures(samples,itemDF,dataVocab,multiVocab):
itemDF = itemDF.withColumn(c, F.when(F.col(c).isNull(), "-1").otherwise(F.col(c))) itemDF = itemDF.withColumn(c, F.when(F.col(c).isNull(), "-1").otherwise(F.col(c)))
multiVocab[c] = collectMutiColumnToVocab(itemDF, c) multiVocab[c] = collectMutiColumnToVocab(itemDF, c)
itemDF = itemDF.drop(c)
for i in range(1, v + 1): for i in range(1, v + 1):
new_c = ITEM_PREFIX + c + "__" + str(i) new_c = ITEM_PREFIX + c + "__" + str(i)
itemDF = itemDF.withColumn(new_c, F.split(F.col(c), ",")[i - 1]) itemDF = itemDF.withColumn(new_c, F.split(F.col(c), ",")[i - 1])
...@@ -102,6 +100,9 @@ def addItemFeatures(samples,itemDF,dataVocab,multiVocab): ...@@ -102,6 +100,9 @@ def addItemFeatures(samples,itemDF,dataVocab,multiVocab):
dataVocab[new_c] = multiVocab[c] dataVocab[new_c] = multiVocab[c]
itemDF = itemDF.drop(c)
samples = samples.join(itemDF, on=['itemid'], how='inner') samples = samples.join(itemDF, on=['itemid'], how='inner')
# 统计特征处理 # 统计特征处理
print("统计特征处理...") print("统计特征处理...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment