Commit 76fdb06e authored by 郭羽's avatar 郭羽

service model 优化

parent dc6b471e
...@@ -277,41 +277,6 @@ def collectMutiColumnToVocab(samples,column): ...@@ -277,41 +277,6 @@ def collectMutiColumnToVocab(samples,column):
tagSet.add("-1") # 空值默认 tagSet.add("-1") # 空值默认
return list(tagSet) return list(tagSet)
def getDataVocab(samples,model_columns):
    """Build a vocabulary (list of distinct string values) per model column.

    Args:
        samples: DataFrame-like object; the code calls ``drop``, ``select``,
            ``distinct`` and ``collect`` on it (presumably a Spark
            DataFrame -- confirm against the caller).
        model_columns: iterable of column-name strings the model consumes.

    Returns:
        dict mapping column name -> list of vocabulary entries. Contains
        "userid", every column ending in "Bucket", and every column whose
        name contains a key of ``ITEM_MULTI_COLUMN_EXTRA_MAP``.
    """
    dataVocab = {}
    multiVocab = {}
    # Multi-valued features: collect each vocabulary, then drop the column
    # from the working frame.
    for c in ITEM_MULTI_COLUMN_EXTRA_MAP.keys():
        print(c)
        multiVocab[c] = collectMutiColumnToVocab(samples, c)
        samples = samples.drop(c)
    # ID features and categorical features.
    for c in ["userid"]:
        print(c)
        dataVocab[c] = collectColumnToVocab(samples, c)
    for c in model_columns:
        # Columns whose name ends with "Bucket" get their distinct values.
        if c.endswith("Bucket"):
            rows = samples.select(c).distinct().collect()
            # Skip only missing values (None / empty string). A plain
            # truthiness test would also discard a legitimate bucket
            # value of 0 / 0.0.
            vocabSet = {
                str(row[c])
                for row in rows
                if row[c] is not None and row[c] != ""
            }
            vocabSet.add("-1")  # default entry for empty values
            dataVocab[c] = list(vocabSet)
        else:
            # Multi-valued discrete column: reuse the vocabulary of any
            # multi-value key contained in the column name. No early exit,
            # so the last matching key wins.
            for cc, v in multiVocab.items():
                if cc in c:
                    dataVocab[c] = v
    return dataVocab
def dataVocabToRedis(dataVocab): def dataVocabToRedis(dataVocab):
conn = getRedisConn() conn = getRedisConn()
conn.set(FEATURE_VOCAB_KEY,dataVocab) conn.set(FEATURE_VOCAB_KEY,dataVocab)
...@@ -777,7 +742,8 @@ if __name__ == '__main__': ...@@ -777,7 +742,8 @@ if __name__ == '__main__':
.withColumnRenamed("device_id", "userid")\ .withColumnRenamed("device_id", "userid")\
.withColumnRenamed("card_id", "item_id")\ .withColumnRenamed("card_id", "item_id")\
.withColumnRenamed("page_stay", "rating")\ .withColumnRenamed("page_stay", "rating")\
.withColumnRenamed("os", "user_os") .withColumnRenamed("os", "user_os")\
.withColumn("user_city_id", F.when(F.col("user_city_id").isNull(), "-1").otherwise(F.col("user_city_id")))
print(ratingDF.columns) print(ratingDF.columns)
print(ratingDF.show(10, truncate=False)) print(ratingDF.show(10, truncate=False))
...@@ -791,16 +757,12 @@ if __name__ == '__main__': ...@@ -791,16 +757,12 @@ if __name__ == '__main__':
negCount = df.loc[df["label"]==1]["label"].count() negCount = df.loc[df["label"]==1]["label"].count()
print("pos size:"+str(posCount),"neg size:"+str(negCount)) print("pos size:"+str(posCount),"neg size:"+str(negCount))
#统计数据处理
ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel)
itemDF = get_service_feature_df() itemDF = get_service_feature_df()
print(itemDF.columns) print(itemDF.columns)
print(itemDF.show(10, truncate=False)) print(itemDF.show(10, truncate=False))
# itemDF.to_csv("/tmp/service_{}.csv".format(endDay)) # itemDF.to_csv("/tmp/service_{}.csv".format(endDay))
# df.to_csv("/tmp/service_train_{}.csv".format(endDay)) # df.to_csv("/tmp/service_train_{}.csv".format(endDay))
# 数据字典 # 数据字典
dataVocab = {} dataVocab = {}
multiVocab = {} multiVocab = {}
...@@ -813,6 +775,9 @@ if __name__ == '__main__': ...@@ -813,6 +775,9 @@ if __name__ == '__main__':
print("multiVocab:") print("multiVocab:")
print(multiVocab.keys()) print(multiVocab.keys())
# 统计数据处理
ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel,dataVocab)
samples = ratingSamplesWithLabel.join(itemDF, on=['item_id'], how='inner') samples = ratingSamplesWithLabel.join(itemDF, on=['item_id'], how='inner')
print("处理user特征...") print("处理user特征...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment