Commit 58f22a26 authored by 郭羽's avatar 郭羽

美购精排模型耗时优化

parent 57937d89
...@@ -311,8 +311,8 @@ def featuresToRedis(samples,columns,prefix,redisKey): ...@@ -311,8 +311,8 @@ def featuresToRedis(samples,columns,prefix,redisKey):
k = d[idCol] k = d[idCol]
v = json.dumps(d.asDict(), ensure_ascii=False) v = json.dumps(d.asDict(), ensure_ascii=False)
newKey = redisKey + k newKey = redisKey + k
conn.set(newKey, v) # conn.set(newKey, v)
conn.expire(newKey, 60 * 60 * 24 * 7) # conn.expire(newKey, 60 * 60 * 24 * 7)
#根据timestamp获取每个user最新的记录 #根据timestamp获取每个user最新的记录
prefixSamples = samples.groupBy(idCol).agg(F.max("timestamp").alias(timestampCol)) prefixSamples = samples.groupBy(idCol).agg(F.max("timestamp").alias(timestampCol))
...@@ -698,30 +698,30 @@ if __name__ == '__main__': ...@@ -698,30 +698,30 @@ if __name__ == '__main__':
timestmp6 = int(round(time.time())) timestmp6 = int(round(time.time()))
print("item feature to redis 耗时s:{}".format(timestmp6 - timestmp5)) print("item feature to redis 耗时s:{}".format(timestmp6 - timestmp5))
# model columns # # model columns
print("model columns to redis...") # print("model columns to redis...")
model_columns = user_columns + item_columns # model_columns = user_columns + item_columns
featureColumnsToRedis(model_columns) # featureColumnsToRedis(model_columns)
#
train_columns = model_columns + ["label", "timestamp"] # train_columns = model_columns + ["label", "timestamp"]
trainSamples = samplesWithUserFeatures.select(*train_columns) # trainSamples = samplesWithUserFeatures.select(*train_columns)
print("write to hdfs start...") # print("write to hdfs start...")
splitTimestamp = int(time.mktime(time.strptime(endDay, "%Y%m%d"))) # splitTimestamp = int(time.mktime(time.strptime(endDay, "%Y%m%d")))
splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH) # splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH)
print("write to hdfs success...") # print("write to hdfs success...")
timestmp7 = int(round(time.time())) # timestmp7 = int(round(time.time()))
print("数据写入hdfs 耗时s:{}".format(timestmp7 - timestmp6)) # print("数据写入hdfs 耗时s:{}".format(timestmp7 - timestmp6))
#
# 离散数据字典生成 # # 离散数据字典生成
print("数据字典生成...") # print("数据字典生成...")
dataVocab = getDataVocab(samplesWithUserFeatures) # dataVocab = getDataVocab(samplesWithUserFeatures)
timestmp8 = int(round(time.time())) # timestmp8 = int(round(time.time()))
print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7)) # print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7))
# 字典转为json 存入redis # # 字典转为json 存入redis
print("数据字典存入redis...") # print("数据字典存入redis...")
dataVocabStr = json.dumps(dataVocab, ensure_ascii=False) # dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
dataVocabToRedis(dataVocabStr) # dataVocabToRedis(dataVocabStr)
timestmp9 = int(round(time.time())) # timestmp9 = int(round(time.time()))
print("总耗时s:{}".format(timestmp9 - timestmp8)) # print("总耗时s:{}".format(timestmp9 - timestmp8))
spark.stop() spark.stop()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment