Commit 58f22a26 authored by 郭羽

美购精排模型耗时优化

parent 57937d89
......@@ -311,8 +311,8 @@ def featuresToRedis(samples,columns,prefix,redisKey):
k = d[idCol]
v = json.dumps(d.asDict(), ensure_ascii=False)
newKey = redisKey + k
conn.set(newKey, v)
conn.expire(newKey, 60 * 60 * 24 * 7)
# conn.set(newKey, v)
# conn.expire(newKey, 60 * 60 * 24 * 7)
#根据timestamp获取每个user最新的记录
prefixSamples = samples.groupBy(idCol).agg(F.max("timestamp").alias(timestampCol))
......@@ -698,30 +698,30 @@ if __name__ == '__main__':
timestmp6 = int(round(time.time()))
print("item feature to redis 耗时s:{}".format(timestmp6 - timestmp5))
# model columns
print("model columns to redis...")
model_columns = user_columns + item_columns
featureColumnsToRedis(model_columns)
train_columns = model_columns + ["label", "timestamp"]
trainSamples = samplesWithUserFeatures.select(*train_columns)
print("write to hdfs start...")
splitTimestamp = int(time.mktime(time.strptime(endDay, "%Y%m%d")))
splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH)
print("write to hdfs success...")
timestmp7 = int(round(time.time()))
print("数据写入hdfs 耗时s:{}".format(timestmp7 - timestmp6))
# 离散数据字典生成
print("数据字典生成...")
dataVocab = getDataVocab(samplesWithUserFeatures)
timestmp8 = int(round(time.time()))
print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7))
# 字典转为json 存入redis
print("数据字典存入redis...")
dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
dataVocabToRedis(dataVocabStr)
timestmp9 = int(round(time.time()))
print("总耗时s:{}".format(timestmp9 - timestmp8))
# # model columns
# print("model columns to redis...")
# model_columns = user_columns + item_columns
# featureColumnsToRedis(model_columns)
#
# train_columns = model_columns + ["label", "timestamp"]
# trainSamples = samplesWithUserFeatures.select(*train_columns)
# print("write to hdfs start...")
# splitTimestamp = int(time.mktime(time.strptime(endDay, "%Y%m%d")))
# splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH)
# print("write to hdfs success...")
# timestmp7 = int(round(time.time()))
# print("数据写入hdfs 耗时s:{}".format(timestmp7 - timestmp6))
#
# # 离散数据字典生成
# print("数据字典生成...")
# dataVocab = getDataVocab(samplesWithUserFeatures)
# timestmp8 = int(round(time.time()))
# print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7))
# # 字典转为json 存入redis
# print("数据字典存入redis...")
# dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
# dataVocabToRedis(dataVocabStr)
# timestmp9 = int(round(time.time()))
# print("总耗时s:{}".format(timestmp9 - timestmp8))
spark.stop()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment