Commit 50ce4106 authored by 郭羽's avatar 郭羽

update feature

parent 14181a35
......@@ -822,24 +822,11 @@ if __name__ == '__main__':
open(configUtils.VOCAB_PATH, mode='w', encoding='utf-8').write(dataVocabStr)
# dataVocabToRedis(dataVocabStr)
"""训练数据保存 ======================================"""
timestmp3 = int(round(time.time()))
train_columns = model_columns + ["label", "timestamp", "rating"]
trainSamples = samplesWithUserFeatures.select(*train_columns)
print("write to hdfs start...")
splitTimestamp = int(time.mktime(time.strptime(addDays(0), "%Y%m%d")))
splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH)
print("write to hdfs success...")
timestmp4 = int(round(time.time()))
print("数据写入hdfs 耗时s:{}".format(timestmp4 - timestmp3))
"""特征数据存入redis======================================"""
# user特征数据存入redis
featuresToRedis(samplesWithUserFeatures, user_columns, "user", FEATURE_USER_KEY)
timestmp5 = int(round(time.time()))
print("user feature to redis 耗时s:{}".format(timestmp5 - timestmp4))
print("user feature to redis 耗时s:{}".format(timestmp5 - timestmp3))
# userDatas = collectFeaturesToDict(samplesWithUserFeatures, user_columns, "user")
# featureToRedis(FEATURE_USER_KEY, userDatas)
# itemDatas = collectFeaturesToDict(samplesWithUserFeatures, item_columns, "item")
......@@ -851,6 +838,17 @@ if __name__ == '__main__':
timestmp6 = int(round(time.time()))
print("item feature to redis 耗时s:{}".format(timestmp6 - timestmp5))
print("总耗时m:{}".format((timestmp6 - start)/60))
"""训练数据保存 ======================================"""
timestmp3 = int(round(time.time()))
train_columns = model_columns + ["label", "timestamp", "rating"]
trainSamples = samplesWithUserFeatures.select(*train_columns)
print("write to hdfs start...")
splitTimestamp = int(time.mktime(time.strptime(addDays(0), "%Y%m%d")))
splitAndSaveTrainingTestSamplesByTimeStamp(trainSamples, splitTimestamp, TRAIN_FILE_PATH)
print("write to hdfs success...")
timestmp4 = int(round(time.time()))
print("数据写入hdfs 耗时s:{}".format(timestmp4 - timestmp3))
print("总耗时m:{}".format((timestmp4 - start)/60))
spark.stop()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment