Commit eed3fb63 authored by 郭羽

update feature

parent 0ab01003
...@@ -153,6 +153,7 @@ def arrayReverse(arr): ...@@ -153,6 +153,7 @@ def arrayReverse(arr):
def addUserFeatures(samples,dataVocab,multiVocab): def addUserFeatures(samples,dataVocab,multiVocab):
dataVocab["userid"] = collectColumnToVocab(samples, "userid") dataVocab["userid"] = collectColumnToVocab(samples, "userid")
dataVocab["user_os"] = ["ios","android","-1"]
extractTagsUdf = F.udf(extractTags, ArrayType(StringType())) extractTagsUdf = F.udf(extractTags, ArrayType(StringType()))
arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType())) arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType()))
...@@ -381,7 +382,7 @@ def getClickSql(start, end): ...@@ -381,7 +382,7 @@ def getClickSql(start, end):
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.page_stay SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.page_stay
FROM FROM
( (
select partition_date,cl_id,business_id as card_id,time_stamp,page_stay select partition_date,cl_id,business_id as card_id,time_stamp,page_stay,cl_type as os
from online.bl_hdfs_maidian_updates from online.bl_hdfs_maidian_updates
where action = 'page_view' where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}' AND partition_date>='{startDay}' and partition_date<='{endDay}'
...@@ -425,7 +426,7 @@ def getClickSql(start, end): ...@@ -425,7 +426,7 @@ def getClickSql(start, end):
def getExposureSql(start, end): def getExposureSql(start, end):
sql = """ sql = """
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp, 0 as page_stay SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp, 0 as page_stay,cl_type as os
from from
( --新首页卡片曝光 ( --新首页卡片曝光
SELECT partition_date,cl_id,card_id,time_stamp SELECT partition_date,cl_id,card_id,time_stamp
...@@ -756,7 +757,6 @@ if __name__ == '__main__': ...@@ -756,7 +757,6 @@ if __name__ == '__main__':
spark.sparkContext.setLogLevel("ERROR") spark.sparkContext.setLogLevel("ERROR")
# 行为数据 # 行为数据
clickSql = getClickSql(startDay,endDay) clickSql = getClickSql(startDay,endDay)
print("--------")
expSql = getExposureSql(startDay,endDay) expSql = getExposureSql(startDay,endDay)
clickDF = spark.sql(clickSql) clickDF = spark.sql(clickSql)
...@@ -766,7 +766,8 @@ if __name__ == '__main__': ...@@ -766,7 +766,8 @@ if __name__ == '__main__':
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\ ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\
.withColumnRenamed("device_id", "userid")\ .withColumnRenamed("device_id", "userid")\
.withColumnRenamed("card_id", "itemid")\ .withColumnRenamed("card_id", "itemid")\
.withColumnRenamed("page_stay", "rating") .withColumnRenamed("page_stay", "rating")\
.withColumnRenamed("os", "user_os")
print(ratingDF.columns) print(ratingDF.columns)
print(ratingDF.show(10, truncate=False)) print(ratingDF.show(10, truncate=False))
...@@ -846,12 +847,14 @@ if __name__ == '__main__': ...@@ -846,12 +847,14 @@ if __name__ == '__main__':
# timestmp8 = int(round(time.time())) # timestmp8 = int(round(time.time()))
# print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7)) # print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7))
# 字典转为json 存入redis # 字典转为json 存入redis
print("数据字典存入redis...") print("数据字典save...")
print("dataVocab:") print("dataVocab:")
print(dataVocab.keys()) print(dataVocab.keys())
vocab_path = "../vocab/{}_vocab.json".format(VERSION)
dataVocabStr = json.dumps(dataVocab, ensure_ascii=False) dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
dataVocabToRedis(dataVocabStr) open(configUtils.VOCAB_PATH,mode='w',encoding='utf-8').write(dataVocabStr)
# dataVocabToRedis(dataVocabStr)
timestmp9 = int(round(time.time())) timestmp9 = int(round(time.time()))
print("总耗时m:{}".format((timestmp9 - start)/60)) print("总耗时m:{}".format((timestmp9 - start)/60))
......
...@@ -183,7 +183,8 @@ if __name__ == '__main__': ...@@ -183,7 +183,8 @@ if __name__ == '__main__':
# redis中加载数据字典 # redis中加载数据字典
print("redis 中加载模型字典...") print("redis 中加载模型字典...")
data_vocab = getDataVocabFromRedis(VERSION) data_vocab = json.load(open(configUtils.VOCAB_PATH,mode='r',encoding='utf-8'))
# data_vocab = getDataVocabFromRedis(VERSION)
assert data_vocab assert data_vocab
timestmp1 = int(round(time.time())) timestmp1 = int(round(time.time()))
......
SERVICE_VERSION="v1" SERVICE_VERSION="v1"
SERVICE_MODEL_PATH = "/srv/apps/tensorServing_models/service" SERVICE_MODEL_PATH = "/srv/apps/tensorServing_models/service"
\ No newline at end of file VOCAB_PATH = "/srv/apps/serviceRec/vocab/{}_vocab.json".format(SERVICE_VERSION)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment