Commit eed3fb63 authored by 郭羽

update feature

parent 0ab01003
......@@ -153,6 +153,7 @@ def arrayReverse(arr):
def addUserFeatures(samples,dataVocab,multiVocab):
dataVocab["userid"] = collectColumnToVocab(samples, "userid")
dataVocab["user_os"] = ["ios","android","-1"]
extractTagsUdf = F.udf(extractTags, ArrayType(StringType()))
arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType()))
......@@ -381,7 +382,7 @@ def getClickSql(start, end):
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.page_stay
FROM
(
select partition_date,cl_id,business_id as card_id,time_stamp,page_stay
select partition_date,cl_id,business_id as card_id,time_stamp,page_stay,cl_type as os
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
......@@ -425,7 +426,7 @@ def getClickSql(start, end):
def getExposureSql(start, end):
sql = """
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp, 0 as page_stay
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp, 0 as page_stay,cl_type as os
from
( --新首页卡片曝光
SELECT partition_date,cl_id,card_id,time_stamp
......@@ -756,7 +757,6 @@ if __name__ == '__main__':
spark.sparkContext.setLogLevel("ERROR")
# 行为数据
clickSql = getClickSql(startDay,endDay)
print("--------")
expSql = getExposureSql(startDay,endDay)
clickDF = spark.sql(clickSql)
......@@ -766,7 +766,8 @@ if __name__ == '__main__':
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\
.withColumnRenamed("device_id", "userid")\
.withColumnRenamed("card_id", "itemid")\
.withColumnRenamed("page_stay", "rating")
.withColumnRenamed("page_stay", "rating")\
.withColumnRenamed("os", "user_os")
print(ratingDF.columns)
print(ratingDF.show(10, truncate=False))
......@@ -846,12 +847,14 @@ if __name__ == '__main__':
# timestmp8 = int(round(time.time()))
# print("数据字典生成 耗时s:{}".format(timestmp8 - timestmp7))
# 字典转为json 存入redis
print("数据字典存入redis...")
print("数据字典save...")
print("dataVocab:")
print(dataVocab.keys())
vocab_path = "../vocab/{}_vocab.json".format(VERSION)
dataVocabStr = json.dumps(dataVocab, ensure_ascii=False)
dataVocabToRedis(dataVocabStr)
open(configUtils.VOCAB_PATH,mode='w',encoding='utf-8').write(dataVocabStr)
# dataVocabToRedis(dataVocabStr)
timestmp9 = int(round(time.time()))
print("总耗时m:{}".format((timestmp9 - start)/60))
......
......@@ -183,7 +183,8 @@ if __name__ == '__main__':
# redis中加载数据字典
print("redis 中加载模型字典...")
data_vocab = getDataVocabFromRedis(VERSION)
data_vocab = json.load(open(configUtils.VOCAB_PATH,mode='r',encoding='utf-8'))
# data_vocab = getDataVocabFromRedis(VERSION)
assert data_vocab
timestmp1 = int(round(time.time()))
......
SERVICE_VERSION="v1"
SERVICE_MODEL_PATH = "/srv/apps/tensorServing_models/service"
\ No newline at end of file
SERVICE_MODEL_PATH = "/srv/apps/tensorServing_models/service"
VOCAB_PATH = "/srv/apps/serviceRec/vocab/{}_vocab.json".format(SERVICE_VERSION)
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment