Commit 03735f88 authored by 宋柯

模型调试

parent 7d5ccd4b
......@@ -291,10 +291,20 @@ def wilson_ctr(num_pv, num_click):
score = (p + z*z/(2*n) - z*math.sqrt((p*(1.0 - p) + z*z /(4.0*n))/n)) / (1.0 + z*z/n);
return float(score);
def addUserFeatures(samples,dataVocab,multiVocab):
dataVocab["userid"] = collectColumnToVocab(samples, "userid")
dataVocab["user_city_id"] = collectColumnToVocab(samples, "user_city_id")
dataVocab["user_os"] = ["ios","android"]
def getUserProfileFeature(samples_iEsF_iStatisticF, spark, startDay, endDay):
    """Load user-portrait tag columns from the doris_olap MySQL database.

    Reads rows of ``user_tag3_portrait`` whose ``date`` lies in the inclusive
    range ``[startDay, endDay]`` over JDBC, prints the first 100 rows for
    debugging, and returns the resulting DataFrame.

    Args:
        samples_iEsF_iStatisticF: Currently unused; kept so existing call
            sites keep working.
        spark: Session object exposing ``spark.read.jdbc``.
        startDay: Lower bound (inclusive) substituted verbatim into the SQL.
        endDay: Upper bound (inclusive) substituted verbatim into the SQL.

    Returns:
        The DataFrame produced by ``spark.read.jdbc``.
    """
    # Connect to the doris_olap database.
    # The placeholders are named, so the values must be passed by keyword;
    # positional ``.format(startDay, endDay)`` raises KeyError.
    query = """
    select date as dt, cl_id as device_id, first_solutions, second_solutions, first_demands, second_demands, first_positions, second_positions, projects
    from user_tag3_portrait
    where date >= {startDay} and date <= {endDay}
    """.format(startDay=startDay, endDay=endDay)

    # The JDBC ``table`` argument must be something valid in a FROM clause,
    # so a full SELECT statement has to be wrapped as an aliased subquery.
    table_query = "({}) as user_profile_query".format(query)

    # NOTE(review): credentials are hard-coded here; they should be moved to
    # configuration or a secret store.
    connection_properties = {
        'user': 'doris_olap',
        'password': 'bA27hXasdfswuolap',
        'driver': 'com.mysql.jdbc.Driver',
    }

    # The partitioned-read arguments (column / numPartitions) are omitted on
    # purpose: PySpark rejects ``column`` without lowerBound and upperBound,
    # and 'cl_id' is not part of the projected result (it is aliased to
    # device_id).
    userProfileFeatureDF = spark.read.jdbc(
        'jdbc:mysql://172.16.30.136:3306/doris_olap',
        table_query,
        properties=connection_properties,
    )
    userProfileFeatureDF.show(100, False)
    # Return the frame so callers that assign the result receive the data.
    return userProfileFeatureDF
def addUserFeatures(samples):
extractTagsUdf = F.udf(extractTags, ArrayType(StringType()))
arrayReverseUdf = F.udf(arrayReverse, ArrayType(StringType()))
......@@ -307,7 +317,6 @@ def addUserFeatures(samples,dataVocab,multiVocab):
for i in range(1,11):
samples = samples.withColumn("userRatedHistory"+str(i), F.when(F.col("userPositiveHistory")[i-1].isNotNull(),F.col("userPositiveHistory")[i-1]).otherwise("-1"))
dataVocab["userRatedHistory"+str(i)] = dataVocab["item_id"]
samples = samples.drop("userPositiveHistory")
# user偏好
......@@ -923,7 +932,8 @@ if __name__ == '__main__':
spark = get_spark("SERVICE_FEATURE_CSV_EXPORT_SK")
spark.sparkContext.setLogLevel("ERROR")
getUserProfileFeature(None, spark, addDays(-trainDays - 1), addDays(-1))
sys.exit()
#获取点击曝光数据
clickDF, expDF, ratingDF, startDay, endDay = get_click_exp_rating_df(trainDays, spark)
......@@ -941,18 +951,13 @@ if __name__ == '__main__':
.join(expStaticFeatures, on = ["item_id", "partition_date"], how = 'left')\
.join(itemEsFeatureDF, on = ["item_id"], how = 'left')
#item 统计 特征 Process
samples_iEsF_iStatisticF = itemStatisticFeaturesProcess(samples_iEsF_iStatisticF)
sys.exit(1)
# 统计数据处理
# ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel,dataVocab)
#user profile feature
samplesWithUserFeatures = getUserProfileFeature(samples_iEsF_iStatisticF, spark)
# samples = ratingDF.join(itemStaticDF, on=['item_id'], how='inner')
print("处理user特征...")
samplesWithUserFeatures = addUserFeatures(samples,dataVocab,multiVocab)
timestmp3 = int(round(time.time()))
print("处理user特征, 耗时s:{}".format(timestmp3 - timestmp2))
#
# user columns
user_columns = [c for c in samplesWithUserFeatures.columns if c.startswith("user")]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment