Commit fa8c704c authored by 宋柯's avatar 宋柯

模型bug修复

parent eae1757a
...@@ -554,7 +554,10 @@ def getExposureSql(start, end): ...@@ -554,7 +554,10 @@ def getExposureSql(start, end):
def getItemStatisticSql(start, end): def getItemStatisticSql(start, end):
sql = """ sql = """
SELECT TT.card_id, TT.partition_date, TT.label, count(1) as label_count SELECT TTT.card_id, TTT.label, COLLECT_LIST(CONCAT(TTT.partition_date, '_', TTT.label_count)) partition_date_label_count_list
FROM
(
SELECT TT.card_id, TT.partition_date, TT.label, count(1) as label_count
FROM FROM
( (
SELECT T.partition_date, T.card_id, T.label SELECT T.partition_date, T.card_id, T.label
...@@ -651,6 +654,8 @@ def getItemStatisticSql(start, end): ...@@ -651,6 +654,8 @@ def getItemStatisticSql(start, end):
) T ) T
) TT ) TT
GROUP BY TT.card_id, TT.partition_date, TT.label GROUP BY TT.card_id, TT.partition_date, TT.label
) TTT
GROUP BY TTT.card_id, TTT.label
""".format(startDay=start,endDay=end) """.format(startDay=start,endDay=end)
print(sql) print(sql)
return sql return sql
...@@ -922,6 +927,9 @@ def get_service_feature_df(): ...@@ -922,6 +927,9 @@ def get_service_feature_df():
def addDays(n, format="%Y%m%d"): def addDays(n, format="%Y%m%d"):
return (date.today() + timedelta(days=n)).strftime(format) return (date.today() + timedelta(days=n)).strftime(format)
def generatePartitionDates(trainDays):
return [(date.today() + timedelta(days = -trainDay)).strftime(format) for trainDay in range(trainDays)]
#显示所有列 #显示所有列
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
#显示所有行 #显示所有行
...@@ -938,6 +946,7 @@ if __name__ == '__main__': ...@@ -938,6 +946,7 @@ if __name__ == '__main__':
endDay = addDays(0) endDay = addDays(0)
startDay = addDays(-int(trainDays)) startDay = addDays(-int(trainDays))
itemStatisticStartDay = addDays(-int(trainDays + 31))
print("train_data start:{} end:{}".format(startDay,endDay)) print("train_data start:{} end:{}".format(startDay,endDay))
...@@ -954,9 +963,15 @@ if __name__ == '__main__': ...@@ -954,9 +963,15 @@ if __name__ == '__main__':
# 行为数据 # 行为数据
clickSql = getClickSql(startDay,endDay) clickSql = getClickSql(startDay,endDay)
expSql = getExposureSql(startDay,endDay) expSql = getExposureSql(startDay,endDay)
itemStatisticSql = getItemStatisticSql(startDay, endDay) itemStatisticSql = getItemStatisticSql(itemStatisticStartDay, endDay)
itemStatisticDF = spark.sql(itemStatisticSql)
itemStatisticDF.show(100, False)
partitionDatas = generatePartitionDates(trainDays)
partitionDatasBC = spark.sparkContext.broadcast(partitionDatas)
itemStatisticDF.rdd.flatMap()
spark.sql(itemStatisticSql).show(100, False)
sys.exit(1) sys.exit(1)
clickDF = spark.sql(clickSql) clickDF = spark.sql(clickSql)
clickDF.createOrReplaceTempView("clickDF") clickDF.createOrReplaceTempView("clickDF")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment