Commit e4e1f131 authored by 宋柯

模型调试

parent 8a479509
......@@ -162,6 +162,14 @@ def addUserStaticsFeatures(samples,dataVocab):
samples.show(20, truncate=False)
return samples
# The ABC aliases in ``collections`` were removed in Python 3.10; the
# abstract base classes live in ``collections.abc``.
from collections.abc import Iterable


def flatten(items):
    """Lazily yield the leaf values of an arbitrarily nested iterable.

    Nested iterables are walked depth-first, left to right. ``str`` and
    ``bytes`` values are treated as atoms and yielded whole rather than
    being split into characters. Any other iterable element is recursed
    into (note that a ``dict`` element therefore contributes its keys).

    Args:
        items: An iterable whose elements may themselves be iterables.

    Yields:
        Each non-iterable element, and each ``str``/``bytes`` element,
        in encounter order.
    """
    for x in items:
        # Strings are iterable too; excluding them keeps whole tokens
        # intact and avoids recursing forever on one-character strings.
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x
def addItemFeatures(itemDF,dataVocab,multi_col_vocab):
# multi_col = ['sku_tags', 'sku_show_tags','second_demands', 'second_solutions', 'second_positions']
multi_col = ['tags_v3','second_demands', 'second_solutions', 'second_positions']
......@@ -175,7 +183,7 @@ def addItemFeatures(itemDF,dataVocab,multi_col_vocab):
for c in multi_col:
#TODO 这里多标签的应该拆开
multi_col_vocab[c] = list(set(itemDF[c].tolist()))
multi_col_vocab[c] = list(set(flatten(map(lambda x: x.split(','), itemDF[c].tolist()))))
for i in range(1, 6):
new_c = ITEM_PREFIX + c + "__" + str(i)
......@@ -509,7 +517,7 @@ def getExposureSql(start, end):
--and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品'))
and card_type in ('card','video')
and card_content_type in ('service')
--and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
) t1
......@@ -877,16 +885,16 @@ if __name__ == '__main__':
print(ratingDF.columns)
print(ratingDF.show(100, truncate=False))
sys.exit(1)
#TODO 负样本为排除点击的数据
# ratingSamplesWithLabel = addSampleLabel(ratingDF)
df = ratingDF.toPandas()
df = pd.DataFrame(df)
posCount = df.loc[df["label"]==1]["label"].count()
negCount = df.loc[df["label"]==0]["label"].count()
print("pos size:"+str(posCount),"neg size:"+str(negCount))
# posCount = df.loc[df["label"]==1]["label"].count()
# negCount = df.loc[df["label"]==0]["label"].count()
# print("pos size:"+str(posCount),"neg size:"+str(negCount))
itemDF = get_service_feature_df()
print(itemDF.columns)
......@@ -910,7 +918,7 @@ if __name__ == '__main__':
print("dataVocab:")
for k, v in dataVocab.items():
print(k, len(v), v)
sys.exit(1)
itemDF_spark = spark.createDataFrame(itemDF)
itemDF_spark.printSchema()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment