Commit e4e1f131 authored by 宋柯

模型调试

parent 8a479509
...@@ -162,6 +162,14 @@ def addUserStaticsFeatures(samples,dataVocab): ...@@ -162,6 +162,14 @@ def addUserStaticsFeatures(samples,dataVocab):
samples.show(20, truncate=False) samples.show(20, truncate=False)
return samples return samples
# The ABC aliases were removed from ``collections`` in Python 3.10;
# ``Iterable`` must be imported from ``collections.abc``.
from collections.abc import Iterable


def flatten(items):
    """Lazily yield the leaf elements of an arbitrarily nested iterable.

    Any element that is itself iterable is descended into recursively.
    ``str`` and ``bytes`` are treated as leaves and yielded whole, so text
    values are never split into characters.

    Args:
        items: An iterable whose elements may themselves be iterables.

    Yields:
        Each non-iterable element (and each ``str``/``bytes`` value), in
        depth-first, left-to-right order.
    """
    for x in items:
        # Strings are iterable too, but recursing into them would split them
        # into characters (and never terminate on one-character strings).
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x
def addItemFeatures(itemDF,dataVocab,multi_col_vocab): def addItemFeatures(itemDF,dataVocab,multi_col_vocab):
# multi_col = ['sku_tags', 'sku_show_tags','second_demands', 'second_solutions', 'second_positions'] # multi_col = ['sku_tags', 'sku_show_tags','second_demands', 'second_solutions', 'second_positions']
multi_col = ['tags_v3','second_demands', 'second_solutions', 'second_positions'] multi_col = ['tags_v3','second_demands', 'second_solutions', 'second_positions']
...@@ -175,7 +183,7 @@ def addItemFeatures(itemDF,dataVocab,multi_col_vocab): ...@@ -175,7 +183,7 @@ def addItemFeatures(itemDF,dataVocab,multi_col_vocab):
for c in multi_col: for c in multi_col:
#TODO 这里多标签的应该拆开 #TODO 这里多标签的应该拆开
multi_col_vocab[c] = list(set(itemDF[c].tolist())) multi_col_vocab[c] = list(set(flatten(map(lambda x: x.split(','), itemDF[c].tolist()))))
for i in range(1, 6): for i in range(1, 6):
new_c = ITEM_PREFIX + c + "__" + str(i) new_c = ITEM_PREFIX + c + "__" + str(i)
...@@ -509,7 +517,7 @@ def getExposureSql(start, end): ...@@ -509,7 +517,7 @@ def getExposureSql(start, end):
--and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品')) --and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品'))
and card_type in ('card','video') and card_type in ('card','video')
and card_content_type in ('service') and card_content_type in ('service')
--and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill') and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
) t1 ) t1
...@@ -877,16 +885,16 @@ if __name__ == '__main__': ...@@ -877,16 +885,16 @@ if __name__ == '__main__':
print(ratingDF.columns) print(ratingDF.columns)
print(ratingDF.show(100, truncate=False)) print(ratingDF.show(100, truncate=False))
sys.exit(1)
#TODO 负样本为排除点击的数据 #TODO 负样本为排除点击的数据
# ratingSamplesWithLabel = addSampleLabel(ratingDF) # ratingSamplesWithLabel = addSampleLabel(ratingDF)
df = ratingDF.toPandas() df = ratingDF.toPandas()
df = pd.DataFrame(df) df = pd.DataFrame(df)
posCount = df.loc[df["label"]==1]["label"].count() # posCount = df.loc[df["label"]==1]["label"].count()
negCount = df.loc[df["label"]==0]["label"].count() # negCount = df.loc[df["label"]==0]["label"].count()
print("pos size:"+str(posCount),"neg size:"+str(negCount)) # print("pos size:"+str(posCount),"neg size:"+str(negCount))
itemDF = get_service_feature_df() itemDF = get_service_feature_df()
print(itemDF.columns) print(itemDF.columns)
...@@ -910,7 +918,7 @@ if __name__ == '__main__': ...@@ -910,7 +918,7 @@ if __name__ == '__main__':
print("dataVocab:") print("dataVocab:")
for k, v in dataVocab.items(): for k, v in dataVocab.items():
print(k, len(v), v) print(k, len(v), v)
sys.exit(1)
itemDF_spark = spark.createDataFrame(itemDF) itemDF_spark = spark.createDataFrame(itemDF)
itemDF_spark.printSchema() itemDF_spark.printSchema()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment