Commit 0b4b9c6c authored by 赵威's avatar 赵威

get online ids for answer item2vec

parent 68af57e1
...@@ -130,7 +130,7 @@ def get_online_answer_ids(): ...@@ -130,7 +130,7 @@ def get_online_answer_ids():
for item in get_online_ids("answer"): for item in get_online_ids("answer"):
count += 1 count += 1
try: try:
print(count) # print(count)
id = int(item["_id"]) id = int(item["_id"])
res_set.add(id) res_set.add(id)
except Exception as e: except Exception as e:
...@@ -140,12 +140,13 @@ def get_online_answer_ids(): ...@@ -140,12 +140,13 @@ def get_online_answer_ids():
def get_device_click_answer_ids_dict(click_df): def get_device_click_answer_ids_dict(click_df):
online_ids = get_online_answer_ids()
res = defaultdict(list) res = defaultdict(list)
cols = click_df.orderBy("partition_date", ascending=False).collect() cols = click_df.orderBy("partition_date", ascending=False).collect()
for i in cols: for i in cols:
card_id = i["card_id"] card_id = i["card_id"]
session_id = i["app_session_id"] session_id = i["app_session_id"]
if card_id not in res[session_id]: if (card_id not in res[session_id]) and int(card_id) in online_ids:
res[session_id].append(card_id) res[session_id].append(card_id)
return res return res
...@@ -170,27 +171,26 @@ def save_clicked_answer_ids_item2vec(): ...@@ -170,27 +171,26 @@ def save_clicked_answer_ids_item2vec():
if __name__ == "__main__": if __name__ == "__main__":
begin_time = time.time() begin_time = time.time()
res_set = get_online_answer_ids() # res_set = get_online_answer_ids()
print(len(res_set))
# spark = get_spark("answer_click_ids") spark = get_spark("answer_click_ids")
# click_df = get_answer_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1)) click_df = get_answer_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1))
# click_df.show(5, False) click_df.show(5, False)
# print(click_df.count()) print(click_df.count())
# res_dict = get_device_click_answer_ids_dict(click_df) res_dict = get_device_click_answer_ids_dict(click_df)
# with open(os.path.join(DATA_PATH, "click_answer_ids.csv"), "w") as f: with open(os.path.join(DATA_PATH, "click_answer_ids.csv"), "w") as f:
# for (k, v) in res_dict.items(): for (k, v) in res_dict.items():
# if v: if v:
# f.write("{}|{}\n".format(k, ",".join([str(x) for x in v]))) f.write("{}|{}\n".format(k, ",".join([str(x) for x in v])))
# print("write data done.") print("write data done.")
# save_clicked_answer_ids_item2vec() save_clicked_answer_ids_item2vec()
# for id in ["986424", "744910", "703622"]: for id in ["986424", "744910", "703622"]:
# print(ANSWER_CLICK_IDS_MODEL.wv.most_similar(id, topn=5)) print(ANSWER_CLICK_IDS_MODEL.wv.most_similar(id, topn=5))
# print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60)) print("total cost: {:.2f}mins".format((time.time() - begin_time) / 60))
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/answer.py # spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/answer.py
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment