Commit 84e6abb5 authored by 高雅喆

add gm_tag_cf

parent ea7bcd8a
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from tool import get_all_tag_parent_tag, get_all_tags_name, get_all_word_synonym_words
# Recommender Engine
def get_similary_tags(tag_id, tags, ratings_matrix):
    """Return the other tags ranked by their similarity to ``tag_id``.

    :param tag_id: id of the tag whose neighbours are wanted.
    :param tags: DataFrame with a ``tag_id`` column. The index label of the
        matching row is used as the positional row number into
        ``ratings_matrix`` (i.e. a default 0..n-1 index is assumed).
    :param ratings_matrix: DataFrame whose row ``i`` holds the scores of
        ``tags`` row ``i`` against every tag; the scores are aligned to
        ``tags`` by index label.
    :return: list of tag ids sorted by descending score, never containing
        ``tag_id`` itself; an empty list when ``tag_id`` is not in ``tags``.
    """
    row_labels = tags.index[tags['tag_id'] == tag_id]
    if len(row_labels) == 0:
        return []

    scores = ratings_matrix.iloc[row_labels[0]]
    # Work on a copy so the caller's DataFrame does not grow a 'similarity'
    # column as a side effect of a lookup.
    ranked = tags.assign(similarity=scores)
    # Drop the queried tag explicitly: its score ties with every other
    # zero-score tag, so "the last row after sorting" is not reliably itself.
    ranked = ranked[ranked['tag_id'] != tag_id]
    # A stable sort keeps tied tags in their original order (deterministic).
    ranked = ranked.sort_values(["similarity"], ascending=False, kind="mergesort")
    return ranked.tag_id.tolist()
if __name__ == '__main__':
    # User log, one row per sample.
    # columns: userId, tagId, rating (timestamp is dropped right after loading)
    ratings = pd.read_csv("/home/gmuser/gyz/log/cf/user_item_log.csv", sep="\t")
    ratings.drop("timestamp", inplace=True, axis=1)

    # Pivot table
    # index: tagId
    # columns: userId
    tag_user_ratings = ratings.pivot_table(index=['tagId'], columns=['userId'], values='rating')
    tag_user_ratings.fillna(0, inplace=True)

    # Take the tag ids from the pivot's own index so that row i of the
    # similarity matrix and row i of cf_tags describe the same tag.
    cf_tag_ids = tag_user_ratings.index.tolist()
    cf_tags = pd.DataFrame({'tag_id': cf_tag_ids})
    # `x in DataFrame` tests column labels, not values, so membership of a
    # tag id has to be checked against an explicit set of ids.
    known_cf_tag_ids = set(cf_tag_ids)

    # Cosine similarity
    # index: row position of the tag in cf_tags
    # columns: row position of the tag in cf_tags
    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer has.
    tag_similarity = 1 - pairwise_distances(tag_user_ratings.values, metric="cosine")
    np.fill_diagonal(tag_similarity, 0)  # Filling diagonals with 0 for future use when sorting is done
    ratings_matrix = pd.DataFrame(tag_similarity)

    # Parent-tag list of every level 1/2/3 tag.
    # `or {}`: the sibling loader shown in this file returns None after
    # printing an error; the other loaders are treated the same way.
    all_tag_parent_tags = get_all_tag_parent_tag() or {}
    # Synonym list of every word.
    all_word_synonym_words = get_all_word_synonym_words() or {}
    # Id and name of every level 1/2/3 tag.
    all_tags_name = get_all_tags_name() or {}

    # Collaborative (co-occurring) tags of every tag, keyed by tag name.
    all_tags_cf_tags = dict()
    for tag, tag_name in all_tags_name.items():
        if tag in known_cf_tag_ids:
            tag_cf_tags = get_similary_tags(tag, cf_tags, ratings_matrix)
        else:
            tag_cf_tags = []

        # Filter out the parent tags (and the tag itself); a tag without
        # any recorded parent simply has nothing extra to filter.
        excluded_ids = set(all_tag_parent_tags.get(tag, []))
        excluded_ids.add(tag)
        tag_cf_tags = [cf_tag for cf_tag in tag_cf_tags if cf_tag not in excluded_ids]

        # Filter out the synonyms of the tag. Tags that appear in the log but
        # have no entry in all_tags_name cannot be named and are skipped.
        synonym_names = set(all_word_synonym_words.get(tag_name, []))
        all_tags_cf_tags[tag_name] = [
            all_tags_name[cf_tag]
            for cf_tag in tag_cf_tags
            if cf_tag in all_tags_name and all_tags_name[cf_tag] not in synonym_names
        ]
    # NOTE(review): all_tags_cf_tags is built but not written anywhere in the
    # visible part of this script - confirm where the result is consumed.
......@@ -83,7 +83,7 @@ def get_all_search_word_synonym_tags():
sql = "select a.keyword , c.id from api_wordrel a " \
"left join api_wordrelsynonym b on a.id = b.wordrel_id " \
"left join api_tag c on b.word=c.name " \
"where a.category in (1,13,10,11,12) and c.tag_type+0<'4'+0 and c.is_online=1"
"where c.tag_type+0<'4'+0 and c.is_online=1"
mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
result_dict = dict()
for data in mysql_results:
......@@ -96,6 +96,22 @@ def get_all_search_word_synonym_tags():
print(e)
def get_all_word_synonym_words():
    """Load every keyword of ``api_wordrel`` together with its synonym words.

    :return: dict {"keyword1": [synonym_word, ...], "keyword2": [...], ...}.
        A keyword without any synonym maps to an empty list. Returns None
        (after printing the error) when the query or row handling fails.
    """
    # NOTE(review): connection settings (including the password) are
    # hard-coded here like in the sibling loaders - consider moving them to
    # configuration.
    try:
        sql = "select a.keyword, b.word from api_wordrel a " \
              "left join api_wordrelsynonym b on a.id = b.wordrel_id "
        mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
        result_dict = dict()
        for data in mysql_results:
            synonyms = result_dict.setdefault(data['keyword'], [])
            # The LEFT JOIN yields a NULL word for keywords that have no
            # synonym row; keep the keyword but do not store None.
            if data['word'] is not None:
                synonyms.append(data['word'])
        return result_dict
    except Exception as e:
        # Best-effort like the other loaders in this module: report and
        # signal the failure with None instead of raising.
        print(e)
        return None
def get_all_synonym_tags():
"""
:return:dict {"search_word1":[tag_list1],"search_word2":[tag_list2]...}
......@@ -163,7 +179,25 @@ def get_all_3tag_2tag():
sql = "select a.child_id,a.parent_id from api_tagrelation a" \
" left join api_tag b on a.parent_id=b.id " \
"where a.child_id in (select id from api_tag where tag_type='3' and is_online=1) " \
"and b.tag_type='2'"
"and b.tag_type='2' and b.is_online=1"
mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
result_dict = dict()
for data in mysql_results:
if data['child_id'] not in result_dict:
result_dict[data['child_id']] = [data['parent_id']]
else:
result_dict[data['child_id']].append(data['parent_id'])
return result_dict
except Exception as e:
print(e)
def get_all_tag_parent_tag():
try:
sql = "select a.child_id,a.parent_id from api_tagrelation a" \
" left join api_tag b on a.parent_id=b.id " \
"where a.child_id in (select id from api_tag where tag_type+0<'4'+0 and is_online=1) " \
"and b.tag_type+0<'4'+0 and b.is_online=1"
mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
result_dict = dict()
for data in mysql_results:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment