import redis
import json
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from tool import get_all_tag_parent_tag, get_all_tags_name, get_all_word_synonym_words, get_data_by_mysql


# Recommender Engine
def get_similary_tags(tag_id, tags, ratings_matrix):
    """Return the other tag ids ranked by similarity to ``tag_id``.

    Args:
        tag_id: Id of the tag to find neighbours for; must occur in
            ``tags['tag_id']``.
        tags: DataFrame with a ``tag_id`` column. It is not modified.
        ratings_matrix: Square DataFrame of tag-to-tag similarities whose
            row/column positions correspond to the rows of ``tags``.

    Returns:
        List of tag ids sorted by descending similarity, with ``tag_id``
        itself excluded.

    Raises:
        IndexError: If ``tag_id`` does not occur in ``tags``.
    """
    matching_labels = tags.index[tags['tag_id'] == tag_id].tolist()
    if not matching_labels:
        raise IndexError("tag_id {!r} not found in tags".format(tag_id))
    # The index label is used as a row *position* below, so this assumes
    # ``tags`` carries the default 0..n-1 index.
    row_position = matching_labels[0]

    # ``assign`` works on a copy, so the caller's frame does not grow a
    # ``similarity`` column as a side effect of every lookup.
    ranked = tags.assign(similarity=ratings_matrix.iloc[row_position])
    ranked = ranked.sort_values(["similarity"], ascending=False)

    # Exclude the tag itself by id. Dropping the last sorted row is not
    # reliable: the tag's own score can tie with, or rank above, other tags.
    return ranked.loc[ranked['tag_id'] != tag_id, 'tag_id'].tolist()


def get_user_log():
    """Load the item-CF rating log from MySQL as a DataFrame.

    Runs a fixed query selecting ``userId``, ``tagId`` and ``rating`` from
    the ``item_cf_log`` table via ``get_data_by_mysql`` and wraps whatever
    that helper returns in a ``pandas.DataFrame``.
    """
    # NOTE(review): the connection settings (presumably host, port, user,
    # password, database) are hard-coded, including the password; consider
    # moving them to configuration.
    query = "select userId, tagId, rating from item_cf_log"
    rows = get_data_by_mysql(
        '172.16.40.158',
        4000,
        'st_user',
        'aqpuBLYzEV7tML5RPsN1pntUzFy',
        'jerry_test',
        query,
    )
    return pd.DataFrame(rows)

if __name__ == '__main__':
    # User log: one row per sample. The query in get_user_log selects the
    # columns userId, tagId and rating.
    ratings = get_user_log()

    # Pivot table: one row per tagId, one column per userId, cells hold the
    # rating.
    tag_user_ratings = ratings.pivot_table(index=['tagId'], columns=['userId'], values='rating')

    # Take the tag ids from the pivot index itself, so that row position i of
    # the matrices below always describes cf_tag_ids[i], rather than assuming
    # the pivot keeps exactly one row per unique tagId in the log.
    cf_tag_ids = tag_user_ratings.index.tolist()
    known_tag_ids = set(cf_tag_ids)  # set membership instead of a list scan per tag
    cf_tags = pd.DataFrame({'tag_id': cf_tag_ids})

    # From here on rows are addressed by position; missing ratings count as 0.
    tag_user_ratings = tag_user_ratings.reset_index(drop=True).fillna(0)

    # Cosine similarity: rows and columns are both tags (by position).
    # ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas
    # releases no longer provide.
    tag_similarity = 1 - pairwise_distances(tag_user_ratings.values, metric="cosine")
    np.fill_diagonal(tag_similarity, 0)  # zero self-similarity so a tag never ranks itself first
    similarity_matrix = pd.DataFrame(tag_similarity)

    # Parent-tag list for each level 1/2/3 tag.
    all_tag_parent_tags = get_all_tag_parent_tag()

    # Synonym list for each word.
    all_word_synonym_words = get_all_word_synonym_words()

    # Id and name of every level 1/2/3 tag.
    all_tags_name = get_all_tags_name()

    # Collaborative-filtering neighbours for every tag: tag name -> JSON list of names.
    all_tags_cf_tags = dict()
    for tag in all_tags_name:
        if tag not in known_tag_ids:
            continue
        tag_cf_tags = get_similary_tags(tag, cf_tags, similarity_matrix)

        # Ids that must not be recommended: the parent tags (level-1 tags have
        # none) and, defensively, the tag itself.
        excluded_ids = set(all_tag_parent_tags.get(tag, []))
        excluded_ids.add(tag)

        # Names that must not be recommended: synonyms of this tag's name
        # (some tag names have no synonyms).
        tag_name = all_tags_name[tag]
        excluded_names = set(all_word_synonym_words.get(tag_name, []))

        tag_cf_tags_names = []
        for cf_tag in tag_cf_tags:
            # Skip excluded ids, and ids that appear in the log but have no
            # entry in all_tags_name (indexing them would raise KeyError).
            if cf_tag in excluded_ids or cf_tag not in all_tags_name:
                continue
            cf_tag_name = all_tags_name[cf_tag]
            if cf_tag_name not in excluded_names:
                tag_cf_tags_names.append(cf_tag_name)

        # Keep only the ten most similar tags.
        all_tags_cf_tags[tag_name] = json.dumps(tag_cf_tags_names[:10])

    redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')
    gm_tag_cf_tags_key = "gm:tag:cf:tags"
    # hmset rejects an empty mapping, so only write when there is something to store.
    if all_tags_cf_tags:
        redis_client.hmset(gm_tag_cf_tags_key, all_tags_cf_tags)
    print("update cf tags count {}".format(len(all_tags_cf_tags)))

    # To read the data back (run on doris):
    # data = redis_client.hgetall(gm_tag_cf_tags_key)
    # data_dict = {str(i, 'utf-8'): json.loads(data[i]) for i in data}
