"""Pairwise text-similarity measures built on jieba word segmentation.

Each public function segments both input strings with ``jieba.cut``, turns
the two token sequences into a 2-row term matrix with scikit-learn, and
reduces the two rows to a single similarity score.
"""

import jieba
import numpy as np
from scipy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def _vectorize(s1, s2, vectorizer_cls):
    """Segment both strings and return their 2 x vocabulary term matrix.

    Args:
        s1: First text.
        s2: Second text.
        vectorizer_cls: ``CountVectorizer`` or ``TfidfVectorizer``.

    Returns:
        A dense array with one row per input text.  When neither text
        produces a token the array has shape ``(2, 0)``.
    """
    # jieba yields tokens; joining with spaces lets the vectorizer split on
    # whitespace instead of applying its own word regex.
    corpus = [" ".join(jieba.cut(s1)), " ".join(jieba.cut(s2))]

    # The vectorizer raises ValueError ("empty vocabulary") when no document
    # contains a token, so short-circuit with an empty matrix instead.
    if not any(doc.split() for doc in corpus):
        return np.zeros((2, 0))

    # token_pattern=None: the pattern is unused once a tokenizer is supplied,
    # and leaving the default in place makes scikit-learn emit a warning.
    vectorizer = vectorizer_cls(tokenizer=str.split, token_pattern=None)
    return vectorizer.fit_transform(corpus).toarray()


def _cosine(u, v):
    """Return the cosine of the angle between *u* and *v* (0.0 if either is zero)."""
    denominator = norm(u) * norm(v)
    if denominator == 0:
        # A zero vector has no direction; avoid 0/0 -> nan.
        return 0.0
    return np.dot(u, v) / denominator


def jaccard_similarity(s1, s2):
    """Return the Jaccard coefficient of the two texts' term counts.

    The score is ``sum(min(count1, count2)) / sum(max(count1, count2))``
    taken over the joint vocabulary.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The coefficient, or 0.0 when neither text contains any token.
    """
    vectors = _vectorize(s1, s2, CountVectorizer)  # term-frequency matrix
    numerator = np.sum(np.min(vectors, axis=0))  # intersection
    denominator = np.sum(np.max(vectors, axis=0))  # union
    if denominator == 0:
        return 0.0
    return 1.0 * numerator / denominator


def tf_similarity(s1, s2):
    """Return the cosine similarity of the two texts' term-frequency vectors.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The cosine similarity, or 0.0 when either text contains no token.
    """
    vectors = _vectorize(s1, s2, CountVectorizer)  # term-frequency matrix
    return _cosine(vectors[0], vectors[1])


def tfidf_similarity(s1, s2):
    """Return the cosine similarity of the two texts' TF-IDF vectors.

    Args:
        s1: First text.
        s2: Second text.

    Returns:
        The cosine similarity, or 0.0 when either text contains no token.
    """
    vectors = _vectorize(s1, s2, TfidfVectorizer)  # TF-IDF matrix
    return _cosine(vectors[0], vectors[1])