Commit a4f0be81 authored by crazyer

add info inference

parent bcc60d17
......@@ -9,6 +9,7 @@ from collections import Counter
from config import config
import os
import codecs
import json
class SELECTED_CONTENT_TYPE():
......@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path, synonym_path, encoding="utf-8"):
def __init__(self, network_influencer_path, project_path, star_path, synonym_path, tag_info_path,
support_words_path, encoding="utf-8"):
self.encoding = encoding
self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
self.project_words = self.build_project_words(project_path)
......@@ -48,6 +50,18 @@ class TextClassifical(object):
u"耳软骨 鼻子 耳软骨隆鼻"
]
self.template_logic = self.build_template()
self.tag_info = self.build_tag_info_pro(tag_info_path)
self.support_words = self.build_support_words(support_words_path)
def build_support_words(self, support_words_path):
    """Load the support-word vocabulary from a JSON file.

    The file is decoded with ``self.encoding`` and parsed as JSON. The parsed
    value is passed to ``set()``, so it must be an iterable of hashable items
    (presumably a JSON array of words -- confirm against the dictionary file).

    Args:
        support_words_path: path of the JSON file to read.

    Returns:
        set: the distinct entries of the parsed JSON value.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with codecs.open(support_words_path, "r", encoding=self.encoding) as handle:
        return set(json.load(handle))
def build_tag_info_pro(self, tag_info_path):
    """Load the word -> tag-score table from a JSON file.

    The file is decoded with ``self.encoding`` and parsed as JSON; the parsed
    value is returned unchanged. ``get_info_inference_tags`` indexes it as
    ``tag_info[word][tag]``, so it is expected to be a two-level mapping.

    Args:
        tag_info_path: path of the JSON file to read.

    Returns:
        The parsed JSON value.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with codecs.open(tag_info_path, "r", encoding=self.encoding) as handle:
        return json.load(handle)
def build_template(self):
ret = []
......@@ -102,19 +116,34 @@ class TextClassifical(object):
ret.append(tag)
return [{item: 1.0} for item in ret]
def run(self, content):
def get_info_inference_tags(self, words, proba_threshold=0.3, topk=10):
    """Infer tags from the support words that occur in ``words``.

    For every distinct token of ``words`` that is also in
    ``self.support_words``, the per-tag scores stored in
    ``self.tag_info[token]`` are summed per tag. Tags whose accumulated score
    is strictly greater than ``proba_threshold`` are kept.

    Args:
        words: iterable of hashable tokens.
        proba_threshold: exclusive lower bound on a tag's accumulated score.
        topk: maximum number of (tag, score) pairs to return.

    Returns:
        list: ``(tag, score)`` tuples sorted by score, highest first,
        truncated to ``topk`` entries.
    """
    tag_proba = {}
    for word in self.support_words & set(words):
        # support_words and tag_info are separate attributes, so a support
        # word may have no tag_info entry; skip it rather than raise KeyError.
        for tag, score in self.tag_info.get(word, {}).items():
            tag_proba[tag] = tag_proba.get(tag, 0) + score

    candidates = [(tag, score) for tag, score in tag_proba.items()
                  if score > proba_threshold]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]
def run(self, content, proba_threshold=0.3, topk=10):
ret = {
"content_type": -1,
"star": [],
"celebrity": [],
"projects": [],
"inference_tags": []
"inference_tags": [],
"info_inference_tags": []
}
words = self.tokenprocessor.lcut(content, cut_all=True)
words = stopwords_filter.filter(words)
netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
project_word_concurrence = set(words) & set(self.project_words)
star_words_concurrence = set(words) & set(self.star_words)
info_inference_tags = self.get_info_inference_tags(words, proba_threshold=0.3, topk=10)
counter = Counter(words)
content_type, words_proba = self.predict(counter, netword_influencer_concurrence, project_word_concurrence,
star_words_concurrence)
......@@ -125,6 +154,7 @@ class TextClassifical(object):
ret["projects"].extend(
[{self.standard_project(word): words_proba[1].get(word, 0.0)} for word in list(project_word_concurrence)])
ret["inference_tags"].extend(self.get_inference_tags(words))
ret["info_inference_tags"].extend(info_inference_tags)
return ret
def score(self, counter, concurrence_words):
......@@ -162,4 +192,4 @@ class TextClassifical(object):
root_path = "/".join(str(__file__).split("/")[:-3])
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic),
os.path.join(root_path, config.synonym_path))
os.path.join(root_path, config.synonym_path), os.path.join(root_path, config.tag_info_path),os.path.join(root_path,config.support_words_path))
......@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic"
# Dictionary file locations. The classifier module joins each of these with
# its computed root_path before passing them to TextClassifical.
projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic"
synonym_path = "dicts/synonym.dic"
# JSON file parsed into TextClassifical.tag_info (indexed as tag_info[word][tag]).
# NOTE(review): "class_suport" looks like a misspelling of "class_support";
# the value must match the file name on disk, so rename both together.
tag_info_path = "dicts/class_suport"
# JSON file parsed into the set TextClassifical.support_words.
# NOTE(review): "beauty_mddicine" looks like a misspelling of "beauty_medicine";
# the value must match the file name on disk, so rename both together.
support_words_path = "dicts/beauty_mddicine.dic"
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment