Commit a4f0be81 authored by crazyer

add info inference

parent bcc60d17
...@@ -9,6 +9,7 @@ from collections import Counter ...@@ -9,6 +9,7 @@ from collections import Counter
from config import config from config import config
import os import os
import codecs import codecs
import json
class SELECTED_CONTENT_TYPE(): class SELECTED_CONTENT_TYPE():
...@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE(): ...@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object): class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path, synonym_path, encoding="utf-8"): def __init__(self, network_influencer_path, project_path, star_path, synonym_path, tag_info_path,
support_words_path, encoding="utf-8"):
self.encoding = encoding self.encoding = encoding
self.network_influencer_words = self.build_network_influencer_words(network_influencer_path) self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
self.project_words = self.build_project_words(project_path) self.project_words = self.build_project_words(project_path)
...@@ -48,6 +50,18 @@ class TextClassifical(object): ...@@ -48,6 +50,18 @@ class TextClassifical(object):
u"耳软骨 鼻子 耳软骨隆鼻" u"耳软骨 鼻子 耳软骨隆鼻"
] ]
self.template_logic = self.build_template() self.template_logic = self.build_template()
self.tag_info = self.build_tag_info_pro(tag_info_path)
self.support_words = self.build_support_words(support_words_path)
def build_support_words(self, support_words_path):
    """Load the support-word vocabulary used by info inference.

    The file at ``support_words_path`` is decoded with ``self.encoding`` and
    parsed as JSON. The parsed value is converted to a ``set`` so it can be
    intersected with the tokens of a document.

    :param support_words_path: path to the JSON file (presumably a JSON
        array of words -- confirm against the dictionary file).
    :return: ``set`` of the entries found in the file.
    """
    # Use a context manager so the file handle is closed even if the JSON
    # is malformed; the previous code left the handle open.
    with codecs.open(support_words_path, "r", encoding=self.encoding) as handle:
        words = json.load(handle)
    return set(words)
def build_tag_info_pro(self, tag_info_path):
    """Load the word-to-tag weight table used by info inference.

    The file at ``tag_info_path`` is decoded with ``self.encoding`` and parsed
    as JSON. The parsed object is returned unchanged.

    :param tag_info_path: path to the JSON file. ``get_info_inference_tags``
        indexes the result as ``tag_info[word][tag]`` and sums the values, so
        the file is expected to hold ``{word: {tag: weight}}``.
    :return: the parsed JSON object.
    """
    # Use a context manager so the file handle is closed even if the JSON
    # is malformed; the previous code left the handle open.
    with codecs.open(tag_info_path, "r", encoding=self.encoding) as handle:
        return json.load(handle)
def build_template(self): def build_template(self):
ret = [] ret = []
...@@ -102,19 +116,34 @@ class TextClassifical(object): ...@@ -102,19 +116,34 @@ class TextClassifical(object):
ret.append(tag) ret.append(tag)
return [{item: 1.0} for item in ret] return [{item: 1.0} for item in ret]
def get_info_inference_tags(self, words, proba_threshold=0.3, topk=10):
    """Score tags from the support words that occur in ``words``.

    Every word present in both ``words`` and ``self.support_words``
    contributes its per-tag weights from ``self.tag_info``; weights for the
    same tag are summed across words.

    :param words: iterable of tokens from the document.
    :param proba_threshold: only tags whose summed weight is strictly
        greater than this value are returned.
    :param topk: maximum number of ``(tag, weight)`` pairs to return.
    :return: list of ``(tag, summed_weight)`` tuples sorted by weight,
        highest first, truncated to ``topk`` entries.
    """
    tag_proba = {}
    for word in self.support_words & set(words):
        # The support-word list and the tag table are loaded from separate
        # files; a word missing from the table contributes nothing instead
        # of raising KeyError.
        for tag, weight in self.tag_info.get(word, {}).items():
            tag_proba[tag] = tag_proba.get(tag, 0) + weight
    candidates = [(tag, score) for tag, score in tag_proba.items()
                  if score > proba_threshold]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:topk]
def run(self, content, proba_threshold=0.3, topk=10):
ret = { ret = {
"content_type": -1, "content_type": -1,
"star": [], "star": [],
"celebrity": [], "celebrity": [],
"projects": [], "projects": [],
"inference_tags": [] "inference_tags": [],
"info_inference_tags": []
} }
words = self.tokenprocessor.lcut(content, cut_all=True) words = self.tokenprocessor.lcut(content, cut_all=True)
words = stopwords_filter.filter(words) words = stopwords_filter.filter(words)
netword_influencer_concurrence = set(words) & set(self.network_influencer_words) netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
project_word_concurrence = set(words) & set(self.project_words) project_word_concurrence = set(words) & set(self.project_words)
star_words_concurrence = set(words) & set(self.star_words) star_words_concurrence = set(words) & set(self.star_words)
info_inference_tags = self.get_info_inference_tags(words, proba_threshold=0.3, topk=10)
counter = Counter(words) counter = Counter(words)
content_type, words_proba = self.predict(counter, netword_influencer_concurrence, project_word_concurrence, content_type, words_proba = self.predict(counter, netword_influencer_concurrence, project_word_concurrence,
star_words_concurrence) star_words_concurrence)
...@@ -125,6 +154,7 @@ class TextClassifical(object): ...@@ -125,6 +154,7 @@ class TextClassifical(object):
ret["projects"].extend( ret["projects"].extend(
[{self.standard_project(word): words_proba[1].get(word, 0.0)} for word in list(project_word_concurrence)]) [{self.standard_project(word): words_proba[1].get(word, 0.0)} for word in list(project_word_concurrence)])
ret["inference_tags"].extend(self.get_inference_tags(words)) ret["inference_tags"].extend(self.get_inference_tags(words))
ret["info_inference_tags"].extend(info_inference_tags)
return ret return ret
def score(self, counter, concurrence_words): def score(self, counter, concurrence_words):
...@@ -162,4 +192,4 @@ class TextClassifical(object): ...@@ -162,4 +192,4 @@ class TextClassifical(object):
root_path = "/".join(str(__file__).split("/")[:-3]) root_path = "/".join(str(__file__).split("/")[:-3])
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic), model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic), os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic),
os.path.join(root_path, config.synonym_path)) os.path.join(root_path, config.synonym_path), os.path.join(root_path, config.tag_info_path),os.path.join(root_path,config.support_words_path))
...@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic" ...@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic"
projects_dic = "dicts/project.dic" projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic" star_dic = "dicts/star.dic"
synonym_path = "dicts/synonym.dic" synonym_path = "dicts/synonym.dic"
# JSON word -> tag-weight table; joined onto the project root and loaded by
# TextClassifical.build_tag_info_pro.
tag_info_path = "dicts/class_suport"
# JSON list of support words; joined onto the project root and loaded into a
# set by TextClassifical.build_support_words.
support_words_path = "dicts/beauty_mddicine.dic"
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment