add info inference

a4f0be81 · crazyer · bcc60d17 · a4f0be81 · a4f0be81 · a4f0be81
Commit a4f0be81 authored Jan 30, 2021 by crazyer
Showing with 36 additions and 4 deletions

base.py algorithm/text_classifical/base.py +34 -4

config.py config/config.py +2 -0

beauty_mddicine.dic dicts/beauty_mddicine.dic +0 -0

class_suport dicts/class_suport +0 -0

No files found.
--- a/algorithm/text_classifical/base.py
+++ b/algorithm/text_classifical/base.py
@@ -9,6 +9,7 @@ from collections import Counter
 from config import config
 import os
 import codecs
+import json


 class SELECTED_CONTENT_TYPE():
@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE():


 class TextClassifical(object):
-    def __init__(self, network_influencer_path, project_path, star_path, synonym_path, encoding="utf-8"):
+    def __init__(self, network_influencer_path, project_path, star_path, synonym_path, tag_info_path,
+                 support_words_path, encoding="utf-8"):
        self.encoding = encoding
        self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
        self.project_words = self.build_project_words(project_path)
@@ -48,6 +50,18 @@ class TextClassifical(object):
            u"耳软骨 鼻子 耳软骨隆鼻"
        ]
        self.template_logic = self.build_template()
+        self.tag_info = self.build_tag_info_pro(tag_info_path)
+        self.support_words = self.build_support_words(support_words_path)
+
+    def build_support_words(self, support_words_path):
+        """Load the JSON file at ``support_words_path`` and return its items as a set."""
+        with codecs.open(support_words_path, "r", encoding=self.encoding) as fin:  # closes the handle
+            return set(json.loads(fin.read()))
+
+    def build_tag_info_pro(self, tag_info_path):
+        """Load and return the JSON document stored at ``tag_info_path``."""
+        with codecs.open(tag_info_path, "r", encoding=self.encoding) as fin:  # closes the handle
+            return json.loads(fin.read())

    def build_template(self):
        ret = []
@@ -102,19 +116,34 @@ class TextClassifical(object):
                ret.append(tag)
        return [{item: 1.0} for item in ret]

-    def run(self, content):
+    def get_info_inference_tags(self, words, proba_threshold=0.3, topk=10):
+        """Sum per-tag weights over the support words present in ``words``.
+
+        Returns up to ``topk`` (tag, score) tuples with score > ``proba_threshold``, highest first.
+        """
+        tag_proba = {}
+        for word in self.support_words & set(words):
+            # .get(): a support word with no tag_info entry is skipped instead of raising KeyError.
+            for tag, weight in self.tag_info.get(word, {}).items():
+                tag_proba[tag] = tag_proba.get(tag, 0) + weight
+        kept = [(tag, score) for tag, score in tag_proba.items() if score > proba_threshold]
+        return sorted(kept, key=lambda x: x[1], reverse=True)[:topk]
+
+    def run(self, content, proba_threshold=0.3, topk=10):
        ret = {
            "content_type": -1,
            "star": [],
            "celebrity": [],
            "projects": [],
-            "inference_tags": []
+            "inference_tags": [],
+            "info_inference_tags": []
        }
        words = self.tokenprocessor.lcut(content, cut_all=True)
        words = stopwords_filter.filter(words)
        netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
        project_word_concurrence = set(words) & set(self.project_words)
        star_words_concurrence = set(words) & set(self.star_words)
+        info_inference_tags = self.get_info_inference_tags(words, proba_threshold=0.3, topk=10)
        counter = Counter(words)
        content_type, words_proba = self.predict(counter, netword_influencer_concurrence, project_word_concurrence,
                                                 star_words_concurrence)
@@ -125,6 +154,7 @@ class TextClassifical(object):
        ret["projects"].extend(
            [{self.standard_project(word): words_proba[1].get(word, 0.0)} for word in list(project_word_concurrence)])
        ret["inference_tags"].extend(self.get_inference_tags(words))
+        ret["info_inference_tags"].extend(info_inference_tags)
        return ret

    def score(self, counter, concurrence_words):
@@ -162,4 +192,4 @@ class TextClassifical(object):
 root_path = "/".join(str(__file__).split("/")[:-3])
 model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
                        os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic),
-                        os.path.join(root_path, config.synonym_path))
+                        os.path.join(root_path, config.synonym_path), os.path.join(root_path, config.tag_info_path),os.path.join(root_path,config.support_words_path))
--- a/config/config.py
+++ b/config/config.py
@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic"
 projects_dic = "dicts/project.dic"
 star_dic = "dicts/star.dic"
 synonym_path = "dicts/synonym.dic"
+tag_info_path = "dicts/class_suport"
+support_words_path = "dicts/beauty_mddicine.dic"
--- a/dicts/beauty_mddicine.dic
+++ b/dicts/beauty_mddicine.dic
--- a/dicts/class_suport
+++ b/dicts/class_suport