Commit 890ee5a4 authored by crazyer's avatar crazyer

text miner

parent eb61cf61
# Default ignored files
/workspace.xml
<component name="ProjectCodeStyleConfiguration">
<code_scheme name="Project" version="173">
<DBN-PSQL>
<case-options enabled="true">
<option name="KEYWORD_CASE" value="lower" />
<option name="FUNCTION_CASE" value="lower" />
<option name="PARAMETER_CASE" value="lower" />
<option name="DATATYPE_CASE" value="lower" />
<option name="OBJECT_CASE" value="preserve" />
</case-options>
<formatting-settings enabled="false" />
</DBN-PSQL>
<DBN-SQL>
<case-options enabled="true">
<option name="KEYWORD_CASE" value="lower" />
<option name="FUNCTION_CASE" value="lower" />
<option name="PARAMETER_CASE" value="lower" />
<option name="DATATYPE_CASE" value="lower" />
<option name="OBJECT_CASE" value="preserve" />
</case-options>
<formatting-settings enabled="false">
<option name="STATEMENT_SPACING" value="one_line" />
<option name="CLAUSE_CHOP_DOWN" value="chop_down_if_statement_long" />
<option name="ITERATION_ELEMENTS_WRAPPING" value="chop_down_if_not_single" />
</formatting-settings>
</DBN-SQL>
</code_scheme>
</component>
\ No newline at end of file
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/gm-text-miner.iml" filepath="$PROJECT_DIR$/.idea/gm-text-miner.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
// Shared pipeline helpers (e.g. dingNotify) come from the company library.
@Library('gm-pipeline-library') _
pipeline {
    agent any
    options {
        // Console output add timestamps
        timestamps()
        // Disallow concurrent executions of the Pipeline
        disableConcurrentBuilds()
        // On failure, retry the entire Pipeline the specified number of times.
        retry(1)
    }
    parameters {
        // User-facing description is intentionally in Chinese; the parameter
        // toggles docker-build caching (empty = use cache, '--no-cache' = don't).
        choice(name: 'cache', choices: ['', '--no-cache'], description: 'docker build 是否使用cache,默认使用,不使用为--no-cache')
    }
    stages {
        // "打包": build the source distribution.
        stage('打包') {
            steps {
                script {
                    sh 'python setup.py sdist'
                }
            }
        }
        // "发布Pypi": upload the sdist to the internal coding-pypi index.
        stage('发布Pypi') {
            steps {
                script {
                    sh 'twine upload --skip-existing -r coding-pypi dist/* '
                }
            }
        }
    }
    post {
        always {
            // Notify DingTalk with the final build result, success or failure.
            dingNotify "after", "${currentBuild.currentResult}"
        }
    }
}
\ No newline at end of file
include dicts/*
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from preprocesser.processors import token_processor
from preprocesser.filter import stopwords_filter
from collections import Counter
from config import config
import os
class SELECTED_CONTENT_TYPE:
    """Closed set of content categories, each an (id, Chinese label) pair.

    Only the integer id (index 0) is returned by the classifier; the label
    is kept for human inspection.
    """

    BEAUTY_PROJECT = (1, "医美项目")     # medical-beauty project
    BEAUTY_STAR = (2, "明星医美")        # celebrity medical beauty
    BEAUTY_CELEBRITY = (3, "网红医美")   # influencer medical beauty
    STAR_GOSSIP = (4, "明星八卦")        # celebrity gossip
    CELEBRITY_GOSSIP = (5, "网红八卦")   # influencer gossip
class TextClassifical(object):
    """Dictionary-based text classifier.

    Counts occurrences of words from three dictionaries (network
    influencers, beauty projects, stars) in a text and maps the weighted
    proportions onto one of the SELECTED_CONTENT_TYPE ids.
    """

    def __init__(self, network_influencer_path, project_path, star_path):
        # Each mapping is {word: 1}; only the key set is ever consulted.
        self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
        self.project_words = self.build_project_words(project_path)
        self.star_words = self.build_star_words(star_path)
        self.tokenprocessor = token_processor
        self.stopwords_filter = stopwords_filter

    def _load_word_dict(self, word_path):
        """Read one word per line (decode errors ignored) into a {word: 1}
        dict, skipping blank lines; closes the file deterministically."""
        ret = {}
        with open(word_path, "r", errors="ignore") as fin:
            for line in fin:
                word = line.strip()
                if word:
                    ret[word] = 1
        return ret

    def build_network_influencer_words(self, word_path):
        """Load the network-influencer dictionary."""
        return self._load_word_dict(word_path)

    def build_project_words(self, project_path):
        """Load the beauty-project dictionary."""
        return self._load_word_dict(project_path)

    def build_star_words(self, star_path):
        """Load the star dictionary."""
        return self._load_word_dict(star_path)

    def run(self, content):
        """Tokenize *content*, drop stopwords and classify it.

        Returns a dict with the predicted ``content_type`` id (-1 when no
        dictionary word occurs) plus, per category, a list of
        {word: probability} entries for dictionary words found in the text.
        """
        ret = {
            "content_type": -1,
            "star": [],
            "celebrity": [],
            "projects": []
        }
        words = self.tokenprocessor.lcut(content, cut_all=True)
        # Use the instance's filter (the original inconsistently called the
        # module-level global here) so an injected filter is honoured.
        words = self.stopwords_filter.filter(words)
        netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
        project_word_concurrence = set(words) & set(self.project_words)
        star_words_concurrence = set(words) & set(self.star_words)
        counter = Counter(words)
        content_type, words_proba = self.predict(counter, netword_influencer_concurrence,
                                                 project_word_concurrence, star_words_concurrence)
        ret["content_type"] = content_type
        ret["star"].extend([{word: words_proba[2].get(word, 0.0)} for word in star_words_concurrence])
        ret["celebrity"].extend([{word: words_proba[0].get(word, 0.0)} for word in netword_influencer_concurrence])
        ret["projects"].extend([{word: words_proba[1].get(word, 0.0)} for word in project_word_concurrence])
        return ret

    def score(self, counter, concurrence_words):
        # Placeholder kept for interface compatibility; not implemented yet.
        pass

    def predict(self, counter, netword_influencer_concurrence, project_word_concurrence, star_words_concurrence):
        """Return (content_type_id, [influencer_proba, project_proba, star_proba]).

        Influencer and star hits are double-weighted relative to project
        hits. Each proba dict maps a hit word to its share of that
        category's weighted total; -1 means no dictionary word was seen.
        """
        words_proba = []
        net_influencer_total = sum(counter[word] * 2 for word in netword_influencer_concurrence)
        # Empty concurrence sets give an empty dict, so no division by zero.
        words_proba.append({word: float(counter[word] * 2) / net_influencer_total
                            for word in netword_influencer_concurrence})
        project_words_total = sum(counter[word] for word in project_word_concurrence)
        words_proba.append({word: float(counter[word]) / project_words_total
                            for word in project_word_concurrence})
        star_words_total = sum(counter[word] * 2 for word in star_words_concurrence)
        words_proba.append({word: float(counter[word] * 2) / star_words_total
                            for word in star_words_concurrence})
        total_word = net_influencer_total + project_words_total + star_words_total
        if total_word <= 0:
            return -1, words_proba
        each_proba = [float(item) / total_word for item in
                      (net_influencer_total, project_words_total, star_words_total)]
        if each_proba[1] <= 0 and each_proba[2] >= each_proba[0]:
            return SELECTED_CONTENT_TYPE.STAR_GOSSIP[0], words_proba
        elif each_proba[1] <= 0 and each_proba[2] < each_proba[0]:
            # BUGFIX: the original returned SELECTED_CONTENT_TYPE[0], which
            # raises TypeError (a class is not subscriptable). A gossip text
            # dominated by influencer words is CELEBRITY_GOSSIP.
            return SELECTED_CONTENT_TYPE.CELEBRITY_GOSSIP[0], words_proba
        elif each_proba[1] > 0.75:
            return SELECTED_CONTENT_TYPE.BEAUTY_PROJECT[0], words_proba
        elif each_proba[0] > each_proba[2]:
            return SELECTED_CONTENT_TYPE.BEAUTY_CELEBRITY[0], words_proba
        else:
            return SELECTED_CONTENT_TYPE.BEAUTY_STAR[0], words_proba
# Repository root: three directory levels above this file, so the relative
# dictionary paths from config resolve regardless of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-3])
# Module-level singleton used by importers; constructing it loads all three
# dictionaries at import time.
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
                        os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from preprocesser.processors import token_processor
from preprocesser.filter import stopwords_filter
from collections import Counter
from config import config
import os
class SELECTED_CONTENT_TYPE:
    """Closed set of content categories, each an (id, Chinese label) pair.

    Only the integer id (index 0) is returned by the classifier; the label
    is kept for human inspection.
    """

    BEAUTY_PROJECT = (1, "医美项目")     # medical-beauty project
    BEAUTY_STAR = (2, "明星医美")        # celebrity medical beauty
    BEAUTY_CELEBRITY = (3, "网红医美")   # influencer medical beauty
    STAR_GOSSIP = (4, "明星八卦")        # celebrity gossip
    CELEBRITY_GOSSIP = (5, "网红八卦")   # influencer gossip
class TextClassifical(object):
    """Dictionary-based text classifier.

    Counts occurrences of words from three dictionaries (network
    influencers, beauty projects, stars) in a text and maps the weighted
    proportions onto one of the SELECTED_CONTENT_TYPE ids.
    """

    def __init__(self, network_influencer_path, project_path, star_path):
        # Each mapping is {word: 1}; only the key set is ever consulted.
        self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
        self.project_words = self.build_project_words(project_path)
        self.star_words = self.build_star_words(star_path)
        self.tokenprocessor = token_processor
        self.stopwords_filter = stopwords_filter

    def _load_word_dict(self, word_path):
        """Read one word per line (decode errors ignored) into a {word: 1}
        dict, skipping blank lines; closes the file deterministically."""
        ret = {}
        with open(word_path, "r", errors="ignore") as fin:
            for line in fin:
                word = line.strip()
                if word:
                    ret[word] = 1
        return ret

    def build_network_influencer_words(self, word_path):
        """Load the network-influencer dictionary."""
        return self._load_word_dict(word_path)

    def build_project_words(self, project_path):
        """Load the beauty-project dictionary."""
        return self._load_word_dict(project_path)

    def build_star_words(self, star_path):
        """Load the star dictionary."""
        return self._load_word_dict(star_path)

    def run(self, content):
        """Tokenize *content*, drop stopwords and classify it.

        Returns a dict with the predicted ``content_type`` id (-1 when no
        dictionary word occurs) plus, per category, a list of
        {word: probability} entries for dictionary words found in the text.
        """
        ret = {
            "content_type": -1,
            "star": [],
            "celebrity": [],
            "projects": []
        }
        words = self.tokenprocessor.lcut(content, cut_all=True)
        # Use the instance's filter (the original inconsistently called the
        # module-level global here) so an injected filter is honoured.
        words = self.stopwords_filter.filter(words)
        netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
        project_word_concurrence = set(words) & set(self.project_words)
        star_words_concurrence = set(words) & set(self.star_words)
        counter = Counter(words)
        content_type, words_proba = self.predict(counter, netword_influencer_concurrence,
                                                 project_word_concurrence, star_words_concurrence)
        ret["content_type"] = content_type
        ret["star"].extend([{word: words_proba[2].get(word, 0.0)} for word in star_words_concurrence])
        ret["celebrity"].extend([{word: words_proba[0].get(word, 0.0)} for word in netword_influencer_concurrence])
        ret["projects"].extend([{word: words_proba[1].get(word, 0.0)} for word in project_word_concurrence])
        return ret

    def score(self, counter, concurrence_words):
        # Placeholder kept for interface compatibility; not implemented yet.
        pass

    def predict(self, counter, netword_influencer_concurrence, project_word_concurrence, star_words_concurrence):
        """Return (content_type_id, [influencer_proba, project_proba, star_proba]).

        Influencer and star hits are double-weighted relative to project
        hits. Each proba dict maps a hit word to its share of that
        category's weighted total; -1 means no dictionary word was seen.
        """
        words_proba = []
        net_influencer_total = sum(counter[word] * 2 for word in netword_influencer_concurrence)
        # Empty concurrence sets give an empty dict, so no division by zero.
        words_proba.append({word: float(counter[word] * 2) / net_influencer_total
                            for word in netword_influencer_concurrence})
        project_words_total = sum(counter[word] for word in project_word_concurrence)
        words_proba.append({word: float(counter[word]) / project_words_total
                            for word in project_word_concurrence})
        star_words_total = sum(counter[word] * 2 for word in star_words_concurrence)
        words_proba.append({word: float(counter[word] * 2) / star_words_total
                            for word in star_words_concurrence})
        total_word = net_influencer_total + project_words_total + star_words_total
        if total_word <= 0:
            return -1, words_proba
        each_proba = [float(item) / total_word for item in
                      (net_influencer_total, project_words_total, star_words_total)]
        if each_proba[1] <= 0 and each_proba[2] >= each_proba[0]:
            return SELECTED_CONTENT_TYPE.STAR_GOSSIP[0], words_proba
        elif each_proba[1] <= 0 and each_proba[2] < each_proba[0]:
            # BUGFIX: the original returned SELECTED_CONTENT_TYPE[0], which
            # raises TypeError (a class is not subscriptable). A gossip text
            # dominated by influencer words is CELEBRITY_GOSSIP.
            return SELECTED_CONTENT_TYPE.CELEBRITY_GOSSIP[0], words_proba
        elif each_proba[1] > 0.75:
            return SELECTED_CONTENT_TYPE.BEAUTY_PROJECT[0], words_proba
        elif each_proba[0] > each_proba[2]:
            return SELECTED_CONTENT_TYPE.BEAUTY_CELEBRITY[0], words_proba
        else:
            return SELECTED_CONTENT_TYPE.BEAUTY_STAR[0], words_proba
# Repository root: three directory levels above this file, so the relative
# dictionary paths from config resolve regardless of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-3])
# Module-level singleton used by importers; constructing it loads all three
# dictionaries at import time.
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
                        os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# Global debug switch; nothing in this package reads it at the moment.
DEBUG = False
# NOTE(review): the bare string below only documents tuning knobs; solely
# the path constants underneath are imported by the rest of the package.
"""
recall_topK:
我们选取多少个候选词
sort_topK:
排序后挑选的词汇数量
min_frequence:
所有词汇必须共现的次数
stopwords_path:
停用词路径
words_path:
词典path
"""
# Dictionary locations relative to the repository root; consumers join
# them onto a computed root path before opening.
stopwords_path = "dicts/stopwords.dic"  # stopword list, one token per line
words_path = "dicts/words.dic"  # jieba tokenizer dictionary
# "influcer" is a typo but matches the on-disk filename
# dicts/network_influcer.dic — do not "fix" one without the other.
network_influcer_dic = "dicts/network_influcer.dic"
projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic"
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
丁真
周扬青
冯提莫
半藏森林
艾比
韩安冉
南笙
奶茶妹妹
宋昕冉
林小宅
晚晚
谢安然
王柠萌
Naomi
于文红
甜仇
温精灵
温婉
Fiona宋亮
李蒽熙
una
夏夏
水野亚美
小小如
卓亨瑜
彭王者
滕雨佳
腻腻ninii
李恩童
花珊珊
小初
小饼干
晚妹
吃一口甜
徐清婉
Jy小语
张贤静
施安妮
周子然Femi
XIZI杨
彦崽儿
潘白雪
方恰拉
MAGBOW
CHU小初
不求上进的柚砸
LU一丝
大佬儿
姚淇瀚Henry
周小濛
半藏
Abbily
王嘉辉
罗小伊
章泽天
林晓婷
晚奶
柠檬
康雅馨
仇琳琳
温仙女
张曼如
雷婉婷
土豆公主
This diff is collapsed.
肖战
杨紫
赵丽颖
杨幂
倪妮
迪丽热巴
范冰冰
鞠婧祎
刘诗诗
Lisa
吴宣仪
赵露思
杨超越
Angelababy
高圆圆
章子怡
乔欣
张雨绮
孙怡
江疏影
毛晓彤
张馨予
王祖贤
张子枫
陈小纭
舒淇
石原里美
关之琳
权志龙
陈数
程潇
李小璐
景甜
奚梦瑶
戚薇
萧亚轩
车晓
沈梦辰
陈妍希
张予曦
陈坤
林珍娜
宋慧乔
孟佳
张靓颖
郭采洁
白冰
林允
吉娜
姚晨
昆凌
白百何
沈月
邓文迪
王心凌
杨雪
朴敏英
水原希子
甘薇
秀智
高允真
苟芸慧
新垣结衣
徐贞姬
孙胜完
郑采妍
战战
紫妹
小猴子
赵姐
颖宝
大幂幂
妮妮
喵总
热巴
范爷
冰冰
四千年
老鞠
诗爷
人间芭比
小选
肉丝
超越妹妹
杨颖
baby
国际章
乔妹
绮绮子
怡宝
张燕
妹妹
十元
GD
嫂子
金莲
大甜甜
小明
戚哥
鲜肉菩萨
小笼包
乔妹
费霞
天王嫂
国民初恋
GAKKI
(
)
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
...................
./
.一
.数
.日
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
’‘
”,
……
…………………………………………………③
′∈
′|
∈[
∪φ∈
②c
③]
──
 
》),
〕〔
︿
)÷(1-
)、
+ξ
++
,也
-β
--
-[*]-
0:2
1.
12%
2.3%
5:0
<±
<Δ
<λ
<φ
<<
=″
=☆
=(
=-
=[
={
>λ
LI
R.L.
ZXFITL
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
[*]
[-
[]
]∧′=[
][
_
a]
b]
c]
e]
f]
ng昉
{-
}>
~±
~+
This diff is collapsed.
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import re, os
from config import config
class Filter(object):
    """Base class for token filters backed by a word file.

    Subclasses are expected to load *file_path* into ``self.stopwords``
    and override :meth:`filter`.
    """

    def __init__(self, file_path, encoding="utf-8"):
        # BUGFIX: the default used to be "utf-*", an invalid codec name that
        # would raise LookupError the moment a subclass opened the file with
        # it; "utf-8" is the intended default (matching StopwordsFilter).
        self.file_path = file_path
        self.encoding = encoding
        self.stopwords = set()

    def filter(self, token_list):
        """Return the filtered token list; subclasses must override.

        The signature now matches the StopwordsFilter override (the
        original abstract method took no arguments).
        """
        raise NotImplementedError
class StopwordsFilter(Filter):
    """Drops stopwords from a token list.

    Each token is normalized by removing every character that is not a CJK
    ideograph, an ASCII digit or an ASCII letter before being compared
    against the stopword set.
    """

    def __init__(self, file_path, encoding="utf-8"):
        super(StopwordsFilter, self).__init__(file_path, encoding)
        self.init()

    def remove_irregular_chars(self, corpus: str) -> str:
        """Strip every char outside \\u4e00-\\u9fa5 (CJK), 0-9, A-Z, a-z."""
        return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)

    def init(self):
        """Load the stopword file, one stripped word per line.

        The "" and "\\n" sentinels are added once up front (the original
        re-added them on every iteration) and the file handle is closed via
        the context manager (the original leaked it). The dead broad
        try/except around str.strip() was removed — it could never fire for
        lines read from a text file.
        """
        self.stopwords.add("\n")
        self.stopwords.add("")
        with open(self.file_path, "r", encoding=self.encoding) as fin:
            for line in fin:
                self.stopwords.add(line.strip())

    def filter(self, token_list):
        """Return normalized tokens whose normalized form is not a stopword."""
        return [self.remove_irregular_chars(item.strip()) for item in token_list if
                self.remove_irregular_chars(item) not in self.stopwords]
# Repository root: two directory levels above this file, so config's
# relative stopword path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the stopword file.
# (A stray debug `print()` that emitted a blank line on import was removed.)
stopwords_filter = StopwordsFilter(os.path.join(root_path, config.stopwords_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from tqdm import tqdm
class Pipeline(object):
    """Ordered container of processing steps.

    The pipeline turns the words of a file into sentence-level token
    lists; steps are appended by callers after construction.
    """

    def __init__(self):
        # No steps yet — callers populate this list themselves.
        self.pipelines = []
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from abc import ABC
from jieba import Tokenizer
import re, os
from config import config
class SentenceSegmenter(object):
    """Splits text into sentence fragments on a set of punctuation marks."""

    def __init__(self, split_pun=None):
        if not split_pun:
            # Default: Chinese and ASCII sentence enders plus newline,
            # expressed as a regex character class.
            self.split_pun = r'[;;.。,,!\n!??]'
        else:
            # NOTE(review): custom marks are concatenated into a plain regex
            # pattern, NOT wrapped in a [...] character class — callers must
            # pass a ready-made regex fragment. Verify this is intended.
            self.split_pun = '{}'.format("".join(split_pun))

    def split(self, sentences):
        """Yield non-empty fragments from a string or an iterable of strings.

        BUGFIX: the original iterable branch yielded empty fragments
        produced by consecutive separators, while the string branch
        filtered them out; both paths now skip empties consistently.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        for sentence in sentences:
            for item in re.split(self.split_pun, sentence):
                if item:
                    yield item
class Processor(object):
    """Abstract base for text processors.

    Stores a display name; concrete subclasses supply the actual
    processing logic by overriding :meth:`run`.
    """

    def __init__(self, name):
        self._name = name  # internal label, not used for dispatch

    def run(self):
        """Subclasses must implement the processing entry point."""
        raise NotImplementedError
class StandardProcessor(Processor):
    """Trivial processor: "tokenizes" by inserting a space between the
    elements (for a plain string, between its characters)."""

    def __init__(self, name=""):
        super(StandardProcessor, self).__init__(name)

    def lcut(self, line):
        """Return *line*'s elements joined by single spaces."""
        return " ".join(line)
class TokenizerProcessor(Processor, ABC):
    """Processor wrapping a jieba Tokenizer built from a custom dictionary."""

    def __init__(self, file_path):
        # BUGFIX: the original never called Processor.__init__, leaving
        # self._name unset; initialise the base like StandardProcessor does.
        super(TokenizerProcessor, self).__init__("")
        self.file_path = file_path
        self.tokenizer = None
        self.init(self.file_path)

    def init(self, dict_path=None):
        """Build and eagerly initialise the jieba tokenizer so the first
        lcut() call does not pay the dictionary-loading cost."""
        tokenizer = Tokenizer(dictionary=dict_path)
        tokenizer.initialize()
        self.tokenizer = tokenizer

    def lcut(self, line, cut_all=False):
        """Tokenize *line* with jieba (HMM enabled).

        当前只支持smart的切词方式 — only jieba's default "smart" mode is
        supported unless cut_all=True is passed through.
        :param line: text to tokenize
        :return: list of tokens
        """
        return self.tokenizer.lcut(line, HMM=True, cut_all=cut_all)
# Repository root: two directory levels above this file, so config's
# relative dictionary path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the jieba dictionary.
token_processor = TokenizerProcessor(os.path.join(root_path, config.words_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# Global debug switch; nothing in this package reads it at the moment.
DEBUG = False
# NOTE(review): the bare string below only documents tuning knobs; solely
# the path constants underneath are imported by the rest of the package.
"""
recall_topK:
我们选取多少个候选词
sort_topK:
排序后挑选的词汇数量
min_frequence:
所有词汇必须共现的次数
stopwords_path:
停用词路径
words_path:
词典path
"""
# Dictionary locations relative to the repository root; consumers join
# them onto a computed root path before opening.
stopwords_path = "dicts/stopwords.dic"  # stopword list, one token per line
words_path = "dicts/words.dic"  # jieba tokenizer dictionary
# "influcer" is a typo but matches the on-disk filename
# dicts/network_influcer.dic — do not "fix" one without the other.
network_influcer_dic = "dicts/network_influcer.dic"
projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic"
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
丁真
周扬青
冯提莫
半藏森林
艾比
韩安冉
南笙
奶茶妹妹
宋昕冉
林小宅
晚晚
谢安然
王柠萌
Naomi
于文红
甜仇
温精灵
温婉
Fiona宋亮
李蒽熙
una
夏夏
水野亚美
小小如
卓亨瑜
彭王者
滕雨佳
腻腻ninii
李恩童
花珊珊
小初
小饼干
晚妹
吃一口甜
徐清婉
Jy小语
张贤静
施安妮
周子然Femi
XIZI杨
彦崽儿
潘白雪
方恰拉
MAGBOW
CHU小初
不求上进的柚砸
LU一丝
大佬儿
姚淇瀚Henry
周小濛
半藏
Abbily
王嘉辉
罗小伊
章泽天
林晓婷
晚奶
柠檬
康雅馨
仇琳琳
温仙女
张曼如
雷婉婷
土豆公主
This diff is collapsed.
肖战
杨紫
赵丽颖
杨幂
倪妮
迪丽热巴
范冰冰
鞠婧祎
刘诗诗
Lisa
吴宣仪
赵露思
杨超越
Angelababy
高圆圆
章子怡
乔欣
张雨绮
孙怡
江疏影
毛晓彤
张馨予
王祖贤
张子枫
陈小纭
舒淇
石原里美
关之琳
权志龙
陈数
程潇
李小璐
景甜
奚梦瑶
戚薇
萧亚轩
车晓
沈梦辰
陈妍希
张予曦
陈坤
林珍娜
宋慧乔
孟佳
张靓颖
郭采洁
白冰
林允
吉娜
姚晨
昆凌
白百何
沈月
邓文迪
王心凌
杨雪
朴敏英
水原希子
甘薇
秀智
高允真
苟芸慧
新垣结衣
徐贞姬
孙胜完
郑采妍
战战
紫妹
小猴子
赵姐
颖宝
大幂幂
妮妮
喵总
热巴
范爷
冰冰
四千年
老鞠
诗爷
人间芭比
小选
肉丝
超越妹妹
杨颖
baby
国际章
乔妹
绮绮子
怡宝
张燕
妹妹
十元
GD
嫂子
金莲
大甜甜
小明
戚哥
鲜肉菩萨
小笼包
乔妹
费霞
天王嫂
国民初恋
GAKKI
(
)
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
...................
./
.一
.数
.日
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
’‘
”,
……
…………………………………………………③
′∈
′|
∈[
∪φ∈
②c
③]
──
 
》),
〕〔
︿
)÷(1-
)、
+ξ
++
,也
-β
--
-[*]-
0:2
1.
12%
2.3%
5:0
<±
<Δ
<λ
<φ
<<
=″
=☆
=(
=-
=[
={
>λ
LI
R.L.
ZXFITL
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
[*]
[-
[]
]∧′=[
][
_
a]
b]
c]
e]
f]
ng昉
{-
}>
~±
~+
This diff is collapsed.
Metadata-Version: 1.1
Name: gm-text-miner
Version: 1.0.0
Summary: classifical base word dict
Home-page: UNKNOWN
Author: crazyer
Author-email: zhangguodong@igengmei.com
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
MANIFEST.in
setup.py
algorithm/__init__.py
algorithm/text_classifical/__init__.py
algorithm/text_classifical/base.py
config/__init__.py
config/config.py
dicts/__init__.py
dicts/network_influcer.dic
dicts/project.dic
dicts/star.dic
dicts/stopwords.dic
dicts/words.dic
gm_text_miner.egg-info/PKG-INFO
gm_text_miner.egg-info/SOURCES.txt
gm_text_miner.egg-info/dependency_links.txt
gm_text_miner.egg-info/top_level.txt
preprocesser/__init__.py
preprocesser/filter.py
preprocesser/pipeline.py
preprocesser/processors.py
\ No newline at end of file
algorithm
config
dicts
preprocesser
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from algorithm.text_classifical.base import model
content = "双眼皮"
content = """
🥰🥰38+看起来像20+ 我的驻颜秘密大公开
 
😳😳前两天看网上有人说关之琳在30岁才开始大红,我们现在看到的作品很多都是她30以后的电影。女人很少能在这个年龄才开始出头,现在女星你三十都没有出名,后面像再靠颜值火起来根本是没有可能的。关美女能火那是因为她有别人不可比拟的骨相和皮相美。
-
👇🏻👇🏻如何让自己能在三十多看起来还像少女一样稚嫩呢?下嘛看看我们案例姑娘的保养秘诀
-
🌿🌿术前情况:随着年龄的增加,胶原蛋白的流失皮相一路向下。正常情况下三十岁左右的女人,应该是稍微丰腴一点会显得比较和蔼,有福相看起来会比较年轻。我们案例姑娘在这个年龄确
-
✍🏻️✍🏻️整形方案:面部填充。重点填充部位太阳穴和面颊部位,额捏角。
-
🎉🎉术后效果:如果你细看那些女明星,三十多还风采依旧的基本上都是做过填充的姑娘,典型的我们填充模板,王子文。填充后那颜值一路高歌。我们案例姑娘一样,填充后,面部线条柔和,没有突兀感所以颜值也是猛然提升。
-
🌈🌈术后七天:面部肿胀在三四天的时候达到高峰,七天左右开始消肿。
-
🥰🥰术后一个月:这个时候基本已经消肿完毕,但是术后效果并不是稳定期,术后效果也不是最好的。如果还有肿胀也是正常的,建议耐心等待。
-
☑️☑️术后三个月:术后效果稳定期,面部填充效果最好的时期无疑是三个月恢复期过后。这个时候填充脂肪细胞基本稳定,面部表情也会变得自然。
"""
print(model.run(content))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import re, os
from config import config
class Filter(object):
    """Base class for token filters backed by a word file.

    Subclasses are expected to load *file_path* into ``self.stopwords``
    and override :meth:`filter`.
    """

    def __init__(self, file_path, encoding="utf-8"):
        # BUGFIX: the default used to be "utf-*", an invalid codec name that
        # would raise LookupError the moment a subclass opened the file with
        # it; "utf-8" is the intended default (matching StopwordsFilter).
        self.file_path = file_path
        self.encoding = encoding
        self.stopwords = set()

    def filter(self, token_list):
        """Return the filtered token list; subclasses must override.

        The signature now matches the StopwordsFilter override (the
        original abstract method took no arguments).
        """
        raise NotImplementedError
class StopwordsFilter(Filter):
    """Drops stopwords from a token list.

    Each token is normalized by removing every character that is not a CJK
    ideograph, an ASCII digit or an ASCII letter before being compared
    against the stopword set.
    """

    def __init__(self, file_path, encoding="utf-8"):
        super(StopwordsFilter, self).__init__(file_path, encoding)
        self.init()

    def remove_irregular_chars(self, corpus: str) -> str:
        """Strip every char outside \\u4e00-\\u9fa5 (CJK), 0-9, A-Z, a-z."""
        return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)

    def init(self):
        """Load the stopword file, one stripped word per line.

        The "" and "\\n" sentinels are added once up front (the original
        re-added them on every iteration) and the file handle is closed via
        the context manager (the original leaked it). The dead broad
        try/except around str.strip() was removed — it could never fire for
        lines read from a text file.
        """
        self.stopwords.add("\n")
        self.stopwords.add("")
        with open(self.file_path, "r", encoding=self.encoding) as fin:
            for line in fin:
                self.stopwords.add(line.strip())

    def filter(self, token_list):
        """Return normalized tokens whose normalized form is not a stopword."""
        return [self.remove_irregular_chars(item.strip()) for item in token_list if
                self.remove_irregular_chars(item) not in self.stopwords]
# Repository root: two directory levels above this file, so config's
# relative stopword path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the stopword file.
# (A stray debug `print()` that emitted a blank line on import was removed.)
stopwords_filter = StopwordsFilter(os.path.join(root_path, config.stopwords_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from tqdm import tqdm
class Pipeline(object):
    """Ordered container of processing steps.

    The pipeline turns the words of a file into sentence-level token
    lists; steps are appended by callers after construction.
    """

    def __init__(self):
        # No steps yet — callers populate this list themselves.
        self.pipelines = []
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from abc import ABC
from jieba import Tokenizer
import re, os
from config import config
class SentenceSegmenter(object):
    """Splits text into sentence fragments on a set of punctuation marks."""

    def __init__(self, split_pun=None):
        if not split_pun:
            # Default: Chinese and ASCII sentence enders plus newline,
            # expressed as a regex character class.
            self.split_pun = r'[;;.。,,!\n!??]'
        else:
            # NOTE(review): custom marks are concatenated into a plain regex
            # pattern, NOT wrapped in a [...] character class — callers must
            # pass a ready-made regex fragment. Verify this is intended.
            self.split_pun = '{}'.format("".join(split_pun))

    def split(self, sentences):
        """Yield non-empty fragments from a string or an iterable of strings.

        BUGFIX: the original iterable branch yielded empty fragments
        produced by consecutive separators, while the string branch
        filtered them out; both paths now skip empties consistently.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        for sentence in sentences:
            for item in re.split(self.split_pun, sentence):
                if item:
                    yield item
class Processor(object):
    """Abstract base for text processors.

    Stores a display name; concrete subclasses supply the actual
    processing logic by overriding :meth:`run`.
    """

    def __init__(self, name):
        self._name = name  # internal label, not used for dispatch

    def run(self):
        """Subclasses must implement the processing entry point."""
        raise NotImplementedError
class StandardProcessor(Processor):
    """Trivial processor: "tokenizes" by inserting a space between the
    elements (for a plain string, between its characters)."""

    def __init__(self, name=""):
        super(StandardProcessor, self).__init__(name)

    def lcut(self, line):
        """Return *line*'s elements joined by single spaces."""
        return " ".join(line)
class TokenizerProcessor(Processor, ABC):
    """Processor wrapping a jieba Tokenizer built from a custom dictionary."""

    def __init__(self, file_path):
        # BUGFIX: the original never called Processor.__init__, leaving
        # self._name unset; initialise the base like StandardProcessor does.
        super(TokenizerProcessor, self).__init__("")
        self.file_path = file_path
        self.tokenizer = None
        self.init(self.file_path)

    def init(self, dict_path=None):
        """Build and eagerly initialise the jieba tokenizer so the first
        lcut() call does not pay the dictionary-loading cost."""
        tokenizer = Tokenizer(dictionary=dict_path)
        tokenizer.initialize()
        self.tokenizer = tokenizer

    def lcut(self, line, cut_all=False):
        """Tokenize *line* with jieba (HMM enabled).

        当前只支持smart的切词方式 — only jieba's default "smart" mode is
        supported unless cut_all=True is passed through.
        :param line: text to tokenize
        :return: list of tokens
        """
        return self.tokenizer.lcut(line, HMM=True, cut_all=cut_all)
# Repository root: two directory levels above this file, so config's
# relative dictionary path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the jieba dictionary.
token_processor = TokenizerProcessor(os.path.join(root_path, config.words_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import setuptools

# Runtime dependencies: the package imports jieba (preprocesser/processors.py)
# and tqdm (preprocesser/pipeline.py), so declare them here — the original
# left the list empty, producing a package that fails at import time.
requires = [
    "jieba",
    "tqdm",
]
# Extra dependencies for development installs (`pip install .[dev]`).
dev_requires = [
]
setuptools.setup(
    name='gm-text-miner',
    version="1.0.0",
    author="crazyer",
    author_email="zhangguodong@igengmei.com",
    description="classifical base word dict",
    install_requires=requires,
    url="",
    packages=setuptools.find_packages(),
    # BUGFIX: the setuptools keyword is `extras_require`; the original
    # `extra_requires` was silently ignored, so the dev extra never existed.
    extras_require={
        'dev': dev_requires,
    },
    classifiers=[
        "Programming Language :: Python :: 3",
    ],
    # Ship the dicts/* data files listed in MANIFEST.in with the package.
    include_package_data=True
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment