Commit 890ee5a4 authored by crazyer's avatar crazyer

text miner

parent eb61cf61
# Default ignored files
/workspace.xml
<component name="ProjectCodeStyleConfiguration">
<code_scheme name="Project" version="173">
<DBN-PSQL>
<case-options enabled="true">
<option name="KEYWORD_CASE" value="lower" />
<option name="FUNCTION_CASE" value="lower" />
<option name="PARAMETER_CASE" value="lower" />
<option name="DATATYPE_CASE" value="lower" />
<option name="OBJECT_CASE" value="preserve" />
</case-options>
<formatting-settings enabled="false" />
</DBN-PSQL>
<DBN-SQL>
<case-options enabled="true">
<option name="KEYWORD_CASE" value="lower" />
<option name="FUNCTION_CASE" value="lower" />
<option name="PARAMETER_CASE" value="lower" />
<option name="DATATYPE_CASE" value="lower" />
<option name="OBJECT_CASE" value="preserve" />
</case-options>
<formatting-settings enabled="false">
<option name="STATEMENT_SPACING" value="one_line" />
<option name="CLAUSE_CHOP_DOWN" value="chop_down_if_statement_long" />
<option name="ITERATION_ELEMENTS_WRAPPING" value="chop_down_if_not_single" />
</formatting-settings>
</DBN-SQL>
</code_scheme>
</component>
\ No newline at end of file
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/gm-text-miner.iml" filepath="$PROJECT_DIR$/.idea/gm-text-miner.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
// Shared pipeline helpers (e.g. dingNotify) come from the company library.
@Library('gm-pipeline-library') _
pipeline {
    agent any
    options {
        // Console output add timestamps
        timestamps()
        // Disallow concurrent executions of the Pipeline
        disableConcurrentBuilds()
        // On failure, retry the entire Pipeline the specified number of times.
        retry(1)
    }
    parameters {
        // User-facing description is intentionally in Chinese; the parameter
        // toggles docker-build caching (empty = use cache, '--no-cache' = don't).
        choice(name: 'cache', choices: ['', '--no-cache'], description: 'docker build 是否使用cache,默认使用,不使用为--no-cache')
    }
    stages {
        // "打包": build the source distribution.
        stage('打包') {
            steps {
                script {
                    sh 'python setup.py sdist'
                }
            }
        }
        // "发布Pypi": upload the sdist to the internal coding-pypi index.
        stage('发布Pypi') {
            steps {
                script {
                    sh 'twine upload --skip-existing -r coding-pypi dist/* '
                }
            }
        }
    }
    post {
        always {
            // Notify DingTalk with the final build result, success or failure.
            dingNotify "after", "${currentBuild.currentResult}"
        }
    }
}
\ No newline at end of file
include dicts/*
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from preprocesser.processors import token_processor
from preprocesser.filter import stopwords_filter
from collections import Counter
from config import config
import os
class SELECTED_CONTENT_TYPE:
    """Closed set of content categories, each an (id, Chinese label) pair.

    Only the integer id (index 0) is returned by the classifier; the label
    is kept for human inspection.
    """

    BEAUTY_PROJECT = (1, "医美项目")     # medical-beauty project
    BEAUTY_STAR = (2, "明星医美")        # celebrity medical beauty
    BEAUTY_CELEBRITY = (3, "网红医美")   # influencer medical beauty
    STAR_GOSSIP = (4, "明星八卦")        # celebrity gossip
    CELEBRITY_GOSSIP = (5, "网红八卦")   # influencer gossip
class TextClassifical(object):
    """Dictionary-based text classifier.

    Counts occurrences of words from three dictionaries (network
    influencers, beauty projects, stars) in a text and maps the weighted
    proportions onto one of the SELECTED_CONTENT_TYPE ids.
    """

    def __init__(self, network_influencer_path, project_path, star_path):
        # Each mapping is {word: 1}; only the key set is ever consulted.
        self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
        self.project_words = self.build_project_words(project_path)
        self.star_words = self.build_star_words(star_path)
        self.tokenprocessor = token_processor
        self.stopwords_filter = stopwords_filter

    def _load_word_dict(self, word_path):
        """Read one word per line (decode errors ignored) into a {word: 1}
        dict, skipping blank lines; closes the file deterministically."""
        ret = {}
        with open(word_path, "r", errors="ignore") as fin:
            for line in fin:
                word = line.strip()
                if word:
                    ret[word] = 1
        return ret

    def build_network_influencer_words(self, word_path):
        """Load the network-influencer dictionary."""
        return self._load_word_dict(word_path)

    def build_project_words(self, project_path):
        """Load the beauty-project dictionary."""
        return self._load_word_dict(project_path)

    def build_star_words(self, star_path):
        """Load the star dictionary."""
        return self._load_word_dict(star_path)

    def run(self, content):
        """Tokenize *content*, drop stopwords and classify it.

        Returns a dict with the predicted ``content_type`` id (-1 when no
        dictionary word occurs) plus, per category, a list of
        {word: probability} entries for dictionary words found in the text.
        """
        ret = {
            "content_type": -1,
            "star": [],
            "celebrity": [],
            "projects": []
        }
        words = self.tokenprocessor.lcut(content, cut_all=True)
        # Use the instance's filter (the original inconsistently called the
        # module-level global here) so an injected filter is honoured.
        words = self.stopwords_filter.filter(words)
        netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
        project_word_concurrence = set(words) & set(self.project_words)
        star_words_concurrence = set(words) & set(self.star_words)
        counter = Counter(words)
        content_type, words_proba = self.predict(counter, netword_influencer_concurrence,
                                                 project_word_concurrence, star_words_concurrence)
        ret["content_type"] = content_type
        ret["star"].extend([{word: words_proba[2].get(word, 0.0)} for word in star_words_concurrence])
        ret["celebrity"].extend([{word: words_proba[0].get(word, 0.0)} for word in netword_influencer_concurrence])
        ret["projects"].extend([{word: words_proba[1].get(word, 0.0)} for word in project_word_concurrence])
        return ret

    def score(self, counter, concurrence_words):
        # Placeholder kept for interface compatibility; not implemented yet.
        pass

    def predict(self, counter, netword_influencer_concurrence, project_word_concurrence, star_words_concurrence):
        """Return (content_type_id, [influencer_proba, project_proba, star_proba]).

        Influencer and star hits are double-weighted relative to project
        hits. Each proba dict maps a hit word to its share of that
        category's weighted total; -1 means no dictionary word was seen.
        """
        words_proba = []
        net_influencer_total = sum(counter[word] * 2 for word in netword_influencer_concurrence)
        # Empty concurrence sets give an empty dict, so no division by zero.
        words_proba.append({word: float(counter[word] * 2) / net_influencer_total
                            for word in netword_influencer_concurrence})
        project_words_total = sum(counter[word] for word in project_word_concurrence)
        words_proba.append({word: float(counter[word]) / project_words_total
                            for word in project_word_concurrence})
        star_words_total = sum(counter[word] * 2 for word in star_words_concurrence)
        words_proba.append({word: float(counter[word] * 2) / star_words_total
                            for word in star_words_concurrence})
        total_word = net_influencer_total + project_words_total + star_words_total
        if total_word <= 0:
            return -1, words_proba
        each_proba = [float(item) / total_word for item in
                      (net_influencer_total, project_words_total, star_words_total)]
        if each_proba[1] <= 0 and each_proba[2] >= each_proba[0]:
            return SELECTED_CONTENT_TYPE.STAR_GOSSIP[0], words_proba
        elif each_proba[1] <= 0 and each_proba[2] < each_proba[0]:
            # BUGFIX: the original returned SELECTED_CONTENT_TYPE[0], which
            # raises TypeError (a class is not subscriptable). A gossip text
            # dominated by influencer words is CELEBRITY_GOSSIP.
            return SELECTED_CONTENT_TYPE.CELEBRITY_GOSSIP[0], words_proba
        elif each_proba[1] > 0.75:
            return SELECTED_CONTENT_TYPE.BEAUTY_PROJECT[0], words_proba
        elif each_proba[0] > each_proba[2]:
            return SELECTED_CONTENT_TYPE.BEAUTY_CELEBRITY[0], words_proba
        else:
            return SELECTED_CONTENT_TYPE.BEAUTY_STAR[0], words_proba
# Repository root: three directory levels above this file, so the relative
# dictionary paths from config resolve regardless of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-3])
# Module-level singleton used by importers; constructing it loads all three
# dictionaries at import time.
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
                        os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from preprocesser.processors import token_processor
from preprocesser.filter import stopwords_filter
from collections import Counter
from config import config
import os
class SELECTED_CONTENT_TYPE:
    """Closed set of content categories, each an (id, Chinese label) pair.

    Only the integer id (index 0) is returned by the classifier; the label
    is kept for human inspection.
    """

    BEAUTY_PROJECT = (1, "医美项目")     # medical-beauty project
    BEAUTY_STAR = (2, "明星医美")        # celebrity medical beauty
    BEAUTY_CELEBRITY = (3, "网红医美")   # influencer medical beauty
    STAR_GOSSIP = (4, "明星八卦")        # celebrity gossip
    CELEBRITY_GOSSIP = (5, "网红八卦")   # influencer gossip
class TextClassifical(object):
    """Dictionary-based text classifier.

    Counts occurrences of words from three dictionaries (network
    influencers, beauty projects, stars) in a text and maps the weighted
    proportions onto one of the SELECTED_CONTENT_TYPE ids.
    """

    def __init__(self, network_influencer_path, project_path, star_path):
        # Each mapping is {word: 1}; only the key set is ever consulted.
        self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
        self.project_words = self.build_project_words(project_path)
        self.star_words = self.build_star_words(star_path)
        self.tokenprocessor = token_processor
        self.stopwords_filter = stopwords_filter

    def _load_word_dict(self, word_path):
        """Read one word per line (decode errors ignored) into a {word: 1}
        dict, skipping blank lines; closes the file deterministically."""
        ret = {}
        with open(word_path, "r", errors="ignore") as fin:
            for line in fin:
                word = line.strip()
                if word:
                    ret[word] = 1
        return ret

    def build_network_influencer_words(self, word_path):
        """Load the network-influencer dictionary."""
        return self._load_word_dict(word_path)

    def build_project_words(self, project_path):
        """Load the beauty-project dictionary."""
        return self._load_word_dict(project_path)

    def build_star_words(self, star_path):
        """Load the star dictionary."""
        return self._load_word_dict(star_path)

    def run(self, content):
        """Tokenize *content*, drop stopwords and classify it.

        Returns a dict with the predicted ``content_type`` id (-1 when no
        dictionary word occurs) plus, per category, a list of
        {word: probability} entries for dictionary words found in the text.
        """
        ret = {
            "content_type": -1,
            "star": [],
            "celebrity": [],
            "projects": []
        }
        words = self.tokenprocessor.lcut(content, cut_all=True)
        # Use the instance's filter (the original inconsistently called the
        # module-level global here) so an injected filter is honoured.
        words = self.stopwords_filter.filter(words)
        netword_influencer_concurrence = set(words) & set(self.network_influencer_words)
        project_word_concurrence = set(words) & set(self.project_words)
        star_words_concurrence = set(words) & set(self.star_words)
        counter = Counter(words)
        content_type, words_proba = self.predict(counter, netword_influencer_concurrence,
                                                 project_word_concurrence, star_words_concurrence)
        ret["content_type"] = content_type
        ret["star"].extend([{word: words_proba[2].get(word, 0.0)} for word in star_words_concurrence])
        ret["celebrity"].extend([{word: words_proba[0].get(word, 0.0)} for word in netword_influencer_concurrence])
        ret["projects"].extend([{word: words_proba[1].get(word, 0.0)} for word in project_word_concurrence])
        return ret

    def score(self, counter, concurrence_words):
        # Placeholder kept for interface compatibility; not implemented yet.
        pass

    def predict(self, counter, netword_influencer_concurrence, project_word_concurrence, star_words_concurrence):
        """Return (content_type_id, [influencer_proba, project_proba, star_proba]).

        Influencer and star hits are double-weighted relative to project
        hits. Each proba dict maps a hit word to its share of that
        category's weighted total; -1 means no dictionary word was seen.
        """
        words_proba = []
        net_influencer_total = sum(counter[word] * 2 for word in netword_influencer_concurrence)
        # Empty concurrence sets give an empty dict, so no division by zero.
        words_proba.append({word: float(counter[word] * 2) / net_influencer_total
                            for word in netword_influencer_concurrence})
        project_words_total = sum(counter[word] for word in project_word_concurrence)
        words_proba.append({word: float(counter[word]) / project_words_total
                            for word in project_word_concurrence})
        star_words_total = sum(counter[word] * 2 for word in star_words_concurrence)
        words_proba.append({word: float(counter[word] * 2) / star_words_total
                            for word in star_words_concurrence})
        total_word = net_influencer_total + project_words_total + star_words_total
        if total_word <= 0:
            return -1, words_proba
        each_proba = [float(item) / total_word for item in
                      (net_influencer_total, project_words_total, star_words_total)]
        if each_proba[1] <= 0 and each_proba[2] >= each_proba[0]:
            return SELECTED_CONTENT_TYPE.STAR_GOSSIP[0], words_proba
        elif each_proba[1] <= 0 and each_proba[2] < each_proba[0]:
            # BUGFIX: the original returned SELECTED_CONTENT_TYPE[0], which
            # raises TypeError (a class is not subscriptable). A gossip text
            # dominated by influencer words is CELEBRITY_GOSSIP.
            return SELECTED_CONTENT_TYPE.CELEBRITY_GOSSIP[0], words_proba
        elif each_proba[1] > 0.75:
            return SELECTED_CONTENT_TYPE.BEAUTY_PROJECT[0], words_proba
        elif each_proba[0] > each_proba[2]:
            return SELECTED_CONTENT_TYPE.BEAUTY_CELEBRITY[0], words_proba
        else:
            return SELECTED_CONTENT_TYPE.BEAUTY_STAR[0], words_proba
# Repository root: three directory levels above this file, so the relative
# dictionary paths from config resolve regardless of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-3])
# Module-level singleton used by importers; constructing it loads all three
# dictionaries at import time.
model = TextClassifical(os.path.join(root_path, config.network_influcer_dic),
                        os.path.join(root_path, config.projects_dic), os.path.join(root_path, config.star_dic))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# Global debug switch; nothing in this package reads it at the moment.
DEBUG = False
# NOTE(review): the bare string below only documents tuning knobs; solely
# the path constants underneath are imported by the rest of the package.
"""
recall_topK:
我们选取多少个候选词
sort_topK:
排序后挑选的词汇数量
min_frequence:
所有词汇必须共现的次数
stopwords_path:
停用词路径
words_path:
词典path
"""
# Dictionary locations relative to the repository root; consumers join
# them onto a computed root path before opening.
stopwords_path = "dicts/stopwords.dic"  # stopword list, one token per line
words_path = "dicts/words.dic"  # jieba tokenizer dictionary
# "influcer" is a typo but matches the on-disk filename
# dicts/network_influcer.dic — do not "fix" one without the other.
network_influcer_dic = "dicts/network_influcer.dic"
projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic"
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
丁真
周扬青
冯提莫
半藏森林
艾比
韩安冉
南笙
奶茶妹妹
宋昕冉
林小宅
晚晚
谢安然
王柠萌
Naomi
于文红
甜仇
温精灵
温婉
Fiona宋亮
李蒽熙
una
夏夏
水野亚美
小小如
卓亨瑜
彭王者
滕雨佳
腻腻ninii
李恩童
花珊珊
小初
小饼干
晚妹
吃一口甜
徐清婉
Jy小语
张贤静
施安妮
周子然Femi
XIZI杨
彦崽儿
潘白雪
方恰拉
MAGBOW
CHU小初
不求上进的柚砸
LU一丝
大佬儿
姚淇瀚Henry
周小濛
半藏
Abbily
王嘉辉
罗小伊
章泽天
林晓婷
晚奶
柠檬
康雅馨
仇琳琳
温仙女
张曼如
雷婉婷
土豆公主
This diff is collapsed.
肖战
杨紫
赵丽颖
杨幂
倪妮
迪丽热巴
范冰冰
鞠婧祎
刘诗诗
Lisa
吴宣仪
赵露思
杨超越
Angelababy
高圆圆
章子怡
乔欣
张雨绮
孙怡
江疏影
毛晓彤
张馨予
王祖贤
张子枫
陈小纭
舒淇
石原里美
关之琳
权志龙
陈数
程潇
李小璐
景甜
奚梦瑶
戚薇
萧亚轩
车晓
沈梦辰
陈妍希
张予曦
陈坤
林珍娜
宋慧乔
孟佳
张靓颖
郭采洁
白冰
林允
吉娜
姚晨
昆凌
白百何
沈月
邓文迪
王心凌
杨雪
朴敏英
水原希子
甘薇
秀智
高允真
苟芸慧
新垣结衣
徐贞姬
孙胜完
郑采妍
战战
紫妹
小猴子
赵姐
颖宝
大幂幂
妮妮
喵总
热巴
范爷
冰冰
四千年
老鞠
诗爷
人间芭比
小选
肉丝
超越妹妹
杨颖
baby
国际章
乔妹
绮绮子
怡宝
张燕
妹妹
十元
GD
嫂子
金莲
大甜甜
小明
戚哥
鲜肉菩萨
小笼包
乔妹
费霞
天王嫂
国民初恋
GAKKI
(
)
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
...................
./
.一
.数
.日
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
’‘
”,
……
…………………………………………………③
′∈
′|
∈[
∪φ∈
②c
③]
──
 
》),
〕〔
︿
)÷(1-
)、
+ξ
++
,也
-β
--
-[*]-
0:2
1.
12%
2.3%
5:0
<±
<Δ
<λ
<φ
<<
=″
=☆
=(
=-
=[
={
>λ
LI
R.L.
ZXFITL
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
[*]
[-
[]
]∧′=[
][
_
a]
b]
c]
e]
f]
ng昉
{-
}>
~±
~+
This diff is collapsed.
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import re, os
from config import config
class Filter(object):
    """Base class for token filters backed by a word file.

    Subclasses are expected to load *file_path* into ``self.stopwords``
    and override :meth:`filter`.
    """

    def __init__(self, file_path, encoding="utf-8"):
        # BUGFIX: the default used to be "utf-*", an invalid codec name that
        # would raise LookupError the moment a subclass opened the file with
        # it; "utf-8" is the intended default (matching StopwordsFilter).
        self.file_path = file_path
        self.encoding = encoding
        self.stopwords = set()

    def filter(self, token_list):
        """Return the filtered token list; subclasses must override.

        The signature now matches the StopwordsFilter override (the
        original abstract method took no arguments).
        """
        raise NotImplementedError
class StopwordsFilter(Filter):
    """Drops stopwords from a token list.

    Each token is normalized by removing every character that is not a CJK
    ideograph, an ASCII digit or an ASCII letter before being compared
    against the stopword set.
    """

    def __init__(self, file_path, encoding="utf-8"):
        super(StopwordsFilter, self).__init__(file_path, encoding)
        self.init()

    def remove_irregular_chars(self, corpus: str) -> str:
        """Strip every char outside \\u4e00-\\u9fa5 (CJK), 0-9, A-Z, a-z."""
        return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)

    def init(self):
        """Load the stopword file, one stripped word per line.

        The "" and "\\n" sentinels are added once up front (the original
        re-added them on every iteration) and the file handle is closed via
        the context manager (the original leaked it). The dead broad
        try/except around str.strip() was removed — it could never fire for
        lines read from a text file.
        """
        self.stopwords.add("\n")
        self.stopwords.add("")
        with open(self.file_path, "r", encoding=self.encoding) as fin:
            for line in fin:
                self.stopwords.add(line.strip())

    def filter(self, token_list):
        """Return normalized tokens whose normalized form is not a stopword."""
        return [self.remove_irregular_chars(item.strip()) for item in token_list if
                self.remove_irregular_chars(item) not in self.stopwords]
# Repository root: two directory levels above this file, so config's
# relative stopword path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the stopword file.
# (A stray debug `print()` that emitted a blank line on import was removed.)
stopwords_filter = StopwordsFilter(os.path.join(root_path, config.stopwords_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from tqdm import tqdm
class Pipeline(object):
    """Ordered container of processing steps.

    The pipeline turns the words of a file into sentence-level token
    lists; steps are appended by callers after construction.
    """

    def __init__(self):
        # No steps yet — callers populate this list themselves.
        self.pipelines = []
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from abc import ABC
from jieba import Tokenizer
import re, os
from config import config
class SentenceSegmenter(object):
    """Splits text into sentence fragments on a set of punctuation marks."""

    def __init__(self, split_pun=None):
        if not split_pun:
            # Default: Chinese and ASCII sentence enders plus newline,
            # expressed as a regex character class.
            self.split_pun = r'[;;.。,,!\n!??]'
        else:
            # NOTE(review): custom marks are concatenated into a plain regex
            # pattern, NOT wrapped in a [...] character class — callers must
            # pass a ready-made regex fragment. Verify this is intended.
            self.split_pun = '{}'.format("".join(split_pun))

    def split(self, sentences):
        """Yield non-empty fragments from a string or an iterable of strings.

        BUGFIX: the original iterable branch yielded empty fragments
        produced by consecutive separators, while the string branch
        filtered them out; both paths now skip empties consistently.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        for sentence in sentences:
            for item in re.split(self.split_pun, sentence):
                if item:
                    yield item
class Processor(object):
    """Abstract base for text processors.

    Stores a display name; concrete subclasses supply the actual
    processing logic by overriding :meth:`run`.
    """

    def __init__(self, name):
        self._name = name  # internal label, not used for dispatch

    def run(self):
        """Subclasses must implement the processing entry point."""
        raise NotImplementedError
class StandardProcessor(Processor):
    """Trivial processor: "tokenizes" by inserting a space between the
    elements (for a plain string, between its characters)."""

    def __init__(self, name=""):
        super(StandardProcessor, self).__init__(name)

    def lcut(self, line):
        """Return *line*'s elements joined by single spaces."""
        return " ".join(line)
class TokenizerProcessor(Processor, ABC):
    """Processor wrapping a jieba Tokenizer built from a custom dictionary."""

    def __init__(self, file_path):
        # BUGFIX: the original never called Processor.__init__, leaving
        # self._name unset; initialise the base like StandardProcessor does.
        super(TokenizerProcessor, self).__init__("")
        self.file_path = file_path
        self.tokenizer = None
        self.init(self.file_path)

    def init(self, dict_path=None):
        """Build and eagerly initialise the jieba tokenizer so the first
        lcut() call does not pay the dictionary-loading cost."""
        tokenizer = Tokenizer(dictionary=dict_path)
        tokenizer.initialize()
        self.tokenizer = tokenizer

    def lcut(self, line, cut_all=False):
        """Tokenize *line* with jieba (HMM enabled).

        当前只支持smart的切词方式 — only jieba's default "smart" mode is
        supported unless cut_all=True is passed through.
        :param line: text to tokenize
        :return: list of tokens
        """
        return self.tokenizer.lcut(line, HMM=True, cut_all=cut_all)
# Repository root: two directory levels above this file, so config's
# relative dictionary path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the jieba dictionary.
token_processor = TokenizerProcessor(os.path.join(root_path, config.words_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# Global debug switch; nothing in this package reads it at the moment.
DEBUG = False
# NOTE(review): the bare string below only documents tuning knobs; solely
# the path constants underneath are imported by the rest of the package.
"""
recall_topK:
我们选取多少个候选词
sort_topK:
排序后挑选的词汇数量
min_frequence:
所有词汇必须共现的次数
stopwords_path:
停用词路径
words_path:
词典path
"""
# Dictionary locations relative to the repository root; consumers join
# them onto a computed root path before opening.
stopwords_path = "dicts/stopwords.dic"  # stopword list, one token per line
words_path = "dicts/words.dic"  # jieba tokenizer dictionary
# "influcer" is a typo but matches the on-disk filename
# dicts/network_influcer.dic — do not "fix" one without the other.
network_influcer_dic = "dicts/network_influcer.dic"
projects_dic = "dicts/project.dic"
star_dic = "dicts/star.dic"
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
丁真
周扬青
冯提莫
半藏森林
艾比
韩安冉
南笙
奶茶妹妹
宋昕冉
林小宅
晚晚
谢安然
王柠萌
Naomi
于文红
甜仇
温精灵
温婉
Fiona宋亮
李蒽熙
una
夏夏
水野亚美
小小如
卓亨瑜
彭王者
滕雨佳
腻腻ninii
李恩童
花珊珊
小初
小饼干
晚妹
吃一口甜
徐清婉
Jy小语
张贤静
施安妮
周子然Femi
XIZI杨
彦崽儿
潘白雪
方恰拉
MAGBOW
CHU小初
不求上进的柚砸
LU一丝
大佬儿
姚淇瀚Henry
周小濛
半藏
Abbily
王嘉辉
罗小伊
章泽天
林晓婷
晚奶
柠檬
康雅馨
仇琳琳
温仙女
张曼如
雷婉婷
土豆公主
This diff is collapsed.
肖战
杨紫
赵丽颖
杨幂
倪妮
迪丽热巴
范冰冰
鞠婧祎
刘诗诗
Lisa
吴宣仪
赵露思
杨超越
Angelababy
高圆圆
章子怡
乔欣
张雨绮
孙怡
江疏影
毛晓彤
张馨予
王祖贤
张子枫
陈小纭
舒淇
石原里美
关之琳
权志龙
陈数
程潇
李小璐
景甜
奚梦瑶
戚薇
萧亚轩
车晓
沈梦辰
陈妍希
张予曦
陈坤
林珍娜
宋慧乔
孟佳
张靓颖
郭采洁
白冰
林允
吉娜
姚晨
昆凌
白百何
沈月
邓文迪
王心凌
杨雪
朴敏英
水原希子
甘薇
秀智
高允真
苟芸慧
新垣结衣
徐贞姬
孙胜完
郑采妍
战战
紫妹
小猴子
赵姐
颖宝
大幂幂
妮妮
喵总
热巴
范爷
冰冰
四千年
老鞠
诗爷
人间芭比
小选
肉丝
超越妹妹
杨颖
baby
国际章
乔妹
绮绮子
怡宝
张燕
妹妹
十元
GD
嫂子
金莲
大甜甜
小明
戚哥
鲜肉菩萨
小笼包
乔妹
费霞
天王嫂
国民初恋
GAKKI
(
)
!
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
...................
./
.一
.数
.日
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
——
———
’‘
”,
……
…………………………………………………③
′∈
′|
∈[
∪φ∈
②c
③]
──
 
》),
〕〔
︿
)÷(1-
)、
+ξ
++
,也
-β
--
-[*]-
0:2
1.
12%
2.3%
5:0
<±
<Δ
<λ
<φ
<<
=″
=☆
=(
=-
=[
={
>λ
LI
R.L.
ZXFITL
[①①]
[①②]
[①③]
[①④]
[①⑤]
[①⑥]
[①⑦]
[①⑧]
[①⑨]
[①A]
[①B]
[①C]
[①D]
[①E]
[①]
[①a]
[①c]
[①d]
[①e]
[①f]
[①g]
[①h]
[①i]
[①o]
[②
[②①]
[②②]
[②③]
[②④
[②⑤]
[②⑥]
[②⑦]
[②⑧]
[②⑩]
[②B]
[②G]
[②]
[②a]
[②b]
[②c]
[②d]
[②e]
[②f]
[②g]
[②h]
[②i]
[②j]
[③①]
[③⑩]
[③F]
[③]
[③a]
[③b]
[③c]
[③d]
[③e]
[③g]
[③h]
[④]
[④a]
[④b]
[④c]
[④d]
[④e]
[⑤]
[⑤]]
[⑤a]
[⑤b]
[⑤d]
[⑤e]
[⑤f]
[⑥]
[⑦]
[⑧]
[⑨]
[⑩]
[*]
[-
[]
]∧′=[
][
_
a]
b]
c]
e]
f]
ng昉
{-
}>
~±
~+
This diff is collapsed.
Metadata-Version: 1.1
Name: gm-text-miner
Version: 1.0.0
Summary: classifical base word dict
Home-page: UNKNOWN
Author: crazyer
Author-email: zhangguodong@igengmei.com
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
MANIFEST.in
setup.py
algorithm/__init__.py
algorithm/text_classifical/__init__.py
algorithm/text_classifical/base.py
config/__init__.py
config/config.py
dicts/__init__.py
dicts/network_influcer.dic
dicts/project.dic
dicts/star.dic
dicts/stopwords.dic
dicts/words.dic
gm_text_miner.egg-info/PKG-INFO
gm_text_miner.egg-info/SOURCES.txt
gm_text_miner.egg-info/dependency_links.txt
gm_text_miner.egg-info/top_level.txt
preprocesser/__init__.py
preprocesser/filter.py
preprocesser/pipeline.py
preprocesser/processors.py
\ No newline at end of file
algorithm
config
dicts
preprocesser
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from algorithm.text_classifical.base import model
content = "双眼皮"
content = """
🥰🥰38+看起来像20+ 我的驻颜秘密大公开
 
😳😳前两天看网上有人说关之琳在30岁才开始大红,我们现在看到的作品很多都是她30以后的电影。女人很少能在这个年龄才开始出头,现在女星你三十都没有出名,后面像再靠颜值火起来根本是没有可能的。关美女能火那是因为她有别人不可比拟的骨相和皮相美。
-
👇🏻👇🏻如何让自己能在三十多看起来还像少女一样稚嫩呢?下嘛看看我们案例姑娘的保养秘诀
-
🌿🌿术前情况:随着年龄的增加,胶原蛋白的流失皮相一路向下。正常情况下三十岁左右的女人,应该是稍微丰腴一点会显得比较和蔼,有福相看起来会比较年轻。我们案例姑娘在这个年龄确
-
✍🏻️✍🏻️整形方案:面部填充。重点填充部位太阳穴和面颊部位,额捏角。
-
🎉🎉术后效果:如果你细看那些女明星,三十多还风采依旧的基本上都是做过填充的姑娘,典型的我们填充模板,王子文。填充后那颜值一路高歌。我们案例姑娘一样,填充后,面部线条柔和,没有突兀感所以颜值也是猛然提升。
-
🌈🌈术后七天:面部肿胀在三四天的时候达到高峰,七天左右开始消肿。
-
🥰🥰术后一个月:这个时候基本已经消肿完毕,但是术后效果并不是稳定期,术后效果也不是最好的。如果还有肿胀也是正常的,建议耐心等待。
-
☑️☑️术后三个月:术后效果稳定期,面部填充效果最好的时期无疑是三个月恢复期过后。这个时候填充脂肪细胞基本稳定,面部表情也会变得自然。
"""
print(model.run(content))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import re, os
from config import config
class Filter(object):
    """Base class for token filters backed by a word file.

    Subclasses are expected to load *file_path* into ``self.stopwords``
    and override :meth:`filter`.
    """

    def __init__(self, file_path, encoding="utf-8"):
        # BUGFIX: the default used to be "utf-*", an invalid codec name that
        # would raise LookupError the moment a subclass opened the file with
        # it; "utf-8" is the intended default (matching StopwordsFilter).
        self.file_path = file_path
        self.encoding = encoding
        self.stopwords = set()

    def filter(self, token_list):
        """Return the filtered token list; subclasses must override.

        The signature now matches the StopwordsFilter override (the
        original abstract method took no arguments).
        """
        raise NotImplementedError
class StopwordsFilter(Filter):
    """Drops stopwords from a token list.

    Each token is normalized by removing every character that is not a CJK
    ideograph, an ASCII digit or an ASCII letter before being compared
    against the stopword set.
    """

    def __init__(self, file_path, encoding="utf-8"):
        super(StopwordsFilter, self).__init__(file_path, encoding)
        self.init()

    def remove_irregular_chars(self, corpus: str) -> str:
        """Strip every char outside \\u4e00-\\u9fa5 (CJK), 0-9, A-Z, a-z."""
        return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)

    def init(self):
        """Load the stopword file, one stripped word per line.

        The "" and "\\n" sentinels are added once up front (the original
        re-added them on every iteration) and the file handle is closed via
        the context manager (the original leaked it). The dead broad
        try/except around str.strip() was removed — it could never fire for
        lines read from a text file.
        """
        self.stopwords.add("\n")
        self.stopwords.add("")
        with open(self.file_path, "r", encoding=self.encoding) as fin:
            for line in fin:
                self.stopwords.add(line.strip())

    def filter(self, token_list):
        """Return normalized tokens whose normalized form is not a stopword."""
        return [self.remove_irregular_chars(item.strip()) for item in token_list if
                self.remove_irregular_chars(item) not in self.stopwords]
# Repository root: two directory levels above this file, so config's
# relative stopword path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the stopword file.
# (A stray debug `print()` that emitted a blank line on import was removed.)
stopwords_filter = StopwordsFilter(os.path.join(root_path, config.stopwords_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from tqdm import tqdm
class Pipeline(object):
    """Ordered container of processing steps.

    The pipeline turns the words of a file into sentence-level token
    lists; steps are appended by callers after construction.
    """

    def __init__(self):
        # No steps yet — callers populate this list themselves.
        self.pipelines = []
\ No newline at end of file
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
from abc import ABC
from jieba import Tokenizer
import re, os
from config import config
class SentenceSegmenter(object):
    """Splits text into sentence fragments on a set of punctuation marks."""

    def __init__(self, split_pun=None):
        if not split_pun:
            # Default: Chinese and ASCII sentence enders plus newline,
            # expressed as a regex character class.
            self.split_pun = r'[;;.。,,!\n!??]'
        else:
            # NOTE(review): custom marks are concatenated into a plain regex
            # pattern, NOT wrapped in a [...] character class — callers must
            # pass a ready-made regex fragment. Verify this is intended.
            self.split_pun = '{}'.format("".join(split_pun))

    def split(self, sentences):
        """Yield non-empty fragments from a string or an iterable of strings.

        BUGFIX: the original iterable branch yielded empty fragments
        produced by consecutive separators, while the string branch
        filtered them out; both paths now skip empties consistently.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        for sentence in sentences:
            for item in re.split(self.split_pun, sentence):
                if item:
                    yield item
class Processor(object):
    """Abstract base for text processors.

    Stores a display name; concrete subclasses supply the actual
    processing logic by overriding :meth:`run`.
    """

    def __init__(self, name):
        self._name = name  # internal label, not used for dispatch

    def run(self):
        """Subclasses must implement the processing entry point."""
        raise NotImplementedError
class StandardProcessor(Processor):
    """Trivial processor: "tokenizes" by inserting a space between the
    elements (for a plain string, between its characters)."""

    def __init__(self, name=""):
        super(StandardProcessor, self).__init__(name)

    def lcut(self, line):
        """Return *line*'s elements joined by single spaces."""
        return " ".join(line)
class TokenizerProcessor(Processor, ABC):
    """Processor wrapping a jieba Tokenizer built from a custom dictionary."""

    def __init__(self, file_path):
        # BUGFIX: the original never called Processor.__init__, leaving
        # self._name unset; initialise the base like StandardProcessor does.
        super(TokenizerProcessor, self).__init__("")
        self.file_path = file_path
        self.tokenizer = None
        self.init(self.file_path)

    def init(self, dict_path=None):
        """Build and eagerly initialise the jieba tokenizer so the first
        lcut() call does not pay the dictionary-loading cost."""
        tokenizer = Tokenizer(dictionary=dict_path)
        tokenizer.initialize()
        self.tokenizer = tokenizer

    def lcut(self, line, cut_all=False):
        """Tokenize *line* with jieba (HMM enabled).

        当前只支持smart的切词方式 — only jieba's default "smart" mode is
        supported unless cut_all=True is passed through.
        :param line: text to tokenize
        :return: list of tokens
        """
        return self.tokenizer.lcut(line, HMM=True, cut_all=cut_all)
# Repository root: two directory levels above this file, so config's
# relative dictionary path resolves independently of the caller's CWD.
root_path = "/".join(str(__file__).split("/")[:-2])
# Module-level singleton; importing this module loads the jieba dictionary.
token_processor = TokenizerProcessor(os.path.join(root_path, config.words_path))
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import setuptools

# Runtime dependencies: the package imports jieba (preprocesser/processors.py)
# and tqdm (preprocesser/pipeline.py), so declare them here — the original
# left the list empty, producing a package that fails at import time.
requires = [
    "jieba",
    "tqdm",
]
# Extra dependencies for development installs (`pip install .[dev]`).
dev_requires = [
]
setuptools.setup(
    name='gm-text-miner',
    version="1.0.0",
    author="crazyer",
    author_email="zhangguodong@igengmei.com",
    description="classifical base word dict",
    install_requires=requires,
    url="",
    packages=setuptools.find_packages(),
    # BUGFIX: the setuptools keyword is `extras_require`; the original
    # `extra_requires` was silently ignored, so the dev extra never existed.
    extras_require={
        'dev': dev_requires,
    },
    classifiers=[
        "Programming Language :: Python :: 3",
    ],
    # Ship the dicts/* data files listed in MANIFEST.in with the package.
    include_package_data=True
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment