Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
gm-text-miner
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
gm-text-miner
Commits
a4f0be81
Commit
a4f0be81
authored
Jan 30, 2021
by
crazyer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add info inference
parent
bcc60d17
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
36 additions
and
4 deletions
+36
-4
base.py
algorithm/text_classifical/base.py
+34
-4
config.py
config/config.py
+2
-0
beauty_mddicine.dic
dicts/beauty_mddicine.dic
+0
-0
class_suport
dicts/class_suport
+0
-0
No files found.
algorithm/text_classifical/base.py
View file @
a4f0be81
...
@@ -9,6 +9,7 @@ from collections import Counter
...
@@ -9,6 +9,7 @@ from collections import Counter
from
config
import
config
from
config
import
config
import
os
import
os
import
codecs
import
codecs
import
json
class
SELECTED_CONTENT_TYPE
():
class
SELECTED_CONTENT_TYPE
():
...
@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE():
...
@@ -20,7 +21,8 @@ class SELECTED_CONTENT_TYPE():
class
TextClassifical
(
object
):
class
TextClassifical
(
object
):
def
__init__
(
self
,
network_influencer_path
,
project_path
,
star_path
,
synonym_path
,
encoding
=
"utf-8"
):
def
__init__
(
self
,
network_influencer_path
,
project_path
,
star_path
,
synonym_path
,
tag_info_path
,
support_words_path
,
encoding
=
"utf-8"
):
self
.
encoding
=
encoding
self
.
encoding
=
encoding
self
.
network_influencer_words
=
self
.
build_network_influencer_words
(
network_influencer_path
)
self
.
network_influencer_words
=
self
.
build_network_influencer_words
(
network_influencer_path
)
self
.
project_words
=
self
.
build_project_words
(
project_path
)
self
.
project_words
=
self
.
build_project_words
(
project_path
)
...
@@ -48,6 +50,18 @@ class TextClassifical(object):
...
@@ -48,6 +50,18 @@ class TextClassifical(object):
u"耳软骨 鼻子 耳软骨隆鼻"
u"耳软骨 鼻子 耳软骨隆鼻"
]
]
self
.
template_logic
=
self
.
build_template
()
self
.
template_logic
=
self
.
build_template
()
self
.
tag_info
=
self
.
build_tag_info_pro
(
tag_info_path
)
self
.
support_words
=
self
.
build_support_words
(
support_words_path
)
def build_support_words(self, support_words_path):
    """Load the support-word vocabulary from a JSON file.

    The file at ``support_words_path`` is decoded with ``self.encoding``
    and parsed as JSON; the parsed value is converted to a ``set`` so that
    it can be intersected with tokenized words.

    NOTE(review): presumably the file holds a JSON array of words -- if it
    holds an object instead, ``set()`` keeps only its keys; confirm against
    the dictionary file.

    :param support_words_path: path of the JSON file to read.
    :return: set built from the parsed JSON value.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the previous version never closed it).
    with codecs.open(support_words_path, "r", encoding=self.encoding) as handle:
        return set(json.loads(handle.read()))
def build_tag_info_pro(self, tag_info_path):
    """Load the tag-information table from a JSON file.

    The file at ``tag_info_path`` is decoded with ``self.encoding`` and
    parsed as JSON; the parsed value is returned unchanged.

    ``get_info_inference_tags`` reads the result as
    ``tag_info[word][tag]`` and sums those values, i.e. it is used as a
    mapping ``word -> {tag: score}``.

    :param tag_info_path: path of the JSON file to read.
    :return: the parsed JSON value.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the previous version never closed it).
    with codecs.open(tag_info_path, "r", encoding=self.encoding) as handle:
        return json.loads(handle.read())
def
build_template
(
self
):
def
build_template
(
self
):
ret
=
[]
ret
=
[]
...
@@ -102,19 +116,34 @@ class TextClassifical(object):
...
@@ -102,19 +116,34 @@ class TextClassifical(object):
ret
.
append
(
tag
)
ret
.
append
(
tag
)
return
[{
item
:
1.0
}
for
item
in
ret
]
return
[{
item
:
1.0
}
for
item
in
ret
]
def
run
(
self
,
content
):
def get_info_inference_tags(self, words, proba_threshold=0.3, topk=10):
    """Score tags from the support words that occur in ``words``.

    Every distinct word that is both in ``words`` and in
    ``self.support_words`` contributes its per-tag values from
    ``self.tag_info[word]``; values for the same tag are summed across
    words. Tags whose summed value is strictly greater than
    ``proba_threshold`` are returned, highest value first, truncated to
    ``topk`` entries.

    :param words: iterable of tokens; duplicates count once because the
        tokens are converted to a set before matching.
    :param proba_threshold: exclusive lower bound on a tag's summed value.
    :param topk: maximum number of ``(tag, value)`` pairs to return.
    :return: list of ``(tag, summed_value)`` tuples sorted by value,
        descending.
    """
    tag_scores = {}
    matched_words = self.support_words & set(words)
    for word in matched_words:
        # The support-word set and the tag table are loaded from separate
        # files, so a support word may have no tag entry; skip it instead
        # of raising KeyError.
        for tag, weight in self.tag_info.get(word, {}).items():
            if tag in tag_scores:
                tag_scores[tag] = tag_scores[tag] + weight
            else:
                tag_scores[tag] = weight

    above_threshold = [
        (tag, score)
        for tag, score in tag_scores.items()
        if score > proba_threshold
    ]
    ranked = sorted(above_threshold, key=lambda pair: pair[1], reverse=True)
    return ranked[:topk]
def
run
(
self
,
content
,
proba_threshold
=
0.3
,
topk
=
10
):
ret
=
{
ret
=
{
"content_type"
:
-
1
,
"content_type"
:
-
1
,
"star"
:
[],
"star"
:
[],
"celebrity"
:
[],
"celebrity"
:
[],
"projects"
:
[],
"projects"
:
[],
"inference_tags"
:
[]
"inference_tags"
:
[],
"info_inference_tags"
:
[]
}
}
words
=
self
.
tokenprocessor
.
lcut
(
content
,
cut_all
=
True
)
words
=
self
.
tokenprocessor
.
lcut
(
content
,
cut_all
=
True
)
words
=
stopwords_filter
.
filter
(
words
)
words
=
stopwords_filter
.
filter
(
words
)
netword_influencer_concurrence
=
set
(
words
)
&
set
(
self
.
network_influencer_words
)
netword_influencer_concurrence
=
set
(
words
)
&
set
(
self
.
network_influencer_words
)
project_word_concurrence
=
set
(
words
)
&
set
(
self
.
project_words
)
project_word_concurrence
=
set
(
words
)
&
set
(
self
.
project_words
)
star_words_concurrence
=
set
(
words
)
&
set
(
self
.
star_words
)
star_words_concurrence
=
set
(
words
)
&
set
(
self
.
star_words
)
info_inference_tags
=
self
.
get_info_inference_tags
(
words
,
proba_threshold
=
0.3
,
topk
=
10
)
counter
=
Counter
(
words
)
counter
=
Counter
(
words
)
content_type
,
words_proba
=
self
.
predict
(
counter
,
netword_influencer_concurrence
,
project_word_concurrence
,
content_type
,
words_proba
=
self
.
predict
(
counter
,
netword_influencer_concurrence
,
project_word_concurrence
,
star_words_concurrence
)
star_words_concurrence
)
...
@@ -125,6 +154,7 @@ class TextClassifical(object):
...
@@ -125,6 +154,7 @@ class TextClassifical(object):
ret
[
"projects"
]
.
extend
(
ret
[
"projects"
]
.
extend
(
[{
self
.
standard_project
(
word
):
words_proba
[
1
]
.
get
(
word
,
0.0
)}
for
word
in
list
(
project_word_concurrence
)])
[{
self
.
standard_project
(
word
):
words_proba
[
1
]
.
get
(
word
,
0.0
)}
for
word
in
list
(
project_word_concurrence
)])
ret
[
"inference_tags"
]
.
extend
(
self
.
get_inference_tags
(
words
))
ret
[
"inference_tags"
]
.
extend
(
self
.
get_inference_tags
(
words
))
ret
[
"info_inference_tags"
]
.
extend
(
info_inference_tags
)
return
ret
return
ret
def
score
(
self
,
counter
,
concurrence_words
):
def
score
(
self
,
counter
,
concurrence_words
):
...
@@ -162,4 +192,4 @@ class TextClassifical(object):
...
@@ -162,4 +192,4 @@ class TextClassifical(object):
root_path
=
"/"
.
join
(
str
(
__file__
)
.
split
(
"/"
)[:
-
3
])
root_path
=
"/"
.
join
(
str
(
__file__
)
.
split
(
"/"
)[:
-
3
])
model
=
TextClassifical
(
os
.
path
.
join
(
root_path
,
config
.
network_influcer_dic
),
model
=
TextClassifical
(
os
.
path
.
join
(
root_path
,
config
.
network_influcer_dic
),
os
.
path
.
join
(
root_path
,
config
.
projects_dic
),
os
.
path
.
join
(
root_path
,
config
.
star_dic
),
os
.
path
.
join
(
root_path
,
config
.
projects_dic
),
os
.
path
.
join
(
root_path
,
config
.
star_dic
),
os
.
path
.
join
(
root_path
,
config
.
synonym_path
))
os
.
path
.
join
(
root_path
,
config
.
synonym_path
)
,
os
.
path
.
join
(
root_path
,
config
.
tag_info_path
),
os
.
path
.
join
(
root_path
,
config
.
support_words_path
)
)
config/config.py
View file @
a4f0be81
...
@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic"
...
@@ -23,3 +23,5 @@ network_influcer_dic = "dicts/network_influcer.dic"
projects_dic
=
"dicts/project.dic"
projects_dic
=
"dicts/project.dic"
star_dic
=
"dicts/star.dic"
star_dic
=
"dicts/star.dic"
synonym_path
=
"dicts/synonym.dic"
synonym_path
=
"dicts/synonym.dic"
tag_info_path
=
"dicts/class_suport"
support_words_path
=
"dicts/beauty_mddicine.dic"
dicts/beauty_mddicine.dic
0 → 100644
View file @
a4f0be81
This diff is collapsed.
Click to expand it.
dicts/class_suport
0 → 100644
View file @
a4f0be81
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment