Commit 002cbe1f authored by crazyer

text miner

parent d5440a7e
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4"> <module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager"> <component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" /> <content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" /> <orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="TestRunnerService"> <component name="TestRunnerService">
......
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
</project> </project>
\ No newline at end of file
...@@ -20,30 +20,32 @@ class SELECTED_CONTENT_TYPE(): ...@@ -20,30 +20,32 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object): class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path): def __init__(self, network_influencer_path, project_path, star_path, encoding="utf-8"):
self.encoding = encoding
self.network_influencer_words = self.build_network_influencer_words(network_influencer_path) self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
self.project_words = self.build_project_words(project_path) self.project_words = self.build_project_words(project_path)
self.star_words = self.build_star_words(star_path) self.star_words = self.build_star_words(star_path)
self.tokenprocessor = token_processor self.tokenprocessor = token_processor
self.stopwords_filter = stopwords_filter self.stopwords_filter = stopwords_filter
def build_network_influencer_words(self, word_path): def build_network_influencer_words(self, word_path):
ret = {} ret = {}
for line in codecs.open(word_path, "r", errors="ignore"): for line in codecs.open(word_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_project_words(self, project_path): def build_project_words(self, project_path):
ret = {} ret = {}
for line in codecs.open(project_path, "r", errors="ignore"): for line in codecs.open(project_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_star_words(self, star_path): def build_star_words(self, star_path):
ret = {} ret = {}
for line in codecs.open(star_path, "r", errors="ignore"): for line in codecs.open(star_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
......
...@@ -20,30 +20,31 @@ class SELECTED_CONTENT_TYPE(): ...@@ -20,30 +20,31 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object): class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path): def __init__(self, network_influencer_path, project_path, star_path, encoding="utf-8"):
self.network_influencer_words = self.build_network_influencer_words(network_influencer_path) self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
self.project_words = self.build_project_words(project_path) self.project_words = self.build_project_words(project_path)
self.star_words = self.build_star_words(star_path) self.star_words = self.build_star_words(star_path)
self.tokenprocessor = token_processor self.tokenprocessor = token_processor
self.stopwords_filter = stopwords_filter self.stopwords_filter = stopwords_filter
self.encoding = encoding
def build_network_influencer_words(self, word_path): def build_network_influencer_words(self, word_path):
ret = {} ret = {}
for line in codecs.open(word_path, "r", errors="ignore"): for line in codecs.open(word_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_project_words(self, project_path): def build_project_words(self, project_path):
ret = {} ret = {}
for line in codecs.open(project_path, "r", errors="ignore"): for line in codecs.open(project_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_star_words(self, star_path): def build_star_words(self, star_path):
ret = {} ret = {}
for line in codecs.open(star_path, "r", errors="ignore"): for line in codecs.open(star_path, "r", errors="ignore", encoding=self.encoding):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment