Commit 002cbe1f authored by crazyer

text miner

parent d5440a7e
......@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
......
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
......@@ -20,30 +20,32 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path, encoding="utf-8"):
    """Load the three word lists used by the classifier.

    :param network_influencer_path: file handed to
        ``build_network_influencer_words``.
    :param project_path: file handed to ``build_project_words``.
    :param star_path: file handed to ``build_star_words``.
    :param encoding: text encoding used to decode all three files;
        defaults to ``"utf-8"`` so existing three-argument callers keep
        working.
    """
    # Must be stored before any build_* call: the loaders read self.encoding.
    self.encoding = encoding
    self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
    self.project_words = self.build_project_words(project_path)
    self.star_words = self.build_star_words(star_path)
    # token_processor / stopwords_filter are module-level names defined
    # outside this excerpt.
    self.tokenprocessor = token_processor
    self.stopwords_filter = stopwords_filter
def build_network_influencer_words(self, word_path):
    """Load the network-influencer word list from ``word_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param word_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(word_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
def build_project_words(self, project_path):
    """Load the project word list from ``project_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param project_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(project_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
def build_star_words(self, star_path):
    """Load the star word list from ``star_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param star_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(star_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
......
......@@ -20,30 +20,31 @@ class SELECTED_CONTENT_TYPE():
class TextClassifical(object):
def __init__(self, network_influencer_path, project_path, star_path, encoding="utf-8"):
    """Load the three word lists used by the classifier.

    :param network_influencer_path: file handed to
        ``build_network_influencer_words``.
    :param project_path: file handed to ``build_project_words``.
    :param star_path: file handed to ``build_star_words``.
    :param encoding: text encoding used to decode all three files;
        defaults to ``"utf-8"`` so existing three-argument callers keep
        working.
    """
    # Store the encoding FIRST: each build_* loader reads self.encoding, so
    # assigning it after those calls raises AttributeError on construction.
    self.encoding = encoding
    self.network_influencer_words = self.build_network_influencer_words(network_influencer_path)
    self.project_words = self.build_project_words(project_path)
    self.star_words = self.build_star_words(star_path)
    # token_processor / stopwords_filter are module-level names defined
    # outside this excerpt.
    self.tokenprocessor = token_processor
    self.stopwords_filter = stopwords_filter
def build_network_influencer_words(self, word_path):
    """Load the network-influencer word list from ``word_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param word_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(word_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
def build_project_words(self, project_path):
    """Load the project word list from ``project_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param project_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(project_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
def build_star_words(self, star_path):
    """Load the star word list from ``star_path``.

    The file is decoded with ``self.encoding``; bytes that cannot be
    decoded are dropped (``errors="ignore"``).  Every line is stripped of
    surrounding whitespace and stored as a key mapped to ``1``.

    :param star_path: path of a text file with one entry per line.
    :return: dict mapping each stripped line to ``1``.
    """
    ret = {}
    # The context manager closes the handle even if iteration raises.
    with codecs.open(star_path, "r", errors="ignore", encoding=self.encoding) as word_file:
        for line in word_file:
            # Blank lines are not skipped: they yield an empty-string key.
            ret[line.strip()] = 1
    return ret
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment