Commit d5440a7e authored by crazyer

text miner

parent 21e4e037
...@@ -8,6 +8,7 @@ from preprocesser.filter import stopwords_filter ...@@ -8,6 +8,7 @@ from preprocesser.filter import stopwords_filter
from collections import Counter from collections import Counter
from config import config from config import config
import os import os
import codecs
class SELECTED_CONTENT_TYPE(): class SELECTED_CONTENT_TYPE():
...@@ -28,21 +29,21 @@ class TextClassifical(object): ...@@ -28,21 +29,21 @@ class TextClassifical(object):
def build_network_influencer_words(self, word_path): def build_network_influencer_words(self, word_path):
ret = {} ret = {}
for line in open(word_path, "r", errors="ignore"): for line in codecs.open(word_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_project_words(self, project_path): def build_project_words(self, project_path):
ret = {} ret = {}
for line in open(project_path, "r", errors="ignore"): for line in codecs.open(project_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_star_words(self, star_path): def build_star_words(self, star_path):
ret = {} ret = {}
for line in open(star_path, "r", errors="ignore"): for line in codecs.open(star_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
......
...@@ -8,6 +8,7 @@ from preprocesser.filter import stopwords_filter ...@@ -8,6 +8,7 @@ from preprocesser.filter import stopwords_filter
from collections import Counter from collections import Counter
from config import config from config import config
import os import os
import codecs
class SELECTED_CONTENT_TYPE(): class SELECTED_CONTENT_TYPE():
...@@ -28,21 +29,21 @@ class TextClassifical(object): ...@@ -28,21 +29,21 @@ class TextClassifical(object):
def build_network_influencer_words(self, word_path): def build_network_influencer_words(self, word_path):
ret = {} ret = {}
for line in open(word_path, "r", errors="ignore"): for line in codecs.open(word_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_project_words(self, project_path): def build_project_words(self, project_path):
ret = {} ret = {}
for line in open(project_path, "r", errors="ignore"): for line in codecs.open(project_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
def build_star_words(self, star_path): def build_star_words(self, star_path):
ret = {} ret = {}
for line in open(star_path, "r", errors="ignore"): for line in codecs.open(star_path, "r", errors="ignore"):
line = line.strip() line = line.strip()
ret[line] = 1 ret[line] = 1
return ret return ret
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# software: PyCharm # software: PyCharm
import re, os import re, os
from config import config from config import config
import codecs
class Filter(object): class Filter(object):
def __init__(self, file_path, encoding="utf-8"): def __init__(self, file_path, encoding="utf-8"):
...@@ -26,7 +26,7 @@ class StopwordsFilter(Filter): ...@@ -26,7 +26,7 @@ class StopwordsFilter(Filter):
return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus) return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)
def init(self): def init(self):
for line in open(self.file_path, "r", encoding=self.encoding): for line in codecs.open(self.file_path, "r", encoding=self.encoding):
try: try:
line = line.strip() line = line.strip()
self.stopwords.add(line) self.stopwords.add(line)
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# software: PyCharm # software: PyCharm
import re, os import re, os
from config import config from config import config
import codecs
class Filter(object): class Filter(object):
def __init__(self, file_path, encoding="utf-*"): def __init__(self, file_path, encoding="utf-*"):
...@@ -26,7 +26,7 @@ class StopwordsFilter(Filter): ...@@ -26,7 +26,7 @@ class StopwordsFilter(Filter):
return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus) return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)
def init(self): def init(self):
for line in open(self.file_path, "r", encoding=self.encoding): for line in codecs.open(self.file_path, "r", encoding=self.encoding):
try: try:
line = line.strip() line = line.strip()
self.stopwords.add(line) self.stopwords.add(line)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment