# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 下午
# software: PyCharm
import re, os
from config import config
import codecs

class Filter(object):
    """Abstract base for token filters that are configured from a file.

    Stores the path and encoding of the backing file and an initially empty
    ``stopwords`` set; subclasses are expected to populate the set and to
    implement :meth:`filter`.
    """

    def __init__(self, file_path, encoding="utf-8"):
        """Record the file location and start with an empty stopword set.

        Args:
            file_path: Path of the file the subclass will read.
            encoding: Codec name used to decode that file. Defaults to
                ``"utf-8"`` (the previous default ``"utf-*"`` was a typo and
                is not a valid codec name).
        """
        self.file_path = file_path
        self.encoding = encoding
        self.stopwords = set()

    def filter(self, token_list=None):
        """Filter ``token_list``; must be overridden by subclasses.

        Args:
            token_list: Tokens to filter. Optional here only so existing
                zero-argument calls keep raising ``NotImplementedError``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError


class StopwordsFilter(Filter):
    """Token filter that normalises tokens and drops those in a stopword file.

    The stopword file is read once at construction time, one stopword per
    line. Tokens are normalised with :meth:`remove_irregular_chars` before
    being compared against the stopword set.
    """

    def __init__(self, file_path, encoding="utf-8"):
        """Load the stopword list from ``file_path``.

        Args:
            file_path: Path of the stopword file (one entry per line).
            encoding: Codec name used to decode the file.
        """
        super(StopwordsFilter, self).__init__(file_path, encoding)
        self.init()

    def remove_irregular_chars(self, corpus):
        """Return ``corpus`` with every disallowed character removed.

        Only characters in U+4E00-U+9FA5, ASCII digits and ASCII letters are
        kept; everything else (whitespace and punctuation included) is
        deleted.
        """
        return re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", corpus)

    def init(self):
        """Populate ``self.stopwords`` from the stopword file.

        Each line is stripped of surrounding whitespace before being added.
        The empty string and a bare newline are always registered as
        stopwords, so tokens that normalise to nothing are dropped even when
        the file is empty.
        """
        self.stopwords.update(("", "\n"))
        # Use a context manager so the file handle is closed deterministically.
        with codecs.open(self.file_path, "r", encoding=self.encoding) as handle:
            for line in handle:
                self.stopwords.add(line.strip())

    def filter(self, token_list):
        """Return the normalised tokens that are not stopwords.

        Args:
            token_list: Iterable of string tokens.

        Returns:
            A list of tokens, each passed through
            :meth:`remove_irregular_chars`, excluding any whose normalised
            form is in ``self.stopwords``. Input order is preserved.
        """
        # Normalisation already removes all whitespace, so a separate strip()
        # is unnecessary; clean each token once and reuse the result.
        cleaned_tokens = (self.remove_irregular_chars(item) for item in token_list)
        return [token for token in cleaned_tokens if token not in self.stopwords]


# Directory two levels above this file (the parent of the directory that
# contains it). Built with os.path so it also works where the path separator
# is not "/".
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Shared filter instance, created at import time; config.stopwords_path is
# joined onto root_path (presumably a project-relative path -- verify in config).
stopwords_filter = StopwordsFilter(os.path.join(root_path, config.stopwords_path))

# NOTE(review): emits a blank line on import; looks like leftover debugging
# output. Kept to preserve existing import-time behaviour -- confirm and remove.
print()
