# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 PM
# software: PyCharm
from abc import ABC

from jieba import Tokenizer
import re, os
from config import config


class SentenceSegmenter(object):
    """Split text into fragments using a punctuation regular expression.

    The default pattern is a character class covering common Chinese and
    ASCII sentence punctuation plus the newline character.
    """

    def __init__(self, split_pun=None):
        """Store the regular expression used to split text.

        :param split_pun: optional string, or iterable of strings, that is
            concatenated into one regex pattern. The result is used verbatim:
            it is neither escaped nor wrapped in ``[...]``, so pass a complete
            pattern such as ``r'[,;]'``. A falsy value selects the default
            punctuation class.
        """
        if not split_pun:
            self.split_pun = r'[;；.。，,！\n!?？]'
        else:
            self.split_pun = "".join(split_pun)

    def split(self, sentences):
        """Yield the non-empty fragments of one string or of many strings.

        :param sentences: a single ``str`` or an iterable of ``str``.
        :return: generator over the fragments, in input order.
        """
        if isinstance(sentences, str):
            # Treat a bare string as a one-element collection so both input
            # shapes share the same splitting and filtering path.
            sentences = (sentences,)
        for sentence in sentences:
            for item in re.split(self.split_pun, sentence):
                # Adjacent or trailing delimiters make re.split return empty
                # strings; drop them for every input type.
                if item:
                    yield item


class Processor:
    """Common base for text processors; keeps the processor's name."""

    def __init__(self, name):
        """Remember ``name`` on the instance as ``_name``."""
        self._name = name

    def run(self):
        """Placeholder entry point; always raises ``NotImplementedError``."""
        raise NotImplementedError()


class StandardProcessor(Processor):
    """Processor that separates the elements of its input with spaces."""

    def __init__(self, name=""):
        """Initialise the base class with ``name`` (empty by default)."""
        super().__init__(name)

    def lcut(self, line):
        """Return the items of ``line`` joined by single spaces.

        For a ``str`` input this places a space between every character.
        """
        separator = " "
        return separator.join(line)


class TokenizerProcessor(Processor, ABC):
    """Processor that tokenizes text with a jieba ``Tokenizer``."""

    def __init__(self, file_path, name=""):
        """Create the processor and initialise its tokenizer.

        :param file_path: dictionary path handed to ``Tokenizer``.
        :param name: processor name stored by ``Processor``; defaults to an
            empty string.
        """
        # Run the base initialiser so that ``_name`` is set on this instance.
        super(TokenizerProcessor, self).__init__(name)
        self.file_path = file_path
        self.tokenizer = None
        self.init(self.file_path)

    def init(self, dict_path=None):
        """Build a ``Tokenizer`` for ``dict_path`` and store it on the instance.

        :param dict_path: value passed as ``Tokenizer(dictionary=...)``.
        """
        tokenizer = Tokenizer(dictionary=dict_path)
        # Initialise right away instead of waiting for the first cut.
        tokenizer.initialize()
        self.tokenizer = tokenizer

    def lcut(self, line, cut_all=False):
        """
        Tokenize ``line`` and return the tokenizer's ``lcut`` result.

        Currently only the "smart" segmentation mode is supported.
        NOTE(review): ``cut_all`` is nevertheless forwarded to the tokenizer;
        confirm whether callers are expected to leave it at ``False``.

        :param line: text to segment.
        :param cut_all: forwarded to ``Tokenizer.lcut``.
        :return: result of ``Tokenizer.lcut`` called with ``HMM=True``.
        """
        return self.tokenizer.lcut(line, HMM=True, cut_all=cut_all)


# Project root: the directory one level above the one containing this file.
# abspath + dirname is used instead of splitting on "/" so the result is also
# correct when __file__ is relative or uses a platform-specific separator.
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Shared module-level tokenizer, built from config.words_path resolved
# against the project root.
token_processor = TokenizerProcessor(os.path.join(root_path, config.words_path))
