# content.py 7.76 KB
# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 PM
# software: PyCharm
from functools import reduce
import copy


class Business(object):
    """Stub class: both the constructor and ``sort`` are no-ops."""

    def __init__(self):
        """Create the instance; no attributes are set."""

    def sort(self):
        """Do nothing and return ``None``."""
        return None


class Strategy(object):
    """Base strategy whose ``run`` hands its input straight back."""

    def __init__(self):
        """Create the instance; no attributes are set."""

    def run(self, documents):
        """Return ``documents`` itself (the same object, not a copy).

        :param documents: any value; it is neither inspected nor modified
        :return: the ``documents`` argument, unchanged
        """
        return documents


class ScatterStrategy(Strategy):
    """Reorder documents so that equal values of one field are spread apart.

    Documents are grouped by the value of a chosen field. The output is then
    built one position at a time: the value with the most documents left is
    tried first, and a value may only be reused once at least ``win_len``
    positions separate it from its previous placement.
    """

    def __init__(self):
        # Display name of the strategy; the literal means "scatter".
        self.name = "打散"

    def strategy(self, documents, field_name, win_len=4):
        """Return ``documents`` reordered so equal field values are scattered.

        When no value satisfies the current window, the window is shrunk by
        one and the placement retried (greedy fallback). The shrunken window
        is kept for the rest of the call; it is never widened again.

        :param documents: sequence of documents; each must support
            ``field_name in document`` and ``document[field_name]``
        :param field_name: field whose values should be scattered
        :param win_len: minimum distance (in output positions) between two
            documents sharing the same field value
        :return: new list containing every input document exactly once
        """
        field_dict = self.build_field_dict(documents, field_name)
        last_field_value_pointer = {}
        ret_documents_idx = []
        total = len(documents)
        while len(ret_documents_idx) < total:
            sorted_value = self.get_sorted_value(field_dict)
            is_append = self.ask(field_dict, sorted_value, win_len,
                                 last_field_value_pointer, ret_documents_idx)
            # Window cannot be met: relax it step by step. At win_len == 0
            # every remaining value is acceptable, so a placement always
            # happens before this loop runs out.
            while not is_append and win_len > 0:
                win_len -= 1
                is_append = self.ask(field_dict, sorted_value, win_len,
                                     last_field_value_pointer, ret_documents_idx)
        return [documents[idx] for idx in ret_documents_idx]

    def ask(self, field_dict, sorted_field, win_len=4, last_field_value_pointer=None, ret_documents_idx=None):
        """Try to place one more document; report whether one was placed.

        Values are tried in the order given by ``sorted_field``. The first
        value that was never placed before, or whose previous placement is at
        least ``win_len`` positions back, contributes its oldest pending
        document index.

        :param field_dict: grouping produced by ``build_field_dict``; the
            chosen group's ``"items"`` and ``"len"`` are updated in place
        :param sorted_field: candidate values, highest priority first
        :param win_len: required distance to the same value's last placement
        :param last_field_value_pointer: maps value -> output position of its
            most recent placement; updated in place. A fresh dict is used
            when omitted.
        :param ret_documents_idx: output list of document indexes; appended
            to in place. A fresh list is used when omitted.
        :return: True if a document index was appended, otherwise False
        """
        # None sentinels instead of {} / [] defaults: mutable defaults are
        # created once and would leak state between unrelated calls.
        if last_field_value_pointer is None:
            last_field_value_pointer = {}
        if ret_documents_idx is None:
            ret_documents_idx = []

        position = len(ret_documents_idx)
        for value in sorted_field:
            never_placed = value not in last_field_value_pointer
            if never_placed or position - last_field_value_pointer[value] >= win_len:
                group = field_dict[value]
                ret_documents_idx.append(group["items"].pop(0))
                group["len"] -= 1
                last_field_value_pointer[value] = position
                return True
        return False

    def get_sorted_value(self, field_dict):
        """Return the values that still have documents, most documents first.

        Ties keep the iteration order of ``field_dict`` because the sort is
        stable.

        :param field_dict: grouping produced by ``build_field_dict``
        :return: list of values whose ``"len"`` is greater than zero
        """
        by_remaining = sorted(field_dict, key=lambda value: field_dict[value]["len"], reverse=True)
        return [value for value in by_remaining if field_dict[value]["len"] > 0]

    def build_field_dict(self, documents, field_name):
        """Group document indexes by the value of ``field_name``.

        Documents that do not contain the field are collected under the key
        ``"empty_value"`` (a document whose field value is that exact string
        ends up in the same group).

        :param documents: sequence of documents
        :param field_name: field to group by; its values must be hashable
        :return: dict mapping value -> ``{"items": [indexes...], "len": count}``
            with indexes in input order
        """
        ret = {}
        for idx, document in enumerate(documents):
            value = document[field_name] if field_name in document else "empty_value"
            group = ret.setdefault(value, {"items": [], "len": 0})
            group["items"].append(idx)
            group["len"] += 1
        return ret


class HierarchyStrategy(Strategy):
    """Split documents into nested buckets by exact field-value matches."""

    def __init__(self):
        """Create the instance; no attributes are set."""

    def strategy(self, documents, fields):
        """Partition ``documents`` level by level according to ``fields``.

        Fields are processed in the iteration order of ``fields``. At each
        level every existing bucket is replaced by one sub-bucket per listed
        value, in the listed order, holding the documents whose field equals
        that value. Empty sub-buckets are kept, and documents whose value is
        not listed are dropped. Documents are read with
        ``document[field_name]``, so a dict lacking the field raises
        ``KeyError``.

        :param documents: list of documents
        :param fields: mapping of field name to the ordered values to split
            on, e.g. {"is_common_city":[1,0],"score":[1500,1000,800]}
        :return: flat list of buckets (lists of documents); ``[documents]``
            when ``fields`` is empty
        """
        buckets = [documents]
        for name in fields:
            buckets = [
                [doc for doc in bucket if doc[name] == wanted]
                for bucket in buckets
                for wanted in fields[name]
            ]
        return buckets


if __name__ == "__main__":
    # Demo: bucket the sample documents by city / score / promotion flag,
    # then scatter every non-empty bucket by merchant and print the result.
    documents = [{"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 1},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 11},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 12},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 13},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 14},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 15},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 1},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": False, "id": 2},
                 {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 3},
                 {"is_common_city": 1, "score": 1000, "merchant_id": 1113, "is_promote": False, "id": 4},
                 {"is_common_city": 1, "score": 1000, "merchant_id": 1111, "is_promote": True, "id": 5},
                 {"is_common_city": 0, "score": 1500, "merchant_id": 1114, "is_promote": True, "id": 6},
                 {"is_common_city": 0, "score": 1500, "merchant_id": 1114, "is_promote": False, "id": 7},
                 {"is_common_city": 0, "score": 1000, "merchant_id": 1113, "is_promote": True, "id": 8},
                 {"is_common_city": 0, "score": 1000, "merchant_id": 1113, "is_promote": False, "id": 9},
                 {"is_common_city": 0, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 10}]
    hierarchy = HierarchyStrategy()
    scatter = ScatterStrategy()
    buckets = hierarchy.strategy(
        documents,
        fields={"is_common_city": [1, 0], "score": [1500, 1000, 0], "is_promote": [True, False]})
    # Skip the buckets that received no documents.
    for bucket in [group for group in buckets if group != []]:
        print(scatter.strategy(bucket, "merchant_id"))