# -*- coding:utf-8 -*-
# author:gm
# mail: zhangguodong@igengmei.com
# datetime:2020/4/24 3:32 PM
# software: PyCharm
from functools import reduce
import copy


class Business(object):
    """Placeholder business object; ``sort`` currently does nothing."""

    def __init__(self):
        pass

    def sort(self):
        pass


class Strategy(object):
    """Base strategy whose ``run`` hands the documents back unchanged."""

    def __init__(self):
        pass

    def run(self, documents):
        return documents


class ScatterStrategy(Strategy):
    """Reorder documents so that equal field values are spread apart."""

    def __init__(self):
        self.name = "打散"

    def strategy(self, documents, field_name, win_len=4):
        """
        Reorder ``documents`` so that documents sharing the same value of
        ``field_name`` are kept at least ``win_len`` positions apart whenever
        possible. Values with more remaining documents are tried first.

        When no value can be placed under the current window, the window is
        shrunk one step at a time until a document can be placed. The shrunk
        window stays in effect for all later picks of this call.

        :param documents: sequence of dict-like documents
        :param field_name: key used to group the documents
        :param win_len: minimum distance between two documents that share
            the same field value
        :return: new list holding the same documents in scattered order
        """
        field_dict = self.build_field_dict(documents, field_name)
        picked_idx = []
        last_position = {}
        total = len(documents)

        while len(picked_idx) < total:
            candidates = self.get_sorted_value(field_dict)
            placed = self.ask(field_dict, candidates, win_len,
                              last_position, picked_idx)
            # Greedy fallback: relax the window until something fits. With a
            # window of 0 every candidate qualifies, so this terminates.
            while not placed and win_len > 0:
                win_len -= 1
                placed = self.ask(field_dict, candidates, win_len,
                                  last_position, picked_idx)

        return [documents[idx] for idx in picked_idx]

    def ask(self, field_dict, sorted_field, win_len=4,
            last_field_value_pointer=None, ret_documents_idx=None):
        """
        Try to place exactly one document, scanning ``sorted_field`` in order.

        A value qualifies when it has never been placed, or when its previous
        placement is at least ``win_len`` positions before the next output
        slot. The first qualifying value has its first pending index moved
        from ``field_dict`` to ``ret_documents_idx``.

        :param field_dict: grouped data as built by ``build_field_dict``;
            mutated in place
        :param sorted_field: candidate values in priority order
        :param win_len: minimum distance between equal values
        :param last_field_value_pointer: maps a value to the output offset of
            its latest placement; mutated in place. A fresh dict is used when
            omitted.
        :param ret_documents_idx: output list of document indices; mutated in
            place. A fresh list is used when omitted.
        :return: True if a document index was appended, otherwise False
        """
        # The defaults used to be literal ``{}`` / ``[]``, which Python
        # creates once and shares between calls; build fresh ones instead.
        if last_field_value_pointer is None:
            last_field_value_pointer = {}
        if ret_documents_idx is None:
            ret_documents_idx = []

        position = len(ret_documents_idx)
        for value in sorted_field:
            if value in last_field_value_pointer:
                gap = position - last_field_value_pointer[value]
                if gap < win_len:
                    # Too close to the previous occurrence of this value.
                    continue
            group = field_dict[value]
            ret_documents_idx.append(group["items"].pop(0))
            group["len"] -= 1
            last_field_value_pointer[value] = position
            return True
        return False

    def get_sorted_value(self, field_dict):
        """
        List the values that still have pending documents, the value with the
        most pending documents first. Ties keep the dict's iteration order.

        :param field_dict: grouped data as built by ``build_field_dict``
        :return: list of values
        """
        remaining = [key for key in field_dict if field_dict[key]["len"] > 0]
        remaining.sort(key=lambda key: field_dict[key]["len"], reverse=True)
        return remaining

    def build_field_dict(self, documents, field_name):
        """
        Group document indices by the value stored under ``field_name``.
        Documents that lack the field are grouped under the key
        ``"empty_value"``.

        :param documents: iterable of dict-like documents
        :param field_name: key to group by
        :return: dict mapping value -> {"items": [indices], "len": count}
        """
        ret = {}
        for idx, document in enumerate(documents):
            if field_name in document:
                value = document[field_name]
            else:
                value = "empty_value"
            group = ret.setdefault(value, {"items": [], "len": 0})
            group["items"].append(idx)
            group["len"] += 1
        return ret


class HierarchyStrategy(Strategy):
    """Split documents into nested buckets, one level per field."""

    def __init__(self):
        pass

    def strategy(self, documents, fields):
        """
        Partition ``documents`` field by field. Every existing bucket is split
        into one bucket per listed value of the current field, in the listed
        order. Documents whose value is not listed are dropped, and empty
        buckets are kept in the result.

        :param documents: list of dict-like documents; each must contain every
            field named in ``fields``
        :param fields: {"is_common_city":[1,0],"score":[1500,1000,800]}
        :return: flat list of buckets (lists of documents)
        """
        result = [documents]
        for field_name in fields:
            result = [
                [doc for doc in bucket if doc[field_name] == value]
                for bucket in result
                for value in fields[field_name]
            ]
        return result


if __name__ == "__main__":
    s = ScatterStrategy()
    documents = [
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 1},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 11},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 12},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 13},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 14},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 15},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 1},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1112, "is_promote": False, "id": 2},
        {"is_common_city": 1, "score": 1500, "merchant_id": 1111, "is_promote": True, "id": 3},
        {"is_common_city": 1, "score": 1000, "merchant_id": 1113, "is_promote": False, "id": 4},
        {"is_common_city": 1, "score": 1000, "merchant_id": 1111, "is_promote": True, "id": 5},
        {"is_common_city": 0, "score": 1500, "merchant_id": 1114, "is_promote": True, "id": 6},
        {"is_common_city": 0, "score": 1500, "merchant_id": 1114, "is_promote": False, "id": 7},
        {"is_common_city": 0, "score": 1000, "merchant_id": 1113, "is_promote": True, "id": 8},
        {"is_common_city": 0, "score": 1000, "merchant_id": 1113, "is_promote": False, "id": 9},
        {"is_common_city": 0, "score": 1500, "merchant_id": 1112, "is_promote": True, "id": 10},
    ]
    model = HierarchyStrategy()
    result = model.strategy(
        documents,
        fields={"is_common_city": [1, 0],
                "score": [1500, 1000, 0],
                "is_promote": [True, False]})
    # Scatter each non-empty bucket by merchant.
    for item in result:
        if item != []:
            print(s.strategy(item, "merchant_id"))