#!/usr/bin/env python # -*- coding: utf-8 -*- import openpyxl import os import random import copy from django.core.management import BaseCommand from utils.rpc import rpc_client from qa.models import BDAnswer, Answer ''' 百度熊掌号-首期数据同步 ''' def read_from_submit_answer_file(path): '''已提交的回答''' re_data = {} # {answer_id : 医生id} file_name = path + 'bd_submit_list.xlsx' wb = openpyxl.load_workbook(file_name) answers_sheet = wb['工作表 1'] for i, row in enumerate(answers_sheet.rows): if i > 1 and row[0].value: re_data[int(row[0].value)] = str(row[1].value).strip() return re_data def read_from_file(path, info_type): re_data = set() sheet_name = '工作表 1' if info_type == 'answer': # 原库中的回答id、问题、回答内容 file_name = path + 'bd_answers_list.xlsx' sheet_name = '回答列表' elif info_type == 'doctor': # 医生id、医生名称、医生科室、医生职称、医院类型、医院等级 file_name = path + 'bd_doctors_list.xlsx' else: return [] wb = openpyxl.load_workbook(file_name) data_sheet = wb[sheet_name] for i, row in enumerate(data_sheet.rows): if i > 1 and row[0].value: if info_type == 'answer': re_data.add(int(row[0].value)) else: re_data.add(str(row[0].value).strip()) return list(re_data) def get_answer_and_doctor_related(answers, doctors): '''{answer_id:doctor_id}''' x = int(len(answers) / len(doctors)) left = len(answers) % len(doctors) answer_list = copy.deepcopy(answers) doctor_dict = {} index = 0 for doc in doctors: try: doctor_dict[doc] = answer_list[index:index+x] index = index+x except Exception as e: print(e) if left: doctor_extra_ids = random.sample(doctors, left) for id in doctor_extra_ids: doctor_dict[id].append(answer_list.pop()) # 从列表末位 return_data = {} # organize data for k, v in doctor_dict.items(): for answer in v: return_data[answer] = k return return_data def get_qa_info(answer_list, path): re_data = {} import_question_list = [] # 提交到百度的问题id列表 answers_queryset = Answer.objects.filter(id__in=answer_list) for item in answers_queryset: re_data[item.id] = { 'original_id': item.id, 'question': item.question, 'content': item.content, 'is_recommend': item.is_recommend, 'like_num': item.like_num, 'create_time': item.create_time, 'is_submit': True, } import_question_list.append("{question_id}\n".format(question_id=item.question.id)) with open(path + "bd_import_question_id.txt", "w+") as f: f.writelines(import_question_list) print("write data to file : done") return re_data def get_doctor_info(doctor_list): re_data = {} doctors_info = rpc_client['doctor/user/get_doctors'](doctor_ids=doctor_list).unwrap() hospital_info = rpc_client['api/hospital/basic_info'](hospital_ids=[d.get('hospital') for d in doctors_info.get('doctors')]).unwrap() hospital_dict = {h.get('hospital_id'): h for h in hospital_info} for item in doctors_info.get('doctors'): hospital = hospital_dict.get(item.get('hospital')) re_data[item.get('id')] = { 'user_id': item.get('user'), 'name': item.get('name'), 'portrait': item.get('portrait'), 'doctor_id': item.get('id'), 'doctor_title': item.get('title'), 'doctor_department': item.get('department'), 'hospital_type': hospital.get('hospital_type') if hospital else '', 'hospital_name': hospital.get('hospital_name') if hospital else '', 'hospital_level': hospital.get('hospital_level') if hospital else '' } return re_data def insert_data(info_relation, qa_info, doctor_info): insert_lst = [] for answer_id, answer in qa_info.items(): doctor = doctor_info.get(info_relation.get(answer['original_id'])) if not doctor: continue create_kwargs = { 'original_id': answer['original_id'], 'question': answer['question'], 'content': answer['content'], 'is_recommend': answer['is_recommend'], 'like_num': answer['like_num'], 'create_time': answer['create_time'], 'is_submit': answer['is_submit'], 'user_id': doctor['user_id'], 'name': doctor['name'], 'portrait': doctor['portrait'], 'doctor_id': doctor['doctor_id'], 'doctor_title': doctor['doctor_title'], 'doctor_department': doctor['doctor_department'], 'hospital_type': doctor['hospital_type'], 'hospital_name': doctor['hospital_name'], 'hospital_level': doctor['hospital_level'] } insert_lst.append(BDAnswer(**create_kwargs)) print('insert list length') print(len(insert_lst)) print('insert list length') BDAnswer.objects.bulk_create(insert_lst) def query_answer_in_bd(answer_is_submit): BDAnswer.objects.filter(original_id__in=answer_is_submit).update(is_submit=True) id_list = BDAnswer.objects.values_list('original_id', flat=True) return id_list def delete_wrong_data(): BDAnswer.objects.filter(doctor_id='8949').delete() class Command(BaseCommand): def handle(self, *args, **kwargs): print("start") delete_wrong_data() parent_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) path = parent_path+'/files/' # 文件上传地址确定 # answer_is_submit = read_from_submit_answer_file(path) # 已提交的第一批数据 answer_is_submit = {} # 已提交的第一批数据 answer_list = read_from_file(path, 'answer') doctor_list = read_from_file(path, 'doctor') if not answer_list or not doctor_list: print('data missing, check the file plz') exit() print('get data from file : done') # 处理回答和医生的关联关系 info_dict = get_answer_and_doctor_related(answer_list, doctor_list) # 处理已提交的数据覆盖到现有数据中 for answer, doctor in answer_is_submit.items(): info_dict[answer] = doctor # 重复的回答已第一期已提交的为准 # 获取数据 already_in_db_answer_id_lst = query_answer_in_bd(answer_is_submit) # 需要过滤掉已有数据 answer_list.extend(list(answer_is_submit.keys())) answer_list = list(set(answer_list).difference(set(already_in_db_answer_id_lst))) qa_info = get_qa_info(answer_list, path) for k in answer_is_submit.keys(): # 第一批数据已提交 if k in qa_info: qa_info[k]['is_submit'] = True doctor_list.extend(list(answer_is_submit.values())) doctor_info = get_doctor_info(doctor_list) print('get data from db : done') print('start insert') # 插入数据 insert_data(info_dict, qa_info, doctor_info) print("end")