add

d6acc0fb · 段英荣 · d6371db2 · d6acc0fb · d6acc0fb · d6acc0fb
Commit d6acc0fb authored Jan 15, 2019 by 段英荣
6 changed files
--- a/requirements.txt
+++ b/requirements.txt
+Django==1.8
+kafka-python>=1.2.1,<1.4
+elasticsearch==6.3.1
+redis==2.10.6
+celery==4.2.1
+redlock==1.2.0
+kombu==4.2.2
+PyMySQL==0.9.2
+gunicorn==19.9.0
+gevent==1.3.7
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-rpcd.git@master
+git+ssh://git@git.wanmeizhensuo.com/backend/helios.git@master
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-logging.git@master
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-config.git@v0.1.3#egg=gm-config==0.1.3
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-protocol.git@master
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-upload.git@master
+git+ssh://git@git.wanmeizhensuo.com/system/gm-tracer.git@v0.1.2
+git+ssh://git@git.wanmeizhensuo.com/alpha/alpha-types.git@dev
+git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
--- a/trans2es/mapping/suggest.json
+++ b/trans2es/mapping/suggest.json
+{
+  "dynamic":"strict",
+  "properties": {
+    "id":{"type":"long"},
+    "suggest":{
+      "type":"completion"
+    },
+    "suggest_type":{"type":"long"},//0-汉字,1-汉字全拼,2-拼音,3-拼音全拼,4-拼音简写,5-拼音简写全拼
+    "data_type":{"type":"long"},//0-tag,1-hospital,2-doctor,3-wiki
+    "ori_name":{"type":"text","index":"not_analyzed"},//原名称
+    "order_weight":{"type":"double"},//订单权重
+    "offline_score":{"type":"double"},//离线分
+    "results_num":{"type":"integer"},//结果数量
+    "type_flag":{"type":"text","index":"not_analyzed"},
+    "is_online":{"type":"long"}//上线
+  }
+}
\ No newline at end of file
--- a/trans2es/models/__init__.py
+++ b/trans2es/models/__init__.py
--- a/trans2es/models/doctor.py
+++ b/trans2es/models/doctor.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+import traceback
+import logging
+from libs.es import ESPerform
+from django.db import models
+import datetime
+class Doctor(models.Model):
+    # Django model bound to the existing ``api_doctor`` table (see Meta.db_table).
+    # Only the columns declared below are exposed through this model.
+    # Primary key is stored as a string of at most 100 characters.
+    id = models.CharField(
+        max_length=100,
+        primary_key=True,
+    )
+    # Doctor's display name (verbose name: "name"); indexed.
+    name = models.CharField(
+        max_length=200,
+        verbose_name=u"姓名",
+        db_index=True,
+    )
+    # Online flag (help text: "whether online"); defaults to False; indexed.
+    is_online = models.BooleanField(
+        default=False,
+        help_text=u"是否上线",
+        verbose_name=u"上线",
+        db_index=True,
+    )
+    class Meta:
+        # Registered under the ``api`` app label and mapped onto ``api_doctor``.
+        app_label = 'api'
+        db_table = 'api_doctor'
+        verbose_name = u'31. 医生'
+        verbose_name_plural = u'31. 医生'
--- a/trans2es/type_info.py
+++ b/trans2es/type_info.py
+# coding=utf-8
+from __future__ import unicode_literals, print_function, absolute_import
+import time
+import datetime
+import logging
+import traceback
+import django.db.models
+from django.conf import settings
+from libs.es import ESPerform
+import elasticsearch
+import elasticsearch.helpers
+import sys
+import copy
+from trans2es.models import doctor
+from trans2es.utils.doctor_transfer import DoctorTransfer
+from libs.es import ESPerform
+# Module-level cache for the Elasticsearch client; filled on first request.
+__es = None
+def get_elasticsearch_instance():
+    """Return the cached Elasticsearch client.
+
+    The client is obtained from ``ESPerform.get_cli()`` the first time this is
+    called (or whenever the cache still holds ``None``) and reused afterwards.
+    """
+    global __es
+    if __es is not None:
+        return __es
+    __es = ESPerform.get_cli()
+    return __es
+def get_es_list_by_type(es_type):
+    """Return the list of Elasticsearch clients to use for ``es_type``.
+
+    ``es_type`` is accepted but not inspected: the result is always a
+    one-element list holding the client from ``get_elasticsearch_instance()``.
+    """
+    shared_client = get_elasticsearch_instance()
+    return [shared_client]
+class TypeInfo(object):
+    """Describe how rows of one Django model are turned into Elasticsearch documents.
+
+    An instance bundles the model, a deferred query factory, the per-row (or
+    per-batch) document builder and the chunk-size settings, and offers helpers
+    that build documents and hand them to ``ESPerform.es_helpers_bulk``.
+    """
+    def __init__(
+            self,
+            name,
+            type,
+            model,
+            query_deferred,
+            get_data_func,
+            bulk_insert_chunk_size,
+            round_insert_chunk_size,
+            round_insert_period,
+            batch_get_data_func=None,  # receive a list of pks, not instance
+            logic_database_id=None,
+    ):
+        """Store the configuration; ``pk_blacklist`` starts out empty.
+
+        :param name: identifier used in log lines and as the registry key.
+        :param type: document type label, only used in log lines here.
+        :param model: Django model class queried by ``insert_table_by_pk_list``.
+        :param query_deferred: zero-argument callable returning a Django query.
+        :param get_data_func: callable taking one instance and returning
+            ``(item_dict, suggest_list)``.
+        :param bulk_insert_chunk_size: stored as-is; not used inside this class.
+        :param round_insert_chunk_size: stored as-is; not used inside this class.
+        :param round_insert_period: stored as-is; not used inside this class.
+        :param batch_get_data_func: optional callable taking a list of pks and
+            returning the finished document list; when set it replaces
+            ``get_data_func`` in ``bulk_get_data``.
+        :param logic_database_id: stored as-is; not used inside this class.
+        """
+        self.name = name
+        self.type = type
+        self.model = model
+        self.query_deferred = query_deferred
+        self.get_data_func = get_data_func
+        self.batch_get_data_func = batch_get_data_func
+        self.pk_blacklist = ()
+        self.bulk_insert_chunk_size = bulk_insert_chunk_size
+        self.round_insert_chunk_size = round_insert_chunk_size
+        self.round_insert_period = round_insert_period
+        self.logic_database_id = logic_database_id
+    @property
+    def query(self):
+        """Evaluate ``query_deferred`` and return its result (a fresh call each access)."""
+        return self.query_deferred()
+    @property
+    def queryset(self):
+        """Wrap ``self.query`` in a new ``QuerySet`` bound to ``self.model``."""
+        return django.db.models.QuerySet(model=self.model, query=self.query)
+    @property
+    def pk_blacklist(self):
+        """Primary keys that ``bulk_get_data`` must skip, as a frozenset."""
+        return self.__pk_blacklist
+    @pk_blacklist.setter
+    def pk_blacklist(self, value):
+        # Frozen so membership tests are O(1) and callers cannot mutate it in place.
+        self.__pk_blacklist = frozenset(value)
+    def bulk_get_data(self, instance_iterable):
+        """Build the list of documents for the given model instances.
+
+        With ``batch_get_data_func`` set, the usable pks are collected and passed
+        to it in one call. Otherwise ``get_data_func`` is called per instance and
+        every entry of its ``suggest_list`` becomes one document (a deep copy of
+        ``item_dict`` plus a ``"suggest"`` sub-dict).
+
+        Instances without a pk and blacklisted pks are skipped and logged.
+        Exceptions raised by the builder functions are logged, never propagated.
+
+        :returns: list of document dicts (possibly empty).
+        """
+        data_list = []
+        if self.batch_get_data_func:
+            _pk_list = [getattr(instance, 'pk', None) for instance in instance_iterable]
+            not_found_pk_list = []
+            blacklisted_pk_list = []
+            pk_list = []
+            for pk in _pk_list:
+                if pk is None:
+                    not_found_pk_list.append(pk)
+                elif pk in self.__pk_blacklist:
+                    blacklisted_pk_list.append(pk)
+                else:
+                    pk_list.append(pk)
+            if not_found_pk_list:
+                # No exception is active here, so logging.exception() would attach
+                # a meaningless "NoneType: None" traceback; log a plain error.
+                logging.error('those pks not found for name={}, doc_type={}, pk_list={}'.format(
+                    self.name,
+                    self.type,
+                    str(not_found_pk_list),
+                ))
+            if blacklisted_pk_list:
+                logging.info('those pks are in blacklist for name={}, doc_type={}, pk_list={}'.format(
+                    self.name,
+                    self.type,
+                    str(blacklisted_pk_list),
+                ))
+            try:
+                data_list = self.batch_get_data_func(pk_list)
+            except Exception:
+                # Best effort: a failing batch yields an empty result, not a crash.
+                traceback.print_exc()
+                logging.exception('bulk_get_data for name={}, doc_type={}, pk_list={}'.format(
+                    self.name,
+                    self.type,
+                    str(pk_list),
+                ))
+        else:
+            for instance in instance_iterable:
+                pk = getattr(instance, 'pk', None)
+                try:
+                    if pk is None:
+                        raise Exception('pk not found')
+                    if pk in self.__pk_blacklist:
+                        logging.info('bulk_get_data for name={}, doc_type={}, pk={}: ignore blacklisted pk'.format(
+                            self.name,
+                            self.type,
+                            pk,
+                        ))
+                        continue
+                    data = self.get_data_func(instance)
+                    (item_dict, suggest_list) = data
+                    for suggest_item in suggest_list:
+                        # Deep copy so the documents do not share mutable state.
+                        suggest_dict = copy.deepcopy(item_dict)
+                        # NOTE(review): the Elasticsearch completion field documents only
+                        # `input`, `weight` and `contexts` as sub-keys; confirm that the
+                        # extra keys below are accepted by the target mapping.
+                        suggest_dict["suggest"] = {
+                            "input": suggest_item["input"],
+                            "suggest_type": suggest_item["suggest_type"],
+                            "offline_score": suggest_item["word_weight"]
+                        }
+                        data_list.append(suggest_dict)
+                except Exception:
+                    # One bad row must not abort the rest of the chunk.
+                    traceback.print_exc()
+                    logging.exception('bulk_get_data for name={}, doc_type={}, pk={}'.format(
+                        self.name,
+                        self.type,
+                        pk,
+                    ))
+        return data_list
+    def elasticsearch_bulk_insert_data(self, sub_index_name, data_list, es=None):
+        """Send ``data_list`` to ``ESPerform.es_helpers_bulk`` and return its result.
+
+        The fourth positional argument ``True`` corresponds to the
+        ``auto_create_index`` keyword used in ``insert_table_chunk``.
+        """
+        return ESPerform.es_helpers_bulk(es,data_list,sub_index_name,True)
+    def elasticsearch_bulk_insert(self, sub_index_name, instance_iterable, es=None):
+        """Build documents for ``instance_iterable`` and bulk-insert them; return the bulk result."""
+        data_list = self.bulk_get_data(instance_iterable)
+        return self.elasticsearch_bulk_insert_data(
+            sub_index_name=sub_index_name,
+            data_list=data_list,
+            es=es,
+        )
+    def insert_table_by_pk_list(self, sub_index_name, pk_list, es=None, use_batch_query_set=False):
+        """Load the rows whose pk is in ``pk_list`` and bulk-insert their documents.
+
+        :param use_batch_query_set: when true, filter ``self.queryset`` (built from
+            ``query_deferred``); otherwise filter ``self.model.objects.all()``.
+        :returns: ``None`` (the bulk result is discarded).
+        """
+        if use_batch_query_set:
+            qs = self.queryset
+        else:
+            qs = self.model.objects.all()
+        instance_list = qs.filter(pk__in=pk_list)
+        data_list = self.bulk_get_data(instance_list)
+        self.elasticsearch_bulk_insert_data(
+            sub_index_name=sub_index_name,
+            data_list=data_list,
+            es=es,
+        )
+    def insert_table_chunk(self, sub_index_name, table_chunk, es=None):
+        """Index one table chunk and return a one-line timing report.
+
+        ``table_chunk`` must be iterable and provide ``get_pk_start()`` and
+        ``get_pk_stop()``. The report contains the wall-clock duration of the
+        three stages (load rows, build documents, bulk insert), the CPU-clock
+        duration of the whole call and the bulk response.
+        """
+        # time.clock() was removed in Python 3.8; use process_time() where it
+        # exists and fall back to clock() on interpreters that predate it.
+        cpu_clock = getattr(time, 'process_time', None) or time.clock
+        start_clock = cpu_clock()
+        start_time = time.time()
+        instance_list = list(table_chunk)
+        stage_1_time = time.time()
+        data_list = self.bulk_get_data(instance_list)
+        stage_2_time = time.time()
+        es_result = ESPerform.es_helpers_bulk(
+            es_cli=es,
+            data_list=data_list,
+            sub_index_name=sub_index_name,
+            auto_create_index=True
+        )
+        stage_3_time = time.time()
+        end_clock = cpu_clock()
+        return ('{datetime} {index_prefix} {type_name:10s} {pk_start:>15s} {pk_stop:>15s} {count:5d} '
+                '{stage_1_duration:6.3f} {stage_2_duration:6.3f} {stage_3_duration:6.3f} {clock_duration:6.3f} '
+                '{response}').format(
+            datetime=datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'),
+            index_prefix=sub_index_name,
+            type_name=self.name,
+            pk_start=repr(table_chunk.get_pk_start()),
+            pk_stop=repr(table_chunk.get_pk_stop()),
+            count=len(instance_list),
+            stage_1_duration=stage_1_time - start_time,
+            stage_2_duration=stage_2_time - stage_1_time,
+            stage_3_duration=stage_3_time - stage_2_time,
+            clock_duration=end_clock - start_clock,
+            response=es_result,
+        )
+# Memoised result of get_type_info_map(); stays None until the first call.
+_get_type_info_map_result = None
+def get_type_info_map():
+    """Return the ``{name: TypeInfo}`` registry, building it on first use.
+
+    The registry currently holds a single entry, ``doctor_tips``, which feeds
+    ``Doctor`` rows through ``DoctorTransfer.get_doctor_suggest_data_list``.
+    The same dict object is returned on every later call.
+    """
+    global _get_type_info_map_result
+    if not _get_type_info_map_result:
+        registered = (
+            # Suggest documents built from the Doctor model.
+            TypeInfo(
+                name='doctor_tips',
+                type='doctor_tips',
+                model=doctor.Doctor,
+                query_deferred=lambda: doctor.Doctor.objects.all().query,
+                get_data_func=DoctorTransfer.get_doctor_suggest_data_list,
+                bulk_insert_chunk_size=100,
+                round_insert_chunk_size=5,
+                round_insert_period=2,
+            ),
+        )
+        _get_type_info_map_result = dict(
+            (info.name, info) for info in registered
+        )
+    return _get_type_info_map_result
--- a/trans2es/utils/doctor_transfer.py
+++ b/trans2es/utils/doctor_transfer.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import sys
+import logging
+import traceback
+from libs.tools import tzlc
+import pypinyin
+from pypinyin import lazy_pinyin
+class DoctorTransfer(object):
+    """Build suggest documents (base fields plus weighted search terms) for a doctor row."""
+    # Weight multipliers. "full" applies to a term equal to the whole word,
+    # "prefix" to every shorter substring. ch = Chinese name, py = pinyin,
+    # py_acronym = first letters of the pinyin syllables.
+    ch_full_weight = 6.0
+    py_full_weight = 3.0
+    py_acronym_full_weight = 3.0
+    py_acronym_prefix_weight = 2
+    ch_prefix_weight = 1.5
+    py_prefix_weight = 1.0
+    @staticmethod
+    def _build_suggest_items(full_word, partial_weight, full_weight, partial_type, full_type):
+        """Return one suggest item per non-blank substring of ``full_word``.
+
+        Substrings are visited by start index, then by end index. Each item is
+        ``{"input": [term], "word_weight": ..., "suggest_type": ...}`` where
+        ``word_weight`` is ``len(term) / len(full_word)`` times ``full_weight``
+        when the stripped term is as long as ``full_word`` and times
+        ``partial_weight`` otherwise; ``suggest_type`` is chosen the same way.
+        An empty ``full_word`` yields an empty list.
+        """
+        items = []
+        # float() keeps the ratio fractional on Python 2, where int/int truncates.
+        full_len = float(len(full_word))
+        for start in range(len(full_word)):
+            for stop in range(start + 1, len(full_word) + 1):
+                term = full_word[start:stop].strip()
+                if not term:
+                    continue
+                is_full = len(term) == len(full_word)
+                items.append({
+                    "input":[term],
+                    "word_weight":(len(term) / full_len) * (full_weight if is_full else partial_weight),
+                    "suggest_type": full_type if is_full else partial_type
+                })
+        return items
+    @classmethod
+    def get_doctor_suggest_data_list(cls,instance):
+        """Return ``(item_dict, suggest_list)`` for one doctor instance.
+
+        ``item_dict`` carries the base document fields read from ``instance``
+        (``id``, ``name``, ``is_online``) plus fixed placeholder values.
+        ``suggest_list`` holds the items for every substring of the Chinese
+        name (suggest_type 0 partial / 1 full), of its pinyin (2 / 3) and of its
+        pinyin first-letter acronym (4 / 5).
+
+        Any exception is logged and ``([], [])`` is returned instead.
+        """
+        try:
+            item_dict = dict()
+            item_dict["id"] = instance.id
+            item_dict["ori_name"] = instance.name
+            item_dict["is_online"] = instance.is_online
+            item_dict["order_weight"] = 0.0
+            item_dict["results_num"] = 0
+            item_dict["type_flag"] = "unknown"
+            item_dict["offline_score"] = 0.0
+            item_dict["data_type"] = 2
+            ch_full_word = instance.name
+            py_full_word = ''.join(lazy_pinyin(ch_full_word))
+            # lazy_pinyin returns a list of syllables; it must be joined before
+            # slicing, otherwise the slices are lists and have no .strip().
+            py_acronym_full_word = ''.join(lazy_pinyin(ch_full_word,style=pypinyin.FIRST_LETTER))
+            suggest_list = list()
+            # Chinese characters
+            suggest_list.extend(cls._build_suggest_items(
+                ch_full_word, cls.ch_prefix_weight, cls.ch_full_weight, 0, 1))
+            # Pinyin
+            suggest_list.extend(cls._build_suggest_items(
+                py_full_word, cls.py_prefix_weight, cls.py_full_weight, 2, 3))
+            # Pinyin acronym
+            suggest_list.extend(cls._build_suggest_items(
+                py_acronym_full_word, cls.py_acronym_prefix_weight, cls.py_acronym_full_weight, 4, 5))
+            return (item_dict,suggest_list)
+        except Exception:
+            logging.error("catch exception,err_msg:%s" % traceback.format_exc())
+            return ([],[])