Commit d6acc0fb authored by 段英荣

add

parent d6371db2
Django==1.8
kafka-python>=1.2.1,<1.4
elasticsearch==6.3.1
redis==2.10.6
celery==4.2.1
redlock==1.2.0
kombu==4.2.2
PyMySQL==0.9.2
gunicorn==19.9.0
gevent==1.3.7
git+ssh://git@git.wanmeizhensuo.com/backend/gm-rpcd.git@master
git+ssh://git@git.wanmeizhensuo.com/backend/helios.git@master
git+ssh://git@git.wanmeizhensuo.com/backend/gm-logging.git@master
git+ssh://git@git.wanmeizhensuo.com/backend/gm-config.git@v0.1.3#egg=gm-config==0.1.3
git+ssh://git@git.wanmeizhensuo.com/backend/gm-protocol.git@master
git+ssh://git@git.wanmeizhensuo.com/backend/gm-upload.git@master
git+ssh://git@git.wanmeizhensuo.com/system/gm-tracer.git@v0.1.2
git+ssh://git@git.wanmeizhensuo.com/alpha/alpha-types.git@dev
git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
{
  "dynamic":"strict",
  "properties": {
    "id":{"type":"long"},
    "suggest":{
      "type":"completion"
    },
    "suggest_type":{"type":"long"},// 0 = Chinese characters, 1 = Chinese characters (full name), 2 = pinyin, 3 = pinyin (full), 4 = pinyin acronym, 5 = pinyin acronym (full)
    "data_type":{"type":"long"},// 0 = tag, 1 = hospital, 2 = doctor, 3 = wiki
    "ori_name":{"type":"text","index":"not_analyzed"},// original name. NOTE(review): "index":"not_analyzed" is pre-5.x string-mapping syntax; Elasticsearch 6.x probably expects "type":"keyword" here - confirm against the target cluster
    "order_weight":{"type":"double"},// order weight
    "offline_score":{"type":"double"},// offline score
    "results_num":{"type":"integer"},// number of results
    "type_flag":{"type":"text","index":"not_analyzed"},
    "is_online":{"type":"long"}// online flag
  }
}
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
import traceback
import logging
from libs.es import ESPerform
from django.db import models
import datetime
class Doctor(models.Model):
    """Doctor record stored in the ``api_doctor`` table under the ``api`` app label.

    Only the columns declared here are mapped: the string primary key,
    the doctor's name and the online flag.
    """

    # String primary key (up to 100 characters).
    id = models.CharField(
        max_length=100,
        primary_key=True,
    )
    # Doctor's name; the verbose name is the Chinese word for "name".
    name = models.CharField(
        max_length=200,
        verbose_name=u"姓名",
        db_index=True,
    )
    # Online flag; defaults to False.
    is_online = models.BooleanField(
        default=False,
        help_text=u"是否上线",
        verbose_name=u"上线",
        db_index=True,
    )

    class Meta:
        # The verbose names read "31. Doctor" in Chinese.
        verbose_name = u'31. 医生'
        verbose_name_plural = u'31. 医生'
        db_table = 'api_doctor'
        app_label = 'api'
# coding=utf-8
from __future__ import unicode_literals, print_function, absolute_import
import time
import datetime
import logging
import traceback
import django.db.models
from django.conf import settings
from libs.es import ESPerform
import elasticsearch
import elasticsearch.helpers
import sys
import copy
from trans2es.models import doctor
from trans2es.utils.doctor_transfer import DoctorTransfer
from libs.es import ESPerform
# Module-level cache for the Elasticsearch client; filled on first use.
__es = None


def get_elasticsearch_instance():
    """Return the cached Elasticsearch client, creating it on the first call.

    The client is obtained from ``ESPerform.get_cli()`` and stored in the
    module-level ``__es`` so later calls reuse the same object.
    """
    global __es
    if __es is not None:
        return __es
    __es = ESPerform.get_cli()
    return __es
def get_es_list_by_type(es_type):
    """Return the Elasticsearch clients to use for ``es_type``.

    ``es_type`` is currently ignored: the result is always a one-element
    list holding the client from ``get_elasticsearch_instance()``.
    """
    es_client = get_elasticsearch_instance()
    return [es_client]
class TypeInfo(object):
    """Describe how instances of one model are turned into documents and bulk-indexed.

    Constructor arguments are stored as attributes of the same name:

    * ``name`` / ``type`` -- labels used in log messages and progress lines.
    * ``model`` -- model class whose ``objects`` manager is queried.
    * ``query_deferred`` -- zero-argument callable returning the query
      used by the ``query`` / ``queryset`` properties.
    * ``get_data_func`` -- callable taking one instance and returning
      ``(item_dict, suggest_list)``.
    * ``batch_get_data_func`` -- optional callable taking a list of pks
      (not instances) and returning the document list directly; when set
      it is used instead of ``get_data_func``.
    * ``bulk_insert_chunk_size``, ``round_insert_chunk_size``,
      ``round_insert_period``, ``logic_database_id`` -- stored only; not
      used inside this class.
    """

    def __init__(
            self,
            name,
            type,
            model,
            query_deferred,
            get_data_func,
            bulk_insert_chunk_size,
            round_insert_chunk_size,
            round_insert_period,
            batch_get_data_func=None,  # receive a list of pks, not instance
            logic_database_id=None,
    ):
        self.name = name
        self.type = type
        self.model = model
        self.query_deferred = query_deferred
        self.get_data_func = get_data_func
        self.batch_get_data_func = batch_get_data_func
        # Goes through the property setter, so it is stored as a frozenset.
        self.pk_blacklist = ()
        self.bulk_insert_chunk_size = bulk_insert_chunk_size
        self.round_insert_chunk_size = round_insert_chunk_size
        self.round_insert_period = round_insert_period
        self.logic_database_id = logic_database_id

    @property
    def query(self):
        """Return a fresh query by calling ``query_deferred``."""
        return self.query_deferred()

    @property
    def queryset(self):
        """Return a new ``QuerySet`` over ``model`` built from ``query``."""
        return django.db.models.QuerySet(model=self.model, query=self.query)

    @property
    def pk_blacklist(self):
        """Frozenset of primary keys that are skipped when building documents."""
        return self.__pk_blacklist

    @pk_blacklist.setter
    def pk_blacklist(self, value):
        self.__pk_blacklist = frozenset(value)

    def bulk_get_data(self, instance_iterable):
        """Build the list of documents for ``instance_iterable``.

        Uses ``batch_get_data_func`` when one was supplied, otherwise calls
        ``get_data_func`` per instance. Failures are logged and never
        raised; the affected instances (or, in batch mode, the whole
        batch) simply contribute no documents.
        """
        if self.batch_get_data_func:
            return self._bulk_get_data_batched(instance_iterable)
        return self._bulk_get_data_per_instance(instance_iterable)

    def _bulk_get_data_batched(self, instance_iterable):
        """Collect usable pks and hand them to ``batch_get_data_func`` in one call."""
        not_found_pk_list = []
        blacklisted_pk_list = []
        pk_list = []
        for instance in instance_iterable:
            pk = getattr(instance, 'pk', None)
            if pk is None:
                not_found_pk_list.append(pk)
            elif pk in self.__pk_blacklist:
                blacklisted_pk_list.append(pk)
            else:
                pk_list.append(pk)

        if not_found_pk_list:
            # No exception is active here, so logging.exception() would
            # append a meaningless "NoneType: None" traceback.
            logging.error('those pks not found for name={}, doc_type={}, pk_list={}'.format(
                self.name,
                self.type,
                str(not_found_pk_list),
            ))
        if blacklisted_pk_list:
            logging.info('those pks are in blacklist for name={}, doc_type={}, pk_list={}'.format(
                self.name,
                self.type,
                str(blacklisted_pk_list),
            ))

        try:
            return self.batch_get_data_func(pk_list)
        except Exception:
            traceback.print_exc()
            logging.exception('bulk_get_data for name={}, doc_type={}, pk_list={}'.format(
                self.name,
                self.type,
                str(pk_list),
            ))
            return []

    def _bulk_get_data_per_instance(self, instance_iterable):
        """Call ``get_data_func`` per instance and emit one document per suggest item."""
        data_list = []
        for instance in instance_iterable:
            pk = getattr(instance, 'pk', None)
            try:
                if pk is None:
                    raise Exception('pk not found')
                if pk in self.__pk_blacklist:
                    logging.info('bulk_get_data for name={}, doc_type={}, pk={}: ignore blacklisted pk'.format(
                        self.name,
                        self.type,
                        pk,
                    ))
                    continue

                (item_dict, suggest_list) = self.get_data_func(instance)
                for suggest_item in suggest_list:
                    # Each suggest item becomes its own document carrying a
                    # private copy of the shared fields.
                    # NOTE(review): every copy keeps the same "id", and
                    # suggest_type/offline_score are nested under "suggest"
                    # rather than set as top-level fields -- confirm that
                    # ESPerform.es_helpers_bulk and the index mapping
                    # expect this shape.
                    suggest_dict = copy.deepcopy(item_dict)
                    suggest_dict["suggest"] = {
                        "input": suggest_item["input"],
                        "suggest_type": suggest_item["suggest_type"],
                        "offline_score": suggest_item["word_weight"]
                    }
                    data_list.append(suggest_dict)
            except Exception:
                # One bad instance must not abort the whole chunk.
                traceback.print_exc()
                logging.exception('bulk_get_data for name={}, doc_type={}, pk={}'.format(
                    self.name,
                    self.type,
                    pk,
                ))
        return data_list

    def elasticsearch_bulk_insert_data(self, sub_index_name, data_list, es=None):
        """Send ``data_list`` to ``sub_index_name`` via ``ESPerform.es_helpers_bulk``.

        Returns whatever ``ESPerform.es_helpers_bulk`` returns.
        """
        return ESPerform.es_helpers_bulk(es, data_list, sub_index_name, True)

    def elasticsearch_bulk_insert(self, sub_index_name, instance_iterable, es=None):
        """Build documents for ``instance_iterable`` and bulk-insert them."""
        data_list = self.bulk_get_data(instance_iterable)
        return self.elasticsearch_bulk_insert_data(
            sub_index_name=sub_index_name,
            data_list=data_list,
            es=es,
        )

    def insert_table_by_pk_list(self, sub_index_name, pk_list, es=None, use_batch_query_set=False):
        """Load the rows whose pk is in ``pk_list`` and bulk-insert their documents.

        With ``use_batch_query_set`` the rows come from ``self.queryset``,
        otherwise from ``self.model.objects.all()``. Returns the bulk
        insert result, matching ``elasticsearch_bulk_insert``.
        """
        if use_batch_query_set:
            qs = self.queryset
        else:
            qs = self.model.objects.all()
        instance_list = qs.filter(pk__in=pk_list)
        data_list = self.bulk_get_data(instance_list)
        return self.elasticsearch_bulk_insert_data(
            sub_index_name=sub_index_name,
            data_list=data_list,
            es=es,
        )

    def insert_table_chunk(self, sub_index_name, table_chunk, es=None):
        """Index one table chunk and return a one-line timing report.

        ``table_chunk`` must be iterable and provide ``get_pk_start()`` and
        ``get_pk_stop()``. The report holds the wall-clock duration of
        three stages (materialise the chunk, build documents, bulk insert),
        the CPU-clock duration of the whole call and the bulk response.
        """
        # time.clock() was removed in Python 3.8; prefer process_time() and
        # fall back to clock() only where process_time() does not exist.
        cpu_clock = getattr(time, 'process_time', None) or time.clock

        start_clock = cpu_clock()
        start_time = time.time()

        instance_list = list(table_chunk)
        stage_1_time = time.time()

        data_list = self.bulk_get_data(instance_list)
        stage_2_time = time.time()

        es_result = ESPerform.es_helpers_bulk(
            es_cli=es,
            data_list=data_list,
            sub_index_name=sub_index_name,
            auto_create_index=True
        )
        stage_3_time = time.time()
        end_clock = cpu_clock()

        return ('{datetime} {index_prefix} {type_name:10s} {pk_start:>15s} {pk_stop:>15s} {count:5d} '
                '{stage_1_duration:6.3f} {stage_2_duration:6.3f} {stage_3_duration:6.3f} {clock_duration:6.3f} '
                '{response}').format(
            datetime=datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'),
            index_prefix=sub_index_name,
            type_name=self.name,
            pk_start=repr(table_chunk.get_pk_start()),
            pk_stop=repr(table_chunk.get_pk_stop()),
            count=len(instance_list),
            stage_1_duration=stage_1_time - start_time,
            stage_2_duration=stage_2_time - stage_1_time,
            stage_3_duration=stage_3_time - stage_2_time,
            clock_duration=end_clock - start_clock,
            response=es_result,
        )
# Cache for get_type_info_map(); stays None until the first call.
_get_type_info_map_result = None


def get_type_info_map():
    """Return the mapping from type name to ``TypeInfo``, built on first use.

    The mapping is cached in the module-level ``_get_type_info_map_result``
    and the same dict object is returned on every later call.
    """
    global _get_type_info_map_result
    if not _get_type_info_map_result:
        doctor_tips_info = TypeInfo(
            name='doctor_tips',  # doctor-name suggestions
            type='doctor_tips',
            model=doctor.Doctor,
            query_deferred=lambda: doctor.Doctor.objects.all().query,
            get_data_func=DoctorTransfer.get_doctor_suggest_data_list,
            bulk_insert_chunk_size=100,
            round_insert_chunk_size=5,
            round_insert_period=2,
        )
        registered_infos = [doctor_tips_info]
        _get_type_info_map_result = dict(
            (info.name, info) for info in registered_infos
        )
    return _get_type_info_map_result
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import logging
import traceback
from libs.tools import tzlc
import pypinyin
from pypinyin import lazy_pinyin
class DoctorTransfer(object):
    """Turn a doctor instance into the shared document fields plus suggest terms.

    Suggest terms are every non-empty substring of the doctor's name in
    three spellings: the original characters, the full pinyin, and the
    pinyin first-letter acronym. Each term's weight is its length ratio
    against the complete spelling multiplied by one of the class-level
    weights below ("full" for the complete spelling, "prefix" for any
    shorter term).
    """

    ch_full_weight = 6.0
    py_full_weight = 3.0
    py_acronym_full_weight = 3.0
    py_acronym_prefix_weight = 2
    ch_prefix_weight = 1.5
    py_prefix_weight = 1.0

    @classmethod
    def _build_suggest_items(cls, full_word, partial_weight, full_weight, partial_type, full_type):
        """Return one suggest item per non-empty (stripped) substring of ``full_word``.

        A term as long as ``full_word`` gets ``full_weight`` and
        ``full_type``; every shorter term gets ``partial_weight`` and
        ``partial_type``. Items are emitted ordered by start offset, then
        by end offset.
        """
        suggest_items = []
        full_length = len(full_word)
        for start in range(full_length):
            for stop in range(start + 1, full_length + 1):
                term = full_word[start:stop].strip()
                if not term:
                    continue
                is_full = len(term) == full_length
                weight = full_weight if is_full else partial_weight
                suggest_items.append({
                    "input": [term],
                    # float() keeps the ratio fractional on Python 2, where
                    # int / int would otherwise truncate to 0.
                    "word_weight": (float(len(term)) / full_length) * weight,
                    "suggest_type": full_type if is_full else partial_type,
                })
        return suggest_items

    @classmethod
    def get_doctor_suggest_data_list(cls, instance):
        """Build ``(item_dict, suggest_list)`` for one doctor ``instance``.

        ``instance`` must expose ``id``, ``name`` and ``is_online``.
        ``item_dict`` holds the fields shared by every suggest document.
        ``suggest_list`` holds dicts with ``input``, ``word_weight`` and
        ``suggest_type`` (0/1 characters, 2/3 pinyin, 4/5 pinyin acronym;
        the odd value marks the complete spelling).

        Any failure is logged and ``([], [])`` is returned instead of
        raising, so one bad record yields no documents.
        """
        try:
            item_dict = dict()
            item_dict["id"] = instance.id
            item_dict["ori_name"] = instance.name
            item_dict["is_online"] = instance.is_online
            item_dict["order_weight"] = 0.0
            item_dict["results_num"] = 0
            item_dict["type_flag"] = "unknown"
            item_dict["offline_score"] = 0.0
            item_dict["data_type"] = 2

            ch_full_word = instance.name
            py_full_word = ''.join(lazy_pinyin(ch_full_word))
            # lazy_pinyin() returns a list of per-character strings; join it
            # so the acronym can be sliced and stripped like the other
            # spellings (slicing the raw list raised AttributeError).
            py_acronym_full_word = ''.join(
                lazy_pinyin(ch_full_word, style=pypinyin.FIRST_LETTER)
            )

            suggest_list = list()
            # Original characters.
            suggest_list.extend(cls._build_suggest_items(
                ch_full_word, cls.ch_prefix_weight, cls.ch_full_weight, 0, 1))
            # Full pinyin.
            suggest_list.extend(cls._build_suggest_items(
                py_full_word, cls.py_prefix_weight, cls.py_full_weight, 2, 3))
            # Pinyin first-letter acronym.
            suggest_list.extend(cls._build_suggest_items(
                py_acronym_full_word, cls.py_acronym_prefix_weight, cls.py_acronym_full_weight, 4, 5))

            return (item_dict, suggest_list)
        except Exception:
            logging.error("catch exception,err_msg:%s" % traceback.format_exc())
            return ([], [])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment