Commit 2d7a880c authored by 段英荣

Merge branch 'similar_sort' into 'master'

Similar sort

See merge request !153
parents 236b7237 e51ef4f3
...@@ -5,6 +5,8 @@ from __future__ import unicode_literals, print_function, absolute_import ...@@ -5,6 +5,8 @@ from __future__ import unicode_literals, print_function, absolute_import
import six import six
import random import random
from django.db import models from django.db import models
import logging
import traceback
class ITableChunk(object): class ITableChunk(object):
...@@ -147,36 +149,55 @@ class TableSlicerChunk(ITableChunk): ...@@ -147,36 +149,55 @@ class TableSlicerChunk(ITableChunk):
class TableSlicer(object): class TableSlicer(object):
def __init__(self, queryset, chunk_size=None, chunk_count=None, sep_list=None): def __init__(self, queryset, chunk_size=None, chunk_count=None, sep_list=None):
assert isinstance(queryset, models.QuerySet) try:
assert chunk_size is None or isinstance(chunk_size, six.integer_types) logging.info("duan add,before assert queryset")
assert chunk_count is None or isinstance(chunk_count, six.integer_types) assert isinstance(queryset, models.QuerySet)
assert sep_list is None or isinstance(sep_list, list)
assert (chunk_size is not None) + (chunk_count is not None) + (sep_list is not None) == 1 logging.info("duan add,before assert chunk_size")
assert chunk_size is None or isinstance(chunk_size, six.integer_types)
if sep_list is not None: logging.info("duan add,before assert chunk_count")
sep_list = list(sep_list) assert chunk_count is None or isinstance(chunk_count, six.integer_types)
else:
count = queryset.count()
if chunk_size is None:
chunk_size = count / chunk_count
index_list = list(range(0, count, chunk_size))
sep_list = [
queryset.order_by('pk').values_list('pk', flat=True)[index]
for index in index_list
]
self._model = queryset.model logging.info("duan add,before assert sep_list")
self._query = queryset.query assert sep_list is None or isinstance(sep_list, list)
self._sep_list = [None] + sep_list + [None]
logging.info("duan add,before assert chunk_size")
assert (chunk_size is not None) + (chunk_count is not None) + (sep_list is not None) == 1
logging.info("duan add,after assert chunk_size")
logging.info("duan add,sep_list:%s" % str(sep_list))
if sep_list is not None:
sep_list = list(sep_list)
else:
count = queryset.count()
if chunk_size is None:
chunk_size = count / chunk_count
index_list = list(range(0, count, chunk_size))
sep_list = [
queryset.order_by('pk').values_list('pk', flat=True)[index]
for index in index_list
]
logging.info("duan add,queryset count:%d" % count)
self._model = queryset.model
self._query = queryset.query
self._sep_list = [None] + sep_list + [None]
except:
logging.error("catch exception,err_msg:%s" % traceback.format_exc())
def chunks(self): def chunks(self):
reversed_sep_list = list(reversed(self._sep_list)) try:
for i in range(len(self._sep_list) - 1): reversed_sep_list = list(reversed(self._sep_list))
pk_start = reversed_sep_list[i+1] logging.info("duan add,reversed_sep_list:%d" % (len(self._sep_list) - 1))
pk_stop = reversed_sep_list[i] for i in range(len(self._sep_list) - 1):
yield TableSlicerChunk(model=self._model, query=self._query, pk_start=pk_start, pk_stop=pk_stop) pk_start = reversed_sep_list[i + 1]
pk_stop = reversed_sep_list[i]
yield TableSlicerChunk(model=self._model, query=self._query, pk_start=pk_start, pk_stop=pk_stop)
except:
logging.error("catch exception,err_msg:%s" % traceback.format_exc())
class TableStreamingSlicer(object): class TableStreamingSlicer(object):
......
...@@ -240,24 +240,24 @@ class TopicUtils(object): ...@@ -240,24 +240,24 @@ class TopicUtils(object):
{"term": {"has_image":True}}, {"term": {"has_image":True}},
{"term": {"is_online": True}}, {"term": {"is_online": True}},
{"term": {"is_deleted": False}} {"term": {"is_deleted": False}}
], ]
"should": [ # "should": [
{ # {
"bool":{ # "bool":{
"must":[ # "must":[
{"term":{"has_image":True}}, # {"term":{"has_image":True}},
{"term": {"has_video": False}} # {"term": {"has_video": False}}
] # ]
} # }
},{ # },{
"bool":{ # "bool":{
"must":{ # "must":{
"term":{"has_video":True} # "term":{"has_video":True}
} # }
} # }
} # }
], # ],
"minimum_should_match":1 # "minimum_should_match":1
} }
}, },
"score_mode": "sum", "score_mode": "sum",
......
...@@ -116,8 +116,10 @@ class Command(BaseCommand): ...@@ -116,8 +116,10 @@ class Command(BaseCommand):
type_info = get_type_info_map()[type_name] type_info = get_type_info_map()[type_name]
query_set = type_info.queryset query_set = type_info.queryset
logging.info("before TableSlicer")
slicer = TableSlicer(queryset=query_set, chunk_size=type_info.bulk_insert_chunk_size) slicer = TableSlicer(queryset=query_set, chunk_size=type_info.bulk_insert_chunk_size)
for chunk in slicer.chunks(): for chunk in slicer.chunks():
logging.info("in chunks....")
job = Job( job = Job(
sub_index_name=type_name, sub_index_name=type_name,
type_name=type_name, type_name=type_name,
......
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import, print_function
from django.conf import settings from django.conf import settings
from django.core.management.base import BaseCommand, CommandError from django.core.management.base import BaseCommand, CommandError
...@@ -54,7 +55,7 @@ class Topic(models.Model): ...@@ -54,7 +55,7 @@ class Topic(models.Model):
Group, verbose_name=u"关联的小组", related_name=u"group_topics", null=True, blank=True, default=None, Group, verbose_name=u"关联的小组", related_name=u"group_topics", null=True, blank=True, default=None,
on_delete=models.CASCADE) on_delete=models.CASCADE)
user_id = models.IntegerField(verbose_name=u'用户ID') user_id = models.IntegerField(verbose_name=u'用户ID')
has_video = models.IntegerField(verbose_name=u'是否是视频日记') has_video = models.BooleanField(verbose_name=u'是否是视频日记')
drop_score = models.IntegerField(verbose_name=u'人工赋分', default=0) drop_score = models.IntegerField(verbose_name=u'人工赋分', default=0)
description = models.CharField(verbose_name=u'日记本描述', max_length=200) description = models.CharField(verbose_name=u'日记本描述', max_length=200)
content = models.CharField(verbose_name=u'日记本内容', max_length=1000) content = models.CharField(verbose_name=u'日记本内容', max_length=1000)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment