Merge branch 'similar_sort' into 'master'

Similar sort. See merge request !153.

Merge branch 'similar_sort' into 'master'
Similar sort. See merge request !153.
2d7a880c · 段英荣 · 236b7237 · e51ef4f3 · 2d7a880c · 2d7a880c
Commit 2d7a880c authored Mar 11, 2019 by 段英荣
4 changed files
--- a/libs/table_scan.py
+++ b/libs/table_scan.py
@@ -5,6 +5,8 @@ from __future__ import unicode_literals, print_function, absolute_import
 import six
 import random
 from django.db import models
+import logging
+import traceback


 class ITableChunk(object):
@@ -147,36 +149,55 @@ class TableSlicerChunk(ITableChunk):
 class TableSlicer(object):

    def __init__(self, queryset, chunk_size=None, chunk_count=None, sep_list=None):
-        assert isinstance(queryset, models.QuerySet)
-        assert chunk_size is None or isinstance(chunk_size, six.integer_types)
-        assert chunk_count is None or isinstance(chunk_count, six.integer_types)
-        assert sep_list is None or isinstance(sep_list, list)
+        try:
+            logging.info("duan add,before assert queryset")
+            assert isinstance(queryset, models.QuerySet)

-        assert (chunk_size is not None) + (chunk_count is not None) + (sep_list is not None) == 1
+            logging.info("duan add,before assert chunk_size")
+            assert chunk_size is None or isinstance(chunk_size, six.integer_types)

-        if sep_list is not None:
-            sep_list = list(sep_list)
-        else:
-            count = queryset.count()
-            if chunk_size is None:
-                chunk_size = count / chunk_count
-            index_list = list(range(0, count, chunk_size))
-            sep_list = [
-                queryset.order_by('pk').values_list('pk', flat=True)[index]
-                for index in index_list
-            ]
+            logging.info("duan add,before assert chunk_count")
+            assert chunk_count is None or isinstance(chunk_count, six.integer_types)

-        self._model = queryset.model
-        self._query = queryset.query
-        self._sep_list = [None] + sep_list + [None]
+            logging.info("duan add,before assert sep_list")
+            assert sep_list is None or isinstance(sep_list, list)
+
+            logging.info("duan add,before assert chunk_size")
+            assert (chunk_size is not None) + (chunk_count is not None) + (sep_list is not None) == 1
+            logging.info("duan add,after assert chunk_size")
+
+
+            logging.info("duan add,sep_list:%s" % str(sep_list))
+            if sep_list is not None:
+                sep_list = list(sep_list)
+            else:
+                count = queryset.count()
+                if chunk_size is None:
+                    chunk_size = count / chunk_count
+                index_list = list(range(0, count, chunk_size))
+                sep_list = [
+                    queryset.order_by('pk').values_list('pk', flat=True)[index]
+                    for index in index_list
+                ]
+
+                logging.info("duan add,queryset count:%d" % count)
+            self._model = queryset.model
+            self._query = queryset.query
+            self._sep_list = [None] + sep_list + [None]
+        except:
+            logging.error("catch exception,err_msg:%s" % traceback.format_exc())


    def chunks(self):
-        reversed_sep_list = list(reversed(self._sep_list))
-        for i in range(len(self._sep_list) - 1):
-            pk_start = reversed_sep_list[i+1]
-            pk_stop = reversed_sep_list[i]
-            yield TableSlicerChunk(model=self._model, query=self._query, pk_start=pk_start, pk_stop=pk_stop)
+        try:
+            reversed_sep_list = list(reversed(self._sep_list))
+            logging.info("duan add,reversed_sep_list:%d" % (len(self._sep_list) - 1))
+            for i in range(len(self._sep_list) - 1):
+                pk_start = reversed_sep_list[i + 1]
+                pk_stop = reversed_sep_list[i]
+                yield TableSlicerChunk(model=self._model, query=self._query, pk_start=pk_start, pk_stop=pk_stop)
+        except:
+            logging.error("catch exception,err_msg:%s" % traceback.format_exc())


 class TableStreamingSlicer(object):

--- a/search/utils/topic.py
+++ b/search/utils/topic.py
@@ -240,24 +240,24 @@ class TopicUtils(object):
                            {"term": {"has_image":True}},
                            {"term": {"is_online": True}},
                            {"term": {"is_deleted": False}}
-                        ],
-                        "should": [
-                            {
-                                "bool":{
-                                    "must":[
-                                        {"term":{"has_image":True}},
-                                        {"term": {"has_video": False}}
-                                    ]
-                                }
-                            },{
-                                "bool":{
-                                    "must":{
-                                        "term":{"has_video":True}
-                                    }
-                                }
-                            }
-                        ],
-                        "minimum_should_match":1
+                        ]
+                        # "should": [
+                        #     {
+                        #         "bool":{
+                        #             "must":[
+                        #                 {"term":{"has_image":True}},
+                        #                 {"term": {"has_video": False}}
+                        #             ]
+                        #         }
+                        #     },{
+                        #         "bool":{
+                        #             "must":{
+                        #                 "term":{"has_video":True}
+                        #             }
+                        #         }
+                        #     }
+                        # ],
+                        # "minimum_should_match":1
                    }
                },
                "score_mode": "sum",

--- a/trans2es/management/commands/trans2es_data2es_parallel.py
+++ b/trans2es/management/commands/trans2es_data2es_parallel.py
@@ -116,8 +116,10 @@ class Command(BaseCommand):
            type_info = get_type_info_map()[type_name]
            query_set = type_info.queryset

+            logging.info("before TableSlicer")
            slicer = TableSlicer(queryset=query_set, chunk_size=type_info.bulk_insert_chunk_size)
            for chunk in slicer.chunks():
+                logging.info("in chunks....")
                job = Job(
                    sub_index_name=type_name,
                    type_name=type_name,

--- a/trans2es/models/topic.py
+++ b/trans2es/models/topic.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from __future__ import unicode_literals, absolute_import, print_function

 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
@@ -54,7 +55,7 @@ class Topic(models.Model):
        Group, verbose_name=u"关联的小组", related_name=u"group_topics", null=True, blank=True, default=None,
        on_delete=models.CASCADE)
    user_id = models.IntegerField(verbose_name=u'用户ID')
-    has_video = models.IntegerField(verbose_name=u'是否是视频日记')
+    has_video = models.BooleanField(verbose_name=u'是否是视频日记')
    drop_score = models.IntegerField(verbose_name=u'人工赋分', default=0)
    description = models.CharField(verbose_name=u'日记本描述', max_length=200)
    content = models.CharField(verbose_name=u'日记本内容', max_length=1000)