# gm_feed_cold_start.py
import redis
import json
from es_tool import es_query
from tool import get_data_by_mysql


def get_all_city_id():
    """Fetch every distinct city tag id from the ``api_city`` table.

    Returns the list of tag ids with a sentinel ``-1`` appended to
    represent requests that carry no city.
    """
    sql = "select distinct tag_id from api_city"
    rows = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
    # -1 is the sentinel for "user has no city"
    return [row["tag_id"] for row in rows] + [-1]


def search_diary_by_match_phrase(tag_names, city_tag_id, version=0):
    """Return up to 200 online diary ids phrase-matching any of *tag_names*.

    Args:
        tag_names: iterable of tag names; a diary matches when its ``tags``
            field phrase-matches at least one of them.
        city_tag_id: city tag id fed to the script-based sort that boosts
            diaries from the user's own city; ``-1`` means "no city" and
            skips that sort stage entirely.
        version: falsy for the baseline ranking (``offline_score``), truthy
            for the grey/experiment ranking (``offline_score_v1``). Defaults
            to 0 for consistency with ``search_topic_by_match_phrase`` and
            ``search_qa_by_match_phrase``.

    Returns:
        List of diary ids (at most 200), best-ranked first.
    """
    if city_tag_id == -1:
        sort_list = []
    else:
        # Script-based sort stage: boosts diaries whose city matches the user's.
        sort_list = [
            {'_script': {
                'lang': 'groovy',
                'script_file': 'sort_diary-recommend',
                'type': 'number',
                'params': {
                    'user_city_tag_id': city_tag_id,
                },
                'order': 'desc',
                '_cache': True,
            }}
        ]
    # The baseline and grey rankings differ only in the offline-score field.
    score_field = 'offline_score_v1' if version else 'offline_score'
    sort_list += [
        {'has_video_cover': {'order': 'asc'}},
        {score_field: {'order': 'desc'}},
        {"good_click": {"order": "desc"}},
        {'last_update_time': {'order': 'desc'}}
    ]
    q = {
        'query': {"bool": {
            # Only surface online, non-sunk diaries with full cover imagery
            # at an acceptable content level.
            "filter": [{"term": {"is_online": True}},
                       {"term": {"has_cover": True}}, {"term": {"is_sink": False}},
                       {"term": {"has_after_cover": True}}, {"term": {"has_before_cover": True}},
                       {"terms": {"content_level": [5, 4, 3.5, 3]}}],
            "should": [{"match_phrase": {"tags": {"query": name}}} for name in tag_names],
            "minimum_should_match": 1,
            "must_not": [{"term": {"is_operate": True}}],
        }},
        'sort': sort_list,
        '_source': {"includes": ["id"]},
    }
    es_res = es_query("diary", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


def search_topic_by_match_phrase(tag_names, version=0):
    """Return up to 200 online tractate (topic) ids whose tag fields
    phrase-match any of *tag_names*, best-ranked first.

    ``version`` picks the ranking: truthy sorts the grey queue by
    ``good_click_tractate_score``; falsy uses the baseline ``tractate_score``.
    """
    should_clauses = []
    for name in tag_names:
        # Each tag may match either the fresh tag-name field or the
        # tag-name-in-content field (indexed with the gm analyzer).
        should_clauses.append({
            "match_phrase": {
                "fresh_tractate_tag_name": {
                    "query": name
                }
            }
        })
        should_clauses.append({
            "match_phrase": {
                "tractate_tag_name_content": {
                    "query": name,
                    'analyzer': 'gm_default_index'
                }
            }
        })
    score_sort = (
        {"good_click_tractate_score": {"order": "desc"}}
        if version
        else {"tractate_score": {"order": "desc"}}
    )
    q = {
        'query': {"bool": {
            "filter": [{"term": {"is_online": True}}, {"terms": {"content_level": [5, 4, 3.5, 3]}}],
            "should": should_clauses,
            "minimum_should_match": 1}},
        "_source": {"includes": ["id"]},
        "sort": [
            {"is_video": {"order": "asc"}},
            score_sort,
            {"good_click": {"order": "desc"}}
        ],
    }
    es_res = es_query("tractate", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


def search_qa_by_match_phrase(tag_names, version=0):
    """Return up to 200 online answer (QA) ids whose ``tag_name``
    phrase-matches any of *tag_names*, best-ranked first.

    ``version`` picks the ranking: truthy sorts the grey queue by
    ``smart_rank_v3``; falsy uses the baseline ``smart_rank_v2``.
    """
    # Baseline and grey differ only in which smart-rank field is sorted on.
    rank_field = 'smart_rank_v3' if version else 'smart_rank_v2'
    should_clauses = [
        {"match_phrase": {"tag_name": {"query": name}}}
        for name in tag_names
    ]
    q = {
        'query': {"bool": {
            # Only online answers of sufficient length and content level.
            "filter": [{"range": {"content_length": {"gte": 30}}},
                       {"term": {"is_online": True}},
                       {"terms": {"content_level": ['5', '4', '3.5', '3']}}],
            "should": should_clauses,
            "minimum_should_match": 1}},
        "_source": {"includes": ["id"]},
        'sort': [
            {'has_picture': {'order': 'desc'}},
            {rank_field: {'order': 'desc'}},
            {"good_click": {"order": "desc"}}
        ],
    }
    es_res = es_query("answer", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


if __name__ == "__main__":

    # Fetch every distinct city tag id (with -1 appended for "no city").
    all_city_tag_id = get_all_city_id()

    # Drop the previous queue contents before rebuilding them.
    # NOTE(review): redis credentials are hard-coded here — consider moving to config.
    redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')
    # hot_search_word_topic_queue_key = "coldstart:hot:search:word:topic:queue"
    # hot_search_word_qa_queue_key = "coldstart:hot:search:word:qa:queue"
    # hot_search_word_diary_queue_key = "coldstart:hot:search:word:diary:queue"
    light_clinic_beauty_topic_queue_key = "coldstart:light:clinic:beauty:topic:queue"
    light_clinic_beauty_topic_queue_grey_key = "coldstart:light:clinic:beauty:topic:queue:grey"
    light_clinic_beauty_qa_queue_key = "coldstart:light:clinic:beauty:qa:queue"
    light_clinic_beauty_qa_queue_grey_key = "coldstart:light:clinic:beauty:qa:queue:grey"
    light_clinic_beauty_diary_queue_key = "coldstart:light:clinic:beauty:diary:queue"
    light_clinic_beauty_diary_queue_grey_key = "coldstart:light:clinic:beauty:diary:queue:grey"
    card_types = ['topic', 'qa']  # diary data spans 400+ cities and is stored via hmset, so it is not reset here
    # word_refers = ['coldstart:hot:search:word', 'coldstart:light:clinic:beauty']
    word_refers = ['coldstart:light:clinic:beauty']
    for card_type in card_types:
        for word_refer in word_refers:
            # Delete both the normal and the grey (experiment) list per card type.
            key = word_refer + ':' + card_type + ':' + 'queue'
            redis_client.delete(key)
            key = word_refer + ':' + card_type + ':' + 'queue:grey'
            redis_client.delete(key)

    # Disabled: hot-search-word candidate queues (kept for reference).
    # hot_search_word_key = "user:service_coldstart_tags2_name"
    # hot_search_word = redis_client.hgetall(hot_search_word_key)
    # hot_search_word = [str(tag, 'utf-8') for tag in hot_search_word]
    # hot_search_word_diary_queue = dict()
    #
    # hot_search_word_topic_queue = search_topic_by_match_phrase(hot_search_word)
    # redis_client.rpush(hot_search_word_topic_queue_key, *hot_search_word_topic_queue)
    # # redis_client.lrange(hot_search_word_topic_queue_key, 0, 3)
    # print("热搜词更新的帖子队列长度:%s" % str(len(hot_search_word_topic_queue)))
    #
    # hot_search_word_qa_queue = search_qa_by_match_phrase(hot_search_word)
    # redis_client.rpush(hot_search_word_qa_queue_key, *hot_search_word_qa_queue)
    # print("热搜词更新的问答队列长度:%s" % str(len(hot_search_word_qa_queue)))
    # # redis_client.lrange(hot_search_word_qa_queue_key, 0, 3)
    #
    # for city_tag_id in all_city_tag_id:
    #     diary_queue = search_diary_by_match_phrase(hot_search_word, city_tag_id)
    #     hot_search_word_diary_queue.update({city_tag_id: json.dumps(diary_queue)})
    # redis_client.hmset(hot_search_word_diary_queue_key, hot_search_word_diary_queue)
    # print("热搜词更新的日记队列长度:%s" % str(len(diary_queue)))

    # Candidate queues for the "light clinic beauty" (轻医美) tag set.
    light_clinic_beauty_key = "user:service_coldstart_tags3"
    # hgetall returns a dict; iterating it below yields the hash *field names*
    # only — presumably those fields are the tag names. TODO confirm.
    light_clinic_beauty = redis_client.hgetall(light_clinic_beauty_key)
    light_clinic_beauty = [str(tag, 'utf-8') for tag in light_clinic_beauty]
    light_clinic_beauty_diary_queue = dict()

    # Build and store the topic queues (baseline + grey experiment ranking).
    light_clinic_beauty_topic_queue = search_topic_by_match_phrase(light_clinic_beauty)
    light_clinic_beauty_topic_grey_queue = search_topic_by_match_phrase(light_clinic_beauty, version=1)
    redis_client.rpush(light_clinic_beauty_topic_queue_key, *light_clinic_beauty_topic_queue)
    redis_client.rpush(light_clinic_beauty_topic_queue_grey_key, *light_clinic_beauty_topic_grey_queue)
    print("轻医美词更新的帖子队列长度:%s" % str(len(light_clinic_beauty_topic_queue)))
    print("轻医美词更新的灰度帖子队列长度:%s" % str(len(light_clinic_beauty_topic_grey_queue)))

    # Build and store the QA queues (baseline + grey experiment ranking).
    light_clinic_beauty_qa_queue = search_qa_by_match_phrase(light_clinic_beauty)
    light_clinic_beauty_qa_grey_queue = search_qa_by_match_phrase(light_clinic_beauty, version=1)
    redis_client.rpush(light_clinic_beauty_qa_queue_key, *light_clinic_beauty_qa_queue)
    redis_client.rpush(light_clinic_beauty_qa_queue_grey_key, *light_clinic_beauty_qa_grey_queue)
    print("轻医美词更新的问答队列长度:%s" % str(len(light_clinic_beauty_qa_queue)))
    print("轻医美词更新的灰度问答队列长度:%s" % str(len(light_clinic_beauty_qa_grey_queue)))

    # Diary queues: one JSON-encoded list per city, stored in a single hash.
    # NOTE(review): hmset is deprecated in redis-py in favor of hset(mapping=...).
    for city_tag_id in all_city_tag_id:
        diary_queue = search_diary_by_match_phrase(light_clinic_beauty, city_tag_id, version=0)
        light_clinic_beauty_diary_queue.update({city_tag_id: json.dumps(diary_queue)})
    redis_client.hmset(light_clinic_beauty_diary_queue_key, light_clinic_beauty_diary_queue)
    print("轻医美词更新的日记队列长度:%s" % str(len(diary_queue)))

    # Same per-city diary queues, grey (experiment) ranking.
    light_clinic_beauty_diary_queue = dict()
    for city_tag_id in all_city_tag_id:
        diary_queue = search_diary_by_match_phrase(light_clinic_beauty, city_tag_id, version=1)
        light_clinic_beauty_diary_queue.update({city_tag_id: json.dumps(diary_queue)})
    redis_client.hmset(light_clinic_beauty_diary_queue_grey_key, light_clinic_beauty_diary_queue)
    print("轻医美词更新的灰度日记队列长度:%s" % str(len(diary_queue)))