import json

import redis

from es_tool import es_query
from tool import get_data_by_mysql


def get_all_city_id():
    """Return every distinct city tag id from MySQL, plus -1.

    The -1 sentinel represents users that have no city assigned, so a
    city-agnostic diary queue is always built as well.

    Returns:
        list[int]: distinct ``tag_id`` values from ``api_city``, with -1 appended.
    """
    # NOTE(review): DB host/credentials are hard-coded; move to config/secrets.
    sql = "select distinct tag_id from api_city"
    mysql_results = get_data_by_mysql(
        '172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
    city_tag_ids = [row["tag_id"] for row in mysql_results]
    city_tag_ids.append(-1)  # sentinel: users without a city
    return city_tag_ids


def search_diary_by_match_phrase(tag_names, city_tag_id, version):
    """Search the ES "diary" index for online diaries matching any tag phrase.

    Args:
        tag_names: iterable of tag names; a diary matches if any one of them
            phrase-matches its ``tags`` field.
        city_tag_id: city to boost via the groovy sort script; -1 disables
            the city-aware sort entirely.
        version: truthy selects the ``offline_score_v1`` ranking field,
            falsy the legacy ``offline_score`` field.

    Returns:
        list: up to 200 diary ids, best-ranked first.
    """
    q = dict()
    if city_tag_id == -1:
        sort_list = []
    else:
        # Server-side script file ranks diaries for the user's city first.
        sort_list = [
            {'_script': {
                'lang': 'groovy',
                'script_file': 'sort_diary-recommend',
                'type': 'number',
                'params': {
                    'user_city_tag_id': city_tag_id,
                },
                'order': 'desc',
                '_cache': True,
            }}
        ]
    # The two version branches previously duplicated the whole sort list and
    # differed only in this one field — keep them from drifting apart.
    score_field = 'offline_score_v1' if version else 'offline_score'
    sort_list += [
        {'is_low_quality': {'order': 'asc'}},
        {'has_video_cover': {'order': 'asc'}},
        {"good_click": {"order": "desc"}},
        {score_field: {'order': 'desc'}},
        {'last_update_time': {'order': 'desc'}},
    ]
    total_query_should_list = [
        {"match_phrase": {"tags": {"query": tag_name}}}
        for tag_name in tag_names
    ]
    q['query'] = {"bool": {
        "filter": [{"term": {"is_online": True}},
                   {"term": {"has_cover": True}},
                   {"term": {"is_sink": False}},
                   {"term": {"has_after_cover": True}},
                   {"term": {"has_before_cover": True}},
                   {"terms": {"content_level": [5, 4, 3.5, 3]}}],
        "should": total_query_should_list,
        "minimum_should_match": 1}}
    q['query']['bool']['must_not'] = [{"term": {"is_operate": True}}]
    q['sort'] = sort_list
    q["_source"] = {"includes": ["id"]}
    es_res = es_query("diary", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


def search_topic_by_match_phrase(tag_names):
    """Search the ES "tractate" index for online topics matching any tag phrase.

    Each tag is matched against both ``fresh_tractate_tag_name`` and
    ``tractate_tag_name_content`` (the latter with the ``gm_default_index``
    analyzer); one hit on either field qualifies the topic.

    Returns:
        list: up to 200 topic ids, non-video first, then by click and score.
    """
    q = dict()
    total_query_should_list = []
    for tag_name in tag_names:
        total_query_should_list.extend([
            {"match_phrase": {"fresh_tractate_tag_name": {"query": tag_name}}},
            {"match_phrase": {"tractate_tag_name_content": {
                "query": tag_name,
                'analyzer': 'gm_default_index',
            }}},
        ])
    q['query'] = {"bool": {
        "filter": [{"term": {"is_online": True}},
                   {"terms": {"content_level": [5, 4, 3.5, 3]}}],
        "should": total_query_should_list,
        "minimum_should_match": 1}}
    q["_source"] = {"includes": ["id"]}
    q["sort"] = [
        {"is_video": {"order": "asc"}},
        {"good_click": {"order": "desc"}},
        {"tractate_score": {"order": "desc"}},
    ]
    es_res = es_query("tractate", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


def search_qa_by_match_phrase(tag_names, version=0):
    """Search the ES "answer" index for online answers matching any tag phrase.

    Args:
        tag_names: iterable of tag names matched against ``tag_name``.
        version: truthy ranks by ``smart_rank_v2`` (grey/experiment queue),
            falsy by the legacy ``smart_rank``.

    Returns:
        list: up to 200 answer ids, pictures first, then by click and rank.
    """
    # As in search_diary_by_match_phrase: only the rank field varies by version.
    rank_field = 'smart_rank_v2' if version else 'smart_rank'
    sort_list = [
        {'has_picture': {'order': 'desc'}},
        {"good_click": {"order": "desc"}},
        {rank_field: {'order': 'desc'}},
    ]
    q = dict()
    total_query_should_list = [
        {"match_phrase": {"tag_name": {"query": tag_name}}}
        for tag_name in tag_names
    ]
    q['query'] = {"bool": {
        # NOTE(review): content_level values are strings here but numbers in
        # the diary/topic queries — presumably the "answer" mapping differs;
        # confirm against the index mapping.
        "filter": [{"range": {"content_length": {"gte": 30}}},
                   {"term": {"is_online": True}},
                   {"terms": {"content_level": ['5', '4', '3.5', '3']}}],
        "should": total_query_should_list,
        "minimum_should_match": 1}}
    q["_source"] = {"includes": ["id"]}
    q['sort'] = sort_list
    es_res = es_query("answer", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]


def _store_city_diary_queues(redis_client, redis_key, tag_names, city_tag_ids, version):
    """Build one diary queue per city and store them all in a single redis hash.

    Each hash field is a city tag id; each value is the JSON-encoded diary id
    list for that city. The hash is written once after all queries finish
    (the original wrote the whole growing mapping on every iteration).
    """
    queue_by_city = dict()
    for city_tag_id in city_tag_ids:
        diary_queue = search_diary_by_match_phrase(tag_names, city_tag_id, version=version)
        queue_by_city[city_tag_id] = json.dumps(diary_queue)
        print("轻医美词更新的日记队列长度:%s" % str(len(diary_queue)))
    if queue_by_city:
        # NOTE(review): hmset is deprecated in newer redis-py; consider
        # hset(redis_key, mapping=...) once the client version allows it.
        redis_client.hmset(redis_key, queue_by_city)


if __name__ == "__main__":
    # All city tag ids (including the -1 "no city" sentinel).
    all_city_tag_id = get_all_city_id()

    # NOTE(review): redis credentials are hard-coded; move to config/secrets.
    redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')

    # (The hot-search-word flow that used to live here was disabled; its
    # keys were coldstart:hot:search:word:{topic,qa,diary}:queue.)
    light_clinic_beauty_topic_queue_key = "coldstart:light:clinic:beauty:topic:queue"
    light_clinic_beauty_qa_queue_key = "coldstart:light:clinic:beauty:qa:queue"
    light_clinic_beauty_qa_queue_grey_key = "coldstart:light:clinic:beauty:qa:queue:grey"
    light_clinic_beauty_diary_queue_key = "coldstart:light:clinic:beauty:diary:queue"
    light_clinic_beauty_diary_queue_key2 = "coldstart:light:clinic:beauty:diary:queue:offline:score:v1"

    # Reset the list-type queues before repopulating. Diary queues are
    # per-city hashes (~400 cities) written via hmset, so they are not reset.
    card_types = ['topic', 'qa']
    word_refers = ['coldstart:light:clinic:beauty']
    for card_type in card_types:
        for word_refer in word_refers:
            redis_client.delete(word_refer + ':' + card_type + ':' + 'queue')

    # Candidate tag names for the light-clinic-beauty cold-start queues.
    # hgetall yields the hash *keys* as bytes — presumably the tag names;
    # decode them to str.
    light_clinic_beauty_key = "user:service_coldstart_tags3"
    light_clinic_beauty = [
        str(tag, 'utf-8') for tag in redis_client.hgetall(light_clinic_beauty_key)
    ]

    # Topic queue. Guard against an empty ES result: rpush with no values
    # would raise a redis error.
    light_clinic_beauty_topic_queue = search_topic_by_match_phrase(light_clinic_beauty)
    if light_clinic_beauty_topic_queue:
        redis_client.rpush(light_clinic_beauty_topic_queue_key, *light_clinic_beauty_topic_queue)
    print("轻医美词更新的帖子队列长度:%s" % str(len(light_clinic_beauty_topic_queue)))

    # QA queues: production ranking and the grey (smart_rank_v2) variant.
    light_clinic_beauty_qa_queue = search_qa_by_match_phrase(light_clinic_beauty)
    light_clinic_beauty_qa_grey_queue = search_qa_by_match_phrase(light_clinic_beauty, version=1)
    if light_clinic_beauty_qa_queue:
        redis_client.rpush(light_clinic_beauty_qa_queue_key, *light_clinic_beauty_qa_queue)
    if light_clinic_beauty_qa_grey_queue:
        redis_client.rpush(light_clinic_beauty_qa_queue_grey_key, *light_clinic_beauty_qa_grey_queue)
    print("轻医美词更新的问答队列长度:%s" % str(len(light_clinic_beauty_qa_queue)))

    # Per-city diary queues for both ranking versions (was two copy-pasted loops).
    _store_city_diary_queues(redis_client, light_clinic_beauty_diary_queue_key,
                             light_clinic_beauty, all_city_tag_id, version=0)
    _store_city_diary_queues(redis_client, light_clinic_beauty_diary_queue_key2,
                             light_clinic_beauty, all_city_tag_id, version=1)