Commit d72242ad authored by 高雅喆

首页feed新用户冷启动的日记、帖子、问答队列

parent 5ce85a56
from tool import es_query
from tool import get_data_by_mysql
def get_all_city_id():
    """Return every distinct ``tag_id`` from ``api_city`` plus a ``-1`` sentinel.

    Returns:
        list: the ``tag_id`` value of each row returned by the query,
        followed by ``-1``, which stands for "no city".
    """
    query = "select distinct tag_id from api_city"
    # NOTE(review): the connection arguments (presumably host, port, user,
    # password, database -- confirm against get_data_by_mysql) are hard-coded
    # here; consider moving them to configuration.
    rows = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', query)
    city_tag_ids = []
    for row in rows:
        city_tag_ids.append(row["tag_id"])
    # Sentinel for the case where no city is available.
    city_tag_ids.append(-1)
    return city_tag_ids
def search_diary_by_match_phrase(tag_names, city_pinyin, tag_ids=None):
    """Search the ``diary`` index for diaries whose ``tags`` phrase-match a tag name.

    Args:
        tag_names: iterable of tag names; each becomes a ``match_phrase``
            clause on the ``tags`` field, and at least one must match.
        city_pinyin: city tag id passed to the ``sort_diary-recommend`` sort
            script as ``user_city_tag_id``. ``-1`` means "no city" and skips
            the script sort entirely.
        tag_ids: optional list of values for an extra ``terms`` filter on
            ``tags``. The original body referenced an unbound name
            ``tag_ids`` here, which raised ``NameError`` on every call; the
            filter is now applied only when the caller supplies a value.

    Returns:
        list: the ``id`` of each hit (at most 200 are requested), in the
        order returned by Elasticsearch.
    """
    if city_pinyin == -1:
        sort_list = []
    else:
        sort_list = [
            {'_script': {
                # Must be a string literal: the original bare name `groovy`
                # raised NameError whenever a city was supplied.
                'lang': 'groovy',
                'script_file': 'sort_diary-recommend',
                'type': 'number',
                'params': {
                    'user_city_tag_id': city_pinyin,
                },
                'order': 'desc',
                '_cache': True,
            }}
        ]
    # Secondary sort keys, applied after the optional script sort.
    sort_list += [
        {'has_video_cover': {'order': 'asc'}},
        {"good_click": {"order": "desc"}},
        {'offline_score': {'order': 'desc'}},
        {'last_update_time': {'order': 'desc'}},
    ]

    total_query_should_list = [
        {"match_phrase": {"tags": {"query": tag_name}}}
        for tag_name in tag_names
    ]

    filter_list = [
        {"term": {"is_online": True}},
        {"term": {"has_cover": True}},
        {"term": {"is_sink": False}},
        {"term": {"has_after_cover": True}},
        {"term": {"has_before_cover": True}},
        {"terms": {"content_level": [5, 4, 3.5, 3]}},
    ]
    if tag_ids is not None:
        # Keep the clause in the position it had in the original filter list.
        filter_list.insert(1, {"terms": {"tags": tag_ids}})

    q = {
        'query': {"bool": {
            "filter": filter_list,
            "should": total_query_should_list,
            "minimum_should_match": 1,
            "must_not": [{"term": {"is_operate": True}}],
        }},
        'sort': sort_list,
        "_source": {"includes": ["id"]},
    }
    es_res = es_query("diary", q, offset=0, size=200)
    return [diary_info['_source']['id'] for diary_info in es_res['hits']['hits']]
def search_topic_by_match_phrase(tag_names):
    """Search the ``tractate`` index for posts that phrase-match a tag name.

    Each tag name is matched against both ``fresh_tractate_tag_name`` and
    ``tractate_tag_name_content``; a document qualifies when at least one
    clause matches and it passes the ``is_online`` / ``content_level`` filters.

    Args:
        tag_names: iterable of tag names to match.

    Returns:
        list: the ``id`` of each hit (at most 200 are requested), in the
        order returned by Elasticsearch.
    """
    matched_fields = ("fresh_tractate_tag_name", "tractate_tag_name_content")
    phrase_clauses = []
    for name in tag_names:
        # Two clauses per tag name, one for each searched field.
        for field in matched_fields:
            phrase_clauses.append({"match_phrase": {field: {"query": name}}})

    body = {
        'query': {"bool": {
            "filter": [
                {"term": {"is_online": True}},
                {"terms": {"content_level": [5, 4, 3.5, 3]}},
            ],
            "should": phrase_clauses,
            "minimum_should_match": 1,
        }},
        "_source": {"includes": ["id"]},
        "sort": [
            {"is_video": {"order": "asc"}},
            {"good_click": {"order": "desc"}},
            {"tractate_score": {"order": "desc"}},
        ],
    }
    response = es_query("tractate", body, offset=0, size=200)
    return [hit['_source']['id'] for hit in response['hits']['hits']]
def search_qa_by_match_phrase(tag_names):
    """Search the ``answer`` index for answers whose ``tag_name`` phrase-matches.

    A document qualifies when at least one tag name matches and it passes the
    ``content_length >= 30``, ``is_online`` and ``content_level`` filters.

    Args:
        tag_names: iterable of tag names to match.

    Returns:
        list: the ``id`` of each hit (at most 200 are requested), in the
        order returned by Elasticsearch.
    """
    phrase_clauses = [
        {"match_phrase": {"tag_name": {"query": name}}}
        for name in tag_names
    ]
    body = {
        'query': {"bool": {
            "filter": [
                {"range": {"content_length": {"gte": 30}}},
                {"term": {"is_online": True}},
                # Content levels are passed as strings here, as in the original.
                {"terms": {"content_level": ['5', '4', '3.5', '3']}},
            ],
            "should": phrase_clauses,
            "minimum_should_match": 1,
        }},
        "_source": {"includes": ["id"]},
        'sort': [
            {'has_picture': {'order': 'desc'}},
            {"good_click": {"order": "desc"}},
            {'smart_rank': {'order': 'desc'}},
        ],
    }
    response = es_query("answer", body, offset=0, size=200)
    return [hit['_source']['id'] for hit in response['hits']['hits']]
if __name__ == "__main__":
    # Fetch every city tag id (including the -1 "no city" sentinel).
    all_city_id = get_all_city_id()

    # Candidate queues for hot-search words: diaries, posts, Q&A.
    hot_search_word_diary_queue = []
    hot_search_word_topic_queue = []
    hot_search_word_qa_queue = []

    # Candidate queues for light medical-aesthetics content: diaries, posts, Q&A.
    light_clinic_beauty_diary_queue = []
    light_clinic_beauty_topic_queue = []
    light_clinic_beauty_qa_queue = []
......@@ -13,6 +13,7 @@ import time
import json
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch as Es
def send_email(app,id,e):
......@@ -347,3 +348,43 @@ def get_user_log(cl_id, all_word_tags, pay_time=0, debug=0):
except:
print("error2_user_portrait", traceback.format_exc())
return user_df_service
def get_es():
    """Create and return a new Elasticsearch client for the three fixed hosts.

    Sniffing is disabled both at start-up and on connection failure.
    """
    node_ips = ('172.16.31.17', '172.16.31.11', '172.16.31.13')
    hosts = [{'host': ip, 'port': 9000} for ip in node_ips]
    return Es(
        hosts=hosts,
        sniff_on_start=False,
        sniff_on_connection_fail=False,
    )
def es_index_adapt(index_prefix, doc_type, rw=None):
    """Build the index name ``<index_prefix>-<doc_type>[-<rw>]``.

    Args:
        index_prefix: leading part of the index name.
        doc_type: document type, appended after the prefix.
        rw: ``None``, ``'read'`` or ``'write'``; when given it is appended
            as a final suffix. Any other value fails the assertion.

    Returns:
        str: the parts joined with ``-``.
    """
    assert rw in [None, 'read', 'write']
    name_parts = [index_prefix, doc_type]
    if rw:
        name_parts.append(rw)
    return '-'.join(name_parts)
def es_query(doc, body, offset, size, es=None):
    """Run a search against the read index ``gm-dbmw-<doc>-read``.

    Args:
        doc: document type; used both to build the index name and as
            ``doc_type`` in the search call.
        body: query body passed through to the client.
        offset: index of the first hit to return (sent as ``from_``).
        size: maximum number of hits to request.
        es: optional client; a new one is created with ``get_es()`` when
            omitted.

    Returns:
        The response of the client's ``search`` call.
    """
    client = get_es() if es is None else es
    read_index = es_index_adapt(index_prefix='gm-dbmw', doc_type=doc, rw='read')
    return client.search(
        index=read_index,
        doc_type=doc,
        timeout='10s',
        body=body,
        from_=offset,
        size=size,
    )
def es_mquery(doc, body, es=None):
    """Run a multi-search against the read index ``gm-dbmw-<doc>-read``.

    Args:
        doc: document type used to build the index name.
        body: multi-search body passed through to the client.
        es: optional client; a new one is created with ``get_es()`` when
            omitted.

    Returns:
        The response of the client's ``msearch`` call.
    """
    client = get_es() if es is None else es
    read_index = es_index_adapt(index_prefix='gm-dbmw', doc_type=doc, rw='read')
    return client.msearch(body, index=read_index)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment