首页feed新用户冷启动的日记、帖子、问答队列

d72242ad · 高雅喆 · 5ce85a56 · d72242ad · d72242ad
Commit d72242ad authored Nov 11, 2019 by 高雅喆
Show whitespace changes
Inline Side-by-side

Showing with 187 additions and 0 deletions

gm_feed_cold_start.py eda/smart_rank/gm_feed_cold_start.py +146 -0

tool.py eda/smart_rank/tool.py +41 -0

No files found.
--- a/eda/smart_rank/gm_feed_cold_start.py
+++ b/eda/smart_rank/gm_feed_cold_start.py
+from tool import es_query
+from tool import get_data_by_mysql
+
+
+def get_all_city_id():
+    """Return every distinct ``tag_id`` from ``api_city`` plus a ``-1`` sentinel.
+
+    The trailing ``-1`` stands for the "no city" case.
+    """
+    # NOTE(review): the connection credentials are hard-coded in source;
+    # consider moving them to configuration.
+    query = "select distinct tag_id from api_city"
+    rows = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', query)
+    # -1 is appended last to represent "no city".
+    return [row["tag_id"] for row in rows] + [-1]
+
+
+def search_diary_by_match_phrase(tag_names, city_pinyin, tag_ids=None):
+    """Query the ``diary`` index for diaries whose tags match any given phrase.
+
+    Args:
+        tag_names: Iterable of tag names. Each one becomes a ``match_phrase``
+            clause on ``tags``; at least one clause must match.
+        city_pinyin: City tag id handed to the ``sort_diary-recommend`` sort
+            script as ``user_city_tag_id``. ``-1`` means "no city" and skips
+            the script sort entirely.
+        tag_ids: Optional non-empty list of tag ids. When given, an extra
+            ``terms`` filter on ``tags`` is added. The filter is omitted by
+            default because the function previously referenced an undefined
+            ``tag_ids`` name and raised ``NameError`` on every call.
+
+    Returns:
+        List of diary ids (at most 200 are requested), in result order.
+    """
+    if city_pinyin == -1:
+        sort_list = []
+    else:
+        sort_list = [
+            {'_script': {
+                # Must be a string literal; a bare ``groovy`` name is undefined.
+                'lang': 'groovy',
+                'script_file': 'sort_diary-recommend',
+                'type': 'number',
+                'params': {
+                    'user_city_tag_id': city_pinyin,
+                },
+                'order': 'desc',
+                '_cache': True,
+            }}
+        ]
+    sort_list += [
+        {'has_video_cover': {'order': 'asc'}},
+        {"good_click": {"order": "desc"}},
+        {'offline_score': {'order': 'desc'}},
+        {'last_update_time': {'order': 'desc'}}
+    ]
+
+    should_clauses = [
+        {"match_phrase": {"tags": {"query": tag_name}}}
+        for tag_name in tag_names
+    ]
+
+    filters = [{"term": {"is_online": True}}]
+    if tag_ids:
+        # Only restrict by tag ids when the caller actually supplies some.
+        filters.append({"terms": {"tags": tag_ids}})
+    filters += [
+        {"term": {"has_cover": True}}, {"term": {"is_sink": False}},
+        {"term": {"has_after_cover": True}}, {"term": {"has_before_cover": True}},
+        {"terms": {"content_level": [5, 4, 3.5, 3]}},
+    ]
+
+    q = {
+        'query': {"bool": {
+            "filter": filters,
+            "should": should_clauses,
+            "minimum_should_match": 1,
+            'must_not': [{"term": {"is_operate": True}}],
+        }},
+        'sort': sort_list,
+        "_source": {"includes": ["id"]},
+    }
+    es_res = es_query("diary", q, offset=0, size=200)
+    return [hit['_source']['id'] for hit in es_res['hits']['hits']]
+
+
+def search_topic_by_match_phrase(tag_names):
+    """Query the ``tractate`` index for posts matching any of the tag names.
+
+    Each tag name is phrase-matched against both ``fresh_tractate_tag_name``
+    and ``tractate_tag_name_content``; at least one clause must match.
+
+    Args:
+        tag_names: Iterable of tag names to phrase-match.
+
+    Returns:
+        List of post ids (at most 200 are requested), in result order.
+    """
+    matched_fields = ("fresh_tractate_tag_name", "tractate_tag_name_content")
+    should_clauses = [
+        {"match_phrase": {field: {"query": tag_name}}}
+        for tag_name in tag_names
+        for field in matched_fields
+    ]
+    q = {
+        'query': {"bool": {
+            "filter": [{"term": {"is_online": True}}, {"terms": {"content_level": [5, 4, 3.5, 3]}}],
+            "should": should_clauses,
+            "minimum_should_match": 1}},
+        "_source": {"includes": ["id"]},
+        "sort": [
+            {"is_video": {"order": "asc"}},
+            {"good_click": {"order": "desc"}},
+            {"tractate_score": {"order": "desc"}},
+        ],
+    }
+    response = es_query("tractate", q, offset=0, size=200)
+    return [hit['_source']['id'] for hit in response['hits']['hits']]
+
+
+def search_qa_by_match_phrase(tag_names):
+    """Query the ``answer`` index for answers matching any of the tag names.
+
+    Each tag name becomes a ``match_phrase`` clause on ``tag_name``; at least
+    one clause must match. Results are filtered to online answers with
+    ``content_length`` >= 30 and an allowed ``content_level``.
+
+    Args:
+        tag_names: Iterable of tag names to phrase-match.
+
+    Returns:
+        List of answer ids (at most 200 are requested), in result order.
+    """
+    should_clauses = [
+        {"match_phrase": {"tag_name": {"query": tag_name}}}
+        for tag_name in tag_names
+    ]
+    q = {
+        'query': {"bool": {
+            "filter": [{"range": {"content_length": {"gte": 30}}},
+                       {"term": {"is_online": True}},
+                       {"terms": {"content_level": ['5', '4', '3.5', '3']}}],
+            "should": should_clauses,
+            "minimum_should_match": 1}},
+        "_source": {"includes": ["id"]},
+        'sort': [
+            {'has_picture': {'order': 'desc'}},
+            {"good_click": {"order": "desc"}},
+            {'smart_rank': {'order': 'desc'}},
+        ],
+    }
+    response = es_query("answer", q, offset=0, size=200)
+    return [hit['_source']['id'] for hit in response['hits']['hits']]
+
+
+if __name__ == "__main__":
+    # Fetch every city tag id (includes the -1 "no city" sentinel).
+    all_city_id = get_all_city_id()
+    # Candidate queues for hot search words.
+    hot_search_word_diary_queue = []
+    hot_search_word_topic_queue = []
+    hot_search_word_qa_queue = []
+    # Candidate queues for light medical-beauty content.
+    light_clinic_beauty_diary_queue = []
+    light_clinic_beauty_topic_queue = []
+    light_clinic_beauty_qa_queue = []
+
--- a/eda/smart_rank/tool.py
+++ b/eda/smart_rank/tool.py
@@ -13,6 +13,7 @@ import time
 import json
 import numpy as np
 import pandas as pd
+from elasticsearch import Elasticsearch as Es


 def send_email(app,id,e):
@@ -347,3 +348,43 @@ def get_user_log(cl_id, all_word_tags, pay_time=0, debug=0):
    except:
        print("error2_user_portrait", traceback.format_exc())
        return user_df_service
+
+
+def get_es():
+    """Create an Elasticsearch client for the hard-coded hosts, sniffing disabled."""
+    node_addresses = ('172.16.31.17', '172.16.31.11', '172.16.31.13')
+    hosts = [{'host': address, 'port': 9000} for address in node_addresses]
+    return Es(hosts=hosts, sniff_on_start=False, sniff_on_connection_fail=False)
+
+
+def es_index_adapt(index_prefix, doc_type, rw=None):
+    """Build the index name ``<index_prefix>-<doc_type>[-<rw>]``.
+
+    ``rw`` must be ``None``, ``'read'`` or ``'write'``; when it is ``None``
+    the suffix is omitted.
+    """
+    assert rw in [None, 'read', 'write']
+    parts = [index_prefix, doc_type]
+    if rw:
+        parts.append(rw)
+    return '-'.join(parts)
+
+
+def es_query(doc, body, offset, size, es=None):
+    """Run a search against the ``gm-dbmw-<doc>-read`` index.
+
+    Args:
+        doc: Document type; also used to build the index name.
+        body: Search request body.
+        offset: Passed to the client as ``from_``.
+        size: Passed to the client as ``size``.
+        es: Optional client to reuse; a new one is created via ``get_es()``
+            when omitted.
+
+    Returns:
+        Whatever the client's ``search`` call returns.
+    """
+    client = get_es() if es is None else es
+    read_index = es_index_adapt(index_prefix='gm-dbmw', doc_type=doc, rw='read')
+    return client.search(
+        index=read_index,
+        doc_type=doc,
+        timeout='10s',
+        body=body,
+        from_=offset,
+        size=size)
+
+
+def es_mquery(doc, body, es=None):
+    """Run a multi-search against the ``gm-dbmw-<doc>-read`` index.
+
+    Args:
+        doc: Document type used to build the index name.
+        body: Multi-search request body, passed through to ``msearch``.
+        es: Optional client to reuse; a new one is created via ``get_es()``
+            when omitted.
+
+    Returns:
+        Whatever the client's ``msearch`` call returns.
+    """
+    client = get_es() if es is None else es
+    read_index = es_index_adapt(index_prefix='gm-dbmw', doc_type=doc, rw='read')
+    return client.msearch(body, index=read_index)