Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
d72242ad
Commit
d72242ad
authored
Nov 11, 2019
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
首页feed新用户冷启动的日记、帖子、问答队列
parent
5ce85a56
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
187 additions
and
0 deletions
+187
-0
gm_feed_cold_start.py
eda/smart_rank/gm_feed_cold_start.py
+146
-0
tool.py
eda/smart_rank/tool.py
+41
-0
No files found.
eda/smart_rank/gm_feed_cold_start.py
0 → 100644
View file @
d72242ad
from
tool
import
es_query
from
tool
import
get_data_by_mysql
def get_all_city_id():
    """Return every distinct ``tag_id`` from ``api_city`` plus a ``-1`` sentinel.

    The ids are read through ``get_data_by_mysql``; each returned row is
    indexed by the ``"tag_id"`` key.  A trailing ``-1`` is added to represent
    the "no city" case.

    Returns:
        list: the city tag ids followed by ``-1``.
    """
    # NOTE(review): the connection arguments (including the password) are
    # hard-coded in source; consider moving them to configuration.
    rows = get_data_by_mysql(
        '172.16.30.141',
        3306,
        'work',
        'BJQaT9VzDcuPBqkd',
        'zhengxing',
        "select distinct tag_id from api_city",
    )
    city_tag_ids = [row["tag_id"] for row in rows]
    # -1 stands for the case where there is no city.
    return city_tag_ids + [-1]
def search_diary_by_match_phrase(tag_names, city_pinyin, tag_ids=None):
    """Return the ids of up to 200 diaries whose ``tags`` phrase-match a tag name.

    Args:
        tag_names: iterable of tag names.  Each one becomes a ``match_phrase``
            clause on the ``tags`` field; at least one clause must match
            (``minimum_should_match`` is 1).
        city_pinyin: city tag id handed to the ``sort_diary-recommend`` sort
            script as ``user_city_tag_id``.  ``-1`` means "no city" and skips
            the script sort entirely.
        tag_ids: optional list of values for an additional ``terms`` filter on
            ``tags``.  When ``None`` (the default) the filter is omitted.
            NOTE(review): the original code referenced an undefined
            ``tag_ids`` name here -- confirm which field/ids were intended.

    Returns:
        list: the ``_source.id`` of every hit returned by ``es_query`` for the
        ``"diary"`` doc type (offset 0, size 200).
    """
    sort_list = []
    if city_pinyin != -1:
        # City-aware script sort goes first so it dominates the ordering.
        sort_list.append({
            '_script': {
                # Must be a string; a bare ``groovy`` name is not defined.
                'lang': 'groovy',
                'script_file': 'sort_diary-recommend',
                'type': 'number',
                'params': {
                    'user_city_tag_id': city_pinyin,
                },
                'order': 'desc',
                '_cache': True,
            }
        })
    sort_list += [
        {'has_video_cover': {'order': 'asc'}},
        {"good_click": {"order": "desc"}},
        {'offline_score': {'order': 'desc'}},
        {'last_update_time': {'order': 'desc'}},
    ]

    should_clauses = [
        {"match_phrase": {"tags": {"query": tag_name}}}
        for tag_name in tag_names
    ]

    filters = [{"term": {"is_online": True}}]
    if tag_ids is not None:
        # Only restrict by tag ids when the caller actually supplies them.
        filters.append({"terms": {"tags": tag_ids}})
    filters += [
        {"term": {"has_cover": True}},
        {"term": {"is_sink": False}},
        {"term": {"has_after_cover": True}},
        {"term": {"has_before_cover": True}},
        {"terms": {"content_level": [5, 4, 3.5, 3]}},
    ]

    q = {
        'query': {
            "bool": {
                "filter": filters,
                "should": should_clauses,
                "minimum_should_match": 1,
                "must_not": [{"term": {"is_operate": True}}],
            }
        },
        'sort': sort_list,
        # Only the id is needed, so keep the response payload small.
        "_source": {"includes": ["id"]},
    }

    es_res = es_query("diary", q, offset=0, size=200)
    return [hit['_source']['id'] for hit in es_res['hits']['hits']]
def search_topic_by_match_phrase(tag_names):
    """Return the ids of up to 200 tractates that phrase-match any tag name.

    Args:
        tag_names: iterable of tag names.  Each name yields two
            ``match_phrase`` clauses, one on ``fresh_tractate_tag_name`` and
            one on ``tractate_tag_name_content``; at least one clause overall
            must match.

    Returns:
        list: the ``_source.id`` of every hit returned by ``es_query`` for the
        ``"tractate"`` doc type (offset 0, size 200).
    """
    matched_fields = ("fresh_tractate_tag_name", "tractate_tag_name_content")
    # Per tag name: first the fresh-tag clause, then the content clause.
    should_clauses = [
        {"match_phrase": {field: {"query": tag_name}}}
        for tag_name in tag_names
        for field in matched_fields
    ]

    body = {
        'query': {
            "bool": {
                "filter": [
                    {"term": {"is_online": True}},
                    {"terms": {"content_level": [5, 4, 3.5, 3]}},
                ],
                "should": should_clauses,
                "minimum_should_match": 1,
            }
        },
        # Only the id is read from each hit.
        "_source": {"includes": ["id"]},
        "sort": [
            {"is_video": {"order": "asc"}},
            {"good_click": {"order": "desc"}},
            {"tractate_score": {"order": "desc"}},
        ],
    }

    response = es_query("tractate", body, offset=0, size=200)
    return [hit['_source']['id'] for hit in response['hits']['hits']]
def search_qa_by_match_phrase(tag_names):
    """Return the ids of up to 200 answers whose ``tag_name`` phrase-matches.

    Args:
        tag_names: iterable of tag names.  Each one becomes a ``match_phrase``
            clause on ``tag_name``; at least one clause must match.

    Returns:
        list: the ``_source.id`` of every hit returned by ``es_query`` for the
        ``"answer"`` doc type (offset 0, size 200).
    """
    should_clauses = [
        {"match_phrase": {"tag_name": {"query": tag_name}}}
        for tag_name in tag_names
    ]

    body = {
        'query': {
            "bool": {
                "filter": [
                    {"range": {"content_length": {"gte": 30}}},
                    {"term": {"is_online": True}},
                    # content_level values are sent as strings for this query.
                    {"terms": {"content_level": ['5', '4', '3.5', '3']}},
                ],
                "should": should_clauses,
                "minimum_should_match": 1,
            }
        },
        # Only the id is read from each hit.
        "_source": {"includes": ["id"]},
        'sort': [
            {'has_picture': {'order': 'desc'}},
            {"good_click": {"order": "desc"}},
            {'smart_rank': {'order': 'desc'}},
        ],
    }

    response = es_query("answer", body, offset=0, size=200)
    return [hit['_source']['id'] for hit in response['hits']['hits']]
if __name__ == "__main__":
    # Fetch the tag id of every city (plus the -1 "no city" sentinel).
    all_city_id = get_all_city_id()

    # Candidate queues for hot-search words: diaries, topics, Q&A.
    (hot_search_word_diary_queue,
     hot_search_word_topic_queue,
     hot_search_word_qa_queue) = [], [], []

    # Candidate queues for light medical-beauty content: diaries, topics, Q&A.
    (light_clinic_beauty_diary_queue,
     light_clinic_beauty_topic_queue,
     light_clinic_beauty_qa_queue) = [], [], []
eda/smart_rank/tool.py
View file @
d72242ad
...
...
@@ -13,6 +13,7 @@ import time
import
json
import
numpy
as
np
import
pandas
as
pd
from
elasticsearch
import
Elasticsearch
as
Es
def
send_email
(
app
,
id
,
e
):
...
...
@@ -347,3 +348,43 @@ def get_user_log(cl_id, all_word_tags, pay_time=0, debug=0):
except
:
print
(
"error2_user_portrait"
,
traceback
.
format_exc
())
return
user_df_service
def get_es():
    """Build and return a new Elasticsearch client for the hard-coded hosts.

    Sniffing is disabled both on start and on connection failure.

    Returns:
        Es: a client constructed with three hosts, all on port 9000.
    """
    # NOTE(review): host addresses are hard-coded; consider configuration.
    host_addresses = ('172.16.31.17', '172.16.31.11', '172.16.31.13')
    cluster_hosts = [
        {'host': address, 'port': 9000} for address in host_addresses
    ]
    return Es(
        hosts=cluster_hosts,
        sniff_on_start=False,
        sniff_on_connection_fail=False,
    )
def es_index_adapt(index_prefix, doc_type, rw=None):
    """Compose an index name as ``<index_prefix>-<doc_type>[-<rw>]``.

    Args:
        index_prefix: leading part of the index name.
        doc_type: document type appended after the prefix.
        rw: ``None``, ``'read'`` or ``'write'``; when truthy it is appended as
            a final ``-``-separated suffix.

    Returns:
        str: the dash-joined index name.

    Raises:
        AssertionError: if ``rw`` is not one of the accepted values.
    """
    assert rw in (None, 'read', 'write')
    name_parts = [index_prefix, doc_type]
    if rw:
        name_parts.append(rw)
    return '-'.join(name_parts)
def es_query(doc, body, offset, size, es=None):
    """Run a search against the ``gm-dbmw-<doc>-read`` index and return the response.

    Args:
        doc: document type; used both to build the index name and as the
            ``doc_type`` argument of the search.
        body: the search request body.
        offset: passed to the search as ``from_``.
        size: maximum number of hits requested.
        es: optional client to use; when ``None`` a new one is obtained from
            ``get_es()``.

    Returns:
        Whatever the client's ``search`` call returns.
    """
    client = get_es() if es is None else es
    read_index = es_index_adapt(
        index_prefix='gm-dbmw',
        doc_type=doc,
        rw='read',
    )
    return client.search(
        index=read_index,
        doc_type=doc,
        timeout='10s',
        body=body,
        from_=offset,
        size=size,
    )
def es_mquery(doc, body, es=None):
    """Run a multi-search against the ``gm-dbmw-<doc>-read`` index.

    Args:
        doc: document type used to build the index name.
        body: the multi-search request body, passed positionally to
            ``msearch``.
        es: optional client to use; when ``None`` a new one is obtained from
            ``get_es()``.

    Returns:
        Whatever the client's ``msearch`` call returns.
    """
    client = get_es() if es is None else es
    read_index = es_index_adapt(
        index_prefix='gm-dbmw',
        doc_type=doc,
        rw='read',
    )
    return client.msearch(body, index=read_index)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment