Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
637fd6ce
Commit
637fd6ce
authored
Nov 04, 2019
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
test
parent
a594fcd6
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
82 additions
and
93 deletions
+82
-93
dist_update_user_portrait_service.py
eda/smart_rank/dist_update_user_portrait_service.py
+82
-93
No files found.
eda/smart_rank/dist_update_user_portrait_service.py
View file @
637fd6ce
...
@@ -77,14 +77,8 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
...
@@ -77,14 +77,8 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
gmkv_tag_score_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
,
"weight"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
,
"weight"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum_dict
=
{
i
[
"tag2"
]:
i
[
"tag_score"
]
for
i
in
gmkv_tag_score2_sum
}
gmkv_tag_score2_sum_dict
=
{
i
[
"tag2"
]:
i
[
"tag_score"
]
for
i
in
gmkv_tag_score2_sum
}
# 写gmkv
gm_kv_cli
=
redis
.
Redis
(
host
=
"172.16.40.135"
,
port
=
5379
,
db
=
2
,
socket_timeout
=
2000
)
# 写redis
cl_id_portrait_key
=
"user:service_portrait_tags:cl_id:"
+
str
(
cl_id
)
tag_id_list_json
=
json
.
dumps
(
gmkv_tag_score_sum
)
gm_kv_cli
.
set
(
cl_id_portrait_key
,
tag_id_list_json
)
gm_kv_cli
.
expire
(
cl_id_portrait_key
,
time
=
30
*
24
*
60
*
60
)
写
redis
redis_client
=
redis
.
StrictRedis
.
from_url
(
'redis://:ReDis!GmTx*0aN9@172.16.40.173:6379'
)
redis_client
=
redis
.
StrictRedis
.
from_url
(
'redis://:ReDis!GmTx*0aN9@172.16.40.173:6379'
)
cl_id_portrait_key2
=
"user:service_portrait_tags2:cl_id:"
+
str
(
cl_id
)
cl_id_portrait_key2
=
"user:service_portrait_tags2:cl_id:"
+
str
(
cl_id
)
# 如果画像的tag个数小于5,则补充热搜词
# 如果画像的tag个数小于5,则补充热搜词
...
@@ -137,87 +131,83 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
...
@@ -137,87 +131,83 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
if __name__ == '__main__':
    try:
        # Collect the device ids (cl_id) seen in the tag log over the last 3 days.
        db_jerry_test = pymysql.connect(host='172.16.40.170', port=4000, user='root',
                                        passwd='3SYz54LS9#^9sBvC',
                                        db='jerry_test', charset='utf8')
        cur_jerry_test = db_jerry_test.cursor()
        sql_device_ids = "select distinct cl_id from user_new_tag_log " \
                         "where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 3 day))"
        cur_jerry_test.execute(sql_device_ids)
        device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
        # Close the cursor before its connection (reverse order of creation).
        cur_jerry_test.close()
        db_jerry_test.close()

        redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')

        # Tags for search words and their synonyms.
        all_word_tags = get_all_word_tags()
        all_tag_tag_type = get_all_tag_tag_type()
        # Mapping from level-3 tags to their level-2 tags.
        all_3tag_2tag = get_all_3tag_2tag()
        # Chinese display name for each tag id.
        all_tags_name = get_all_tags_name()

        # Cold-start portrait built from hot search words: every hot word gets a
        # fixed score of 0.2 and weight 10.
        hot_search_words = get_hot_search_words_tag()
        hot_search_words_portrait = []
        for tag_info in hot_search_words:
            hot_search_words_portrait.append({
                "tag_score": 0.2,
                "weight": 10,
                "tag2": tag_info["id"],
            })

        # Cold-start hashes keyed by tag id and by keyword text.
        coldstart_by_id_key = "user:service_coldstart_tags2"
        coldstart_by_id = {i["id"]: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_by_id_key, coldstart_by_id)
        coldstart_by_name_key = "user:service_coldstart_tags2_name"
        coldstart_by_name = {i["keywords"]: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_by_name_key, coldstart_by_name)

        # Hand-curated cold-start word list (overwrites the fetched one on purpose:
        # only this list feeds the "tags3" hash below).
        hot_search_words = [
            "明星娱乐", "网红扒一扒", "明星颜值打call", "颜商", "颜值高光时刻",
            "瘦脸针", "水光针", "光子嫩肤", "热玛吉", "瘦腿针", "超声刀", "瘦肩针",
            "皮秒", "果酸焕肤", "热拉提", "微针", "超皮秒", "点阵激光", "小气泡",
            "玻尿酸丰下巴", "埋线双眼皮", "纹眉", "嗨体", "溶脂针瘦脸", "黄金微针",
            "点痣", "激光祛斑", "白瓷娃娃", "除皱针注射", "微针祛痘坑", "玻尿酸",
            "大分子玻尿酸", "胶原蛋白", "肉毒素", "水杨酸", "果酸", "杏仁酸",
            "黑脸娃娃", "童颜针", "祛斑", "祛痣", "祛黑头", "祛疤", "祛痘",
            "蜂巢皮秒", "深蓝射频", "美瞳", "孕睫", "婴儿针", "三文鱼针", "少女针",
            "素颜针", "熊猫针", "脱毛", "面部提升", "嫩肤", "镭射净肤", "红蓝光",
            "清洁", "补水", "DPL", "抗衰", "针清", "美白", "冷光美白", "非剥落点阵",
            "网红抗衰", "网红整形", "网红颜值", "网红婚恋", "明星抗衰", "明星整形",
            "明星婚恋", "明星颜值",
        ]
        coldstart_words_key = "user:service_coldstart_tags3"
        coldstart_words = {i: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_words_key, coldstart_words)

        # Publish the search-word -> synonym-tags mapping as JSON.
        search_words_synonym_tags_key = "search:words:synonym:tags"
        search_words_synonym_tags_json = json.dumps(all_word_tags)
        redis_client.set(search_words_synonym_tags_key, search_words_synonym_tags_json)

        # Distribute the per-device portrait computation over Spark.
        sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
            .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
            .set("spark.tispark.plan.allow_index_double_read", "false") \
            .set("spark.tispark.plan.allow_index_read", "true") \
            .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
            .set("spark.tispark.pd.addresses", "172.16.40.170:2379") \
            .set("spark.io.compression.codec", "lzf") \
            .set("spark.driver.maxResultSize", "8g") \
            .set("spark.sql.avro.compression.codec", "snappy")
        spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
        spark.sparkContext.setLogLevel("WARN")
        spark.sparkContext.addPyFile("/srv/apps/ffm-baseline_git/eda/smart_rank/tool.py")
        device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst, numSlices=1000)
        result = device_ids_lst_rdd.repartition(100).map(
            lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type,
                                                all_3tag_2tag, all_tags_name))
        # Trigger the lazy map; per-device results are written to Redis inside
        # get_user_service_portrait, so the collected values are discarded.
        result.collect()
    except Exception as e:
        # Include the actual error in the alert mail so failures are diagnosable
        # (the original discarded `e` entirely).
        send_email("dist_update_user_portrait_service",
                   "dist_update_user_portrait_service",
                   "dist_update_user_portrait_service failed: %s" % e)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment