Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
84e6abb5
Commit
84e6abb5
authored
Nov 07, 2019
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add gm_tag_cf
parent
ea7bcd8a
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
104 additions
and
2 deletions
+104
-2
gm_tag_cf.py
eda/smart_rank/gm_tag_cf.py
+68
-0
tool.py
eda/smart_rank/tool.py
+36
-2
No files found.
eda/smart_rank/gm_tag_cf.py
0 → 100644
View file @
84e6abb5
import
numpy
as
np
import
pandas
as
pd
from
sklearn.metrics
import
pairwise_distances
from
scipy.spatial.distance
import
cosine
,
correlation
from
tool
import
get_all_tag_parent_tag
,
get_all_tags_name
,
get_all_word_synonym_words
# Recommender Engine
def get_similary_tags(tag_id, tags, ratings_matrix):
    """Return the other tag ids ranked by similarity to ``tag_id``.

    Args:
        tag_id: Id of the tag to find neighbours for.
        tags: DataFrame with a ``tag_id`` column. The index label of the
            matching row is used as the row position in ``ratings_matrix``.
            This frame is not modified.
        ratings_matrix: Square DataFrame of tag-to-tag similarities. Row
            ``i`` is aligned to ``tags`` by index label.

    Returns:
        List of tag ids from ``tags`` sorted by descending similarity,
        with ``tag_id`` itself excluded.

    Raises:
        IndexError: If ``tag_id`` does not occur in ``tags['tag_id']``.
    """
    matching_labels = tags.index[tags['tag_id'] == tag_id]
    if len(matching_labels) == 0:
        raise IndexError("tag_id %r not found in tags" % (tag_id,))
    row_position = matching_labels[0]

    # Build the ranking on a fresh frame so the caller's ``tags`` does not
    # silently gain (or get overwritten) a 'similarity' column.
    ranked = tags[['tag_id']].assign(similarity=ratings_matrix.iloc[row_position])

    # Drop the query tag explicitly. Slicing off the last sorted row is not
    # reliable: the diagonal is 0, so self ties with every unrelated tag and
    # may land anywhere among them.
    ranked = ranked[ranked['tag_id'] != tag_id]

    # A stable sort keeps ties in their original order, so output is deterministic.
    ranked = ranked.sort_values(["similarity"], ascending=False, kind="mergesort")
    return ranked['tag_id'].tolist()
if __name__ == '__main__':
    # Builds, for every tag name, the list of collaboratively-similar tag
    # names (item-item cosine similarity over the user log), with the tag's
    # parent tags and synonyms filtered out.

    # User log
    # index: sample id
    # columns: userId, tagId, rating (timestamp is dropped)
    ratings = pd.read_csv("/home/gmuser/gyz/log/cf/user_item_log.csv", sep="\t")
    ratings.drop("timestamp", inplace=True, axis=1)

    sorted_tag_ids = sorted(ratings.tagId.unique())
    cf_tags = pd.DataFrame({'tag_id': sorted_tag_ids})
    # `tag in cf_tags` would test the DataFrame's column labels, not the tag
    # ids, so membership is checked against an explicit set instead.
    cf_tag_ids = set(sorted_tag_ids)

    # Pivot table
    # index: tagId, replaced by positions 0..n-1 via reset_index (presumably
    #        in the same ascending order as sorted_tag_ids -- TODO confirm)
    # columns: userId
    ratings_matrix = ratings.pivot_table(
        index=['tagId'], columns=['userId'], values='rating'
    ).reset_index(drop=True)
    ratings_matrix.fillna(0, inplace=True)

    # Cosine similarity
    # index: tag position
    # columns: tag position
    # `.values` replaces DataFrame.as_matrix(), which newer pandas removed.
    movie_similarity = 1 - pairwise_distances(ratings_matrix.values, metric="cosine")
    # Zero the diagonal so a tag never ranks as its own best neighbour.
    np.fill_diagonal(movie_similarity, 0)
    ratings_matrix = pd.DataFrame(movie_similarity)

    # Parent tags of every level 1/2/3 tag
    all_tag_parent_tags = get_all_tag_parent_tag()
    # Synonyms of every word
    all_word_synonym_words = get_all_word_synonym_words()
    # Id and name of every level 1/2/3 tag (used below as a mapping id -> name)
    all_tags_name = get_all_tags_name()

    # Collaborative tags for every tag: {tag_name: [similar_tag_name, ...]}
    all_tags_cf_tags = dict()
    for tag in all_tags_name:
        if tag in cf_tag_ids:
            tag_cf_tags = get_similary_tags(tag, cf_tags, ratings_matrix)
        else:
            tag_cf_tags = []

        # Filter out parent tags; a tag without parents has no entry.
        parent_tags = set(all_tag_parent_tags.get(tag, []))
        tag_cf_tags = [cf_tag for cf_tag in tag_cf_tags if cf_tag not in parent_tags]

        # Filter out the tag's synonyms. Ids from the log that are missing
        # from the name mapping are skipped instead of raising KeyError.
        tag_name = all_tags_name[tag]
        tag_synonym_names = set(all_word_synonym_words.get(tag_name, []))
        tag_cf_tags_names = [
            all_tags_name[cf_tag]
            for cf_tag in tag_cf_tags
            if cf_tag in all_tags_name
            and all_tags_name[cf_tag] not in tag_synonym_names
        ]
        all_tags_cf_tags[tag_name] = tag_cf_tags_names
    # NOTE(review): all_tags_cf_tags is built but not persisted or printed
    # in the visible code -- confirm where the result is meant to go.
eda/smart_rank/tool.py
View file @
84e6abb5
...
...
@@ -83,7 +83,7 @@ def get_all_search_word_synonym_tags():
sql
=
"select a.keyword , c.id from api_wordrel a "
\
"left join api_wordrelsynonym b on a.id = b.wordrel_id "
\
"left join api_tag c on b.word=c.name "
\
"where
a.category in (1,13,10,11,12) and
c.tag_type+0<'4'+0 and c.is_online=1"
"where c.tag_type+0<'4'+0 and c.is_online=1"
mysql_results
=
get_data_by_mysql
(
'172.16.30.141'
,
3306
,
'work'
,
'BJQaT9VzDcuPBqkd'
,
'zhengxing'
,
sql
)
result_dict
=
dict
()
for
data
in
mysql_results
:
...
...
@@ -96,6 +96,22 @@ def get_all_search_word_synonym_tags():
print
(
e
)
def get_all_word_synonym_words():
    """Load every keyword together with its synonym words from MySQL.

    Returns:
        dict mapping each ``api_wordrel.keyword`` to the list of
        ``api_wordrelsynonym.word`` values joined to it. A keyword with no
        synonym rows maps to an empty list. Returns ``None`` if the query
        or the row processing raises; the exception is printed.
    """
    try:
        sql = "select a.keyword, b.word from api_wordrel a " \
              "left join api_wordrelsynonym b on a.id = b.wordrel_id "
        # NOTE(review): connection details, including the password, are
        # hard-coded here; they should come from configuration or the
        # environment and the credential should be rotated.
        mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing', sql)
        result_dict = dict()
        for data in mysql_results:
            synonyms = result_dict.setdefault(data['keyword'], [])
            # The LEFT JOIN yields a NULL word for keywords without any
            # synonym; keep the keyword but do not store None as a synonym.
            if data['word'] is not None:
                synonyms.append(data['word'])
        return result_dict
    except Exception as e:
        print(e)
        # Failure is reported by printing only; callers must handle None.
        return None
def
get_all_synonym_tags
():
"""
:return:dict {"search_word1":[tag_list1],"search_word2":[tag_list2]...}
...
...
@@ -163,7 +179,25 @@ def get_all_3tag_2tag():
sql
=
"select a.child_id,a.parent_id from api_tagrelation a"
\
" left join api_tag b on a.parent_id=b.id "
\
"where a.child_id in (select id from api_tag where tag_type='3' and is_online=1) "
\
"and b.tag_type='2'"
"and b.tag_type='2' and b.is_online=1"
mysql_results
=
get_data_by_mysql
(
'172.16.30.141'
,
3306
,
'work'
,
'BJQaT9VzDcuPBqkd'
,
'zhengxing'
,
sql
)
result_dict
=
dict
()
for
data
in
mysql_results
:
if
data
[
'child_id'
]
not
in
result_dict
:
result_dict
[
data
[
'child_id'
]]
=
[
data
[
'parent_id'
]]
else
:
result_dict
[
data
[
'child_id'
]]
.
append
(
data
[
'parent_id'
])
return
result_dict
except
Exception
as
e
:
print
(
e
)
def
get_all_tag_parent_tag
():
try
:
sql
=
"select a.child_id,a.parent_id from api_tagrelation a"
\
" left join api_tag b on a.parent_id=b.id "
\
"where a.child_id in (select id from api_tag where tag_type+0<'4'+0 and is_online=1) "
\
"and b.tag_type+0<'4'+0 and b.is_online=1"
mysql_results
=
get_data_by_mysql
(
'172.16.30.141'
,
3306
,
'work'
,
'BJQaT9VzDcuPBqkd'
,
'zhengxing'
,
sql
)
result_dict
=
dict
()
for
data
in
mysql_results
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment