Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
strategy_embedding
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
rank
strategy_embedding
Commits
7f87caf4
Commit
7f87caf4
authored
Nov 17, 2020
by
赵威
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
rename path
parent
ada16bb5
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
47 additions
and
32 deletions
+47
-32
app_conf.xml
app_conf.xml
+1
-1
api.py
word_vector/api.py
+11
-30
tractate.py
word_vector/tractate.py
+35
-1
No files found.
app_conf.xml
View file @
7f87caf4
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
</config>
</config>
<config
name=
"initializer_list"
>
<config
name=
"initializer_list"
>
<element
value=
"face_similarity.diary_cover_similarity"
/>
<element
value=
"face_similarity.diary_cover_similarity"
/>
<element
value=
"word_vector.
word_to_vec
"
/>
<element
value=
"word_vector.
api
"
/>
<element
value=
"personas_vector.match_api"
/>
<element
value=
"personas_vector.match_api"
/>
</config>
</config>
<config
name=
"statuses"
value=
"strategy_embedding.system:statuses"
/>
<config
name=
"statuses"
value=
"strategy_embedding.system:statuses"
/>
...
...
word_vector/
word_to_vec
.py
→
word_vector/
api
.py
View file @
7f87caf4
...
@@ -9,8 +9,10 @@ sys.path.append(os.path.realpath("."))
...
@@ -9,8 +9,10 @@ sys.path.append(os.path.realpath("."))
from
gensim.models
import
Word2Vec
,
word2vec
from
gensim.models
import
Word2Vec
,
word2vec
from
gm_rpcd.all
import
bind
from
gm_rpcd.all
import
bind
from
utils.es
import
es_scan
from
utils.es
import
es_scan
from
utils.message
import
send_msg_to_dingtalk
from
utils.file
import
DATA_PATH
,
MODEL_PATH
from
utils.file
import
DATA_PATH
,
MODEL_PATH
from
utils.message
import
send_msg_to_dingtalk
from
word_vector.tractate
import
tractate_click_ids_model_path
model_output_name
=
"w2v_model"
model_output_name
=
"w2v_model"
model_path
=
os
.
path
.
join
(
MODEL_PATH
,
model_output_name
)
model_path
=
os
.
path
.
join
(
MODEL_PATH
,
model_output_name
)
...
@@ -19,8 +21,6 @@ try:
...
@@ -19,8 +21,6 @@ try:
except
Exception
as
e
:
except
Exception
as
e
:
print
(
e
)
print
(
e
)
tracate_click_ids_model_name
=
"tractate_click_ids_item2vec_model"
tractate_click_ids_model_path
=
os
.
path
.
join
(
MODEL_PATH
,
tracate_click_ids_model_name
)
try
:
try
:
TRACTATE_CLICK_IDS_MODEL
=
word2vec
.
Word2Vec
.
load
(
tractate_click_ids_model_path
)
TRACTATE_CLICK_IDS_MODEL
=
word2vec
.
Word2Vec
.
load
(
tractate_click_ids_model_path
)
except
Exception
as
e
:
except
Exception
as
e
:
...
@@ -89,23 +89,7 @@ def projects_item2vec(score_limit=5):
...
@@ -89,23 +89,7 @@ def projects_item2vec(score_limit=5):
return
model
return
model
def
save_clicked_tractate_ids_item2vec
():
# item2vec
click_ids
=
[]
with
open
(
os
.
path
.
join
(
DATA_PATH
,
"click_tractate_ids.csv"
),
"r"
)
as
f
:
data
=
f
.
readlines
()
for
i
in
data
:
tmp
=
i
.
split
(
"|"
)
# app_session_id = tmp[0]
ids
=
tmp
[
1
]
.
rstrip
(
"
\n
"
)
.
split
(
","
)
click_ids
.
append
(
ids
)
model
=
Word2Vec
(
click_ids
,
hs
=
0
,
min_count
=
3
,
workers
=
multiprocessing
.
cpu_count
(),
iter
=
10
)
print
(
model
)
print
(
len
(
click_ids
))
model
.
save
(
tractate_click_ids_model_path
)
return
model
@bind
(
"strategy_embedding/word_vector/tractate_item2vec"
)
@bind
(
"strategy_embedding/word_vector/tractate_item2vec"
)
def
clicked_tractate_ids_item2vec_model
(
id
,
n
=
5
):
def
clicked_tractate_ids_item2vec_model
(
id
,
n
=
5
):
try
:
try
:
...
@@ -115,17 +99,14 @@ def clicked_tractate_ids_item2vec_model(id, n=5):
...
@@ -115,17 +99,14 @@ def clicked_tractate_ids_item2vec_model(id, n=5):
return
[]
return
[]
if
__name__
==
"__main__"
:
# if __name__ == "__main__":
begin_time
=
time
.
time
()
# w2v_train("dispose_problem.txt", model_output_name)
for
i
in
[
"双眼皮"
,
"隆鼻"
]:
# w2v_train("dispose_problem.txt", model_output_name)
print
(
word_similarity
(
i
))
# save_clicked_tractate_ids_item2vec()
# for i in ["双眼皮", "隆鼻"]:
# print(word_similarity(i))
for
id
in
[
"84375"
,
"148764"
,
"368399"
]:
# save_clicked_tractate_ids_item2vec()
print
(
clicked_tractate_ids_item2vec_model
(
id
,
n
=
5
))
print
(
"total cost: {:.2f}mins"
.
format
((
time
.
time
()
-
begin_time
)
/
60
))
# for id in ["84375", "148764", "368399"]:
# print(clicked_tractate_ids_item2vec_model(id, n=5))
word_vector/tractate.py
View file @
7f87caf4
import
multiprocessing
import
os
import
os
import
sys
import
sys
from
collections
import
defaultdict
from
collections
import
defaultdict
sys
.
path
.
append
(
os
.
path
.
realpath
(
"."
))
sys
.
path
.
append
(
os
.
path
.
realpath
(
"."
))
import
time
from
gensim.models
import
Word2Vec
,
word2vec
from
utils.date
import
get_ndays_before_no_minus
,
get_ndays_before_with_format
from
utils.date
import
get_ndays_before_no_minus
,
get_ndays_before_with_format
from
utils.files
import
DATA_PATH
from
utils.files
import
DATA_PATH
,
MODEL_PATH
from
utils.spark
import
get_spark
from
utils.spark
import
get_spark
from
word_vector.api
import
clicked_tractate_ids_item2vec_model
tractate_click_ids_model_path
=
os
.
path
.
join
(
MODEL_PATH
,
"tractate_click_ids_item2vec_model"
)
def
get_tracate_click_data
(
spark
,
start
,
end
):
def
get_tracate_click_data
(
spark
,
start
,
end
):
reg
=
r"""^\\d+$"""
reg
=
r"""^\\d+$"""
...
@@ -123,7 +131,26 @@ def get_device_click_tractate_ids_dict(click_df):
...
@@ -123,7 +131,26 @@ def get_device_click_tractate_ids_dict(click_df):
return
res
return
res
def
save_clicked_tractate_ids_item2vec
():
click_ids
=
[]
with
open
(
os
.
path
.
join
(
DATA_PATH
,
"click_tractate_ids.csv"
),
"r"
)
as
f
:
data
=
f
.
readlines
()
for
i
in
data
:
tmp
=
i
.
split
(
"|"
)
# app_session_id = tmp[0]
ids
=
tmp
[
1
]
.
rstrip
(
"
\n
"
)
.
split
(
","
)
click_ids
.
append
(
ids
)
model
=
Word2Vec
(
click_ids
,
hs
=
0
,
min_count
=
3
,
workers
=
multiprocessing
.
cpu_count
(),
iter
=
10
)
print
(
model
)
print
(
len
(
click_ids
))
model
.
save
(
tractate_click_ids_model_path
)
return
model
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
begin_time
=
time
.
time
()
spark
=
get_spark
(
"tractate_click_ids"
)
spark
=
get_spark
(
"tractate_click_ids"
)
click_df
=
get_tracate_click_data
(
spark
,
get_ndays_before_no_minus
(
180
),
get_ndays_before_no_minus
(
1
))
click_df
=
get_tracate_click_data
(
spark
,
get_ndays_before_no_minus
(
180
),
get_ndays_before_no_minus
(
1
))
click_df
.
show
(
5
,
False
)
click_df
.
show
(
5
,
False
)
...
@@ -135,4 +162,11 @@ if __name__ == "__main__":
...
@@ -135,4 +162,11 @@ if __name__ == "__main__":
if
v
:
if
v
:
f
.
write
(
"{}|{}
\n
"
.
format
(
k
,
","
.
join
([
str
(
x
)
for
x
in
v
])))
f
.
write
(
"{}|{}
\n
"
.
format
(
k
,
","
.
join
([
str
(
x
)
for
x
in
v
])))
save_clicked_tractate_ids_item2vec
()
for
id
in
[
"84375"
,
"148764"
,
"368399"
]:
print
(
clicked_tractate_ids_item2vec_model
(
id
,
n
=
5
))
print
(
"total cost: {:.2f}mins"
.
format
((
time
.
time
()
-
begin_time
)
/
60
))
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment