rank / strategy_embedding

Commit bab61776 authored Oct 28, 2020 by 赵威
write file
parent 5b81b3e7
Showing 1 changed file with 12 additions and 6 deletions.
word_vector/tractate.py
+import os
 from collections import defaultdict
 from datetime import date, timedelta
...
@@ -5,6 +6,10 @@ from pyspark import SparkConf
 from pyspark.sql import SparkSession
 from pytispark import pytispark as pti

+base_dir = os.getcwd()
+print("base_dir: " + base_dir)
+data_dir = os.path.join(base_dir, "_data")
+

 def get_ndays_before_with_format(n, format):
     yesterday = (date.today() + timedelta(days=-n)).strftime(format)
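The helper's body is cut off in the diff (only the assignment to yesterday is shown), and get_ndays_before_no_minus, called further down in __main__, does not appear at all. A minimal sketch of how the date helper presumably behaves, assuming it simply returns the formatted string; the format strings in the usage lines are illustrative, not taken from the repository:

from datetime import date, timedelta

def get_ndays_before_with_format(n, format):
    # date.today() + timedelta(days=-n) is the calendar date n days ago
    return (date.today() + timedelta(days=-n)).strftime(format)

# Illustrative usage: build the start/end strings for a 30-day query window.
start = get_ndays_before_with_format(30, "%Y%m%d")  # e.g. "20200928" on Oct 28, 2020
end = get_ndays_before_with_format(1, "%Y%m%d")     # e.g. "20201027"
print(start, end)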
...
@@ -140,9 +145,9 @@ def get_tracate_click_data(spark, start, end):
     return df


-def get_device_click_tractate_ids(click_df):
+def get_device_click_tractate_ids_dict(click_df):
     res = defaultdict(list)
-    cols = click_df.orderBy("partition_date", ascending=False).limit(100).collect()
+    cols = click_df.orderBy("partition_date", ascending=False).collect()
     for i in cols:
         res[i["cl_id"]].append(i["card_id"])
     return res
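With the limit(100) removed, get_device_click_tractate_ids_dict now collects every row of the click DataFrame to the driver before grouping, which is what the 16g driver memory in the spark-submit command at the bottom has to absorb. A hedged alternative sketch that does the grouping on the executors instead; the name get_device_click_tractate_ids_dict_agg is hypothetical, and collect_list does not guarantee the per-device ordering that the orderBy above implies:

from pyspark.sql import functions as F

def get_device_click_tractate_ids_dict_agg(click_df):
    # Aggregate card_id per device (cl_id) inside Spark, then collect only
    # one row per device instead of one row per click.
    rows = (click_df
            .groupBy("cl_id")
            .agg(F.collect_list("card_id").alias("card_ids"))
            .collect())
    return {r["cl_id"]: r["card_ids"] for r in rows}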
...
@@ -150,10 +155,11 @@ def get_device_click_tractate_ids(click_df):
 if __name__ == "__main__":
     spark = get_spark("test")

-    # TODO 30 days
-    click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(5), get_ndays_before_no_minus(1))
+    click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(30), get_ndays_before_no_minus(1))
     click_df.show(5, False)
-    res_dict = get_device_click_tractate_ids(click_df)
-    print(res_dict)
+    res_dict = get_device_click_tractate_ids_dict(click_df)
+    with open(os.path.join(data_dir, "click_tractate_ids.csv"), "w") as f:
+        for (k, v) in res_dict:
+            f.write(k + "|" + ",".join(v))
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
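As committed, the writer loop iterates the defaultdict directly, so (k, v) tries to unpack a device-id string rather than a key/value pair and will raise a ValueError; the records are also written without a newline, so they all land on one line. A minimal corrected sketch, assuming the card ids in each list are strings (otherwise wrap them in str):

# Hedged fix for the writer loop: iterate items() and terminate each record.
with open(os.path.join(data_dir, "click_tractate_ids.csv"), "w") as f:
    for k, v in res_dict.items():
        f.write(k + "|" + ",".join(v) + "\n")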