rank / strategy_embedding · Commits

Commit a1c23e8d
authored 4 years ago by 赵威
get tractate data
parent 887b0b58
Showing 3 changed files with 12 additions and 146 deletions
requirements.txt       +1 -1
utils/spark.py         +6 -3
word_vector/tests.py   +5 -142
requirements.txt
@@ -25,7 +25,7 @@ pymysql==0.10.1
 gensim==3.8.3
-# pyspark==2.3.0
+# pyspark==2.3.3
 pytispark==2.0
 ipython
utils/spark.py
@@ -23,8 +23,7 @@ def get_spark(app_name=""):
     return spark
 
 # TODO partition_date
-def get_click_data(spark, start, end):
+def get_tracate_click_data(spark, start, end):
     reg = r"""^\\d+$"""
     sql = """
     SELECT DISTINCT t1.partition_date, t1.cl_id, cast(t1.business_id as int) card_id
@@ -33,7 +32,7 @@ def get_click_data(spark, start, end):
     from online.bl_hdfs_maidian_updates
     where action = 'page_view'
     AND partition_date BETWEEN '{}' AND '{}'
-    AND page_name='welfare_detail'
+    AND page_name='user_post_detail'
     AND page_stay>=1
     AND cl_id is not null
     AND cl_id != ''
@@ -124,3 +123,7 @@ def get_click_data(spark, start, end):
     # print("sql", flush=True)
     # print(sql, flush=True)
     return spark.sql(sql)
+
+
+def get_device_click_tractate_ids():
+    pass
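The commit also adds get_device_click_tractate_ids as an empty stub. A minimal sketch of what such a helper might become, assuming it aggregates the frame returned by get_tracate_click_data (the cl_id and card_id columns come from the SELECT above; the signature and aggregation are assumptions, not part of this commit):

from pyspark.sql import functions as F

def get_device_click_tractate_ids(click_df):
    # Assumption: click_df is the DataFrame returned by get_tracate_click_data,
    # with one row per (cl_id, card_id) click. Collect the distinct tractate
    # card_ids each device clicked. The committed stub takes no arguments and
    # does nothing yet.
    return (click_df
            .groupBy("cl_id")
            .agg(F.collect_set("card_id").alias("tractate_ids")))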
word_vector/tests.py
 # from django.test import TestCase
-from datetime import date, timedelta
-from pyspark import SparkConf
-from pyspark.sql import SparkSession
-from pytispark import pytispark as pti
-
-# Create your tests here.
-
-
-def get_spark(app_name=""):
-    sparkConf = SparkConf()
-    sparkConf.set("spark.sql.crossJoin.enabled", True)
-    sparkConf.set("spark.debug.maxToStringFields", "100")
-    sparkConf.set("spark.tispark.plan.allow_index_double_read", False)
-    sparkConf.set("spark.tispark.plan.allow_index_read", True)
-    sparkConf.set("spark.hive.mapred.supports.subdirectories", True)
-    sparkConf.set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True)
-    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-    sparkConf.set("mapreduce.output.fileoutputformat.compress", False)
-    sparkConf.set("mapreduce.map.output.compress", False)
-    spark = (SparkSession.builder.config(conf=sparkConf)
-             .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
-             .config("spark.tispark.pd.addresses", "172.16.40.170:2379")
-             .appName(app_name).enableHiveSupport().getOrCreate())
-    ti = pti.TiContext(spark)
-    ti.tidbMapDatabase("jerry_test")
-    return spark
-
-
-def getNDaysBeforeWithFormat(n, format):
-    yesterday = (date.today() + timedelta(days=-n)).strftime(format)
-    return yesterday
-
-
-def getNDaysBeforeNoMinus(n):
-    return getNDaysBeforeWithFormat(n, "%Y%m%d")
-
-
-def getClickDataForLr(spark, start, end):
-    reg = r"""^\\d+$"""
-    positiveSql = """
-    SELECT DISTINCT t1.partition_date, t1.cl_id, cast(t1.business_id as int) card_id
-    FROM
-    (select partition_date,cl_id,business_id,action,page_name,page_stay
-    from online.bl_hdfs_maidian_updates
-    where action = 'page_view'
-    AND partition_date BETWEEN '{}' AND '{}'
-    AND page_name='welfare_detail'
-    AND page_stay>=1
-    AND cl_id is not null
-    AND cl_id != ''
-    AND business_id is not null
-    AND business_id != ''
-    AND business_id rlike '{}'
-    ) AS t1
-    JOIN
-    (select partition_date,active_type,first_channel_source_type,device_id
-    from online.ml_device_day_active_status
-    where partition_date BETWEEN '{}' AND '{}'
-    AND active_type IN ('1', '2', '4')
-    AND first_channel_source_type not IN ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
-    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
-    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
-    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
-    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
-    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
-    ,'promotion_shike','promotion_julang_jl03','promotion_zuimei')
-    AND first_channel_source_type not LIKE 'promotion\\_jf\\_%') as t2
-    ON t1.cl_id = t2.device_id
-    AND t1.partition_date = t2.partition_date
-    LEFT JOIN
-    (
-    SELECT DISTINCT device_id
-    FROM ml.ml_d_ct_dv_devicespam_d --exclude agency click-farm devices, i.e. cheating devices (filtered out of view and impression events)
-    WHERE partition_day='{}'
-    UNION ALL
-    SELECT DISTINCT device_id
-    FROM dim.dim_device_user_staff --exclude internal (staff) users
-    )spam_pv
-    on spam_pv.device_id=t1.cl_id
-    LEFT JOIN
-    (
-    SELECT partition_date,device_id
-    FROM
-    (--find the first device id each user_id was active on that day
-    SELECT user_id,partition_date,
-    if(size(device_list) > 0, device_list [ 0 ], '') AS device_id
-    FROM online.ml_user_updates
-    WHERE partition_date>='{}' AND partition_date<'{}'
-    )t1
-    JOIN
-    ( --doctor accounts
-    SELECT distinct user_id
-    FROM online.tl_hdfs_doctor_view
-    WHERE partition_date = '{}'
-    --sockpuppet accounts / model users
-    UNION ALL
-    SELECT user_id
-    FROM ml.ml_c_ct_ui_user_dimen_d
-    WHERE partition_day = '{}'
-    AND (is_puppet = 'true' or is_classifyuser = 'true')
-    UNION ALL
-    --users on the company intranet
-    select distinct user_id
-    from dim.dim_device_user_staff
-    UNION ALL
-    --devices that have logged in as a doctor
-    SELECT distinct t1.user_id
-    FROM
-    (
-    SELECT user_id, v.device_id as device_id
-    FROM online.ml_user_history_detail
-    LATERAL VIEW EXPLODE(device_history_list) v AS device_id
-    WHERE partition_date = '{}'
-    )t1
-    JOIN
-    (
-    SELECT device_id
-    FROM online.ml_device_history_detail
-    WHERE partition_date = '{}'
-    AND is_login_doctor = '1'
-    )t2
-    ON t1.device_id = t2.device_id
-    )t2
-    on t1.user_id=t2.user_id
-    group by partition_date,device_id
-    )dev
-    on t1.partition_date=dev.partition_date and t1.cl_id=dev.device_id
-    WHERE (spam_pv.device_id IS NULL or spam_pv.device_id ='')
-    and (dev.device_id is null or dev.device_id ='')
-    """.format(start, end, reg, start, end, end, start, end, end, end, end, end)
-    # print('positiveSql',flush=True)
-    # print(positiveSql,flush=True)
-    return spark.sql(positiveSql)
+from utils.date import get_ndays_before_no_minus
+from utils.spark import get_spark, get_tracate_click_data
 
+# create your tests here.
 
 if __name__ == "__main__":
     spark = get_spark("test")
-    yesterday = getNDaysBeforeNoMinus(1)
-    startDay = getNDaysBeforeNoMinus(2)
-    clickData = getClickDataForLr(spark, startDay, yesterday)
-    clickData.show(5, False)
+    click_data = get_tracate_click_data(spark, get_ndays_before_no_minus(2), get_ndays_before_no_minus(1))
+    click_data.show(5, False)
 
 # /opt/spark/bin/spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tests.py
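utils/date.py, which now supplies get_ndays_before_no_minus, is not shown in this diff. A minimal sketch of that helper, assuming it is a straight snake_case port of the deleted getNDaysBeforeNoMinus / getNDaysBeforeWithFormat above (the get_ndays_before_with_format name is a guess):

# utils/date.py -- hypothetical reconstruction, not part of this commit
from datetime import date, timedelta

def get_ndays_before_with_format(n, fmt):
    # The date n days before today, rendered with the given strftime format.
    return (date.today() + timedelta(days=-n)).strftime(fmt)

def get_ndays_before_no_minus(n):
    # "No minus": compact yyyymmdd form without dashes, e.g. "20210101".
    return get_ndays_before_with_format(n, "%Y%m%d")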