Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
S
serviceRec
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
郭羽
serviceRec
Commits
01a55b5e
Commit
01a55b5e
authored
3 years ago
by
宋柯
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
模型调试
parent
c39b62ac
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
11 additions
and
13 deletions
+11
-13
featureEngSk.py
spark/featureEngSk.py
+11
-13
No files found.
spark/featureEngSk.py
View file @
01a55b5e
...
...
@@ -511,17 +511,17 @@ def getClickSql(start, end):
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os, t1.city_id as user_city_id
FROM
(
select partition_date,
city_id,cl_id,business_id as card_id,page_stay,
cl_type
select partition_date,
city_id, cl_id, business_id as card_id,
cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND page_name='welfare_detail'
-- AND page_stay>=1
AND page_stay >= 2
AND cl_id is not null
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,
city_id,cl_id,business_id,page_stay,
cl_type
group by partition_date,
city_id, cl_id, business_id,
cl_type
) AS t1
join
( --渠道,新老
...
...
@@ -614,20 +614,20 @@ def getItemStatisticSql(start, end):
SELECT T.partition_date, T.card_id, T.label
FROM
(
SELECT
DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,
t1.city_id as user_city_id, 1 as label
SELECT
t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os,
t1.city_id as user_city_id, 1 as label
FROM
(
select partition_date,
city_id,cl_id,business_id as card_id,time_stamp,page_stay,
cl_type
select partition_date,
city_id, cl_id, business_id as card_id,
cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND page_name='welfare_detail'
-- AND page_stay>=1
AND page_stay >= 2
AND cl_id is not null
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,
city_id,cl_id,business_id,time_stamp,page_stay,
cl_type
group by partition_date,
city_id, cl_id, business_id,
cl_type
) AS t1
join
( --渠道,新老
...
...
@@ -656,10 +656,10 @@ def getItemStatisticSql(start, end):
on t3.device_id=t2.device_id
WHERE t3.device_id is null
UNION
SELECT
DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,
t1.city_id as user_city_id, 0 as label
SELECT
t1.partition_date, t1.cl_id device_id, t1.card_id, cl_type as os,
t1.city_id as user_city_id, 0 as label
from
( --新首页卡片曝光
SELECT partition_date,
city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp
SELECT partition_date,
city_id, cl_type, cl_id, card_id
FROM online.ml_community_precise_exposure_detail
where partition_date>='{startDay}' and partition_date<='{endDay}'
and action in ('page_precise_exposure','home_choiceness_card_exposure')
...
...
@@ -673,8 +673,7 @@ def getItemStatisticSql(start, end):
and card_type in ('card','video')
and card_content_type in ('service')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
group by partition_date, city_id, cl_type, cl_id, card_id, app_session_id
) t1
join
( --渠道,新老
...
...
@@ -692,7 +691,6 @@ def getItemStatisticSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( --去除黑名单
select distinct device_id
...
...
@@ -894,7 +892,7 @@ def get_click_exp_rating_df(trainDays, spark):
#曝光数据过滤掉点击数据
print("expDF 过滤点击数据前 count: ", expDF.count())
expDF = spark.sql(
"""
SELECT t1.partition_date, t1.device_id, t1.card_id, t1.time_stamp, t1.os, t1.user_city_id
SELECT t1.partition_date, t1.device_id, t1.card_id, t1.os, t1.user_city_id
FROM expDF t1
LEFT JOIN clickDF t2
ON t1.partition_date = t2.partition_date
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment