Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
86be490b
Commit
86be490b
authored
6 years ago
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add pyspark file
parent
2c1f7d0d
master
gyz
mr/beta/bug22
offic
rtt
updatedb
zhao
zhao22
No related merge requests found
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
47 additions
and
28 deletions
+47
-28
eda.py
tensnsorflow/eda.py
+22
-20
feature.py
tensnsorflow/es/feature.py
+8
-8
pyspark.py
tensnsorflow/pyspark.py
+17
-0
No files found.
tensnsorflow/eda.py
View file @
86be490b
...
@@ -14,27 +14,29 @@ def con_sql(db, sql):
...
@@ -14,27 +14,29 @@ def con_sql(db, sql):
return
result
return
result
def
test
(
days
):
#
def test(days):
start
=
(
temp
-
datetime
.
timedelta
(
days
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
#
start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
print
(
start
)
#
print(start)
sql
=
"select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) "
\
#
sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)"
.
format
(
start
,
start
)
#
"from train_data where stat_date = '{}' and z = 1)".format(start,start)
db
=
pymysql
.
connect
(
host
=
'10.66.157.22'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
#
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
exp
=
con_sql
(
db
,
sql
)
#
exp = con_sql(db, sql)
print
(
exp
)
#
print(exp)
sql
=
"select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) "
\
#
sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
"from train_data where stat_date = '{}' and z = 1)"
.
format
(
start
,
start
)
#
"from train_data where stat_date = '{}' and z = 1)".format(start,start)
click
=
con_sql
(
db
,
sql
)
#
click = con_sql(db, sql)
return
start
,
exp
,
click
#
return start,exp,click
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
temp
=
datetime
.
datetime
.
strptime
(
"2019-03-14"
,
"
%
Y-
%
m-
%
d"
)
# temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
DIRECTORY_PATH
=
"/home/gmuser/"
# DIRECTORY_PATH = "/home/gmuser/"
output_path
=
DIRECTORY_PATH
+
"esmm_train_eda.csv"
# output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
for
i
in
range
(
1
,
41
):
# for i in range(1,41):
a
,
b
,
c
=
test
(
i
)
# a,b,c = test(i)
with
open
(
output_path
,
'a+'
)
as
f
:
# with open(output_path, 'a+') as f:
line
=
str
(
a
)
+
','
+
str
(
b
)
+
','
+
str
(
c
)
+
'
\n
'
# line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
f
.
write
(
line
)
# f.write(line)
This diff is collapsed.
Click to expand it.
tensnsorflow/es/feature.py
View file @
86be490b
...
@@ -47,13 +47,13 @@ def get_data():
...
@@ -47,13 +47,13 @@ def get_data():
df
=
df
.
drop_duplicates
()
df
=
df
.
drop_duplicates
()
df
=
df
.
drop_duplicates
([
"ucity_id"
,
"clevel1_id"
,
"ccity_name"
,
"device_type"
,
"manufacturer"
,
df
=
df
.
drop_duplicates
([
"ucity_id"
,
"clevel1_id"
,
"ccity_name"
,
"device_type"
,
"manufacturer"
,
"channel"
,
"top"
,
"l1"
,
"l2"
,
"time"
,
"stat_date"
])
"channel"
,
"top"
,
"l1"
,
"l2"
,
"time"
,
"stat_date"
])
print
(
df
.
shape
)
#
print(df.shape)
print
(
"exp numbers:"
)
#
print("exp numbers:")
print
(
df
[
df
[
"y"
]
==
0
]
.
shape
)
#
print(df[df["y"] == 0].shape)
print
(
"click numbers"
)
#
print("click numbers")
print
(
df
[(
df
[
"y"
]
==
1
)
&
(
df
[
"z"
]
==
0
)]
.
shape
)
#
print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
print
(
"buy numbers"
)
#
print("buy numbers")
print
(
df
[(
df
[
"y"
]
==
1
)
&
(
df
[
"z"
]
==
1
)]
.
shape
)
#
print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
unique_values
=
[]
unique_values
=
[]
features
=
[
"ucity_id"
,
"clevel1_id"
,
"ccity_name"
,
"device_type"
,
"manufacturer"
,
features
=
[
"ucity_id"
,
"clevel1_id"
,
"ccity_name"
,
"device_type"
,
"manufacturer"
,
...
@@ -169,7 +169,7 @@ def get_predict(date,value_map):
...
@@ -169,7 +169,7 @@ def get_predict(date,value_map):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
train_data_set
=
"train_data"
train_data_set
=
"
esmm_
train_data"
path
=
"/data/esmm/"
path
=
"/data/esmm/"
date
,
value
=
get_data
()
date
,
value
=
get_data
()
get_predict
(
date
,
value
)
get_predict
(
date
,
value
)
...
...
This diff is collapsed.
Click to expand it.
tensnsorflow/pyspark.py
0 → 100644
View file @
86be490b
from
pyspark
import
SparkConf
,
SparkContext
from
pyspark.sql
import
HiveContext
def test():
    """Run a fixed sample Hive query through Spark and print a few rows.

    Creates a SparkContext with the application name "My App", wraps it in
    a HiveContext, executes a hard-coded query against
    ``online.tl_hdfs_maidian_view`` (the query itself is limited to 10
    rows) and prints at most 6 rows of the result via ``DataFrame.show``.

    Returns:
        None. The rows are written to stdout by ``show``.
    """
    conf = SparkConf().setAppName("My App")
    sc = SparkContext(conf=conf)
    try:
        hive_context = HiveContext(sc)
        query = ''' select device["device_type"] from online.tl_hdfs_maidian_view
where partition_date = '20181012' and action = "page_view"
and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 '''
        hive_context.sql(query).show(6)
    finally:
        # Only one SparkContext can be active at a time; stopping it here
        # releases its resources and lets test() be called again, even
        # when the query raises.
        sc.stop()
# Script entry point: run the sample query only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    test()
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment