Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
ff2b30cb
Commit
ff2b30cb
authored
Feb 24, 2020
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add
parent
57dd48b1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
249 deletions
+12
-249
make_data.py
make_data.py
+12
-249
No files found.
make_data.py
View file @
ff2b30cb
...
@@ -3,153 +3,19 @@ import pymysql
...
@@ -3,153 +3,19 @@ import pymysql
import
pandas
as
pd
import
pandas
as
pd
def exp(date_str="20200222"):
    """Print the overlap between doctor-linked and ranked merchant ids.

    Runs two queries for the given partition date:

    * merchant ids reached from ``statistic_doctor_rank_factor`` through a
      left join on ``hippo_merchantrelevance``;
    * merchant ids listed in ``statistic_merchant_rank_factor``.

    For each id list it prints the length and the first six entries, then
    prints the size of the intersection of the two lists.

    Args:
        date_str: Partition date compared against ``partition_date`` in both
            queries. Defaults to the previously hard-coded "20200222".
    """
    doctor_sql = (
        "select b.merchant_id "
        "from statistic_doctor_rank_factor d "
        "left join hippo_merchantrelevance b on d.doctor_id = b.doctor_id "
        "where d.partition_date = %s;"
    )
    merchant_sql = (
        "select merchant_id from statistic_merchant_rank_factor "
        "where partition_date = %s;"
    )

    # NOTE(review): database credentials are hard-coded in source; they
    # should be moved to configuration / environment and rotated.
    db = pymysql.connect(host='172.16.30.141', port=3306, user='work',
                         passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
    try:
        with db.cursor() as cursor:
            # Let the driver bind the date instead of formatting it into SQL.
            cursor.execute(doctor_sql, (date_str,))
            doctor_rows = cursor.fetchall()
            cursor.execute(merchant_sql, (date_str,))
            merchant_rows = cursor.fetchall()
    finally:
        # Close the connection even when a query raises.
        db.close()

    # The left join yields NULL for doctors without a merchant; skip those.
    # Ids are normalised to integer strings directly from the rows, so an
    # empty result yields an empty list instead of a KeyError.
    h_merchant_id = [str(int(row[0])) for row in doctor_rows
                     if row[0] is not None]
    print(len(h_merchant_id))
    print(h_merchant_id[:6])

    # Converting straight from the row values keeps integer ids as "123";
    # going through a DataFrame can upcast them to float ("123.0") when the
    # column contains NULLs, which would never match the ids above.
    s_merchant_id = [str(row[0]) for row in merchant_rows]
    print(len(s_merchant_id))
    print(s_merchant_id[:6])

    print(len(set(s_merchant_id) & set(h_merchant_id)))
def _doctor_scores(df):
    """Drop zero-exposure rows and add ctr / commission / pv_ad / score.

    Expects the merged doctor + merchant frame built by ``doctor``. Prints
    the frame shape after each filtering step and returns the scored frame.
    """
    numeric_cols = ["service_exposure_pv_30", "service_ctr_30",
                    "expert_exposure_pv_30", "expert_pv_30",
                    "doctor_discount_30_days", "expand_rechange_amount_30",
                    "service_pv_30", "mexpert_pv_30", "organization_pv_30"]
    df = df.astype({col: "float" for col in numeric_cols})

    df["all_exposure"] = (df["service_exposure_pv_30"]
                          + df["expert_exposure_pv_30"])

    # Rows with a zero denominator are removed before any division below.
    df = df[df["expert_exposure_pv_30"] != 0.0]
    print("expert_exposure_pv_30")
    print(df.shape)

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the previous one.
    df = df[df["all_exposure"] != 0.0].copy()
    print("all_exposure")
    print(df.shape)

    df["tmp"] = (df["service_pv_30"] + df["mexpert_pv_30"]
                 + df["organization_pv_30"])
    df = df[df["tmp"] != 0.0].copy()
    print("tmp")
    print(df.shape)

    # Exposure-weighted blend of the service ctr and the expert ratio
    # (expert_pv_30 / expert_exposure_pv_30).
    ctr = (df["service_exposure_pv_30"] / df["all_exposure"]
           * df["service_ctr_30"]
           + df["expert_exposure_pv_30"] / df["all_exposure"]
           * (df["expert_pv_30"] / df["expert_exposure_pv_30"]))

    # Negative amounts are floored at zero before being used as numerators.
    df["doctor_discount_30_days"] = df["doctor_discount_30_days"].clip(lower=0)
    df["expand_rechange_amount_30"] = (
        df["expand_rechange_amount_30"].clip(lower=0))

    # Rows with all_exposure <= 1500 get the floor ctr; every ctr is then
    # bounded to [0.01, 0.2].
    df["ctr"] = ctr.mask(df["all_exposure"] <= 1500, 0.01).clip(0.01, 0.2)
    df["commission"] = (df["doctor_discount_30_days"]
                        / df["tmp"]).clip(0.01, 20)
    df["pv_ad"] = (df["expand_rechange_amount_30"] / df["tmp"]).clip(0.01, 20)

    df["score"] = df["ctr"] ** 0.5 * (df["commission"] + df["pv_ad"])
    return df


def doctor(date_str="20200222", output_path='/tmp/1_doctor.csv'):
    """Build the per-doctor score table and write it to a CSV file.

    Loads doctor rank factors (joined to their merchant id) and merchant
    rank factors for one partition date, merges them on ``merchant_id``,
    computes ``ctr``, ``commission``, ``pv_ad`` and ``score``, and writes the
    de-duplicated result without the index. Frame shapes and a few sample
    rows are printed along the way.

    Args:
        date_str: Partition date compared against ``partition_date`` in both
            queries. Defaults to the previously hard-coded "20200222".
        output_path: Destination CSV path. Defaults to the previously
            hard-coded '/tmp/1_doctor.csv'.
    """
    doctor_cols = ["doctor_id", "service_exposure_pv_30", "service_ctr_30",
                   "expert_exposure_pv_30", "expert_pv_30", "merchant_id"]
    # The merchant table's expert_pv_30 is named mexpert_pv_30 so it does not
    # collide with the doctor column of the same name in the merge.
    merchant_cols = ["merchant_id", "doctor_discount_30_days",
                     "expand_rechange_amount_30", "service_pv_30",
                     "mexpert_pv_30", "organization_pv_30"]

    doctor_sql = (
        "select d.doctor_id,d.service_exposure_pv_30,d.service_ctr_30,"
        "d.expert_exposure_pv_30,"
        "d.expert_pv_30,b.merchant_id from statistic_doctor_rank_factor d "
        "left join hippo_merchantrelevance b on d.doctor_id = b.doctor_id "
        "where d.partition_date = %s;"
    )
    merchant_sql = (
        "select merchant_id,doctor_discount_30_days,expand_rechange_amount_30,"
        "service_pv_30,expert_pv_30,organization_pv_30 "
        "from statistic_merchant_rank_factor "
        "where partition_date = %s;"
    )

    # NOTE(review): database credentials are hard-coded in source; they
    # should be moved to configuration / environment and rotated.
    db = pymysql.connect(host='172.16.30.141', port=3306, user='work',
                         passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
    try:
        with db.cursor() as cursor:
            # Let the driver bind the date instead of formatting it into SQL.
            cursor.execute(doctor_sql, (date_str,))
            doctor_rows = cursor.fetchall()
            cursor.execute(merchant_sql, (date_str,))
            merchant_rows = cursor.fetchall()
    finally:
        # Close the connection even when a query raises.
        db.close()

    # Passing columns= keeps the named columns present even when a query
    # returns no rows.
    df = pd.DataFrame(list(doctor_rows), columns=doctor_cols)
    print(df.shape)
    # The left join yields NULL merchant ids for unlinked doctors.
    df = df.dropna(subset=["merchant_id"])
    print("drop")
    print(df.shape)
    print(df.head(6))

    tmp = pd.DataFrame(list(merchant_rows), columns=merchant_cols)
    print("tmp")
    print(tmp.shape)
    print(tmp.head(6))

    # Both sides are compared as strings; the doctor side is cast through
    # int64 first so its ids are plain integer strings.
    df["merchant_id"] = df["merchant_id"].astype('int64').astype("str")
    tmp["merchant_id"] = tmp["merchant_id"].astype("str")
    df = pd.merge(df, tmp, on='merchant_id')
    print("merge")
    print(df.shape)

    df = _doctor_scores(df)
    print(df.shape)

    columns = ["doctor_id", "score", "ctr", "commission", "pv_ad",
               "service_exposure_pv_30", "service_ctr_30",
               "expert_exposure_pv_30", "expert_pv_30", "merchant_id",
               "doctor_discount_30_days", "expand_rechange_amount_30",
               "service_pv_30", "mexpert_pv_30", "organization_pv_30"]
    data = df.loc[:, columns].drop_duplicates()
    print(data.shape)
    data.to_csv(output_path, index=False)
    print("doctor end")
def
hospital
():
def
hospital
():
date_str
=
"20200101"
date_str
=
"20200101"
# date_str = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y%m%d")
# date_tmp = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
sql
=
"select api.id,"
\
sql
=
"select api.id,"
\
"h.hospital_id,h.hospital_exposure_pv_30,h.service_exposure_pv_30,h.expert_exposure_pv_30,"
\
"h.hospital_id,h.hospital_exposure_pv_30,h.service_exposure_pv_30,h.expert_exposure_pv_30,"
\
"h.service_ctr_30,h.hospital_ctr_30,h.expert_ctr_30,"
\
"h.service_ctr_30,h.hospital_ctr_30,h.expert_ctr_30,b.merchant_id,"
\
"b.merchant_id "
\
"m.doctor_discount_30_days,m.expand_rechange_amount_30,"
\
"from api_doctor api left join statistic_hospital_rank_factor h on api.hospital_id = h.hospital_id "
\
"m.service_pv_30,m.expert_pv_30,m.organization_pv_30 "
\
"from api_doctor api "
\
"left join statistic_hospital_rank_factor h on api.hospital_id = h.hospital_id "
\
"left join hippo_merchantrelevance b on api.id = b.doctor_id "
\
"left join hippo_merchantrelevance b on api.id = b.doctor_id "
\
"where api.doctor_type = 1 and h.date = '{}';"
.
format
(
date_str
)
"left join statistic_merchant_rank_factor m on b.merchant_id = m.merchant_id"
\
"where api.doctor_type = 1 and h.date = '{}' and m.partition_date = '{}';"
.
format
(
date_str
,
date_str
)
db
=
pymysql
.
connect
(
host
=
'172.16.30.141'
,
port
=
3306
,
user
=
'work'
,
passwd
=
'BJQaT9VzDcuPBqkd'
,
db
=
'zhengxing'
)
db
=
pymysql
.
connect
(
host
=
'172.16.30.141'
,
port
=
3306
,
user
=
'work'
,
passwd
=
'BJQaT9VzDcuPBqkd'
,
db
=
'zhengxing'
)
cursor
=
db
.
cursor
()
cursor
=
db
.
cursor
()
...
@@ -157,36 +23,14 @@ def hospital():
...
@@ -157,36 +23,14 @@ def hospital():
result
=
cursor
.
fetchall
()
result
=
cursor
.
fetchall
()
df
=
pd
.
DataFrame
(
list
(
result
))
df
=
pd
.
DataFrame
(
list
(
result
))
name
=
[
"doctor_id"
,
"hospital_id"
,
"hospital_exposure_pv_30"
,
"service_exposure_pv_30"
,
name
=
[
"doctor_id"
,
"hospital_id"
,
"hospital_exposure_pv_30"
,
"service_exposure_pv_30"
,
"expert_exposure_pv_30"
,
"service_ctr_30"
,
"hospital_ctr_30"
,
"expert_ctr_30"
,
"merchant_id"
]
"expert_exposure_pv_30"
,
"service_ctr_30"
,
"hospital_ctr_30"
,
"expert_ctr_30"
,
"merchant_id"
,
"doctor_discount_30_days"
,
"expand_rechange_amount_30"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
]
df
=
df
.
rename
(
columns
=
dict
(
zip
(
list
(
range
(
len
(
name
))),
name
)))
df
=
df
.
rename
(
columns
=
dict
(
zip
(
list
(
range
(
len
(
name
))),
name
)))
print
(
"df"
)
print
(
"df"
)
print
(
df
.
shape
)
print
(
df
.
shape
)
df
=
df
.
dropna
(
subset
=
[
"merchant_id"
])
print
(
"drop"
)
print
(
df
.
shape
)
sql
=
"select merchant_id,doctor_discount_30_days,"
\
"service_pv_30,expert_pv_30,organization_pv_30,expand_rechange_amount_30 from statistic_merchant_rank_factor "
\
"where partition_date = '{}';"
.
format
(
date_str
)
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
result
=
cursor
.
fetchall
()
db
.
close
()
tmp
=
pd
.
DataFrame
(
list
(
result
))
name
=
[
"merchant_id"
,
"doctor_discount_30_days"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
,
"expand_rechange_amount_30"
]
tmp
=
tmp
.
rename
(
columns
=
dict
(
zip
(
list
(
range
(
len
(
name
))),
name
)))
print
(
"tmp"
)
print
(
tmp
.
shape
)
df
[
"merchant_id"
]
=
df
[
"merchant_id"
]
.
astype
(
'int64'
)
df
[
"merchant_id"
]
=
df
[
"merchant_id"
]
.
astype
(
"str"
)
tmp
[
"merchant_id"
]
=
tmp
[
"merchant_id"
]
.
astype
(
"str"
)
df
=
pd
.
merge
(
df
,
tmp
,
on
=
'merchant_id'
)
print
(
"merge"
)
print
(
df
.
shape
)
for
i
in
[
"hospital_exposure_pv_30"
,
"service_exposure_pv_30"
,
"expert_exposure_pv_30"
,
for
i
in
[
"hospital_exposure_pv_30"
,
"service_exposure_pv_30"
,
"expert_exposure_pv_30"
,
"service_ctr_30"
,
"hospital_ctr_30"
,
"expert_ctr_30"
,
"service_ctr_30"
,
"hospital_ctr_30"
,
"expert_ctr_30"
,
"doctor_discount_30_days"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
,
"doctor_discount_30_days"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
,
...
@@ -232,90 +76,8 @@ def hospital():
...
@@ -232,90 +76,8 @@ def hospital():
data
=
data
.
drop_duplicates
()
data
=
data
.
drop_duplicates
()
print
(
data
.
head
(
6
))
data
.
to_csv
(
'/tmp/1_hospital.csv'
,
index
=
False
)
def
old
():
date_str
=
(
datetime
.
datetime
.
now
()
-
datetime
.
timedelta
(
days
=
4
))
.
strftime
(
"
%
Y
%
m
%
d"
)
date_tmp
=
(
datetime
.
datetime
.
now
()
-
datetime
.
timedelta
(
days
=
4
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
print
(
date_str
)
sql
=
"select d.doctor_id,d.service_exposure_pv_30,d.service_ctr_30,d.expert_exposure_pv_30,d.expert_pv_30,"
\
"b.merchant_id,budan.budan_payment_30_days "
\
"from statistic_doctor_rank_factor d "
\
"left join hippo_merchantrelevance b on d.doctor_id = b.doctor_id "
\
"left join al_meigou_service_smart_rank_budan_payment budan on b.merchant_id = budan.merchant_id "
\
"where d.partition_date = '{}' and budan.stat_date = '{}';"
.
format
(
date_str
,
date_tmp
)
db
=
pymysql
.
connect
(
host
=
'172.16.30.141'
,
port
=
3306
,
user
=
'work'
,
passwd
=
'BJQaT9VzDcuPBqkd'
,
db
=
'zhengxing'
)
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
result
=
cursor
.
fetchall
()
df
=
pd
.
DataFrame
(
list
(
result
))
name
=
[
"doctor_id"
,
"service_exposure_pv_30"
,
"service_ctr_30"
,
"expert_exposure_pv_30"
,
"expert_pv_30"
,
"merchant_id"
,
"budan_payment_30_days"
]
df
=
df
.
rename
(
columns
=
dict
(
zip
(
list
(
range
(
len
(
name
))),
name
)))
sql
=
"select merchant_id,doctor_discount_30_days,expand_rechange_amount_30,"
\
"service_pv_30,expert_pv_30,organization_pv_30 from statistic_merchant_rank_factor "
\
"where partition_date = '{}';"
.
format
(
date_str
)
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
result
=
cursor
.
fetchall
()
db
.
close
()
tmp
=
pd
.
DataFrame
(
list
(
result
))
name
=
[
"merchant_id"
,
"doctor_discount_30_days"
,
"expand_rechange_amount_30"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
]
tmp
=
tmp
.
rename
(
columns
=
dict
(
zip
(
list
(
range
(
len
(
name
))),
name
)))
df
[
"merchant_id"
]
=
df
[
"merchant_id"
]
.
astype
(
"str"
)
tmp
[
"merchant_id"
]
=
tmp
[
"merchant_id"
]
.
astype
(
"str"
)
df
=
pd
.
merge
(
df
,
tmp
,
on
=
'merchant_id'
)
for
i
in
[
"service_exposure_pv_30"
,
"service_ctr_30"
,
"expert_exposure_pv_30"
,
"expert_pv_30"
,
"doctor_discount_30_days"
,
"expand_rechange_amount_30"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
,
"budan_payment_30_days"
]:
df
[
i
]
=
df
[
i
]
.
astype
(
"float"
)
df
[
"all_exposure"
]
=
df
[
"service_exposure_pv_30"
]
+
df
[
"expert_exposure_pv_30"
]
df
=
df
[
~
df
[
"expert_exposure_pv_30"
]
.
isin
([
0.0
])]
df
=
df
[
~
df
[
"all_exposure"
]
.
isin
([
0.0
])]
df
[
"tmp"
]
=
df
[
"service_pv_30"
]
+
df
[
"mexpert_pv_30"
]
+
df
[
"organization_pv_30"
]
df
=
df
[
~
df
[
"tmp"
]
.
isin
([
0.0
])]
print
(
"aaaaaaaa"
)
df
[
"ctr"
]
=
df
[
"service_exposure_pv_30"
]
/
df
[
"all_exposure"
]
*
df
[
"service_ctr_30"
]
+
\
df
[
"expert_exposure_pv_30"
]
/
df
[
"all_exposure"
]
*
(
df
[
"expert_pv_30"
]
/
df
[
"expert_exposure_pv_30"
])
df
.
loc
[
df
[
"doctor_discount_30_days"
]
<
0
,
[
"doctor_discount_30_days"
]]
=
0
df
.
loc
[
df
[
"budan_payment_30_days"
]
<
0
,
[
"budan_payment_30_days"
]]
=
0
df
.
loc
[
df
[
"expand_rechange_amount_30"
]
<
0
,
[
"expand_rechange_amount_30"
]]
=
0
df
[
"commission"
]
=
(
df
[
"doctor_discount_30_days"
]
+
df
[
"budan_payment_30_days"
])
/
df
[
"tmp"
]
df
[
"pv_ad"
]
=
df
[
"expand_rechange_amount_30"
]
/
df
[
"tmp"
]
df
.
loc
[
df
[
"all_exposure"
]
<=
1500
,
[
"ctr"
]]
=
0.01
df
.
loc
[
df
[
"ctr"
]
<
0.01
,
[
"ctr"
]]
=
0.01
df
.
loc
[
df
[
"ctr"
]
>
0.2
,
[
"ctr"
]]
=
0.2
df
.
loc
[
df
[
"commission"
]
>
20
,
[
"commission"
]]
=
20
df
.
loc
[
df
[
"commission"
]
<
0.01
,
[
"commission"
]]
=
0.01
df
.
loc
[
df
[
"pv_ad"
]
>
20
,
[
"pv_ad"
]]
=
20
df
.
loc
[
df
[
"pv_ad"
]
<
0.01
,
[
"pv_ad"
]]
=
0.01
df
[
"score"
]
=
df
[
"ctr"
]
**
0.5
*
(
df
[
"commission"
]
+
df
[
"pv_ad"
])
columns
=
[
"doctor_id"
,
"score"
,
"ctr"
,
"commission"
,
"pv_ad"
,
"service_exposure_pv_30"
,
"service_ctr_30"
,
"expert_exposure_pv_30"
,
"expert_pv_30"
,
"merchant_id"
,
"doctor_discount_30_days"
,
"expand_rechange_amount_30"
,
"service_pv_30"
,
"mexpert_pv_30"
,
"organization_pv_30"
,
"budan_payment_30_days"
]
data
=
df
.
loc
[:,
columns
]
data
=
data
.
drop_duplicates
()
print
(
data
.
shape
)
print
(
data
.
shape
)
data
.
to_csv
(
'/tmp/6_doctor.csv'
,
index
=
False
)
data
.
to_csv
(
'/tmp/21_hospital.csv'
,
index
=
False
)
print
(
"doctor end"
)
def
new_doctor
():
def
new_doctor
():
...
@@ -394,6 +156,7 @@ if __name__ == "__main__":
...
@@ -394,6 +156,7 @@ if __name__ == "__main__":
# old()
# old()
new_doctor
()
new_doctor
()
hospital
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment