Commit 86be490b authored by 张彦钊

add pyspark file

parent 2c1f7d0d
@@ -14,27 +14,29 @@ def con_sql(db, sql):
     return result
-def test(days):
-    start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
-    print(start)
-    sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start, start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    exp = con_sql(db, sql)
-    print(exp)
-    sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start, start)
-    click = con_sql(db, sql)
-    return start, exp, click
+# def test(days):
+#     start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
+#     print(start)
+#     sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start, start)
+#     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+#     exp = con_sql(db, sql)
+#     print(exp)
+#     sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start, start)
+#     click = con_sql(db, sql)
+#     return start, exp, click
 if __name__ == "__main__":
-    temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
-    DIRECTORY_PATH = "/home/gmuser/"
-    output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
-    for i in range(1, 41):
-        a, b, c = test(i)
-        with open(output_path, 'a+') as f:
-            line = str(a) + ',' + str(b) + ',' + str(c) + '\n'
-            f.write(line)
+    # temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
+    # DIRECTORY_PATH = "/home/gmuser/"
+    # output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
+    # for i in range(1, 41):
+    #     a, b, c = test(i)
+    #     with open(output_path, 'a+') as f:
+    #         line = str(a) + ',' + str(b) + ',' + str(c) + '\n'
+    #         f.write(line)
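
For context outside the diff: each query above returns a single ratio, so the con_sql helper whose tail ("return result") opens the hunk presumably executes the statement and fetches one scalar. A minimal sketch under that assumption — the helper's real body is collapsed out of the hunk, so treat this as a hypothetical reconstruction, not the author's code:

import datetime

import pymysql


def con_sql(db, sql):
    # Assumed behaviour: run the query and return the single scalar it yields.
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    db.close()
    return result


# Example: the exposure ratio for the day before 2019-03-14, mirroring one
# iteration of the (now commented-out) test(days) loop above.
temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
start = (temp - datetime.timedelta(1)).strftime("%Y-%m-%d")
db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = ("select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)"
       "/(select count(*) from train_data where stat_date = '{}' and z = 1)").format(start, start)
print(con_sql(db, sql))
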
@@ -47,13 +47,13 @@ def get_data():
     df = df.drop_duplicates()
     df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "l1", "l2", "time", "stat_date"])
-    print(df.shape)
-    print("exp numbers:")
-    print(df[df["y"] == 0].shape)
-    print("click numbers")
-    print(df[(df["y"] == 1) & (df["z"] == 0)].shape)
-    print("buy numbers")
-    print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
+    # print(df.shape)
+    # print("exp numbers:")
+    # print(df[df["y"] == 0].shape)
+    # print("click numbers")
+    # print(df[(df["y"] == 1) & (df["z"] == 0)].shape)
+    # print("buy numbers")
+    # print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
     unique_values = []
     features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
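
The commented-out prints above encode the label scheme used throughout this script: y appears to mark a click and z a purchase, so y == 0 rows are exposures without a click, y == 1 and z == 0 are clicks without a purchase, and y == 1 and z == 1 are purchases. A self-contained sketch of the same counts on toy data, assuming that reading of the labels:

import pandas as pd

# Toy frame with the two ESMM labels: y (click) and z (conversion/purchase).
df = pd.DataFrame({"y": [0, 0, 1, 1, 1], "z": [0, 0, 0, 0, 1]})

print("exp numbers:", (df["y"] == 0).sum())                       # exposures, no click
print("click numbers:", ((df["y"] == 1) & (df["z"] == 0)).sum())  # clicks, no purchase
print("buy numbers:", ((df["y"] == 1) & (df["z"] == 1)).sum())    # purchases
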
@@ -169,7 +169,7 @@ def get_predict(date,value_map):
 if __name__ == '__main__':
-    train_data_set = "train_data"
+    train_data_set = "esmm_train_data"
     path = "/data/esmm/"
     date, value = get_data()
     get_predict(date, value)
New file added by this commit (the pyspark file):

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def test():
    conf = SparkConf().setAppName("My App")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    hive_context.sql(''' select device["device_type"] from online.tl_hdfs_maidian_view
        where partition_date = '20181012' and action = "page_view"
        and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 ''').show(6)


if __name__ == '__main__':
    test()
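
The new file drives Hive through the pre-2.0 SparkConf/SparkContext/HiveContext trio, which still works but has been superseded. For comparison, a sketch of the same query through the unified SparkSession entry point, assuming the cluster exposes the Hive metastore (Spark >= 2.0):

from pyspark.sql import SparkSession


def test():
    # SparkSession replaces SparkConf/SparkContext/HiveContext as the single
    # entry point; enableHiveSupport() wires in the Hive metastore tables.
    spark = (SparkSession.builder
             .appName("My App")
             .enableHiveSupport()
             .getOrCreate())
    spark.sql(''' select device["device_type"] from online.tl_hdfs_maidian_view
        where partition_date = '20181012' and action = "page_view"
        and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 ''').show(6)


if __name__ == '__main__':
    test()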