Commit 86be490b authored by 张彦钊

add pyspark file

parent 2c1f7d0d
@@ -14,27 +14,29 @@ def con_sql(db, sql):
     return result
-def test(days):
-    start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
-    print(start)
-    sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start,start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    exp = con_sql(db, sql)
-    print(exp)
-    sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start,start)
-    click = con_sql(db, sql)
-    return start,exp,click
+# def test(days):
+#     start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
+#     print(start)
+#     sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start,start)
+#     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+#     exp = con_sql(db, sql)
+#     print(exp)
+#     sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start,start)
+#     click = con_sql(db, sql)
+#     return start,exp,click
 if __name__ == "__main__":
-    temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
-    DIRECTORY_PATH = "/home/gmuser/"
-    output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
-    for i in range(1,41):
-        a,b,c = test(i)
-        with open(output_path, 'a+') as f:
-            line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
-            f.write(line)
+    # temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
+    # DIRECTORY_PATH = "/home/gmuser/"
+    # output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
+    # for i in range(1,41):
+    #     a,b,c = test(i)
+    #     with open(output_path, 'a+') as f:
+    #         line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
+    #         f.write(line)
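The commented-out test() above computed two per-day EDA ratios: exposures per purchase (y = 0 rows over z = 1 rows) and clicks per purchase (y = 1, z = 0 rows over z = 1 rows). As a hedged, stand-alone sketch only (not part of this commit), the same counts can be pulled with parameterised queries instead of string .format(); the connection settings and the esmm_train_data table name are copied from the code above, and daily_ratios is a hypothetical helper name:

import pymysql

def daily_ratios(stat_date):
    # Count exposure-only, click-without-purchase, and purchase rows for one stat_date.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    counts = {}
    for name, cond in [("exp", "y = 0"),
                       ("click", "y = 1 and z = 0"),
                       ("buy", "y = 1 and z = 1")]:
        cursor.execute("select count(*) from esmm_train_data "
                       "where stat_date = %s and " + cond, (stat_date,))
        counts[name] = cursor.fetchone()[0]
    db.close()
    # Ratios are undefined on days with no purchases.
    return counts["exp"] / counts["buy"], counts["click"] / counts["buy"]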
@@ -47,13 +47,13 @@ def get_data():
     df = df.drop_duplicates()
     df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                              "channel", "top", "l1","l2", "time", "stat_date"])
-    print(df.shape)
-    print("exp numbers:")
-    print(df[df["y"] == 0].shape)
-    print("click numbers")
-    print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
-    print("buy numbers")
-    print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
+    # print(df.shape)
+    # print("exp numbers:")
+    # print(df[df["y"] == 0].shape)
+    # print("click numbers")
+    # print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
+    # print("buy numbers")
+    # print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
     unique_values = []
     features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
@@ -169,7 +169,7 @@ def get_predict(date,value_map):
 if __name__ == '__main__':
-    train_data_set = "train_data"
+    train_data_set = "esmm_train_data"
     path = "/data/esmm/"
     date,value = get_data()
     get_predict(date, value)
New file (the pyspark script added by this commit):
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

def test():
    # Set up a SparkContext and a HiveContext for querying Hive tables.
    conf = SparkConf().setAppName("My App")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    # Sample query: device_type of ten page_view events on the diary_detail
    # page (referrer = home) for partition_date 20181012.
    hive_context.sql('''select device["device_type"] from online.tl_hdfs_maidian_view
        where partition_date = '20181012' and action = "page_view"
        and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10''').show(6)

if __name__ == '__main__':
    test()
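HiveContext has been superseded by SparkSession since Spark 2.0; as a minimal sketch only (not part of this commit), the same query could be issued through a Hive-enabled SparkSession. The table, partition, and filter values are copied from the script above; the app name is arbitrary.

from pyspark.sql import SparkSession

def test():
    # Build a Hive-enabled session and run the same sample query.
    spark = (SparkSession.builder
             .appName("My App")
             .enableHiveSupport()
             .getOrCreate())
    spark.sql('''select device["device_type"] from online.tl_hdfs_maidian_view
        where partition_date = '20181012' and action = "page_view"
        and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10''').show(6)
    spark.stop()

if __name__ == '__main__':
    test()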