add pyspark file

86be490b · 张彦钊 · 2c1f7d0d · 86be490b · 86be490b · 86be490b
Commit 86be490b authored Mar 15, 2019 by 张彦钊
Hide whitespace changes
Inline Side-by-side

Showing with 47 additions and 28 deletions

eda.py tensnsorflow/eda.py +22 -20

feature.py tensnsorflow/es/feature.py +8 -8

pyspark.py tensnsorflow/pyspark.py +17 -0

No files found.
--- a/tensnsorflow/eda.py
+++ b/tensnsorflow/eda.py
@@ -14,27 +14,29 @@ def con_sql(db, sql):
    return result


-def test(days):
-    start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
-    print(start)
-    sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start,start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    exp = con_sql(db, sql)
-    print(exp)
-    sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
-          "from train_data where stat_date = '{}' and z = 1)".format(start,start)
-    click = con_sql(db, sql)
-    return start,exp,click
+# def test(days):
+#     start = (temp - datetime.timedelta(days)).strftime("%Y-%m-%d")
+#     print(start)
+#     sql = "select (select count(*) from esmm_train_data where stat_date = '{}' and y = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start,start)
+#     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+#     exp = con_sql(db, sql)
+#     print(exp)
+#     sql = "select (select count(*) from train_data where stat_date = '{}' and y = 1 and z = 0)/(select count(*) " \
+#           "from train_data where stat_date = '{}' and z = 1)".format(start,start)
+#     click = con_sql(db, sql)
+#     return start,exp,click


 if __name__ == "__main__":
-    temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
-    DIRECTORY_PATH = "/home/gmuser/"
-    output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
-    for i in range(1,41):
-        a,b,c = test(i)
-        with open(output_path, 'a+') as f:
-            line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
-            f.write(line)
+    # temp = datetime.datetime.strptime("2019-03-14", "%Y-%m-%d")
+    # DIRECTORY_PATH = "/home/gmuser/"
+    # output_path = DIRECTORY_PATH + "esmm_train_eda.csv"
+    # for i in range(1,41):
+    #     a,b,c = test(i)
+    #     with open(output_path, 'a+') as f:
+    #         line = str(a) + ',' + str(b)+ ',' + str(c) + '\n'
+    #         f.write(line)
+    pass  # placeholder statement: an `if` suite holding only comments is a syntax error
+

--- a/tensnsorflow/es/feature.py
+++ b/tensnsorflow/es/feature.py
@@ -47,13 +47,13 @@ def get_data():
    df = df.drop_duplicates()
    df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "l1","l2", "time", "stat_date"])
-    print(df.shape)
-    print("exp numbers:")
-    print(df[df["y"] == 0].shape)
-    print("click numbers")
-    print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
-    print("buy numbers")
-    print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
+    # print(df.shape)
+    # print("exp numbers:")
+    # print(df[df["y"] == 0].shape)
+    # print("click numbers")
+    # print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
+    # print("buy numbers")
+    # print(df[(df["y"] == 1) & (df["z"] == 1)].shape)

    unique_values = []
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
@@ -169,7 +169,7 @@ def get_predict(date,value_map):


 if __name__ == '__main__':
-    train_data_set = "train_data"
+    train_data_set = "esmm_train_data"
    path = "/data/esmm/"
    date,value = get_data()
    get_predict(date, value)

--- a/tensnsorflow/pyspark.py
+++ b/tensnsorflow/pyspark.py
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import HiveContext
+
+
+
+def test():
+    """Run a sample Hive SQL query on online.tl_hdfs_maidian_view and print up to six result rows."""
+    spark_conf = SparkConf().setAppName("My App")
+    spark_ctx = SparkContext(conf=spark_conf)
+    query = ''' select device["device_type"] from online.tl_hdfs_maidian_view 
+        where partition_date = '20181012' and action = "page_view" 
+        and params["page_name"] = "diary_detail" and params["referrer"] = "home" limit 10 '''
+    HiveContext(spark_ctx).sql(query).show(6)
+
+
+if __name__ == '__main__':
+    test()  # script entry point: run the sample Hive query once