Commit 80a52b52 authored by 张彦钊

change

parent 7389bd28
import pandas as pd

from utils import *
from config import *


if __name__ == "__main__":
    # The test set is in FFM format; the first character of each row is the label.
    test = pd.read_csv(DIRECTORY_PATH + "test_ffm_data.csv", header=None)
    test_label = test[0].apply(lambda x: x[0]).values
    # Model predictions written for the test set, one score per row.
    predict = pd.read_csv(DIRECTORY_PATH + "test_set_predict_output.txt", header=None)[0].values
    get_roc_curve(test_label, predict, "1")
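`get_roc_curve` is imported from `utils` and not shown in this commit; a minimal sketch of an equivalent ROC/AUC check with scikit-learn, matching only the call signature above (everything else is an assumption, not the author's implementation):

```python
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc


def get_roc_curve(y_true, y_score, name):
    # Labels arrive as the first character of each FFM row, so cast them to int.
    y_true = np.asarray(y_true, dtype=int)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label="model {} (AUC = {:.3f})".format(name, roc_auc))
    plt.plot([0, 1], [0, 1], linestyle="--", color="grey")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.legend()
    plt.savefig("roc_{}.png".format(name))
```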
import os
import time

from config import *


# Periodically delete feature files inside a given directory.
def remove_files(fileDir):
    for eachFile in os.listdir(fileDir):
        condition_a = os.path.isfile(fileDir + "/" + eachFile)
        condition_b = ("DiaryTop3000.csv" in eachFile) or ("output.txt" in eachFile) or ("feed" in eachFile)
        if condition_a and condition_b:
            ft = os.stat(fileDir + "/" + eachFile)
            ltime = int(ft.st_mtime)
            # Delete files last modified more than 5 minutes ago.
            ntime = int(time.time()) - 5 * 60
            if ltime <= ntime:
                os.remove(fileDir + "/" + eachFile)


# Remove xlearn temporary log files from /tmp.
def delete_log():
    for eachFile in os.listdir("/tmp"):
        if "xlearn" in eachFile:
            os.remove("/tmp" + "/" + eachFile)


if __name__ == "__main__":
    while True:
        delete_log()
        remove_files(DIRECTORY_PATH + "result")
        print("completed one cleanup pass")
        time.sleep(5 * 60)
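The string concatenation `fileDir + "/" + eachFile` assumes `fileDir` has no trailing slash; a slightly more defensive variant of the same cleanup using `os.path.join` (a sketch under that assumption, not part of the commit):

```python
import os
import time


def remove_files(file_dir, max_age_seconds=5 * 60):
    # Delete matching feature files older than max_age_seconds.
    now = time.time()
    for name in os.listdir(file_dir):
        path = os.path.join(file_dir, name)
        matches = ("DiaryTop3000.csv" in name) or ("output.txt" in name) or ("feed" in name)
        if os.path.isfile(path) and matches and os.stat(path).st_mtime <= now - max_age_seconds:
            os.remove(path)
```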
@@ -8,6 +8,7 @@ import time
from pyspark import StorageLevel
def all_click(x):
    total = []
    sum = 0
@@ -136,7 +137,137 @@ def cpc_click(x):
    return total
# All meigou clicks per channel for day (today - x), filtered by device_type (os).
def os_all_click(x, os):
    total = []
    sum = 0
    date = (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y%m%d")

    print("meigou search clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'search_result_welfare_click_item' "
                    "and app['version'] >='7.14.0' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    print("meigou home page related recommendations")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' and params['from'] = 'welfare_home_list_item' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    home_page_sum = 0
    print("home page 'All' icon list - meigou card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '6' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp

    print("home page category icon list - meigou card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' "
                    "and params['from'] = 'category' and params['cpc_referer'] = '19' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp
    total.append(home_page_sum)
    sum = sum + home_page_sum

    meigou_homepage_sum = 0
    print("meigou home page 'All' clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '21' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_homepage_sum = meigou_homepage_sum + tmp

    print("meigou home page icon clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '18' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_homepage_sum = meigou_homepage_sum + tmp
    total.append(meigou_homepage_sum)
    sum = sum + meigou_homepage_sum

    total.append(sum)
    return total
# CPC-only (params['is_cpc'] = '1') meigou clicks per channel for day (today - x), filtered by device_type (os).
def os_cpc_click(x, os):
    total = []
    sum = 0
    date = (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y%m%d")

    print("meigou search clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'search_result_welfare_click_item' "
                    "and app['version'] >='7.14.0' and params['is_cpc'] = '1' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    print("meigou home page related recommendations")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' and params['from'] = 'welfare_home_list_item' "
                    "and params['is_cpc'] = '1' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    home_page_sum = 0
    print("home page 'All' icon list - meigou card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '6' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp

    print("home page category icon list - meigou card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' "
                    "and params['from'] = 'category' and params['cpc_referer'] = '19' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp
    total.append(home_page_sum)
    sum = sum + home_page_sum

    meigou_home_sum = 0
    print("meigou home page 'All' clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '21' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_home_sum = meigou_home_sum + tmp

    print("meigou home page icon clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '18' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_home_sum = meigou_home_sum + tmp
    total.append(meigou_home_sum)
    sum = sum + meigou_home_sum

    total.append(sum)
    return total
if __name__ == '__main__':
@@ -149,21 +280,26 @@ if __name__ == '__main__':
.set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
all_list = []
for i in range(1,27):
date_str = (datetime.date.today() - datetime.timedelta(days=i)).strftime("%Y%m%d")
tmp_list = [date_str]
tmp_list.extend(all_click(i))
tmp_list.extend(cpc_click(i))
all_list.append(tmp_list)
df = pd.DataFrame(all_list)
df = df.rename(columns={0: "date",1: "search", 2: "xiangguan",3:"home",4:"service_home",
5: "all_clcik",
6: "cpc_search", 7: "cpc_xiangguan",8:"cpc_home",9:"cpc_service_home",
10:"cpc_all"})
df.to_csv('/home/gmuser/cpc.csv', index=False)
for os in ["ios","android"]:
all_list = []
for i in range(1,27):
date_str = (datetime.date.today() - datetime.timedelta(days=i)).strftime("%Y%m%d")
tmp_list = [date_str]
tmp_list.extend(os_all_click(i,os))
tmp_list.extend(os_cpc_click(i,os))
all_list.append(tmp_list)
df = pd.DataFrame(all_list)
df = df.rename(columns={0: "date",1: "search", 2: "xiangguan",3:"home",4:"service_home",
5: "all_clcik",
6: "cpc_search", 7: "cpc_xiangguan",8:"cpc_home",9:"cpc_service_home",
10:"cpc_all"})
df.to_csv('/home/gmuser/cpc_{}.csv'.format(os), index=False)
# df = df.rename(columns={0: "date",1: "search", 2: "xiangguan",3:"home",4:"service_home",
# 5: "all_clcik",
# 6: "cpc_search", 7: "cpc_xiangguan",8:"cpc_home",9:"cpc_service_home",
# 10:"cpc_all"})
# df.to_csv('/home/gmuser/cpc.csv', index=False)
spark.stop()
import numpy as np, pandas as pd
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
import geopandas
import shapefile
from matplotlib import pyplot as plt

# Load user locations and keep only the latitude/longitude columns.
data = pd.read_csv("/Users/mac/Downloads/location.csv")
data.drop(["device_id", "partition_date"], axis=1, inplace=True)
data = data[["lat", "lng"]]
data = data.to_numpy().astype("float32", copy=False)  # convert to array

plt.title("beijing location")
plt.scatter(data[:, 0], data[:, 1], s=1, c="black", marker='.')

# Beijing border shapefiles; shape_path, shape_path_2huan and shape_path_5huan
# are expected to be defined elsewhere (they are not set in this excerpt).
border_shape = shapefile.Reader(shape_path)
border_shape_2 = shapefile.Reader(shape_path_2huan)
border_shape_5 = shapefile.Reader(shape_path_5huan)
border = border_shape.shapes()
border_2 = border_shape_2.shapes()
border_5 = border_shape_5.shapes()
# Centroid of a cluster's member points.
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    print(centroid)
    return tuple(centroid)
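# The `clusters` variable rendered below is not defined in this excerpt; a minimal
# sketch of how it might be produced with scikit-learn's DBSCAN (the eps and
# min_samples values are assumptions, not taken from the commit):
db = DBSCAN(eps=0.01, min_samples=15).fit(data)
labels = db.labels_
# Collect the member coordinates of each cluster, skipping the noise label -1.
clusters = [data[labels == label] for label in set(labels) if label != -1]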
# Render the clustering result, one scatter color per cluster.
for border_detail in clusters:
    x, y = [], []
    for cell in border_detail:
        x.append(cell[0])
        y.append(cell[1])
    plt.scatter(x, y, marker='o')
plt.show()
# coding=utf-8
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import pandas as pd

data = pd.read_csv("/Users/mac/Downloads/location.csv")
data.drop(["device_id", "partition_date"], axis=1, inplace=True)
data = data[["lat", "lng"]]
data = data.to_numpy().astype("float32", copy=False)  # convert to array

# Preprocessing: standardize each feature to zero mean and unit variance.
stscaler = StandardScaler().fit(data)
data = stscaler.transform(data)

# Scatter plot of lat vs. lng.
plt.scatter(data[:, 0], data[:, 1])
plt.xlabel("lat")
plt.ylabel("lng")
plt.title("beijing_users")
# plt.savefig("results/wholesale.png", format="PNG")

dbsc = DBSCAN(eps=0.5, min_samples=15).fit(data)
labels = dbsc.labels_  # cluster label for every point; -1 marks noise
# print(labels)
core_samples = np.zeros_like(labels, dtype=bool)  # boolean mask shaped like labels, initialized to False
core_samples[dbsc.core_sample_indices_] = True
# print(core_samples)

unique_labels = np.unique(labels)
# linspace returns len(unique_labels) evenly spaced values in [0, 1]; Spectral maps them to colors.
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# print(list(zip(unique_labels, colors)))
for (label, color) in zip(unique_labels, colors):
    class_member_mask = (labels == label)
    print(class_member_mask & core_samples)
    # Core points of this cluster are drawn larger than the remaining members.
    xy = data[class_member_mask & core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color, markersize=10)
    xy2 = data[class_member_mask & ~core_samples]
    plt.plot(xy2[:, 0], xy2[:, 1], 'o', markerfacecolor=color, markersize=5)

plt.title("DBSCAN on beijing_users")
plt.xlabel("lat (scaled)")
plt.ylabel("lng (scaled)")
# plt.savefig("results/(0.9,15)dbscan_wholesale.png", format="PNG")
import redis


if __name__ == "__main__":
    topic_key()
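`topic_key` itself is not shown in this excerpt; purely as an assumption, a minimal sketch of a Redis-backed helper of that shape (host, port, db, and the key prefix are placeholders, not the author's values):

```python
import redis


def topic_key():
    # Hypothetical helper: connect to a local Redis instance and list keys under a
    # made-up "topic:" prefix. The real topic_key in this commit is not shown here.
    client = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
    keys = [k.decode("utf-8") for k in client.keys("topic:*")]
    print(keys)
    return keys
```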