Commit 80a52b52 authored by 张彦钊

change

parent 7389bd28
import pandas as pd
from utils import *
from config import *

if __name__ == "__main__":
    # The FFM-format test set: the label is the first character of each row's first column
    test = pd.read_csv(DIRECTORY_PATH + "test_ffm_data.csv", header=None)
    test_label = test[0].apply(lambda x: x[0]).values
    # Model prediction scores for the same rows, one score per line
    predict = pd.read_csv(DIRECTORY_PATH + "test_set_predict_output.txt", header=None)[0].values
    get_roc_curve(test_label, predict, "1")
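# Note: get_roc_curve comes from utils and is not shown in this commit. A minimal
# sketch of an equivalent, assuming it computes AUC with sklearn and saves a ROC
# plot (the signature, the int-cast of the labels, and the output path are assumptions):
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def get_roc_curve_sketch(y_true, y_score, name):
    # False/true positive rates and AUC for the predicted scores
    y_true = y_true.astype(int)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc = roc_auc_score(y_true, y_score)
    plt.plot(fpr, tpr, label="ROC {} (AUC={:.3f})".format(name, auc))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend()
    plt.savefig("roc_{}.png".format(name))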
import os
import time
from config import *


# Periodically delete generated feature/result files from a given directory
def remove_files(fileDir):
    for eachFile in os.listdir(fileDir):
        condition_a = os.path.isfile(fileDir + "/" + eachFile)
        condition_b = ("DiaryTop3000.csv" in eachFile) or ("output.txt" in eachFile) or ("feed" in eachFile)
        if condition_a and condition_b:
            ft = os.stat(fileDir + "/" + eachFile)
            ltime = int(ft.st_mtime)
            # Delete files last modified more than 5 minutes ago
            ntime = int(time.time()) - 5 * 60
            if ltime <= ntime:
                os.remove(fileDir + "/" + eachFile)


# Remove xlearn temporary log files from /tmp
def delete_log():
    for eachFile in os.listdir("/tmp"):
        if "xlearn" in eachFile:
            os.remove("/tmp" + "/" + eachFile)


if __name__ == "__main__":
    while True:
        delete_log()
        remove_files(DIRECTORY_PATH + "result")
        print("ran one cleanup pass")
        time.sleep(5 * 60)
@@ -8,6 +8,7 @@ import time
from pyspark import StorageLevel


def all_click(x):
    total = []
    sum = 0
@@ -136,7 +137,137 @@ def cpc_click(x):
    return total
# Count organic 美购 (welfare) clicks per traffic source for the day x days ago, split by device OS
def os_all_click(x, os):
    total = []
    sum = 0
    date = (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y%m%d")

    print("美购 search result clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'search_result_welfare_click_item' "
                    "and app['version'] >='7.14.0' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    print("美购 home page related recommendation clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' and params['from'] = 'welfare_home_list_item' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    home_page_sum = 0
    print("home page 'All' icon list - 美购 card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '6' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp

    print("home page category icon list - 美购 card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' "
                    "and params['from'] = 'category' and params['cpc_referer'] = '19' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp
    total.append(home_page_sum)
    sum = sum + home_page_sum

    meigou_homepage_sum = 0
    print("美购 home page 'All' clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '21' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_homepage_sum = meigou_homepage_sum + tmp

    print("美购 home page icon clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '18' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_homepage_sum = meigou_homepage_sum + tmp
    total.append(meigou_homepage_sum)
    sum = sum + meigou_homepage_sum

    total.append(sum)
    return total
# Count CPC (paid) 美购 clicks per traffic source for the day x days ago, split by device OS
def os_cpc_click(x, os):
    total = []
    sum = 0
    date = (datetime.date.today() - datetime.timedelta(days=x)).strftime("%Y%m%d")

    print("美购 search result clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'search_result_welfare_click_item' "
                    "and app['version'] >='7.14.0' and params['is_cpc'] = '1' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    print("美购 home page related recommendation clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' and params['from'] = 'welfare_home_list_item' "
                    "and params['is_cpc'] = '1' "
                    "and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    total.append(tmp)
    sum = sum + tmp

    home_page_sum = 0
    print("home page 'All' icon list - 美购 card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '6' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp

    print("home page category icon list - 美购 card clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates "
                    "where partition_date='{}' and action = 'goto_welfare_detail' "
                    "and app['version'] >='7.14.0' "
                    "and params['from'] = 'category' and params['cpc_referer'] = '19' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    home_page_sum = home_page_sum + tmp
    total.append(home_page_sum)
    sum = sum + home_page_sum

    meigou_home_sum = 0
    print("美购 home page 'All' clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '21' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_home_sum = meigou_home_sum + tmp

    print("美购 home page icon clicks")
    tmp = spark.sql("select count(*) from online.bl_hdfs_maidian_updates where partition_date='{}' "
                    "and action = 'goto_welfare_detail' and app['version'] >='7.14.0' "
                    "and params['from'] = 'welfare_list' and params['cpc_referer'] = '18' "
                    "and params['is_cpc'] = '1' and device['device_type'] = '{}'"
                    .format(date, os)).rdd.map(lambda x: x[0]).collect()[0]
    meigou_home_sum = meigou_home_sum + tmp
    total.append(meigou_home_sum)
    sum = sum + meigou_home_sum

    total.append(sum)
    return total
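# The two functions above repeat the same count(*) query with different filters.
# A minimal refactor sketch (not part of the commit): build the where-clause from a
# dict of per-source filters; query_count and the filter keys are illustrative assumptions.
def query_count(date, os, extra_filters):
    # Shared base filters, then one "and key = 'value'" clause per extra filter
    base = ("select count(*) from online.bl_hdfs_maidian_updates "
            "where partition_date='{}' and app['version'] >= '7.14.0' "
            "and device['device_type'] = '{}' ".format(date, os))
    extra = " ".join("and {} = '{}'".format(k, v) for k, v in extra_filters.items())
    return spark.sql(base + extra).rdd.map(lambda r: r[0]).collect()[0]

# Example: the "home page 'All' icon list" count in os_all_click would become
# query_count(date, os, {"action": "goto_welfare_detail",
#                        "params['from']": "welfare_list",
#                        "params['cpc_referer']": "6"})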
if __name__ == '__main__':
@@ -149,21 +280,26 @@ if __name__ == '__main__':
        .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()

    # Collect daily organic and CPC click counts for the last 26 days, once per OS
    for os in ["ios", "android"]:
        all_list = []
        for i in range(1, 27):
            date_str = (datetime.date.today() - datetime.timedelta(days=i)).strftime("%Y%m%d")
            tmp_list = [date_str]
            tmp_list.extend(os_all_click(i, os))
            tmp_list.extend(os_cpc_click(i, os))
            all_list.append(tmp_list)
        df = pd.DataFrame(all_list)
        df = df.rename(columns={0: "date", 1: "search", 2: "xiangguan", 3: "home", 4: "service_home",
                                5: "all_click",
                                6: "cpc_search", 7: "cpc_xiangguan", 8: "cpc_home", 9: "cpc_service_home",
                                10: "cpc_all"})
        df.to_csv('/home/gmuser/cpc_{}.csv'.format(os), index=False)
        # df = df.rename(columns={0: "date",1: "search", 2: "xiangguan",3:"home",4:"service_home",
        #                         5: "all_clcik",
        #                         6: "cpc_search", 7: "cpc_xiangguan",8:"cpc_home",9:"cpc_service_home",
        #                         10:"cpc_all"})
        # df.to_csv('/home/gmuser/cpc.csv', index=False)

    spark.stop()
...
import numpy as np, pandas as pd
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
import geopandas
import shapefile
from matplotlib import pyplot as plt

# Load user locations and keep only the coordinate columns
data = pd.read_csv("/Users/mac/Downloads/location.csv")
data.drop(["device_id", "partition_date"], axis=1, inplace=True)
data = data[["lat", "lng"]]
data = data.to_numpy().astype("float32", copy=False)  # convert to an array

plt.title("beijing location")
plt.scatter(data[:, 0], data[:, 1], s=1, c="black", marker='.')

# shape_path, shape_path_2huan and shape_path_5huan must point to the Beijing boundary shapefiles
border_shape = shapefile.Reader(shape_path)
border_shape_2 = shapefile.Reader(shape_path_2huan)
border_shape_5 = shapefile.Reader(shape_path_5huan)
border = border_shape.shapes()
border_2 = border_shape_2.shapes()
border_5 = border_shape_5.shapes()


# Center of a cluster: the centroid of its points
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    print(centroid)
    return tuple(centroid)
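# The rendering loop below iterates over `clusters`, which is not defined in the lines
# shown here. A minimal sketch, assuming clusters are grouped from DBSCAN labels over
# the lat/lng array (the eps and min_samples values are illustrative assumptions):
db = DBSCAN(eps=0.01, min_samples=15).fit(data)
cluster_labels = db.labels_
# Group coordinates by cluster label, skipping noise points (label == -1)
clusters = [data[cluster_labels == k] for k in set(cluster_labels) if k != -1]
# Representative (centermost) point of each cluster
centermost_points = [get_centermost_point(c) for c in clusters]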
# Render the clustering result
for border_detail in clusters:
    x, y = [], []
    for cell in border_detail:
        x.append(cell[0])
        y.append(cell[1])
    plt.scatter(x, y, marker='o')
plt.show()
# coding=utf-8
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load user locations and keep only the coordinate columns
data = pd.read_csv("/Users/mac/Downloads/location.csv")
data.drop(["device_id", "partition_date"], axis=1, inplace=True)
data = data[["lat", "lng"]]
data = data.to_numpy().astype("float32", copy=False)  # convert to an array

# Preprocessing: standardize each feature to zero mean and unit variance
stscaler = StandardScaler().fit(data)
data = stscaler.transform(data)

# Scatter plot of lat vs lng
plt.scatter(data[:, 0], data[:, 1])
plt.xlabel("lat")
plt.ylabel("lng")
plt.title("beijing_users")
# plt.savefig("results/wholesale.png", format="PNG")

dbsc = DBSCAN(eps=0.5, min_samples=15).fit(data)
labels = dbsc.labels_  # cluster label of each point; -1 marks noise
# print(labels)

core_samples = np.zeros_like(labels, dtype=bool)  # boolean mask shaped like labels, initially all False
core_samples[dbsc.core_sample_indices_] = True
# print(core_samples)

unique_labels = np.unique(labels)
# linspace gives len(unique_labels) evenly spaced values in [0, 1]; Spectral maps them to that many colors
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# print(zip(unique_labels, colors))

for (label, color) in zip(unique_labels, colors):
    class_member_mask = (labels == label)
    print(class_member_mask & core_samples)
    # Core samples of this cluster: large markers
    xy = data[class_member_mask & core_samples]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color, markersize=10)
    # Non-core members (border points and noise): small markers
    xy2 = data[class_member_mask & ~core_samples]
    plt.plot(xy2[:, 0], xy2[:, 1], 'o', markerfacecolor=color, markersize=5)

plt.title("DBSCAN on beijing_users")
plt.xlabel("lat (scaled)")
plt.ylabel("lng (scaled)")
# plt.savefig("results/(0.9,15)dbscan_wholesale.png", format="PNG")
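# cdist is imported above but never used. A minimal sketch of a k-distance plot for
# choosing eps (this selection step is an assumption, not part of the commit); note the
# pairwise matrix is O(N^2) memory, so it only suits modest point counts.
k = 15  # match min_samples
dists = cdist(data, data)
# k-th nearest-neighbour distance of every point, sorted ascending; the "elbow" suggests eps
kth_dist = np.sort(np.sort(dists, axis=1)[:, k])
plt.figure()
plt.plot(kth_dist)
plt.xlabel("points sorted by k-distance")
plt.ylabel("distance to {}th nearest neighbour".format(k))
plt.show()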
import redis

if __name__ == "__main__":
    # topic_key is not defined in the lines shown here
    topic_key()