Commit e72f288f authored by 张彦钊's avatar 张彦钊

add multiprocess get_eachCityDiaryTop3000

parent d0a857c9
......@@ -3,6 +3,7 @@ import pandas as pd
from utils import *
from config import *
import numpy as np
import time
# 候选集cid只能从训练数据集cid中选择
......@@ -36,53 +37,6 @@ def get_cityList():
return cityList
def pool_method(i, sql, allCitiesTop3000):
    """Build and save the top-3000 diary test set for one city.

    Runs *sql* to fetch the city's diaries; when fewer than 3000 remain
    after filtering, tops the set up with rows from the nationwide
    top-3000 (excluding this city's own diaries), then writes the result
    to <DIRECTORY_PATH>diaryTestSet/<i>DiaryTop3000.csv.

    :param i: city id, used for filtering and in the output filename
    :param sql: query returning (city_id, cid) rows for this city
    :param allCitiesTop3000: DataFrame with the nationwide top-3000 diaries
    """
    data = con_sql(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})
    data = filter_cid(data)
    if data.shape[0] < 3000:
        n = 3000 - data.shape[0]
        # Take the first n rows positionally with head(n). The original
        # .loc[:n-1] sliced by *label*: after the boolean filter the frame
        # keeps its original index labels, so a label slice could yield
        # fewer than n rows.
        temp = allCitiesTop3000[allCitiesTop3000["city_id"] != i].head(n)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        data = pd.concat([data, temp])
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(i)
    data.to_csv(file_name, index=False)
    print("成功保存{}地区DiaryTop3000".format(i))
# 把城市列表切分成n份,然后拼接成一个列表
# def split_cityList(cityList,n):
# l = len(cityList)
# step = np.rint(l/n)
# new_list = []
# x = 0
# while True:
# if x + step < :
# data_list.append(data.iloc[x:x + step])
# x = x + step + 1
# else:
# data_list.append(data.iloc[x:data.__len__()])
# break
# 多线程方法获取全国城市热门日记
# def multi_get_eachCityDiaryTop3000(processes):
# cityList = get_cityList()
# allCitiesTop3000 = get_allCitiesDiaryTop3000()
#
# pool = Pool(processes)
# for i in range(len(data_list)):
# data_list[i] = pool.apply_async(self.pool_function, (data_list[i], t,))
#
# result_map = {}
# for i in data_list:
# result_map.update(i.get())
# pool.close()
# pool.join()
def get_eachCityDiaryTop3000():
# 获取每个城市点击量TOP3000日记,如果数量小于3000,用全国点击量TOP3000日记补充
cityList = get_cityList()
......@@ -106,6 +60,40 @@ def get_eachCityDiaryTop3000():
data.to_csv(file_name,index=False)
print("成功保存{}地区DiaryTop3000".format(i))
def pool_method(city, sql, allCitiesTop3000):
    """Build and save the top-3000 diary test set for one city.

    Runs *sql* to fetch the city's diaries; when fewer than 3000 remain
    after filtering, tops the set up with rows from the nationwide
    top-3000 (excluding this city's own diaries), then writes the result
    to <DIRECTORY_PATH>diaryTestSet/<city>DiaryTop3000.csv.

    :param city: city id, used for filtering and in the output filename
    :param sql: query returning (city_id, cid) rows for this city
    :param allCitiesTop3000: DataFrame with the nationwide top-3000 diaries
    """
    data = con_sql(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})
    data = filter_cid(data)
    if data.shape[0] < 3000:
        n = 3000 - data.shape[0]
        # Take the first n rows positionally with head(n). The original
        # .loc[:n-1] sliced by *label*: after the boolean filter the frame
        # keeps its original index labels, so a label slice could yield
        # fewer than n rows.
        temp = allCitiesTop3000[allCitiesTop3000["city_id"] != city].head(n)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        data = pd.concat([data, temp])
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(city)
    data.to_csv(file_name, index=False)
    print("成功保存{}地区DiaryTop3000".format(city))
# Fetch every city's hot diaries nationwide using a process pool.
def multi_get_eachCityDiaryTop3000(processes):
    """Fan out one pool task per city to build its DiaryTop3000 CSV.

    Each worker runs `pool_method` with a per-city SQL query plus the
    nationwide top-3000 set used to pad small cities.

    :param processes: number of worker processes for the Pool
    """
    sql_template = (
        "select city_id,cid from data_feed_click "
        "where cid_type = 'diary' and city_id = '{0}' group by cid "
        "order by max(click_count_choice) desc limit 3000"
    )
    national_top = get_allCitiesDiaryTop3000()
    worker_pool = Pool(processes)
    for city_id in get_cityList():
        task_args = (city_id, sql_template.format(city_id), national_top,)
        worker_pool.apply_async(pool_method, task_args)
    worker_pool.close()
    worker_pool.join()
if __name__ == "__main__":
    # Single-process run first.
    # NOTE(review): this writes the same CSV files that the multiprocess
    # call below rewrites, so the work is done twice and the timing only
    # covers the multiprocess variant — confirm whether this call is a
    # leftover from before the multiprocess version was added.
    get_eachCityDiaryTop3000()
    # Time only the multiprocess variant (6 worker processes).
    start = time.time()
    multi_get_eachCityDiaryTop3000(6)
    end = time.time()
    print("获取各城市热门日记耗时{}分".format((end-start)/60))
......@@ -2,20 +2,40 @@ from utils import *
import datetime
import pickle
if __name__ == '__main__':
    # Build FFM-format training data from the raw exposure log.
    # NOTE(review): this span is deleted-line diff residue; in the original
    # file it continued with the pickle round-trip further below — confirm
    # against the repository before relying on it.
    data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
    # Every exposure row gets label 1 (presumably "clicked/positive" —
    # TODO confirm against the FFM training pipeline).
    data["y"] = 1
    # Hold out the last row to sanity-check transform() after reload.
    test_data = data.tail(1)
    ffm = FFMFormatPandas()
    data = ffm.fit_transform(data, y='y')
    data.to_csv("../data/ffm_data.csv", index=False)
def split_cityList(cityList, n):
    """Split *cityList* into at most n consecutive chunks.

    Returns a list of up to n sub-lists that cover cityList in order.
    When the list has no more than n elements the flat list is returned
    unchanged (kept for backward compatibility with existing callers).

    :param cityList: list of city ids to partition
    :param n: desired number of chunks
    :return: list of sub-lists, or the flat list when len(cityList) <= n
    """
    l = len(cityList)
    if l <= n:
        return cityList
    # Ceiling division for the chunk size. The original np.rint-based step
    # could produce n+1 chunks (e.g. 22 items with n=5: step 4 -> 6 chunks);
    # a ceil step guarantees at most n chunks.
    step = -(-l // n)
    data_list = []
    for start in range(0, l, step):
        data_list.append(cityList[start:start + step])
    return data_list
if __name__ == '__main__':
    # Quick manual check: partition 22 ids into 5 chunks and show them.
    cities = list(range(22))
    chunks = split_cityList(cities, 5)
    print(chunks)
# NOTE(review): deleted-line diff residue — these statements continued the
# `if __name__` script above (they use its `ffm` and `test_data`); confirm
# indentation/placement against the repository.
# Persist the fitted FFM encoder, reload it, and verify the round-tripped
# object can still transform the held-out row.
with open("../data/ffm.object", "wb") as f:
    pickle.dump(ffm, f)
with open("../data/ffm.object", "rb") as f:
    ffm = pickle.load(f)
result = ffm.transform(test_data)
print(result)
# Inspect the last rows of the freshly written FFM data file.
data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
print(data_1)
# data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
# data["y"] = 1
# test_data = data.tail(1)
#
# ffm = FFMFormatPandas()
# data = ffm.fit_transform(data, y='y')
# data.to_csv("../data/ffm_data.csv", index=False)
#
# with open("../data/ffm.object", "wb") as f:
# pickle.dump(ffm, f)
# with open("../data/ffm.object", "rb") as f:
# ffm = pickle.load(f)
# result = ffm.transform(test_data)
# print(result)
# data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
# print(data_1)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment