diaryCandidateSet.py 3.33 KB
Newer Older
1 2
import pymysql
import pandas as pd
张彦钊's avatar
张彦钊 committed
3 4
from utils import *
from config import *
5
import numpy as np
6
import time
张彦钊's avatar
张彦钊 committed
7

8 9 10 11

def filter_cid(df):
    """Restrict a candidate frame to cids that occur in the training data set.

    The allowed cids are read from ``data_set_cid.csv`` under
    ``DIRECTORY_PATH`` on every call.

    :param df: DataFrame with a ``cid`` column (may be empty).
    :return: ``df`` unchanged when it is empty, otherwise only the rows whose
        ``cid`` is in the training set. Original index labels are kept.
    """
    allowed_cids = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
    if df.empty:
        return df
    keep_mask = df["cid"].isin(allowed_cids)
    return df.loc[keep_mask]


17 18
def get_allCitiesDiaryTop3000():
    """Fetch the nationwide top-3000 diaries ranked by click count.

    The query result is relabelled to ``city_id`` / ``cid``, restricted to
    training-set cids via ``filter_cid``, written to
    ``diaryTestSet/allCitiesDiaryTop3000.csv`` and returned.

    :return: the filtered DataFrame (can hold fewer than 3000 rows after
        filtering).
    """
    query = (
        "select city_id,cid from data_feed_click2 "
        "where cid_type = 'diary' group by cid order by max(click_count_choice) desc limit 3000"
    )
    # con_sql presumably returns a DataFrame with positional column labels
    # 0 and 1 -- confirm against utils.con_sql.
    raw_rows = con_sql(query)
    top_diaries = filter_cid(raw_rows.rename(columns={0: "city_id", 1: "cid"}))
    output_path = DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop3000.csv"
    top_diaries.to_csv(output_path, index=False)
    return top_diaries
张彦钊's avatar
张彦钊 committed
27 28 29 30


def get_cityList():
    """Return every distinct ``city_id`` found in ``data_feed_click2``.

    The raw query result is also written to ``diaryTestSet/cityList.csv``
    before the ids are extracted.

    :return: list of city ids taken from column ``0`` of the query result.
    """
    cities = con_sql("select distinct city_id from data_feed_click2")
    cities.to_csv(DIRECTORY_PATH + "diaryTestSet/cityList.csv", index=False)
    return cities[0].values.tolist()

37

38 39
def get_eachCityDiaryTop3000():
    """Write a top-3000 diary candidate file for every city, sequentially.

    For each city the top diaries by click count are queried and restricted to
    training-set cids. When fewer than 3000 rows remain, the shortfall is
    filled from the nationwide top-3000 list, skipping rows that already
    belong to that city. Each result is written to
    ``diaryTestSet/<city>DiaryTop3000.csv``.

    :return: None. The CSV files are the only output.
    """
    cityList = get_cityList()
    allCitiesTop3000 = get_allCitiesDiaryTop3000()
    for city in cityList:
        sql = "select city_id,cid from data_feed_click2 " \
              "where cid_type = 'diary' and city_id = '{0}' group by cid " \
              "order by max(click_count_choice) desc limit 3000".format(city)
        data = con_sql(sql)
        data = data.rename(columns={0: "city_id", 1: "cid"})
        data = filter_cid(data)

        shortfall = 3000 - data.shape[0]
        if shortfall > 0:
            # Drop this city's own diaries from the nationwide list.
            other_cities = allCitiesTop3000[allCitiesTop3000["city_id"] != city]
            # Take rows by position: the index has gaps after filtering, so a
            # label slice (.loc[:n - 1]) could return fewer rows than needed.
            # pd.concat replaces DataFrame.append, removed in pandas 2.0.
            data = pd.concat([data, other_cities.head(shortfall)])

        file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(city)
        data.to_csv(file_name, index=False)
张彦钊's avatar
张彦钊 committed
59

张彦钊's avatar
张彦钊 committed
60

61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
def pool_method(city,sql,allCitiesTop3000):
    """Build and write the top-3000 diary candidate file for one city.

    :param city: city id, used to exclude the city's own rows from the
        supplement and to name the output file.
    :param sql: query returning ``city_id`` and ``cid`` for this city.
    :param allCitiesTop3000: nationwide top-diary DataFrame with ``city_id``
        and ``cid`` columns, used to fill up to 3000 rows.
    :return: None. The result is written to
        ``diaryTestSet/<city>DiaryTop3000.csv``.
    """
    data = con_sql(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})
    data = filter_cid(data)

    shortfall = 3000 - data.shape[0]
    if shortfall > 0:
        # Drop this city's own diaries from the nationwide list.
        other_cities = allCitiesTop3000[allCitiesTop3000["city_id"] != city]
        # Take rows by position: the index has gaps after filtering, so a
        # label slice (.loc[:n - 1]) could return fewer rows than needed.
        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        data = pd.concat([data, other_cities.head(shortfall)])

    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(city)
    data.to_csv(file_name, index=False)


def multi_get_eachCityDiaryTop3000(processes=8):
    """Build the per-city top-3000 diary files concurrently through a pool.

    One ``pool_method`` task is submitted per city. After all tasks finish,
    each result is checked so that an exception raised inside a worker is
    re-raised here instead of being silently dropped.

    :param processes: number of pool workers.
    :return: None. Each task writes its own CSV file.
    :raises Exception: the first exception raised by any worker task.
    """
    city_list = get_cityList()
    allCitiesTop3000 = get_allCitiesDiaryTop3000()
    # NOTE(review): Pool is not imported in the visible code; presumably it
    # comes from `from utils import *` -- confirm which Pool class this is.
    pool = Pool(processes)
    pending = []
    for city in city_list:
        sql = "select city_id,cid from data_feed_click2 " \
              "where cid_type = 'diary' and city_id = '{0}' group by cid " \
              "order by max(click_count_choice) desc limit 3000".format(city)

        pending.append(pool.apply_async(pool_method, (city, sql, allCitiesTop3000)))

    pool.close()
    pool.join()

    # apply_async keeps worker exceptions inside the result object; get()
    # re-raises them, so a failed city no longer goes unnoticed.
    for result in pending:
        result.get()

90

张彦钊's avatar
张彦钊 committed
91
if __name__ == "__main__":
    # Time the full concurrent run and report the elapsed minutes.
    started_at = time.time()
    multi_get_eachCityDiaryTop3000()
    elapsed_minutes = (time.time() - started_at) / 60
    print("获取各城市热门日记耗时{}分".format(elapsed_minutes))