import pymysql import pandas as pd db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') # 从数据库获取数据,并将数据转化成DataFrame def get_data(sql): cursor = db.cursor() cursor.execute(sql) data = cursor.fetchall() data = pd.DataFrame(list(data)).dropna() return data # 获取全国点击量TOP2000日记 sql = "select city_id,cid where cid_type = 'diary' order by click_count_choice desc limit 2000" allCitiesTop2000 = get_data(sql) allCitiesTop2000 = allCitiesTop2000.rename(columns={0:"city_id",1:"cid"}) allCitiesTop2000.to_csv("\home\zhangyanzhao\diaryTestSet\allCitiesTop2000.csv") print("成功获取全国日记点击量TOP2000") # 获取全国城市列表 sql = "select distinct city_id from data_feed_click" cityList = get_data(sql) cityList.to_csv("\home\zhangyanzhao\diaryTestSet\cityList.csv") cityList = cityList[0].values.tolist() print("成功获取城市列表") # 获取每个城市点击量TOP2000日记,如果数量小于2000,用全国点击量TOP2000日记补充 for i in cityList: sql = "select city_id,cid from data_feed_click " \ "where cid_type = 'diary' and city_id = {0} " \ "order by click_count_choice desc limit 2000".format(i) data = get_data(sql) data = data.rename(columns={0:"city_id",1:"cid"}) if data.shape[0]<2000: n = 2000-data.shape[0] # 全国点击量TOP2000日记中去除该城市的日记 temp = allCitiesTop2000[allCitiesTop2000["city_id"]!=i].loc[:n-1] data = data.append(temp) else: pass file_name = "\home\zhangyanzhao\diaryTestSet\{0}DiaryTop2000.csv".format(i) data.to_csv(file_name) print("end")