Commit 5dfd9341 authored by 张彦钊's avatar 张彦钊 Committed by 高雅喆

第一次提交日记测试集脚本文件

parent 4f49d54e
import pymysql
import pandas as pd
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# 从数据库获取数据,并将数据转化成DataFrame
def get_data(sql):
cursor = db.cursor()
cursor.execute(sql)
data = cursor.fetchall()
data = pd.DataFrame(list(data)).dropna()
return data
# 获取全国点击量TOP2000日记
sql = "select city_id,cid where cid_type = 'diary' order by click_count_choice desc limit 2000"
allCitiesTop2000 = get_data(sql)
allCitiesTop2000 = allCitiesTop2000.rename(columns={0:"city_id",1:"cid"})
allCitiesTop2000.to_csv("\home\zhangyanzhao\diaryTestSet\allCitiesTop2000.csv")
print("成功获取全国日记点击量TOP2000")
# 获取全国城市列表
sql = "select distinct city_id from data_feed_click"
cityList = get_data(sql)
cityList.to_csv("\home\zhangyanzhao\diaryTestSet\cityList.csv")
cityList = cityList[0].values.tolist()
print("成功获取城市列表")
# 获取每个城市点击量TOP2000日记,如果数量小于2000,用全国点击量TOP2000日记补充
for i in cityList:
sql = "select city_id,cid from data_feed_click " \
"where cid_type = 'diary' and city_id = {0} " \
"order by click_count_choice desc limit 2000".format(i)
data = get_data(sql)
data = data.rename(columns={0:"city_id",1:"cid"})
if data.shape[0]<2000:
n = 2000-data.shape[0]
# 全国点击量TOP2000日记中去除该城市的日记
temp = allCitiesTop2000[allCitiesTop2000["city_id"]!=i].loc[:n-1]
data = data.append(temp)
else:
pass
file_name = "\home\zhangyanzhao\diaryTestSet\{0}DiaryTop2000.csv".format(i)
data.to_csv(file_name)
print("end")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment