1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: UTF-8 -*-
import pymysql
import datetime
import pandas as pd
def con_sql(sql):
# 从数据库的表里获取数据
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
db.close()
return result
def get_yesterday_date():
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
yesterday = yesterday.strftime("%Y%m%d")
return yesterday
def get_click_times_to_count_uid_df():
sql = "select device_id,count(cid_type) click_times from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) group by device_id order by click_times desc"
uid_click_times = con_sql(sql)
uid_lst = [i[0] for i in uid_click_times]
click_times_lst = [i[1] for i in uid_click_times]
uid_click_times_df = pd.DataFrame({"uid":uid_lst,"click_times":click_times_lst})
df = uid_click_times_df.groupby(by="click_times",as_index=False).count()
return df
def df2file(df,fpath):
with open(fpath,"w") as f:
f.write("#2. Top特征\n")
f.write("=================================================================\n")
f.write("2.1用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)\n")
f.write("click_times"+"\t"+"count_uid"+"\n")
for row in df.iterrows():
line = str(row[1][0]) + "\t" + str(row[1][1]) + "\n"
f.write(line)
f.write("\n\n")
def main():
print("2.开始获取用户点击次数表...")
output_path = "/data2/models/eda/recommended_indexs/2click_times_to_count_uid_%s.txt" % get_yesterday_date()
df = get_click_times_to_count_uid_df()
df2file(df,output_path)
print("获取完成")
if __name__ == '__main__':
main()