prepareData.py 2.17 KB
Newer Older
1 2
from utils import con_sql
import datetime
张彦钊's avatar
张彦钊 committed
3
import time
张彦钊's avatar
张彦钊 committed
4
import pymysql
张彦钊's avatar
张彦钊 committed
5

6

张彦钊's avatar
张彦钊 committed
7
def fetch_data(start_date, end_date):
8
    # 获取点击表里的device_id
9
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
10
    sql = "select distinct device_id from data_feed_click"
张彦钊's avatar
张彦钊 committed
11
    click_device_id = con_sql(db,sql)[0].values.tolist()
12 13
    print("成功获取点击表里的device_id")
    # 获取点击表里的数据
14
    sql = "select cid,device_id,time,stat_date from data_feed_click " \
张彦钊's avatar
张彦钊 committed
15
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
张彦钊's avatar
张彦钊 committed
16
    # 因为上面的db已经关了,需要再写一遍
17
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
张彦钊's avatar
张彦钊 committed
18
    click = con_sql(db,sql)
张彦钊's avatar
张彦钊 committed
19
    click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
20 21
    print("成功获取点击表里的数据")
    # 从time特征中抽取hour
张彦钊's avatar
张彦钊 committed
22 23 24
    click["hour"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    click["minute"] = click["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
    click = click.drop("time_date", axis=1)
25 26

    # 获取曝光表里的数据
27
    sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
张彦钊's avatar
张彦钊 committed
28
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
张彦钊's avatar
张彦钊 committed
29
    start = time.time()
张彦钊's avatar
张彦钊 committed
30
    # 因为上面的db已经关了,需要再写一遍
31
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
张彦钊's avatar
张彦钊 committed
32
    exposure = con_sql(db,sql)
张彦钊's avatar
张彦钊 committed
33 34
    end = time.time()
    print("获取曝光表耗时{}分".format((end-start)/60))
张彦钊's avatar
张彦钊 committed
35
    exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
36 37
    print("成功获取曝光表里的数据")
    # 从time特征中抽取hour
张彦钊's avatar
张彦钊 committed
38 39 40
    exposure["hour"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
    exposure["minute"] = exposure["time_date"].apply(lambda x: datetime.datetime.fromtimestamp(x).minute)
    exposure = exposure.drop("time_date", axis=1)
41

张彦钊's avatar
张彦钊 committed
42

43
    return exposure, click, click_device_id