# Spark script: reads the `esmm_data` and `home_tab_click` tables over JDBC
# and prints a short preview of each.
from pyspark.sql import SQLContext
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
import datetime
from pyspark.sql import HiveContext
def _read_jdbc_table(ctx, url, dbtable):
    """Load ``dbtable`` from the JDBC endpoint ``url`` as a Spark DataFrame.

    Args:
        ctx: The ``SQLContext`` used to build the reader.
        url: JDBC connection URL, including the database name.
        dbtable: Table name or parenthesised sub-query with an alias.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    # NOTE(review): the credentials are hard-coded in source; they should be
    # moved to configuration or a secret store.
    reader = ctx.read.format("jdbc").options(
        url=url,
        driver="com.mysql.jdbc.Driver",
        dbtable=dbtable,
        user="root",
        password="3SYz54LS9#^9sBvC",
    )
    return reader.load()


def get_data():
    """Read the ``esmm_data`` and ``home_tab_click`` tables and preview them.

    Rows of ``esmm_data`` containing any null value are dropped. The first six
    rows of each table are printed with ``show``.

    Returns:
        A tuple ``(esmm_data, tab_click, column_number)`` where the first two
        items are DataFrames and ``column_number`` maps every ``esmm_data``
        column name to the number of distinct values in that column.
    """
    # getOrCreate reuses an already-running context instead of raising, which
    # a direct SparkContext(...) construction would do.
    sc = SparkContext.getOrCreate(SparkConf().setAppName("esmm"))
    sc.setLogLevel("WARN")
    ctx = SQLContext(sc)

    esmm_data = _read_jdbc_table(
        ctx,
        "jdbc:mysql://10.66.157.22:4000/jerry_prod",
        "(select * from esmm_data)temp",
    ).na.drop()
    esmm_data.show(6)

    # Presumably the per-column cardinality is what is wanted here (e.g. for
    # sizing categorical features) - TODO confirm. Each count runs a Spark job.
    column_number = {
        name: esmm_data.select(name).distinct().count()
        for name in esmm_data.columns
    }

    tab_click = _read_jdbc_table(
        ctx,
        "jdbc:mysql://10.66.157.22:4000/eagle",
        "(select * from home_tab_click)temp",
    )
    tab_click.show(6)

    # TODO: joining esmm_data with tab_click on device_id is not implemented.
    return esmm_data, tab_click, column_number
# Script entry point: run the loader only when executed directly, not on import.
if __name__ == '__main__':
    get_data()