from pyspark.sql import SQLContext
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
import datetime
from pyspark.sql import HiveContext

# NOTE(review): credentials are hard-coded in source — move to config/env vars.
_JDBC_DRIVER = "com.mysql.jdbc.Driver"
_JDBC_USER = "root"
_JDBC_PASSWORD = "3SYz54LS9#^9sBvC"


def _read_jdbc(ctx, url, dbtable):
    """Load a MySQL table/subquery as a DataFrame via the JDBC data source."""
    return (
        ctx.read.format("jdbc")
        .options(
            url=url,
            driver=_JDBC_DRIVER,
            dbtable=dbtable,
            user=_JDBC_USER,
            password=_JDBC_PASSWORD,
        )
        .load()
    )


def get_data():
    """Load ``esmm_data`` and ``home_tab_click`` from MySQL and show samples.

    Side effects only (Spark session creation, stdout via ``show``);
    returns None.
    """
    # getOrCreate() is the classmethod entry point; instantiating
    # SparkContext first (as the original did) raises if a context
    # already exists in the process.
    sc = SparkContext.getOrCreate(SparkConf().setAppName("esmm"))
    sc.setLogLevel("WARN")
    ctx = SQLContext(sc)

    # end_date = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    # start_date = (datetime.date.today() - datetime.timedelta(days=day)).strftime("%Y-%m-%d")
    esmm_data = _read_jdbc(
        ctx,
        "jdbc:mysql://10.66.157.22:4000/jerry_prod",
        "(select * from esmm_data)temp",
    ).na.drop()
    esmm_data.show(6)

    # Per-column distinct counts.  The original stored the *bound method*
    # `.distinct` (missing call parentheses), which is never a usable value;
    # the intended quantity is presumably the distinct count — TODO confirm.
    column_number = {}
    for col_name in esmm_data.columns:
        column_number[col_name] = esmm_data.select(col_name).distinct().count()

    # NOTE(review): the original had `esmm_data = esmm_data.map()` here.
    # DataFrame has no .map in Spark 2.x, and even on an RDD a no-argument
    # map() raises TypeError, so the line was removed as dead/broken code.

    tab_click = _read_jdbc(
        ctx,
        "jdbc:mysql://10.66.157.22:4000/eagle",
        "(select * from home_tab_click)temp",
    )
    tab_click.show(6)

    # TODO(review): join left disabled as in the original.  Joining on the
    # expression form keeps both device_id columns (ambiguous downstream);
    # prefer the column-name form: esmm_data.join(tab_click, "device_id").
    # esmm_data = esmm_data.join(tab_click, esmm_data.device_id == tab_click.device_id)
    # esmm_data.show(6)


if __name__ == '__main__':
    get_data()