Commit 732a6093 authored by 高雅喆's avatar 高雅喆

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

rm -r
parents 701fdb4c 4581dce4
...@@ -300,8 +300,75 @@ object ARPU_COM { ...@@ -300,8 +300,75 @@ object ARPU_COM {
) )
active_num.show(80) active_num.show(80)
}
}
}
object hospital_gengmei {
// Limit Spark's own loggers to WARN and above.
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
// NOTE(review): Jetty classes normally live under "org.eclipse.jetty"; this logger
// name ("org.apache.eclipse...") may not match any real logger -- confirm.
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
/**
 * Command-line parameters for this job.
 *
 * @param env  environment name; main passes it to GmeiConfig.setup
 * @param date date string settable through the --date option; main in this object
 *             does not read it
 */
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
// Starting parameter values handed to the parser before any option is applied.
val defaultParams = Params()
/**
 * Command-line parser for this job. Recognises `--env` and `--date` and copies
 * each value into [[Params]].
 *
 * NOTE(review): the program name ("Feed_EDA"), the header ("WeafareStat") and the
 * class in the usage example look copied from another job -- confirm and rename.
 */
val parser = new OptionParser[Params]("Feed_EDA") {
  head("WeafareStat")
  opt[String]("env")
    .text("the databases environment you used")
    .action((value, params) => params.copy(env = value))
  opt[String]("date")
    .text("the date you used")
    .action((value, params) => params.copy(date = value))
  // stripMargin must run on the fully concatenated text: applying it before the
  // "--env" line is appended leaves that line's leading '|' in the printed usage.
  // The first literal stays non-interpolated so its trailing backslash is not
  // treated as an escape sequence.
  note(
    ("""
      |For example, the following command runs this app on a tidb dataset:
      |
      | spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
      """ +
      s"| --env ${defaultParams.env}").stripMargin
  )
}
/**
 * Job entry point.
 *
 * Parses the command line, configures the environment, reads the hospital view
 * for yesterday's partition and appends the rows to the "hospital_gengmei" JDBC
 * table. When argument parsing fails nothing is executed.
 *
 * @param args raw command-line arguments, parsed by `parser`
 */
def main(args: Array[String]): Unit = {
  // foreach rather than map: the body runs only for its side effects.
  parser.parse(args, defaultParams).foreach { param =>
    GmeiConfig.setup(param.env)
    val spark_env = GmeiConfig.getSparkSession()
    val sc = spark_env._2
    val ti = new TiContext(sc)
    // These mappings are kept from the original; the query below does not
    // reference any of these tables.
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
    import sc.implicits._

    // Presumably getMinusNDate(1) yields yesterday as "yyyy-MM-dd" -- confirm.
    // Removing the dashes gives the value compared against partition_date.
    val stat_date = GmeiConfig.getMinusNDate(1)
    val partition_date = stat_date.replace("-", "")

    // Use the computed partition instead of the previously hard-coded '20181219',
    // so the job does not re-read the same stale partition on every run.
    val hospitalRows = sc.sql(
      s"""
         |SELECT id,name,location,city_id
         |FROM online.tl_hdfs_hospital_view
         |WHERE partition_date = '${partition_date}'
       """.stripMargin
    )
    hospitalRows.show()
    GmeiConfig.writeToJDBCTable(hospitalRows, "hospital_gengmei", SaveMode.Append)
  }
}
......
...@@ -142,7 +142,7 @@ def get_data(): ...@@ -142,7 +142,7 @@ def get_data():
validate_date = con_sql(db, sql)[0].values.tolist()[0] validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:" + validate_date) print("validate_date:" + validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d") temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=30)).strftime("%Y-%m-%d") start = (temp - datetime.timedelta(days=6)).strftime("%Y-%m-%d")
print(start) print(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
...@@ -165,7 +165,7 @@ def get_data(): ...@@ -165,7 +165,7 @@ def get_data():
df = df.drop(["z","stat_date"], axis=1).fillna(0.0) df = df.drop(["z","stat_date"], axis=1).fillna(0.0)
print(df.head(2)) print(df.head(2))
features = 0 features = 0
for i in ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel"]: for i in ["ucity_id","clevel1_id","ccity_name","device_type","manufacturer","channel","top"]:
features = features + len(df[i].unique()) features = features + len(df[i].unique())
print("fields:{}".format(df.shape[1]-1)) print("fields:{}".format(df.shape[1]-1))
print("features:{}".format(features)) print("features:{}".format(features))
...@@ -261,7 +261,7 @@ if __name__ == "__main__": ...@@ -261,7 +261,7 @@ if __name__ == "__main__":
a = time.time() a = time.time()
df, validate_date, ucity_id,ccity_name = get_data() df, validate_date, ucity_id,ccity_name = get_data()
model = transform(df, validate_date) model = transform(df, validate_date)
get_predict_set(ucity_id,model,ccity_name) # get_predict_set(ucity_id,model,ccity_name)
b = time.time() b = time.time()
print("cost(分钟)") print("cost(分钟)")
print((b-a)/60) print((b-a)/60)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment