ML / ffm-baseline · Commits
Commit a458bb53
Authored Apr 19, 2019 by 王志伟
Parents: 25b39fd1, 6bb8533b

迁移到腾讯 (Migrate to Tencent Cloud)
Showing 11 changed files with 57 additions and 66 deletions (+57 −66)
eda/esmm/Model_pipline/feature.py                                      +9  −14
eda/esmm/Model_pipline/submit.sh                                       +2  −2
eda/esmm/Model_pipline/to_database.py                                  +12 −10
eda/esmm/Model_pipline/to_tfrecord.py                                  +0  −2
eda/esmm/Model_pipline/train.py                                        +0  −2
eda/feededa/src/main/resources/application.properties                  +22 −6
eda/feededa/src/main/scala/com/gmei/EsmmData.scala                     +0  −0
eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala                   +4  −14
eda/feededa/src/main/scala/com/gmei/Recommendation_strategy_all.scala  +5  −4
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala                 +3  −12
eda/feededa/src/main/scala/com/gmei/temp_analysis.scala                +0  −0
eda/esmm/Model_pipline/feature.py
@@ -6,14 +6,9 @@ import datetime

def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("发生异常", Exception)  # "an exception occurred"
        df = pd.DataFrame()
    finally:
        db.close()
    return df
@@ -32,14 +27,14 @@ def multi_hot(df,column,n):

 def get_data():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select max(stat_date) from {}".format(train_data_set)
     validate_date = con_sql(db, sql)[0].values.tolist()[0]
     print("validate_date:" + validate_date)
     temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
-    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
+    start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
     print(start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
           "u.channel,c.top,e.device_id,cut.time,dl.app_list,e.diary_service_id,feat.level3_ids,feat.level2 " \
           "from {} e left join user_feature u on e.device_id = u.device_id " \
@@ -55,7 +50,7 @@ def get_data():
               6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
               11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
@@ -67,7 +62,7 @@ def get_data():
     df = df.drop("level2", axis=1)
     service_id = tuple(df["service_id"].unique())
-    db = pymysql.connect(host='rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com', port=3306, user='work',
+    db = pymysql.connect(host='172.16.30.143', port=3306, user='work', passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
     sql = "select s.id,d.hospital_id from api_service s left join api_doctor d on s.doctor_id = d.id " \
           "where s.id in {}".format(service_id)
@@ -152,7 +147,7 @@ def write_csv(df,name,n):

 def get_predict(date, value_map, app_list_map, level2_map, level3_map):
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
           "dl.app_list,e.hospital_id,feat.level3_ids,feat.level2 " \
@@ -160,14 +155,14 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):
           "left join cid_type_top c on e.device_id = c.device_id " \
           "left join cid_time_cut cut on e.cid_id = cut.cid " \
           "left join device_app_list dl on e.device_id = dl.device_id " \
-          "left join diary_feat feat on e.cid_id = feat.diary_id"
+          "left join diary_feat feat on e.cid_id = feat.diary_id limit 600"
     df = con_sql(db, sql)
     df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                             6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                             11: "cid_id", 12: "time", 13: "app_list", 14: "hospital_id", 15: "level3_ids",
                             16: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
@@ -232,7 +227,7 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):

 if __name__ == '__main__':
     train_data_set = "esmm_train_data"
-    path = "/data/esmm/"
+    path = "/home/gmuser/esmm/"
     date, value, app_list, level2, level3 = get_data()
     get_predict(date, value, app_list, level2, level3)
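Every change in feature.py is the same edit repeated: the TiDB host 10.66.157.22 swapped for 172.16.40.158 in six separate pymysql.connect calls, plus the Aliyun zhengxing RDS host swapped for 172.16.30.143. A migration like this shrinks to a one-line edit if the endpoint lives in one place; a minimal sketch using the credentials shown in the diff (the helper name connect_jerry_test is illustrative, not part of this repo):

import pymysql

# Hypothetical single source of truth for the jerry_test TiDB endpoint.
JERRY_TEST = dict(host='172.16.40.158', port=4000, user='root',
                  passwd='3SYz54LS9#^9sBvC', db='jerry_test')

def connect_jerry_test():
    # Each caller gets a fresh connection; con_sql() closes it when done.
    return pymysql.connect(**JERRY_TEST)

With such a helper, moving clouds again would touch one constant instead of six call sites.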
eda/esmm/Model_pipline/submit.sh
#! /bin/bash
git checkout master
-PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
+PYTHON_PATH=/opt/anaconda3/envs/esmm/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm/Model_pipline
-DATA_PATH=/data/esmm
+DATA_PATH=/home/gmuser/esmm
echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/*
eda/esmm/Model_pipline/to_database.py
@@ -3,14 +3,14 @@

 from sqlalchemy import create_engine
 import pandas as pd
 import pymysql
 import time
 import datetime

 def con_sql(sql):
     """
     :type sql : str
     :rtype : tuple
     """
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cursor = db.cursor()
     cursor.execute(sql)
     result = cursor.fetchall()
@@ -36,10 +36,10 @@ def native_set_join(lst):

 def main():
     # native queue
-    df2 = pd.read_csv('/data/esmm/native.csv')
+    df2 = pd.read_csv(path + '/native.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df3 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)) \
         .reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': native_set_join}).reset_index(drop=False)
     df3.columns = ["device_id", "city_id", "native_queue"]
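The queue construction above ranks each (uid, city) group by predicted CTCVR and concatenates the diary ids into one string per device. A self-contained toy run of the same pandas pattern (values are made up; a plain ",".join stands in for native_set_join):

import pandas as pd

df = pd.DataFrame({"uid": ["u1", "u1", "u1"],
                   "city": ["bj", "bj", "bj"],
                   "cid_id": ["9", "7", "8"],
                   "ctcvr": [0.1, 0.3, 0.2]})

queue = (df.groupby(by=["uid", "city"])
           .apply(lambda x: x.sort_values(by="ctcvr", ascending=False))
           .reset_index(drop=True)
           .groupby(by=["uid", "city"])
           .agg({"cid_id": lambda s: ",".join(s)})
           .reset_index(drop=False))

print(queue)  # cid_id for (u1, bj) becomes "7,8,9" — highest ctcvr first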
@@ -47,10 +47,10 @@ def main():

     # nearby queue
-    df2 = pd.read_csv('/data/esmm/nearby.csv')
+    df2 = pd.read_csv(path + '/nearby.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df4 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)) \
         .reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': nearby_set_join}).reset_index(drop=False)
     df4.columns = ["device_id", "city_id", "nearby_queue"]
@@ -60,11 +60,10 @@ def main():

     df_all = pd.merge(df3, df4, on=['device_id', 'city_id'], how='outer').fillna("")
     df_all['device_id'] = df_all['device_id'].astype(str)
     df_all['city_id'] = df_all['city_id'].astype(str)
-    ctime = int(time.time())
-    df_all["time"] = ctime
+    df_all["time"] = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
     print("union_device_count", df_all.shape)
-    host = '10.66.157.22'
+    host = '172.16.40.158'
     port = 4000
     user = 'root'
     password = '3SYz54LS9#^9sBvC'
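Note that the time column changes representation here: the old code stored Unix epoch seconds, the new code stores a minute-resolution string, so any downstream consumer comparing or parsing the column sees a different type. For illustration:

import time
import datetime

old_style = int(time.time())                                     # e.g. 1555670400
new_style = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))  # e.g. '201904191200'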
@@ -78,7 +77,7 @@ def main():
     # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
     delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
-    con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    con = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cur = con.cursor()
     cur.execute(delete_str)
     con.commit()
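Since df_merge_str is interpolated directly into the DELETE statement, a hedged alternative is to let the driver bind the values instead; a sketch with made-up keys, using only standard pymysql calls:

import pymysql

con = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                      passwd='3SYz54LS9#^9sBvC', db='jerry_test')
try:
    keys = ['dev1beijing', 'dev2shanghai']  # concat(device_id, city_id) values, illustrative
    placeholders = ','.join(['%s'] * len(keys))
    sql = ('delete from esmm_device_diary_queue '
           'where concat(device_id,city_id) in ({})').format(placeholders)
    with con.cursor() as cur:
        cur.execute(sql, keys)  # pymysql escapes each bound value
    con.commit()
finally:
    con.close()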
@@ -88,5 +87,7 @@ def main():
     print("done")

 if __name__ == '__main__':
+    path = "/home/gmuser/esmm"
     main()
\ No newline at end of file
eda/esmm/Model_pipline/to_tfrecord.py
@@ -4,13 +4,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import pandas as pd
 import sys
 import os
 import glob
 import tensorflow as tf
 import numpy as np
 import re
 from multiprocessing import Pool as ThreadPool

 flags = tf.app.flags
eda/esmm/Model_pipline/train.py
@@ -6,12 +6,10 @@
 #import argparse
 import shutil
 #import sys
 import os
 import json
 import glob
 from datetime import date, timedelta
 from time import time
 import random
 import tensorflow as tf
eda/feededa/src/main/resources/application.properties
-dev.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
-dev.tispark.pd.addresses=10.66.157.22:2379
-dev.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/mimas_test?user=work&password=workwork&rewriteBatchedStatements=true
+dev.tidb.jdbcuri=jdbc:mysql://192.168.15.12:4000/eagle?user=root&password=&rewriteBatchedStatements=true
+dev.tispark.pd.addresses=192.168.15.11:2379
+dev.mimas.jdbcuri=jdbc:mysql://rm-2zenowgrn4i5p0j7txo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 dev.gaia.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/zhengxing_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.gold.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/doris_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.redis.host=10.30.50.58
 dev.redis.port=6379
 dev.jerry.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/jerry_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.test.jdbcuri=jdbc:mysql://rm-2ze0v6uua2hl9he8edo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 pre.tidb.jdbcuri=jdbc:mysql://192.168.16.11:4000/eagle?user=root&password=&rewriteBatchedStatements=true
 pre.tispark.pd.addresses=192.168.16.11:2379
 pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas_prod?user=mimas&password=workwork&rewriteBatchedStatements=true
+<<<<<<< HEAD
 #prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
 #prod.gold.jdbcuri=jdbc:mysql://rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
 #prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
@@ -19,6 +19,22 @@ pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas
 #prod.redis.host=10.30.50.58
 #prod.redis.port=6379
+=======
+#阿里云线上配置 (Alibaba Cloud production configuration)
+#prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.gold.jdbcuri=jdbc:mysql://rm-m5ey2s823bq0lc616.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
+#prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
+#prod.gaia.jdbcuri=jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.tispark.pd.addresses=10.66.157.22:2379
+#
+#prod.tidb.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#腾讯云线上配置 (Tencent Cloud production configuration)
+>>>>>>> 6bb8533b68efef7c647251ef08479560d5e1216a
+prod.gold.jdbcuri=jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
+prod.mimas.jdbcuri=jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
+prod.gaia.jdbcuri=jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
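The <<<<<<< HEAD, =======, and >>>>>>> 6bb8533b… lines above are unresolved merge-conflict markers committed into application.properties; a Java properties loader will not reject them but will silently parse them as bogus keys. A small check that could gate commits like this (script name and usage are illustrative):

import sys

CONFLICT_PREFIXES = ("<<<<<<<", "=======", ">>>>>>>")

def has_conflict_markers(path):
    with open(path, encoding="utf-8") as f:
        return any(line.startswith(CONFLICT_PREFIXES) for line in f)

if __name__ == "__main__":
    # e.g. python check_conflicts.py application.properties
    sys.exit(1 if has_conflict_markers(sys.argv[1]) else 0)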
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
(diff collapsed)
eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala
@@ -37,27 +37,17 @@ object GmeiConfig extends Serializable {
     sparkConf.set("spark.debug.maxToStringFields", "130")
     sparkConf.set("spark.sql.broadcastTimeout", "6000")
-    if (!sparkConf.contains("""spark.master""")) {
-      sparkConf.setMaster("local[3]")
-    }
-    if (!sparkConf.contains("spark.tispark.pd.addresses")) {
-      sparkConf.set("spark.tispark.pd.addresses", this.config.getString("tispark.pd.addresses"))
-    }
-    println(sparkConf.get("spark.tispark.pd.addresses"))
     val spark = SparkSession.builder()
-//      .config(sparkConf)
-      .appName("feededa")
-      .enableHiveSupport()
+      .config(sparkConf)
+      .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
+      .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
+      .appName("feededa")
+      .enableHiveSupport()
       .getOrCreate()

     spark.sql("SET mapreduce.job.queuename=data")
     spark.sql("SET mapred.input.dir.recursive=true")
     spark.sql("SET hive.mapred.supports.subdirectories=true")
     spark.sql("use online")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
     spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
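The rewritten builder drops the conditional TiContext-era wiring and configures TiSpark directly through spark.sql.extensions plus a hard-coded PD address. For reference, a hedged PySpark sketch of an equivalently configured session (Python for consistency with the rest of this pipeline; the Scala above is the authoritative version, and TiExtensions only loads if the TiSpark jar is on the classpath):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("feededa")
         # Mirrors the two TiSpark settings added in the Scala diff.
         .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
         .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
         .enableHiveSupport()
         .getOrCreate())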
eda/feededa/src/main/scala/com/gmei/Recommendation_strategy_all.scala
@@ -52,7 +52,7 @@ object Recommendation_strategy_all {
     val stat_date = GmeiConfig.getMinusNDate(1)
-    // val stat_date = param.date
+    // val stat_date = param.date
     //println(param.date)
     val partition_date = stat_date.replace("-", "")
     val decive_id_oldUser = sc.sql(

@@ -119,7 +119,7 @@ object Recommendation_strategy_all {
       """.stripMargin)
-    //获取策略命中用户device_id
+    //获取策略命中用户device_id  (get the device_ids of users hit by the strategy)
     val device_id_cover = sc.sql(
       s"""
          |select distinct(device_id) as device_id

@@ -287,7 +287,7 @@ object Recommendation_strategy_all {
     GmeiConfig.writeToJDBCTable(result2, "strategy_other", SaveMode.Append)
-    //统计新用户点击率
+    //统计新用户点击率  (compute the click-through rate of new users)
     val devicee_id_newUser = sc.sql(
       s"""
          |select distinct(device_id) as device_id

@@ -442,7 +442,7 @@ object Gini_coefficient {
       """.stripMargin)
     agency_id.createOrReplaceTempView("agency_id")
-    //统计次数
+    //统计次数  (count occurrences)
     val diary_clk_num = sc.sql(
       s"""
          |select temp1.diary_id as diary_id,count(ov.cl_id) as diary_clk_num

@@ -468,3 +468,4 @@
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala
@@ -6,7 +6,7 @@ import java.time.LocalDate

 import com.gmei.lib.AbstractParams
 import org.apache.log4j.{Level, Logger}
-import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
 import scopt.OptionParser
 import scala.util.parsing.json.JSON

@@ -46,9 +46,6 @@ object esmm_feature {
     GmeiConfig.setup(param.env)
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "device_app_list")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "user_feature")
     user_feature(sc)
     get_applist(sc)

@@ -67,7 +64,7 @@ object esmm_feature {
       """.stripMargin).dropDuplicates("device_id")
     df.persist()
-    val old = spark.sql("select device_id from device_app_list").collect().map(x => x(0).toString)
+    val old = spark.sql("select device_id from jerry_test.device_app_list").collect().map(x => x(0).toString)
     import spark.implicits._
     val android = df.rdd.map(x => (x(0).toString, x(1).toString, x(2).toString))

@@ -81,8 +78,6 @@ object esmm_feature {
     val new_user = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "os", "app_list", "update_date")
     if (new_user.take(1).nonEmpty) {
-      val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbc, new_user, "device_app_list", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, new_user, "device_app_list", SaveMode.Append)

@@ -114,7 +109,7 @@ object esmm_feature {
   def user_feature(spark: SparkSession): Unit = {
     val yesterday = LocalDate.now().minusDays(1).toString.replace("-", "")
     println(yesterday)
-    val sql_exist = "select device_id from user_feature"
+    val sql_exist = "select device_id from jerry_test.user_feature"
     val old = spark.sql(sql_exist).collect().map(x => x(0).toString)
     val sql_yesterday =

@@ -130,12 +125,8 @@ object esmm_feature {
     val df_new = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "device_type", "manufacturer", "city_id", "channel", "date")
     if (df_new.take(1).nonEmpty) {
       df_new.persist()
-      val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbcuri, df_new, "user_feature", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, df_new, "user_feature", SaveMode.Append)
       df_new.unpersist()
     } else {
       println("no need to insert into user feature")
     }
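With TiExtensions registered in GmeiConfig, the explicit TiContext/tidbMapTable mapping becomes unnecessary: TiDB tables resolve through database-qualified names, which is why the queries above gain the jerry_test. prefix. A hedged PySpark illustration of the same access pattern (assumes a TiSpark-enabled session configured as in GmeiConfig.scala):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes TiSpark is already configured
old_ids = [row[0] for row in
           spark.sql("select device_id from jerry_test.device_app_list").collect()]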
eda/feededa/src/main/scala/com/gmei/temp_analysis.scala
(diff collapsed)