Commit 68a102b4 authored by 王志伟's avatar 王志伟
parents 20e01904 b49d205c
......@@ -7,7 +7,8 @@ import datetime
my_sender='gaoyazhe@igengmei.com'
my_pass = 'VCrKTui99a7ALhiK'
my_user='gaoyazhe@igengmei.com'
my_user1='gaoyazhe@igengmei.com'
my_user2='zhangyanzhao@igengmei.com'
def mail():
ret=True
try:
......@@ -15,11 +16,11 @@ def mail():
stat_data = f.read()
msg=MIMEText(stat_data,'plain','utf-8')
msg['From']=formataddr(["高雅喆",my_sender])
msg['To']=formataddr(["高雅喆",my_user])
msg['To']=my_user1 + ',' + my_user2
msg['Subject']= str(datetime.date.today())+"-esmm多目标模型训练指标统计"
server=smtplib.SMTP_SSL("smtp.exmail.qq.com", 465)
server.login(my_sender, my_pass)
server.sendmail(my_sender,[my_user,],msg.as_string())
server.sendmail(my_sender,[my_user1,my_user2],msg.as_string())
server.quit()
except Exception:
ret=False
......
#! /bin/bash
cd /srv/apps/ffm-baseline/eda/esmm
git checkout master
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm
DATA_PATH=/home/gmuser/esmm_data
......
......@@ -47,7 +47,6 @@ object Recommendation_strategy_all {
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
......@@ -67,7 +66,7 @@ object Recommendation_strategy_all {
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03')
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
......@@ -81,7 +80,6 @@ object Recommendation_strategy_all {
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -94,7 +92,6 @@ object Recommendation_strategy_all {
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -106,7 +103,6 @@ object Recommendation_strategy_all {
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -118,7 +114,6 @@ object Recommendation_strategy_all {
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -140,7 +135,6 @@ object Recommendation_strategy_all {
|from data_feed_click jd inner join device_id_cover_older
|on jd.device_id = device_id_cover_older.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -152,7 +146,6 @@ object Recommendation_strategy_all {
|from data_feed_exposure je inner join device_id_cover_older
|on je.device_id = device_id_cover_older.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -173,7 +166,6 @@ object Recommendation_strategy_all {
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -186,7 +178,6 @@ object Recommendation_strategy_all {
|from data_feed_exposure je inner join device_id_cover_older
|on je.device_id = device_id_cover_older.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -212,7 +203,6 @@ object Recommendation_strategy_all {
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -226,7 +216,6 @@ object Recommendation_strategy_all {
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id in (select distinct(device_id) from data_feed_click where device_id regexp '1$$' and stat_date = '${stat_date}')
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......@@ -240,7 +229,6 @@ object Recommendation_strategy_all {
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -253,7 +241,6 @@ object Recommendation_strategy_all {
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from bl_device_list)
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
......@@ -267,7 +254,6 @@ object Recommendation_strategy_all {
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id in (select distinct(device_id) from data_feed_click where stat_date = '${stat_date}')
|and je.device_id not in (select device_id from bl_device_list)
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
......
......@@ -7,6 +7,10 @@ import org.apache.spark.sql.{SaveMode, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
import org.dmg.pmml.True
import scala.util.Try
import scala.util.parsing.json.JSON
object Search_keywords_count {
......@@ -37,6 +41,17 @@ object Search_keywords_count {
)
}
//定义异常捕获
// Parse a JSON string defensively and return the parse result rendered as a
// String: "Some(...)" for valid JSON, "None" for malformed input, or the
// exception's string form if rendering unexpectedly throws.
//
// NOTE: JSON.parseFull signals malformed input by returning None rather than
// throwing, so parse failures never reach the catch. The original handler
// caught only ArithmeticException — an exception this body can never raise —
// so any real runtime failure would have escaped; it now catches Exception.
def catch_error(x:String)={
  val in = JSON.parseFull(x)
  try {
    in.toString
  }
  catch {
    case e: Exception => { e.printStackTrace(); e.toString }
  }
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
......@@ -45,37 +60,44 @@ object Search_keywords_count {
val stat_date = GmeiConfig.getMinusNDate(1) //获取昨日日期
//println(param.date)
val partition_date = stat_date.replace("-","")
//搜索关键词提取
val search_keywords = sc.sql(
s"""
|select params['query'] as search_keywords
|select params as search_keywords
|from online.tl_hdfs_maidian_view
|where (action = 'do_search' or action = 'search_result_click_search')
|and partition_date ='${partition_date}'
|and partition_date ='20190108'
""".stripMargin
)
//搜索次数统计
val search_count = sc.sql(
s"""
|select '${stat_date}' as stat_date,count(params['query']) as search_num
|from online.tl_hdfs_maidian_view
|where (action = 'do_search' or action = 'search_result_click_search')
|and partition_date ='${partition_date}'
""".stripMargin
)
search_count.show()
// search_keywords.show()
val get_result =search_keywords.rdd.map((_, 1)).reduceByKey(_ + _)
.sortBy(_._2,false)
val temp=get_result.collect()
for (i <- 0 until temp.length ) {
println(temp(i))
val tempp=search_keywords.collect()
for (i <- 0 until tempp.length ) {
println(tempp(i))
}
// GmeiConfig.writeToJDBCTable(search_keywords, table = "temp_search_keywords", SaveMode.Overwrite)
//搜索次数统计
// val search_count = sc.sql(
// s"""
// |select '${stat_date}' as stat_date,count(params['query']) as search_num
// |from online.tl_hdfs_maidian_view
// |where (action = 'do_search' or action = 'search_result_click_search')
// |and partition_date ='20190107'
// """.stripMargin
// )
// search_count.show()
// val get_result =search_keywords.rdd.map((_, 1)).reduceByKey(_ + _)
// .sortBy(_._2,false)
// val temp=get_result.collect()
// for (i <- 0 until temp.length ) {
// println(temp(i))
// }
}
}
}
\ No newline at end of file
package com.gmei
import java.io.Serializable
import com.gmei.WeafareStat.{defaultParams, parser}
import org.apache.spark.sql.{SaveMode, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
// Spark batch job: builds the "precise exposure" fact table for cards shown
// on the home feed and enriches each exposed card (diary / answer / article /
// question / live) with that content type's daily statistics, then appends
// the result to the TiDB table `data_feed_exposure_precise`.
object data_feed_exposure_precise {
// Silence noisy framework logging; keep only warnings and above.
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
// CLI parameters: `env` selects the database environment; `date` is the
// stat date to process, in yyyy-MM-dd form.
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
// scopt command-line parser for Params (--env, --date).
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
// Entry point: parse CLI args, map the TiDB tables into Spark SQL, run the
// staged SQL pipeline and append the final wide rows to TiDB.
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
// Expose the TiDB tables to Spark SQL under their own names.
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
// val stat_date = GmeiConfig.getMinusNDate(1)
// Stat date comes from the CLI; `partition_date` is the same day in the
// compact yyyyMMdd form used by the warehouse partition columns.
val stat_date = param.date
//println(param.date)
val partition_date = stat_date.replace("-","")
// Stage 1: raw precise exposures of the five card types on the home-feed
// tab; `cid` is a composite "type|business_id" key (null for other types).
val result01=sc.sql(
s"""
|select
| from_unixtime(unix_timestamp('${partition_date}' ,'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
| time_stamp as time,
| cl_id as device_id,
| channel as device_type,
| card_content_type as card_content_type ,
| business_id as business_id,
| if(card_content_type="diary", concat("diary|", business_id),
| if(card_content_type="live", concat("live|", business_id),
| if(card_content_type="question", concat("question|", business_id),
| if(card_content_type="answer", concat("answer|", business_id),
| if(card_content_type="article", concat("article|", business_id), null
| ))))) as cid,
| city_id as city_id
|from online.ml_community_precise_exposure_detail
|where card_content_type in ("article", "diary", "live", "answer", "question")
| and page_name='home'
| and tab_name="精选"
| and cl_id != "NULL"
| and partition_date='${partition_date}'
""".stripMargin
)
result01.createOrReplaceTempView("temp_result")
// Stage 2: filter out promotion/spam traffic — a fixed list of promotion
// channels plus devices flagged by the hospital spam PV day/month tables.
val result02 = sc.sql(
s"""
|select * from temp_result
|where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and device_id not in
| (SELECT cl_id
| FROM online.ml_hospital_spam_pv_day
| WHERE partition_date>='20180402' AND partition_date<'${partition_date}'
| AND pv_ratio>=0.95
| UNION ALL
| SELECT cl_id
| FROM online.ml_hospital_spam_pv_month
| WHERE partition_date>='20171101' AND partition_date<'${partition_date}'
| AND pv_ratio>=0.95
| )
""".stripMargin
)
result02.createOrReplaceTempView("temp_result02")
// Stage 3: left-join diary daily stats onto diary-type exposures.
// NOTE(review): "dairy" in the names below looks like a typo for "diary";
// kept as-is since it is only a local/view name.
val result_dairy = sc.sql(
s"""
|select
| re.stat_date as stat_date,
| re.time as time,
| re.device_id as device_id,
| re.device_type as device_type,
| re.card_content_type as cid_type,
| re.business_id as cid_id,
| re.cid as cid,
| re.city_id as city_id,
| da.content_level as diary_content_level,
| da.created_time as diary_created_time,
| da.updated_time as diary_updated_time,
| da.service_id as diary_service_id,
| da.doctor_id as diary_doctor_id,
| da.new_topics as diary_new_topics,
| da.new_votes as diary_new_votes,
| da.new_topic_votes as diary_new_topic_votes,
| da.new_replies as diary_new_replies,
| da.new_topic_replies as diary_new_topic_replies,
| da.new_favor as diary_new_favor,
| da.show_count_choice as diary_show_count_choice,
| da.show_count as diary_show_count,
| da.click_count_choice as diary_click_count_choice,
| da.page_view as diary_page_view,
| da.user_view as diary_user_view,
| da.device_view as diary_device_view
|from temp_result02 re
|left outer join
|(
| select
| diary_id,
| content_level,
| unix_timestamp(created_time) as created_time,
| unix_timestamp(last_topic_add_time) as updated_time,
| service_id as service_id,
| doctor_id as doctor_id,
| new_topics as new_topics,
| new_votes as new_votes,
| new_topic_votes as new_topic_votes,
| new_replies as new_replies,
| new_topic_replies as new_topic_replies,
| new_favor as new_favor,
| show_count_choice as show_count_choice,
| show_count as show_count,
| click_count_choice as click_count_choice,
| page_view as page_view,
| user_view as user_view,
| device_view
| from online.ml_community_diary_updates
| where partition_date='${partition_date}'
|)da
|on re.business_id = da.diary_id and re.card_content_type = "diary"
""".stripMargin
)
result_dairy.createOrReplaceTempView("result_dairy")
// Stage 4: left-join answer daily stats; shared columns take the answer
// value only where the diary join left them null (if(... is Null, ...)).
val result_answer = sc.sql(
s"""
|select
| rd.stat_date as stat_date,
| rd.time as time,
| rd.device_id as device_id,
| rd.device_type as device_type,
| rd.cid_type as cid_type,
| rd.cid_id as cid_id,
| rd.cid as cid,
| rd.city_id as city_id,
| an.is_recommend as is_recommend,
| if(rd.diary_new_votes is Null, an.new_votes, rd.diary_new_votes) as new_votes,
| if(rd.diary_new_replies is Null, an.reply_num, rd.diary_new_replies) as reply_num,
| if(rd.diary_content_level is Null, an.content_level, rd.diary_content_level) as content_level,
| if(rd.diary_created_time is Null, an.created_time, rd.diary_created_time) as created_time,
| rd.diary_updated_time as diary_updated_time,
| rd.diary_service_id as diary_service_id,
| rd.diary_doctor_id as diary_doctor_id,
| rd.diary_new_topics as diary_new_topics,
| rd.diary_new_topic_votes as diary_new_topic_votes,
| rd.diary_new_replies as diary_new_replies,
| rd.diary_new_topic_replies as diary_new_topic_replies,
| rd.diary_new_favor as diary_new_favor,
|
| an.reply_vote_num as answer_reply_vote_num,
|
| if(rd.diary_show_count_choice is Null, an.show_count_choice, rd.diary_show_count_choice ) as show_count_choice,
| if(rd.diary_show_count is Null, an.show_count, rd.diary_show_count ) as show_count,
| if(rd.diary_click_count_choice is Null, an.click_count_choice, rd.diary_click_count_choice) as click_count_choice,
| if(rd.diary_page_view is Null, an.page_view, rd.diary_page_view ) as page_view,
| if(rd.diary_user_view is Null, an.user_view, rd.diary_user_view ) as user_view,
| if(rd.diary_device_view is Null, an.device_view, rd.diary_device_view) as device_view
|
|from result_dairy rd
|left outer join
|(
| select
| answer_id,
| is_recommend,
| unix_timestamp(created_time) as created_time,
| content_level as content_level,
| vote_num as new_votes,
| reply_vote_num as reply_vote_num,
| reply_num as reply_num,
| show_count_choice as show_count_choice,
| show_count as show_count,
| click_count_choice as click_count_choice,
| page_view as page_view,
| user_view as user_view,
| device_view as device_view
| from online.ml_community_answer_updates
| where partition_date='${partition_date}'
|)an
|on rd.cid_id = an.answer_id and rd.cid_type = "answer"
""".stripMargin
)
result_answer.createOrReplaceTempView("result_answer")
// Stage 5: left-join article daily stats, same null-fallback pattern.
val result_article = sc.sql(
s"""
|select
| ra.stat_date as stat_date,
| ra.time as time,
| ra.device_id as device_id,
| ra.device_type as device_type,
| ra.cid_type as cid_type,
| ra.cid_id as cid_id,
| ra.cid as cid,
| ra.city_id as city_id,
| if(ra.is_recommend is Null, ar.is_push, ra.is_recommend) as is_recommend,
| ar.article_type as article_type,
| if(ra.new_votes is Null, ar.vote_num, ra.new_votes) as new_votes,
| if(ra.reply_num is Null, ar.reply_num, ra.reply_num) as reply_num,
| ra.content_level as content_level,
| if(ra.created_time is Null, ar.created_time, ra.created_time) as created_time,
| ra.diary_updated_time as diary_updated_time,
| ra.diary_service_id as diary_service_id,
| ra.diary_doctor_id as diary_doctor_id,
| ra.diary_new_topics as diary_new_topics,
| ra.diary_new_replies as diary_new_replies,
| ra.diary_new_topic_votes as diary_new_topic_votes,
| ra.diary_new_topic_replies as diary_new_topic_replies,
| ra.diary_new_favor as diary_new_favor,
|
| if(ra.answer_reply_vote_num is Null, ar.reply_vote_num, ra.answer_reply_vote_num) as reply_vote_num,
|
| if(ra.show_count_choice is Null, ar.show_count_choice, ra.show_count_choice ) as show_count_choice,
| if(ra.show_count is Null, ar.show_count, ra.show_count ) as show_count,
| if(ra.click_count_choice is Null, ar.click_count_choice, ra.click_count_choice) as click_count_choice,
| if(ra.page_view is Null, ar.page_view, ra.page_view ) as page_view,
| if(ra.user_view is Null, ar.user_view, ra.user_view ) as user_view,
| if(ra.device_view is Null, ar.device_view, ra.device_view) as device_view
|from result_answer ra
|left outer join
|(
| select
| article_id,
| unix_timestamp(created_time) as created_time,
| article_type as article_type,
| is_push,
| vote_num,
| reply_vote_num,
| reply_num,
| show_count_choice,
| show_count,
| click_count_choice,
| page_view,
| user_view,
| device_view
| from online.ml_community_article_updates
| where partition_date='${partition_date}'
|)ar
|on ra.cid_id = ar.article_id and ra.cid_type="article"
""".stripMargin
)
result_article.createOrReplaceTempView("result_article")
// Stage 6: left-join question daily stats, same null-fallback pattern.
val result_question = sc.sql(
s"""
|select
| ra.stat_date as stat_date,
| ra.time as time,
| ra.device_id as device_id,
| ra.device_type as device_type,
| ra.cid_type as cid_type,
| ra.cid_id as cid_id,
| ra.cid as cid,
| ra.city_id as city_id,
| if(ra.is_recommend is Null, qu.is_recommend, ra.is_recommend) as is_recommend,
| ra.article_type as article_type,
| if(ra.new_votes is Null, qu.vote_num, ra.new_votes) as new_votes,
| if(ra.reply_num is Null, qu.reply_num, ra.reply_num) as reply_num,
| ra.content_level as content_level,
| if(ra.created_time is Null, qu.created_time, ra.created_time) as created_time,
| ra.diary_updated_time as diary_updated_time,
| ra.diary_service_id as diary_service_id,
| ra.diary_doctor_id as diary_doctor_id,
| ra.diary_new_topics as diary_new_topics,
| ra.diary_new_replies as diary_new_replies,
| ra.diary_new_topic_votes as diary_new_topic_votes,
| ra.diary_new_topic_replies as diary_new_topic_replies,
| ra.diary_new_favor as diary_new_favor,
|
| ra.reply_vote_num as reply_vote_num,
| qu.answer_reply_num as question_answer_reply_num,
|
| if(ra.show_count_choice is Null, qu.show_count_choice, ra.show_count_choice ) as show_count_choice,
| if(ra.show_count is Null, qu.show_count, ra.show_count ) as show_count,
| if(ra.click_count_choice is Null, qu.click_count_choice, ra.click_count_choice) as click_count_choice,
| if(ra.page_view is Null, qu.page_view, ra.page_view ) as page_view,
| if(ra.user_view is Null, qu.user_view, ra.user_view ) as user_view,
| if(ra.device_view is Null, qu.device_view, ra.device_view) as device_view
|from result_article ra
|left outer join
|(
| select
| question_id,
| unix_timestamp(created_time) as created_time,
| is_recommend,
| answer_num as reply_num,
| vote_num,
| reply_num as answer_reply_num,
| show_count_choice,
| show_count,
| click_count_choice,
| page_view,
| user_view,
| device_view
| from online.ml_community_question_updates
| where partition_date='${partition_date}'
|)qu
|on ra.cid_id = qu.question_id and ra.cid_type="question"
""".stripMargin
)
result_question.createOrReplaceTempView("result_question")
// Stage 7: left-join live-channel stats, producing the final wide row
// (live pv is used as show_count when the exposure is a live card).
val result = sc.sql(
s"""
|select
| rq.stat_date as stat_date,
| rq.time as time,
| rq.device_id as device_id,
| rq.device_type as device_type,
| rq.cid_type as cid_type,
| rq.cid_id as cid_id,
| rq.cid as cid,
| rq.city_id as city_id,
| rq.is_recommend,
| rq.article_type,
| rq.new_votes,
| rq.reply_num,
| rq.content_level,
| if(rq.created_time is Null, li.created_time, rq.created_time) as created_time,
| if(rq.diary_updated_time is Null, li.updated_time, rq.diary_updated_time) as updated_time,
| rq.diary_service_id as diary_service_id,
| rq.diary_doctor_id as diary_doctor_id,
| rq.diary_new_topics as diary_new_topics,
| rq.diary_new_replies as diary_new_replies,
| rq.diary_new_topic_votes as diary_new_topic_votes,
| rq.diary_new_topic_replies as diary_new_topic_replies,
| rq.diary_new_favor as diary_new_favor,
|
| rq.reply_vote_num as reply_vote_num,
| rq.question_answer_reply_num,
|
| rq.show_count_choice,
| if(rq.show_count is Null, li.show_count, rq.show_count ) as show_count,
| rq.click_count_choice,
| rq.page_view,
| rq.user_view,
| rq.device_view,
|
| li.fake_max_num as live_fake_max_num,
| li.topic_id as live_topic_id,
| li.max_view_num as live_max_view_num,
| li.is_finish as live_is_finish
|from result_question rq
|left outer join
|(
| select
| channel_id,
| unix_timestamp(created_time) as created_time,
| unix_timestamp(updated_time) as updated_time,
| pv as show_count,
| fake_max_num,
| topic_id,
| max_view_num,
| replay_danmu,
| is_finish
| from online.ml_community_live_updates
| where partition_date='${partition_date}'
|)li
|on rq.cid_id=li.channel_id and rq.cid_type="live"
""".stripMargin
)
// Append the day's rows to TiDB; the job is expected to run once per date.
GmeiConfig.writeToJDBCTable(result, table = "data_feed_exposure_precise", SaveMode.Append)
}
}
}
package com.gmei
import java.io.Serializable
import com.gmei.WeafareStat.{defaultParams, parser}
import org.apache.spark.sql.{SaveMode, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
object find_bug {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
// CLI parameters for this job: `env` selects the database environment;
// `date` is the stat date to process, in yyyy-MM-dd form.
case class Params(env: String = "dev",
date: String = "2018-08-01"
) extends AbstractParams[Params] with Serializable
val defaultParams = Params()
// scopt command-line parser for Params (--env, --date); option registration
// order determines the generated help text.
val parser = new OptionParser[Params]("Feed_EDA") {
head("WeafareStat")
opt[String]("env")
.text(s"the databases environment you used")
.action((x, c) => c.copy(env = x))
opt[String] ("date")
.text(s"the date you used")
.action((x,c) => c.copy(date = x))
note(
"""
|For example, the following command runs this app on a tidb dataset:
|
| spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
""".stripMargin +
s"| --env ${defaultParams.env}"
)
}
def main(args: Array[String]): Unit = {
parser.parse(args, defaultParams).map { param =>
GmeiConfig.setup(param.env)
val spark_env = GmeiConfig.getSparkSession()
val sc = spark_env._2
val ti = new TiContext(sc)
ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
// val stat_date = GmeiConfig.getMinusNDate(1)
val stat_date = param.date
//println(param.date)
val partition_date = stat_date.replace("-","")
val decive_id_oldUser = sc.sql(
s"""
|select distinct(device_id) as device_id
|from online.ml_device_day_active_status
|where active_type = '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
decive_id_oldUser.createOrReplaceTempView("device_id_old")
val clk_count_oldUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as clk_count_oldUser_Contrast
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_Contrast
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val clk_count_oldUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as clk_count_oldUser_all
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
val imp_count_oldUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_oldUser_all
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
val result1 = clk_count_oldUser_Contrast.join(imp_count_oldUser_Contrast,"stat_date")
.join(clk_count_oldUser_all,"stat_date")
.join(imp_count_oldUser_all,"stat_date")
result1.show()
GmeiConfig.writeToJDBCTable(result1, "bug_Recommendation_strategy_temp", SaveMode.Append)
//device_id尾号1有点击用户日记本点击数
val clk_active_1 = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(jd.cid_id) as clk_active_1
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
//device_id尾号1有点击用户日记本曝光数
val imp_active_1 = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(je.cid_id) as imp_active_1
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id in (select distinct(device_id) from data_feed_click where device_id regexp '1$$' and stat_date = '${stat_date}')
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
//device_id尾号1点击日记本用户数
val clk_diary_device = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(distinct(jd.device_id)) as clk_diary_device
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
//所有有点击用户日记本点击数
val clk_active_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(jd.cid_id) as clk_active_all
|from data_feed_click jd inner join device_id_old
|on jd.device_id = device_id_old.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
//所有有点击用户日记本曝光数
val imp_active_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(je.cid_id) as imp_active_all
|from data_feed_exposure je inner join device_id_old
|on je.device_id = device_id_old.device_id
|where je.cid_type = 'diary'
|and je.device_id in (select distinct(device_id) from data_feed_click where stat_date = '${stat_date}')
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
//策略命中用户点击日记本用户数
val clk_diary_device_cover = sc.sql(
s"""
|select '${stat_date}' as stat_date,count(distinct(device_id)) as clk_diary_device_cover
|from merge_queue_table
|where device_id in (select distinct(device_id) from data_feed_click where stat_date = '${stat_date}')
""".stripMargin
)
//策略命中用户总数
val device_all_cover = sc.sql(
s"""
|select '${stat_date}' as stat_date,count(distinct(device_id)) as device_all_cover
|from merge_queue_table
""".stripMargin
)
val result2 = clk_active_1.join(imp_active_1,"stat_date")
.join(clk_active_all,"stat_date")
.join(imp_active_all,"stat_date")
.join(clk_diary_device,"stat_date")
.join(clk_diary_device_cover,"stat_date")
.join(device_all_cover,"stat_date")
result2.show()
GmeiConfig.writeToJDBCTable(result2, "bug_strategy_other", SaveMode.Append)
// New-user CTR statistics: build the "new user" device cohort, then compute
// click / exposure counts for the experiment group (device_id ending in '1')
// and for all new users, and append the four metrics as one row to
// bug_Recommendation_strategy_newUser.
// Cohort definition: active today (active_type != '4' means not an old user in
// ml_device_day_active_status — TODO confirm the enum), excluding a long list of
// paid-acquisition / spam channels plus empty and 'unknown' channel values.
// Fixed: identifier was misspelled "devicee_id_newUser"; renamed to match the
// sibling naming convention (cf. device_id_oldUser elsewhere in this file).
val device_id_newUser = sc.sql(
s"""
|select distinct(device_id) as device_id
|from online.ml_device_day_active_status
|where active_type != '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
| ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
device_id_newUser.show()
device_id_newUser.createOrReplaceTempView("device_id_new")
// Contrast (experiment) bucket: devices whose id ends in '1'.
// NOTE: `$$` inside the s-interpolated string escapes to a literal '$', so the
// regexp is "1$" — id ends with '1'.
val clk_count_newUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as clk_count_newUser_Contrast
|from data_feed_click jd inner join device_id_new
|on jd.device_id = device_id_new.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id regexp'1$$'
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
// Exposure count for the same contrast bucket (diary exposures only).
val imp_count_newUser_Contrast = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_Contrast
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id regexp'1$$'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
// Click count over ALL new users (no bucket filter).
val clk_count_newUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as clk_count_newUser_all
|from data_feed_click jd inner join device_id_new
|on jd.device_id = device_id_new.device_id
|where (jd.cid_type = 'diary' or jd.cid_type = 'diary_video')
|and jd.device_id not in (select device_id from blacklist)
|and jd.stat_date ='${stat_date}'
""".stripMargin
)
// Exposure count over ALL new users.
val imp_count_newUser_all = sc.sql(
s"""
|select '${stat_date}' as stat_date, count(cid_id) as imp_count_newUser_all
|from data_feed_exposure je inner join device_id_new
|on je.device_id = device_id_new.device_id
|where je.cid_type = 'diary'
|and je.device_id not in (select device_id from blacklist)
|and je.stat_date ='${stat_date}'
""".stripMargin
)
// Merge the four single-row metric frames on stat_date and append to the
// reporting table.
val result3 = clk_count_newUser_Contrast.join(imp_count_newUser_Contrast,"stat_date")
.join(clk_count_newUser_all,"stat_date")
.join(imp_count_newUser_all,"stat_date")
result3.show()
GmeiConfig.writeToJDBCTable(result3, "bug_Recommendation_strategy_newUser", SaveMode.Append)
}
}
}
\ No newline at end of file
......@@ -90,7 +90,7 @@ object testt {
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03')
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
""".stripMargin
)
......@@ -101,7 +101,8 @@ object testt {
val device_id_oldUser = sc.sql(
s"""
|select distinct(device_id) as device_id
|from online.ml_device_day_active_status
|from online.ml_device_day_active_status os left join blacklist
|on os.device_id=blacklist.device_id
|where active_type = '4'
|and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
| ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
......@@ -109,8 +110,9 @@ object testt {
| ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
| ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
| ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
| ,'promotion_shike','promotion_julang_jl03')
| ,'promotion_shike','promotion_julang_jl03','','unknown')
|and partition_date ='${partition_date}'
|and blacklist.device_id is null
""".stripMargin
)
device_id_oldUser.show()
......
......@@ -22,16 +22,25 @@ def normal():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select * from device_read_time"
df = con_sql(db, sql)
df = df.rename(columns={0:"device_id",1:"0",2:"1",3:"10",4:"1024",5:"1080",6:"11",
7:"12",8:"13",9:"2",10:"2054",11:"2214",12:"3",13:"4",14:"5",16:"6933",
17:"7",18:"9",19:"922",20:"929",21:"971",22:"992"})
device_id = df[["device_id"]]
df = df.drop("device_id",axis=1)
minMax = MinMaxScaler()
result = minMax.fit_transform(df)
result = device_id.join(result)
df = df.rename(columns={0:"device_id",1:"kongbai",2:"eye",3:"simi",4:"zitizhifang",5:"banyongjiu",6:"teeth",
7:"kouchun",8:"ear",9:"nose",10:"banyongjiuzhuang",11:"qita",12:"lunkuo",
13:"shoushen",14:"skin",16:"shenghuo",
17:"breast",18:"hair",19:"kangshuai",20:"shili",21:"chanhou",22:"zhushe"})
# device_id = df[["device_id"]]
# df = df.drop("device_id",axis=1)
# minMax = MinMaxScaler()
# result = pd.DataFrame(minMax.fit_transform(df),columns=["0","1","10","1024","1080","11",
# "12","13","2","2054","2214","3","4","5","6933",
# "7","9","922","929","971","992"])
# result = device_id.join(result)
l = list(df.columns)
l.remove("device_id")
df["sum"] = df.sum(axis=1)
for i in l:
df[i] = df[i]/df["sum"]
df = df.drop("sum",axis=1)
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
pd.io.sql.to_sql(result, "device_read_time_normal", yconnect, schema='jerry_test', if_exists='fail', index=False)
pd.io.sql.to_sql(df, "device_read_time_normal", yconnect, schema='jerry_test', if_exists='fail', index=False)
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment