Commit 8757a3d8 authored by litaolemo's avatar litaolemo

update

parent 9ae90804
# -*- coding:UTF-8 -*-
# @Time : 2020/9/8 13:39
# @File : spark_test.py
# @Time : 2020/9/16 17:41
# @File : new_user_has_protratit_rate.py
# @email : litao@igengmei.com
# @author : litao
import hashlib
import json
......@@ -19,21 +20,11 @@ import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
from meta_base_code.utils.func_from_redis_get_portrait import *
# from pyspark.sql.functions import lit
# import pytispark.pytispark as pti
from elasticsearch import Elasticsearch
# Module-level dict; nothing in the visible part of this file reads or writes it.
exists_es_dic = {}
# Elasticsearch client pointed at two nodes that share the same HTTP port.
es = Elasticsearch([
    {'host': node_host, 'port': 9200}
    for node_host in ('172.16.31.17', '172.16.31.11')
])
def con_sql(sql):
# 从数据库的表里获取数据
......@@ -71,7 +62,7 @@ sparkConf.set("prod.jerry.jdbcuri",
# TiSpark / TiDB settings placed on the SparkConf before the session is built.
# NOTE(review): the next two calls write the same key, so the second value is
# presumably the one that takes effect -- confirm which address is intended.
# NOTE(review): port 4000 is also the port used by the pymysql connection further
# down in this file, while the session builder below passes a PD address on port
# 2379 -- confirm that 4000 is really meant for a PD address.
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.158:2379")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.170:4000")
sparkConf.set("prod.tidb.database", "jerry_prod")
# NOTE(review): setAppName is called twice in a row; presumably only the last
# name ("new_user_has_protratit_rate") is kept -- confirm and drop the other.
sparkConf.setAppName("test")
sparkConf.setAppName("new_user_has_protratit_rate")
# Build (or reuse) a Hive-enabled SparkSession from sparkConf with the TiSpark
# SQL extension and an explicit PD address configured on the builder.
spark = (SparkSession.builder.config(conf=sparkConf).config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
.config("spark.tispark.pd.addresses", "172.16.40.170:2379").enableHiveSupport().getOrCreate())
......@@ -82,144 +73,169 @@ spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF
# Register two temporary SQL functions (is_json, arrayMerge) backed by the named
# Java UDF classes so that later spark.sql() queries in this session can call them.
spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
# Disabled code: it would have run huidu_device_id_sql and exposed the result as
# the "dev_view" temporary view.
# print(huidu_device_id_sql)
# huidu_device_id_df = spark.sql(huidu_device_id_sql)
# huidu_device_id_df.createOrReplaceTempView("dev_view")
sql_search_ctr = r"""
SELECT partition_date,
cl_id,
count(distinct app_session_id) as session_pv0
# Back-fill driver: the loop runs once per day offset t = 1 .. task_days - 1
# (seven iterations with task_days = 8), each time targeting the day that lies
# t days before the current wall-clock time.
# NOTE(review): task_list is not referenced in the visible part of this file.
task_list = []
task_days = 8
for t in range(1, task_days):
# Negative day offset; t = 1 selects yesterday relative to the wall clock.
day_num = 0 - t
# "now" is the target day of this iteration, not the actual current time.
now = (datetime.datetime.now() + datetime.timedelta(days=day_num))
# Date strings derived from the target day, in compact (%Y%m%d) and dashed
# (%Y-%m-%d) form. "today" / "yesterday" are relative to the target day.
# NOTE(review): only {today_str} appears as a placeholder in the query text
# visible below; today_str_format and yesterday_str_format are passed to
# .format() without a visible placeholder, and the remaining strings are not
# referenced in the visible code -- confirm before removing any of them.
last_30_day_str = (now + datetime.timedelta(days=-30)).strftime("%Y%m%d")
today_str = now.strftime("%Y%m%d")
today_str_format = now.strftime("%Y-%m-%d")
yesterday_str = (now + datetime.timedelta(days=-1)).strftime("%Y%m%d")
yesterday_str_format = (now + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
one_week_age_str = (now + datetime.timedelta(days=-7)).strftime("%Y%m%d")
new_urser_device_id_sql = r"""
select t2.device_id as device_id from
(select device_id from online.ml_device_day_active_status where partition_date = '{today_str}' and active_type in (1,2)) t2
LEFT JOIN
(
select distinct device_id
from ml.ml_d_ct_dv_devicespam_d --去除机构刷单设备,即作弊设备(浏览和曝光事件去除)
WHERE partition_day='{today_str}'
union all
select distinct device_id
from dim.dim_device_user_staff --去除内网用户
)spam_pv
on spam_pv.device_id=t2.device_id
LEFT JOIN
(
SELECT partition_date,device_id
FROM
(--找出user_id当天活跃的第一个设备id
SELECT user_id,partition_date,
if(size(device_list) > 0, device_list [ 0 ], '') AS device_id
FROM online.ml_user_updates
WHERE partition_date='{today_str}'
)t1
JOIN
( --医生账号
SELECT distinct user_id
FROM online.tl_hdfs_doctor_view
WHERE partition_date = '{today_str}'
--马甲账号/模特用户
UNION ALL
SELECT user_id
FROM ml.ml_c_ct_ui_user_dimen_d
WHERE partition_day = '{today_str}'
AND (is_puppet = 'true' or is_classifyuser = 'true')
UNION ALL
--公司内网覆盖用户
select distinct user_id
from dim.dim_device_user_staff
UNION ALL
--登陆过医生设备
SELECT distinct t1.user_id
FROM
(
SELECT partition_date,
cl_id,
case when params['card_content_type'] in ('qa','answer') then 'qa'
when params['card_content_type'] in ('special_pool') then 'special' else params['card_content_type'] end as card_content_type,
CASE when params['transaction_type'] in ('fmctr','samecity_fmctr') then array('fmctr','合计')
when params['transaction_type'] in ('high_quality_fmctr') then array('high_quality_fmctr','合计')
WHEN (params['transaction_type'] like '%ctr' and params['transaction_type'] not in ('high_quality_ctr','high_quality_fmctr','fmctr','samecity_fmctr')) THEN array('ctr预估','合计')
when params['transaction_type'] in ('high_quality_ctr') then array('high_quality_ctr','合计')
WHEN params['transaction_type'] like '%cvr' THEN array('cvr预估','合计')
WHEN params['transaction_type'] in ('-1','smr') THEN array('smr','合计')
when params['transaction_type'] in ('pgc','hotspot') then array('热点卡片')
when params['transaction_type'] in ('newdata') then array('保量卡片')
when params['transaction_type'] in ('hotspot_feed') then array('hotspot_feed','合计')
when params['transaction_type'] in ('aistragegy') then array('新用户AI帖优先','合计')
when params['transaction_type'] in ('excestragegy') then array('新用户精华帖优先','合计')
when params['transaction_type'] in ('FIXEDSTRATEGY') then array('新氧新用户策略一','合计')
when params['transaction_type'] in ('FIXEDSTRATEGY_VIDEO') then array('新氧新用户策略二','合计')
when params['transaction_type'] like 'deeplink%' then array('deeplink策略','合计')
end AS recommend_type,
params['card_id'] as card_id,
app_session_id
from online.bl_hdfs_maidian_updates
WHERE partition_date={partition_day}
AND action='on_click_card'
AND params['page_name'] ='home'
AND params['tab_name'] = '精选'
AND (params['transaction_type'] in ('-1','smr','hotspot','pgc','newdata','hotspot_feed','aistragegy','excestragegy','FIXEDSTRATEGY','FIXEDSTRATEGY_VIDEO')
or params['transaction_type'] like '%ctr' or params['transaction_type'] like '%cvr' or params['transaction_type'] like 'deeplink%')
AND params['card_content_type'] in ('qa','diary','user_post','answer','special_pool')
GROUP BY partition_date,
cl_id,
case when params['card_content_type'] in ('qa','answer') then 'qa'
when params['card_content_type'] in ('special_pool') then 'special' else params['card_content_type'] end,
CASE when params['transaction_type'] in ('fmctr','samecity_fmctr') then array('fmctr','合计')
when params['transaction_type'] in ('high_quality_fmctr') then array('high_quality_fmctr','合计')
WHEN (params['transaction_type'] like '%ctr' and params['transaction_type'] not in ('high_quality_ctr','high_quality_fmctr','fmctr','samecity_fmctr')) THEN array('ctr预估','合计')
when params['transaction_type'] in ('high_quality_ctr') then array('high_quality_ctr','合计')
WHEN params['transaction_type'] like '%cvr' THEN array('cvr预估','合计')
WHEN params['transaction_type'] in ('-1','smr') THEN array('smr','合计')
when params['transaction_type'] in ('pgc','hotspot') then array('热点卡片')
when params['transaction_type'] in ('newdata') then array('保量卡片')
when params['transaction_type'] in ('hotspot_feed') then array('hotspot_feed','合计')
when params['transaction_type'] in ('aistragegy') then array('新用户AI帖优先','合计')
when params['transaction_type'] in ('excestragegy') then array('新用户精华帖优先','合计')
when params['transaction_type'] in ('FIXEDSTRATEGY') then array('新氧新用户策略一','合计')
when params['transaction_type'] in ('FIXEDSTRATEGY_VIDEO') then array('新氧新用户策略二','合计')
when params['transaction_type'] like 'deeplink%' then array('deeplink策略','合计') end,
params['card_id'],
app_session_id
)a
LATERAL VIEW explode (a.recommend_type) v as recommend_type
group by partition_date,card_content_type,cl_id,v.recommend_type,card_id having session_pv0 >0
UNION
SELECT partition_date,
cl_id,
count(distinct card_id) as session_pv0
FROM
(SELECT partition_date,
cl_id,
case when card_content_type in ('qa','answer') then 'qa'
when card_content_type in ('special_pool') then 'special' else card_content_type end as card_content_type,
CASE when transaction_type in ('fmctr','samecity_fmctr') then array('fmctr','合计')
when transaction_type in ('high_quality_fmctr') then array('high_quality_fmctr','合计')
WHEN (transaction_type like '%ctr' and transaction_type not in ('high_quality_ctr','high_quality_fmctr','fmctr','samecity_fmctr') ) THEN array('ctr预估','合计')
when transaction_type in ('high_quality_ctr') then array('high_quality_ctr','合计')
WHEN transaction_type like '%cvr' THEN array('cvr预估','合计')
WHEN transaction_type in ('-1','smr') THEN array('smr','合计')
when transaction_type in ('pgc','hotspot') then array('热点卡片')
when transaction_type in ('newdata') then array('保量卡片')
when transaction_type in ('hotspot_feed') then array('hotspot_feed','合计')
when transaction_type in ('aistragegy') then array('新用户AI帖优先','合计')
when transaction_type in ('excestragegy') then array('新用户精华帖优先','合计')
when transaction_type in ('FIXEDSTRATEGY') then array('新氧新用户策略一','合计')
when transaction_type in ('FIXEDSTRATEGY_VIDEO') then array('新氧新用户策略二','合计')
when transaction_type like 'deeplink%' then array('deeplink策略','合计')
end AS recommend_type,
card_id,
app_session_id
from online.ml_community_precise_exposure_detail
WHERE partition_date={partition_day}
AND action in ('page_precise_exposure','home_choiceness_card_exposure') --7745版本action改为page_precise_exposure
AND is_exposure = '1' ----精准曝光
AND page_name ='home'
AND tab_name = '精选'
AND (transaction_type in ('-1','smr','hotspot','pgc','newdata','hotspot_feed','aistragegy','excestragegy','FIXEDSTRATEGY','FIXEDSTRATEGY_VIDEO')
or transaction_type like '%ctr' or transaction_type like '%cvr' or transaction_type like 'deeplink%')
AND card_content_type in ('qa','diary','user_post','answer','special_pool')
group by partition_date,
case when card_content_type in ('qa','answer') then 'qa'
when card_content_type in ('special_pool') then 'special' else card_content_type end,
cl_id,
CASE when transaction_type in ('fmctr','samecity_fmctr') then array('fmctr','合计')
when transaction_type in ('high_quality_fmctr') then array('high_quality_fmctr','合计')
WHEN (transaction_type like '%ctr' and transaction_type not in ('high_quality_ctr','high_quality_fmctr','fmctr','samecity_fmctr')) THEN array('ctr预估','合计')
when transaction_type in ('high_quality_ctr') then array('high_quality_ctr','合计')
WHEN transaction_type like '%cvr' THEN array('cvr预估','合计')
WHEN transaction_type in ('-1','smr') THEN array('smr','合计')
when transaction_type in ('pgc','hotspot') then array('热点卡片')
when transaction_type in ('newdata') then array('保量卡片')
when transaction_type in ('hotspot_feed') then array('hotspot_feed','合计')
when transaction_type in ('aistragegy') then array('新用户AI帖优先','合计')
when transaction_type in ('excestragegy') then array('新用户精华帖优先','合计')
when transaction_type in ('FIXEDSTRATEGY') then array('新氧新用户策略一','合计')
when transaction_type in ('FIXEDSTRATEGY_VIDEO') then array('新氧新用户策略二','合计')
when transaction_type like 'deeplink%' then array('deeplink策略','合计') end,
card_id,
app_session_id
)a
LATERAL VIEW explode (a.recommend_type) v as recommend_type
group by partition_date,cl_id having session_pv0 >= 4
""".format(start_date='20201017',end_date='20201116',partition_day='20201116')
print(sql_search_ctr)
search_ctr_df = spark.sql(sql_search_ctr)
# spam_pv_df.createOrReplaceTempView("dev_view")
search_ctr_df.show(1)
sql_res = search_ctr_df.collect()
print("-------------------------------")
for res in sql_res:
print(res)
# print(res.query,res.search_pv)
# results = es.search(
# index='gm-dbmw-diary-read',
# doc_type='diary',
# timeout='10s',
# body=body
# )
SELECT user_id, v.device_id as device_id
FROM online.ml_user_history_detail
LATERAL VIEW EXPLODE(device_history_list) v AS device_id
WHERE partition_date = '{today_str}'
) t1
JOIN
(
SELECT device_id
FROM online.ml_device_history_detail
WHERE partition_date = '{today_str}'
AND is_login_doctor = '1'
) t2
ON t1.device_id = t2.device_id
)t2
on t1.user_id=t2.user_id
group by partition_date,device_id
)dev
on t2.device_id=dev.device_id
WHERE spam_pv.device_id IS NULL
and dev.device_id is null
""".format(today_str=today_str, yesterday_str_format=yesterday_str_format, today_str_format=today_str_format)
# Run the device query for the target day, expose it as the "device_id_view"
# temp view, and pull every row to the driver.
print(new_urser_device_id_sql)
new_urser_device_id_df = spark.sql(new_urser_device_id_sql)
new_urser_device_id_df.createOrReplaceTempView("device_id_view")
new_urser_device_id_df.show(1)
sql_res = new_urser_device_id_df.collect()

# Tally, per portrait category, how many devices carry each tag.
# NOTE(review): res_dict is only used by the commented-out code near the end of
# the file; it is kept so that any code outside this view still finds the name.
res_dict = {}
portrait_dict = {
    "first_demands": {},
    "second_demands": {},
    "first_solutions": {},
    "second_solutions": {},
    "first_positions": {},
    "second_positions": {},
    "projects": {},
    'anecdote_tags': {},
}
no_portrait_device_id_list = []
print("-------------------------------")
count_not_has_portratit = 0
# Reset on every run so that an empty result prints 0 devices below instead of
# raising NameError (first day) or reusing the previous day's index (later days).
count_user_count = -1
for count_user_count, res in enumerate(sql_res):
    # Presumably returns a mapping of portrait category -> list of tags for the
    # device; the helper is not defined in the visible part of this file.
    portratit_res = get_user_portrait_tag3_from_redis(res.device_id)
    temp_count = 0  # number of categories with at least one tag for this device
    for demand in portratit_res:
        if not portratit_res[demand]:
            continue
        try:
            tag_counts = portrait_dict[demand]
            # Only the first three tags of each category are counted.
            for tag in portratit_res[demand][0:3]:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
        except Exception as e:
            # Deliberate best-effort: a category missing from portrait_dict or a
            # malformed tag list is reported and skipped, not fatal.
            print("error ", e)
        temp_count += 1
    if not temp_count:
        # Every category was empty: the device has no portrait at all.
        count_not_has_portratit += 1
        no_portrait_device_id_list.append(res.device_id)
print(portrait_dict)
# Total devices processed, and how many of them had no portrait.
print(count_user_count + 1, count_not_has_portratit)
print("-------------------------------")
# Persist the per-tag counts of the "projects" category for the target day into
# new_user_project_count, one row per tag.
# The statement is parameterized so that tag text containing quotes can neither
# break nor alter the SQL; values are bound by the driver instead of str.format.
instert_sql = """replace into new_user_project_count(
partition_day,pid,protratit_count,protratit_type) VALUES(%s,%s,%s,%s);"""
project_counts = portrait_dict["projects"]
if project_counts:
    # NOTE(review): credentials are hard-coded in source; they should be loaded
    # from configuration or a secret store.
    db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='aqpuBLYzEV7tML5RPsN1pntUzFy',
                         db='jerry_prod')
    try:
        # One connection and cursor serve the whole batch instead of a new
        # connection per row.
        cursor = db.cursor()
        for protratit_type, action_count in project_counts.items():
            partition_date = today_str
            # pid is the md5 of day + tag, so re-running the same day produces
            # the same key for the "replace into" statement.
            pid = hashlib.md5((partition_date + protratit_type).encode("utf8")).hexdigest()
            params = (today_str, pid, action_count, protratit_type)
            # Log the statement exactly as it will be sent to the server.
            print(cursor.mogrify(instert_sql, params))
            res = cursor.execute(instert_sql, params)
            # Commit per row, as before, so earlier rows survive a later failure.
            db.commit()
            print(res)
    finally:
        # Always release the connection, including when execute() raises.
        db.close()
# print("-------------------------------")
# for count, res in enumerate(sql_res):
# # print(count, res)
# track = res.track
# if not track:
# continue
# track_list = track.split(",")
# for one_key_word in track_list:
# if one_key_word in res_dict:
# res_dict[one_key_word] += 1
# else:
# res_dict[one_key_word] = 1
# print(res_dict)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment