Commit 87168824 authored by litaolemo

update

parent 42a0f9ef
# -*- coding:UTF-8 -*-
# @Time : 2020/9/9 17:15
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
# -*- coding:UTF-8 -*-
# @Time : 2020/9/9 17:16
# @File : func_get_pv_card_id.py
# @email : litao@igengmei.com
# @author : litao
import hashlib
import json
import pymysql
import xlwt
import datetime
import redis
# from pyhive import hive
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from elasticsearch_7 import Elasticsearch
from elasticsearch_7.helpers import scan
import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
# from pyspark.sql.functions import lit
# import pytispark.pytispark as pti
def get_card_id():
    startTime = time.time()
    sparkConf = SparkConf()
    sparkConf.set("spark.sql.crossJoin.enabled", True)
    sparkConf.set("spark.debug.maxToStringFields", "100")
    sparkConf.set("spark.tispark.plan.allow_index_double_read", False)
    sparkConf.set("spark.tispark.plan.allow_index_read", True)
    sparkConf.set("spark.hive.mapred.supports.subdirectories", True)
    sparkConf.set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True)
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.set("mapreduce.output.fileoutputformat.compress", False)
    sparkConf.set("mapreduce.map.output.compress", False)
    sparkConf.set("prod.gold.jdbcuri",
                  "jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true")
    sparkConf.set("prod.mimas.jdbcuri",
                  "jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true")
    sparkConf.set("prod.gaia.jdbcuri",
                  "jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true")
    sparkConf.set("prod.tidb.jdbcuri",
                  "jdbc:mysql://172.16.40.158:4000/eagle?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
    sparkConf.set("prod.jerry.jdbcuri",
                  "jdbc:mysql://172.16.40.158:4000/jerry_prod?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
    sparkConf.set("prod.tispark.pd.addresses", "172.16.40.158:2379")
    # NOTE: the next set() overrides the line above, and 172.16.40.170:4000 looks like
    # a TiDB port rather than a PD port (2379); kept as in the original.
    sparkConf.set("prod.tispark.pd.addresses", "172.16.40.170:4000")
    sparkConf.set("prod.tidb.database", "jerry_prod")
    spark = (
        SparkSession.builder.config(conf=sparkConf)
        .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
        .config("spark.tispark.pd.addresses", "172.16.40.170:2379")
        .appName("LR PYSPARK TEST")
        .enableHiveSupport()
        .getOrCreate()
    )
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
    spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
    spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
    spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
    spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")

    task_list = []
    task_days = 2
    for t in range(1, task_days):  # with task_days = 2 this runs exactly once, for yesterday
        day_num = 0 - t
        now = (datetime.datetime.now() + datetime.timedelta(days=day_num))
        last_30_day_str = (now + datetime.timedelta(days=-30)).strftime("%Y%m%d")
        today_str = now.strftime("%Y%m%d")
        yesterday_str = (now + datetime.timedelta(days=-1)).strftime("%Y%m%d")
        one_week_ago_str = (now + datetime.timedelta(days=-3)).strftime("%Y%m%d")  # 3 days back, despite the name
        sql = r"""
SELECT * FROM
(   -- precise exposure, deduplicated by card_id and app_session_id
    SELECT partition_date,
           card_content_type,
           cl_id,
           recommend_type,
           card_id
    FROM
    (
        SELECT partition_date,
               cl_id,
               case when card_content_type in ('qa','answer') then 'qa' else card_content_type end as card_content_type,
               CASE when transaction_type in ('fmctr') then 'fmctr'
                    WHEN transaction_type like '%ctr' THEN 'ctr预估'
                    WHEN transaction_type like '%cvr' THEN 'cvr预估'
                    WHEN transaction_type in ('-1','smr') THEN 'smr'
                    when transaction_type in ('pgc','hotspot') then '热点卡片'
                    when transaction_type in ('newdata') then '保量卡片'
                    when transaction_type in ('hotspot_feed') then 'hotspot_feed'
               END AS recommend_type,
               card_id,
               app_session_id
        from online.ml_community_precise_exposure_detail
        WHERE partition_date='{partition_day}'
        AND action in ('page_precise_exposure','home_choiceness_card_exposure') -- since app version 7745 the action is page_precise_exposure
        AND is_exposure = '1' -- precise exposure only
        AND page_name ='home'
        AND tab_name = '精选'
        AND (transaction_type in ('-1','smr','hotspot','pgc','newdata','hotspot_feed')
             or transaction_type like '%ctr' or transaction_type like '%cvr')
        AND card_content_type in ('qa','diary','user_post','answer')
        group by partition_date,
            case when card_content_type in ('qa','answer') then 'qa' else card_content_type end,
            cl_id,
            CASE when transaction_type in ('fmctr') then 'fmctr'
                 WHEN transaction_type like '%ctr' THEN 'ctr预估'
                 WHEN transaction_type like '%cvr' THEN 'cvr预估'
                 WHEN transaction_type in ('-1','smr') THEN 'smr'
                 when transaction_type in ('pgc','hotspot') then '热点卡片'
                 when transaction_type in ('newdata') then '保量卡片'
                 when transaction_type in ('hotspot_feed') then 'hotspot_feed' END,
            card_id,
            app_session_id
    )a
    group by partition_date,card_content_type,cl_id,recommend_type,card_id
)t2
LEFT JOIN
(
    select distinct device_id
    from ml.ml_d_ct_dv_devicespam_d -- exclude click-farm / cheating devices (filtered from view and exposure events)
    WHERE partition_day='{partition_day}'
    union all
    select distinct device_id
    from dim.dim_device_user_staff -- exclude internal (intranet) users
)spam_pv
on spam_pv.device_id=t2.cl_id
LEFT JOIN
(
    SELECT partition_date,device_id
    FROM
    (   -- first device_id each user_id was active on that day
        SELECT user_id,partition_date,
               if(size(device_list) > 0, device_list [ 0 ], '') AS device_id
        FROM online.ml_user_updates
        WHERE partition_date='{partition_day}'
    )t1
    JOIN
    (   -- doctor accounts
        SELECT distinct user_id
        FROM online.tl_hdfs_doctor_view
        WHERE partition_date = '{partition_day}'
        -- sockpuppet accounts / model users
        UNION ALL
        SELECT user_id
        FROM ml.ml_c_ct_ui_user_dimen_d
        WHERE partition_day = '{partition_day}'
        AND (is_puppet = 'true' or is_classifyuser = 'true')
        UNION ALL
        -- company intranet users
        select distinct user_id
        from dim.dim_device_user_staff
        UNION ALL
        -- devices that have logged in as a doctor
        SELECT distinct t1.user_id
        FROM
        (
            SELECT user_id, v.device_id as device_id
            FROM online.ml_user_history_detail
            LATERAL VIEW EXPLODE(device_history_list) v AS device_id
            WHERE partition_date = '{partition_day}'
        ) t1
        JOIN
        (
            SELECT device_id
            FROM online.ml_device_history_detail
            WHERE partition_date = '{partition_day}'
            AND is_login_doctor = '1'
        ) t2
        ON t1.device_id = t2.device_id
    )t2
    on t1.user_id=t2.user_id
    group by partition_date,device_id
)dev
on t2.partition_date=dev.partition_date and t2.cl_id=dev.device_id
WHERE spam_pv.device_id IS NULL
and dev.device_id is null
""".format(partition_day=yesterday_str)
        device_df = spark.sql(sql)
        device_df.show(1, False)
        sql_res = device_df.collect()
        res_dict = {
            "diary": [],
            "user_post": [],
            "qa": []
        }
        for res in sql_res:
            print(res)
            card_content_type = res.card_content_type
            card_id = res.card_id
            if card_content_type in res_dict:
                res_dict[card_content_type].append(card_id)
        # returns inside the day loop, which only runs once (task_days = 2)
        return res_dict
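

# Minimal usage sketch (assumes a Spark/Hive environment matching the config above):
# get_card_id() returns the exposed card ids keyed by content type,
# e.g. {"diary": [...], "user_post": [...], "qa": [...]}.
if __name__ == "__main__":
    card_ids = get_card_id()
    for content_type, ids in card_ids.items():
        print(content_type, len(ids))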
# -*- coding:UTF-8 -*-
# @Time : 2020/9/9 10:07
# @File : portary_div_exposure.py
# @email : litao@igengmei.com
# @author : litao
import json
import traceback
import redis
import pymysql
from elasticsearch import Elasticsearch
from meta_base_code.utils.func_get_pv_card_id import get_card_id
redis_client = redis.StrictRedis.from_url("redis://:ReDis!GmTx*0aN6@172.16.40.133:6379")
redis_client2 = redis.StrictRedis.from_url("redis://:ReDis!GmTx*0aN9@172.16.40.173:6379")
redis_client3 = redis.StrictRedis.from_url("redis://:ReDis!GmTx*0aN12@172.16.40.164:6379")
redis_client4 = redis.StrictRedis.from_url("redis://:XfkMCCdWDIU%ls$h@172.16.50.145:6379")
es = Elasticsearch([
    {
        'host': '172.16.31.17',
        'port': 9200,
    }, {
        'host': '172.16.31.11',
        'port': 9200,
    }])
def user_portrait_scan_info():
    try:
        round = 0
        all_count = 0
        empty_count = 0
        just_projects_count = 0
        keys = "doris:user_portrait:tag3:device_id:*"
        # SCAN as a do-while: the original scanned once before the loop and then
        # rescanned at the top of it, which silently dropped the first batch.
        cur = 0
        while True:
            cur, results = redis_client2.scan(cur, keys, 3000)
            round += 1
            print("round: " + str(round))
            for key in results:
                key = str(key, "utf-8")
                device_id = key.split(":")[-1]
                all_count += 1
                # if user_portrait_is_empty(device_id):
                #     print(device_id)
                #     empty_count += 1
                # if user_portrait_just_projects(device_id):
                #     print(device_id)
                #     just_projects_count += 1
                #     user_portrait_get_empty_candidates(device_id)
                yield get_user_portrait_tag3_from_redis(device_id)
            if cur == 0:  # a cursor of 0 means SCAN has completed a full iteration
                break
        print("all count: " + str(all_count))
        print("empty portrait: " + str(empty_count))
        print("just projects portrait: " + str(just_projects_count))
    except Exception as e:
        print(e)
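

# Usage sketch: user_portrait_scan_info is a generator, so nothing happens until it
# is iterated. A hypothetical driver that tallies how many devices carry any
# "projects" tag could look like this:
# count = 0
# for portrait in user_portrait_scan_info():
#     if portrait and portrait.get("projects"):
#         count += 1
# print(count)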
def get_user_portrait_tag3_redis_key(device_id):
    return "doris:user_portrait:tag3:device_id:" + str(device_id)


def get_user_portrait_tag3_from_redis(device_id, limit_score=0):
    def items_gt_score(d):
        new_d = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
        res = {tag: float(score) for tag, score in new_d.items() if float(score) >= limit_score}
        return list(res.keys())

    portrait_key = get_user_portrait_tag3_redis_key(device_id)
    if redis_client2.exists(portrait_key):
        user_portrait = json.loads(redis_client2.get(portrait_key))
        first_demands = items_gt_score(user_portrait.get("first_demands", {}))  # first-level demands
        second_demands = items_gt_score(user_portrait.get("second_demands", {}))  # second-level demands
        first_solutions = items_gt_score(user_portrait.get("first_solutions", {}))  # first-level solutions
        second_solutions = items_gt_score(user_portrait.get("second_solutions", {}))  # second-level solutions
        first_positions = items_gt_score(user_portrait.get("first_positions", {}))  # first-level body positions
        second_positions = items_gt_score(user_portrait.get("second_positions", {}))  # second-level body positions
        projects = items_gt_score(user_portrait.get("projects", {}))  # projects
        anecdote_tags = items_gt_score(user_portrait.get("anecdote_tags", {}))  # gossip / anecdote tags
        return {
            "first_demands": first_demands,
            "second_demands": second_demands,
            "first_solutions": first_solutions,
            "second_solutions": second_solutions,
            "first_positions": first_positions,
            "second_positions": second_positions,
            "projects": projects,
            "anecdote_tags": anecdote_tags
        }
    return {}
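

# Example call (hypothetical device id): keep only tags scoring at least 0.5.
# An empty dict is returned when no portrait exists for the device.
# portrait = get_user_portrait_tag3_from_redis("hypothetical-device-id", limit_score=0.5)
# print(portrait.get("projects", []))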
def get_channel_tags_info():
    """
    tag_ids: [416, 432, 421, 423, 275, 582]
    :return: dict of tag-name lists keyed by tag category
    """
    sql = "SELECT name, tag_type from api_tag_3_0"
    results = get_data_by_mysql("172.16.30.141", 3306, "zx_str", "ZXueX58pStrage", "zhengxing", sql)
    first_demands_lst = []
    second_demands_lst = []
    first_solutions_lst = []
    second_solutions_lst = []
    first_positions_lst = []
    second_positions_lst = []
    projects_lst = []
    # channels_lst = []
    for i in results:
        name = i.get("name", "")
        tag_id = i.get("tag_type", -1)
        if tag_id == 1:
            projects_lst.append(name)
        elif tag_id == 21:
            first_positions_lst.append(name)
        elif tag_id == 22:
            second_positions_lst.append(name)
        elif tag_id == 19:
            first_demands_lst.append(name)
        elif tag_id == 20:
            second_demands_lst.append(name)
        elif tag_id == 18:
            first_solutions_lst.append(name)
        elif tag_id == 16:
            second_solutions_lst.append(name)
        # elif tag_id == 29:
        #     channels_lst.append(name)
    return {
        "first_demands": first_demands_lst,
        "second_demands": second_demands_lst,
        "first_solutions": first_solutions_lst,
        "second_solutions": second_solutions_lst,
        "first_positions": first_positions_lst,
        "second_positions": second_positions_lst,
        "projects": projects_lst,
        # "channels": channels_lst
    }
def get_device_num_from_es(word):
    results = es.search(
        index='gm-dbmw-device',
        doc_type='doc',
        timeout='10s',
        size=0,
        body={
            "query": {
                "bool": {
                    "should": [
                        {"nested": {"path": "first_demands", "query": {"bool": {"must": [
                            {"terms": {"first_demands.name": [word]}}]}}}},
                        {"nested": {"path": "second_demands", "query": {"bool": {"must": [
                            {"terms": {"second_demands.name": [word]}}]}}}},
                        {"nested": {"path": "first_solutions", "query": {"bool": {"must": [
                            {"terms": {"first_solutions.name": [word]}}]}}}},
                        {"nested": {"path": "second_solutions", "query": {"bool": {"must": [
                            {"terms": {"second_solutions.name": [word]}}]}}}},
                        {"nested": {"path": "first_positions", "query": {"bool": {"must": [
                            {"terms": {"first_positions.name": [word]}}]}}}},
                        {"nested": {"path": "second_positions", "query": {"bool": {"must": [
                            {"terms": {"second_positions.name": [word]}}]}}}},
                        {"nested": {"path": "projects", "query": {"bool": {"must": [
                            {"terms": {"projects.name": [word]}}]}}}}
                    ],
                    "minimum_should_match": 1
                }
            }
        }
    )
    device_num = results["hits"]["total"]  # renamed from tractate_content_num: this counts devices
    return device_num
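

# Illustrative sketch (not wired in above): the seven nested "should" clauses differ
# only in the nested path, so they could be generated from a list. This assumes the
# same index mapping as get_device_num_from_es uses.
def build_nested_should(word):
    paths = ["first_demands", "second_demands", "first_solutions", "second_solutions",
             "first_positions", "second_positions", "projects"]
    # one nested terms-clause per portrait field, matching documents where any
    # entry's .name equals the given word
    return [
        {"nested": {"path": p, "query": {"bool": {"must": [{"terms": {p + ".name": [word]}}]}}}}
        for p in paths
    ]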
def get_es_article_num(tag_dict):
    # {tag_name: (answer_content_num, tractate_content_num, diary_content_num, total_num)}
    article_dict = {
        "first_demands": [],
        "second_demands": [],
        "first_solutions": [],
        "second_solutions": [],
        "first_positions": [],
        "second_positions": [],
        "projects": [],
    }
    for tag_type in tag_dict:
        for tag_name in tag_dict[tag_type]:
            # answer
            body = {
                "query": {
                    "bool": {
                        "minimum_should_match": 1,
                        "should": [],
                        "must": [
                            {"term": {"is_online": True}},
                            {"terms": {"content_level": [6, 5, 4, 3.5, 3]}},
                            {"range": {"content_length": {"gte": 30}}}
                        ],
                    }
                },
            }
            body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}})
            results = es.search(
                index='gm-dbmw-answer-read',
                doc_type='answer',
                timeout='10s',
                size=0,
                body=body
            )
            answer_content_num = results["hits"]["total"]
            # tractate
            body = {
                "query": {
                    "bool": {
                        "minimum_should_match": 1,
                        "should": [],
                        "must": [
                            {"term": {"is_online": True}},
                            {"terms": {"content_level": [6, 5, 4, 3.5, 3]}}
                        ]
                    }
                }
            }
            body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}})
            results = es.search(
                index='gm-dbmw-tractate-read',
                doc_type='tractate',
                timeout='10s',
                size=0,
                body=body
            )
            tractate_content_num = results["hits"]["total"]
            # diary
            body = {
                "query": {
                    "bool": {
                        "minimum_should_match": 1,
                        "should": [],
                        "must": [
                            {"term": {"is_online": True}},
                            {"term": {"has_cover": True}},
                            {"term": {"is_sink": False}},
                            {"term": {"has_after_cover": True}},
                            {"term": {"has_before_cover": True}},
                            {"range": {"content_level": {"gte": "3"}}},
                            {"term": {"content_simi_bol_show": 0}}
                        ]
                    }
                },
            }
            body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}})
            results = es.search(
                index='gm-dbmw-diary-read',
                doc_type='diary',
                timeout='10s',
                size=0,
                body=body
            )
            diary_content_num = results["hits"]["total"]
            total_num = answer_content_num + tractate_content_num + diary_content_num
            data_dic = {tag_name: (answer_content_num, tractate_content_num, diary_content_num, total_num)}
            print(data_dic)
            article_dict[tag_type].append(data_dic)
    return article_dict
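

# The three per-index counts above repeat the same build-body/append-tag/search/read-total
# pattern. A hypothetical helper could factor that out; the index names and the
# results["hits"]["total"] shape follow the code above:
def _count_hits(index, doc_type, body, tag_type, tag_name):
    body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}})
    results = es.search(index=index, doc_type=doc_type, timeout='10s', size=0, body=body)
    return results["hits"]["total"]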
def get_data_by_mysql(host, port, user, passwd, db, sql):
    try:
        conn = pymysql.connect(host=host, port=port, user=user, passwd=passwd, db=db,
                               cursorclass=pymysql.cursors.DictCursor)
        cursor = conn.cursor()
        cursor.execute(sql)
        results = cursor.fetchall()
        conn.close()
        return results
    except Exception:
        print("error2_user_portrait", traceback.format_exc())
        return traceback.format_exc()
def from_id_get_tag(card_id_dict):
    index = ""
    doc_type = ""
    query_count = {}
    for card_type in card_id_dict:
        if card_type == "diary":
            index = 'gm-dbmw-diary-read'
            doc_type = 'diary'
        elif card_type == "qa":
            index = 'gm-dbmw-answer-read'
            doc_type = 'answer'
        elif card_type == "user_post":
            index = 'gm-dbmw-tractate-read'
            doc_type = 'tractate'
        for card_id in card_id_dict[card_type]:
            res = es.get_source(index, card_id, doc_type=doc_type)
            print(res)
            # default to [] so missing fields don't break the concatenation below
            first_demands = res.get("first_demands") or []
            second_demands = res.get("second_demands") or []
            first_solutions = res.get("first_solutions") or []
            second_solutions = res.get("second_solutions") or []
            first_positions = res.get("first_positions") or []
            second_positions = res.get("second_positions") or []
            projects = res.get("projects") or []
            word_count_list = (first_demands + second_demands + first_solutions + second_solutions
                               + first_positions + second_positions + projects)
            for word in word_count_list:
                if word in query_count:
                    query_count[word] += 1
                else:
                    query_count[word] = 1  # first occurrence counts as 1, not 0
    return query_count
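

# A collections.Counter would express the same tally more idiomatically and avoid
# the off-by-one risk entirely; sketch of the inner-loop replacement:
# from collections import Counter
# query_count = Counter()
# ...
# query_count.update(word_count_list)  # inside the per-card loop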
def parse_data():
    demands_num = {}
    # fetch all tags
    all_tags = get_channel_tags_info()
    print(all_tags)
    # diary / tractate / answer counts per tag
    article_num_dict = get_es_article_num(all_tags)
    # ids of exposed cards
    card_id_dict = get_card_id()
    # tags of the exposed cards
    word_count_exposure = from_id_get_tag(card_id_dict)


if __name__ == "__main__":
    parse_data()