Commit a408229b authored by 张彦钊

add

parent 16054827
@@ -18,109 +18,31 @@ import redis
import datetime
# filter logging
def gbk_decoder(s):
    # Try msgpack first; fall back to JSON for plain-text payloads.
    if s is None:
        return None
    try:
        data = msgpack.loads(s, encoding='utf-8')
        return data
    except Exception:
        data = json.loads(s)
        return data
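# Illustrative behavior (assumption: values arrive either msgpack- or JSON-encoded):
# a msgpack payload decodes on the first attempt, while a plain JSON string raises
# inside msgpack.loads and falls through to json.loads, e.g.
# gbk_decoder(b'{"type": "on_click_button"}')  # -> {"type": "on_click_button"}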
def ctr(x):
    # Count the CPC (cost-per-click) cards in an exposure list.
    # ("count" instead of the original "sum", which shadowed the builtin.)
    count = 0
    for i in x:
        if i['is_cpc'] == 1:
            count = count + 1
    return count
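# Illustrative call (input shape inferred from the field access above):
# ctr([{"is_cpc": 1}, {"is_cpc": 0}, {"is_cpc": 1}])  # -> 2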
def maidian(x):
    # Keep only "dislike diary" feedback clicks from the home page "精选" (Featured) tab.
    try:
        data = json.loads(x[1])
        if 'type' in data and 'device' in data:
            if data['type'] == 'on_click_button' \
                    and data['params']['page_name'] == 'home' and data['params']['tab_name'] == '精选' \
                    and data['params']['button_name'] == 'user_feedback_type' \
                    and data['params']['extra_param'][0]["card_content_type"] == "diary" \
                    and ("1" in data['params']['extra_param'][0]["feedback_type"]
                         or "2" in data['params']['extra_param'][0]["feedback_type"]):
                return True
            else:
                return False
        else:
            return False
    except Exception as e:
        print("filter fail")
        print(e)
        return False  # treat unparsable events as filtered out instead of returning None
def get_data(x):
    try:
        device_id = x[1]['device']['device_id']
        diary_id = x[1]['params']['extra_param'][0]["card_id"]
        return device_id, diary_id
    except Exception as e:
        print("get_data fail")
        send_email("get_data", "get_data", e)
        return None  # callers must drop None records before grouping
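# Illustrative record (the device id is a made-up placeholder): get_data turns a
# matching click event into a (device_id, diary_id) pair, e.g. ("8620880000000000", 12345),
# which groupByKey later collapses into (device_id, [diary ids]) per device.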
def write_redis(device_id, cid_list):
    try:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
        # tuple() renders a single-element list as "(x,)", which is invalid SQL;
        # duplicating the element keeps the IN clause valid with the same semantics.
        if len(cid_list) == 1:
            cid_list = cid_list * 2
        sql = "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
              "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id in {}".format(tuple(cid_list))
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        tags = list(set([i[0] for i in result]))
        if tags:
            if len(tags) == 1:
                tags = tags * 2
            sql = "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b " \
                  "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id " \
                  "where a.is_online = 1 and a.content_level >= '3' " \
                  "and c.id in {} and c.tag_type = '3'".format(tuple(tags))
            cursor.execute(sql)
            result = cursor.fetchall()
            if result:
                cids = list(set([i[0] for i in result]))
                r = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
                key = str(device_id) + "_dislike_diary"
                if r.exists(key):
                    value = json.loads(r.get(key))
                    value.extend(cids)
                    cids = list(set(value))
                # store one plain JSON list; the original double json.dumps wrote a
                # quoted string that broke the read-merge path above
                r.set(key, json.dumps(cids))
                # SET clears any existing TTL, so re-apply it on every write
                r.expire(key, 7 * 24 * 60 * 60)
        db.close()
    except Exception as e:
        print("insert redis fail")
        print(e)
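# Resulting Redis layout (sketch; key pattern and TTL taken from write_redis above,
# the ids are illustrative):
#   key:   "<device_id>_dislike_diary"
#   value: JSON list of diary ids sharing a tag with the disliked diaries, e.g. "[101, 102]"
#   TTL:   7 days, refreshed on each update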
def model(rdd):
    # Used with DStream.transform, so it must return an RDD; the original chained the
    # DataFrame-only .na.drop() onto a tuple and returned nothing.
    try:
        return rdd.filter(maidian) \
            .map(get_data) \
            .filter(lambda x: x is not None) \
            .groupByKey() \
            .map(lambda x: write_redis(x[0], list(x[1])))
    except Exception as e:
        print("fail")
        print(e)
if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike_filter").set(
        "spark.io.compression.codec", "lzf"))
    ssc = StreamingContext(sc, 10)
    sc.setLogLevel("WARN")
    kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
                   "group.id": "dislike",
                   "socket.timeout.ms": "600000",
                   "auto.offset.reset": "largest"}
    try:
        stream = KafkaUtils.createDirectStream(ssc, ["gm-maidian-data"], kafkaParams, keyDecoder=gbk_decoder,
                                               valueDecoder=gbk_decoder)
        transformstream = stream.transform(lambda x: model(x))
        transformstream.pprint()
        ssc.start()
        ssc.awaitTermination()
    except Exception as e:
        print(e)
        # send_email(sc.appName, sc.applicationId, e)
    sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
        .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
        .set("spark.tispark.plan.allow_index_double_read", "false") \
        .set("spark.tispark.plan.allow_index_read", "true") \
        .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
        .set("spark.tispark.pd.addresses", "172.16.40.170:2379").set("spark.io.compression.codec", "lzf") \
        .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
    sql = "select params['exposure_cards'] from online.ml_community_precise_exposure_detail " \
          "where action = 'page_precise_exposure' and page_name = 'search_result_welfare' " \
          "AND partition_date='20190926' limit 20"
    # params['exposure_cards'] holds a list literal as a string; eval the whole string
    # (the original eval(x[0]) only looked at its first character).
    rdd = spark.sql(sql).rdd.map(lambda x: x[0]).map(lambda x: eval(x)).map(lambda x: ctr(x))
    # wrap each count in a tuple so createDataFrame can infer a schema
    # (the column name is arbitrary)
    spark.createDataFrame(rdd.map(lambda n: (n,)), ["cpc_count"]).show(6)