ML / ffm-baseline / Commits / a408229b

Commit a408229b authored Sep 27, 2019 by 张彦钊 (parent 16054827)

    add

Showing 1 changed file with 21 additions and 99 deletions: hello.py (+21 / -99)
@@ -18,109 +18,31 @@ import redis
import datetime

# filter logging
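# gbk_decoder: Kafka key/value decoder used below; tries msgpack first and falls
# back to json. Returns None for empty input.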
def gbk_decoder(s):
    if s is None:
        return None
    try:
        data = msgpack.loads(s, encoding='utf-8')
        return data
    except:
        data = json.loads(s)
        return data
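# ctr: count how many cards in an exposure list are flagged with is_cpc == 1.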
def ctr(x):
    sum = 0
    for i in x:
        if i['is_cpc'] == 1:
            sum = sum + 1
    return sum
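# maidian ("埋点", event tracking): keep only click events that are dislike feedback
# ("feedback_type" "1" or "2") on a diary card in the home page's 精选 ("featured") tab.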
def maidian(x):
    try:
        data = json.loads(x[1])
        if 'type' in data and 'device' in data:
            if data['type'] == 'on_click_button' \
                    and data['params']['page_name'] == 'home' and data['params']['tab_name'] == '精选' \
                    and data['params']['button_name'] == 'user_feedback_type' \
                    and data['params']['extra_param'][0]["card_content_type"] == "diary" \
                    and ("1" in data['params']['extra_param'][0]["feedback_type"]
                         or "2" in data['params']['extra_param'][0]["feedback_type"]):
                return True
            else:
                return False
        else:
            return False
    except Exception as e:
        print("filter fail")
        print(e)
def get_data(x):
    try:
        device_id = x[1]['device']['device_id']
        diary_id = x[1]['params']['extra_param'][0]["card_id"]
        return device_id, diary_id
    except Exception as e:
        print("get_data fail")
        send_email("get_data", "get_data", e)
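# write_redis: for the disliked diaries, look up their tag_type '3' tags over the
# MySQL protocol, collect other online diaries sharing those tags, and append them
# to the device's "<device_id>_dislike_diary" Redis key (new keys expire after 7 days).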
def write_redis(device_id, cid_list):
    try:
        db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                             db='eagle')
        sql = "select b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b " \
              "on a.tag_id = b.id where b.tag_type = '3' and a.diary_id in {}".format(tuple(cid_list))
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        tags = list(set([i[0] for i in result]))
        if tags is not None:
            sql = "select a.id from src_mimas_prod_api_diary a left join src_mimas_prod_api_diary_tags b " \
                  "on a.id=b.diary_id left join src_zhengxing_api_tag c on b.tag_id=c.id " \
                  "where a.is_online = 1 and a.content_level >= '3' " \
                  "and c.id in {} and c.tag_type = '3'".format(tuple(tags))
            cursor.execute(sql)
            result = cursor.fetchall()
            if result is not None:
                cids = list(set([i[0] for i in result]))
                r = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN6@172.16.40.133:6379')
                key = str(device_id) + "_dislike_diary"
                if r.exists(key):
                    value = eval(r.get(key))
                    value.extend(cids)
                    cids = json.dumps(list(set(value)))
                    # NOTE: cids is already a JSON string here, so json.dumps(cids)
                    # encodes it a second time; this looks unintentional.
                    r.set(key, json.dumps(cids))
                else:
                    r.set(key, json.dumps(cids))
                    r.expire(key, 7 * 24 * 60 * 60)
    except Exception as e:
        print("insert redis fail")
        print(e)
def model(rdd):
    try:
        # NOTE: .na.drop().groupByKey() is applied to the (device_id, diary_id) tuple
        # returned by get_data(), which looks misplaced (these are DataFrame/RDD methods).
        rdd.filter(lambda x: maidian(x)).map(lambda x: get_data(x).na.drop().groupByKey()) \
            .map(lambda x: write_redis(x[0], x[1]))
    except Exception as e:
        print("fail")
        print(e)
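# Streaming job: consume the gm-maidian-data Kafka topic in 10-second batches
# and run model() on each batch.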
if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa01:7077").setAppName("dislike_filter")
                      .set("spark.io.compression.codec", "lzf"))
    ssc = StreamingContext(sc, 10)
    sc.setLogLevel("WARN")
    kafkaParams = {"metadata.broker.list": "172.16.44.25:9092,172.16.44.31:9092,172.16.44.45:9092",
                   "group.id": "dislike",
                   "socket.timeout.ms": "600000",
                   "auto.offset.reset": "largest"}
    try:
        stream = KafkaUtils.createDirectStream(ssc, ["gm-maidian-data"], kafkaParams,
                                               keyDecoder=gbk_decoder, valueDecoder=gbk_decoder)
        transformstream = stream.transform(lambda x: model(x))
        transformstream.pprint()
        ssc.start()
        ssc.awaitTermination()
    except Exception as e:
        print(e)
        # send_email(sc.appName, sc.applicationId, e)
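    # Batch section (presumably exploratory): a separate SparkSession with Hive and
    # TiSpark settings samples exposure logs for partition_date 20190926 and counts
    # CPC cards per exposure list with ctr().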
    sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
        .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
        .set("spark.tispark.plan.allow_index_double_read", "false") \
        .set("spark.tispark.plan.allow_index_read", "true") \
        .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
        .set("spark.tispark.pd.addresses", "172.16.40.170:2379").set("spark.io.compression.codec", "lzf") \
        .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
    sql = "select params['exposure_cards'] from online.ml_community_precise_exposure_detail " \
          "where action = 'page_precise_exposure' and page_name = 'search_result_welfare' " \
          "AND partition_date='20190926' limit 20"
    rdd = spark.sql(sql).rdd.map(lambda x: x[0]).map(lambda x: eval(x[0])).map(lambda x: ctr(x))
    spark.createDataFrame(rdd).show(6)