Commit 56c55b5a authored by litaolemo

update

parent b9ce66aa
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:47
# @File : revise_table_data_to_table.py
# @email : litao@igengmei.com
# @author : litao
import pymysql
def con_sql(sql):
    # Fetch data from a table in the database
"""
:type sql : str
:rtype : tuple
"""
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
db.close()
return result
sql = "slect * from xxx"
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 5 17:52:53 2018
@author: fangyucheng
"""
from crawler_sys.site_crawler.crawler_v_qq import Crawler_v_qq
from crawler_sys.utils.output_results import output_result
from crawler_sys.utils import Metaorphosis as meta
from crawler_sys.utils.output_log import output_log
logging = output_log(page_category='video_page',
program_info='tencent')
def tran_input_data_to_lst(file_name, file_category='csv'):
if file_category == 'csv':
video_info_lst = meta.csv_to_lst_whth_headline(file_name)
url_lst = []
for line in video_info_lst:
try:
if line['data_provider'] == 'CCR':
url_lst.append(line['url'])
except:
pass
return url_lst
elif file_category == 'file':
url_lst = meta.str_file_to_lst(file_name)
return url_lst
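# Expected input, as a sketch based on the code above: in 'csv' mode each row
# must carry 'data_provider' and 'url' columns and only rows whose
# data_provider equals 'CCR' are kept; in 'file' mode the file is read as one
# URL per line via str_file_to_lst.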
url_lst = tran_input_data_to_lst(file_name='R:/CCR/数据需求/短期临时需求/TX', file_category='file')
crawler = Crawler_v_qq()
get_video_page = crawler.video_page
def get_data_source(url_lst=url_lst,
output_to_file=False,
filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
output_es_index=None,
output_doc_type=None):
result_lst = []
for url in url_lst:
video_info = get_video_page(url=url)
result_lst.append(video_info)
logging.info('get_data at page %s' % url)
if len(result_lst) >= 100:
if output_es_index is not None and output_doc_type is not None:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
es_index=output_es_index,
doc_type=output_doc_type)
result_lst.clear()
else:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis)
result_lst.clear()
    if len(result_lst) != 0:
if output_es_index is not None and output_doc_type is not None:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
es_index=output_es_index,
doc_type=output_doc_type)
result_lst.clear()
else:
output_result(result_lst,
platform='腾讯视频',
output_to_file=output_to_file,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis)
result_lst.clear()
if __name__ == '__main__':
get_data_source(output_to_es_raw=True,
output_es_index='test2',
output_doc_type='fyc')
# -*- coding:UTF-8 -*-
# @Time : 2020/8/19 11:53
# @File : from_sparksql_to_mysql.py
# @email : litao@igengmei.com
# @author : litao
import hashlib
import json
import pymysql
import xlwt, datetime
import redis
# from pyhive import hive
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from elasticsearch_7 import Elasticsearch
from elasticsearch_7.helpers import scan
import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit
import pytispark.pytispark as pti
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
def con_sql(sql):
    # Fetch data from a table in the database
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='YPEzp78HQBuhByWPpefQu6X3D6hEPfD6',
db='jerry_prod')
cursor = db.cursor()
cursor.execute(sql)
result = cursor.fetchall()
db.close()
return result
startTime = time.time()
sparkConf = SparkConf()
sparkConf.set("spark.sql.crossJoin.enabled", True)
sparkConf.set("spark.debug.maxToStringFields", "100")
sparkConf.set("spark.tispark.plan.allow_index_double_read", False)
sparkConf.set("spark.tispark.plan.allow_index_read", True)
sparkConf.set("spark.hive.mapred.supports.subdirectories", True)
sparkConf.set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True)
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.set("mapreduce.output.fileoutputformat.compress", False)
sparkConf.set("mapreduce.map.output.compress", False)
sparkConf.set("prod.gold.jdbcuri",
"jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true")
sparkConf.set("prod.mimas.jdbcuri",
"jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true")
sparkConf.set("prod.gaia.jdbcuri",
"jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true")
sparkConf.set("prod.tidb.jdbcuri",
"jdbc:mysql://172.16.40.158:4000/eagle?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.jerry.jdbcuri",
"jdbc:mysql://172.16.40.158:4000/jerry_prod?user=st_user&password=aqpuBLYzEV7tML5RPsN1pntUzFy&rewriteBatchedStatements=true")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.158:2379")
sparkConf.set("prod.tispark.pd.addresses", "172.16.40.170:4000")
sparkConf.set("prod.tidb.database", "jerry_prod")
spark = (SparkSession.builder.config(conf=sparkConf).config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
.config("spark.tispark.pd.addresses", "172.16.40.170:2379").appName(
"LR PYSPARK TEST").enableHiveSupport().getOrCreate())
spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJsonFormatCheck'")
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
select_sql = """SELECT * FROM pm.tl_pm_contentpage_ctr"""
device_df = spark.sql(select_sql)
device_df.show(1, False)
sql_res = device_df.collect()
print("-----------------------------------------------------------------------------")
for res in sql_res:
day_id = res.day_id
device_os_type = res.device_os_type
active_type = res.active_type
grey_type = res.grey_type
page_name = res.page_name
content_pv = res.content_pv
content_uv = res.content_uv
wel_exp_pv = res.wel_exp_pv
content_exp_pv = res.content_exp_pv
wel_click_pv = res.wel_click_pv
content_click_pv = res.content_click_pv
slide_wel_click_pv = res.slide_wel_click_pv
self_wel_click_pv = res.self_wel_click_pv
partition_day = res.partition_day
    pid = hashlib.md5((day_id + device_os_type + active_type + grey_type + page_name).encode("utf-8")).hexdigest()
sql = """INSERT INTO conent_detail_page_grayscale_ctr(day_id,device_os_type,
active_type,grey_type,page_name,content_pv,content_uv,wel_exp_pv,content_exp_pv,wel_click_pv,content_click_pv,
wel_click_pv,content_click_pv,slide_wel_click_pv,partition_day,pid
) VALUES('{day_id}','{device_os_type}',
'{active_type}','{grey_type}','{page_name}',{content_pv},{content_uv},{wel_exp_pv},{content_exp_pv},{wel_click_pv},{content_click_pv},
{wel_click_pv},{content_click_pv},{slide_wel_click_pv},'{partition_day}','{pid})'""".format(
day_id=day_id,device_os_type=device_os_type,active_type=active_type,grey_type=grey_type,page_name=page_name,
content_pv=content_pv,content_uv=content_uv,wel_exp_pv=wel_exp_pv,content_exp_pv=content_exp_pv,wel_click_pv=wel_click_pv,
content_click_pv=content_click_pv,slide_wel_click_pv=slide_wel_click_pv,self_wel_click_pv=self_wel_click_pv,
partition_day=partition_day, pid=pid
)
    cursor.execute(sql)
    # cursor.executemany()
db.commit()
db.close()
# -*- coding:utf-8 -*-
# @Time : 2019/8/14 18:01
# @Author : litao
import json
# import argparse
import datetime
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from func_find_week_num import find_week_belongs_to
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.utils import trans_format
from write_data_into_es.func_cal_doc_id import cal_doc_id
hosts = '192.168.17.11'
port = 80
user = 'zhouyujiang'
passwd = '8tM9JDN2LVxM'
http_auth = (user, passwd)
es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
# parser = argparse.ArgumentParser()
# parser.add_argument('-w', '--week_str', type=str, default=None)
def week_start_day(week_year, week_no, week_day, week_day_start=1):
year_week_start = find_first_day_for_given_start_weekday(week_year, week_day_start)
week_start = year_week_start + datetime.timedelta(days=(week_no - 1) * 7)
return week_start
def define_doc_type(week_year, week_no, week_day_start):
"""
    doc_type = 'daily-url-2018_w24_s2' means that Tuesday is selected as the
    first day of each week and the string refers to the 24th week of 2018.
    In the isocalendar definition, Monday is weekday 1, Tuesday is weekday 2,
    ..., Saturday is weekday 6 and Sunday is weekday 7.
"""
doc_type_str = 'daily-url-%d_w%02d_s%d' % (week_year, week_no, week_day_start)
return doc_type_str
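# Worked example of the naming scheme described in the docstring:
# define_doc_type(2018, 24, week_day_start=2) -> 'daily-url-2018_w24_s2',
# i.e. the 24th week of 2018 with Tuesday (weekday 2) as the first day of the week.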
def find_first_day_for_given_start_weekday(year, start_weekday):
i = 0
while i < 7:
dayDi = datetime.date(year, 1, 1) + datetime.timedelta(days=i)
if dayDi.weekday() == start_weekday:
cal_day1D = dayDi - datetime.timedelta(days=1)
break
else:
cal_day1D = None
i += 1
return cal_day1D
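# Behaviour sketch: the loop scans the first seven days of the year for the
# first date whose datetime.weekday() equals start_weekday (Monday == 0) and
# returns the day before it, e.g. find_first_day_for_given_start_weekday(2019, 1)
# matches Tuesday 2019-01-01 and returns Monday 2018-12-31.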
def get_target_releaser_video_info(platform,
releaserUrl,
log_file=None,
output_to_es_raw=True,
es_index=None,
doc_type=None,
releaser_page_num_max=100):
    if log_file is None:
log_file = open('error.log', 'w')
crawler = get_crawler(platform=platform)
crawler_initialization = crawler()
if platform == 'haokan':
try:
crawler_initialization.releaser_page(releaserUrl=releaserUrl,
releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=True,
es_index=es_index,
doc_type=doc_type,
fetchFavoriteCommnt=True)
except:
print(releaserUrl, platform, file=log_file)
else:
try:
crawler_initialization.releaser_page(releaserUrl=releaserUrl,
releaser_page_num_max=releaser_page_num_max,
output_to_es_raw=True,
es_index=es_index,
doc_type=doc_type)
except:
print(releaserUrl, platform, file=log_file)
def func_search_reUrl_from_target_index(platform, releaser):
search_body = {
"query": {
"bool": {
"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"releaser.keyword": releaser}}
]
}
}
}
search_re = es.search(index='target_releasers', doc_type='doc', body=search_body)
if search_re['hits']['total'] > 0:
return search_re['hits']['hits'][0]['_source']['releaserUrl']
else:
        print('Cannot find:', platform, releaser)
return None
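# Usage sketch (the releaser name below is only a placeholder):
# releaser_url = func_search_reUrl_from_target_index('haokan', 'some_releaser')
# returns the releaserUrl stored in the target_releasers index for that
# platform/releaser pair, or None when no document matches.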
def func_write_into_weekly_index_new_released(line_list, doc_type, index='short-video-weekly'):
count = 0
bulk_all_body = ''
re_list = []
for line in line_list:
count = count + 1
weekly_net_inc_play_count = line['play_count']
weekly_net_inc_comment_count = line['comment_count']
weekly_net_inc_favorite_count = line['favorite_count']
weekly_cal_base = 'accumulate'
timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
line.update({
'timestamp': timestamp,
'weekly_cal_base': weekly_cal_base,
'weekly_net_inc_favorite_count': weekly_net_inc_favorite_count,
'weekly_net_inc_comment_count': weekly_net_inc_comment_count,
'weekly_net_inc_play_count': weekly_net_inc_play_count
})
re_list.append(line)
url = line['url']
platform = line['platform']
doc_id = cal_doc_id(platform, url=url, doc_id_type='all-time-url',data_dict=line)
bulk_head = '{"index": {"_id":"%s"}}' % doc_id
data_str = json.dumps(line, ensure_ascii=False)
bulk_one_body = bulk_head + '\n' + data_str + '\n'
#
bulk_all_body += bulk_one_body
if count % 500 == 0:
eror_dic = es.bulk(index=index, doc_type=doc_type,
body=bulk_all_body, request_timeout=200)
bulk_all_body = ''
if eror_dic['errors'] is True:
print(eror_dic['items'])
print(bulk_all_body)
print(count)
if bulk_all_body != '':
eror_dic = es.bulk(body=bulk_all_body,
index=index,
doc_type=doc_type,
request_timeout=200)
if eror_dic['errors'] is True:
print(eror_dic)
todayT = datetime.datetime.now()
# todayT=datetime.datetime(2019,2,5)
week_day_start = 1
# if args.week_str is None:
seven_days_ago_T = todayT - datetime.timedelta(days=7)
week_year, week_no, week_day = find_week_belongs_to(seven_days_ago_T,
week_day_start)
week_start = week_start_day(week_year, week_no, week_day)
re_s = week_start - datetime.timedelta(1)
re_s_dt = datetime.datetime.strptime(str(re_s), '%Y-%m-%d')
re_s_t = int(datetime.datetime.timestamp(re_s_dt) * 1000)
re_e = week_start + datetime.timedelta(6)
re_e_dt = datetime.datetime.strptime(str(re_e), '%Y-%m-%d')
re_e_t = int(datetime.datetime.timestamp(re_e_dt) * 1000)
# nowT_feihua = week_start + datetime.timedelta(days=6)
weekly_doc_type_name = define_doc_type(week_year, week_no,
week_day_start=week_day_start)
key_releaser_body = {
"query": {
"bool": {
"filter": [
{"term": {"key_releaser.keyword": "True"}}
]
}
}
}
releaser_re = scan(client=es, index='target_releasers', doc_type='doc',
query=key_releaser_body, scroll='3m')
for re in releaser_re:
releaser = re["_source"]['releaser']
platform = re["_source"]['platform']
    if releaser is not None:
re_list = []
search_body = {
"query": {
"bool": {
"filter": [
{"term": {"platform.keyword": platform}},
{"term": {"releaser.keyword": releaser}},
{"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
{"range": {"fetch_time": {"gte": re_s_t}}}
]
}
}
}
scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url',
query=search_body, scroll='3m')
for one_scan in scan_re:
re_list.append(one_scan['_source'])
func_write_into_weekly_index_new_released(re_list, doc_type=weekly_doc_type_name)