Commit 781e362a authored by litaolemo

update

parent 381836ae
@@ -19,6 +19,16 @@ from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
conn_mimas = pymysql.connect(host='172.16.30.138', port=3306, user='mimas', passwd='GJL3UJe1Ck9ggL6aKnZCq4cRvM',
db='mimas_prod', charset='utf8mb4')
cur_mimas = conn_mimas.cursor()
db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
password="o5gbA27hXHHm",
db="doris_prod",
charset='utf8',
cursorclass=pymysql.cursors.DictCursor)
zhengxing_cursor = db_zhengxing_eagle.cursor()
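# Data sources used below: conn_mimas points at mimas_prod (api_tractate, published posts),
# db_zhengxing_eagle at doris_prod (api_search_words, search-keyword statistics), while
# es_framework and rds are the crawler-side Elasticsearch and Redis instances.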
es = Elasticsearch([
{
@@ -30,6 +40,7 @@ es = Elasticsearch([
}])
# def zipDir(dirpath,outFullName):
# """
#     Compress the specified folder
@@ -53,51 +64,15 @@ def send_email_tome():
fromaddr = 'litao@igengmei.com'
content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
zipFile = "/srv/apps/crawler/crawler_sys/utils/近一月数据统计结果.xls"
        send_file_email("", "", email_group=["<duanyingrong@igengmei.com>","<litao@igengmei.com>"], title_str=content
zipFile = "/srv/apps/crawler/近一月数据统计结果.xls"
send_file_email("", "", email_group=["<litao@igengmei.com>"], title_str=content
, email_msg_body_str=content, file=zipFile)
except Exception as e:
print(e)
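# send_email_tome() is invoked at the end of __main__ to mail the generated weekly statistics workbook.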
def get_es_word(word):
###answer
results = es.search(
index='gm-dbmw-answer-read',
doc_type='answer',
timeout='10s',
size=0,
body={
"query": {
"bool": {
"minimum_should_match": 1,
"should": [{"match_phrase": {"title": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"desc": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}}],
"must": [
{
"term": {
"is_online": True
}
}, {
"terms": {
"content_level": [6, 5, 4, 3.5, 3]
}
}, {
"range": {
"content_length": {
"gte": 30
}
}
}],
}
},
}
)
answer_content_num = results["hits"]["total"]
# tractate
def get_es_word(word, start_ts):
    # tractate online
results = es.search(
index='gm-dbmw-tractate-read',
doc_type='tractate',
@@ -113,129 +88,81 @@ def get_es_word(word):
"analyzer": "gm_default_index"}}}],
"must": [{"term": {"is_online": True}}, {
"terms":
{"content_level": [6, 5, 4, 3.5, 3]}
{"content_level": [6, 5, 4, 3]}
}]
}
},"size": 0,"aggs": {
"content_level": {
"terms": {
"field": "content_level",
"size": 10
}
}
}
}
)
tractate_content_num = results["hits"]["total"]
content_level_count = results["aggregations"]["content_level"]["buckets"]
    ### diary
    # crawled tractates
results = es.search(
index='gm-dbmw-diary-read',
doc_type='diary',
index='gm-dbmw-tractate-read',
doc_type='tractate',
timeout='10s',
size=0,
body={
"query": {
"bool": {
"minimum_should_match": 1,
"should": [{"match_phrase": {"tags": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"service.name": {"query": word, "analyzer": "gm_default_index"}}}],
"must": [{"term": {"is_online": True}}, {
"term": {
"has_cover": True
}
}, {"term": {
"is_sink": False
}
}, {
"term": {
"has_after_cover": True
}
}, {
"term": {
"has_before_cover": True
}
}, {"range": {"content_level": {"gte": "3"}}},
{
"term": {
"content_simi_bol_show": 0
}
}
]
"should": [{"match_phrase": {"content": {"query": word, "analyzer": "gm_default_index"}}}, {
"match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}}, {
"match_phrase": {"tractate_tag_name_content": {"query": word,
"analyzer": "gm_default_index"}}}],
"must": [
{"terms": {"platform": [13,14,15,16,17]}}, {"range":{"create_time":{"gte":start_ts}}}
]
}
},
}
)
diary_content_num = results["hits"]["total"]
diary_query = {
"query": {
"bool": {
"must": [{
"term": {
"is_online": True
}
}, {
"term": {
"has_cover": True
}
}, {
"term": {
"is_sink": False
}
}, {
"term": {
"has_after_cover": True
}
}, {
"term": {
"has_before_cover": True
}
}, {
"terms": {
"content_level": [6, 5, 4, 3.5, 3]
}
}, {
"term": {
"content_simi_bol_show": 0
}
}]
}
},
"_source": ["id"]
}
answer_query = {
"query": {
"bool": {
"must": [{
"term": {
"is_online": True
}
}, {
}, "size": 0, "aggs": {
"content_level": {
"terms": {
"content_level": [6, 5, 4, 3.5, 3]
}
}, {
"range": {
"content_length": {
"gte": 30
"field": "content_level",
"size": 10
},
"aggs": {
"is_online": {
"terms": {
"field": "is_online",
"size": 10
}
}
}
}]
}
}
}
tractate_query = {
"query": {
"bool": {
"must": [{
"term": {
"is_online": True
}
}, {
"terms": {
"content_level": [6, 5, 4, 3.5, 3]
}
}]
}
}
}
}
}
return answer_content_num, tractate_content_num, diary_content_num
)
craw_tractate_num = results["hits"]["total"]
craw_level_count = results["aggregations"]["content_level"]["buckets"]
res_list = []
    # Visible 3/4/5/6-star posts (aggregation buckets carry their value under "key")
    for content_level in (3, 4, 5, 6):
        doc_count = 0
        for content_level_dic in content_level_count:
            if float(content_level_dic["key"]) == content_level:
                doc_count = content_level_dic["doc_count"]
                break
        res_list.append(doc_count)
    res_list.append(craw_tractate_num)
    # Crawled posts that passed review (is_online == 1), again bucketed by content level
    for content_level in (3, 4, 5, 6):
        doc_count = 0
        for content_level_dic in craw_level_count:
            if float(content_level_dic["key"]) == content_level:
                for buck in content_level_dic["is_online"]["buckets"]:
                    if buck["key"] == 1:
                        doc_count = buck["doc_count"]
                        break
        res_list.append(doc_count)
    # Return the assembled per-level counts so the caller can build a full report row
    return res_list
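# get_es_word() therefore yields a 9-element list per keyword: visible tractate counts for
# content levels 3/4/5/6, the number of tractates crawled since start_ts, and the crawled
# tractates already online (is_online == 1) per level 3/4/5/6. craw_query_one_week() prepends
# the keyword, ctr placeholder, pv and uv columns to form one row of the weekly report.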
class WritrExcel():
@@ -274,66 +201,153 @@ class WritrExcel():
print("写入文件成功,共" + str(k) + "行数据")
if __name__ == "__main__":
tag_names_list = []
def get_search_keywrod_dict():
search_body = {
"query": {
"bool": {
"filter": [
]
}
}
}
    return_dict = {}  # keyed by search-word title
scan_res = scan(es_framework,query=search_body,index="search_word")
for res in scan_res:
return_dict[res["_source"]["title"]] = {}
return return_dict
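# get_search_keywrod_dict() scans the search_word index and returns a dict keyed by the stored
# titles; it is only used later for membership tests against the keywords from api_search_words.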
def get_hot_crawler_data_count(index,start_ts):
search_body = {
"query": {
"bool": {
"filter": [
{"range":{"fetch_time":{"gte":start_ts}}}
],"must_not": [
{"exists": {"field": "search_word"}}
]
}
},"size": 0,"aggs": {
"NAME": {
"terms": {
"field": "platform",
"size": 10
}
}
}
}
search_res = es_framework.search(body=search_body,index=index)
platform_count = search_res["aggregations"]["NAME"]["buckets"]
return platform_count
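# Returns the per-platform buckets of documents fetched since start_ts that carry no
# search_word field, e.g. (illustrative shape only):
#   [{"key": "weibo", "doc_count": 120}, {"key": "douban", "doc_count": 45}]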
def get_count_hot_crawler_data_from_mysql(start_ts:int) ->Dict:
data_platform_id_list = [11,12]
res_dict = {}
for platform_id in data_platform_id_list:
start_date = datetime.datetime.fromtimestamp(start_ts/1e3)
        match_all_sql = """select count(id) as nums from api_tractate where create_time >= '{time}' and platform = {platform}""".format(time=str(start_date), platform=platform_id)
cur_mimas.execute(match_all_sql)
data = cur_mimas.fetchall()
match_all_num = data[0][0]
# print(match_all_num)
        oneline_sql = """select count(id) as nums from api_tractate where create_time >= '{time}' and is_online = 1 and platform = {platform}""".format(time=str(start_date), platform=platform_id)
cur_mimas.execute(oneline_sql)
data = cur_mimas.fetchall()
match_oneline_num = data[0][0]
# print(match_oneline_num)
if platform_id == 11:
platform = 'weibo'
elif platform_id == 12:
platform = "douban"
res_dict[platform] = []
res_dict[platform].append(match_all_num)
res_dict[platform].append(match_oneline_num)
return res_dict
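# Result shape: {"weibo": [rows_created, rows_online], "douban": [rows_created, rows_online]},
# where platform ids 11 and 12 in api_tractate map to weibo and douban respectively.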
def week_num(year=None, cycle=None, cycle_num=None, compare_type=None):
now = datetime.datetime.now()
now_canlendar = now.isocalendar()
if not cycle_num:
week_canlendar = now_canlendar
else:
week_canlendar = (now.year, cycle_num + 1, 0)
year = week_canlendar[0]
this_week = week_canlendar[1]
if this_week == 0:
last_year = year - 1
this_week = 1
else:
last_year = year
if this_week == 1:
        last_week = 52
else:
last_week = this_week - 1
today = datetime.datetime(datetime.datetime.now().year, datetime.datetime.now().month, datetime.datetime.now().day)
# today = datetime.datetime(year=2018, month=12, day=25)
first_day_in_week = today - datetime.timedelta(
days=now_canlendar[2] + 7 * (now_canlendar[1] - week_canlendar[1]) + 2)
fisrt_day_ts = int(first_day_in_week.timestamp() * 1e3)
last_day_in_week = first_day_in_week + datetime.timedelta(days=7)
last_day_ts = int(last_day_in_week.timestamp() * 1e3)
this_week_index = 'crawler-data-raw'
return this_week_index, fisrt_day_ts, last_day_ts, this_week, last_week, last_year
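# week_num() returns the crawler ES index name, the start/end of the reporting week as
# millisecond timestamps (hence the /1e3 conversions before datetime.fromtimestamp), the ISO
# week number, the previous week number and the year. Illustrative call (values hypothetical):
#   data_index, start_ts, end_ts, this_week, last_week, year = week_num()
#   # -> ("crawler-data-raw", 1618185600000, 1618790400000, 15, 14, 2021)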
def get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year):
res_data = [("周","抓取平台","入库内容数","推送候选内容入库数","上线内容数","推送内容数")]
craw_data_count = get_hot_crawler_data_count(data_index, start_ts)
res_dict = get_count_hot_crawler_data_from_mysql(start_ts)
for platform in ("douban","weibo"):
craw_one_week_count = ""
for buck in craw_data_count:
if buck["key"] == platform:
craw_one_week_count = buck["doc_count"]
push_num = res_dict[platform][0]
oneline_num = res_dict[platform][1]
new_line = (week_num,platform,craw_one_week_count,push_num,oneline_num)
res_data.append(new_line)
return res_data
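# Builds the per-platform summary sheet: one header row followed by, for douban and weibo, a
# tuple of (week number, platform, crawled count, rows written to api_tractate, rows online).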
def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year):
tag_names_list_week = []
all_data_day = []
all_data_week = []
db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
password="o5gbA27hXHHm",
db="doris_prod",
charset='utf8',
cursorclass=pymysql.cursors.DictCursor)
zhengxing_cursor = db_zhengxing_eagle.cursor()
# date = datetime.datetime.now().date() - datetime.timedelta(days=30)
# sql = 'select keywords,sum(sorted) as nums,uv from api_search_words where is_delete = 0 and create_time = "' + str(
# date) + '" group by keywords order by nums desc'
# print(sql)
# zhengxing_cursor.execute("set names 'UTF8'")
# zhengxing_cursor.execute(sql)
# data = zhengxing_cursor.fetchall()
#
# tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
# for name in list(data):
# word = name.get("keywords", None)
# num = name.get("nums", 0)
# uv = name.get("uv", 0)
#
# answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
# tag_names_list.append([word, num, uv, diary_content_num, answer_content_num, tractate_content_num])
#
# all_data_day.append(tup_title)
# for item in tag_names_list:
# all_data_day.append(tuple(item))
#
# path = str(date) + ".xls"
# WritrExcel().write_excel(path, tuple(all_data_day))
# print(u'创建demo.xls文件成功')
date = datetime.datetime.now().date() - datetime.timedelta(days=15)
    date = datetime.datetime.fromtimestamp(start_ts / 1e3)  # start_ts is in milliseconds
one_month_ago = datetime.datetime.now().date() - datetime.timedelta(days=30)
search_keyword_dict = get_search_keywrod_dict()
sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words where is_delete = 0 and create_time >= "' + str(
date) + '" group by keywords order by nums desc'
one_month_ago) + '" group by keywords order by nums desc'
print(sql)
zhengxing_cursor.execute("set names 'UTF8'")
zhengxing_cursor.execute(sql)
data = zhengxing_cursor.fetchall()
tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
tup_title = ("关键词", "近一周搜索内容ctr", "过去30天搜索pv", "过去30天搜索uv", "帖子-3星内容数量", "帖子-4星内容数量",
"帖子-5星内容数量","帖子-6星内容数量","近一周抓取数量","近一周审核上线3星数量",
"近一周审核上线4星数量","近一周审核上线5星数量","近一周审核上线6星数量")
tag_names_list_week.append(tup_title)
for name in list(data):
word = name.get("keywords", None)
sorteds = name.get("nums", 0)
uv = name.get("uvs", 0)
if word in search_keyword_dict:
sorteds = name.get("nums", 0)
uv = name.get("uvs", 0)
            tractate_content_num = get_es_word(word, start_ts)
            # Build one 13-column row: keyword, ctr placeholder, pv, uv, then the counts from get_es_word()
            tag_names_list_week.append(tuple([word, "", sorteds, uv] + tractate_content_num))
answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
tag_names_list_week.append([word, sorteds, uv, diary_content_num, answer_content_num, tractate_content_num])
return tag_names_list_week
all_data_week.append(tup_title)
for item in tag_names_list_week:
all_data_week.append(tuple(item))
if __name__ == "__main__":
data_index, start_ts, end_ts, week_num, last_week_num, year = week_num()
    # Number of hot items crawled in the past week
craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
    # Per-keyword crawl details for the past week
all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year )
craw_one_week.append(())
craw_one_week.append(all_data_week)
path = "近1周数据统计结果.xls"
WritrExcel().write_excel(path, tuple(all_data_week))
print(u'创建demo.xls文件成功')
send_email_tome()