update

ec314915 · litaolemo · 71d3f7a7 · ec314915 · ec314915
Commit ec314915 authored Aug 11, 2020 by litaolemo
Hide whitespace changes
Inline Side-by-side

Showing with 339 additions and 0 deletions

crawler_week_report.py tasks/crawler_week_report.py +339 -0

test_read_config.py test/test_read_config.py +0 -0

No files found.
--- a/tasks/crawler_week_report.py
+++ b/tasks/crawler_week_report.py
+# -*- coding:UTF-8 -*-
+# @Time  : 2020/8/11 14:33
+# @File  : crawler_week_report.py
+# @email : litao@igengmei.com
+# @author : litao
+
+# coding=utf-8
+import pymysql
+import xlwt, datetime
+from maintenance.func_send_email_with_file import send_file_email
+import zipfile
+import redis
+from maintenance.func_send_email_with_file import send_file_email
+from typing import Dict, List
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import scan
+from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
+
+es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
+rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
+
+
+es = Elasticsearch([
+    {
+        'host': '172.16.31.17',
+        'port': 9200,
+    }, {
+        'host': '172.16.31.11',
+        'port': 9200,
+    }])
+
+
+# def zipDir(dirpath,outFullName):
+#     """
+#     压缩指定文件夹
+#     :param dirpath: 目标文件夹路径
+#     :param outFullName: 压缩文件保存路径+xxxx.zip
+#     :return: 无
+#     """
+#     import zipfile
+#     zip = zipfile.ZipFile(outFullName,"w",zipfile.ZIP_DEFLATED)
+#     for path,dirnames,filenames in os.walk(dirpath):
+#         # 去掉目标跟路径，只对目标文件夹下边的文件及文件夹进行压缩
+#         fpath = path.replace(dirpath,'')
+#
+#         for filename in filenames:
+#             zip.write(os.path.join(path,filename),os.path.join(fpath,filename))
+#     zip.close()
+
+def send_email_tome():
+    try:
+        date = datetime.datetime.now().date() - datetime.timedelta(days=1)
+        fromaddr = 'litao@igengmei.com'
+
+        content = 'hi  all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果，请查收！'
+        zipFile = "/srv/apps/crawler/crawler_sys/utils/近一月数据统计结果.xls"
+        send_file_email("", "", email_group=["‎<duanyingrong@igengmei.com>","<litao@igengmei.com>"], title_str=content
+                        , email_msg_body_str=content, file=zipFile)
+    except Exception as e:
+        print(e)
+
+
+def get_es_word(word):
+    ###answer
+    results = es.search(
+        index='gm-dbmw-answer-read',
+        doc_type='answer',
+        timeout='10s',
+        size=0,
+        body={
+            "query": {
+                "bool": {
+                    "minimum_should_match": 1,
+                    "should": [{"match_phrase": {"title": {"query": word, "analyzer": "gm_default_index"}}},
+                               {"match_phrase": {"desc": {"query": word, "analyzer": "gm_default_index"}}},
+                               {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}}],
+                    "must": [
+                        {
+                            "term": {
+                                "is_online": True
+                            }
+                        }, {
+                            "terms": {
+                                "content_level": [6, 5, 4, 3.5, 3]
+                            }
+                        }, {
+                            "range": {
+                                "content_length": {
+                                    "gte": 30
+                                }
+                            }
+                        }],
+
+                }
+            },
+        }
+    )
+    answer_content_num = results["hits"]["total"]
+
+    # tractate
+    results = es.search(
+        index='gm-dbmw-tractate-read',
+        doc_type='tractate',
+        timeout='10s',
+        size=0,
+        body={
+            "query": {
+                "bool": {
+                    "minimum_should_match": 1,
+                    "should": [{"match_phrase": {"content": {"query": word, "analyzer": "gm_default_index"}}}, {
+                        "match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}}, {
+                                   "match_phrase": {"tractate_tag_name_content": {"query": word,
+                                                                                  "analyzer": "gm_default_index"}}}],
+                    "must": [{"term": {"is_online": True}}, {
+                        "terms":
+                            {"content_level": [6, 5, 4, 3.5, 3]}
+                        }]
+                }
+            }
+        }
+    )
+    tractate_content_num = results["hits"]["total"]
+
+    ###diary 日记
+    results = es.search(
+        index='gm-dbmw-diary-read',
+        doc_type='diary',
+        timeout='10s',
+        size=0,
+        body={
+            "query": {
+                "bool": {
+                    "minimum_should_match": 1,
+                    "should": [{"match_phrase": {"tags": {"query": word, "analyzer": "gm_default_index"}}},
+                               {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}},
+                               {"match_phrase": {"service.name": {"query": word, "analyzer": "gm_default_index"}}}],
+                    "must": [{"term": {"is_online": True}}, {
+                        "term": {
+                            "has_cover": True
+                        }
+                    }, {"term": {
+                        "is_sink": False
+                    }
+                             }, {
+                                 "term": {
+                                     "has_after_cover": True
+                                 }
+                             }, {
+                                 "term": {
+                                     "has_before_cover": True
+                                 }
+                             }, {"range": {"content_level": {"gte": "3"}}},
+                             {
+                                 "term": {
+                                     "content_simi_bol_show": 0
+                                 }
+                             }
+                             ]
+                }
+            },
+        }
+    )
+    diary_content_num = results["hits"]["total"]
+
+    diary_query = {
+        "query": {
+            "bool": {
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "term": {
+                        "has_cover": True
+                    }
+                }, {
+                    "term": {
+                        "is_sink": False
+                    }
+                }, {
+                    "term": {
+                        "has_after_cover": True
+                    }
+                }, {
+                    "term": {
+                        "has_before_cover": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }, {
+                    "term": {
+                        "content_simi_bol_show": 0
+                    }
+                }]
+            }
+        },
+        "_source": ["id"]
+    }
+    answer_query = {
+        "query": {
+            "bool": {
+
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }, {
+                    "range": {
+                        "content_length": {
+                            "gte": 30
+                        }
+                    }
+                }]
+            }
+        }
+    }
+    tractate_query = {
+        "query": {
+            "bool": {
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }]
+            }
+        }
+    }
+    return answer_content_num, tractate_content_num, diary_content_num
+
+
+class WritrExcel():
+
+    def set_style(self, name, height, bold=False):
+        style = xlwt.XFStyle()  # 初始化样式
+        font = xlwt.Font()  # 为样式创建字体
+        font.name = name
+        font.bold = bold
+        font.color_index = 4
+        font.height = height
+        style.font = font
+        return style
+
+    # 写入Excel
+    def write_excel(self, path, rows):
+        # 创建工作簿
+        workbook = xlwt.Workbook(encoding='utf-8')
+        # 创建sheet
+        data_sheet = workbook.add_sheet('Sheet1')
+        # 将样式定义在循环之外
+        default = self.set_style('Times New Roman', 220, True)
+        j = k = 0
+        # 循环读取每一行数据并写入Excel
+        for row in rows[:65530]:
+            for i in range(len(row)):
+                try:
+                    # 写入
+                    data_sheet.write((j + k), i, row[i], default)
+                except:
+                    print(i)
+                    raise
+                # data_sheet.write(1, i, row1[i], self.set_style('Times New Roman', 220, True))
+            k = k + 1
+        workbook.save(path)
+        print("写入文件成功，共" + str(k) + "行数据")
+
+
+if __name__ == "__main__":
+    tag_names_list = []
+    tag_names_list_week = []
+    all_data_day = []
+    all_data_week = []
+    db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
+                                         password="o5gbA27hXHHm",
+                                         db="doris_prod",
+                                         charset='utf8',
+                                         cursorclass=pymysql.cursors.DictCursor)
+    zhengxing_cursor = db_zhengxing_eagle.cursor()
+    # date = datetime.datetime.now().date() - datetime.timedelta(days=30)
+    # sql = 'select keywords,sum(sorted) as nums,uv  from api_search_words where is_delete = 0 and create_time = "' + str(
+    #     date) + '" group by keywords  order by  nums  desc'
+    # print(sql)
+    # zhengxing_cursor.execute("set names 'UTF8'")
+    # zhengxing_cursor.execute(sql)
+    # data = zhengxing_cursor.fetchall()
+    #
+    # tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
+    # for name in list(data):
+    #     word = name.get("keywords", None)
+    #     num = name.get("nums", 0)
+    #     uv = name.get("uv", 0)
+    #
+    #     answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
+    #     tag_names_list.append([word, num, uv, diary_content_num, answer_content_num, tractate_content_num])
+    #
+    # all_data_day.append(tup_title)
+    # for item in tag_names_list:
+    #     all_data_day.append(tuple(item))
+    #
+    # path = str(date) + ".xls"
+    # WritrExcel().write_excel(path, tuple(all_data_day))
+    # print(u'创建demo.xls文件成功')
+
+    date = datetime.datetime.now().date() - datetime.timedelta(days=15)
+    sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs  from api_search_words where is_delete = 0 and create_time >= "' + str(
+        date) + '" group by keywords  order by  nums  desc'
+
+    print(sql)
+    zhengxing_cursor.execute("set names 'UTF8'")
+    zhengxing_cursor.execute(sql)
+    data = zhengxing_cursor.fetchall()
+
+    tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
+    for name in list(data):
+        word = name.get("keywords", None)
+        sorteds = name.get("nums", 0)
+        uv = name.get("uvs", 0)
+
+        answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
+        tag_names_list_week.append([word, sorteds, uv, diary_content_num, answer_content_num, tractate_content_num])
+
+    all_data_week.append(tup_title)
+    for item in tag_names_list_week:
+        all_data_week.append(tuple(item))
+
+    path = "近1周数据统计结果.xls"
+    WritrExcel().write_excel(path, tuple(all_data_week))
+    print(u'创建demo.xls文件成功')
+
+    send_email_tome()
--- a/test/test_read_config.py
+++ b/test/test_read_config.py