update

27c7d4a2 · litaolemo · f64c7300 · 27c7d4a2
Commit 27c7d4a2 authored Aug 04, 2020 by litaolemo
Hide whitespace changes
Inline Side-by-side

Showing with 133 additions and 20 deletions

get_query_result.py crawler_sys/utils/get_query_result.py +133 -20

No files found.
--- a/crawler_sys/utils/get_query_result.py
+++ b/crawler_sys/utils/get_query_result.py
@@ -10,6 +10,7 @@ from email.mime.application import MIMEApplication
 from email.utils import formataddr
 from maintenance.func_send_email_with_file import send_file_email
 import zipfile
 es = Elasticsearch([
    {
        'host': '172.16.31.17',
@@ -19,6 +20,7 @@ es = Elasticsearch([
        'port': 9200,
    }])
 # def zipDir(dirpath,outFullName):
 #     """
 #     压缩指定文件夹
@@ -49,16 +51,15 @@ def send_email_tome():
        # toaddrs5 = "malinxi@igengmei.com"
        toaddrs6 = "litao@igengmei.com"
        content = 'hi  all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果，请查收！'
        zipFile = "/srv/apps/crawler/近一周数据统计结果.xls"
        # out_path = "/srv/apps/crawler/近一周数据统计结果.zip"
        # f = zipfile.ZipFile(zipFile, 'w', zipfile.ZIP_DEFLATED)
        # f.write(out_path)
        # f.close()
-        #zipFile = '昨日数据统计结果.xls'
+        # zipFile = '昨日数据统计结果.xls'
-        send_file_email("","",email_group=["litao@igengmei.com"],title_str=content
+        send_file_email("", "", email_group=["litao@igengmei.com"], title_str=content
-                        ,email_msg_body_str=content,file=zipFile)
+                        , email_msg_body_str=content, file=zipFile)
    except Exception as e:
        print(e)
@@ -77,7 +78,23 @@ def get_es_word(word):
                    "should": [{"match_phrase": {"title": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"desc": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}}],
-                    "must": [{"term": {"is_online": True}}]
+                    "must": [
+                        {
+                            "term": {
+                                "is_online": True
+                            }
+                        }, {
+                            "terms": {
+                                "content_level": [6, 5, 4, 3.5, 3]
+                            }
+                        }, {
+                            "range": {
+                                "content_length": {
+                                    "gte": 30
+                                }
+                            }
+                        }],
                }
            },
        }
@@ -98,14 +115,17 @@ def get_es_word(word):
                        "match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}}, {
                                   "match_phrase": {"tractate_tag_name_content": {"query": word,
                                                                                  "analyzer": "gm_default_index"}}}],
-                    "must": [{"term": {"is_online": True}}]
+                    "must": [{"term": {"is_online": True}}, {
+                        "terms":
+                            {"content_level": [6, 5, 4, 3.5, 3]}
+                        }]
                }
-            },
+            }
        }
    )
    tractate_content_num = results["hits"]["total"]
-    ###diary
+    ###diary 日记
    results = es.search(
        index='gm-dbmw-diary-read',
        doc_type='diary',
@@ -118,13 +138,107 @@ def get_es_word(word):
                    "should": [{"match_phrase": {"tags": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"service.name": {"query": word, "analyzer": "gm_default_index"}}}],
-                    "must": [{"term": {"is_online": True}}, {"range": {"content_level": {"gte": "3"}}}]
+                    "must": [{"term": {"is_online": True}}, {
+                        "term": {
+                            "has_cover": True
+                        }
+                    }, {"term": {
+                        "is_sink": False
+                    }
+                             }, {
+                                 "term": {
+                                     "has_after_cover": True
+                                 }
+                             }, {
+                                 "term": {
+                                     "has_before_cover": True
+                                 }
+                             }, {"range": {"content_level": {"gte": "3"}}},
+                             {
+                                 "term": {
+                                     "content_simi_bol_show": 0
+                                 }
+                             }
+                             ]
                }
            },
        }
    )
    diary_content_num = results["hits"]["total"]
+    diary_query = {
+        "query": {
+            "bool": {
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "term": {
+                        "has_cover": True
+                    }
+                }, {
+                    "term": {
+                        "is_sink": False
+                    }
+                }, {
+                    "term": {
+                        "has_after_cover": True
+                    }
+                }, {
+                    "term": {
+                        "has_before_cover": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }, {
+                    "term": {
+                        "content_simi_bol_show": 0
+                    }
+                }]
+            }
+        },
+        "_source": ["id"]
+    }
+    answer_query = {
+        "query": {
+            "bool": {
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }, {
+                    "range": {
+                        "content_length": {
+                            "gte": 30
+                        }
+                    }
+                }]
+            }
+        }
+    }
+    tractate_query = {
+        "query": {
+            "bool": {
+                "must": [{
+                    "term": {
+                        "is_online": True
+                    }
+                }, {
+                    "terms": {
+                        "content_level": [6, 5, 4, 3.5, 3]
+                    }
+                }]
+            }
+        }
+    }
    return answer_content_num, tractate_content_num, diary_content_num
@@ -175,7 +289,7 @@ if __name__ == "__main__":
                                         charset='utf8',
                                         cursorclass=pymysql.cursors.DictCursor)
    zhengxing_cursor = db_zhengxing_eagle.cursor()
-    date = datetime.datetime.now().date() - datetime.timedelta(days=1)
+    date = datetime.datetime.now().date() - datetime.timedelta(days=30)
    sql = 'select keywords,sum(sorted) as nums,uv  from api_search_words where is_delete = 0 and create_time = "' + str(
        date) + '" group by keywords  order by  nums  desc'
    print(sql)
@@ -183,24 +297,24 @@ if __name__ == "__main__":
    zhengxing_cursor.execute(sql)
    data = zhengxing_cursor.fetchall()
-    tup_title = ("关键词", "搜索次数","uv", "日记数量", "回答数量", "帖子数量")
+    tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
    for name in list(data):
        word = name.get("keywords", None)
        num = name.get("nums", 0)
-        uv = name.get("uv",0)
+        uv = name.get("uv", 0)
        answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
-        tag_names_list.append([word, num,uv, diary_content_num, answer_content_num, tractate_content_num])
+        tag_names_list.append([word, num, uv, diary_content_num, answer_content_num, tractate_content_num])
    all_data_day.append(tup_title)
    for item in tag_names_list:
        all_data_day.append(tuple(item))
-    path = str(date)+".xls"
+    path = str(date) + ".xls"
    WritrExcel().write_excel(path, tuple(all_data_day))
    print(u'创建demo.xls文件成功')
-    date = datetime.datetime.now().date() - datetime.timedelta(days=7)
+    date = datetime.datetime.now().date() - datetime.timedelta(days=30)
    sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs  from api_search_words where is_delete = 0 and create_time >= "' + str(
        date) + '" group by keywords  order by  nums  desc'
@@ -209,22 +323,21 @@ if __name__ == "__main__":
    zhengxing_cursor.execute(sql)
    data = zhengxing_cursor.fetchall()
-    tup_title = ("关键词", "搜索次数", "uv","日记数量", "回答数量", "帖子数量")
+    tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
    for name in list(data):
        word = name.get("keywords", None)
        sorteds = name.get("nums", 0)
-        uv  = name.get("uvs",0)
+        uv = name.get("uvs", 0)
        answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
-        tag_names_list_week.append([word, sorteds,uv, diary_content_num, answer_content_num, tractate_content_num])
+        tag_names_list_week.append([word, sorteds, uv, diary_content_num, answer_content_num, tractate_content_num])
    all_data_week.append(tup_title)
    for item in tag_names_list_week:
        all_data_week.append(tuple(item))
-    path = "近一周数据统计结果.xls"
+    path = "近一月数据统计结果.xls"
    WritrExcel().write_excel(path, tuple(all_data_week))
    print(u'创建demo.xls文件成功')
    send_email_tome()