Commit ec314915 authored by litaolemo

update

parent 71d3f7a7
# -*- coding:UTF-8 -*-
# @Time : 2020/8/11 14:33
# @File : crawler_week_report.py
# @email : litao@igengmei.com
# @author : litao
import datetime

import pymysql
import redis
import xlwt
import zipfile
from typing import Dict, List

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
from maintenance.func_send_email_with_file import send_file_email

es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
es = Elasticsearch([
    {
        'host': '172.16.31.17',
        'port': 9200,
    }, {
        'host': '172.16.31.11',
        'port': 9200,
    }])

# Single shared output path, so the report writer in __main__ and the email
# sender always refer to the same file.
REPORT_PATH = "/srv/apps/crawler/crawler_sys/utils/近1周数据统计结果.xls"
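
# Optional connectivity sanity check while debugging (not part of the report
# flow); es.info() raises a ConnectionError if the cluster is unreachable:
#   print(es.info())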

# def zipDir(dirpath, outFullName):
#     """
#     Compress the given directory.
#     :param dirpath: path of the directory to compress
#     :param outFullName: output path of the archive, i.e. .../xxxx.zip
#     :return: None
#     """
#     import os
#     import zipfile
#     zip_file = zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED)
#     for path, dirnames, filenames in os.walk(dirpath):
#         # Strip the root prefix so only the files and folders below
#         # dirpath end up in the archive
#         fpath = path.replace(dirpath, '')
#         for filename in filenames:
#             zip_file.write(os.path.join(path, filename), os.path.join(fpath, filename))
#     zip_file.close()

def send_email_tome():
    try:
        date = datetime.datetime.now().date() - datetime.timedelta(days=1)
        fromaddr = 'litao@igengmei.com'
        content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
        # Attach the file that __main__ just wrote (see REPORT_PATH above)
        zipFile = REPORT_PATH
        send_file_email("", "", email_group=["<duanyingrong@igengmei.com>", "<litao@igengmei.com>"],
                        title_str=content, email_msg_body_str=content, file=zipFile)
    except Exception as e:
        print(e)

def get_es_word(word):
    # answer: count online answers whose title/desc/answer matches the word
    results = es.search(
        index='gm-dbmw-answer-read',
        doc_type='answer',
        timeout='10s',
        size=0,
        body={
            "query": {
                "bool": {
                    "minimum_should_match": 1,
                    "should": [
                        {"match_phrase": {"title": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"desc": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}}],
                    "must": [
                        {"term": {"is_online": True}},
                        {"terms": {"content_level": [6, 5, 4, 3.5, 3]}},
                        {"range": {"content_length": {"gte": 30}}}],
                }
            },
        }
    )
    answer_content_num = results["hits"]["total"]

    # tractate: count online posts whose content or tags match the word
    results = es.search(
        index='gm-dbmw-tractate-read',
        doc_type='tractate',
        timeout='10s',
        size=0,
        body={
            "query": {
                "bool": {
                    "minimum_should_match": 1,
                    "should": [
                        {"match_phrase": {"content": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"tractate_tag_name_content": {"query": word, "analyzer": "gm_default_index"}}}],
                    "must": [
                        {"term": {"is_online": True}},
                        {"terms": {"content_level": [6, 5, 4, 3.5, 3]}}]
                }
            }
        }
    )
    tractate_content_num = results["hits"]["total"]

    # diary: count online diaries whose tags/answer/service name match the word
    results = es.search(
        index='gm-dbmw-diary-read',
        doc_type='diary',
        timeout='10s',
        size=0,
        body={
            "query": {
                "bool": {
                    "minimum_should_match": 1,
                    "should": [
                        {"match_phrase": {"tags": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}},
                        {"match_phrase": {"service.name": {"query": word, "analyzer": "gm_default_index"}}}],
                    "must": [
                        {"term": {"is_online": True}},
                        {"term": {"has_cover": True}},
                        {"term": {"is_sink": False}},
                        {"term": {"has_after_cover": True}},
                        {"term": {"has_before_cover": True}},
                        {"range": {"content_level": {"gte": 3}}},
                        {"term": {"content_simi_bol_show": 0}}]
                }
            },
        }
    )
    diary_content_num = results["hits"]["total"]

    # Standalone query bodies kept for reference; they are not used by the
    # counts above.
    diary_query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"is_online": True}},
                    {"term": {"has_cover": True}},
                    {"term": {"is_sink": False}},
                    {"term": {"has_after_cover": True}},
                    {"term": {"has_before_cover": True}},
                    {"terms": {"content_level": [6, 5, 4, 3.5, 3]}},
                    {"term": {"content_simi_bol_show": 0}}]
            }
        },
        "_source": ["id"]
    }
    answer_query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"is_online": True}},
                    {"terms": {"content_level": [6, 5, 4, 3.5, 3]}},
                    {"range": {"content_length": {"gte": 30}}}]
            }
        }
    }
    tractate_query = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"is_online": True}},
                    {"terms": {"content_level": [6, 5, 4, 3.5, 3]}}]
            }
        }
    }
    return answer_content_num, tractate_content_num, diary_content_num
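
# A minimal usage sketch (hypothetical keyword, for illustration only): each
# call issues three count-style searches (size=0) and returns the counts of
# matching online content as (answer, tractate, diary):
#   answer_num, tractate_num, diary_num = get_es_word("双眼皮")
#   print("answer:", answer_num, "tractate:", tractate_num, "diary:", diary_num)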

class WritrExcel():
    def set_style(self, name, height, bold=False):
        style = xlwt.XFStyle()  # initialize the style
        font = xlwt.Font()  # create a font for the style
        font.name = name
        font.bold = bold
        font.color_index = 4
        font.height = height
        style.font = font
        return style

    # Write rows into an Excel file
    def write_excel(self, path, rows):
        # create the workbook
        workbook = xlwt.Workbook(encoding='utf-8')
        # create the sheet
        data_sheet = workbook.add_sheet('Sheet1')
        # define the style once, outside the loop
        default = self.set_style('Times New Roman', 220, True)
        j = k = 0
        # write each row; .xls sheets are capped at 65536 rows, so truncate
        for row in rows[:65530]:
            for i in range(len(row)):
                try:
                    data_sheet.write((j + k), i, row[i], default)
                except:
                    print(i)
                    raise
            # data_sheet.write(1, i, row1[i], self.set_style('Times New Roman', 220, True))
            k = k + 1
        workbook.save(path)
        print("写入文件成功,共" + str(k) + "行数据")

if __name__ == "__main__":
    tag_names_list = []
    tag_names_list_week = []
    all_data_day = []
    all_data_week = []
    db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
                                         password="o5gbA27hXHHm",
                                         db="doris_prod",
                                         charset='utf8',
                                         cursorclass=pymysql.cursors.DictCursor)
    zhengxing_cursor = db_zhengxing_eagle.cursor()

    # Previous daily report, kept for reference:
    # date = datetime.datetime.now().date() - datetime.timedelta(days=30)
    # sql = 'select keywords,sum(sorted) as nums,uv from api_search_words where is_delete = 0 and create_time = "' + str(
    #     date) + '" group by keywords order by nums desc'
    # print(sql)
    # zhengxing_cursor.execute("set names 'UTF8'")
    # zhengxing_cursor.execute(sql)
    # data = zhengxing_cursor.fetchall()
    #
    # tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
    # for name in list(data):
    #     word = name.get("keywords", None)
    #     num = name.get("nums", 0)
    #     uv = name.get("uv", 0)
    #
    #     answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
    #     tag_names_list.append([word, num, uv, diary_content_num, answer_content_num, tractate_content_num])
    #
    # all_data_day.append(tup_title)
    # for item in tag_names_list:
    #     all_data_day.append(tuple(item))
    #
    # path = str(date) + ".xls"
    # WritrExcel().write_excel(path, tuple(all_data_day))
    # print(u'创建demo.xls文件成功')

    # Aggregate search terms over the last 15 days (the "weekly" window)
    date = datetime.datetime.now().date() - datetime.timedelta(days=15)
    sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words where is_delete = 0 and create_time >= "' + str(
        date) + '" group by keywords order by nums desc'
    print(sql)
    zhengxing_cursor.execute("set names 'UTF8'")
    zhengxing_cursor.execute(sql)
    data = zhengxing_cursor.fetchall()

    tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
    for name in list(data):
        word = name.get("keywords", None)
        sorteds = name.get("nums", 0)
        uv = name.get("uvs", 0)
        answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
        tag_names_list_week.append([word, sorteds, uv, diary_content_num, answer_content_num, tractate_content_num])

    all_data_week.append(tup_title)
    for item in tag_names_list_week:
        all_data_week.append(tuple(item))

    # Write to the shared path so send_email_tome() attaches this exact file
    WritrExcel().write_excel(REPORT_PATH, tuple(all_data_week))
    print(u'创建' + REPORT_PATH + u'文件成功')
    send_email_tome()