Commit 730bfca3 authored by litaolemo's avatar litaolemo

update

parent ba1348d0
......@@ -9,22 +9,18 @@ import json
import pymysql
import xlwt, datetime
# from maintenance.func_send_email_with_file import send_file_email
# import zipfile
import redis
# from pyhive import hive
from maintenance.func_send_email_with_file import send_file_email
from typing import Dict, List
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
import sys
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lit
import pytispark.pytispark as pti
import pandas as pd
startTime = time.time()
sparkConf = SparkConf()
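
Reviewer note: the hunk cuts off right where the Spark session is assembled from `sparkConf`. For orientation, a minimal sketch of how such a session is typically built; the app name, memory setting, and Hive support below are assumptions, since the file's real configuration continues past this hunk:

```python
# sketch only: the real conf keys and the pytispark wiring are outside this hunk
from pyspark import SparkConf
from pyspark.sql import SparkSession

sparkConf = SparkConf()
sparkConf.set("spark.executor.memory", "2g")  # illustrative value

spark = (SparkSession.builder
         .config(conf=sparkConf)
         .appName("search_keyword_weekly_report")  # hypothetical name
         .enableHiveSupport()                      # needed for spark.sql over Hive tables
         .getOrCreate())
```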
......@@ -121,7 +117,7 @@ def send_email_tome():
        fromaddr = 'litao@igengmei.com'
        content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
        zipFile = "/srv/apps/crawler/近1周数据统计结果.xls"
        send_file_email("", "", email_group=["<litao@igengmei.com>"], title_str=content
        send_file_email("", "", email_group=["<litao@igengmei.com>", "<duanyingrong@igengmei.com>"], title_str=content
                        , email_msg_body_str=content, file=zipFile)
    except Exception as e:
        print(e)
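
Reviewer note: the recipient string added on the second `send_file_email` line originally carried an invisible U+200E (left-to-right mark) in front of duanyingrong's address, stripped above. A hedged sketch of one way to keep such characters out of the list; `EMAIL_GROUP` and the cleanup step are illustrative, not part of this file:

```python
# scrub zero-width / bidi control characters from recipient addresses
# before they reach send_file_email; EMAIL_GROUP is a hypothetical constant
_INVISIBLES = dict.fromkeys(map(ord, "\u200b\u200c\u200d\u200e\u200f"))

EMAIL_GROUP = [addr.translate(_INVISIBLES)
               for addr in ("<litao@igengmei.com>", "<duanyingrong@igengmei.com>")]
```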
......@@ -385,16 +381,16 @@ def get_keyword_ctr(start_ts,end_ts):
    end_date_str = end_date.strftime("%Y%m%d")
    data_dic = {}
    # -- query term exposure count
# baoguang_sql = """
# SELECT card_id as query,count(*) as query_count FROM online.ml_community_precise_exposure_detail
# WHERE partition_date>='{start_date}' AND partition_date<'{end_date}' and page_name in ('search_home','search_home_more','search_home_welfare','search_home_diary','search_home_wiki','search_home_post','search_home_hospital','search_home_doctor') group by query
# """.format(start_date=str(start_date_str),end_date=str(end_date_str))
# device_df = spark.sql(baoguang_sql)
# device_df.show(1, False)
# sql_res = device_df.collect()
# print("-----------------------------------------------------------------------------")
# for res in sql_res:
# data_dic[res.query] = res.query_count
baoguang_sql = """
SELECT card_id as query,count(*) as query_count FROM online.ml_community_precise_exposure_detail
WHERE partition_date>='{start_date}' AND partition_date<'{end_date}' and page_name in ('search_home','search_home_more','search_home_welfare','search_home_diary','search_home_wiki','search_home_post','search_home_hospital','search_home_doctor') group by query
""".format(start_date=str(start_date_str),end_date=str(end_date_str))
device_df = spark.sql(baoguang_sql)
device_df.show(1, False)
sql_res = device_df.collect()
print("-----------------------------------------------------------------------------")
for res in sql_res:
data_dic[res.query] = res.query_count
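
Reviewer note: since the grouped result has exactly two columns, the collect-and-loop above can be a single dict comprehension. A sketch, assuming the SQL keeps the aliases `query` and `query_count`:

```python
# same result as the loop above, built in one step from the collected rows
data_dic = {row.query: row.query_count for row in device_df.collect()}
```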
    # -- query words logged on search pages
    query_sql = """
SELECT params['query_words'] as query_words
......@@ -410,7 +406,7 @@ AND page_name in ('search_home','search_home_more','search_home_welfare','search
    for res in sql_res:
        try:
            res_json = json.loads(res.query_words)
            print(res_json, type(res_json))
            # print(res_json, type(res_json))
        except (json.JSONDecodeError, TypeError):
            # a bare except here would also swallow KeyboardInterrupt;
            # only missing or malformed query_words should skip the row
            continue
        for single_keyword in res_json:
......@@ -418,12 +414,12 @@ AND page_name in ('search_home','search_home_more','search_home_welfare','search
            if data_count:
                data_dic[single_keyword] = data_dic[single_keyword] + 1
            else:
                data_dic[single_keyword] = 0
                data_dic[single_keyword] = 1
    print(data_dic)
    return data_dic
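
Reviewer note: the seed-with-1-then-increment pattern fixed in this hunk is what `collections.Counter` provides directly. A hedged sketch of the same tally kept separate from the exposure counts, as a design alternative; the current code adds into `data_dic` in place:

```python
import json
from collections import Counter

# hypothetical variant: sql_res rows are assumed to carry a JSON-encoded
# list of keywords in query_words, as in the loop above
search_counts = Counter()
for res in sql_res:
    try:
        keywords = json.loads(res.query_words)
    except (json.JSONDecodeError, TypeError):
        continue
    search_counts.update(keywords)  # +1 for every occurrence of each keyword
```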
def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year):
def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year, word_count):
    tag_names_list_week = []
    one_month_ago = datetime.datetime.now().date() - datetime.timedelta(days=30)
......@@ -446,8 +442,12 @@ def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, y
        sorteds = name.get("nums", 0)
        uv = name.get("uvs", 0)
        tractate_content_num = get_es_word(word, start_ts)
        new_line = [word, "", sorteds, uv]
        exposure_count = word_count.get(word)
        if not exposure_count:
            ctr = ""
        else:
            # int() would floor any click-through ratio below 1 to 0,
            # so keep the ratio as a rounded float instead
            ctr = round(sorteds / exposure_count, 4)
        new_line = [word, ctr, sorteds, uv]
        tag_names_list_week.append(tuple(new_line + tractate_content_num))
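
Reviewer note: the guard-plus-divide for the CTR column could live in a tiny helper so the empty-string sentinel and the rounding stay in one place. A sketch; `safe_ctr` is a hypothetical name:

```python
def safe_ctr(clicks, exposures, ndigits=4):
    """Ratio of clicks to exposures, or "" when exposure data is missing/zero."""
    if not exposures:
        return ""  # mirrors the sentinel used in the added lines above
    return round(clicks / exposures, ndigits)

# hypothetical rewrite of the call site:
# new_line = [word, safe_ctr(sorteds, exposure_count), sorteds, uv]
```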
    for word in search_keyword_dict:
        tractate_content_num = get_es_word(word, start_ts)
......@@ -459,16 +459,16 @@ def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, y
if __name__ == "__main__":
    # note: this unpacking rebinds the name week_num from the helper function to an int
    data_index, start_ts, end_ts, week_num, last_week_num, year = week_num()
    # weekly count of hot-topic items fetched by the crawler
    # craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
    # print(craw_one_week)
    # # per-query crawl details for the week
    # all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year)
    # path = "近1周数据统计结果.xls"
    # exl = WritrExcel()
    # exl.write_excel("热点内容抓取周报", tuple(craw_one_week))
    # exl.write_excel("query抓取周报", tuple(all_data_week))
    # exl.save_excel(path)
    # print(u'创建demo.xls文件成功')
    # send_email_tome()
    word_count = get_keyword_ctr(start_ts, end_ts)
    # weekly count of hot-topic items fetched by the crawler
    craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
    print(craw_one_week)
    # per-query crawl details for the week
    all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year, word_count)
    path = "近1周数据统计结果.xls"
    exl = WritrExcel()
    exl.write_excel("热点内容抓取周报", tuple(craw_one_week))
    exl.write_excel("query抓取周报", tuple(all_data_week))
    exl.save_excel(path)
    print(u'创建%s文件成功' % path)  # was hard-coded to demo.xls
    send_email_tome()
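
Reviewer note: a sketch of the same `__main__` flow with the `week_num` shadowing avoided and the Spark session shut down explicitly; `this_week` is a hypothetical rename, and `spark.stop()` assumes the session created near the top of the file:

```python
if __name__ == "__main__":
    # rename the unpacked value so the week_num() helper stays callable
    data_index, start_ts, end_ts, this_week, last_week_num, year = week_num()
    try:
        word_count = get_keyword_ctr(start_ts, end_ts)
        craw_one_week = get_how_word_crawler_count(
            data_index, start_ts, end_ts, this_week, last_week_num, year)
        all_data_week = craw_query_one_week(
            data_index, start_ts, end_ts, this_week, last_week_num, year, word_count)
        # ... write the two report sheets and send the email, as above ...
    finally:
        spark.stop()  # release the session even if a step raises
```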