Commit 730bfca3 authored by litaolemo

update

parent ba1348d0
@@ -9,22 +9,18 @@ import json
 import pymysql
 import xlwt, datetime
-# from maintenance.func_send_email_with_file import send_file_email
-# import zipfile
 import redis
 # from pyhive import hive
 from maintenance.func_send_email_with_file import send_file_email
 from typing import Dict, List
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
-# from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
 import sys
 import time
 from pyspark import SparkConf
 from pyspark.sql import SparkSession, DataFrame
 from pyspark.sql.functions import lit
 import pytispark.pytispark as pti
-import pandas as pd
 
 
 startTime = time.time()
 sparkConf = SparkConf()
@@ -121,7 +117,7 @@ def send_email_tome():
         fromaddr = 'litao@igengmei.com'
         content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
         zipFile = "/srv/apps/crawler/近1周数据统计结果.xls"
-        send_file_email("", "", email_group=["<litao@igengmei.com>"], title_str=content
+        send_file_email("", "", email_group=["<litao@igengmei.com>","<duanyingrong@igengmei.com>"], title_str=content
                         , email_msg_body_str=content, file=zipFile)
     except Exception as e:
         print(e)
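A quick usage sketch of the call being changed above (the only edit is the second address in `email_group`). `send_file_email`'s signature is inferred purely from this call site, and the constant name is illustrative:

```python
from maintenance.func_send_email_with_file import send_file_email

# Recipients as a named list; the second address is the one added in this commit.
EMAIL_GROUP = ["<litao@igengmei.com>", "<duanyingrong@igengmei.com>"]

content = "weekly search-term statistics"            # placeholder subject/body text
zipFile = "/srv/apps/crawler/近1周数据统计结果.xls"    # attachment path used above

# The first two positional arguments are passed as empty strings at the real
# call site too; their meaning is not visible from this diff.
send_file_email("", "", email_group=EMAIL_GROUP, title_str=content,
                email_msg_body_str=content, file=zipFile)
```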
@@ -385,16 +381,16 @@ def get_keyword_ctr(start_ts,end_ts):
     end_date_str = end_date.strftime("%Y%m%d")
     data_dic = {}
     # --query词曝光
-    # baoguang_sql = """
-    # SELECT card_id as query,count(*) as query_count FROM online.ml_community_precise_exposure_detail
-    # WHERE partition_date>='{start_date}' AND partition_date<'{end_date}' and page_name in ('search_home','search_home_more','search_home_welfare','search_home_diary','search_home_wiki','search_home_post','search_home_hospital','search_home_doctor') group by query
-    # """.format(start_date=str(start_date_str),end_date=str(end_date_str))
-    # device_df = spark.sql(baoguang_sql)
-    # device_df.show(1, False)
-    # sql_res = device_df.collect()
-    # print("-----------------------------------------------------------------------------")
-    # for res in sql_res:
-    #     data_dic[res.query] = res.query_count
+    baoguang_sql = """
+    SELECT card_id as query,count(*) as query_count FROM online.ml_community_precise_exposure_detail
+    WHERE partition_date>='{start_date}' AND partition_date<'{end_date}' and page_name in ('search_home','search_home_more','search_home_welfare','search_home_diary','search_home_wiki','search_home_post','search_home_hospital','search_home_doctor') group by query
+    """.format(start_date=str(start_date_str),end_date=str(end_date_str))
+    device_df = spark.sql(baoguang_sql)
+    device_df.show(1, False)
+    sql_res = device_df.collect()
+    print("-----------------------------------------------------------------------------")
+    for res in sql_res:
+        data_dic[res.query] = res.query_count
     # --query词曝光
     query_sql = """
     SELECT params['query_words'] as query_words
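The exposure block above is un-commented rather than new: it counts exposures per query word in Spark SQL and folds the collected rows into `data_dic`. A self-contained sketch of that collect-into-dict pattern, using a toy in-memory view in place of `online.ml_community_precise_exposure_detail`:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("exposure-demo").getOrCreate()

# Toy stand-in for the exposure log; card_id carries the query word, as above.
rows = [("双眼皮",), ("双眼皮",), ("光子嫩肤",)]
spark.createDataFrame(rows, ["card_id"]).createOrReplaceTempView("exposure")

df = spark.sql("SELECT card_id AS query, count(*) AS query_count "
               "FROM exposure GROUP BY card_id")
df.show(1, False)

data_dic = {row.query: row.query_count for row in df.collect()}
print(data_dic)  # {'双眼皮': 2, '光子嫩肤': 1}
```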
@@ -410,7 +406,7 @@ AND page_name in ('search_home','search_home_more','search_home_welfare','search
     for res in sql_res:
         try:
             res_json = json.loads(res.query_words)
-            print(res_json, type(res_json))
+            # print(res_json, type(res_json))
         except:
             continue
         for single_keyword in res_json:
@@ -418,12 +414,12 @@ AND page_name in ('search_home','search_home_more','search_home_welfare','search
             data_count = data_dic.get(single_keyword)
             if data_count:
                 data_dic[single_keyword] = data_dic[single_keyword] + 1
             else:
-                data_dic[single_keyword] = 0
+                data_dic[single_keyword] = 1
     print(data_dic)
     return data_dic
 
 
-def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year):
+def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year,word_count):
     tag_names_list_week = []
     one_month_ago = datetime.datetime.now().date() - datetime.timedelta(days=30)
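The two edits in this hunk travel together: the counter now seeds first occurrences with 1 instead of 0 (with 0, every keyword's total came out one short and single-occurrence keywords looked unexposed), and `craw_query_one_week` grows a `word_count` parameter to receive these counts. The corrected loop is equivalent to the usual `dict.get` idiom:

```python
# Equivalent form of the fixed counter: dict.get supplies the 0 default,
# so the first occurrence contributes 1, matching the new else-branch.
data_dic = {}
for single_keyword in ["双眼皮", "光子嫩肤", "双眼皮"]:
    data_dic[single_keyword] = data_dic.get(single_keyword, 0) + 1
print(data_dic)  # {'双眼皮': 2, '光子嫩肤': 1}
```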
@@ -446,8 +442,12 @@ def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, y
         sorteds = name.get("nums", 0)
         uv = name.get("uvs", 0)
         tractate_content_num = get_es_word(word, start_ts)
-        new_line = [word, "", sorteds, uv]
+        exposure_count = word_count.get(word)
+        if not exposure_count:
+            ctr = ""
+        else:
+            ctr = int(sorteds/exposure_count)
+        new_line = [word,ctr, sorteds, uv]
         tag_names_list_week.append(tuple(new_line + tractate_content_num))
     for word in search_keyword_dict:
         tractate_content_num = get_es_word(word, start_ts)
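The new `ctr` column is searches over exposures, with `""` as a sentinel for words missing from `word_count`. One caveat worth flagging: `int()` truncates toward zero, so any ratio below 1 (the usual range for a click-through-style rate) collapses to 0; `round()`, a plain float, or a percentage may be what's intended. A runnable mirror of the branch, factored into a hypothetical helper:

```python
def compute_ctr(sorteds, exposure_count):
    """Same logic as the new branch above, pulled out for illustration."""
    if not exposure_count:           # None or 0 exposures -> blank cell
        return ""
    return int(sorteds / exposure_count)

print(compute_ctr(30, 100))         # 0 -- truncation swallows sub-1 ratios
print(compute_ctr(250, 100))        # 2
print(repr(compute_ctr(30, None)))  # '' -- word absent from word_count
```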
@@ -459,16 +459,16 @@ def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, y
 
 if __name__ == "__main__":
     data_index, start_ts, end_ts, week_num, last_week_num, year = week_num()
+    word_count = get_keyword_ctr(start_ts, end_ts)
     # 一周爬虫抓取热点数
-    # craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
-    # print(craw_one_week)
-    # # query 一周抓取详情
-    # all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year)
-    # path = "近1周数据统计结果.xls"
-    # exl = WritrExcel()
-    # exl.write_excel("热点内容抓取周报", tuple(craw_one_week))
-    # exl.write_excel("query抓取周报", tuple(all_data_week))
-    # exl.save_excel(path)
-    # print(u'创建demo.xls文件成功')
-    # send_email_tome()
-    word_count = get_keyword_ctr(start_ts, end_ts)
\ No newline at end of file
+    craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
+    print(craw_one_week)
+    # query 一周抓取详情
+    all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year,word_count)
+    path = "近1周数据统计结果.xls"
+    exl = WritrExcel()
+    exl.write_excel("热点内容抓取周报", tuple(craw_one_week))
+    exl.write_excel("query抓取周报", tuple(all_data_week))
+    exl.save_excel(path)
+    print(u'创建demo.xls文件成功')
+    send_email_tome()
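With the `__main__` block live again, ordering matters: `get_keyword_ctr` now runs before `craw_query_one_week`, which consumes its result, so the stray trailing call from the old version is gone. The resulting flow, condensed (names as defined in the script):

```python
data_index, start_ts, end_ts, week_num, last_week_num, year = week_num()
word_count = get_keyword_ctr(start_ts, end_ts)    # 1. exposures per query word
craw_one_week = get_how_word_crawler_count(       # 2. weekly crawl hot-spot stats
    data_index, start_ts, end_ts, week_num, last_week_num, year)
all_data_week = craw_query_one_week(              # 3. per-query detail; needs word_count
    data_index, start_ts, end_ts, week_num, last_week_num, year, word_count)
# 4. write both Excel sheets and email the workbook
```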