Commit 781e362a authored by litaolemo

update

parent 381836ae
@@ -19,6 +19,16 @@ from crawler.crawler_sys.utils.trans_qiniu_img import write_data_into_mysql
es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
conn_mimas = pymysql.connect(host='172.16.30.138', port=3306, user='mimas', passwd='GJL3UJe1Ck9ggL6aKnZCq4cRvM',
                             db='mimas_prod', charset='utf8mb4')
cur_mimas = conn_mimas.cursor()

db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
                                     password="o5gbA27hXHHm",
                                     db="doris_prod",
                                     charset='utf8',
                                     cursorclass=pymysql.cursors.DictCursor)
zhengxing_cursor = db_zhengxing_eagle.cursor()

es = Elasticsearch([
    {
@@ -30,6 +40,7 @@ es = Elasticsearch([
}])


# def zipDir(dirpath,outFullName):
#     """
#     Compress the specified folder
@@ -53,51 +64,15 @@ def send_email_tome():
        fromaddr = 'litao@igengmei.com'
        content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
        zipFile = "/srv/apps/crawler/近一月数据统计结果.xls"
        send_file_email("", "", email_group=["<litao@igengmei.com>"], title_str=content
                        , email_msg_body_str=content, file=zipFile)
    except Exception as e:
        print(e)
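send_file_email is imported from the crawler utils package and its implementation is not shown in this diff. Purely as an illustration, a minimal attachment-mailing helper built on the standard library could look like the sketch below; the SMTP host and the helper name are placeholder assumptions, only the sender address and parameter names are taken from the call above.

# Illustrative sketch only -- not the project's send_file_email implementation.
import smtplib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def send_file_email_sketch(title_str, email_msg_body_str, email_group, file):
    msg = MIMEMultipart()
    msg["Subject"] = title_str
    msg["From"] = "litao@igengmei.com"            # sender used in the code above
    msg["To"] = ", ".join(email_group)
    msg.attach(MIMEText(email_msg_body_str, "plain", "utf-8"))
    with open(file, "rb") as f:
        part = MIMEApplication(f.read())
    part.add_header("Content-Disposition", "attachment", filename=file.split("/")[-1])
    msg.attach(part)
    with smtplib.SMTP("smtp.example.com", 25) as server:   # placeholder SMTP host
        server.send_message(msg)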

def get_es_word(word, start_ts):
    # tractate online
    results = es.search(
        index='gm-dbmw-tractate-read',
        doc_type='tractate',
@@ -113,129 +88,81 @@ def get_es_word(word):
                                     "analyzer": "gm_default_index"}}}],
                    "must": [{"term": {"is_online": True}}, {
                        "terms":
                            {"content_level": [6, 5, 4, 3]}
                    }]
                }
            }, "size": 0, "aggs": {
                "content_level": {
                    "terms": {
                        "field": "content_level",
                        "size": 10
                    }
                }
            }
        }
    )
    tractate_content_num = results["hits"]["total"]
    content_level_count = results["aggregations"]["content_level"]["buckets"]

    # crawled tractates in the window
    results = es.search(
        index='gm-dbmw-tractate-read',
        doc_type='tractate',
        timeout='10s',
        size=0,
        body={
            "query": {
                "bool": {
                    "minimum_should_match": 1,
                    "should": [{"match_phrase": {"content": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}},
                               {"match_phrase": {"tractate_tag_name_content": {"query": word,
                                                                               "analyzer": "gm_default_index"}}}],
                    "must": [
                        {"terms": {"platform": [13, 14, 15, 16, 17]}},
                        {"range": {"create_time": {"gte": start_ts}}}
                    ]
                }
            }, "size": 0, "aggs": {
                "content_level": {
                    "terms": {
                        "field": "content_level",
                        "size": 10
                    },
                    "aggs": {
                        "is_online": {
                            "terms": {
                                "field": "is_online",
                                "size": 10
                            }
                        }
                    }
                }
            }
        }
    )
    craw_tractate_num = results["hits"]["total"]
    craw_level_count = results["aggregations"]["content_level"]["buckets"]

    res_list = []
    # visible posts by content level 3/4/5/6
    for content_level in ("3", "4", "5", "6"):
        doc_count = 0
        for content_level_dic in content_level_count:
            if str(content_level_dic["key"]) == content_level:
                doc_count = content_level_dic["doc_count"]
                break
        res_list.append(doc_count)
    res_list.append(craw_tractate_num)
    # crawled posts reviewed online in the past week, by content level
    for content_level in ("3", "4", "5", "6"):
        doc_count = 0
        for content_level_dic in craw_level_count:
            if str(content_level_dic["key"]) == content_level:
                for buck in content_level_dic["is_online"]["buckets"]:
                    if buck["key"] == 1:
                        doc_count = buck["doc_count"]
                        break
        res_list.append(doc_count)
    return res_list
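The two loops above pull counts out of Elasticsearch terms-aggregation buckets. As a reading aid, here is a self-contained sketch of the same lookup run against hand-made buckets; the keys and doc_count values are invented, not real index data.

# Illustrative sketch only -- fake buckets, same extraction logic as above.
def _demo_bucket_lookup():
    fake_buckets = [
        {"key": 3, "doc_count": 120, "is_online": {"buckets": [{"key": 1, "doc_count": 100}]}},
        {"key": 4, "doc_count": 45, "is_online": {"buckets": [{"key": 1, "doc_count": 40}]}},
    ]
    counts = []
    for level in ("3", "4", "5", "6"):
        doc_count = 0
        for bucket in fake_buckets:
            if str(bucket["key"]) == level:
                doc_count = bucket["doc_count"]
                break
        counts.append(doc_count)
    return counts  # [120, 45, 0, 0]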

class WritrExcel():
@@ -274,66 +201,153 @@ class WritrExcel():
        print("写入文件成功,共" + str(k) + "行数据")

def get_search_keywrod_dict():
    search_body = {
        "query": {
            "bool": {
                "filter": [
                ]
            }
        }
    }
    return_dict = {}
    scan_res = scan(es_framework, query=search_body, index="search_word")
    for res in scan_res:
        return_dict[res["_source"]["title"]] = {}
    return return_dict
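scan here is assumed to be elasticsearch.helpers.scan (the import section lies outside this diff); it streams every hit of the query, with each document exposed under "_source". A compact equivalent of the loop above, under that assumption:

# Illustrative sketch only -- assumes scan is elasticsearch.helpers.scan.
def _search_word_titles_sketch():
    from elasticsearch.helpers import scan as es_scan
    query = {"query": {"match_all": {}}}
    return {hit["_source"]["title"]: {}
            for hit in es_scan(es_framework, query=query, index="search_word")}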

def get_hot_crawler_data_count(index, start_ts):
    search_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {"fetch_time": {"gte": start_ts}}}
                ],
                "must_not": [
                    {"exists": {"field": "search_word"}}
                ]
            }
        }, "size": 0, "aggs": {
            "NAME": {
                "terms": {
                    "field": "platform",
                    "size": 10
                }
            }
        }
    }
    search_res = es_framework.search(body=search_body, index=index)
    platform_count = search_res["aggregations"]["NAME"]["buckets"]
    return platform_count
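The function returns the raw bucket list of the platform terms aggregation; get_how_word_crawler_count below matches these buckets by platform name. A hedged example of its shape, with invented counts:

# Illustrative sketch only -- invented counts, real shape of a terms-aggregation bucket list.
example_platform_count = [
    {"key": "douban", "doc_count": 1520},
    {"key": "weibo", "doc_count": 980},
]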

def get_count_hot_crawler_data_from_mysql(start_ts: int) -> Dict:
    data_platform_id_list = [11, 12]
    res_dict = {}
    for platform_id in data_platform_id_list:
        start_date = datetime.datetime.fromtimestamp(start_ts / 1e3)
        match_all_sql = """select count(id) as nums from api_tractate where create_time >= '{time}' and platform = {platfrom}""".format(
            time=str(start_date), platfrom=platform_id)
        cur_mimas.execute(match_all_sql)
        data = cur_mimas.fetchall()
        match_all_num = data[0][0]
        # print(match_all_num)
        oneline_sql = """select count(id) as nums from api_tractate where create_time >= '{time}' and is_online = 1 and platform = {platfrom}""".format(
            time=str(start_date), platfrom=platform_id)
        cur_mimas.execute(oneline_sql)
        data = cur_mimas.fetchall()
        match_oneline_num = data[0][0]
        # print(match_oneline_num)
        if platform_id == 11:
            platform = 'weibo'
        elif platform_id == 12:
            platform = "douban"
        res_dict[platform] = []
        res_dict[platform].append(match_all_num)
        res_dict[platform].append(match_oneline_num)
    return res_dict
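The two count queries above interpolate values with str.format. As a side note, the same counts can be fetched with pymysql parameter binding; a minimal sketch using the same table and columns as above (the helper name is hypothetical):

# Illustrative sketch only -- parameter-binding variant of the counts above.
def _count_tractates_sketch(cursor, start_date, platform_id, online_only=False):
    sql = "select count(id) from api_tractate where create_time >= %s and platform = %s"
    if online_only:
        sql += " and is_online = 1"
    cursor.execute(sql, (start_date, platform_id))
    return cursor.fetchone()[0]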

def week_num(year=None, cycle=None, cycle_num=None, compare_type=None):
    now = datetime.datetime.now()
    now_canlendar = now.isocalendar()
    if not cycle_num:
        week_canlendar = now_canlendar
    else:
        week_canlendar = (now.year, cycle_num + 1, 0)
    year = week_canlendar[0]
    this_week = week_canlendar[1]
    if this_week == 0:
        last_year = year - 1
        this_week = 1
    else:
        last_year = year
    if this_week == 1:
        last_week = "52"
    else:
        last_week = this_week - 1
    today = datetime.datetime(datetime.datetime.now().year, datetime.datetime.now().month,
                              datetime.datetime.now().day)
    # today = datetime.datetime(year=2018, month=12, day=25)
    first_day_in_week = today - datetime.timedelta(
        days=now_canlendar[2] + 7 * (now_canlendar[1] - week_canlendar[1]) + 2)
    fisrt_day_ts = int(first_day_in_week.timestamp() * 1e3)
    last_day_in_week = first_day_in_week + datetime.timedelta(days=7)
    last_day_ts = int(last_day_in_week.timestamp() * 1e3)
    this_week_index = 'crawler-data-raw'
    return this_week_index, fisrt_day_ts, last_day_ts, this_week, last_week, last_year
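With cycle_num left at None, week_canlendar equals now_canlendar, so the window is the seven days starting at today minus (ISO weekday + 2) days, returned as millisecond timestamps. A tiny standalone check of that arithmetic, assuming "now" is Wednesday 2020-09-09 (a date picked only for illustration):

# Illustrative sketch only -- reproduces the cycle_num=None branch for a fixed date.
def _demo_week_window():
    import datetime
    today = datetime.datetime(2020, 9, 9)                    # isocalendar()[2] == 3 (Wednesday)
    first_day = today - datetime.timedelta(days=today.isocalendar()[2] + 2)
    last_day = first_day + datetime.timedelta(days=7)
    return first_day.date(), last_day.date()                 # (2020-09-04, 2020-09-11)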

def get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year):
    res_data = [("周", "抓取平台", "入库内容数", "推送候选内容入库数", "上线内容数", "推送内容数")]
    craw_data_count = get_hot_crawler_data_count(data_index, start_ts)
    res_dict = get_count_hot_crawler_data_from_mysql(start_ts)
    for platform in ("douban", "weibo"):
        craw_one_week_count = ""
        for buck in craw_data_count:
            if buck["key"] == platform:
                craw_one_week_count = buck["doc_count"]
        push_num = res_dict[platform][0]
        oneline_num = res_dict[platform][1]
        new_line = (week_num, platform, craw_one_week_count, push_num, oneline_num)
        res_data.append(new_line)
    return res_data

def craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year):
    tag_names_list_week = []
    all_data_week = []
    date = datetime.datetime.fromtimestamp(start_ts / 1e3)
    one_month_ago = datetime.datetime.now().date() - datetime.timedelta(days=30)
    search_keyword_dict = get_search_keywrod_dict()

    sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words where is_delete = 0 and create_time >= "' + str(
        one_month_ago) + '" group by keywords order by nums desc'
    print(sql)
    zhengxing_cursor.execute("set names 'UTF8'")
    zhengxing_cursor.execute(sql)
    data = zhengxing_cursor.fetchall()

    tup_title = ("关键词", "近一周搜索内容ctr", "过去30天搜索pv", "过去30天搜索uv", "帖子-3星内容数量", "帖子-4星内容数量",
                 "帖子-5星内容数量", "帖子-6星内容数量", "近一周抓取数量", "近一周审核上线3星数量",
                 "近一周审核上线4星数量", "近一周审核上线5星数量", "近一周审核上线6星数量")
    tag_names_list_week.append(tup_title)
    for name in list(data):
        word = name.get("keywords", None)
        if word in search_keyword_dict:
            sorteds = name.get("nums", 0)
            uv = name.get("uvs", 0)
            tractate_content_num = get_es_word(word, start_ts)
            tag_names_list_week.append(tuple([word, "", sorteds, uv] + tractate_content_num))
    return tag_names_list_week
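Each appended row lines up with the 13-column tup_title header: keyword, ctr placeholder, 30-day search pv and uv, the four per-level post counts, the week's crawl count, and the four reviewed-online per-level counts. A hedged example row with invented values:

# Illustrative sketch only -- invented numbers, shows the column order of one row.
example_row = ("双眼皮", "", 1200, 860, 35, 20, 8, 1, 140, 12, 9, 3, 0)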

if __name__ == "__main__":
    data_index, start_ts, end_ts, week_num, last_week_num, year = week_num()
    # hot-content crawl counts for the past week
    craw_one_week = get_how_word_crawler_count(data_index, start_ts, end_ts, week_num, last_week_num, year)
    # per-query crawl details for the past week
    all_data_week = craw_query_one_week(data_index, start_ts, end_ts, week_num, last_week_num, year)
    craw_one_week.append(())
    craw_one_week.extend(all_data_week)

    path = "近1周数据统计结果.xls"
    WritrExcel().write_excel(path, tuple(craw_one_week))
    print(u'创建demo.xls文件成功')
    send_email_tome()