Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chengyang Zhong
crawler
Commits
ec314915
Commit
ec314915
authored
Aug 11, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
71d3f7a7
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
339 additions
and
0 deletions
+339
-0
crawler_week_report.py
tasks/crawler_week_report.py
+339
-0
test_read_config.py
test/test_read_config.py
+0
-0
No files found.
tasks/crawler_week_report.py
0 → 100644
View file @
ec314915
# -*- coding:UTF-8 -*-
# @Time : 2020/8/11 14:33
# @File : crawler_week_report.py
# @email : litao@igengmei.com
# @author : litao
# coding=utf-8
import
pymysql
import
xlwt
,
datetime
from
maintenance.func_send_email_with_file
import
send_file_email
import
zipfile
import
redis
from
maintenance.func_send_email_with_file
import
send_file_email
from
typing
import
Dict
,
List
from
elasticsearch
import
Elasticsearch
from
elasticsearch.helpers
import
scan
from
crawler.crawler_sys.utils.trans_qiniu_img
import
write_data_into_mysql
es_framework
=
Elasticsearch
(
hosts
=
'172.16.32.37'
,
port
=
9200
)
rds
=
redis
.
StrictRedis
(
host
=
'172.16.40.164'
,
port
=
6379
,
db
=
19
,
password
=
'ReDis!GmTx*0aN12'
)
es
=
Elasticsearch
([
{
'host'
:
'172.16.31.17'
,
'port'
:
9200
,
},
{
'host'
:
'172.16.31.11'
,
'port'
:
9200
,
}])
# def zipDir(dirpath,outFullName):
# """
# 压缩指定文件夹
# :param dirpath: 目标文件夹路径
# :param outFullName: 压缩文件保存路径+xxxx.zip
# :return: 无
# """
# import zipfile
# zip = zipfile.ZipFile(outFullName,"w",zipfile.ZIP_DEFLATED)
# for path,dirnames,filenames in os.walk(dirpath):
# # 去掉目标跟路径,只对目标文件夹下边的文件及文件夹进行压缩
# fpath = path.replace(dirpath,'')
#
# for filename in filenames:
# zip.write(os.path.join(path,filename),os.path.join(fpath,filename))
# zip.close()
def
send_email_tome
():
try
:
date
=
datetime
.
datetime
.
now
()
.
date
()
-
datetime
.
timedelta
(
days
=
1
)
fromaddr
=
'litao@igengmei.com'
content
=
'hi all:附件为'
+
str
(
date
)
+
'的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
zipFile
=
"/srv/apps/crawler/crawler_sys/utils/近一月数据统计结果.xls"
send_file_email
(
""
,
""
,
email_group
=
[
"<duanyingrong@igengmei.com>"
,
"<litao@igengmei.com>"
],
title_str
=
content
,
email_msg_body_str
=
content
,
file
=
zipFile
)
except
Exception
as
e
:
print
(
e
)
def
get_es_word
(
word
):
###answer
results
=
es
.
search
(
index
=
'gm-dbmw-answer-read'
,
doc_type
=
'answer'
,
timeout
=
'10s'
,
size
=
0
,
body
=
{
"query"
:
{
"bool"
:
{
"minimum_should_match"
:
1
,
"should"
:
[{
"match_phrase"
:
{
"title"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"desc"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"answer"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}}],
"must"
:
[
{
"term"
:
{
"is_online"
:
True
}
},
{
"terms"
:
{
"content_level"
:
[
6
,
5
,
4
,
3.5
,
3
]
}
},
{
"range"
:
{
"content_length"
:
{
"gte"
:
30
}
}
}],
}
},
}
)
answer_content_num
=
results
[
"hits"
][
"total"
]
# tractate
results
=
es
.
search
(
index
=
'gm-dbmw-tractate-read'
,
doc_type
=
'tractate'
,
timeout
=
'10s'
,
size
=
0
,
body
=
{
"query"
:
{
"bool"
:
{
"minimum_should_match"
:
1
,
"should"
:
[{
"match_phrase"
:
{
"content"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"tractate_tag_name"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"tractate_tag_name_content"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}}],
"must"
:
[{
"term"
:
{
"is_online"
:
True
}},
{
"terms"
:
{
"content_level"
:
[
6
,
5
,
4
,
3.5
,
3
]}
}]
}
}
}
)
tractate_content_num
=
results
[
"hits"
][
"total"
]
###diary 日记
results
=
es
.
search
(
index
=
'gm-dbmw-diary-read'
,
doc_type
=
'diary'
,
timeout
=
'10s'
,
size
=
0
,
body
=
{
"query"
:
{
"bool"
:
{
"minimum_should_match"
:
1
,
"should"
:
[{
"match_phrase"
:
{
"tags"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"answer"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}},
{
"match_phrase"
:
{
"service.name"
:
{
"query"
:
word
,
"analyzer"
:
"gm_default_index"
}}}],
"must"
:
[{
"term"
:
{
"is_online"
:
True
}},
{
"term"
:
{
"has_cover"
:
True
}
},
{
"term"
:
{
"is_sink"
:
False
}
},
{
"term"
:
{
"has_after_cover"
:
True
}
},
{
"term"
:
{
"has_before_cover"
:
True
}
},
{
"range"
:
{
"content_level"
:
{
"gte"
:
"3"
}}},
{
"term"
:
{
"content_simi_bol_show"
:
0
}
}
]
}
},
}
)
diary_content_num
=
results
[
"hits"
][
"total"
]
diary_query
=
{
"query"
:
{
"bool"
:
{
"must"
:
[{
"term"
:
{
"is_online"
:
True
}
},
{
"term"
:
{
"has_cover"
:
True
}
},
{
"term"
:
{
"is_sink"
:
False
}
},
{
"term"
:
{
"has_after_cover"
:
True
}
},
{
"term"
:
{
"has_before_cover"
:
True
}
},
{
"terms"
:
{
"content_level"
:
[
6
,
5
,
4
,
3.5
,
3
]
}
},
{
"term"
:
{
"content_simi_bol_show"
:
0
}
}]
}
},
"_source"
:
[
"id"
]
}
answer_query
=
{
"query"
:
{
"bool"
:
{
"must"
:
[{
"term"
:
{
"is_online"
:
True
}
},
{
"terms"
:
{
"content_level"
:
[
6
,
5
,
4
,
3.5
,
3
]
}
},
{
"range"
:
{
"content_length"
:
{
"gte"
:
30
}
}
}]
}
}
}
tractate_query
=
{
"query"
:
{
"bool"
:
{
"must"
:
[{
"term"
:
{
"is_online"
:
True
}
},
{
"terms"
:
{
"content_level"
:
[
6
,
5
,
4
,
3.5
,
3
]
}
}]
}
}
}
return
answer_content_num
,
tractate_content_num
,
diary_content_num
class
WritrExcel
():
def
set_style
(
self
,
name
,
height
,
bold
=
False
):
style
=
xlwt
.
XFStyle
()
# 初始化样式
font
=
xlwt
.
Font
()
# 为样式创建字体
font
.
name
=
name
font
.
bold
=
bold
font
.
color_index
=
4
font
.
height
=
height
style
.
font
=
font
return
style
# 写入Excel
def
write_excel
(
self
,
path
,
rows
):
# 创建工作簿
workbook
=
xlwt
.
Workbook
(
encoding
=
'utf-8'
)
# 创建sheet
data_sheet
=
workbook
.
add_sheet
(
'Sheet1'
)
# 将样式定义在循环之外
default
=
self
.
set_style
(
'Times New Roman'
,
220
,
True
)
j
=
k
=
0
# 循环读取每一行数据并写入Excel
for
row
in
rows
[:
65530
]:
for
i
in
range
(
len
(
row
)):
try
:
# 写入
data_sheet
.
write
((
j
+
k
),
i
,
row
[
i
],
default
)
except
:
print
(
i
)
raise
# data_sheet.write(1, i, row1[i], self.set_style('Times New Roman', 220, True))
k
=
k
+
1
workbook
.
save
(
path
)
print
(
"写入文件成功,共"
+
str
(
k
)
+
"行数据"
)
if
__name__
==
"__main__"
:
tag_names_list
=
[]
tag_names_list_week
=
[]
all_data_day
=
[]
all_data_week
=
[]
db_zhengxing_eagle
=
pymysql
.
connect
(
host
=
"172.16.30.136"
,
port
=
3306
,
user
=
"doris"
,
password
=
"o5gbA27hXHHm"
,
db
=
"doris_prod"
,
charset
=
'utf8'
,
cursorclass
=
pymysql
.
cursors
.
DictCursor
)
zhengxing_cursor
=
db_zhengxing_eagle
.
cursor
()
# date = datetime.datetime.now().date() - datetime.timedelta(days=30)
# sql = 'select keywords,sum(sorted) as nums,uv from api_search_words where is_delete = 0 and create_time = "' + str(
# date) + '" group by keywords order by nums desc'
# print(sql)
# zhengxing_cursor.execute("set names 'UTF8'")
# zhengxing_cursor.execute(sql)
# data = zhengxing_cursor.fetchall()
#
# tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")
# for name in list(data):
# word = name.get("keywords", None)
# num = name.get("nums", 0)
# uv = name.get("uv", 0)
#
# answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
# tag_names_list.append([word, num, uv, diary_content_num, answer_content_num, tractate_content_num])
#
# all_data_day.append(tup_title)
# for item in tag_names_list:
# all_data_day.append(tuple(item))
#
# path = str(date) + ".xls"
# WritrExcel().write_excel(path, tuple(all_data_day))
# print(u'创建demo.xls文件成功')
date
=
datetime
.
datetime
.
now
()
.
date
()
-
datetime
.
timedelta
(
days
=
15
)
sql
=
'select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words where is_delete = 0 and create_time >= "'
+
str
(
date
)
+
'" group by keywords order by nums desc'
print
(
sql
)
zhengxing_cursor
.
execute
(
"set names 'UTF8'"
)
zhengxing_cursor
.
execute
(
sql
)
data
=
zhengxing_cursor
.
fetchall
()
tup_title
=
(
"关键词"
,
"搜索次数"
,
"uv"
,
"日记数量"
,
"回答数量"
,
"帖子数量"
)
for
name
in
list
(
data
):
word
=
name
.
get
(
"keywords"
,
None
)
sorteds
=
name
.
get
(
"nums"
,
0
)
uv
=
name
.
get
(
"uvs"
,
0
)
answer_content_num
,
tractate_content_num
,
diary_content_num
=
get_es_word
(
word
)
tag_names_list_week
.
append
([
word
,
sorteds
,
uv
,
diary_content_num
,
answer_content_num
,
tractate_content_num
])
all_data_week
.
append
(
tup_title
)
for
item
in
tag_names_list_week
:
all_data_week
.
append
(
tuple
(
item
))
path
=
"近1周数据统计结果.xls"
WritrExcel
()
.
write_excel
(
path
,
tuple
(
all_data_week
))
print
(
u'创建demo.xls文件成功'
)
send_email_tome
()
test/test_read_config.py
View file @
ec314915
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment