Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
ffd0efe4
Commit
ffd0efe4
authored
Aug 03, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
d3373a11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
257 additions
and
5 deletions
+257
-5
README.md
README.md
+4
-2
cal_ni_and_put_to_backend.py
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
+8
-1
crawler_toutiao.py
crawler_sys/site_crawler/crawler_toutiao.py
+0
-0
__init__.py
crawler_sys/site_crawler/crawler_weibo/__init__.py
+6
-0
crawler_weibo.py
crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py
+0
-0
crawler_douban.py
crawler_sys/site_crawler_test/crawler_douban.py
+1
-1
get_query_result.py
crawler_sys/utils/get_query_result.py
+237
-0
output_results.py
crawler_sys/utils/output_results.py
+1
-1
No files found.
README.md
View file @
ffd0efe4
# crawler
## 发布者页爬虫
1.
部署在BJ-GM-Prod-Cos-faiss001/srv/apps/
2.
切换权限 sudo su - gmuser
3.
source /root/anaconda3/bin/activate
4.
创建虚拟环境 conda activate crawler_env/conda deactivate
5.
抓取程序 nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &
6.
写入抓取url程序 python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
\ No newline at end of file
6.
写入抓取url程序 python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
## 搜索页爬虫
crawler_sys/scheduler/cal_ni_and_put_to_backend.py
View file @
ffd0efe4
...
...
if __name__ == "__main__":
    # Fan the scheduler out over four worker processes.
    # shutdown(True) blocks until every submitted task_main run finishes.
    from concurrent.futures import ProcessPoolExecutor
    executor = ProcessPoolExecutor(max_workers=4)
    futures = []
    for worker_idx in range(4):
        futures.append(executor.submit(task_main))
        print('Processe %s start' % worker_idx)
    executor.shutdown(True)
crawler_sys/site_crawler/crawler_toutiao.py
View file @
ffd0efe4
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler/crawler_weibo/__init__.py
0 → 100644
View file @
ffd0efe4
# -*- coding:UTF-8 -*-
# @Time : 2020/7/31 11:32
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
crawler_sys/site_crawler/crawler_weibo/crawler_weibo.py
View file @
ffd0efe4
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_test/crawler_douban.py
View file @
ffd0efe4
...
...
@@ -81,7 +81,7 @@ class Crawler_douban():
comment_count
=
trans_play_count
(
page_json
[
"comments_count"
])
favorite_count
=
trans_play_count
(
page_json
[
"like_count"
])
collection_count
=
trans_play_count
(
page_json
[
"collections_count"
])
img_list
=
re
.
findall
(
'img src="
.*?
"'
,
content
)
img_list
=
re
.
findall
(
'img src="
(.*?)
"'
,
content
)
dic
=
{
"content"
:
content
,
"repost_count"
:
repost_count
,
...
...
crawler_sys/utils/get_query_result.py
0 → 100644
View file @
ffd0efe4
# coding=utf-8
import
pymysql
from
elasticsearch
import
Elasticsearch
import
smtplib
,
xlwt
,
logging
,
traceback
,
datetime
import
smtplib
from
email.mime.text
import
MIMEText
from
email.mime.image
import
MIMEImage
from
email.mime.multipart
import
MIMEMultipart
from
email.mime.application
import
MIMEApplication
from
email.utils
import
formataddr
# Shared Elasticsearch client used by every query in this module:
# a two-node cluster, default transport options.
es = Elasticsearch([
    {'host': '172.16.31.17', 'port': 9200},
    {'host': '172.16.31.11', 'port': 9200},
])
def send_email_tome():
    """Email yesterday's and last week's search-word .xls reports.

    Reads ``<yesterday>.xls`` and ``近一周数据统计结果.xls`` from the current
    working directory and sends them as attachments via exmail SMTP over
    SSL. All failures are printed/logged; the function never raises.
    """
    try:
        date = datetime.datetime.now().date() - datetime.timedelta(days=1)
        fromaddr = 'litao@igengmei.com'
        # SECURITY: credential hard-coded in source — move to env var/config.
        password = 'hTx9kAikArsSNsDr'
        toaddrs6 = "litao@igengmei.com"

        content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
        textApart = MIMEText(content)

        # Daily report: file is named after the date, e.g. "2020-08-02.xls".
        zipFile = str(date) + ".xls"
        with open(zipFile, 'rb') as f:  # fix: previously leaked the file handle
            zipApart = MIMEApplication(f.read())
        zipApart.add_header('Content-Disposition', 'attachment', filename=zipFile)

        # Weekly report has a fixed file name.
        zipFile_week = '近一周数据统计结果.xls'
        with open(zipFile_week, 'rb') as f:  # fix: previously leaked the file handle
            zipApart_week = MIMEApplication(f.read())
        zipApart_week.add_header('Content-Disposition', 'attachment',
                                 filename=zipFile_week)

        m = MIMEMultipart()
        m.attach(textApart)
        m.attach(zipApart_week)
        m.attach(zipApart)
        m['From'] = formataddr(["黎涛", toaddrs6])
        m["To"] = formataddr(["黎涛", toaddrs6])
        m['Subject'] = '每日搜索词结果统计'

        try:
            server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
            server.login(fromaddr, password)
            server.sendmail(fromaddr, [toaddrs6], m.as_string())
            print('success')
            server.quit()
        except smtplib.SMTPException as e:
            # SMTP-level failure (auth, connection, recipient) — report and continue.
            print('error', e)
    except Exception:
        # Catch-all boundary: log the full traceback instead of crashing the job.
        logging.error("catch exception,main:%s" % traceback.format_exc())
def _phrase(field, word):
    """Build a match_phrase clause on *field* using the gm_default_index analyzer."""
    return {"match_phrase": {field: {"query": word,
                                     "analyzer": "gm_default_index"}}}


def _count_online_hits(index, doc_type, should, extra_must=None):
    """Return hits.total for online docs in *index* matching any *should* clause.

    Always filters on ``is_online: True``; *extra_must* appends additional
    must-filters (e.g. a content_level range for diaries).
    """
    must = [{"term": {"is_online": True}}]
    if extra_must:
        must = must + extra_must
    results = es.search(index=index,
                        doc_type=doc_type,
                        timeout='10s',
                        size=0,  # counts only — no documents fetched
                        body={
                            "query": {
                                "bool": {
                                    "minimum_should_match": 1,
                                    "should": should,
                                    "must": must,
                                }
                            },
                        })
    return results["hits"]["total"]


def get_es_word(word):
    """Count online ES documents mentioning *word*.

    Returns a tuple ``(answer_count, tractate_count, diary_count)`` where each
    element is the total number of online documents in the corresponding index
    whose searchable fields phrase-match *word*. Diaries additionally require
    ``content_level >= "3"``.
    """
    # Answers: match in title, description, or answer body.
    answer_content_num = _count_online_hits(
        'gm-dbmw-answer-read', 'answer',
        [_phrase("title", word),
         _phrase("desc", word),
         _phrase("answer", word)])

    # Tractates (posts): match in content or tag-name fields.
    tractate_content_num = _count_online_hits(
        'gm-dbmw-tractate-read', 'tractate',
        [_phrase("content", word),
         _phrase("tractate_tag_name", word),
         _phrase("tractate_tag_name_content", word)])

    # Diaries: match in tags/answer/service name, restricted to quality level >= 3.
    diary_content_num = _count_online_hits(
        'gm-dbmw-diary-read', 'diary',
        [_phrase("tags", word),
         _phrase("answer", word),
         _phrase("service.name", word)],
        extra_must=[{"range": {"content_level": {"gte": "3"}}}])

    return answer_content_num, tractate_content_num, diary_content_num
class WritrExcel():
    """Small xlwt helper that dumps row tuples into a single-sheet .xls file.

    NOTE: class name typo ("Writr") kept for backward compatibility with callers.
    """

    def set_style(self, name, height, bold=False):
        """Return an xlwt XFStyle with the given font name/height/bold.

        color_index 4 selects xlwt's built-in blue.
        """
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        font.color_index = 4
        font.height = height
        style.font = font
        return style

    def write_excel(self, path, rows):
        """Write *rows* (iterable of sequences) to *path* as Sheet1.

        Rows beyond 65530 are silently dropped — the legacy .xls format caps
        sheets at 65536 rows. Prints the number of rows written.
        """
        workbook = xlwt.Workbook(encoding='utf-8')
        data_sheet = workbook.add_sheet('Sheet1')
        # One shared style instance, built once outside the loop.
        default = self.set_style('Times New Roman', 220, True)
        row_idx = 0  # fix: removed dead `j` counter that was always 0
        for row in rows[:65530]:
            for col_idx, value in enumerate(row):
                try:
                    data_sheet.write(row_idx, col_idx, value, default)
                except Exception:  # fix: bare except also trapped KeyboardInterrupt
                    # Report the failing column, then re-raise unchanged.
                    print(col_idx)
                    raise
            row_idx = row_idx + 1
        workbook.save(path)
        print("写入文件成功,共" + str(row_idx) + "行数据")
if __name__ == "__main__":
    # Column header shared by the daily and weekly reports.
    tup_title = ("关键词", "搜索次数", "uv", "日记数量", "回答数量", "帖子数量")

    def _fetch_word_rows(cursor, sql, num_key, uv_key):
        """Run *sql*, enrich each keyword with ES content counts.

        Returns the header tuple followed by one row per keyword:
        (word, count, uv, diary_num, answer_num, tractate_num).
        *num_key*/*uv_key* name the SQL result columns to read, since the
        daily and weekly queries alias them differently.
        """
        print(sql)
        cursor.execute("set names 'UTF8'")
        cursor.execute(sql)
        rows = [tup_title]
        for record in cursor.fetchall():
            word = record.get("keywords", None)
            num = record.get(num_key, 0)
            uv = record.get(uv_key, 0)
            answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
            rows.append((word, num, uv,
                         diary_content_num, answer_content_num, tractate_content_num))
        return rows

    db_zhengxing_eagle = pymysql.connect(
        host="172.16.30.136",
        port=3306,
        user="doris",
        # SECURITY: credential hard-coded in source — move to env var/config.
        password="o5gbA27hXHHm",
        db="doris_prod",
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor)
    zhengxing_cursor = db_zhengxing_eagle.cursor()

    # ---- Daily report: yesterday's keywords -------------------------------
    date = datetime.datetime.now().date() - datetime.timedelta(days=1)
    # NOTE(review): date is generated locally so injection risk is low, but a
    # parameterized query would still be safer than string concatenation.
    sql = ('select keywords,sum(sorted) as nums,uv from api_search_words '
           'where is_delete = 0 and create_time = "' + str(date)
           + '" group by keywords order by nums desc')
    all_data_day = _fetch_word_rows(zhengxing_cursor, sql, "nums", "uv")
    path = str(date) + ".xls"
    WritrExcel().write_excel(path, tuple(all_data_day))
    print(u'创建demo.xls文件成功')

    # ---- Weekly report: last 7 days ---------------------------------------
    date = datetime.datetime.now().date() - datetime.timedelta(days=7)
    sql = ('select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words '
           'where is_delete = 0 and create_time >= "' + str(date)
           + '" group by keywords order by nums desc')
    all_data_week = _fetch_word_rows(zhengxing_cursor, sql, "nums", "uvs")
    WritrExcel().write_excel("近一周数据统计结果.xls", tuple(all_data_week))
    print(u'创建demo.xls文件成功')

    send_email_tome()
crawler_sys/utils/output_results.py
View file @
ffd0efe4
...
...
@@ -174,7 +174,7 @@ def bulk_write_into_es(dict_Lst,
)
except
TransportError
:
print
(
"output to es register error"
)
write_str_into_file
(
file_path
=
'/home/
fangyucheng/
'
,
write_str_into_file
(
file_path
=
'/home/'
,
file_name
=
'debug'
,
var
=
bulk_write_body
)
return
retry_counter_for_UnicodeEncodeError
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment