Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
b6079ebc
Commit
b6079ebc
authored
Aug 17, 2021
by
李小芳
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
ad65e200
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
125 deletions
+0
-125
app_dianping_by_cityname.py
dev/DianPing/app_dianping_by_cityname.py
+0
-125
dianping_test.py
dev/DianPing/dianping_test.py
+0
-0
No files found.
dev/DianPing/app_dianping_by_cityname.py
deleted
100644 → 0
View file @
ad65e200
import
json
import
logging
import
smtplib
import
sys
import
time
import
traceback
import
datetime
import
os
import
random
from
email.mime.application
import
MIMEApplication
from
email.mime.multipart
import
MIMEMultipart
from
email.mime.text
import
MIMEText
from
email.utils
import
formataddr
import
re
import
pandas
as
pd
import
requests
from
lxml
import
etree
from
pypinyin
import
lazy_pinyin
import
requests
from
bs4
import
BeautifulSoup
import
pandas
as
pd
from
tqdm
import
tqdm_notebook
from
fontTools.ttLib
import
TTFont
logger
=
logging
.
getLogger
(
__name__
)
class DianPintCraw(object):
    """Scraper for a Dianping keyword-search result page.

    The instance carries a fixed search URL and a fixed set of browser-like
    request headers. ``parse_url`` downloads the page, ``search`` walks the
    shop list in the returned HTML and prints a few fields per shop, and
    ``woff_change`` maps characters rendered through an obfuscating web font
    back to readable characters.
    """

    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Replacement characters used by ``woff_change``; newlines and spaces are
    # layout only and are filtered out below, so an index into ``_WOFFS``
    # counts real characters only.
    _WOFF_STRING = (
        "\n"
        "1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福\n"
        "人百餐茶务通味所山区门药银 农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建 "
    )
    _WOFFS = tuple(c for c in _WOFF_STRING if c != '\n' and c != ' ')

    def __init__(self):
        """Set the request headers and the search URL used by ``parse_url``."""
        # NOTE(review): the Cookie below is a hard-coded logged-in session
        # token. It should be loaded from configuration/environment rather
        # than committed to source control - confirm and rotate.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            "Cookie": "fspop=test; cy=2; cye=beijing; _lxsdk_cuid=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _lxsdk=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _hc.v=7cd93c95-3674-1de2-0725-2e8d4141c973.1626848053; s_ViewType=10; dplet=45b53ad04cb79c04c2e30bea98dca7ef; dper=8591feb7929077261e0c0702628cd4314faa13a74729c7e6480d13c3220c85e5b0f336a0b2af7450370e86f53958152509c44d579007ab941b3a66bc922cdf19cde4eecbdb3f94ef3a0532a955ea9e11803bbf18d01a29bad962ca22e13f6543; ll=7fd06e815b796be3df069dec7836c3df; ua=%E9%99%AA%E4%BD%A0%E6%90%9E%E6%80%AA; ctu=23034069fac8b78bdb78108ada1c10714737c4da63d46c011bfd4779f1daa177; cityid=2; switchcityflashtoast=1; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3; source=m_browser_test_33; Appshare2021_ab=shop%3AA%3A1%7Cmap%3AA%3A1%7Cshopphoto%3AA%3A1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1626862684,1627020606,1627041159,1627292689; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1627294126; _lxsdk_s=17ae233df3e-b4b-9f4-00d%7C%7C304",
            'Host': 'www.dianping.com',
            'Referer': 'http://www.dianping.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        }
        self.url = 'http://www.dianping.com/search/keyword/2/0_%E6%B0%B4%E5%85%89%E9%92%88'

    def parse_url(self):
        """Download ``self.url`` and return the response body.

        The body is printed to stdout regardless of the status code.

        Returns:
            The response text when the status code is 200, otherwise ``None``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within ``REQUEST_TIMEOUT``
                seconds.
        """
        response = requests.get(
            url=self.url,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        content = response.text
        print(content)
        if response.status_code == 200:
            return content
        return None

    @staticmethod
    def _find_path(node, *steps):
        """Follow a chain of ``find`` calls, stopping at the first miss.

        Each step is either a tag name (``str``) or an ``attrs`` dict. Returns
        the final node, or ``None`` as soon as ``node`` or any step is missing.
        """
        for step in steps:
            if node is None:
                return None
            if isinstance(step, str):
                node = node.find(step)
            else:
                node = node.find(attrs=step)
        return node

    def search(self):
        """Fetch the search page and print selected fields of every shop.

        Returns:
            The ``<li>`` elements of the shop list. An empty list is returned
            when the page could not be downloaded or does not contain the
            expected shop-list markup.
        """
        content = self.parse_url()
        if content is None:
            # parse_url signals a non-200 response with None; there is nothing
            # to parse in that case.
            return []

        parsed_response = BeautifulSoup(content, "lxml")
        listing = self._find_path(
            parsed_response,
            {"class": "section Fix J-shop-search"},
            {"class": "content-wrap"},
            {"class": "shop-wrap"},
            {"class": "content"},
            {"class": "shop-list J_shop-list shop-all-list"},
            "ul",
        )
        if listing is None:
            logger.warning("shop list not found in response from %s", self.url)
            return []

        shop_search = listing.find_all("li")
        for item in shop_search:
            title = self._find_path(
                item, {"class": "txt"}, {"class": "tit"}, "a", "h4")
            hospital_name = title.get_text() if title is not None else None
            print(hospital_name)

            # The three fields below all live under the same "comment" node.
            comment = self._find_path(
                item, {"class": "txt"}, {"class": "comment"})

            star_icon = self._find_path(
                comment, {"class": "nebula_star"}, {"class": "star_icon"})
            star_info = star_icon.find_all("span") if star_icon is not None else []
            print("star_info:", star_info)

            review_num_info = self._find_path(
                comment, {"class": "review-num"}, "b")
            print("review_num_info:", review_num_info)

            meanprice_info = self._find_path(
                comment, {"class": "mean-price"}, "b")
            print("meanprice_info:", meanprice_info)

            # service_info_data = item.find(attrs={"class": "svr-info"}).find(
            #     attrs={"class": "si-deal d-packup"}).find_all("a")
            # for service_info in service_info_data:
            #     sku_info = service_info.text()
            # print(base_info_data)
            # print(service_info_data)
            print("-----------")
        return shop_search

    def woff_change(self, wofflist, TTG, woffdict):
        """Replace font-obfuscated characters with their readable form.

        Args:
            wofflist: iterable of characters (typically a ``str``) to decode.
            TTG: collection of glyph names (``"uniXXXX"``) present in the font.
            woffdict: mapping from glyph name to an index into the
                replacement-character table.

        Returns:
            The decoded string, or ``"编码错误"`` when a character's escaped
            form cannot be decoded as UTF-8.
        """
        pieces = []
        try:
            for char in wofflist:
                # '\uXXXX' escape -> 'uniXXXX', the glyph naming used in TTG.
                glyph_name = (
                    char.encode('raw_unicode_escape')
                    .replace(b'\\u', b'uni')
                    .decode('utf-8')
                )
                if glyph_name in TTG:
                    pieces.append(self._WOFFS[woffdict[glyph_name]])
                else:
                    pieces.append(char)
        except UnicodeDecodeError:
            return "编码错误"
        return ''.join(pieces)

    # Example: scraping the address.
    # ``soup`` is the content of the web page.
    # def get_adress(self):
    #     addressfont = TTFont('/Users/edz/Downloads/3944c230.woff')
    #     address_TTGlyphs = addressfont['cmap'].tables[0].ttFont.getGlyphOrder()[2:]
    #     address_dict = {}
    #     for i, x in enumerate(address_TTGlyphs):
    #         address_dict[x] = i
    #     # adress = soup("div.tag-addr > span").text()
    #
    #     location = self.woff_change(adress, address_TTGlyphs, address_dict)
    #     locations = re.sub('\s', '', location)
    #     return locations
if __name__ == '__main__':
    # Script entry point: build the crawler and download (and print) the
    # configured search page once.
    DianPintCraw().parse_url()
dev/DianPing/dianping_test.py
deleted
100644 → 0
View file @
ad65e200
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment