Commit b6079ebc authored by 李小芳's avatar 李小芳

update

parent ad65e200
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
from fontTools.ttLib import TTFont
# Module-level logger (stdlib convention: one logger named after the module).
logger = logging.getLogger(__name__)
class DianPintCraw(object):
    """Crawler for dianping.com keyword-search result pages.

    Fetches the search page for a hard-coded keyword, extracts per-shop
    fields from the result list, and decodes dianping's anti-scraping
    web-font (woff) obfuscated text via :meth:`woff_change`.
    """

    # Seconds to wait for the HTTP request before giving up.  The original
    # code passed no timeout, so a stalled server hung the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers (including a captured session Cookie) required
        # to get past dianping.com's anti-bot checks.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            "Cookie": "fspop=test; cy=2; cye=beijing; _lxsdk_cuid=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _lxsdk=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _hc.v=7cd93c95-3674-1de2-0725-2e8d4141c973.1626848053; s_ViewType=10; dplet=45b53ad04cb79c04c2e30bea98dca7ef; dper=8591feb7929077261e0c0702628cd4314faa13a74729c7e6480d13c3220c85e5b0f336a0b2af7450370e86f53958152509c44d579007ab941b3a66bc922cdf19cde4eecbdb3f94ef3a0532a955ea9e11803bbf18d01a29bad962ca22e13f6543; ll=7fd06e815b796be3df069dec7836c3df; ua=%E9%99%AA%E4%BD%A0%E6%90%9E%E6%80%AA; ctu=23034069fac8b78bdb78108ada1c10714737c4da63d46c011bfd4779f1daa177; cityid=2; switchcityflashtoast=1; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3; source=m_browser_test_33; Appshare2021_ab=shop%3AA%3A1%7Cmap%3AA%3A1%7Cshopphoto%3AA%3A1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1626862684,1627020606,1627041159,1627292689; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1627294126; _lxsdk_s=17ae233df3e-b4b-9f4-00d%7C%7C304",
            'Host': 'www.dianping.com',
            'Referer': 'http://www.dianping.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        }
        # Search URL: city id 2 (Beijing), keyword URL-encoded "水光针".
        self.url = 'http://www.dianping.com/search/keyword/2/0_%E6%B0%B4%E5%85%89%E9%92%88'

    def parse_url(self):
        """Fetch ``self.url`` and return the page HTML.

        Returns:
            The response body as text on HTTP 200, ``None`` on a network
            error or any other status code.
        """
        try:
            response = requests.get(url=self.url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Network/timeout failures used to propagate; report and signal
            # failure to the caller instead.
            logger.exception("request to %s failed", self.url)
            return None
        if response.status_code == 200:
            return response.text
        logger.warning("unexpected status %s for %s",
                       response.status_code, self.url)
        return None

    def search(self):
        """Parse the fetched search page and print per-shop fields.

        Returns:
            The list of ``<li>`` result nodes, or an empty list when the
            page could not be fetched.  (The original crashed with a
            ``TypeError`` by feeding ``None`` to BeautifulSoup.)
        """
        content = self.parse_url()
        if content is None:
            logger.warning("no page content; skipping parse")
            return []
        parsed_response = BeautifulSoup(content, "lxml")
        shop_search = parsed_response.find(
            attrs={"class": "section Fix J-shop-search"}).find(
            attrs={"class": "content-wrap"}).find(
            attrs={"class": "shop-wrap"}).find(
            attrs={"class": "content"}).find(
            attrs={"class": "shop-list J_shop-list shop-all-list"}).find("ul").find_all("li")
        for item in shop_search:
            # Hoist the shared sub-nodes instead of re-searching for each field.
            txt = item.find(attrs={"class": "txt"})
            comment = txt.find(attrs={"class": "comment"})
            hospital_name = txt.find(attrs={"class": "tit"}).find("a").find(
                "h4").get_text()
            print(hospital_name)
            star_info = comment.find(attrs={"class": "nebula_star"}).find(
                attrs={"class": "star_icon"}).find_all("span")
            print("star_info:", star_info)
            review_num_info = comment.find(attrs={"class": "review-num"}).find("b")
            print("review_num_info:", review_num_info)
            meanprice_info = comment.find(attrs={"class": "mean-price"}).find("b")
            print("meanprice_info:", meanprice_info)
            print("-----------")
        return shop_search

    def woff_change(self, wofflist, TTG, woffdict):
        """Decode dianping's web-font obfuscated characters.

        Args:
            wofflist: iterable of characters, possibly containing private-use
                codepoints rendered through the custom woff font.
            TTG: glyph names from the font's glyph order (e.g. ``'unie023'``).
            woffdict: mapping of glyph name -> index into the glyph table.

        Returns:
            The decoded string, or the literal ``'编码错误'`` when decoding
            raises ``UnicodeDecodeError``.
        """
        try:
            # Character table matching the glyph order of dianping's fonts;
            # whitespace/newlines are layout only and get filtered out.
            woff_string = '''
            1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福
            人百餐茶务通味所山区门药银 农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建 '''
            woffs = [i for i in woff_string if i != '\n' and i != ' ']
            decoded = []
            for char in wofflist:
                # '\ue023' -> b'\\ue023' -> 'unie023', the font's glyph name.
                glyph = str(char.encode('raw_unicode_escape').replace(b'\\u', b'uni'), 'utf-8')
                if glyph in TTG:
                    # Reuse `glyph` — the original re-encoded the char a
                    # second time to build the very same key.
                    decoded.append(woffs[woffdict[glyph]])
                else:
                    # Not an obfuscated glyph; keep the character as-is.
                    decoded.append(char)
        except UnicodeDecodeError:
            return "编码错误"
        else:
            return ''.join(decoded)
if __name__ == '__main__':
    # Script entry point: build the crawler and fetch the search page.
    DianPintCraw().parse_url()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment