Commit ad65e200 authored by 李小芳's avatar 李小芳

update

parent 67b86bd3
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
from fontTools.ttLib import TTFont
logger = logging.getLogger(__name__)
class DianPintCraw(object):
    """Crawler for dianping.com keyword-search pages.

    Fetches the search-result page for one hard-coded keyword/city URL,
    extracts the shop list with BeautifulSoup, and can map dianping's
    anti-scraping webfont glyphs back to readable characters via
    ``woff_change``.
    """

    def __init__(self):
        # NOTE(review): the Cookie value is a captured login session and will
        # expire; requests with a stale cookie are typically answered with a
        # verification page instead of search results — confirm before relying
        # on this crawler.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            "Cookie": "fspop=test; cy=2; cye=beijing; _lxsdk_cuid=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _lxsdk=17ac7b3482cc8-0577482fde4f21-34647600-1fa400-17ac7b3482cc8; _hc.v=7cd93c95-3674-1de2-0725-2e8d4141c973.1626848053; s_ViewType=10; dplet=45b53ad04cb79c04c2e30bea98dca7ef; dper=8591feb7929077261e0c0702628cd4314faa13a74729c7e6480d13c3220c85e5b0f336a0b2af7450370e86f53958152509c44d579007ab941b3a66bc922cdf19cde4eecbdb3f94ef3a0532a955ea9e11803bbf18d01a29bad962ca22e13f6543; ll=7fd06e815b796be3df069dec7836c3df; ua=%E9%99%AA%E4%BD%A0%E6%90%9E%E6%80%AA; ctu=23034069fac8b78bdb78108ada1c10714737c4da63d46c011bfd4779f1daa177; cityid=2; switchcityflashtoast=1; default_ab=citylist%3AA%3A1%7Cindex%3AA%3A3; source=m_browser_test_33; Appshare2021_ab=shop%3AA%3A1%7Cmap%3AA%3A1%7Cshopphoto%3AA%3A1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1626862684,1627020606,1627041159,1627292689; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1627294126; _lxsdk_s=17ae233df3e-b4b-9f4-00d%7C%7C304",
            'Host': 'www.dianping.com',
            'Referer': 'http://www.dianping.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        }
        # Search URL: city id 2, keyword "水光针" (URL-encoded UTF-8).
        self.url = 'http://www.dianping.com/search/keyword/2/0_%E6%B0%B4%E5%85%89%E9%92%88'

    def parse_url(self):
        """GET the search page and return its HTML text, or None on a non-200 status."""
        # Timeout added: without one a stalled connection hangs the crawler forever.
        response = requests.get(url=self.url, headers=self.headers, timeout=30)
        content = response.text
        print(content)
        if response.status_code == 200:
            return content
        return None

    def search(self):
        """Parse the shop <li> nodes from the search page and print key fields.

        Prints each shop's name, star-rating spans, review count and mean
        price, then returns the raw list of result nodes. Returns None when
        the page fetch failed — the original passed None straight into
        BeautifulSoup and crashed with TypeError in that case.
        """
        content = self.parse_url()
        if content is None:
            return None
        parsed_response = BeautifulSoup(content, "lxml")
        shop_search = parsed_response.find(
            attrs={"class": "section Fix J-shop-search"}).find(
            attrs={"class": "content-wrap"}).find(
            attrs={"class": "shop-wrap"}).find(
            attrs={"class": "content"}).find(
            attrs={"class": "shop-list J_shop-list shop-all-list"}).find("ul").find_all("li")
        for item in shop_search:
            hospital_name = item.find(attrs={"class": "txt"}).find(
                attrs={"class": "tit"}).find("a").find("h4").get_text()
            print(hospital_name)
            star_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
                attrs={"class": "nebula_star"}).find(attrs={"class": "star_icon"}).find_all("span")
            print("star_info:", star_info)
            review_num_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
                attrs={"class": "review-num"}).find("b")
            print("review_num_info:", review_num_info)
            meanprice_info = item.find(attrs={"class": "txt"}).find(attrs={"class": "comment"}).find(
                attrs={"class": "mean-price"}).find("b")
            print("meanprice_info:", meanprice_info)
            print("-----------")
        return shop_search

    def woff_change(self, wofflist, TTG, woffdict):
        """Decode webfont-obfuscated characters back to plain text.

        Args:
            wofflist: iterable of characters; private-use codepoints in it are
                glyphs from dianping's custom .woff font.
            TTG: glyph-order names extracted from the font (e.g. 'unie0a1').
            woffdict: mapping of glyph name -> index into the glyph table.

        Returns:
            The decoded string, or the literal string "编码错误" when a
            character cannot be round-tripped through raw_unicode_escape.
        """
        try:
            # Character table matching the glyph order of dianping's webfont.
            # Whitespace is filtered out below, so the layout here is cosmetic.
            woff_string = '''
            1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福
            人百餐茶务通味所山区门药银 农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建 '''
            woffs = [i for i in woff_string if i != '\n' and i != ' ']
            woff_content = ''
            for char in wofflist:
                # '\ue0a1' -> b'\\ue0a1' -> 'unie0a1': the font's glyph name.
                text = str(char.encode('raw_unicode_escape').replace(b'\\u', b'uni'), 'utf-8')
                if text in TTG:
                    # The original recomputed the encoding a second time here;
                    # reuse `text` — same value, computed once.
                    content = woffs[woffdict[text]]
                else:
                    content = char
                woff_content += content
        except UnicodeDecodeError:
            return "编码错误"
        else:
            return woff_content

    # Example usage for decoding an address field (kept from the original,
    # translated):
    #   font = TTFont('3944c230.woff')
    #   glyph_names = font['cmap'].tables[0].ttFont.getGlyphOrder()[2:]
    #   glyph_index = {name: i for i, name in enumerate(glyph_names)}
    #   location = self.woff_change(address_text, glyph_names, glyph_index)
    #   location = re.sub(r'\s', '', location)
if __name__ == '__main__':
    # Manual smoke test: fetch the search page once and dump the raw HTML.
    crawler = DianPintCraw()
    crawler.parse_url()
import json
import logging
import smtplib
import sys
import time
import traceback
import datetime
import os
import random
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin
import gevent
logger = logging.getLogger(__name__)
......@@ -58,7 +55,6 @@ def send_email_tome():
def get_keynote_sentence(content):
try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
......@@ -78,12 +74,13 @@ def get_service_info(query="", city_name="", get_data_file=None):
break_flat = False
other_city_count = 0
for page in range(0, 3000, 10):
if break_flat == False and other_city_count < 100:
if break_flat == False and other_city_count < 10:
s = random.random()
time.sleep(s)
# time.sleep(s)
url = 'https://backend.igengmei.com/api/janus/search/v7/content?platform=iPhone&os_version=13.6.1&version=7.46.0&model=iphone%20X%20++&release=1&idfa=057F28DF-20B8-488F-A285-931367FCC110&idfv=74FE9CFB-DAD2-4379-B8F8-FC656F38BCA5&device_id=057F28DF-20B8-488F-A285-931367FCC110&uqid=47517624-F42B-469C-96EC-3BF936E44613&channel=App%20Store&app_name=gengmeiios&current_city_id={}&lat=39.98323941854652&lng=116.4880417854629&is_WiFi=1&hardware_model=iPhone12,1&ua=Mozilla/5.0%20(iPhone;%20CPU%20iPhone%20OS%2013_6_1%20like%20Mac%20OS%20X)%20AppleWebKit/605.1.15%20(KHTML,%20like%20Gecko)%20Mobile/15E148&sm_idfa=(null)&trace_id=2021/07/22/0956/53f8f1c10868&area_id=worldwide&count=10&is_first=1&is_gray=1&max_price=100000&min_price=0&offset={}&order_by=0&query={}&show_mode=1&size=10&tab_type=0'.format(
city_name, page, query)
response_res = requests.get(url, verify=False)
response_res = requests.get(url, verify=False, timeout=30)
time.sleep(1)
if response_res.status_code == 200 and response_res.text:
response = json.loads(response_res.text)
responseData = response.get("data", {}).get("cards")
......@@ -93,7 +90,6 @@ def get_service_info(query="", city_name="", get_data_file=None):
for service_data in item.get("feed", []):
city_id = service_data.get("hospital_info", {}).get("city_id")
if str(city_id) == str(city_name):
# print(service_data)
service_info = dict()
service_info['skuid'] = service_data.get("service_item_id")
service_info['美购id'] = service_data.get("service_id")
......@@ -123,9 +119,10 @@ def get_service_info(query="", city_name="", get_data_file=None):
service_info[
'链接'] = "https://m.igengmei.com/promotion/{}?sku_id={}&distribute_type=1&distribute_id=30775628&is_share=1".format(
service_info['美购id'], service_info['skuid'])
print(service_info)
get_data_file.write(json.dumps(service_info))
get_data_file.write("\n")
print("write success ", query, city_name)
else:
other_city_count += 1
......@@ -139,17 +136,13 @@ def get_service_info(query="", city_name="", get_data_file=None):
print(page, city_name, query, "本地已爬完")
def main(city_tag=""):
def main():
begin = time.time()
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
file_name = "gengmei_save_data_" + str(today) + city_tag + ".txt"
file_name = "test_gengmei_save_data_" + str(today) + ".txt"
get_data_file = open(file_name, "a+", encoding="utf-8")
cityIdMapping = {'北京': '328', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉',
'脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光',
......@@ -164,10 +157,24 @@ def main(city_tag=""):
'美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
# city_list = ["beijing", "shanghai", "guangzhou", "shenzhen", "hangzhou", "chengdu", "chongqing", "nanjing", "wuhan", "changsha", "zhengzhou", "xian"]
city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
city_list = [city_tag]
city_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市', '安庆市', '滁州市', '阜阳市', '宿州市', '六安地区', '亳州市', '池州市',
'宣城市', '福州市', '厦门市', '莆田市', '三明市', '泉州市', '漳州市', '南平市', '龙岩市', '宁德市', '韶关市', '东莞市', '中山市', '珠海市',
'佛山市', '汕头市', '江门市', '湛江市', '茂名市', '肇庆市', '惠州市', '梅州市', '河源市', '阳江市', '清远市', '潮州市', '揭阳市', '南宁市',
'柳州市', '桂林市', '梧州市', '北海市', '钦州市', '贵港市', '玉林市', '百色市', '贵阳市', '遵义市', '六盘水市', '铜仁地区', '毕节地区', '兰州市',
'张掖市', '平凉市', '庆阳市', '开封市', '洛阳市', '平顶山市', '安阳市', '新乡市', '焦作市', '濮阳市', '许昌市', '漯河市', '三门峡市', '南阳市',
'商丘市', '信阳市', '周口市', '驻马店市', '黄石市', '荆州市', '宜昌市', '襄阳市', '天门市', '十堰市', '荆门市', '孝感市', '黄冈市', '咸宁市',
'随州市', '仙桃市', '潜江市', '株洲市', '衡阳市', '益阳市', '邵阳市', '湘潭市', '岳阳市', '常德市', '郴州市', '永州市', '怀化市', '娄底市',
'石家庄市', '唐山市', '秦皇岛市', '邯郸市', '邢台市', '保定市', '张家口市', '承德市', '衡水市', '廊坊市', '沧州市', '哈尔滨市', '齐齐哈尔市',
'鸡西市', '鹤岗市', '大庆市', '佳木斯市', '七台河市', '牡丹江市', '绥化市', '海口市', '三亚市', '东方市', '苏州市', '无锡市', '常州市', '南通市',
'盐城市', '徐州市', '连云港市', '扬州市', '淮安市', '镇江市', '泰州市', '宿迁市', '南昌市', '景德镇市', '萍乡市', '九江市', '新余市', '鹰潭市',
'赣州市', '吉安市', '宜春市', '抚州市', '上饶市', '长春市', '吉林市', '四平市', '通化市', '白山市', '松原市', '沈阳市', '大连市', '鞍山市',
'抚顺市', '丹东市', '锦州市', '营口市', '阜新市', '辽阳市', '铁岭市', '朝阳市', '葫芦岛市', '呼和浩特市', '包头市', '赤峰市', '通辽市', '鄂尔多斯市',
'呼伦贝尔市', '巴彦淖尔市', '银川市', '西宁市', '自贡市', '绵阳市', '德阳市', '乐山市', '南充市', '攀枝花市', '泸州市', '广元市', '遂宁市', '内江市',
'眉山市', '宜宾市', '广安市', '达州市', '雅安市', '巴中市', '资阳市', '济南市', '青岛市', '枣庄市', '烟台市', '淄博市', '临沂市', '东营市',
'潍坊市', '济宁市', '泰安市', '威海市', '日照市', '德州市', '聊城市', '滨州市', '菏泽市', '宝鸡市', '咸阳市', '汉中市', '渭南市', '延安市',
'安康市', '太原市', '晋中市', '运城市', '大同市', '阳泉市', '长治市', '晋城市', '临汾市', '乌鲁木齐市', '石河子市', '昆明市', '曲靖市', '玉溪市',
'保山市', '昭通市', '丽江市', '宁波市', '温州市', '嘉兴市', '金华市', '台州市', '湖州市', '绍兴市', '舟山市', '衢州市', '丽水市', '台北市',
'台中市', '台北县']
for city_name in city_list:
for word in keywords:
......@@ -178,5 +185,4 @@ def main(city_tag=""):
if __name__ == '__main__':
args = sys.argv[1]
main(city_tag=args)
main()
......@@ -123,12 +123,14 @@ if __name__ == '__main__':
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
all_data = []
city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
# city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
city_list = ["gengmei_save_data_2021-07-27.txt", "gengmei_save_data_2021-07-28.txt",
"gengmei_save_data_2021-07-29.txt", "gengmei_save_data_2021-07-30.txt"]
for city_name in city_list:
file_name = "gengmei_save_data_2021-07-22" + city_name + ".txt"
if os.path.exists(file_name):
open_file = open(file_name, "r", encoding="utf-8")
print(file_name)
# file_name = "gengmei_save_data_2021-07-22" + city_name + ".txt"
if os.path.exists(city_name):
open_file = open(city_name, "r", encoding="utf-8")
print(city_name)
for item in open_file.readlines():
try:
data = json.loads(item.strip())
......
This diff is collapsed.
......@@ -57,7 +57,6 @@ def send_email_tome():
def get_keynote_sentence(content):
try:
content_list = []
ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
dr = re.compile(r"<[^>]+>", re.S)
str_re = dr.sub("", ss)
......@@ -98,8 +97,6 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"keyword": str(keyword),
"list_name": "sy_app_superlist_search_page",
"lver": "8.28.2",
# "menu1_id": "--Boundary+D46DCD61FE6FA268",
# "menu2_id": "--Boundary+D46DCD61FE6FA268",
"page": page,
"page_size": 20,
"push_app_id": 42,
......@@ -107,7 +104,6 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"s_mei_device_id": "20200317131719d8bcbc37c54be511421dc3ebf7f1d0a801036b566bd47092",
"s_meng_device_id": "D2VCzq4o472Ur7QtdVY6RlcjO6h3455JlJ+OC39JcQC7sX6a",
"schemecard": "--Boundary+D46DCD61FE6FA268",
# "sub_tab": "--Boundary+D46DCD61FE6FA268",
"sys": 1,
"tab": "mix",
"uid": "48804194",
......@@ -184,23 +180,20 @@ def main(city_tag=""):
cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
'长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
"""
'瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针',
'眼综合', '隆鼻',
"""
keywords = ['菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸',
'微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
'双眼皮修复',
'欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
'欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨',
'艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
'开眼角',
keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉',
'脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
'美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光',
'面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
'黄金微针', '隆胸', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针',
'熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮',
'菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹',
'鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升',
'm22', '鼻翼缩小', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇',
'水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
'除皱', '颧骨', '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针',
'美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
'海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city_list = [city_tag]
all_skuids = []
......
This diff is collapsed.
import csv
def read_cika_info(path="soyoung_service_cika.csv"):
    """Load the per-service course-card (次卡) price dicts dumped by the crawler.

    Each line of *path* is the repr of a dict mapping a service id to its
    {course_name: price} table. All lines are merged into one dict keyed by
    service id and returned.

    Args:
        path: input file; defaults to the original hard-coded filename, so
            existing zero-argument callers are unaffected.

    Returns:
        dict: merged {service_id: cika_info} mapping.
    """
    all_cika_title = {}
    # `with` guarantees the handle is closed even if a line fails to parse
    # (the original leaked it on any exception).
    with open(path, "r", encoding="utf-8") as cika_file:
        for item in cika_file.readlines():
            # SECURITY: eval() executes arbitrary code from the file. The file
            # is produced by our own crawler, but ast.literal_eval would be
            # safer — flagged for review rather than changed silently.
            cika = eval(item)
            for key, values in cika.items():
                all_cika_title[key] = values
    print(all_cika_title)
    return all_cika_title
def np_write_csv_data():
    """Join course-card prices onto each row of soyoung_service.csv.

    Reads soyoung_service.csv, looks up each row's service id (column 2) in
    the mapping returned by read_cika_info(), and appends every matching row
    (with its cika info attached) to soyoung_service_write_cika.csv.
    """
    all_cika_title = read_cika_info()
    # `with` closes both handles even on error (original never closed the
    # reader and leaked the writer on exceptions).
    with open("soyoung_service_write_cika.csv", "a+", encoding="utf-8") as write_file, \
            open("soyoung_service.csv", encoding='utf-8') as f:
        reader = csv.reader(f)
        header_row = next(reader)
        print(header_row)
        for row in reader:
            service_id = row[2]
            cika_info = all_cika_title.get(service_id)
            if cika_info:
                print('cika_info:', cika_info)
                # BUG FIX: the original wrote str(row.append(cika_info)) —
                # list.append returns None, so every output line was the
                # literal text "None". Append first, then serialize the row.
                row.append(cika_info)
                write_file.write(str(row))
                write_file.write("\n")
if __name__ == '__main__':
    # Entry point: merge course-card data into the service CSV.
    np_write_csv_data()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -36,7 +36,7 @@ def send_email_tome():
content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
text_apart = MIMEText(content, 'plain', "utf-8")
zip_file_week = 'soyoung_service.csv.gz'
zip_file_week = 'soyoung_service_other.csv.gz'
zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
......@@ -87,12 +87,13 @@ if __name__ == '__main__':
nowday = datetime.datetime.now()
today = str(nowday).split()[0]
all_data = []
city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city_list = ["save_data_2021-07-27.txt", "save_data_2021-07-28.txt", "save_data_2021-07-29.txt"]
for city_name in city_list:
file_name = "save_data_" + today + city_name + ".txt"
print(file_name)
if os.path.exists(file_name):
open_file = open(file_name, "r", encoding="utf-8")
# file_name = "save_data_" + today + ".txt"
# print(file_name)
if os.path.exists(city_name):
open_file = open(city_name, "r", encoding="utf-8")
for item in open_file.readlines():
try:
data = json.loads(item.strip())
......@@ -103,7 +104,7 @@ if __name__ == '__main__':
if len(all_data):
res = pd.DataFrame(all_data)
res.to_csv("soyoung_service.csv.gz", compression='gzip', index=False, encoding="gb18030")
res.to_csv("soyoung_service_other.csv.gz", compression='gzip', index=False, encoding="gb18030")
send_email_tome()
print(time.time() - begin)
......
This diff is collapsed.
0,666426,108536,北京俏中关医疗美容门诊部,9800,1972,5,【除皱瘦脸】美国进口标准装【除皱瘦脸】瘦脸针100U·足量·正品 进口/提升/下颌线,329,"['付尾款,最高立减068', '尾款满100减8']",[],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap666426
1,84880,82258,北京画美医疗美容医院,1680,551,4.8,【注射瘦脸】除皱瘦脸国产80-100u 限购一次 正品足量 正品可验 小V脸 去咬肌 咬肌肥大瘦脸针,2321,[],['新人首单立减0629'],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap84880
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import requests
import gevent
from gevent import monkey
import re
from pypinyin import lazy_pinyin
import time
import random
# Monkey-patch the stdlib (sockets, time.sleep, ...) so blocking calls yield
# to other gevent greenlets; must run before the patched modules are used.
monkey.patch_all()
def get_service_info(spu_id, url, ids_bug):
    """Fetch one soyoung service detail page and scrape its course-card prices.

    Args:
        spu_id: service id, used only for failure bookkeeping in *ids_bug*.
        url: detail-page URL; "?tab=0&rj=0" is appended before fetching.
        ids_bug: shared list of ids whose last fetch failed; mutated in place
            (removed on success, appended on failure).

    Returns:
        dict mapping course name -> price string; may also contain the key
        '门市价' whose value is the list of matched list-price spans. Empty
        when the request did not return HTTP 200.
    """
    # Randomized politeness delay before each request.
    time.sleep(random.uniform(1, 2))
    url = url + "?tab=0&rj=0"
    headers = {
        "Cookie": "smidV2=202108111914527c82de7c31a1ebbb6f65a12dfd5021de00aefc1da947e1af0; _ga=GA1.2.1705589232.1628680492; _gid=GA1.2.1191277434.1628680492; PHPSESSID=e81ba73301b430097db1c9599826880a; __usersign__=1628680488549938593",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 AliApp(DingTalk/6.0.23) com.laiwang.DingTalk/15108471 Channel/201200 language/zh-Hans-CN UT4Aplus/0.0.6 WK",
        "Accept-Language": "zh-cn",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "DingTalk-Flag": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Host": "m.soyoung.com"
    }
    # SECURITY/NOTE(review): verify=False disables TLS certificate checking —
    # kept to preserve behavior, but confirm it is really required.
    resp = requests.get(url, headers=headers, verify=False, timeout=30)
    print("url:%s,status:%s" % (url, resp.status_code))
    sub_title_dict = {}
    if resp.status_code != 200:
        # Remember the failure and back off for ten minutes.
        ids_bug.append(spu_id)
        time.sleep(600)
        return sub_title_dict
    if spu_id in ids_bug:
        ids_bug.remove(spu_id)
    pair_pattern = re.compile(
        '<span class="sub-title">(.*?)</span><span class=".*?">(.*?)</span>')
    for raw_times, raw_price in pair_pattern.findall(resp.text):
        # Price cell may contain trailing markup; keep only the text before it.
        sub_title_dict[raw_times.strip()] = raw_price.strip().split("<")[0]
    x_p_o = re.compile('<span class="x-p-o">(.*?)</span>').findall(resp.text)
    if x_p_o:
        sub_title_dict['门市价'] = x_p_o
    return sub_title_dict
def gevent_test():
    """Spawn one greenlet per (city, keyword) pair and wait for all of them."""
    tasks = []
    for city_name in ['合肥市', '芜湖市', '蚌埠市', '淮南市']:
        for word in ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉']:
            tasks.append(
                # NOTE(review): get_service_info in THIS file takes
                # (spu_id, url, ids_bug); the query=/city_name= keyword
                # arguments match a *different* get_service_info (the gengmei
                # crawler's) and will raise TypeError when the greenlet runs —
                # confirm which function was intended.
                gevent.spawn(get_service_info, query=word, city_name="".join(lazy_pinyin(city_name)))
            )
    print(tasks)
    gevent.joinall(tasks)
def get_cika_info_to_csv():
    """Resume-scrape course-card (次卡) prices for services listed in soyoung_service.csv.

    For each row past the hard-coded resume index, fetches the service's
    detail page via get_service_info and appends the scraped price dict
    (one repr'd dict per line) to soyoung_service_cika.csv.
    """
    read_file = open("soyoung_service.csv", "r", encoding="utf-8")
    write_file = open("soyoung_service_cika.csv", "a+", encoding="utf-8")
    all_cika_title = list()  # every course name seen, for the summary print
    ids_bug = []             # failure bookkeeping shared with get_service_info
    have_read_service = []   # service ids already fetched in this run
    for item in read_file.readlines():
        try:
            cika_price_dict = dict()
            print("index:", item.strip().split(",")[0])
            # Resume point: rows up to index 33755 were processed by an
            # earlier run and are skipped.
            # NOTE(review): naive split(",") breaks on quoted CSV fields that
            # contain commas (the data rows do contain them), which can shift
            # column indices; csv.reader would be safer — confirm the format.
            if int(item.strip().split(",")[0]) > 33755:
                service_id = item.strip().split(",")[2]
                url = item.strip().split(",")[-1]
                if service_id in have_read_service:
                    pass
                else:
                    have_read_service.append(service_id)
                    sub_title_dict = get_service_info(str(service_id), url, ids_bug)
                    if sub_title_dict:
                        all_cika_title.extend(sub_title_dict.keys())
                        cika_price_dict[service_id] = sub_title_dict
                        print(cika_price_dict)
                        write_file.write(str(cika_price_dict))
                        write_file.write("\n")
        except Exception as e:
            # Best-effort: log the bad row and keep crawling the rest.
            print(e)
            pass
    print(all_cika_title)
    read_file.close()
    write_file.close()
def read_cika_info(path="soyoung_service_cika.csv"):
    """Load the per-service course-card (次卡) price dicts dumped by the crawler.

    Each line of *path* is the repr of a dict mapping a service id to its
    {course_name: price} table. All lines are merged into one dict keyed by
    service id and returned.

    Args:
        path: input file; defaults to the original hard-coded filename, so
            existing zero-argument callers are unaffected.

    Returns:
        dict: merged {service_id: cika_info} mapping.
    """
    all_cika_title = {}
    # `with` guarantees the handle is closed even if a line fails to parse
    # (the original leaked it on any exception).
    with open(path, "r", encoding="utf-8") as cika_file:
        for item in cika_file.readlines():
            # SECURITY: eval() executes arbitrary code from the file. The file
            # is produced by our own crawler, but ast.literal_eval would be
            # safer — flagged for review rather than changed silently.
            cika = eval(item)
            for key, values in cika.items():
                all_cika_title[key] = values
    print(all_cika_title)
    return all_cika_title
def write_service_info_to_csv():
    """Append scraped course-card prices as extra columns per service row.

    Reads soyoung_service.csv, looks up each row's service id (column 2) in
    read_cika_info()'s mapping, appends one column per known course type, and
    writes matching rows to soyoung_service_write_cika.csv.
    """
    # BUG FIX: the original list read «'20次' '门市价'» — the missing comma
    # made Python concatenate the two adjacent literals into the single item
    # '20次门市价', so neither the 20-course price nor the list price column
    # was ever emitted.
    all_cika_sub = ['单次', '2次', '3次', '4次', '5次', '6次', '8次', '9次',
                    '10次', '20次', '门市价']
    all_cika_title = read_cika_info()
    # `with` closes both handles even when a row raises (the original only
    # closed them on the happy path past the loop).
    with open("soyoung_service.csv", "r", encoding="utf-8") as read_file, \
            open("soyoung_service_write_cika.csv", "w", encoding="utf-8") as write_file:
        for item in read_file.readlines():
            try:
                # NOTE(review): naive split(",") breaks on quoted CSV fields
                # containing commas; csv.reader would be safer — confirm format.
                service_info = item.strip().split(",")
                print("service_info", service_info)
                service_id = service_info[2]
                cika_info = all_cika_title.get(service_id)
                if cika_info:
                    print('cika_info:', cika_info)
                    for cika in all_cika_sub:
                        service_info.append(cika_info.get(cika, ""))
                    write_file.write(str(service_info))
                    write_file.write("\n")
            except Exception as e:
                # Best-effort: log the bad row and continue with the rest.
                print("eeeeee:", e)
                pass
if __name__ == '__main__':
    # Entry point: crawl course-card prices for every service in the CSV.
    get_cika_info_to_csv()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment