backend / crawler

Commit 827549ca authored Jul 23, 2021 by 李小芳
Commit message: add
Parent: bb6ce850
Showing 6 changed files with 335 additions and 148 deletions
app_gengmei.py (dev/xinyang_ask_tag/app_gengmei.py): +107 -87
app_soyoung_by_cityname.py (dev/xinyang_ask_tag/app_soyoung_by_cityname.py): +38 -18
app_soyoung_v1.py (dev/xinyang_ask_tag/app_soyoung_v1.py): +20 -23
city.py (dev/xinyang_ask_tag/city.py): +3 -5
save_gengmei_data_to_csv.py (dev/xinyang_ask_tag/save_gengmei_data_to_csv.py): +148 -0
save_soyoung_data_to_csv.py (dev/xinyang_ask_tag/save_soyoung_data_to_csv.py): +19 -15
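All six scripts share one pipeline: each app_*.py crawler appends one JSON object per line to a dated .txt file, and the save_*_data_to_csv.py scripts load those lines into a pandas DataFrame, export a gb18030-encoded CSV, and (in most of them) mail it via send_email_tome(). A minimal sketch of that shared load-and-export step, not part of the commit; the helper name jsonl_to_csv and the example file names are illustrative only:

import json
import pandas as pd

def jsonl_to_csv(jsonl_path, csv_path):
    # Hypothetical helper mirroring the save_* scripts: one JSON object per line,
    # bad lines silently skipped (the scripts use a bare try/except-pass).
    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    if rows:
        # gb18030 matches the encoding the scripts use for their CSV output.
        pd.DataFrame(rows).to_csv(csv_path, encoding="gb18030")

# e.g. jsonl_to_csv("gengmei_save_data_2021-07-23北京.txt", "gengmei_service.csv")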
dev/xinyang_ask_tag/app_gengmei.py
import json
import logging
import smtplib
import socket
import sys
import time
import traceback
import datetime
...
@@ -11,10 +11,11 @@ from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
from pypinyin import lazy_pinyin

logger = logging.getLogger(__name__)
...
@@ -55,108 +56,127 @@ def send_email_tome():
        logger.error("catch exception,main:%s" % traceback.format_exc())


def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):


def get_keynote_sentence(content):
    try:
        content_list = []
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        dr = re.compile(r"<[^>]+>", re.S)  # strip HTML tags from the title
        str_re = dr.sub("", ss)
        para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-character sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        para = para.rstrip()  # drop any trailing \n at the end of the paragraph
        return para
    except:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []
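For orientation, a small usage sketch of get_keynote_sentence, not part of the commit and with a made-up input string: it strips HTML tags and inserts a line break after each Chinese sentence-ending mark.

# Hypothetical input, for illustration only.
title = "<em>瘦脸针</em>瘦脸套餐!进口肉毒素。限时特惠?"
print(get_keynote_sentence(title))
# Expected output, assuming the regexes above:
# 瘦脸针瘦脸套餐!
# 进口肉毒素。
# 限时特惠?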
def get_service_info(query="", city_name="", get_data_file=None):
    print("get_service_info")
    service_info_list = []
    for page in range(1, 1000, 10):
        url = "https://backend.igengmei.com/api/janus/search/v7/content?platform=iPhone&os_version=13.6.1&version=7.46.0&model=iphone%20X%20++&release=1&idfa=057F28DF-20B8-488F-A285-931367FCC110&idfv=74FE9CFB-DAD2-4379-B8F8-FC656F38BCA5&device_id=057F28DF-20B8-488F-A285-931367FCC110&uqid=47517624-F42B-469C-96EC-3BF936E44613&channel=App%20Store&app_name=gengmeiios&current_city_id={}&lat=39.98320387964299&lng=116.4880504620152&is_WiFi=1&hardware_model=iPhone12,1&ua=Mozilla/5.0%20(iPhone;%20CPU%20iPhone%20OS%2013_6_1%20like%20Mac%20OS%20X)%20AppleWebKit/605.1.15%20(KHTML,%20like%20Gecko)%20Mobile/15E148&sm_idfa=(null)&trace_id=2021/07/21/1104/8085fa19e028&area_id=worldwide&count=10&is_first=0&is_gray=1&max_price=100000&min_price=0&offset={}&order_by=0&query={}&show_mode=1&size=10&tab_type=0".format(city_name, page, keyword)
        print(url)
        s = random.random()
        time.sleep(s)
        response_res = requests.post(url, data, verify=False)
        if response_res.status_code == 200 and response_res.text:
            response = json.loads(response_res.text)
            responseData = response.get("responseData", {}).get("data")
            for item in responseData:
                if item.get("type") == "feed_area":
                    if item.get("items", {}).get("feed_list", []):
                        for data in item.get("items", {}).get("feed_list", []):
                            if data.get("type") == "feed_shop_diallel":
                                for service in data.get("items", []):
                                    service_data = service.get("data")
                                    if str(service_data.get("district_2")) == str(city_id):
                                        service_info = dict()
                                        service_info['skuid'] = service_data.get("pid")
                                        service_info['美购id'] = service_data.get("spu_id")
                                        # service_info['医生名'] = service_data.get("doctor_name")
                                        service_info['医院名称'] = service_data.get("hospital_name")
                                        service_info['sku原价'] = service_data.get("price_origin")
                                        service_info['sku活动价'] = service_data.get("price_online")
                                        service_info['机构等级'] = service_data.get("avg_score")
                                        service_info['美购名称'] = service_data.get("title")
                                        service_info['销量'] = service_data.get("order_cnt")
                                        icon_data = service_data.get("icons", [])
                                        service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
                                        service_info['可领取预约金优惠券'] = [service_data.get("new_user_text", "")] if service_data.get("new_user_text", "") else []
                                        for item in icon_data:
                                            if "预约金满" in item:
                                                service_info['可领取预约金优惠券'].append(item)
                                            elif "尾款满" in item:
                                                service_info['可用尾款券'].append(item)
                                        service_info['query词'] = keyword
                                        service_info['城市'] = city_name
                                        service_info['平台'] = "新氧"
                                        service_info['链接'] = "https://y.soyoung.com/cp{}".format(service_info['skuid'])
                                        print(service_info)
                                        if service_data.get("pid") not in all_skuids:
                                            get_data_file.write(json.dumps(service_info))
                                            get_data_file.write("\n")
                else:
                    print("break")
                    break
    break_flat = False
    other_city_count = 0
    for page in range(0, 3000, 10):
        if break_flat == False and other_city_count < 100:
            s = random.random()
            time.sleep(s)
            url = 'https://backend.igengmei.com/api/janus/search/v7/content?platform=iPhone&os_version=13.6.1&version=7.46.0&model=iphone%20X%20++&release=1&idfa=057F28DF-20B8-488F-A285-931367FCC110&idfv=74FE9CFB-DAD2-4379-B8F8-FC656F38BCA5&device_id=057F28DF-20B8-488F-A285-931367FCC110&uqid=47517624-F42B-469C-96EC-3BF936E44613&channel=App%20Store&app_name=gengmeiios&current_city_id={}&lat=39.98323941854652&lng=116.4880417854629&is_WiFi=1&hardware_model=iPhone12,1&ua=Mozilla/5.0%20(iPhone;%20CPU%20iPhone%20OS%2013_6_1%20like%20Mac%20OS%20X)%20AppleWebKit/605.1.15%20(KHTML,%20like%20Gecko)%20Mobile/15E148&sm_idfa=(null)&trace_id=2021/07/22/0956/53f8f1c10868&area_id=worldwide&count=10&is_first=1&is_gray=1&max_price=100000&min_price=0&offset={}&order_by=0&query={}&show_mode=1&size=10&tab_type=0'.format(city_name, page, query)
            response_res = requests.get(url, verify=False)
            if response_res.status_code == 200 and response_res.text:
                response = json.loads(response_res.text)
                responseData = response.get("data", {}).get("cards")
                for item in responseData:
                    if item.get("title") == "商品":
                        if len(item.get("feed", [])):
                            for service_data in item.get("feed", []):
                                city_id = service_data.get("hospital_info", {}).get("city_id")
                                if str(city_id) == str(city_name):
                                    # print(service_data)
                                    service_info = dict()
                                    service_info['skuid'] = service_data.get("service_item_id")
                                    service_info['美购id'] = service_data.get("service_id")
                                    service_info['医生名称'] = service_data.get("doctor_info", {}).get("name")
                                    service_info['医院名称'] = service_data.get("doctor_hospital_name")
                                    service_info['sku原价'] = service_data.get("price_info", {}).get("original_payment_desc")
                                    service_info['sku活动价'] = service_data.get("price_info", {}).get("total_payment_desc")
                                    # service_info['机构等级'] = service_data.get("avg_score")
                                    service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
                                    service_info['销量'] = service_data.get("sell_num_desc", 0)
                                    service_info['可用尾款券'] = []
                                    service_info['可领取预约金优惠券'] = []
                                    icon_data = service_data.get("sales_info", [])
                                    # print(icon_data)
                                    for sales_item in icon_data:
                                        if int(sales_item.get("type")) == 1 or int(sales_item.get("type")) == 3:
                                            service_info['可用尾款券'].append(sales_item.get("desc"))
                                        elif int(sales_item.get("type")) == 2:
                                            service_info['可领取预约金优惠券'].append(sales_item.get("desc"))
                                        else:
                                            pass
                                    service_info['query词'] = query
                                    service_info['城市'] = city_name
                                    service_info['平台'] = "更美"
                                    service_info['链接'] = "https://m.igengmei.com/promotion/{}?sku_id={}&distribute_type=1&distribute_id=30775628&is_share=1".format(service_info['美购id'], service_info['skuid'])
                                    print(service_info)
                                    get_data_file.write(json.dumps(service_info))
                                    get_data_file.write("\n")
                                else:
                                    other_city_count += 1
                    else:
                        print("break")
                        break_flat = True
                        break
            else:
                print(city_name, query, "爬取失败")
        else:
            # if response_res
            print(city_id, keyword, "fail or end")
    print(page, city_name, query, "本地已爬完")


if __name__ == '__main__':
def main(city_tag=""):
    begin = time.time()
    nowday = datetime.datetime.now()
    today = str(nowday).split()[0]
    file_name = "gengmei_save_data_" + str(today) + ".txt"
    file_name = "gengmei_save_data_" + str(today) + city_tag + ".txt"
    get_data_file = open(file_name, "a+", encoding="utf-8")
    cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
    cityIdMapping = {'北京': '328', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
                     '长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
    keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
                '美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
                '黄金微针', '隆胸', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸',
                '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子',
                '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹',
                '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉',
                '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
    keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
                '美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
                '黄金微针', '隆胸', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸',
                '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子',
                '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹',
                '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉',
                '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角',
                '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
    city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
    all_skuids = []
    # city_list = ["beijing", "shanghai", "guangzhou", "shenzhen", "hangzhou", "chengdu", "chongqing", "nanjing", "wuhan", "changsha", "zhengzhou", "xian"]
    city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
    city_list = [city_tag]
    for city_name in city_list:
        city_id = cityIdMapping.get(city_name)
        for word in keywords:
            if city_name == "北京" and word not in ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针']:
                get_service_info(city_id=city_id, keyword=word, city_name=city_name, all_skuids=all_skuids, get_data_file=get_data_file)
            get_service_info(query=word, city_name="".join(lazy_pinyin(city_name)), get_data_file=get_data_file)
    get_data_file.close()
    print(time.time() - begin)
    all_data = []
    if os.path.exists(file_name):
        open_file = open(file_name, "r", encoding="utf-8")
        for item in open_file.readlines():
            all_data.append(json.loads(item))
        open_file.close()
        res = pd.DataFrame(all_data)
        res.to_csv("gengmei_result1.csv", encoding="gb18030")
        send_email_tome()
    print(time.time() - begin)


if __name__ == '__main__':
    args = sys.argv[1]
    main(city_tag=args)
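The new __main__ block takes a single city name from sys.argv[1] and passes it to main(city_tag=...), so each run crawls one city and writes one gengmei_save_data_<date><city>.txt file. A minimal sketch of driving all twelve cities that save_gengmei_data_to_csv.py later aggregates; this driver is not part of the commit:

# Hypothetical driver, for illustration only: one crawler process per city.
import subprocess
import sys

cities = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]

for city in cities:
    # Equivalent to running: python app_gengmei.py 北京
    subprocess.run([sys.executable, "app_gengmei.py", city], check=False)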
dev/xinyang_ask_tag/app_soyoung_by_cityname.py
...
@@ -11,7 +11,7 @@ from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
import re
import pandas as pd
import requests
from lxml import etree
...
@@ -55,6 +55,23 @@ def send_email_tome():
        logger.error("catch exception,main:%s" % traceback.format_exc())


def get_keynote_sentence(content):
    try:
        content_list = []
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
        para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-character sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        para = para.rstrip()  # drop any trailing \n at the end of the paragraph
        return para
    except:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []


def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_data_file=None):
    print("get_service_info")
    url = 'https://api.soyoung.com/v8/superList/index'
...
@@ -112,7 +129,8 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
                        if data.get("type") == "feed_shop_diallel":
                            for service in data.get("items", []):
                                service_data = service.get("data")
                                if str(service_data.get("district_2")) == str(city_id):
                                district_2 = service_data.get("district_2")
                                if str(district_2) == str(city_id):
                                    service_info = dict()
                                    service_info['skuid'] = service_data.get("pid")
                                    service_info['美购id'] = service_data.get("spu_id")
...
@@ -121,7 +139,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
                                    service_info['sku原价'] = service_data.get("price_origin")
                                    service_info['sku活动价'] = service_data.get("price_online")
                                    service_info['机构等级'] = service_data.get("avg_score")
                                    service_info['美购名称'] = service_data.get("title")
                                    service_info['美购名称'] = get_keynote_sentence(service_data.get("title"))
                                    service_info['销量'] = service_data.get("order_cnt")
                                    icon_data = service_data.get("icons", [])
                                    service_info['可用尾款券'] = service_data.get("wei_kuan_list", [])
...
@@ -137,7 +155,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
                                    service_info['query词'] = keyword
                                    service_info['城市'] = city_name
                                    service_info['平台'] = "新氧"
                                    service_info['链接'] = "https://y.soyoung.com/cp{}".format(service_info['skuid'])
                                    service_info['链接'] = "https://m.soyoung.com/normal/cpwap{}".format(service_info['skuid'])
                                    print(service_info)
                                    if service_data.get("pid") not in all_skuids:
...
@@ -152,7 +170,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
        else:
            print(city_id, keyword, "爬取失败")
    else:
        print(page, city_id, keyword, "本地已爬完")
    print(page, city_id, keyword, "本地已爬完")


def main(city_tag=""):
...
@@ -166,19 +184,21 @@ def main(city_tag=""):
    cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
                     '长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
    # keywords = ['瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
    #             '美白针',
    #             '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发',
    #             '黄金微针', '隆胸',
    #             '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针',
    #             '双眼皮修复',
    #             '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小',
    #             '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登',
    #             '除皱', '颧骨',
    #             '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针',
    #             '开眼角',
    #             '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
    keywords = ['欧洲之星fotona4d']
    """
    '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕', '超声刀',
    '美白针',
    '眼综合', '隆鼻',
    """
    keywords = ['菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒', '超皮秒', '植发', '黄金微针', '隆胸',
                '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提', '瘦脸', '埋线双眼皮',
                '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮', '颧骨内推', '鼻子', '光子嫩肤m22',
                '下颌缘提升', 'm22', '鼻翼缩小', '欧洲之星fotona4d', '自体脂肪全面部填充', '玻尿酸丰唇', '水光', '嗨体祛颈纹', '假体隆胸',
                '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '抽脂', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白',
                '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '除皱针', '开眼角', '海菲秀', '假体下巴', '刷酸', '泪沟', '拉皮', '全身吸脂',
                '缩鼻翼']
    # city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
...
dev/xinyang_ask_tag/app_soyoung_v1.py
...
@@ -28,7 +28,7 @@ def send_email_tome():
        content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")
        zip_file_week = 'result1.csv'
        zip_file_week = 'soyoung_result.csv'
        zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
...
@@ -57,7 +57,6 @@ def send_email_tome():
def get_keynote_sentence(content):
    try:
        content_list = []
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
...
@@ -172,35 +171,33 @@ if __name__ == '__main__':
    begin = time.time()
    nowday = datetime.datetime.now()
    today = str(nowday).split()[0]
    file_name = "save_data_" + str(today) + ".txt"
    file_name = "soyoung_save_data_" + str(today) + ".txt"
    get_data_file = open(file_name, "a+", encoding="utf-8")
    cityIdMapping = {'北京': '1', '上海': '9', '重庆': '22', '广州市': '289', '深圳市': '291', '郑州市': '240', '武汉市': '258',
                     '长沙市': '275', '南京市': '162', '成都市': '385', '西安市': '438', '杭州市': '175'}
    # '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针',
    # '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合',
    # '瘦肩针','下颌角', '线雕', '超声刀', '美白针',
    # '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提',
    # '点阵激光', '面部吸脂','开内眼角', '嗨体', '牙齿矫正',
    # '皮秒', '超皮秒','植发', '黄金微针', '隆胸',
    # '微针', '光子嫩肤', '祛斑','小气泡', '嗨体熊猫针',
    # '熊猫针', '果酸焕肤', '自体脂肪隆胸', '7D聚拉提','瘦脸',
    # '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充',
    # '溶脂针', '法令纹', '鼻基底','全切双眼皮', '颧骨内推',
    # '鼻子', '抽脂', '光子嫩肤m22', '下颌缘提升', 'm22',
    # '鼻翼缩小', 'fotona4d欧洲之星', '自体脂肪全面部填充', '玻尿酸丰唇', '除皱针',
    # '水光', '嗨体祛颈纹','假体隆胸', '英诺小棕瓶', '黄金微雕',
    # '眼袋', '乔雅登',
    keywords = ['除皱', '颧骨', '艾莉薇', '瘦腿', '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑',
                '瘦脸针', '双眼皮', '光子嫩肤', '吸脂', '水光针', '玻尿酸', '热玛吉', '脱毛', '瘦腿针', '鼻综合', '瘦肩针', '下颌角', '线雕',
                '超声刀', '美白针', '眼综合', '隆鼻', '菲洛嘉', '下巴', '热拉提', '点阵激光', '面部吸脂', '开内眼角', '嗨体', '牙齿矫正', '皮秒',
                '超皮秒', '植发', '黄金微针', '隆胸', '微针', '光子嫩肤', '祛斑', '小气泡', '嗨体熊猫针', '熊猫针', '果酸焕肤', '自体脂肪隆胸',
                '7D聚拉提', '瘦脸', '埋线双眼皮', '菲洛嘉水光针', '双眼皮修复', '欧洲之星', '脂肪填充', '溶脂针', '法令纹', '鼻基底', '全切双眼皮',
                '颧骨内推', '鼻子', '抽脂', '光子嫩肤m22', '下颌缘提升', 'm22', '鼻翼缩小', 'fotona4d欧洲之星', '自体脂肪全面部填充', '玻尿酸丰唇',
                '除皱针', '水光', '嗨体祛颈纹', '假体隆胸', '英诺小棕瓶', '黄金微雕', '眼袋', '乔雅登', '除皱', '颧骨', '艾莉薇', '瘦腿',
                '玻尿酸丰下巴', '纹眉', '伊肤泉微针', '美白', '厚唇改薄', '面部线雕', '祛疤', '伊婉V', '超皮秒祛斑', '开眼角', '海菲秀', '假体下巴',
                '刷酸', '泪沟', '拉皮', '全身吸脂', '缩鼻翼']
    city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
    city_list = ["北京", "" "", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
    all_skuids = []
    for city_name in city_list:
...
@@ -221,6 +218,6 @@ if __name__ == '__main__':
    open_file.close()
    res = pd.DataFrame(all_data)
    res.to_csv("result1.csv", encoding="gb18030")
    res.to_csv("soyoung_result.csv", encoding="gb18030")
    send_email_tome()
    print(time.time() - begin)
dev/xinyang_ask_tag/city.py
...
@@ -3947,15 +3947,13 @@ city = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市",
for item in city_info:
    if "level" in item.keys():
        cityId = item.get("id")
        cityName = item.get("name")
        if cityName in city:
            cityId_mapping[cityName] = cityId
        city_Name = item.get("name")
        cityId_mapping[city_Name] = []
        if 'son' in item.keys():
            for level2Item in item.get("son", []):
                cityId = level2Item.get("id")
                cityName = level2Item.get("name")
                if cityName in city:
                    cityId_mapping[cityName] = cityId
                cityId_mapping[city_Name].append(cityName)

print(cityId_mapping)

import datetime
...
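For orientation, a hedged illustration of the data shape city.py appears to walk; the sample values below are made up and only the field names follow the .get() calls above: city_info looks like a list of province-level dicts, each optionally carrying a 'son' list of city dicts, and the loop builds both name-to-id entries for whitelisted cities and a province-to-city-names index.

# Hypothetical input, for illustration only.
city_info = [
    {"id": "330000", "name": "浙江省", "level": 1,
     "son": [{"id": "175", "name": "杭州市"}, {"id": "176", "name": "宁波市"}]},
]
city = ["杭州市"]
cityId_mapping = {}
# After running the loop above, cityId_mapping would look roughly like:
# {"浙江省": ["杭州市", "宁波市"], "杭州市": "175"}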
dev/xinyang_ask_tag/save_gengmei_data_to_csv.py (new file, mode 100644)
import json
import logging
import smtplib
import socket
import time
import traceback
import datetime
import os
import sys
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formataddr
from urllib import error
import requests
# import the exception classes from the requests.exceptions module
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
from socket import timeout
import zipfile
from retrying import retry
import pandas as pd
import requests
from lxml import etree

logger = logging.getLogger(__name__)


def send_email_tome():
    try:
        from_addrs = 'lixiaofang@igengmei.com'
        password = 'EzJzSRyEG4Jibuy9'
        toaddrs = "lixiaofang@igengmei.com"
        content = '爬取更美热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")
        zip_file_week = 'gengmei_service.csv'
        zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
        m = MIMEMultipart()
        m.attach(text_apart)
        m.attach(zip_apart_week)
        m['From'] = formataddr(("李小芳", from_addrs))
        m["To"] = formataddr(("李小芳", toaddrs))
        m['Subject'] = '新氧商品信息'
        try:
            server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
            server.login(from_addrs, password)
            server.sendmail(from_addrs, [toaddrs], m.as_string())
            print('success')
            server.quit()
        except smtplib.SMTPException as e:
            print('error', e)
    except Exception as e:
        print(str(e))
        logger.error("catch exception,main:%s" % traceback.format_exc())


import re


def get_keynote_sentence(content):
    try:
        content_list = []
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
        para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-character sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        para = para.rstrip()  # drop any trailing \n at the end of the paragraph
        return para
    except:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []


# all_no_city = []
#
# def update_city_name(hospital_name, city_name):
#     if hospital_name[:2] in ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]:
#         return hospital_name[:2]
#     if "北京" in hospital_name or "清华" in hospital_name:
#         return "北京"
#     if "杭州" in hospital_name:
#         return "杭州"
#     if "西安" in hospital_name:
#         return "西安"
#     if "深圳" in hospital_name:
#         return "深圳"
#     if "重庆" in hospital_name:
#         return "重庆"
#     if "南京" in hospital_name:
#         return "南京"
#     if "武汉" in hospital_name:
#         return "武汉"
#     if "成都" in hospital_name or "四川" in hospital_name:
#         return "成都"
#     if "长沙" in hospital_name or "湖南" in hospital_name:
#         return "长沙"
#     if "上海" in hospital_name:
#         return "上海"
#     if "广州" in hospital_name:
#         return "广州"
#     if "郑州" in hospital_name or "河南" in hospital_name:
#         return "郑州"
#     else:
#         all_no_city.append(hospital_name)
#         return city_name
#

if __name__ == '__main__':
    begin = time.time()
    nowday = datetime.datetime.now()
    today = str(nowday).split()[0]
    all_data = []
    city_list = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "南京", "武汉", "长沙", "郑州", "西安"]
    for city_name in city_list:
        file_name = "gengmei_save_data_" + today + city_name + ".txt"
        if os.path.exists(file_name):
            open_file = open(file_name, "r", encoding="utf-8")
            print(file_name)
            for item in open_file.readlines():
                try:
                    data = json.loads(item.strip())
                    all_data.append(data)
                except:
                    pass
            open_file.close()
    if len(all_data):
        res = pd.DataFrame(all_data)
        res.to_csv("gengmei_service.csv", encoding="gb18030")
        # f = zipfile.ZipFile("gengmei_service.csv", "w", zipfile.ZIP_DEFLATED)
        #
        # send_email_tome()
    print(time.time() - begin)
    print("end")
dev/xinyang_ask_tag/save_data_to_csv.py → dev/xinyang_ask_tag/save_soyoung_data_to_csv.py
...
@@ -36,7 +36,7 @@ def send_email_tome():
        content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")
        zip_file_week = 'result1.csv'
        zip_file_week = 'soyoung_service.csv'
        zip_apart_week = MIMEApplication(open(zip_file_week, 'rb').read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)
...
@@ -68,7 +68,6 @@ import re
def get_keynote_sentence(content):
    try:
        content_list = []
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        dr = re.compile(r"<[^>]+>", re.S)
        str_re = dr.sub("", ss)
...
@@ -88,19 +87,24 @@ if __name__ == '__main__':
    nowday = datetime.datetime.now()
    today = str(nowday).split()[0]
    all_data = []
    file_name = "save_data_2021-07-21.txt"
    if os.path.exists(file_name):
        open_file = open(file_name, "r", encoding="utf-8")
        for item in open_file.readlines():
            data = json.loads(item.strip())
            data['美购名称'] = get_keynote_sentence(data.get("美购名称"))
            # print(data['美购名称'])
            all_data.append(data)
        open_file.close()
    print(len(all_data))
    res = pd.DataFrame(all_data)
    res.to_csv("result1.csv", encoding="gb18030")
    send_email_tome()
    city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
    for city_name in city_list:
        file_name = "save_data_" + today + city_name + ".txt"
        print(file_name)
        if os.path.exists(file_name):
            open_file = open(file_name, "r", encoding="utf-8")
            for item in open_file.readlines():
                try:
                    data = json.loads(item.strip())
                    all_data.append(data)
                except:
                    pass
            open_file.close()
    if len(all_data):
        res = pd.DataFrame(all_data)
        res.to_csv("soyoung_service.csv", encoding="gb18030")
        send_email_tome()
    print(time.time() - begin)
    print("end")