Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
aeb48597
Commit
aeb48597
authored
Aug 17, 2021
by
李小芳
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
b6079ebc
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
67 additions
and
76 deletions
+67
-76
app_soyoung.py
dev/xinyang_ask_tag/app_soyoung.py
+0
-0
app_soyoung_by_cityname.py
dev/xinyang_ask_tag/app_soyoung_by_cityname.py
+64
-68
city_parse.py
dev/xinyang_ask_tag/city_parse.py
+0
-0
save_soyoung_data_to_csv.py
dev/xinyang_ask_tag/save_soyoung_data_to_csv.py
+2
-5
soyoung_service_1.csv
dev/xinyang_ask_tag/soyoung_service_1.csv
+0
-2
soyoung_service_cika.csv
dev/xinyang_ask_tag/soyoung_service_cika.csv
+0
-0
soyoung_service_other.csv
dev/xinyang_ask_tag/soyoung_service_other.csv
+0
-0
soyoung_service_write_cika.csv
dev/xinyang_ask_tag/soyoung_service_write_cika.csv
+0
-0
test.py
dev/xinyang_ask_tag/test.py
+1
-1
No files found.
dev/xinyang_ask_tag/app_soyoung.py
View file @
aeb48597
This diff is collapsed.
Click to expand it.
dev/xinyang_ask_tag/app_soyoung_by_cityname.py
View file @
aeb48597
...
...
@@ -19,71 +19,15 @@ from lxml import etree
logger
=
logging
.
getLogger
(
__name__
)
def send_email_tome():
    """Email ``result1.csv`` as an attachment over SMTP-over-SSL.

    Sender, recipient, subject, body text and attachment path are all fixed
    inside the function.  Nothing is returned: the outcome is reported with
    ``print`` and, for unexpected failures, through the module ``logger``.
    SMTP errors are printed; any other error (for example a missing
    attachment file) is printed and logged with its traceback.
    """
    try:
        from_addrs = 'lixiaofang@igengmei.com'
        # NOTE(review): hard-coded mailbox credential committed to the
        # repository -- should be rotated and loaded from configuration.
        password = 'EzJzSRyEG4Jibuy9'
        toaddrs = "lixiaofang@igengmei.com"

        content = '爬取新氧热搜前100的词召回的商品,内容详见表格'
        text_apart = MIMEText(content, 'plain', "utf-8")

        zip_file_week = 'result1.csv'
        # Read the attachment through a context manager so the file handle
        # is released even if building the message fails.
        with open(zip_file_week, 'rb') as attachment:
            zip_apart_week = MIMEApplication(attachment.read())
        zip_apart_week.add_header('Content-Disposition', 'attachment', filename=zip_file_week)

        m = MIMEMultipart()
        m.attach(text_apart)
        m.attach(zip_apart_week)
        m['From'] = formataddr(("李小芳", from_addrs))
        m["To"] = formataddr(("李小芳", toaddrs))
        m['Subject'] = '新氧商品信息'

        try:
            server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
            try:
                server.login(from_addrs, password)
                server.sendmail(from_addrs, [toaddrs], m.as_string())
                print('success')
                server.quit()
            finally:
                # Always drop the socket, including when login/sendmail
                # raised before quit() could run; close() is a no-op after
                # a successful quit().
                server.close()
        except smtplib.SMTPException as e:
            print('error', e)
    except Exception as e:
        print(str(e))
        logger.error("catch exception,main:%s" % traceback.format_exc())
def get_keynote_sentence(content):
    """Strip markup from *content* and put each sentence on its own line.

    The text is round-tripped through UTF-16 with ``surrogatepass``,
    ``<...>`` tags are removed, and a newline is inserted after sentence
    terminators, six-dot ellipses and double ``…`` ellipses.  A closing
    quote that directly follows a terminator stays on the same line as the
    sentence it ends.

    Returns the processed string with trailing whitespace removed.  If
    anything goes wrong (for example *content* is ``None``) the traceback is
    logged and an empty list is returned, which is what existing callers
    already receive on failure.
    """
    try:
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Drop anything that looks like an HTML/XML tag.
        str_re = re.sub(r"<[^>]+>", "", ss, flags=re.S)
        # Patterns are raw strings so that "\?", "\." and "\…" reach the
        # regex engine unchanged instead of being invalid string escapes.
        # Single-character sentence terminators.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", str_re)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two "…" characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # Terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Remove any surplus newline left at the end of the text.
        return para.rstrip()
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []
def
get_service_info
(
city_id
=-
1
,
keyword
=
""
,
city_name
=
""
,
all_skuids
=
[],
get_data_file
=
None
):
print
(
"get_service_info"
)
url
=
'https://api.soyoung.com/v8/superList/index'
break_flat
=
False
other_city_count
=
0
for
page
in
range
(
1
,
500
):
if
break_flat
==
False
and
other_city_count
<
100
:
data
=
{
'_time'
:
'1626769752'
,
class
SoYoung
(
object
):
def
__init__
(
self
):
self
.
url
=
"https://api.soyoung.com/v8/superList/index"
self
.
headers
=
{
'_time'
:
'1626769752'
,
'ab_id'
:
'C521C79519A5D544390E60FEA08B32DB'
,
"app_id"
:
42
,
"area_belong"
:
4
,
"channel"
:
1
,
"cityId"
:
str
(
city_id
),
#
"cityId": str(city_id),
"device_id"
:
196374256
,
"device_model"
:
"iPhone12,1"
,
'device_version'
:
'13.6.1'
,
...
...
@@ -94,10 +38,10 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"idfa"
:
"057F28DF-20B8-488F-A285-931367FCC110"
,
"is_tf"
:
0
,
"item_id"
:
"--Boundary+D46DCD61FE6FA268"
,
"keyword"
:
str
(
keyword
),
#
"keyword": str(keyword),
"list_name"
:
"sy_app_superlist_search_page"
,
"lver"
:
"8.28.2"
,
"page"
:
page
,
#
"page": page,
"page_size"
:
20
,
"push_app_id"
:
42
,
"request_id"
:
"14d1e2b53ca644242ec7ccd7316a0aa2"
,
...
...
@@ -109,14 +53,46 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
"uid"
:
"48804194"
,
"vistor_uid"
:
""
,
"xy_device_token"
:
"33fa06111dea535c88cc07521f2e466c91"
,
'xy_sign'
:
"Z1VfaYFXrpWBPeizj2VGeQ
%3
D
%3
D"
,
"xy_token"
:
"ad970db3d79f0833d1d25d3942068585"
}
def get_keynote_sentence(self, content):
    """Strip markup from *content* and put each sentence on its own line.

    The text is round-tripped through UTF-16 with ``surrogatepass``,
    ``<...>`` tags are removed, and a newline is inserted after sentence
    terminators, six-dot ellipses and double ``…`` ellipses.  A closing
    quote that directly follows a terminator stays on the same line as the
    sentence it ends.  No instance state is read.

    Returns the processed string with trailing whitespace removed.  If
    anything goes wrong (for example *content* is ``None``) the traceback is
    logged and an empty list is returned, which is what existing callers
    already receive on failure.
    """
    try:
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        # Drop anything that looks like an HTML/XML tag.
        str_re = re.sub(r"<[^>]+>", "", ss, flags=re.S)
        # Patterns are raw strings so that "\?", "\." and "\…" reach the
        # regex engine unchanged instead of being invalid string escapes.
        # Single-character sentence terminators.
        para = re.sub(r'([;。!?\?])([^”’])', r"\1\n\2", str_re)
        # English ellipsis (six dots).
        para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
        # Chinese ellipsis (two "…" characters).
        para = re.sub(r'(\…{2})([^”’])', r"\1\n\2", para)
        # Terminator followed by a closing quote: break after the quote.
        para = re.sub(r'([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)
        # Remove any surplus newline left at the end of the text.
        return para.rstrip()
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []
def search(self, max_retries=5):
    """POST ``self.headers`` to ``self.url`` and return the decoded JSON.

    Before every attempt the method sleeps for ``random.random()`` seconds
    (a sub-second pause) to lower the chance of being banned.  A 403
    response is retried, at most *max_retries* additional times; the
    previous implementation retried by unbounded recursion.

    Returns the parsed response on HTTP 200 with a non-empty body.  Returns
    ``None`` after printing a message when the status is anything else, the
    body is empty, the retries are exhausted, or the request / JSON decoding
    raises.
    """
    for _ in range(max_retries + 1):
        try:
            time.sleep(random.random())
            # Only the instance-level request is issued; the leftover call
            # that used the free names ``url`` / ``data`` is dropped because
            # those names are not defined inside this method.
            response_res = requests.post(self.url, self.headers, verify=False)
            if response_res.status_code == 200 and response_res.text:
                return json.loads(response_res.text)
            if response_res.status_code != 403:
                print("发生错误 停止请求")
                return None
        except Exception:
            print("发生错误 停止请求")
            return None
    # Still receiving 403 after every retry.
    print("发生错误 停止请求")
    return None
def
response_analysis
(
self
,
response
=
None
,
city_id
=-
1
,
city_name
=
None
,
keyword
=
None
,
all_skuids
=
[],
get_data_file
=
None
,
other_city_count
=
0
,
break_flat
=
False
):
"""
解析获取到的数据
"""
try
:
responseData
=
response
.
get
(
"responseData"
,
{})
.
get
(
"data"
)
for
item
in
responseData
:
if
item
.
get
(
"type"
)
==
"feed_area"
:
...
...
@@ -135,7 +111,7 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info
[
'sku原价'
]
=
service_data
.
get
(
"price_origin"
)
service_info
[
'sku活动价'
]
=
service_data
.
get
(
"price_online"
)
service_info
[
'机构等级'
]
=
service_data
.
get
(
"avg_score"
)
service_info
[
'美购名称'
]
=
get_keynote_sentence
(
service_data
.
get
(
"title"
))
service_info
[
'美购名称'
]
=
self
.
get_keynote_sentence
(
service_data
.
get
(
"title"
))
service_info
[
'销量'
]
=
service_data
.
get
(
"order_cnt"
)
icon_data
=
service_data
.
get
(
"icons"
,
[])
service_info
[
'可用尾款券'
]
=
service_data
.
get
(
"wei_kuan_list"
,
[])
...
...
@@ -153,16 +129,34 @@ def get_service_info(city_id=-1, keyword="", city_name="", all_skuids=[], get_da
service_info
[
'平台'
]
=
"新氧"
service_info
[
'链接'
]
=
"https://m.soyoung.com/normal/cpwap{}"
.
format
(
service_info
[
'skuid'
])
print
(
service_info
)
if
service_data
.
get
(
"pid"
)
not
in
all_skuids
:
get_data_file
.
write
(
json
.
dumps
(
service_info
))
get_data_file
.
write
(
"
\n
"
)
print
(
"write success"
)
else
:
other_city_count
+=
1
else
:
print
(
"break"
)
break_flat
=
True
break
except
:
pass
def
get_service_info
(
self
,
city_id
=-
1
,
keyword
=
""
,
city_name
=
""
,
all_skuids
=
[],
get_data_file
=
None
):
break_flat
=
False
other_city_count
=
0
for
page
in
range
(
1
,
500
):
# other_city_count <10 代表已经结束
if
break_flat
==
False
and
other_city_count
<
10
:
self
.
headers
[
'cityId'
]
=
str
(
city_id
)
self
.
headers
[
'keyword'
]
=
str
(
keyword
)
self
.
headers
[
'page'
]
=
str
(
page
)
response
=
self
.
search
()
if
response
:
self
.
response_analysis
(
response
=
response
,
city_id
=
city_id
,
keyword
=
keyword
,
city_name
=
city_name
,
all_skuids
=
all_skuids
,
get_data_file
=
get_data_file
,
other_city_count
=
other_city_count
,
break_flat
=
break_flat
)
else
:
print
(
city_id
,
keyword
,
"爬取失败"
)
else
:
...
...
@@ -194,17 +188,19 @@ def main(city_tag=""):
'美白'
,
'厚唇改薄'
,
'面部线雕'
,
'祛疤'
,
'伊婉V'
,
'超皮秒祛斑'
,
'除皱针'
,
'开眼角'
,
'海菲秀'
,
'假体下巴'
,
'刷酸'
,
'泪沟'
,
'拉皮'
,
'全身吸脂'
,
'缩鼻翼'
]
soyoung
=
SoYoung
()
city_list
=
[
city_tag
]
all_skuids
=
[]
for
city_name
in
city_list
:
city_id
=
cityIdMapping
.
get
(
city_name
)
for
word
in
keywords
:
get_service_info
(
city_id
=
city_id
,
keyword
=
word
,
city_name
=
city_name
,
soyoung
.
get_service_info
(
city_id
=
city_id
,
keyword
=
word
,
city_name
=
city_name
,
all_skuids
=
all_skuids
,
get_data_file
=
get_data_file
)
get_data_file
.
close
()
print
(
time
.
time
()
-
begin
)
print
(
"全部爬取完成"
)
if
__name__
==
'__main__'
:
...
...
dev/xinyang_ask_tag/city.py
→
dev/xinyang_ask_tag/city
_parse
.py
View file @
aeb48597
File moved
dev/xinyang_ask_tag/save_soyoung_data_to_csv.py
View file @
aeb48597
...
...
@@ -87,11 +87,8 @@ if __name__ == '__main__':
nowday
=
datetime
.
datetime
.
now
()
today
=
str
(
nowday
)
.
split
()[
0
]
all_data
=
[]
# city_list = ["北京", "上海", "广州市", "深圳市", "杭州市", "成都市", "重庆", "南京市", "武汉市", "长沙市", "郑州市", "西安市"]
city_list
=
[
"save_data_2021-07-27.txt"
,
"save_data_2021-07-28.txt"
,
"save_data_2021-07-29.txt"
]
for
city_name
in
city_list
:
# file_name = "save_data_" + today + ".txt"
# print(file_name)
file_list
=
[
"save_data_2021-07-27.txt"
,
"save_data_2021-07-28.txt"
,
"save_data_2021-07-29.txt"
]
for
city_name
in
file_list
:
if
os
.
path
.
exists
(
city_name
):
open_file
=
open
(
city_name
,
"r"
,
encoding
=
"utf-8"
)
for
item
in
open_file
.
readlines
():
...
...
dev/xinyang_ask_tag/soyoung_service_1.csv
deleted
100644 → 0
View file @
b6079ebc
0,666426,108536,北京俏中关医疗美容门诊部,9800,1972,5,【除皱瘦脸】美国进口标准装【除皱瘦脸】瘦脸针100U·足量·正品 进口/提升/下颌线,329,"['付尾款,最高立减068', '尾款满100减8']",[],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap666426
1,84880,82258,北京画美医疗美容医院,1680,551,4.8,【注射瘦脸】除皱瘦脸国产80-100u 限购一次 正品足量 正品可验 小V脸 去咬肌 咬肌肥大瘦脸针,2321,[],['新人首单立减0629'],瘦脸针,北京,新氧,https://m.soyoung.com/normal/cpwap84880
dev/xinyang_ask_tag/soyoung_service_cika.csv
View file @
aeb48597
This diff is collapsed.
Click to expand it.
dev/xinyang_ask_tag/soyoung_service_other.csv
deleted
100644 → 0
View file @
b6079ebc
This diff is collapsed.
Click to expand it.
dev/xinyang_ask_tag/soyoung_service_write_cika.csv
deleted
100644 → 0
View file @
b6079ebc
This diff is collapsed.
Click to expand it.
dev/xinyang_ask_tag/test.py
View file @
aeb48597
...
...
@@ -66,7 +66,7 @@ def get_cika_info_to_csv():
try
:
cika_price_dict
=
dict
()
print
(
"index:"
,
item
.
strip
()
.
split
(
","
)[
0
])
if
int
(
item
.
strip
()
.
split
(
","
)[
0
])
>
3
375
5
:
if
int
(
item
.
strip
()
.
split
(
","
)[
0
])
>
3
464
5
:
service_id
=
item
.
strip
()
.
split
(
","
)[
2
]
url
=
item
.
strip
()
.
split
(
","
)[
-
1
]
if
service_id
in
have_read_service
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment