Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
A
Appium-crawl
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
李康
Appium-crawl
Commits
84dc06c7
Commit
84dc06c7
authored
Oct 29, 2019
by
李康
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加若干说明
parent
d59311ba
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
46 additions
and
304 deletions
+46
-304
README.txt
README.txt
+46
-0
find_brand.py
find_brand.py
+0
-96
product_in_list.py
product_in_list.py
+0
-208
No files found.
README.txt
0 → 100644
View file @
84dc06c7
c2h4_data.tar.gz:
爬取的以息商品库原始数据
brands/:
存放的是所有爬取的品牌
注意品牌里有重复,使用时或许需要排重
brands.csv是所有品牌的列表,其格式为【中文名,英文名,地区和时间,描述】
其余jpg图片为品牌的logo图片
brands.py:
用于爬取品牌榜,因为品牌不多,所以这部分代码写得比较随意,能用就行
这里需要注意的是,有些品牌没有中文名,这里需要单独处理一下
其中的getAllBrands函数用于提取所有有中文名的品牌,getEnglishBrands用于提取只有英文名的品牌
使用方法为,将app翻到品牌列表,然后执行python brands.py即可
需要注意的是,一次只能爬取一个品牌列表,如有多个列表请分别执行
程序中写死了一些参数,可以根据实际情况修改
data_clean.py:
数据清洗功能
执行方式为python data_clean.py
其输入输出都在脚本里写死了,可以根据需要加以修改,比较简单
数据清洗的逻辑是跟产品定下来的,之后可能会有变化,所以这一部分功能不一定能复用
product.py:
用于提取所有产品信息
执行方式为,将app翻到品牌列表页,然后执行python product.py 品牌名,这样会爬取对应品牌下的所有产品信息
或者将app翻到某品牌的产品列表页,执行python product.py,也会爬取该品牌的所有产品
在长时间爬取时,以息app可能会crash,爬虫提供了自动检测重启功能,但是该功能不是完全有效,有些时候手机会死机,有些时候重启逻辑会出bug,
都会导致重启失效,需要人工干预
爬虫还支持手动断点启动功能,方法为,将app翻到品牌列表页,执行python product.py 品牌名 子类别名,爬虫会自动翻到上次爬取的产品处进行爬取
爬虫运行时会生成一个文件夹,该文件夹以品牌名命名,其中包括一个product.csv的商品列表文件和各个商品的图标,重启功能就是从product.csv里获得已爬取的商品信息
example_files:
一些示例文件
all_products.csv为数据排重后的所有商品列表
brands.url为品牌与品牌logo url(logo会通过upload_images.py上传到七牛云)的对应关系,json格式
products.url为商品与商品logo url的对应关系,json格式
brands_clean.csv和products_clean.csv为清洗后的数据
upload_images.py:
图片上传脚本,需要放到后端的venus项目里执行
每上传一个图片,会返回图片对应的url
all.txt:
以息的所有品牌名
find_brand.py
deleted
100644 → 0
View file @
d59311ba
import
sys
,
os
,
time
,
re
list_dir = "./"        # directory containing the scraped products.csv
brand_dir = "brands/"  # directory containing the scraped brands.csv


def findBrand(argv):
    """Join scraped products with their brands and write normalized CSVs.

    Reads ``products.csv`` (7 fields per line, "; "-separated) and
    ``brands.csv`` (4 fields per line) and, for every product whose key
    (Chinese name, else English name) starts with a known brand name, writes:

    * ``related_brands.csv`` -- the matched brand row, with the third field
      ("region date") reduced to its first space-separated token (the region);
    * ``new_product.csv``    -- an 8-field product row: the two names, the
      score halved, the digits of the comment count, the category, the
      numeric price, and the remaining original fields.

    Prints a diagnostic and returns early on any malformed line.
    *argv* is accepted for command-line compatibility but is unused.
    """
    outputbrand = "related_brands.csv"
    outputproduct = "new_product.csv"
    # Context managers ensure all four files are closed even on the early
    # returns below (the original leaked every handle).
    with open(list_dir + "products.csv", "r") as listf, \
         open(brand_dir + "brands.csv", "r") as brandf, \
         open(outputbrand, "w") as obf, \
         open(outputproduct, "w") as opf:
        brands = []
        brandlines = {}
        for line in brandf:
            # The brand's Chinese name (first field) is the lookup key.
            brand = line.split("; ")[0]
            brandlines[brand] = line
            brands.append(brand)
        not_found = []
        for line in listf:
            fields = line.split("; ")
            key = None
            if fields[0] != "":
                key = fields[0]
            elif fields[1] != "":
                key = fields[1]
            else:
                print("No product key !!!!!!!!")
                return
            found_flag = False
            for brand in brands:
                if key.startswith(brand):
                    found_flag = True
                    newbls = brandlines[brand].split("; ")
                    # Keep only the first token (region) of the "region date" field.
                    newbls = newbls[:2] + [newbls[2].split(" ")[0]] + newbls[3:]
                    if len(newbls) != 4:
                        print(newbls)
                        return
                    obf.write("; ".join(newbls))
                    newline = fields[:2]
                    # Halve the raw score -- presumably converting a 10-point
                    # scale to 5 points; TODO confirm with the product spec.
                    score = float(fields[2]) / 2
                    newline.append(str(score))
                    # Collect the digits preceding "人", e.g. "123人评价" -> "123".
                    comment_num = ''
                    for ch in fields[3]:
                        if ch == "人":
                            break
                        # isdecimal() accepts exactly the single characters
                        # int() would parse, without a bare except.
                        if ch.isdecimal():
                            comment_num += ch
                    newline.append(str(comment_num))
                    if fields[4] != "":
                        # Field looks like "category · ¥price/unit".
                        temp = fields[4].split("·")
                        if len(temp) == 2:
                            newline.append(re.sub(r'\s+', '', temp[0]))
                            price = re.sub(r'\s+', '', temp[1])
                            price = price.split('/')[0]
                            if price[0] != "¥":
                                print(price)
                                return
                            newline.append(price[1:])
                        else:
                            # No price part: category only, empty price.
                            newline.append(re.sub(r'\s+', '', temp[0]))
                            newline.append("")
                    else:
                        newline += ["", ""]
                    newline += fields[5:]
                    if len(newline) != 8:
                        print(newline)
                        return
                    opf.write("; ".join(newline))
                    break
            if not found_flag:
                not_found.append(key)
        # print("===================totally %s product not found=========================" % len(not_found))
        # for nf in not_found:
        #     print(nf)
# Script entry point: run as ``python find_brand.py`` from the directory
# holding products.csv (argv is accepted but currently unused).
if __name__ == '__main__':
    findBrand(sys.argv)
\ No newline at end of file
product_in_list.py
deleted
100644 → 0
View file @
d59311ba
import
cv2
import
uiautomator2
as
u2
import
numpy
as
np
import
sys
,
os
,
time
# Field separator used for rows written to products.csv.
delim = "; "
# NOTE(review): defined but never referenced in this script.
filter_scroll_steps = 2
# Directory where products.csv and the product images are written.
output_dir = "list2"
def cropImg(img, out, bounds, delta=48):
    """Crop *img* to *bounds* shrunk by *delta* pixels per side and save it.

    img    -- OpenCV image (numpy array, indexed as [y0:y1, x0:x1]).
    out    -- output file path passed to cv2.imwrite.
    bounds -- mapping with 'top', 'bottom', 'left', 'right' pixel coordinates.
    delta  -- margin trimmed from every edge; default 48 (= 20 + 28 in the
              original, presumably widget border plus shadow -- TODO confirm).
    """
    top = int(bounds['top']) + delta
    bottom = int(bounds['bottom']) - delta
    left = int(bounds['left']) + delta
    right = int(bounds['right']) - delta
    cropped = img[top:bottom, left:right]  # 裁剪坐标为[y0:y1, x0:x1]
    cv2.imwrite(out, cropped)
def getProductImg(d, path, pos):
    """Take a device screenshot and save the region *pos* to *path*.

    d    -- connected uiautomator2 device.
    pos  -- bounds dict ('top'/'bottom'/'left'/'right') handed to cropImg.
    """
    screen = d.screenshot(format='opencv')
    cropImg(screen, path, pos)
def getProductKey(product, names=None):
    """Extract the identifying name of a product list entry.

    Looks up the Chinese (tv_name) and English (tv_name_en) name widgets
    under *product*.  Returns ``(key, keyobj)`` where *key* is the Chinese
    name if present, else the English name, else ""; *keyobj* is the first
    name widget that exists (None if neither does).  If *names* is a list,
    both name strings (possibly "") are appended to it in order.
    """
    keyobj = None
    texts = []
    for rid in ("org.c2h4.afei.beauty:id/tv_name",
                "org.c2h4.afei.beauty:id/tv_name_en"):
        widget = product.child(resourceId=rid)
        if widget.count == 0:
            texts.append("")
        else:
            if keyobj is None:
                keyobj = widget
            texts.append(widget.get_text())
    name, name_en = texts
    # Prefer the Chinese name as the key; fall back to the English one.
    key = name if name != "" else (name_en if name_en != "" else "")
    if names is not None:
        names.extend(texts)
    return key, keyobj
def getProductDetailBasic(product, done):
    """Collect the list-page fields of *product* without opening its page.

    Returns ``(key, keyobj, info)`` where *info* is
    ``[name, name_en, rate_score, assess_count, address]`` (missing widgets
    become ""), or ``(key, keyobj, None)`` when the product has no key or
    was already scraped (*key* in *done*).
    """
    names = []
    key, keyobj = getProductKey(product, names)
    if key in done or key == "":
        return key, keyobj, None
    extras = []
    # Each detail lives in a sibling widget of the name; absent ones yield "".
    for rid in ("org.c2h4.afei.beauty:id/rate_score",
                "org.c2h4.afei.beauty:id/tv_asess_num",
                "org.c2h4.afei.beauty:id/tv_address"):
        widget = keyobj.sibling(resourceId=rid)
        extras.append("" if widget.count == 0 else widget.get_text())
    return key, keyobj, names + extras
def getProductDetail(d, product, done):
    """Open *product*'s detail page, scrape its image and effects, and return
    the full info row, or None if the product was skipped.

    Side effects: saves the product image under output_dir, appends *key* to
    *done*, and presses "back" to return to the list page.
    Raises RuntimeError if the detail page never shows its image (~60 s).
    """
    key, keyobj, basicinfo = getProductDetailBasic(product, done)
    if basicinfo is None:
        # Already scraped or no usable key -- nothing to do.
        return None
    keyobj.click()
    img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")
    # Poll until the detail page's image appears; give up after 300 * 0.2 s.
    wait_cnt = 0
    while img.count == 0:
        time.sleep(0.2)
        wait_cnt += 1
        if (wait_cnt == 300):
            raise RuntimeError("No response from APP")
        img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")
    # wait half second for the image to be stable to take screenshot,
    # otherwise possibly the image will be different somehow
    time.sleep(0.5)
    # NOTE(review): key is used verbatim as a file name; characters like '/'
    # in a product name would break this -- confirm names are safe.
    path = output_dir + '/' + key + ".jpg"
    getProductImg(d, path, img.info['bounds'])
    # Gather the effect tags, skipping the label that starts with "功效".
    effects = d(resourceId="org.c2h4.afei.beauty:id/rl_effect").child(className="android.widget.TextView")
    temp = []
    for effect in effects:
        if effect.get_text().startswith("功效"):
            continue
        else:
            temp.append(effect.get_text())
    effects = " ".join(temp)
    # Return to the list page and let the UI settle before the next product.
    d.press("back")
    time.sleep(1)
    done.append(key)
    basicinfo.append(effects)
    return basicinfo
def saveProduct(f, res, product_type):
    """Append *product_type* to the row *res* (mutating it), echo the
    finished row to stdout, and write it to *f* with a trailing newline."""
    res.append(product_type)
    row = delim.join(res)
    print(row)
    f.write(row + "\n")
def getAllProductUiObj(d):
    """Return the UI objects of every product row currently in the list
    container of device *d*."""
    container = d(resourceId="org.c2h4.afei.beauty:id/rv_container")
    return container.child(className="android.widget.RelativeLayout")
def getAllProducts(d, f, product_type):
    """Scroll through the product list on device *d*, scraping each product
    and writing rows tagged *product_type* to file *f*.

    Scrolls by 10% of the screen per step; stops after 11 consecutive
    steps with nothing new (one full screen of already-seen products), or
    returns early once 100 products have been scraped.  Returns the number
    of products scraped.
    """
    done = []          # keys already scraped in this session
    nocnt = 0          # consecutive scroll steps that yielded nothing new
    scraped_cnt = 0
    scroll_scale = 0.1
    while True:
        products = getAllProductUiObj(d)
        # Pick the product whose top edge is fully visible (not cut off by
        # the screen edge) as the next candidate.
        product = None
        for temp in products:
            if temp.info['bounds']['top'] == temp.info['visibleBounds']['top']:
                product = temp
                break
        # NOTE(review): if no row qualifies, product stays None and
        # getProductDetail will fail on it -- confirm this cannot happen.
        res = getProductDetail(d, product, done)
        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
            if scraped_cnt % 10 == 0:
                f.flush()
            # Hard cap: stop after 100 products for this category.
            if scraped_cnt == 100:
                return scraped_cnt
            nocnt = 0
        else:
            nocnt += 1
            # A full screen's worth of steps with nothing new: assume we
            # have reached the end of the list.
            if nocnt == int(1 / scroll_scale) + 1:
                break
        d.swipe_ext("up", scale=scroll_scale)
    # handle the last few products in the list
    products = getAllProductUiObj(d)
    for idx in range(1, products.count):
        res = getProductDetail(d, products[idx], done)
        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
    return scraped_cnt
# Earlier run covered the skincare categories:
# tap_list = ["洁面", "护肤水", "精华", "乳液·面霜", "防晒", "眼部", "清洁面膜", "面膜", "唇部", "身体"]
# Category tabs (makeup / perfume) to scrape; getProductByType selects one
# by the index given on the command line.
tap_list = ["卸妆", "妆前", "粉底", "口红", "眉部", "眼影", "眼线", "睫毛", "腮红", "定妆", "遮瑕·修容", "香水"]
def _scrapeTap(d, f, tap):
    """Click the category tab named *tap* and scrape its product list into *f*.

    Returns the number of products scraped, or None when the tab is not
    uniquely present on screen (a diagnostic is printed in that case).
    """
    tap_ui_obj = d(text=tap)
    if tap_ui_obj.count == 1:
        d(text=tap).click()
    else:
        print("Tap %s (%d) is not found!" % (tap, tap_ui_obj.count))
        return None
    time.sleep(1)  # let the product list load
    temp_cnt = getAllProducts(d, f, tap)
    f.flush()
    print("--------------------- tap summury ---------------------")
    print("%s products are scraped from %s" % (temp_cnt, tap))
    print("-------------------------------------------------------------")
    return temp_cnt


def getProductByType(argv):
    """Scrape one category tab: ``python product_in_list.py <index into tap_list>``.

    Expects the app to already show the category tab bar.  Appends rows to
    ``output_dir/products.csv``.
    """
    target_idx = int(argv[1])
    d = u2.connect_usb('d52196830204')  # device serial is hard-coded
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            _scrapeTap(d, f, tap_list[target_idx])
    finally:
        # Always restore rotation, even when the tab is missing or scraping
        # raises (the original leaked the frozen state on early return).
        d.freeze_rotation(False)


def getProductByType2(argv):
    """Scrape every category tab in tap_list in order (*argv* is unused).

    Stops at the first tab that cannot be found, matching the original
    behavior.
    """
    d = u2.connect_usb('d52196830204')
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            for tap in tap_list:
                if _scrapeTap(d, f, tap) is None:
                    return
    finally:
        d.freeze_rotation(False)
# Script entry point: scrape a single category tab selected by argv[1].
if __name__ == '__main__':
    getProductByType(sys.argv)
    # getProductByType2(sys.argv)  # alternative: scrape all tabs in one run
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment