Commit 84dc06c7 authored by 李康

添加若干说明

parent d59311ba
c2h4_data.tar.gz:
爬取的以息商品库原始数据
brands/:
存放的是所有爬取的品牌
注意品牌里有重复,使用时或许需要排重
brands.csv是所有品牌的列表,其格式为【中文名,英文名,地区和时间,描述】
其余jpg图片为品牌的logo图片
brands.py:
用于爬取品牌榜,因为品牌不多,所以这部分代码写得比较随意,能用就行
这里需要注意的是,有些品牌没有中文名,这里需要单独处理一下
其中的getAllBrands函数用于提取所有有中文名的品牌,getEnglishBrands用于提取只有英文名的品牌
使用方法为,将app翻到品牌列表,然后执行python brands.py即可
需要注意的是,一次只能爬取一个品牌列表,如有多个列表请分别执行
程序中写死了一些参数,可以根据实际情况修改
data_clean.py:
数据清洗功能
执行方式为python data_clean.py
其输入输出都在脚本里写死了,可以根据需要加以修改,比较简单
数据清洗的逻辑是跟产品定下来的,之后可能会有变化,所以这一部分功能不一定能复用
product.py:
用于提取所有产品信息
执行方式为,将app翻到品牌列表页,然后执行python product.py 品牌名,这样会爬取对应品牌下的所有产品信息
或者将app翻到某品牌的产品列表页,执行python product.py,也会爬取该品牌的所有产品
在长时间爬取时,以息app可能会crash。爬虫提供了自动检测并重启的功能,但该功能并非完全可靠:有时手机会死机,有时重启逻辑本身会出bug,
这两种情况都会导致重启失效,需要人工干预
爬虫还支持手动断点启动功能,方法为,将app翻到品牌列表页,执行python product.py 品牌名 子类别名,爬虫会自动翻到上次爬取的产品处进行爬取
爬虫运行时会生成一个文件夹,该文件夹以品牌名命名,其中包括一个product.csv的商品列表文件和各个商品的图标,重启功能就是从product.csv里获得已爬取的商品信息
example_files:
一些示例文件
all_products.csv为数据排重后的所有商品列表
brands.url为品牌与品牌logo url(logo会通过upload_images.py上传到七牛云)的对应关系,json格式
products.url为商品与商品logo url的对应关系,json格式
brands_clean.csv和products_clean.csv为清洗后的数据
upload_images.py:
图片上传脚本,需要放到后端的venus项目里执行
每上传一个图片,会返回图片对应的url
all.txt:
以息的所有品牌名
import sys, os, time, re
# Directory prefix of the scraped products.csv that findBrand reads.
list_dir = "./"
# Directory prefix of the brands.csv that findBrand reads.
brand_dir = "brands/"
def findBrand(argv):
    """Match scraped products to brands and write normalised CSV files.

    Reads ``products.csv`` (under ``list_dir``) and ``brands.csv`` (under
    ``brand_dir``); both are split on ``"; "``.  The key of a product is its
    first field, or its second field when the first is empty.  For every
    product whose key starts with a known brand name (the first brand in
    file order wins):

    * the brand row, with its third field cut at the first space, is written
      to ``related_brands.csv`` (once per matching product, so brand rows
      repeat);
    * a normalised product row is written to ``new_product.csv``:
      the two name fields, the score halved, the digits of the comment count
      that precede "人", field 4 split on "·" into two whitespace-stripped
      columns (the second one without its leading "¥" and anything after
      "/"), followed by the remaining input fields.

    Processing stops early, after printing the offending value, when a
    product has no key, a brand row does not have 4 fields, a price does not
    start with "¥", or a product row does not end up with 8 fields.

    Args:
        argv: Command-line argument list; not used.

    Raises:
        IndexError: If a product line has fewer fields than are indexed.
        ValueError: If the score field is not a number.
    """
    outputbrand = "related_brands.csv"
    outputproduct = "new_product.csv"
    # Context managers guarantee all four files are flushed and closed, also
    # on the early-return paths below.
    with open(list_dir + "products.csv", "r") as listf, \
            open(brand_dir + "brands.csv", "r") as brandf, \
            open(outputbrand, "w") as obf, \
            open(outputproduct, "w") as opf:
        # brand name -> full brand line.  dict keeps first-insertion order, so
        # iterating it visits brands in file order; a duplicated brand keeps
        # its last line.
        brandlines = {}
        for line in brandf:
            brandlines[line.split("; ")[0]] = line

        # Keys without a matching brand; collected for ad-hoc debugging only,
        # nothing is reported from it.
        not_found = []
        for line in listf:
            fields = line.split("; ")
            if fields[0] != "":
                key = fields[0]
            elif fields[1] != "":
                key = fields[1]
            else:
                print("No product key !!!!!!!!")
                return

            for brand, brandline in brandlines.items():
                if not key.startswith(brand):
                    continue

                # Brand row: keep only the part of the third field before the
                # first space.
                newbls = brandline.split("; ")
                newbls = newbls[:2] + [newbls[2].split(" ")[0]] + newbls[3:]
                if len(newbls) != 4:
                    print(newbls)
                    return
                obf.write("; ".join(newbls))

                newline = fields[:2]
                newline.append(str(float(fields[2]) / 2))

                # Comment count: the digit characters that precede "人".
                comment_num = ''
                for ch in fields[3]:
                    if ch == "人":
                        break
                    try:
                        int(ch)
                    except ValueError:
                        continue
                    comment_num += ch
                newline.append(comment_num)

                if fields[4] != "":
                    parts = fields[4].split("·")
                    newline.append(re.sub(r'\s+', '', parts[0]))
                    if len(parts) == 2:
                        price = re.sub(r'\s+', '', parts[1]).split('/')[0]
                        # startswith() also covers an empty price, which used
                        # to raise IndexError on price[0].
                        if not price.startswith("¥"):
                            print(price)
                            return
                        newline.append(price[1:])
                    else:
                        newline.append("")
                else:
                    newline += ["", ""]

                newline += fields[5:]
                if len(newline) != 8:
                    print(newline)
                    return
                opf.write("; ".join(newline))
                break
            else:
                not_found.append(key)
# Script entry point: run the brand matching with the raw command line.
if __name__ == '__main__':
    findBrand(sys.argv)
\ No newline at end of file
import cv2
import uiautomator2 as u2
import numpy as np
import sys, os, time
# Field separator used by saveProduct when writing a product row.
delim = "; "
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
filter_scroll_steps = 2
# Directory that receives products.csv and the cropped product images.
output_dir = "list2"
def cropImg(img, out, bounds):
    """Cut the area given by ``bounds`` out of ``img`` and save it to ``out``.

    The area is shrunk by a fixed margin of 20 + 28 on every side before it
    is written with ``cv2.imwrite``.

    Args:
        img: Image that supports ``img[y0:y1, x0:x1]`` slicing.
        out: Destination file path.
        bounds: Mapping with 'top', 'bottom', 'left' and 'right' entries.
    """
    margin = 20 + 28
    y0 = int(bounds['top']) + margin
    y1 = int(bounds['bottom']) - margin
    x0 = int(bounds['left']) + margin
    x1 = int(bounds['right']) - margin
    # Crop coordinates are [y0:y1, x0:x1].
    region = img[y0:y1, x0:x1]
    cv2.imwrite(out, region)
def getProductImg(d, path, pos):
    """Take a screenshot of device ``d`` and save the ``pos`` area to ``path``.

    Args:
        d: Device object providing ``screenshot(format='opencv')``.
        path: Destination image path.
        pos: Bounds mapping forwarded to ``cropImg``.
    """
    cropImg(d.screenshot(format='opencv'), path, pos)
def getProductKey(product, names = None):
    """Read the name and English name of a product list entry.

    Args:
        product: UI object of one product entry.
        names: Optional list; when given, the name and the English name
            (empty strings when the widget is missing) are appended to it.

    Returns:
        ``(key, keyobj)`` where ``key`` is the name, or the English name when
        the name is empty, and ``keyobj`` is the UI object of the first name
        widget that exists (``None`` when neither exists).
    """
    keyobj = None

    cn_widget = product.child(resourceId="org.c2h4.afei.beauty:id/tv_name")
    if cn_widget.count != 0:
        keyobj = cn_widget
        name = cn_widget.get_text()
    else:
        name = ""

    en_widget = product.child(resourceId="org.c2h4.afei.beauty:id/tv_name_en")
    if en_widget.count != 0:
        if keyobj is None:
            keyobj = en_widget
        name_en = en_widget.get_text()
    else:
        name_en = ""

    # Prefer the primary name; fall back to the English one (possibly empty).
    key = name if name != "" else name_en

    if names is not None:
        names.extend((name, name_en))
    return key, keyobj
def getProductDetailBasic(product, done):
    """Collect the fields of a product that are visible in the list view.

    Args:
        product: UI object of one product entry.
        done: Collection of keys that were already scraped.

    Returns:
        ``(key, keyobj, info)``.  ``info`` is ``None`` when the key is empty
        or already in ``done``; otherwise it is the list
        ``[name, name_en, rate_score, asess_num, address]`` with an empty
        string for every widget that is missing.
    """
    names = []
    key, keyobj = getProductKey(product, names)
    if key in done or key == "":
        return key, keyobj, None

    def sibling_text(resource_id):
        # Text of the sibling widget of the name label, "" when absent.
        widget = keyobj.sibling(resourceId=resource_id)
        return "" if widget.count == 0 else widget.get_text()

    details = [
        sibling_text("org.c2h4.afei.beauty:id/rate_score"),
        sibling_text("org.c2h4.afei.beauty:id/tv_asess_num"),
        sibling_text("org.c2h4.afei.beauty:id/tv_address"),
    ]
    return key, keyobj, names + details
def getProductDetail(d, product, done):
    """Open a product's detail page, save its image and read its effects.

    Args:
        d: Device object (callable selector factory, ``press``, screenshot).
        product: UI object of one product entry in the list.
        done: List of keys already scraped; the product key is appended on
            success.

    Returns:
        The list from ``getProductDetailBasic`` with the space-joined effect
        texts appended, or ``None`` when the product has no key or was
        already scraped.

    Raises:
        RuntimeError: If the detail image has not appeared after 300 polls
            spaced 0.2 s apart.
    """
    key, keyobj, basicinfo = getProductDetailBasic(product, done)
    if basicinfo is None:
        return None

    keyobj.click()
    img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")
    wait_cnt = 0
    while img.count == 0:
        time.sleep(0.2)
        wait_cnt += 1
        if wait_cnt == 300:
            raise RuntimeError("No response from APP")
        img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")

    # wait half second for the image to be stable to take screenshot,
    # otherwise possibly the image will be different somehow
    time.sleep(0.5)
    # NOTE(review): a key containing "/" yields a path into a sub-directory
    # that may not exist — confirm whether product names can contain it.
    path = output_dir + '/' + key + ".jpg"
    getProductImg(d, path, img.info['bounds'])

    effects = d(resourceId="org.c2h4.afei.beauty:id/rl_effect").child(className="android.widget.TextView")
    effect_texts = []
    for effect in effects:
        # Read the text once per widget instead of twice; each get_text()
        # is presumably a separate device query.
        text = effect.get_text()
        if not text.startswith("功效"):
            effect_texts.append(text)

    d.press("back")
    time.sleep(1)
    done.append(key)
    basicinfo.append(" ".join(effect_texts))
    return basicinfo
def saveProduct(f, res, product_type):
    """Append ``product_type`` to ``res``, then print and write the row.

    Args:
        f: Writable text file object.
        res: List of string fields; it is modified in place.
        product_type: Category label stored as the last field.
    """
    res.append(product_type)
    row = delim.join(res)
    print(row)
    f.write(row + "\n")
def getAllProductUiObj(d):
    """Return the selector for the RelativeLayout children of the product list container."""
    container = d(resourceId="org.c2h4.afei.beauty:id/rv_container")
    return container.child(className="android.widget.RelativeLayout")
def getAllProducts(d, f, product_type):
    """Scrape the product list shown on screen, scrolling up step by step.

    On every step the first list entry whose top edge is not clipped
    (``bounds.top == visibleBounds.top``) is scraped, then the list is
    swiped up by ``scroll_scale``.  Scrolling stops after
    ``int(1 / scroll_scale) + 1`` consecutive steps without a new product;
    the entries still on screen (from index 1 on) are then scraped too.

    Args:
        d: Device object.
        f: Writable text file receiving one row per product; flushed every
            10 products.
        product_type: Category label stored with every row.

    Returns:
        Number of products scraped.  The scrolling phase returns as soon as
        100 products have been scraped.
    """
    done = []
    nocnt = 0
    scraped_cnt = 0
    scroll_scale = 0.1
    max_idle_steps = int(1 / scroll_scale) + 1

    while True:
        product = None
        for candidate in getAllProductUiObj(d):
            # Read ``info`` once per element instead of twice; each access
            # is presumably a separate device query.
            info = candidate.info
            if info['bounds']['top'] == info['visibleBounds']['top']:
                product = candidate
                break

        # No fully visible entry (e.g. empty list): treat the step as one
        # without progress instead of passing None on, which used to crash.
        res = None
        if product is not None:
            res = getProductDetail(d, product, done)

        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
            if scraped_cnt % 10 == 0:
                f.flush()
            if scraped_cnt == 100:
                return scraped_cnt
            nocnt = 0
        else:
            nocnt += 1
            if nocnt == max_idle_steps:
                break
        d.swipe_ext("up", scale=scroll_scale)

    # handle the last few products in the list
    products = getAllProductUiObj(d)
    for idx in range(1, products.count):
        res = getProductDetail(d, products[idx], done)
        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
    return scraped_cnt
# Texts of the category tabs to scrape; getProductByType picks one entry by
# index and getProductByType2 walks all of them.  The commented-out list
# below is an alternative tab set that can be swapped in.
# tap_list = ["洁面", "护肤水", "精华", "乳液·面霜", "防晒", "眼部", "清洁面膜", "面膜", "唇部", "身体"]
tap_list = ["卸妆", "妆前", "粉底", "口红", "眉部", "眼影", "眼线", "睫毛", "腮红", "定妆", "遮瑕·修容", "香水"]
def getProductByType(argv):
    """Scrape a single category tab into ``<output_dir>/products.csv``.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the integer index
            of the tab in ``tap_list``.

    Raises:
        IndexError: If ``argv[1]`` is missing or outside ``tap_list``.
        ValueError: If ``argv[1]`` is not an integer.
    """
    # Resolve the tab first so that a bad index fails before the device is
    # touched.
    tap = tap_list[int(argv[1])]

    d = u2.connect_usb('d52196830204')
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            tap_ui_obj = d(text=tap)
            match_cnt = tap_ui_obj.count
            if match_cnt != 1:
                print("Tap %s (%d) is not found!" % (tap, match_cnt))
                return
            tap_ui_obj.click()
            time.sleep(1)
            temp_cnt = getAllProducts(d, f, tap)
            f.flush()
            print("--------------------- tap summury ---------------------")
            print("%s products are scraped from %s" % (temp_cnt, tap))
            print("-------------------------------------------------------------")
    finally:
        # Always release the rotation lock, also on the early return above
        # and when scraping raises.
        d.freeze_rotation(False)
def getProductByType2(argv):
    """Scrape every tab of ``tap_list`` into ``<output_dir>/products.csv``.

    The run stops at the first tab whose text is not found exactly once on
    screen.

    Args:
        argv: Command-line argument list; not used.
    """
    d = u2.connect_usb('d52196830204')
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            for tap in tap_list:
                tap_ui_obj = d(text=tap)
                match_cnt = tap_ui_obj.count
                if match_cnt != 1:
                    print("Tap %s (%d) is not found!" % (tap, match_cnt))
                    return
                tap_ui_obj.click()
                time.sleep(1)
                temp_cnt = getAllProducts(d, f, tap)
                f.flush()
                print("--------------------- tap summury ---------------------")
                print("%s products are scraped from %s" % (temp_cnt, tap))
                print("-------------------------------------------------------------")
    finally:
        # Always release the rotation lock, also on the early return above
        # and when scraping raises.
        d.freeze_rotation(False)
# Script entry point: scrape the single tab whose index is given on the
# command line.  Use the commented-out call instead to walk every tab.
if __name__ == '__main__':
    getProductByType(sys.argv)
    # getProductByType2(sys.argv)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment