Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
A
Appium-crawl
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
李康
Appium-crawl
Commits
84dc06c7
Commit
84dc06c7
authored
Oct 29, 2019
by
李康
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加若干说明
parent
d59311ba
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
46 additions
and
304 deletions
+46
-304
README.txt
README.txt
+46
-0
find_brand.py
find_brand.py
+0
-96
product_in_list.py
product_in_list.py
+0
-208
No files found.
README.txt
0 → 100644
View file @
84dc06c7
c2h4_data.tar.gz:
爬取的以息商品库原始数据
brands/:
存放的是所有爬取的品牌
注意品牌里有重复,使用时或许需要排重
brands.csv是所有品牌的列表,其格式为【中文名,英文名,地区和时间,描述】
其余jpg图片为品牌的logo图片
brands.py:
用于爬取品牌榜,因为品牌不多,所以这部分代码写得比较随意,能用就行
这里需要注意的是,有些品牌没有中文名,这里需要单独处理一下
其中的getAllBrands函数用于提取所有有中文名的品牌,getEnglishBrands用于提取只有英文名的品牌
使用方法为,将app翻到品牌列表,然后执行python brands.py即可
需要注意的是,一次只能爬取一个品牌列表,如有多个列表请分别执行
程序中写死了一些参数,可以根据实际情况修改
data_clean.py:
数据清洗功能
执行方式为python data_clean.py
其输入输出都在脚本里写死了,可以根据需要加以修改,比较简单
数据清洗的逻辑是跟产品定下来的,之后可能会有变化,所以这一部分功能不一定能复用
product.py:
用于提取所有产品信息
执行方式为,将app翻到品牌列表页,然后执行python product.py 品牌名,这样会爬取对应品牌下的所有产品信息
或者将app翻到某品牌的产品列表页,执行python product.py,也会爬取该品牌的所有产品
在长时间爬取时,以息app可能会crash,爬虫提供了自动检测重启功能,但是该功能不是完全有效,有些时候手机会死机,有些时候重启逻辑会出bug,
都会导致重启失效,需要人工干预
爬虫还支持手动断点启动功能,方法为,将app翻到品牌列表页,执行python product.py 品牌名 子类别名,爬虫会自动翻到上次爬取的产品处进行爬取
爬虫运行时会生成一个文件夹,该文件夹以品牌名命名,其中包括一个product.csv的商品列表文件和各个商品的图标,重启功能就是从product.csv里获得已爬取的商品信息
example_files:
一些示例文件
all_products.csv为数据排重后的所有商品列表
brands.url为品牌与品牌logo url(logo会通过upload_images.py上传到七牛云)的对应关系,json格式
products.url为商品与商品logo url的对应关系,json格式
brands_clean.csv和products_clean.csv为清洗后的数据
upload_images.py:
图片上传脚本,需要放到后端的venus项目里执行
每上传一个图片,会返回图片对应的url
all.txt:
以息的所有品牌名
find_brand.py
deleted
100644 → 0
View file @
d59311ba
import
sys
,
os
,
time
,
re
list_dir = "./"        # directory containing the scraped products.csv
brand_dir = "brands/"  # directory containing the scraped brands.csv


def findBrand(argv):
    """Join scraped products with their brands and write normalized CSVs.

    Reads ``products.csv`` (7 fields per line, "; "-separated) and
    ``brands.csv`` (4 fields per line) and, for every product whose key
    (Chinese name, else English name) starts with a known brand name, writes:

    * ``related_brands.csv`` -- the matched brand row, with the third field
      ("region date") reduced to its first space-separated token (the region);
    * ``new_product.csv``    -- an 8-field product row: the two names, the
      score halved, the digits of the comment count, the category, the
      numeric price, and the remaining original fields.

    Prints a diagnostic and returns early on any malformed line.
    *argv* is accepted for command-line compatibility but is unused.
    """
    outputbrand = "related_brands.csv"
    outputproduct = "new_product.csv"
    # Context managers ensure all four files are closed even on the early
    # returns below (the original leaked every handle).
    with open(list_dir + "products.csv", "r") as listf, \
         open(brand_dir + "brands.csv", "r") as brandf, \
         open(outputbrand, "w") as obf, \
         open(outputproduct, "w") as opf:
        brands = []
        brandlines = {}
        for line in brandf:
            # The brand's Chinese name (first field) is the lookup key.
            brand = line.split("; ")[0]
            brandlines[brand] = line
            brands.append(brand)
        not_found = []
        for line in listf:
            fields = line.split("; ")
            key = None
            if fields[0] != "":
                key = fields[0]
            elif fields[1] != "":
                key = fields[1]
            else:
                print("No product key !!!!!!!!")
                return
            found_flag = False
            for brand in brands:
                if key.startswith(brand):
                    found_flag = True
                    newbls = brandlines[brand].split("; ")
                    # Keep only the first token (region) of the "region date" field.
                    newbls = newbls[:2] + [newbls[2].split(" ")[0]] + newbls[3:]
                    if len(newbls) != 4:
                        print(newbls)
                        return
                    obf.write("; ".join(newbls))
                    newline = fields[:2]
                    # Halve the raw score -- presumably converting a 10-point
                    # scale to 5 points; TODO confirm with the product spec.
                    score = float(fields[2]) / 2
                    newline.append(str(score))
                    # Collect the digits preceding "人", e.g. "123人评价" -> "123".
                    comment_num = ''
                    for ch in fields[3]:
                        if ch == "人":
                            break
                        # isdecimal() accepts exactly the single characters
                        # int() would parse, without a bare except.
                        if ch.isdecimal():
                            comment_num += ch
                    newline.append(str(comment_num))
                    if fields[4] != "":
                        # Field looks like "category · ¥price/unit".
                        temp = fields[4].split("·")
                        if len(temp) == 2:
                            newline.append(re.sub(r'\s+', '', temp[0]))
                            price = re.sub(r'\s+', '', temp[1])
                            price = price.split('/')[0]
                            if price[0] != "¥":
                                print(price)
                                return
                            newline.append(price[1:])
                        else:
                            # No price part: category only, empty price.
                            newline.append(re.sub(r'\s+', '', temp[0]))
                            newline.append("")
                    else:
                        newline += ["", ""]
                    newline += fields[5:]
                    if len(newline) != 8:
                        print(newline)
                        return
                    opf.write("; ".join(newline))
                    break
            if not found_flag:
                not_found.append(key)
        # print("===================totally %s product not found=========================" % len(not_found))
        # for nf in not_found:
        #     print(nf)
# Script entry point: run as ``python find_brand.py`` from the directory
# holding products.csv (argv is accepted but currently unused).
if __name__ == '__main__':
    findBrand(sys.argv)
\ No newline at end of file
product_in_list.py
deleted
100644 → 0
View file @
d59311ba
import
cv2
import
uiautomator2
as
u2
import
numpy
as
np
import
sys
,
os
,
time
# Field separator used for rows written to products.csv.
delim = "; "
# NOTE(review): defined but never referenced in this script.
filter_scroll_steps = 2
# Directory where products.csv and the product images are written.
output_dir = "list2"
def cropImg(img, out, bounds, delta=48):
    """Crop *img* to *bounds* shrunk by *delta* pixels per side and save it.

    img    -- OpenCV image (numpy array, indexed as [y0:y1, x0:x1]).
    out    -- output file path passed to cv2.imwrite.
    bounds -- mapping with 'top', 'bottom', 'left', 'right' pixel coordinates.
    delta  -- margin trimmed from every edge; default 48 (= 20 + 28 in the
              original, presumably widget border plus shadow -- TODO confirm).
    """
    top = int(bounds['top']) + delta
    bottom = int(bounds['bottom']) - delta
    left = int(bounds['left']) + delta
    right = int(bounds['right']) - delta
    cropped = img[top:bottom, left:right]  # 裁剪坐标为[y0:y1, x0:x1]
    cv2.imwrite(out, cropped)
def getProductImg(d, path, pos):
    """Take a device screenshot and save the region *pos* to *path*.

    d    -- connected uiautomator2 device.
    pos  -- bounds dict ('top'/'bottom'/'left'/'right') handed to cropImg.
    """
    screen = d.screenshot(format='opencv')
    cropImg(screen, path, pos)
def getProductKey(product, names=None):
    """Extract the identifying name of a product list entry.

    Looks up the Chinese (tv_name) and English (tv_name_en) name widgets
    under *product*.  Returns ``(key, keyobj)`` where *key* is the Chinese
    name if present, else the English name, else ""; *keyobj* is the first
    name widget that exists (None if neither does).  If *names* is a list,
    both name strings (possibly "") are appended to it in order.
    """
    keyobj = None
    texts = []
    for rid in ("org.c2h4.afei.beauty:id/tv_name",
                "org.c2h4.afei.beauty:id/tv_name_en"):
        widget = product.child(resourceId=rid)
        if widget.count == 0:
            texts.append("")
        else:
            if keyobj is None:
                keyobj = widget
            texts.append(widget.get_text())
    name, name_en = texts
    # Prefer the Chinese name as the key; fall back to the English one.
    key = name if name != "" else (name_en if name_en != "" else "")
    if names is not None:
        names.extend(texts)
    return key, keyobj
def getProductDetailBasic(product, done):
    """Collect the list-page fields of *product* without opening its page.

    Returns ``(key, keyobj, info)`` where *info* is
    ``[name, name_en, rate_score, assess_count, address]`` (missing widgets
    become ""), or ``(key, keyobj, None)`` when the product has no key or
    was already scraped (*key* in *done*).
    """
    names = []
    key, keyobj = getProductKey(product, names)
    if key in done or key == "":
        return key, keyobj, None
    extras = []
    # Each detail lives in a sibling widget of the name; absent ones yield "".
    for rid in ("org.c2h4.afei.beauty:id/rate_score",
                "org.c2h4.afei.beauty:id/tv_asess_num",
                "org.c2h4.afei.beauty:id/tv_address"):
        widget = keyobj.sibling(resourceId=rid)
        extras.append("" if widget.count == 0 else widget.get_text())
    return key, keyobj, names + extras
def getProductDetail(d, product, done):
    """Open *product*'s detail page, scrape its image and effects, and return
    the full info row, or None if the product was skipped.

    Side effects: saves the product image under output_dir, appends *key* to
    *done*, and presses "back" to return to the list page.
    Raises RuntimeError if the detail page never shows its image (~60 s).
    """
    key, keyobj, basicinfo = getProductDetailBasic(product, done)
    if basicinfo is None:
        # Already scraped or no usable key -- nothing to do.
        return None
    keyobj.click()
    img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")
    # Poll until the detail page's image appears; give up after 300 * 0.2 s.
    wait_cnt = 0
    while img.count == 0:
        time.sleep(0.2)
        wait_cnt += 1
        if (wait_cnt == 300):
            raise RuntimeError("No response from APP")
        img = d(resourceId="org.c2h4.afei.beauty:id/iv_image")
    # wait half second for the image to be stable to take screenshot,
    # otherwise possibly the image will be different somehow
    time.sleep(0.5)
    # NOTE(review): key is used verbatim as a file name; characters like '/'
    # in a product name would break this -- confirm names are safe.
    path = output_dir + '/' + key + ".jpg"
    getProductImg(d, path, img.info['bounds'])
    # Gather the effect tags, skipping the label that starts with "功效".
    effects = d(resourceId="org.c2h4.afei.beauty:id/rl_effect").child(className="android.widget.TextView")
    temp = []
    for effect in effects:
        if effect.get_text().startswith("功效"):
            continue
        else:
            temp.append(effect.get_text())
    effects = " ".join(temp)
    # Return to the list page and let the UI settle before the next product.
    d.press("back")
    time.sleep(1)
    done.append(key)
    basicinfo.append(effects)
    return basicinfo
def saveProduct(f, res, product_type):
    """Append *product_type* to the row *res* (mutating it), echo the
    finished row to stdout, and write it to *f* with a trailing newline."""
    res.append(product_type)
    row = delim.join(res)
    print(row)
    f.write(row + "\n")
def getAllProductUiObj(d):
    """Return the UI objects of every product row currently in the list
    container of device *d*."""
    container = d(resourceId="org.c2h4.afei.beauty:id/rv_container")
    return container.child(className="android.widget.RelativeLayout")
def getAllProducts(d, f, product_type):
    """Scroll through the product list on device *d*, scraping each product
    and writing rows tagged *product_type* to file *f*.

    Scrolls by 10% of the screen per step; stops after 11 consecutive
    steps with nothing new (one full screen of already-seen products), or
    returns early once 100 products have been scraped.  Returns the number
    of products scraped.
    """
    done = []          # keys already scraped in this session
    nocnt = 0          # consecutive scroll steps that yielded nothing new
    scraped_cnt = 0
    scroll_scale = 0.1
    while True:
        products = getAllProductUiObj(d)
        # Pick the product whose top edge is fully visible (not cut off by
        # the screen edge) as the next candidate.
        product = None
        for temp in products:
            if temp.info['bounds']['top'] == temp.info['visibleBounds']['top']:
                product = temp
                break
        # NOTE(review): if no row qualifies, product stays None and
        # getProductDetail will fail on it -- confirm this cannot happen.
        res = getProductDetail(d, product, done)
        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
            if scraped_cnt % 10 == 0:
                f.flush()
            # Hard cap: stop after 100 products for this category.
            if scraped_cnt == 100:
                return scraped_cnt
            nocnt = 0
        else:
            nocnt += 1
            # A full screen's worth of steps with nothing new: assume we
            # have reached the end of the list.
            if nocnt == int(1 / scroll_scale) + 1:
                break
        d.swipe_ext("up", scale=scroll_scale)
    # handle the last few products in the list
    products = getAllProductUiObj(d)
    for idx in range(1, products.count):
        res = getProductDetail(d, products[idx], done)
        if res is not None:
            saveProduct(f, res, product_type)
            scraped_cnt += 1
    return scraped_cnt
# Earlier run covered the skincare categories:
# tap_list = ["洁面", "护肤水", "精华", "乳液·面霜", "防晒", "眼部", "清洁面膜", "面膜", "唇部", "身体"]
# Category tabs (makeup / perfume) to scrape; getProductByType selects one
# by the index given on the command line.
tap_list = ["卸妆", "妆前", "粉底", "口红", "眉部", "眼影", "眼线", "睫毛", "腮红", "定妆", "遮瑕·修容", "香水"]
def _scrapeTap(d, f, tap):
    """Click the category tab named *tap* and scrape its product list into *f*.

    Returns the number of products scraped, or None when the tab is not
    uniquely present on screen (a diagnostic is printed in that case).
    """
    tap_ui_obj = d(text=tap)
    if tap_ui_obj.count == 1:
        d(text=tap).click()
    else:
        print("Tap %s (%d) is not found!" % (tap, tap_ui_obj.count))
        return None
    time.sleep(1)  # let the product list load
    temp_cnt = getAllProducts(d, f, tap)
    f.flush()
    print("--------------------- tap summury ---------------------")
    print("%s products are scraped from %s" % (temp_cnt, tap))
    print("-------------------------------------------------------------")
    return temp_cnt


def getProductByType(argv):
    """Scrape one category tab: ``python product_in_list.py <index into tap_list>``.

    Expects the app to already show the category tab bar.  Appends rows to
    ``output_dir/products.csv``.
    """
    target_idx = int(argv[1])
    d = u2.connect_usb('d52196830204')  # device serial is hard-coded
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            _scrapeTap(d, f, tap_list[target_idx])
    finally:
        # Always restore rotation, even when the tab is missing or scraping
        # raises (the original leaked the frozen state on early return).
        d.freeze_rotation(False)


def getProductByType2(argv):
    """Scrape every category tab in tap_list in order (*argv* is unused).

    Stops at the first tab that cannot be found, matching the original
    behavior.
    """
    d = u2.connect_usb('d52196830204')
    print(d.app_current())
    d.freeze_rotation()
    try:
        with open("%s/products.csv" % output_dir, 'a+') as f:
            for tap in tap_list:
                if _scrapeTap(d, f, tap) is None:
                    return
    finally:
        d.freeze_rotation(False)
# Script entry point: scrape a single category tab selected by argv[1].
if __name__ == '__main__':
    getProductByType(sys.argv)
    # getProductByType2(sys.argv)  # alternative: scrape all tabs in one run
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment