# -*- coding:utf-8 -*-
# @Time : 2019/4/17 9:15
# @Author : litao
"""Crawler for Kuaishou ("kwai") releaser pages.

Provides ``Crawler_kwai``, which fetches follower counts, avatar URLs and
video lists for a releaser through the live.kuaishou.com GraphQL endpoints
and the kpfshanghai.m.chenzhongtech.com mobile endpoints.
"""
import os
import re
import time
import copy
import requests
import datetime
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.util_logging import logged
from fontTools.ttLib import *
from crawler.crawler_sys.utils.func_verification_code import Login

try:
    from crawler_sys.framework.func_get_releaser_id import *
except ImportError:
    from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random
import urllib
import urllib.parse  # urlencode() is used below; "import urllib" alone does not guarantee the submodule

# User-Agent sent with every request to live.kuaishou.com.
_DESKTOP_USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36")

# GraphQL document for the "userInfoQuery" operation (follower count / avatar).
_USER_INFO_QUERY = "query userInfoQuery($principalId: String) {\n userInfo(principalId: $principalId) {\n id\n principalId\n kwaiId\n eid\n userId\n profile\n name\n description\n sex\n constellation\n cityName\n living\n watchingCount\n isNew\n privacy\n feeds {\n eid\n photoId\n thumbnailUrl\n timestamp\n __typename\n }\n verifiedStatus {\n verified\n description\n type\n new\n __typename\n }\n countsInfo {\n fan\n follow\n photo\n liked\n open\n playback\n private\n __typename\n }\n bannedStatus {\n banned\n defriend\n isolate\n socialBanned\n __typename\n }\n __typename\n }\n}\n"

# GraphQL document for the "publicFeedsQuery" operation (video list, PC endpoint).
_PUBLIC_FEEDS_QUERY = "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n live {\n user {\n id\n avatar\n name\n __typename\n }\n watchingCount\n poster\n coverUrl\n caption\n id\n playUrls {\n quality\n url\n __typename\n }\n quality\n gameInfo {\n category\n name\n pubgSurvival\n type\n kingHero\n __typename\n }\n hasRedPack\n liveGuess\n expTag\n __typename\n }\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n}\n"


class Crawler_kwai():
    """Fetch releaser statistics and video lists from Kuaishou.

    Attributes:
        timeout: request timeout in seconds (10 when not given).
        platform: platform tag written into every video record.
        video_data: template record copied for each crawled video.
        first_page_headers: desktop headers for live.kuaishou.com.
        loginObj: ``Login`` helper that supplies cookies and the font map.
        cookie_dic / uni_code_dic: set by ``get_cookies_and_font``.
    """

    # Upper bound on back-to-back request exceptions inside the paging loops,
    # so a dead network/proxy cannot spin the crawl forever.
    _MAX_CONSECUTIVE_REQUEST_FAILURES = 10

    def __init__(self, timeout=None, platform='kwai'):
        """Build the record template, default headers and login helper.

        Args:
            timeout: request timeout in seconds; ``None`` means 10.
            platform: platform name stored in ``video_data['platform']``.
        """
        if timeout is None:
            self.timeout = 10
        else:
            self.timeout = timeout
        self.platform = platform
        self.TotalVideo_num = None
        self.midstepurl = None
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal']
        for key in unused_key_list:
            # Tolerate a template that already lacks one of these keys.
            self.video_data.pop(key, None)
        self.first_page_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "live.kuaishou.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": _DESKTOP_USER_AGENT,
        }
        self.loginObj = Login()
        self.get_cookies_and_front = self.loginObj.get_cookies_and_front

    # def __del__(self):
    #     try:
    #         self.driver.quit()
    #         os.remove(self.plugin_path)
    #     except:
    #         pass

    def create_proxyauth_extension(self, proxy_host, proxy_port,
                                   proxy_username, proxy_password,
                                   scheme='http', plugin_path=None):
        """Proxy Auth Extension

        Writes a zip archive containing a Chrome extension (manifest.json and
        background.js) that configures a fixed proxy and answers its
        authentication challenge.

        args:
            proxy_host (str): domain or ip address, ie proxy.domain.com
            proxy_port (int): port
            proxy_username (str): auth username
            proxy_password (str): auth password
        kwargs:
            scheme (str): proxy scheme, default http
            plugin_path (str): absolute path of the extension; when omitted a
                timestamped file name under /home/hanye is used

        return str -> plugin_path (also stored on ``self.plugin_path``)
        """
        import string
        import zipfile

        if plugin_path is None:
            plugin_path = '/home/hanye/vimm_chrome_proxyauth_plugin_%s.zip' % int(
                datetime.datetime.now().timestamp() * 1e3)

        manifest_json = """
        {
            "version": "1.0.0",
            "manifest_version": 2,
            "name": "Chrome Proxy",
            "permissions": [
                "proxy",
                "tabs",
                "unlimitedStorage",
                "storage",
                "<all_urls>",
                "webRequest",
                "webRequestBlocking"
            ],
            "background": {
                "scripts": ["background.js"]
            },
            "minimum_chrome_version":"22.0.0"
        }
        """

        background_js = string.Template(
            """
            var config = {
                mode: "fixed_servers",
                rules: {
                    singleProxy: {
                        scheme: "${scheme}",
                        host: "${host}",
                        port: parseInt(${port})
                    },
                    bypassList: ["foobar.com"]
                }
            };

            chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

            function callbackFn(details) {
                return {
                    authCredentials: {
                        username: "${username}",
                        password: "${password}"
                    }
                };
            }

            chrome.webRequest.onAuthRequired.addListener(
                callbackFn,
                {urls: ["<all_urls>"]},
                ['blocking']
            );
            """
        ).substitute(
            host=proxy_host,
            port=proxy_port,
            username=proxy_username,
            password=proxy_password,
            scheme=scheme,
        )
        with zipfile.ZipFile(plugin_path, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        self.plugin_path = plugin_path
        return plugin_path

    def get_cookies_and_font(self, releaserUrl):
        """Refresh ``self.cookie_dic`` and ``self.uni_code_dic`` for a releaser page.

        Both values come from ``Login.get_cookies_and_front`` (bound in
        ``__init__``); nothing is returned.
        """
        self.cookie_dic, self.uni_code_dic = self.get_cookies_and_front(releaserUrl)

    def get_releaser_id(self, releaserUrl):
        """Return the releaser id that the shared helper extracts from the URL."""
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    @staticmethod
    def re_cal_count(count_num):
        """Convert a displayed count into an integer.

        Args:
            count_num: an int (returned as is) or a string such as ``"123"``
                or ``"1.2w"`` where the ``w`` suffix multiplies by 10000.

        Returns:
            The integer value, or ``False`` when the input is neither an int
            nor a parsable string (including the empty string).
        """
        if isinstance(count_num, int):
            return count_num
        if isinstance(count_num, str):
            try:
                if count_num.endswith("w"):
                    # round() instead of truncation: 0.57 * 10000 is
                    # 5699.999... in binary floating point.
                    return int(round(float(count_num[:-1]) * 10000))
                return int(count_num)
            except ValueError:
                return False
        return False

    # Kept for reference: an earlier approach that derived the glyph -> digit
    # map (the role ``self.uni_code_dic`` plays in ``unicode_to_num``) from
    # the font's XML dump.
    # def get_num_dic(self):
    #     xml_re = {
    #         '<TTGlyph name="(.*)" xMin="32" yMin="-6" xMax="526" yMax="729">': 0,
    #         '<TTGlyph name="(.*)" xMin="32" yMin="7" xMax="526" yMax="742">': 0,
    #         '<TTGlyph name="(.*)" xMin="98" yMin="13" xMax="363" yMax="726">': 1,
    #         '<TTGlyph name="(.*)" xMin="98" yMin="26" xMax="363" yMax="739">': 1,
    #         '<TTGlyph name="(.*)" xMin="32" yMin="13" xMax="527" yMax="732">': 2,
    #         '<TTGlyph name="(.*)" xMin="32" yMin="26" xMax="527" yMax="745">': 2,
    #         '<TTGlyph name="(.*)" xMin="25" yMin="-6" xMax="525" yMax="730">': 3,
    #         '<TTGlyph name="(.*)" xMin="25" yMin="7" xMax="525" yMax="743">': 3,
    #         '<TTGlyph name="(.*)" xMin="26" yMin="13" xMax="536" yMax="731">': 4,
    #         '<TTGlyph name="(.*)" xMin="26" yMin="26" xMax="536" yMax="744">': 4,
    #         '<TTGlyph name="(.*)" xMin="33" yMin="-5" xMax="526" yMax="717">': 5,
    #         '<TTGlyph name="(.*)" xMin="33" yMin="8" xMax="526" yMax="730">': 5,
    #         '<TTGlyph name="(.*)" xMin="39" yMin="-5" xMax="530" yMax="732">': 6,
    #         '<TTGlyph name="(.*)" xMin="39" yMin="8" xMax="530" yMax="745">': 6,
    #         '<TTGlyph name="(.*)" xMin="38" yMin="13" xMax="536" yMax="717">': 7,
    #         '<TTGlyph name="(.*)" xMin="38" yMin="26" xMax="536" yMax="730">': 7,
    #         '<TTGlyph name="(.*)" xMin="33" yMin="-7" xMax="525" yMax="731">': 8,
    #         '<TTGlyph name="(.*)" xMin="33" yMin="6" xMax="525" yMax="744">': 8,
    #         '<TTGlyph name="(.*)" xMin="37" yMin="-7" xMax="521" yMax="730">': 9,
    #         '<TTGlyph name="(.*)" xMin="37" yMin="6" xMax="521" yMax="743">': 9
    #     }
    #     uni_code_dic = {}
    #     try:
    #         for re_code in xml_re:
    #             code_dic = re.findall(re_code, self.xml_text)
    #             if code_dic:
    #                 uni_code_dic[code_dic[0].replace("uni", "\\\\u").lower()] = xml_re[re_code]
    #         print("uni_code_dic", uni_code_dic)
    #         return uni_code_dic
    #     except:
    #         print(self.xml_text)
    #         return False

    def unicode_to_num(self, uni_str):
        """Replace obfuscated glyph escapes in ``uni_str`` with their digits.

        The string is rendered as the repr of its ``unicode_escape`` bytes
        (without the ``b'`` / ``'`` wrapper) and every key of
        ``self.uni_code_dic`` found in it is replaced by the mapped value.

        Returns:
            The substituted string (not yet converted to a number).
        """
        count_num = str(uni_str.encode("unicode_escape"))[2:-1]
        for escaped_glyph in self.uni_code_dic:
            if escaped_glyph in count_num:
                count_num = count_num.replace(escaped_glyph, str(self.uni_code_dic[escaped_glyph]))
        return count_num

    @staticmethod
    def _send_with_direct_fallback(send, url, proxies, proxy_timeout, direct_timeout, **request_kwargs):
        """Call ``send`` (``requests.get``/``requests.post``) via proxy, then directly.

        Returns the response, or ``None`` when both attempts raise.
        """
        try:
            return send(url, proxies=proxies, timeout=proxy_timeout, **request_kwargs)
        except Exception:
            # Best effort: any proxy-side failure falls back to a direct request.
            pass
        try:
            return send(url, timeout=direct_timeout, **request_kwargs)
        except Exception as err:
            print("request to %s failed: %s" % (url, err))
            return None

    # @logged
    def get_releaser_follower_num_pc(self, releaserUrl):
        """Read follower count and avatar through the live.kuaishou.com GraphQL API.

        Up to five attempts are made; each one fetches fresh proxies and
        cookies. Cookies are dropped through ``loginObj.delete_cookies`` when
        the response carries ``errors`` and again after the final failure.

        Returns:
            ``(follower_num, releaser_img)``, or ``(None, None)`` when the
            releaser id cannot be resolved or every attempt fails.
        """
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return None, None
        headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "content-type": "application/json",
            "Referer": releaserUrl,
            "Origin": "https://live.kuaishou.com",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "live.kuaishou.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": _DESKTOP_USER_AGENT,
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
        }
        post_url = 'https://live.kuaishou.com/graphql'
        post_dic = {"operationName": "userInfoQuery",
                    "variables": {"principalId": releaser_id},
                    "query": _USER_INFO_QUERY}
        max_attempts = 5
        for attempt in range(max_attempts):
            proxies = get_proxy(proxies_num=1)
            self.get_cookies_and_font(releaserUrl)
            releaser_page = self._send_with_direct_fallback(
                requests.post, post_url, proxies, 2, self.timeout,
                headers=headers, cookies=self.cookie_dic, json=post_dic)
            res_dic = None
            if releaser_page is not None:
                try:
                    res_dic = releaser_page.json()
                except ValueError:
                    # Non-JSON body: count it as a failed attempt.
                    res_dic = None
            if isinstance(res_dic, dict):
                print(res_dic)
                if res_dic.get("errors"):
                    self.loginObj.delete_cookies(self.cookie_dic)
                try:
                    releaser_follower_num_str = res_dic["data"]["userInfo"]["countsInfo"]["fan"]
                    releaser_follower_num = self.re_cal_count(
                        self.unicode_to_num(releaser_follower_num_str))
                    print(releaser_follower_num)
                    releaser_img = self.get_releaser_image(data=res_dic)
                    return releaser_follower_num, releaser_img
                except (KeyError, TypeError, AttributeError, IndexError, ValueError):
                    pass
            if attempt == max_attempts - 1:
                self.loginObj.delete_cookies(self.cookie_dic)
        return None, None

    def get_releaser_follower_num(self, releaserUrl):
        """Read follower count and avatar from the mobile share page HTML.

        Up to five attempts are made, each with fresh proxies and a fresh
        random ``fid`` / timestamp in the query string.

        Returns:
            ``(follower_num, releaser_img)``, or ``(None, None)`` when the
            releaser id cannot be resolved or every attempt fails.
        """
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return None, None
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Connection": "keep-alive",
            "Host": "kpfshanghai.m.chenzhongtech.com",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36"
        }
        for _attempt in range(5):
            proxies = get_proxy(proxies_num=1)
            get_body = {
                "fid": str(random.randint(1535120000, 1535130000)),
                "cc": "share_copylink",
                "appType": "21",
                "shareType": "3",
                "et": "null",
                "timestamp": int(datetime.datetime.now().timestamp() * 1e3)
            }
            get_url = 'https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?%s' % (
                releaser_id, urllib.parse.urlencode(get_body))
            releaser_page = self._send_with_direct_fallback(
                requests.get, get_url, proxies, 5, 2, headers=headers)
            if releaser_page is None:
                continue
            res_text = releaser_page.text
            try:
                releaser_follower_num_str = re.findall('<div class="fans-follows"> (.*?)<span', res_text)[0]
                releaser_follower_num = self.re_cal_count(releaser_follower_num_str)
                print(releaser_follower_num)
                releaser_img = re.findall(r'background-image:url\((.*?)\)', res_text)[0]
                return releaser_follower_num, releaser_img
            except IndexError:
                # One of the patterns did not match; try again.
                continue
        return None, None

    def get_releaser_image(self, releaserUrl=None, data=None):
        """Return the releaser's avatar URL (``userInfo.profile``).

        Args:
            releaserUrl: when given, the user info is requested from the
                GraphQL endpoint and ``data`` is ignored.
            data: an already fetched ``userInfoQuery`` response, used when
                ``releaserUrl`` is falsy.

        Returns:
            The avatar URL, or ``None`` when the request path fails.

        Raises:
            KeyError / TypeError: on the ``data`` path when ``data`` lacks
                the expected structure.
        """
        if not releaserUrl:
            releaser_img = data["data"]["userInfo"]["profile"]
            print(releaser_img)
            return releaser_img

        self.get_cookies_and_font(releaserUrl)
        releaser_id = self.get_releaser_id(releaserUrl)
        post_dic = {"operationName": "userInfoQuery",
                    "variables": {"principalId": releaser_id},
                    "query": _USER_INFO_QUERY}
        try:
            releaser_page = requests.post('https://live.kuaishou.com/graphql',
                                          headers=self.first_page_headers,
                                          cookies=self.cookie_dic, json=post_dic,
                                          timeout=self.timeout)
            releaser_img = releaser_page.json()["data"]["userInfo"]["profile"]
        except Exception as err:
            print("get_releaser_image failed: %s" % err)
            return None
        print(releaser_img)
        return releaser_img

    @staticmethod
    def get_video_image(data):
        """Return the ``poster`` entry of a PC feed item."""
        return data.get("poster")

    def releaser_page(self, releaserUrl, **kwargs):
        """Crawl a releaser's videos; delegates to ``releaser_page_web``."""
        return self.releaser_page_web(releaserUrl, **kwargs)
        # self.releaser_page_pc(releaserUrl, **kwargs)

    def _flush_results(self, result_list, output_options):
        """Hand the buffered records to ``output_result`` and empty the buffer."""
        output_result(result_Lst=result_list, platform=self.platform, **output_options)
        print(len(result_list))
        result_list.clear()

    @staticmethod
    def _has_valid_counts(video_dic):
        """Return False when any count is ``False`` (checked with ``is False``)."""
        return not any(video_dic[key] is False
                       for key in ('play_count', 'comment_count', 'favorite_count'))

    def _parse_web_feed_item(self, info_dic):
        """Build a video record from one item of the mobile feed response.

        Raises on malformed items (missing ``share_info``, missing
        ``photoId=`` segment, missing cover) so the caller can skip them.
        """
        video_dic = copy.deepcopy(self.video_data)
        video_dic['title'] = info_dic.get('caption')
        owner_id = info_dic.get("userEid")
        photo_id = None
        for segment in info_dic.get('share_info').split("&"):
            if "photoId=" in segment:
                photo_id = segment.replace("photoId=", "")
                break
        if photo_id is None:
            raise ValueError("no photoId in share_info: %r" % info_dic.get('share_info'))
        video_dic['video_id'] = photo_id
        video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (owner_id, photo_id)
        video_dic['release_time'] = info_dic.get('timestamp')
        video_dic['releaser'] = info_dic.get("userName")
        video_dic['play_count'] = trans_play_count(info_dic.get("viewCount"))
        video_dic['comment_count'] = trans_play_count(info_dic.get("commentCount"))
        video_dic['favorite_count'] = trans_play_count(info_dic.get('likeCount'))
        video_dic['repost_count'] = trans_play_count(info_dic.get('forwardCount'))
        video_dic['fetch_time'] = int(time.time() * 1e3)
        try:
            video_dic['duration'] = int(info_dic.get("ext_params").get("video") / 1000)
        except (AttributeError, TypeError, ValueError):
            video_dic['duration'] = 0
            print("duration error")
        video_dic['releaser_id_str'] = "kwai_%s" % (owner_id)
        video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % owner_id
        video_dic['video_img'] = info_dic.get("coverUrls")[0].get("url")
        return video_dic

    def _parse_pc_feed_item(self, info_dic):
        """Build a video record from one item of the PC ``publicFeeds`` list.

        Raises ``AttributeError`` when ``user`` or ``counts`` is missing.
        """
        video_dic = copy.deepcopy(self.video_data)
        user = info_dic.get('user')
        counts = info_dic.get('counts')
        owner_id = user.get("eid")
        video_dic['title'] = info_dic.get('caption')
        video_dic['url'] = "https://live.kuaishou.com/u/%s/%s" % (owner_id, info_dic.get('id'))
        video_dic['release_time'] = info_dic.get('timestamp')
        video_dic['releaser'] = user.get("name")
        video_dic['play_count'] = trans_play_count(counts.get("displayView"))
        video_dic['comment_count'] = trans_play_count(counts.get("displayComment"))
        video_dic['favorite_count'] = trans_play_count(counts.get("displayLike"))
        video_dic['video_id'] = info_dic.get('id')
        video_dic['fetch_time'] = int(time.time() * 1e3)
        video_dic['releaser_id_str'] = "kwai_%s" % (owner_id)
        video_dic['releaserUrl'] = 'https://live.kuaishou.com/profile/%s' % owner_id
        video_dic['video_img'] = self.get_video_image(info_dic)
        return video_dic

    def releaser_page_web(self, releaserUrl,
                          output_to_file=False,
                          filepath=None,
                          releaser_page_num_max=30,
                          output_to_es_raw=False,
                          es_index=None,
                          doc_type=None,
                          output_to_es_register=False,
                          push_to_redis=False,
                          proxies_num=None,
                          **kwargs):
        """
        get video info from api instead of web page html
        the most scroll page is 1000

        Pages through the mobile ``/rest/kd/feed/profile`` endpoint (100 items
        per request) and passes the records to ``output_result`` in batches.

        Args:
            releaserUrl: releaser page URL; the id is taken from it.
            output_to_file, filepath, output_to_es_raw, es_index, doc_type,
                output_to_es_register: forwarded unchanged to ``output_result``.
            releaser_page_num_max: maximum number of pages to fetch.
            push_to_redis: accepted for interface compatibility; not used here.
            proxies_num: passed to ``get_proxy``; a falsy value means the
                requests are sent without proxies.

        Returns:
            The record buffer, which is always empty on return because every
            record has already been handed to ``output_result``.
        """
        output_options = {
            "output_to_file": output_to_file,
            "filepath": filepath,
            "output_to_es_raw": output_to_es_raw,
            "es_index": es_index,
            "doc_type": doc_type,
            "output_to_es_register": output_to_es_register,
        }
        count = 1
        retry_time = 0
        request_failures = 0
        result_list = []
        releaser_id = self.get_releaser_id(releaserUrl)
        releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
        self.video_data['releaserUrl'] = releaserUrl
        pcursor = 0
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=UTF-8",
            "Cookie": "did=web_790b7bcefe7347c5937a39d34c49f7ed; didv=1583150714000; sid=ab0c3a5497ab3c8fb73c8bef",
            "Host": "kpfshanghai.m.chenzhongtech.com",
            "kpf": "H5",
            "kpn": "KUAISHOU",
            "Origin": "https://kpfshanghai.m.chenzhongtech.com",
            "Referer": "https://kpfshanghai.m.chenzhongtech.com/fw/user/%s?fid=1535125322&cc=share_copylink&shareMethod=TOKEN&docId=0&kpn=KUAISHOU&subBiz=PROFILE&shareId=14810686%s&docABKey=share_textid_profile&shareToken=X-7AeJHKdHOc_-392ps0aWP381Bs&shareResourceType=PROFILE_OTHER&groupABKey=share_group_profile&groupName=&expTag=null&shareObjectId=916251992&shareUrlOpened=0" % (
                releaser_id, random.randint(1000, 9800)),
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        feed_url = "https://kpfshanghai.m.chenzhongtech.com/rest/kd/feed/profile"
        proxies = get_proxy(proxies_num)
        while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
            request_kwargs = {
                "json": {"eid": releaser_id, "count": 100, "pcursor": pcursor},
                "headers": headers,
                "timeout": 10,
            }
            if proxies_num:
                request_kwargs["proxies"] = proxies
            try:
                get_page = requests.post(feed_url, **request_kwargs)
            except Exception as err:
                request_failures += 1
                if request_failures >= self._MAX_CONSECUTIVE_REQUEST_FAILURES:
                    print("giving up on releaser: %s after %s failed requests: %s"
                          % (releaser_id, request_failures, err))
                    break
                proxies = get_proxy(proxies_num)
                continue
            request_failures = 0
            time.sleep(random.randint(3, 5))
            try:
                page_dic = get_page.json()
            except ValueError:
                # Non-JSON body is handled like an empty page below.
                page_dic = {}
            data_list = page_dic.get("feeds") if isinstance(page_dic, dict) else None
            if not data_list:
                print("no more data at releaser: %s page: %s " % (releaser_id, count))
                proxies = get_proxy(proxies_num)
                retry_time += 1
                if retry_time > 3:
                    # From now on the requests are sent without proxies.
                    proxies_num = 0
                    print("no proxies")
                if retry_time > 5:
                    pcursor = "no_more"
                continue

            pcursor = page_dic.get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
            for info_dic in data_list:
                try:
                    video_dic = self._parse_web_feed_item(info_dic)
                except Exception as e:
                    # A malformed item is skipped, not fatal for the crawl.
                    print(e)
                    continue
                if not self._has_valid_counts(video_dic):
                    print(info_dic)
                    continue
                result_list.append(video_dic)
                if len(result_list) >= 100:
                    self._flush_results(result_list, output_options)
        if result_list:
            self._flush_results(result_list, output_options)
        return result_list

    def releaser_page_pc(self, releaserUrl,
                         output_to_file=False,
                         filepath=None,
                         releaser_page_num_max=30,
                         output_to_es_raw=False,
                         es_index=None,
                         doc_type=None,
                         output_to_es_register=False,
                         push_to_redis=False,
                         proxies_num=None,
                         **kwargs):
        """
        get video info from api instead of web page html
        the most scroll page is 1000

        Pages through the live.kuaishou.com ``m_graphql`` endpoint with the
        ``publicFeedsQuery`` operation (100 items per request) and passes the
        records to ``output_result`` in batches.

        Args:
            releaserUrl: releaser page URL; also sent as the ``Referer``.
            output_to_file, filepath, output_to_es_raw, es_index, doc_type,
                output_to_es_register: forwarded unchanged to ``output_result``.
            releaser_page_num_max: maximum number of pages to fetch.
            push_to_redis: accepted for interface compatibility; not used here.
            proxies_num: passed to ``get_proxy``.

        Returns:
            The record buffer, which is always empty on return because every
            record has already been handed to ``output_result``.
        """
        output_options = {
            "output_to_file": output_to_file,
            "filepath": filepath,
            "output_to_es_raw": output_to_es_raw,
            "es_index": es_index,
            "doc_type": doc_type,
            "output_to_es_register": output_to_es_register,
        }
        user_id = "153512{0}".format(random.randint(1000, 9000))
        headers = {
            "accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "content-type": "application/json",
            "Host": "live.kuaishou.com",
            "Origin": "https://live.kuaishou.com",
            "Referer": releaserUrl,
            "User-Agent": _DESKTOP_USER_AGENT,
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "Cookie": "clientid=3; did=web_504e72386a69c6d6172f1457b591%sc; client_key=65890b29; userId=%s" % (
                random.randint(300, 800), user_id)
        }
        count = 1
        retry_time = 0
        request_failures = 0
        result_list = []
        releaser_id = self.get_releaser_id(releaserUrl)
        releaserUrl = 'https://live.kuaishou.com/profile/%s' % releaser_id
        pcursor = None
        self.video_data['releaserUrl'] = releaserUrl
        api_url = 'https://live.kuaishou.com/m_graphql'
        # Initialised up front: the loop reads ``proxies`` on its first pass.
        proxies = get_proxy(proxies_num)
        while count <= releaser_page_num_max and count <= 1000 and pcursor != "no_more":
            variables = {"principalId": releaser_id, "pcursor": pcursor, "count": 100}
            url_dic = {"operationName": "publicFeedsQuery",
                       "variables": variables,
                       "query": _PUBLIC_FEEDS_QUERY}
            request_kwargs = {"headers": headers, "json": url_dic, "timeout": 5}
            if proxies:
                request_kwargs["proxies"] = proxies
            try:
                get_page = requests.post(api_url, **request_kwargs)
            except Exception as err:
                request_failures += 1
                if request_failures >= self._MAX_CONSECUTIVE_REQUEST_FAILURES:
                    print("giving up on releaser: %s after %s failed requests: %s"
                          % (releaser_id, request_failures, err))
                    break
                proxies = get_proxy(proxies_num)
                continue
            request_failures = 0
            time.sleep(0.5)
            try:
                public_feeds = get_page.json()["data"]["publicFeeds"]
                data_list = public_feeds.get("list")
            except (ValueError, KeyError, TypeError, AttributeError):
                # Non-JSON or error payload is handled like an empty page.
                public_feeds, data_list = {}, None
            if not data_list:
                print("no more data at releaser: %s page: %s " % (releaser_id, count))
                retry_time += 1
                if retry_time > 3:
                    # Stop using proxies for the remaining retries.
                    proxies_num = 0
                    proxies = None
                if retry_time > 5:
                    pcursor = "no_more"
                continue

            pcursor = public_feeds.get("pcursor")
            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1
            for info_dic in data_list:
                try:
                    video_dic = self._parse_pc_feed_item(info_dic)
                except Exception as e:
                    # A malformed item is skipped, not fatal for the crawl.
                    print(e)
                    continue
                if not self._has_valid_counts(video_dic):
                    print(info_dic)
                    continue
                result_list.append(video_dic)
                if len(result_list) >= 100:
                    self._flush_results(result_list, output_options)
        if result_list:
            self._flush_results(result_list, output_options)
        return result_list


# test
if __name__ == '__main__':
    test = Crawler_kwai()
    url = 'https://live.kuaishou.com/profile/IIloveyoubaby'
    user_lis = [
        "https://live.kuaishou.com/profile/3xx3vac2uctn2ak",
        "https://live.kuaishou.com/profile/3x4g7ckzcvn4avw",
        "https://live.kuaishou.com/profile/3xwgb2qdek5a9qg",
        "https://live.kuaishou.com/profile/3xf7z2egf9ggpfu",
        "https://live.kuaishou.com/profile/3xgb6xe5iv7y82k",
        "https://live.kuaishou.com/profile/3xkq27yxyxcbr8a",
        "https://live.kuaishou.com/profile/3xzd4ug8znq7kmm",
        "https://live.kuaishou.com/profile/3xbuepqfu433xne",
        "https://live.kuaishou.com/profile/3xsew4zi5ujve4i",
        "https://live.kuaishou.com/profile/3xhpu9g5cs6ai3u",
        "https://live.kuaishou.com/profile/3xvvwv8jyr4xdxg",
        "https://live.kuaishou.com/profile/3xfahqfxfrjri3g",
        "https://live.kuaishou.com/profile/3xvb2z84fsmfnt4",
        "https://live.kuaishou.com/profile/3xceq4ik3q5u9fc",
        "https://live.kuaishou.com/profile/3xp7cughuxajtcm",
        "https://live.kuaishou.com/profile/3xd7vbzuh7mt9ig",
        "https://live.kuaishou.com/profile/3xez8f2yy44fpig",
        "https://live.kuaishou.com/profile/3xpntyij92ba6q2",
        "https://live.kuaishou.com/profile/3x3xhbiiqb5iygw",
        "https://live.kuaishou.com/profile/3xhdbritwn5j7s2",
        "https://live.kuaishou.com/profile/3xnd62irvsk6i4i",
        "https://live.kuaishou.com/profile/3x5xid3skdu67pi",
        "https://live.kuaishou.com/profile/3xpei5y5psgadqs",
        "https://live.kuaishou.com/profile/3xiuijxfzpz7sei",
        "https://live.kuaishou.com/profile/3xuc5xaw5b4qqrw",
        "https://live.kuaishou.com/profile/3xukww4cfe3bbbu",
        "https://live.kuaishou.com/profile/3x7vg8ki2fgbxhw",
        "https://live.kuaishou.com/profile/3xz89yf92agnk3i",
        "https://live.kuaishou.com/profile/3xr6c3adec9za9w",
        "https://live.kuaishou.com/profile/3xa3msq9yvt4zfe",
        "https://live.kuaishou.com/profile/3xyijdkrxfnbmbm",
        "https://live.kuaishou.com/profile/3xiwtf8z4if25fe",
    ]
    for u in user_lis:
        ttt = test.releaser_page(releaserUrl=u, output_to_es_raw=True,
                                 es_index='crawler-data-raw',
                                 doc_type='doc',
                                 releaser_page_num_max=5, proxies_num=5)
        # test.get_releaser_follower_num(u)
        # break
    # sr_tud = test.search_page(keyword='任正非 BBC', search_pages_max=2)