# -*- coding:utf-8 -*- # @Time : 2019/12/11 18:58 # @Author : litao import numpy as np import random import json, redis, re, requests from selenium.webdriver import ActionChains import time, datetime, copy from selenium import webdriver from PIL import Image import os from selenium.webdriver.support.ui import WebDriverWait import cv2 from fontTools.ttLib import * from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration from concurrent.futures import ProcessPoolExecutor rds_list = redis.StrictRedis(host='192.168.17.60', port=6379, db=1, decode_responses=True) rds_single = redis.StrictRedis(host='192.168.17.60', port=6379, db=0, decode_responses=True) # rds_get = redis.StrictRedis(host='192.168.17.60', port=6379, db=15, decode_responses=True) class CrawlerMain(object): def __init__(self): self.chrome_options = webdriver.ChromeOptions() self.chrome_options.add_argument('--headless') self.chrome_options.add_argument('--disable-gpu') # self.chrome_options.add_argument("--start-maximized") self.chrome_options.add_argument("--no-sandbox") self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) self.timestamp = str(datetime.datetime.now().timestamp() * 1e3) prefs = {"profile.managed_default_content_settings.images": 2} self.chrome_options.add_experimental_option("prefs", prefs) # self.driver = webdriver.Chrome(options=self.chrome_options) self.one_video_dic = { "platform": "bilibili", "ID": "", "title": "", "url": "", "describe": "", "video_count": "", "sum_duration": "", "year": "", "provider": "", "style_tags": "", "project_tags":"", "language": "", "area": "", "if_pay": "", "play_count_sum": "", "play_heat": "", "rate": "", "favorite_count_sum": "", "comment_count_sum": "", "barrage_count": "", } self.single_video_dic = { "platform": "bilibili", "vid": "", "title": "", "video_title": "", "url": "", "video_url":"", "describe": "", "video_count": "", "duration": "", "year": "", "provider": "", "language": "", "area": "", "if_pay": "", "play_count": "", "play_heat": "", "rate": "", "favorite_count": "", "comment_count": "", "barrage_count": "", } def __exit__(self): self.driver.close() def list_page(self, url=None, video_list_xpath=None, play_count_xpath=None, if_free=None, title=None, describe=None, project_tag_xpath=None, provider=None, year=None, next_page_xpath=None, roll=None, project_tag=None, style_tags=None ): # js = "var q=document.documentElement.scrollTop=%s" % roll # self.driver.execute_script(js) while True: # self.driver.get(url) # self.driver.implicitly_wait(5) time.sleep(0.2) # self.driver.execute_script(js) try: next_page_obj = self.driver.find_elements_by_xpath(next_page_xpath) except: # self.driver.get(self.driver.current_url) continue vidoe_list_obj = self.driver.find_elements_by_xpath(video_list_xpath) try: for one_video in vidoe_list_obj: if_pay = "" str_res_list = one_video.text.split("\n") if len(str_res_list) == 4: play_count_str, if_pay, title_str, video_count_str = str_res_list elif len(str_res_list) == 3: play_count_str, title_str, video_count_str = str_res_list else: play_count_str, title_str = str_res_list video_count_str = "" play_count = trans_play_count(play_count_str) project_name = "bilibili_%s" % title_str url = one_video.find_element_by_xpath("./a[1]").get_attribute('href') data_dic = { "play_count": play_count, "url": url, "title": title_str, "video_count": video_count_str, "if_pay": if_pay } if style_tags: temp_dic = {} temp_dic["style_tags"] = style_tags data_dic.update(temp_dic) if project_tag: temp_dic = {} temp_dic["project_tags"] = project_tag data_dic.update(temp_dic) if year: temp_dic = {} temp_dic["year"] = year data_dic.update(temp_dic) self.parse_data(data_dic, project_name) if next_page_obj: # next_page_obj[0].click() action = ActionChains(self.driver) action.click(next_page_obj[0]).perform() else: # self.driver.close() break except Exception as e: print(e) self.driver.close() def style_tags(self, url=None, video_list_xpath=None, play_count_xpath=None, if_free=None, title=None, describe=None, style_tags_xpath=None, style=None, project_tag=None, provider=None, year=None, next_page_xpath=None, roll=None, ): self.driver.get(url) time.sleep(0.2) style_tags_obj = self.driver.find_elements_by_xpath(style_tags_xpath) for style_tag in style_tags_obj: style_tags = style_tag.text print(style_tags) action = ActionChains(self.driver) action.click(style_tag).perform() if style_tags == "全部": continue year=style_tags # if True: self.list_page(url=self.driver.current_url, video_list_xpath=video_list_xpath, play_count_xpath=play_count_xpath, if_free=if_free, title=title, describe=describe, project_tag_xpath=style_tags_xpath, project_tag=project_tag, style_tags=style, provider=provider, year=year, next_page_xpath=next_page_xpath, roll=roll,) def parse_data(self, data_dic, project_name): res = rds_list.hgetall(project_name) if res: if not res.get("project_tags"): res["project_tags"] = "" if data_dic.get("style_tags"): if data_dic.get("style_tags") not in res["style_tags"]: data_dic["style_tags"] = res["style_tags"] + "," + data_dic.get("style_tags") if data_dic.get("project_tags"): if data_dic.get("project_tags") not in res["project_tags"]: data_dic["project_tags"] = res["project_tags"] + "," + data_dic.get("project_tags") if data_dic.get("provider"): if data_dic.get("provider") not in res["provider"]: data_dic["provider"] = res["provider"] + "," + data_dic.get("provider") res.update(data_dic) rds_list.hmset(project_name, res) else: data = copy.deepcopy(self.one_video_dic) data.update(data_dic) rds_list.hmset(project_name, data) def parse_single_data(self, data_dic, project_name): res = rds_single.hgetall(project_name) if res: if not res.get("project_tags"): res["project_tags"] = "" if data_dic.get("style_tags"): if data_dic.get("style_tags") not in res["style_tags"]: data_dic["style_tags"] = res["style_tags"] + "," + data_dic.get("style_tags") if data_dic.get("project_tags"): if data_dic.get("project_tags") not in res["project_tags"]: data_dic["project_tags"] = res["project_tags"] + "," + data_dic.get("project_tags") if data_dic.get("provider"): if data_dic.get("provider") not in res["provider"]: data_dic["provider"] = res["provider"] + "," + data_dic.get("provider") res.update(data_dic) rds_single.hmset(project_name, res) else: data = copy.deepcopy(self.single_video_dic) data.update(data_dic) rds_single.hmset(project_name, data) def video_page(self,task=0): self.driver = webdriver.Chrome(options=self.chrome_options) self.driver.maximize_window() has_data = rds_get.dbsize() while has_data: keys = rds_get.randomkey() res = rds_get.hgetall(keys) has_data = rds_get.dbsize() # print(has_data) # print(res) # self.driver.execute_script('window.open("%s");' % res["url"]) # 关闭到当前窗口 # self.driver.close() time.sleep(0.2) # for handle in self.driver.window_handles: # self.driver.switch_to.window(handle) self.driver.get(res["url"]) self.driver.execute_script("window.scrollBy(0,1000)") self.driver.implicitly_wait(10) title = self.driver.find_element_by_xpath("//a[@class='media-title']").text print("task ",str(task),title) detail_page_url_obj = self.driver.find_element_by_xpath("//a[@class='media-title']") detail_page_url = detail_page_url_obj.get_attribute("href") print(detail_page_url) time.sleep(1) discribe_info = self.driver.find_element_by_xpath("//div[@class='media-count']") discribe_info_str = discribe_info.text play_count,barrage_count,favorite_count = discribe_info_str.split(" · ") # print(discribe_info_str) play_count=trans_play_count(play_count) if not play_count: print(discribe_info_str) play_count="--" rate_list = self.driver.find_elements_by_xpath("//div[@class='media-rating']") if rate_list: rate = rate_list[0].text else: rate = "" # print(rate) describe = self.driver.find_element_by_xpath("//a[@class='media-desc webkit-ellipsis']").text describe = describe.split("\n",-1)[0] # print(describe) dic = {"describe":describe, "rate":rate, "play_count":play_count, "barrage_count":barrage_count, "favorite_count":favorite_count, } project_name = "bilibili_%s" % title self.parse_data(dic,project_name) self.one_video_page(title,res["url"]) rds_get.delete(keys) def one_video_page(self,title,url): video_obj_list = self.driver.find_elements_by_xpath("//div[@id='eplist_module']//li") if video_obj_list: # time.sleep(0.1) action = ActionChains(self.driver) video_name_tags = self.driver.find_elements_by_xpath("//i[@class='mode-change iconfont icon-ep-list-simple']") if video_name_tags: # time.sleep(0.1) action.move_to_element(video_name_tags[0]).click().perform() del action time.sleep(0.1) video_obj_list = self.driver.find_elements_by_xpath("//div[@id='eplist_module']//li") video_obj= self.driver.find_element_by_xpath("//div[@id='eplist_module']//li") for video_count,video_obj in enumerate(video_obj_list): self.driver.implicitly_wait(10) action = ActionChains(self.driver) action.click(video_obj).perform() del action self.driver.execute_script("window.scrollBy(0,1000)") time.sleep(0.2) video_title = video_obj.text if_pay = "" # print(video_title) if "\n" in video_title: video_title,if_pay = video_title.split("\n",-1) # print(self.driver.page_source) comment_count_list = self.driver.find_elements_by_xpath("//span[@class='results']") if comment_count_list: comment_count = comment_count_list[0].text else: comment_count = 0 # print(comment_count) video_id = self.driver.find_element_by_xpath("//a[@class='av-link']") video_url = video_id.get_attribute("href") # print(video_url) # print(video_id.text) barrage_count_list = self.driver.find_elements_by_xpath("//span[@class='bilibili-player-video-info-danmaku-number']") if barrage_count_list: barrage_count = barrage_count_list[0].text else: barrage_count = "-" # print(barrage_count) duration = self.driver.find_elements_by_xpath('//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]') try: duration = trans_duration(duration[0].text) print(video_count,duration) except: duration = "" # print(duration) project_name = "bilibili_%s_%s" % (title,video_title) dic = { "title": title, "video_title": video_title, "if_pay":if_pay, "comment_count":comment_count, "url":url, "video_url":video_url, "video_id":video_id.text, "barrage_count":barrage_count, "duration":duration, "video_count":video_count + 1 } self.parse_single_data(dic,project_name) else: self.driver.execute_script("window.scrollBy(0,1000)") # time.sleep(0.4) video_title = self.driver.find_element_by_xpath('//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[1]/div[1]').text if_pay = "" # print(video_title) if "\n" in video_title: video_title, if_pay = video_title.split("\n", -1) # print(self.driver.page_source) comment_count = self.driver.find_element_by_xpath("//span[@class='results']").text # print(comment_count) video_id = self.driver.find_element_by_xpath("//a[@class='av-link']") video_url = video_id.get_attribute("href") # print(video_url) # print(video_id.text) barrage_count = self.driver.find_element_by_xpath( "//span[@class='bilibili-player-video-info-danmaku-number']").text # print(barrage_count) duration = self.driver.find_elements_by_xpath( '//*[@id="bilibiliPlayer"]/div[1]/div[1]/div[10]/div[2]/div[2]/div[1]/div[3]/div/span[3]') try: duration = trans_duration(duration[0].text) print(duration) except: duration = 0 project_name = "bilibili_%s_%s" % (title, video_title) dic = { "title": title, "video_title": video_title, "if_pay": if_pay, "comment_count": comment_count, "url": url, "video_url": video_url, "video_id": video_id.text, "barrage_count": barrage_count, "duration": duration, "video_count":1 } self.parse_single_data(dic, project_name) def detail_page(self): pass if __name__ == "__main__": # crlawler_bilbili = CrawlerMain() # crlawler_bilbili.list_page(url="https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=-1&release_date=-1&season_status=1&order=2&st=3&sort=0&page=1", # video_list_xpath="//ul[@class='bangumi-list clearfix']//li", # next_page_xpath="//a[@class='p next-page']", # roll=1200 # ) # style_tags_dic = { # "历史":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10033&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "美食":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10045&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "人文":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10065&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "科技":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10066&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "探险":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10067&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "宇宙":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10068&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "萌宠":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10069&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "社会":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10070&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "动物":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10071&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "自然":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10072&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "医疗":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10073&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "军事":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10074&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "灾难":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10064&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "罪案":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10075&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "神秘":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10076&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "旅行":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10077&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "运动":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=10038&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "电影":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=-10&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # } # for style in style_tags_dic: # try: # crlawler_bilbili = CrawlerMain() # crlawler_bilbili.style_tags( # url=style_tags_dic[style], # video_list_xpath="//ul[@class='bangumi-list clearfix']//li", # next_page_xpath="//a[@class='p next-page']", # roll=1200,style=style # ) # except: # continue # project_dic = { # "央视":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=4&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "BBC":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "探索频道":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=7&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "NHK":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=2&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "历史频道":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=6&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "卫视":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=8&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "自制":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=9&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "ITV":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=5&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "SKY":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=3&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "ZDF":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=10&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "合作机构":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=11&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "国内其他":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=12&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # "国外其他":"https://www.bilibili.com/documentary/index/#style_id=-1&producer_id=13&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # # } # for project in project_dic: # try: # crlawler_bilbili = CrawlerMain() # crlawler_bilbili.style_tags( # url=project_dic[project], # video_list_xpath="//ul[@class='bangumi-list clearfix']//li", # next_page_xpath="//a[@class='p next-page']", # roll=1200,project_tag=project # ) # except: # continue # project_dic = { # "2020":"https://www.bilibili.com/documentary/index/?spm_id_from=333.6.b_7375626e6176.6#style_id=-1&producer_id=-1&release_date=-1&season_status=-1&order=2&st=3&sort=0&page=1", # } # crlawler_bilbili = CrawlerMain() # for project in project_dic: # crlawler_bilbili.style_tags( # url=project_dic[project], # video_list_xpath="//ul[@class='bangumi-list clearfix']//li", # next_page_xpath="//a[@class='p next-page']", # roll=1200,year=project, # style_tags_xpath="/html[1]/body[1]/div[2]/div[2]/div[2]/div[2]/div[3]/ul[1]/li" # ) executor = ProcessPoolExecutor(max_workers=6) futures = [] crlawler_bilbili = CrawlerMain() # process_task = crlawler_bilbili.video_page for one_scan in range(3): crlawler_bilbili = CrawlerMain() future = executor.submit(crlawler_bilbili.video_page,task=one_scan) futures.append(future) # crlawler_bilbili.video_page(one_scan) executor.shutdown(True) # crlawler_bilbili = CrawlerMain() # crlawler_bilbili.video_page()