# -*- coding: utf-8 -*-
"""
Created on Sat Feb 24 14:19:00 2018
independent function:
1 list page
2 search page
3 video page
4 releaser page
All function take parameters and return dict list.
parameters:
1 list page: url
2 search page: keyword
3 video page: url
4 releaser page: url
@author: fangyucheng
"""
import datetime
import json
import re

import requests
from bs4 import BeautifulSoup

class Crawler_v_qq:
def video_page(self, url):
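        """Scrape a single video page and return a dict with title, target
        tags, play count, releaser, intro, release time, duration, releaser
        URL, video URL and fetch time."""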
        get_page = requests.get(url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        try:
            title = soup.find('h1', {'class': 'video_title _video_title'}).text
            title = title.replace('\n', '').replace('\t', '')
        except AttributeError:
            title = None
        try:
            releaser = soup.find('span', {'class': 'user_name'}).text
        except AttributeError:
            releaser = None
        try:
            releaserUrl = soup.find('a', {'class': 'user_info'})['href']
        except TypeError:
            releaserUrl = None
        try:
            video_intro = soup.find('meta', {'itemprop': 'description'})['content']
        except TypeError:
            video_intro = None
        # the page embeds its metadata in a JS variable VIDEO_INFO inside a
        # <script r-notemplate="true"> tag
        midstep = soup.find("script", {"r-notemplate": "true"}).text
        try:
            # duration appears either bare ("duration":123) or quoted ("duration":"123")
            duration = re.findall(r'"duration":"?([0-9]{1,10})"?', ','.join(re.findall(r'VIDEO_INFO.*"duration":"?[0-9]{1,10}"?', midstep)))[0]
            duration = int(duration)
        except (IndexError, ValueError):
            print("Caught exception: didn't find duration in var VIDEO_INFO")
            duration = 0
        try:
            playcount = int(re.findall(r'"view_all_count":([0-9]{1,10})', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0])
        except (IndexError, ValueError):
            print("Caught exception: didn't find view_all_count in var VIDEO_INFO")
            playcount = 0
        try:
            retime = re.findall(r'"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', midstep)))[0].split('":"')[1].split(' ')[0]
            release_time = int(datetime.datetime.strptime(retime, '%Y-%m-%d').timestamp() * 1e3)
        except (IndexError, ValueError):
            release_time = 0
        fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
        try:
            target = soup.find('div', {'class': 'video_tags _video_tags'}).text
        except AttributeError:
            target = None
        D0 = {'title': title, 'target': target, 'play_count': playcount,
              'releaser': releaser, 'video_intro': video_intro,
              'release_time': release_time, 'duration': duration,
              'releaserUrl': releaserUrl, 'url': url, 'fetch_time': fetch_time}
        return D0
def search_page(self, keyword):
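        """Search v.qq.com for keyword, scrape the video page of every
        result on the first five result pages, and return a list of dicts."""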
        search_page_Lst = []

        def process_one_line(data_line):
            # each search result links to its video page via h2 > a
            url = data_line.h2.a['href']
            return self.video_page(url)

        # crawl only the first five result pages
        search_url = ['https://v.qq.com/x/search?q={}&cur={}'.format(keyword, i) for i in range(1, 6)]
        for urls in search_url:
            get_page = requests.get(urls)
            print(urls)
            get_page.encoding = 'utf-8'
            page = get_page.text
            soup = BeautifulSoup(page, 'html.parser')
            tencent = soup.find_all("div", {"class": "result_item result_item_h _quickopen"})
            for data_line in tencent:
                one_line_dic = process_one_line(data_line)
                print('get one line done')
                search_page_Lst.append(one_line_dic)
return search_page_Lst
def releaser_page(self, releaserurl):
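        """Scrape every video listed on a releaser's channel page (24 per
        API page) and return a list of dicts."""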
        releaser_data_Lst = []
        get_page = requests.get(releaserurl)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        totalvideonum = int(soup.find('span', {'class': 'count_num'}).text[1:-3])
        # 24 videos per API page; round the page count up
        totalvideopage = (totalvideonum + 23) // 24
        releaserID = soup.find('span', {'class': 'sns_btn'}).a['data-vuin']
        video_page_url = ['http://c.v.qq.com/vchannelinfo?otype=json&uin={}&qm=1&pagenum={}&num=24'.format(releaserID, i)
                          for i in range(1, totalvideopage + 1)]
        for urls in video_page_url:
            get_page = requests.get(urls)
            print(urls)
            get_page.encoding = 'utf-8'
            page = get_page.text
            # the response wraps a JSON array of video dicts; cut the array
            # out, split it into single objects and parse each with
            # json.loads instead of eval
            twenty_four_video = page.split('[')[1].split(']')[0].replace('},{', '},,,{').split(',,,')
            for a_video in twenty_four_video:
                url = json.loads(a_video)['url']
one_video_dic=self.video_page(url)
print('get one line done')
releaser_data_Lst.append(one_video_dic)
return releaser_data_Lst
def list_page(self, listurl):
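        """Scrape every video on a category list page (30 per page, first
        34 pages) and return a list of dicts."""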
        # example listurl: http://v.qq.com/x/list/fashion/
        list_data_Lst = []
        listnum = []
        # each list page holds 30 videos; build offsets for the first 34 pages
        for i in range(0, 34):
            listnum.append(i * 30)
        # sort: recently popular (default)
        listpage = [listurl + '?&offset={}'.format(i) for i in listnum]
        # sort: recently added
        #listpage = [listurl + '?sort=5&offset={}'.format(i) for i in listnum]
        for listurls in listpage:
            get_page = requests.get(listurls)
            get_page.encoding = 'utf-8'
            page = get_page.text
            print(listurls)
            soup = BeautifulSoup(page, 'html.parser')
            midstep = soup.find_all('li', {'class': 'list_item'})
            counter = 0
            for urls in midstep:
                url = urls.a['href']
                one_video_dic = self.video_page(url)
                counter += 1
                print(counter)
                list_data_Lst.append(one_video_dic)
return list_data_Lst
def doc_list_page(self, listurl):
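        """Scrape a list page album by album: visit each album, collect its
        episode links and scrape every episode. Progress goes to done_qq,
        records to result_qq and failures to error_qq."""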
        # example listurl: http://v.qq.com/x/list/fashion/
        # append-mode logs: crawled list URLs, scraped records, and failures
        done = open('done_qq', 'a')
        result = open('result_qq', 'a')
        error = open('error_qq', 'a')
        list_data_Lst = []
        listnum = []
        # each list page holds 30 videos; build offsets for the first 93 pages
        for i in range(0, 93):
            listnum.append(i * 30)
        # sort: recently popular (default)
        listpage = [listurl + '?&offset={}'.format(i) for i in listnum]
        # sort: recently added
        #listpage = [listurl + '?sort=5&offset={}'.format(i) for i in listnum]
        for list_url in listpage:
            get_page = requests.get(list_url)
            get_page.encoding = 'utf-8'
            page = get_page.text
            print(list_url)
            done.write(list_url + '\n')
            done.flush()
            soup = BeautifulSoup(page, 'html.parser')
            midstep = soup.find_all('strong', {'class': 'figure_title'})
            for line in midstep:
                album_name = line.text
                url = line.a['href']
                get_page = requests.get(url)
                get_page.encoding = 'utf-8'
                page = get_page.text
                soup = BeautifulSoup(page, 'html.parser')
                try:
                    # collect the episode links of the album
                    get_all_url = soup.find('ul', {'class': 'figure_list _hot_wrapper'})
                    url_agg = get_all_url.find_all('a', {'class': 'figure_detail'})
                    urllist = []
                    for detail in url_agg:
                        urllist.append('https://v.qq.com' + detail['href'])
                    for url in urllist:
                        try:
                            one_video = self.video_page(url)
                            one_video['album_name'] = album_name
                            print(url)
                            list_data_Lst.append(one_video)
                            result.write(json.dumps(one_video) + '\n')
                            result.flush()
                        except AttributeError:
                            D0 = {'url': url, 'album_name': album_name}
                            print('there is an error')
                            error.write(json.dumps(D0) + '\n')
                            error.flush()
                except Exception:
                    # album page without an episode list: treat the album URL
                    # itself as a single video page, and log a failure record
                    # instead of crashing if that also fails
                    try:
                        one_video = self.video_page(url)
                        one_video['album_name'] = album_name
                        print(url)
                        list_data_Lst.append(one_video)
                        result.write(json.dumps(one_video) + '\n')
                        result.flush()
                    except Exception:
                        D0 = {'url': url, 'album_name': album_name}
                        print('there is an error')
                        error.write(json.dumps(D0) + '\n')
                        error.flush()
done.close()
result.close()
error.close()
return list_data_Lst
def doc_list_reborn(self, listurl,x):
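        """Like doc_list_page, but start from list-page index x, e.g. to
        resume an interrupted run."""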
        # example listurl: http://v.qq.com/x/list/fashion/
        # append-mode logs: crawled list URLs, scraped records, and failures
        done = open('done_qq', 'a')
        result = open('result_qq', 'a')
        error = open('error_qq', 'a')
        list_data_Lst = []
        listnum = []
        # resume: build offsets from page index x up to page 93
        for i in range(x, 93):
            listnum.append(i * 30)
        # sort: recently popular (default)
        listpage = [listurl + '?&offset={}'.format(i) for i in listnum]
        # sort: recently added
        #listpage = [listurl + '?sort=5&offset={}'.format(i) for i in listnum]
        for list_url in listpage:
            get_page = requests.get(list_url)
            get_page.encoding = 'utf-8'
            page = get_page.text
            print(list_url)
            done.write(list_url + '\n')
            done.flush()
            soup = BeautifulSoup(page, 'html.parser')
            midstep = soup.find_all('strong', {'class': 'figure_title'})
            for line in midstep:
                album_name = line.text
                url = line.a['href']
                get_page = requests.get(url)
                get_page.encoding = 'utf-8'
                page = get_page.text
                soup = BeautifulSoup(page, 'html.parser')
                try:
                    # collect the episode links of the album
                    get_all_url = soup.find('ul', {'class': 'figure_list _hot_wrapper'})
                    url_agg = get_all_url.find_all('a', {'class': 'figure_detail'})
                    urllist = []
                    for detail in url_agg:
                        urllist.append('https://v.qq.com' + detail['href'])
                    for url in urllist:
                        try:
                            one_video = self.video_page(url)
                            one_video['album_name'] = album_name
                            print(url)
                            list_data_Lst.append(one_video)
                            result.write(json.dumps(one_video) + '\n')
                            result.flush()
                        except Exception:
                            D0 = {'url': url, 'album_name': album_name}
                            print('there is an error')
                            error.write(json.dumps(D0) + '\n')
                            error.flush()
                except Exception:
                    # album page without an episode list: fall back to
                    # treating the album URL as a single video page
                    try:
                        one_video = self.video_page(url)
                        one_video['album_name'] = album_name
                        print(url)
                        list_data_Lst.append(one_video)
                        result.write(json.dumps(one_video) + '\n')
                        result.flush()
                    except Exception:
                        D0 = {'url': url, 'album_name': album_name}
                        print('there is an error')
                        error.write(json.dumps(D0) + '\n')
                        error.flush()
done.close()
result.close()
error.close()
return list_data_Lst
# test
if __name__ == '__main__':
    v_qq_crawler = Crawler_v_qq()
    #video_data2 = v_qq_crawler.search_page(keyword='国家相册')
    # ad-hoc rerun of failed records: urllist must hold dicts with 'url' and
    # 'album_name' keys, e.g. loaded back from the error_qq log
    urllist = []
    result = open('result_qq', 'a')
    for line in urllist:
        try:
            album_name = line['album_name']
            url = line['url']
            one_video = v_qq_crawler.video_page(url)
            print('get one video')
            one_video['album_name'] = album_name
            result.write(json.dumps(one_video) + '\n')
            result.flush()
        except Exception:
            print('failed:', line)
    result.close()
# video_data3=v_qq_crawler.releaser_page(releaserurl='http://v.qq.com/vplus/xhpmtt/videos')
#x=int(2760/30)
#listurl='http://v.qq.com/x/list/doco'
# video_data4=v_qq_crawler.doc_list_reborn(listurl,x)
# list_page_url='https://v.qq.com/finance'
# list_page_data_Lst=v_qq_crawler.list_page(list_page_url)
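    # video_page usage sketch (the URL below is a hypothetical placeholder,
    # not a real video id):
    # one_video = v_qq_crawler.video_page('https://v.qq.com/x/page/XXXXXXXXXXX.html')
    # print(one_video['title'], one_video['play_count'])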