# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 15:21:47 2018
@author: fangyucheng
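
Crawls 东方新闻 (East News) videos on v.qq.com: walks the search result pages,
follows each matching result and collects per-video metadata (title, releaser,
play count, duration, release time).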
"""
import requests
from bs4 import BeautifulSoup
import re
import datetime
import pickle
import pandas as pd
class Crawler_v_qq_eastnews:

    def video_page(self, url):
        """Fetch a single v.qq.com video page and return a dict of its metadata."""
        get_page = requests.get(url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        # Each field is parsed defensively: a missing element yields None instead of
        # aborting the whole page.
        try:
            title = soup.find('h1', {'class': 'video_title _video_title'}).text
            title = title.replace('\n', '').replace('\t', '')
        except AttributeError:
            title = None
        try:
            releaser = soup.find('span', {'class': 'user_name'}).text
        except AttributeError:
            releaser = None
        try:
            releaserUrl = soup.find('a', {'class': 'user_info'})['href']
        except TypeError:
            releaserUrl = None
        try:
            video_intro = soup.find('meta', {'itemprop': 'description'})['content']
        except TypeError:
            video_intro = None
        # duration, play count and release time live in the inline VIDEO_INFO script
        # block; if that block cannot be located at all, all three fall back to None.
        try:
            midstep = soup.find("script", {"r-notemplate": "true"}).text
            try:
                # duration is usually an unquoted number in VIDEO_INFO ...
                duration = re.findall(r'"duration":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"duration":[0-9]{1,10}', midstep)))[0].split(':')[1]
            except IndexError:
                # ... but is sometimes a quoted string
                duration = re.findall(r'"duration":"[0-9]{1,10}"', ','.join(re.findall(r'VIDEO_INFO.*"duration":"[0-9]{1,10}"', midstep)))[0].split(':')[1]
                duration = duration.replace('"', '')
            except Exception:
                print("Caught exception, didn't find duration in var VIDEO_INFO")
                duration = 0
            duration = int(duration)
            try:
                playcount = int(re.findall(r'"view_all_count":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0].split(':')[1])
            except Exception:
                print("Caught exception, didn't find view_all_count in var VIDEO_INFO")
                playcount = 0
            retime = re.findall(r'"video_checkup_time":"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}"', midstep)))[0].split('":"')[1].split(' ')[0]
            try:
                release_time = int(datetime.datetime.strptime(retime, '%Y-%m-%d').timestamp() * 1e3)
            except ValueError:
                release_time = 0
        except Exception:
            duration = None
            playcount = None
            release_time = None
        D0 = {'title': title,
              'playcount': playcount,
              'releaser': releaser,
              'video_intro': video_intro,
              'release_time': release_time,
              'duration': duration,
              'releaserUrl': releaserUrl}
        return D0
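    # Example (the URL below is hypothetical): video_page returns one metadata dict, e.g.
    #   Crawler_v_qq_eastnews().video_page('https://v.qq.com/x/cover/xxxx/yyyy.html')
    #   -> {'title': ..., 'playcount': ..., 'releaser': ..., 'video_intro': ...,
    #       'release_time': ..., 'duration': ..., 'releaserUrl': ...}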
    def search_page(self, totalpage):
        """Walk the search result pages for 东方新闻 and crawl every episode found."""
        video_Lst = []
        url_Lst = []
        # Search result URLs for the query 东方新闻 (URL-encoded in the query string);
        # "cur" is the result page index, so this covers pages 1 .. totalpage-1.
        page_Lst = ['https://v.qq.com/x/search/?ses=qid%3D_5hveCy5oWKS_b5d4GuLquXTO29F8LJnLcmNDpNkXFkeEr8UDB0g9g%26last_query%3D%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB%26tabid_list%3D0%7C11%7C8%7C7%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E6%96%B0%E9%97%BB%7C%E5%8E%9F%E5%88%9B%7C%E5%85%B6%E4%BB%96&q=%E4%B8%9C%E6%96%B9%E6%96%B0%E9%97%BB&stag=3&cur={}&cxt=tabid%3D0%26sort%3D1%26pubfilter%3D0%26duration%3D3'.format(i) for i in range(1, totalpage)]
        for page_url in page_Lst:
            get_page = requests.get(page_url)
            print(page_url)
            get_page.encoding = 'utf-8'
            page = get_page.text
            soup = BeautifulSoup(page, 'html.parser')
            tencent = soup.find_all("div", {"class": "result_item result_item_h _quickopen"})
            for data_line in tencent:
                # Keep only result items released by the 东方新闻 account.
                try:
                    ttt = data_line.find('span', {'title': '东方新闻'}).text
                except AttributeError:
                    ttt = None
                if ttt is None:
                    continue
                # Follow the result link and collect every episode URL listed on it.
                urls = data_line.h2.a['href']
                get_page = requests.get(urls)
                print(urls)
                get_page.encoding = 'utf-8'
                page = get_page.text
                soup = BeautifulSoup(page, 'html.parser')
                fff = soup.find_all('a', {'class': 'figure_detail'})
                for zzz in fff:
                    urls1 = zzz['href']
                    urls2 = 'https://v.qq.com' + urls1
                    url_Lst.append(urls2)
        # Crawl every collected episode page.
        for url in url_Lst:
            dicdic = self.video_page(url)
            dicdic['url'] = url
            print(url)
            video_Lst.append(dicdic)
        return video_Lst

if __name__ == '__main__':
    v_qq_crawler = Crawler_v_qq_eastnews()
    search_page2 = v_qq_crawler.search_page(totalpage=20)
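    # Persisting the results is not part of the original script; a minimal sketch using
    # the already-imported pandas and pickle (the output file names are assumptions):
    pd.DataFrame(search_page2).to_csv('v_qq_eastnews.csv', index=False, encoding='utf-8-sig')
    with open('v_qq_eastnews.pkl', 'wb') as f:
        pickle.dump(search_page2, f)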