# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 12:16:46 2018
@author: fangyucheng
"""
import re
import datetime
import pickle

import requests
import pandas as pd
from bs4 import BeautifulSoup


class Crawler_v_qq_seeeast:
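    """Crawler for v.qq.com episodes of the programme 看东方 (see east)."""
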
    def video_page(self, url):
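        """Fetch a single v.qq.com video page and parse its metadata into a dict."""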
        get_page = requests.get(url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        try:
            title = soup.find('h1', {'class': 'video_title _video_title'}).text
            title = title.replace('\n', '').replace('\t', '')
        except AttributeError:
            title = None
        try:
            releaser = soup.find('span', {'class': 'user_name'}).text
        except AttributeError:
            releaser = None
        try:
            releaserUrl = soup.find('a', {'class': 'user_info'})['href']
        except TypeError:
            releaserUrl = None
        try:
            video_intro = soup.find('meta', {'itemprop': 'description'})['content']
        except TypeError:
            video_intro = None
        # duration, play count and release time live in the inline VIDEO_INFO script
        try:
            midstep = soup.find("script", {"r-notemplate": "true"}).text
            try:
                try:
                    duration = re.findall(r'"duration":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"duration":[0-9]{1,10}', midstep)))[0].split(':')[1]
                except IndexError:
                    duration = re.findall(r'"duration":"[0-9]{1,10}"', ','.join(re.findall(r'VIDEO_INFO.*"duration":"[0-9]{1,10}"', midstep)))[0].split(':')[1]
                duration = duration.replace('"', '')
                duration = int(duration)
            except (IndexError, ValueError):
                print("Caught exception, didn't find duration in var VIDEO_INFO")
                duration = 0
            try:
                playcount = re.findall(r'"view_all_count":[0-9]{1,10}', ','.join(re.findall(r'VIDEO_INFO.*"view_all_count":[0-9]{1,10}', midstep)))[0].split(':')[1]
            except IndexError:
                print("Caught exception, didn't find view_all_count in var VIDEO_INFO")
                playcount = 0
            retime = re.findall(r'"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', ','.join(re.findall(r'VIDEO_INFO.*"video_checkup_time":"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d"', midstep)))[0].split('":"')[1].split(' ')[0]
            release_time = int(datetime.datetime.strptime(retime, '%Y-%m-%d').timestamp() * 1e3)
        except (AttributeError, IndexError):
            duration = 0
            playcount = 0
            release_time = 0
        D0 = {'title': title,
              'playcount': playcount,
              'releaser': releaser,
              'video_intro': video_intro,
              'release_time': release_time,
              'duration': duration,
              'releaserUrl': releaserUrl}
        return D0

    def search_page(self, totalpage):
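        """Walk the search-result pages and return a list of per-video metadata dicts."""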
        video_Lst = []
        url_Lst = []
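        # One search-result URL per page: q= carries the URL-encoded keyword 看东方 and
        # cur= is the page number; range(1, totalpage) covers pages 1 .. totalpage-1.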
        page_Lst = ['https://v.qq.com/x/search/?ses=qid%3D_tWkHF0wGSaxTrPo1x9xd0NnsAZaHm3ts4uGEB7pLLeyHMZ1YQ-myg%26last_query%3D%E7%9C%8B%E4%B8%9C%E6%96%B9%26tabid_list%3D0%7C3%7C11%7C12%7C8%26tabname_list%3D%E5%85%A8%E9%83%A8%7C%E7%BB%BC%E8%89%BA%7C%E6%96%B0%E9%97%BB%7C%E5%A8%B1%E4%B9%90%7C%E5%8E%9F%E5%88%9B&q=%E7%9C%8B%E4%B8%9C%E6%96%B9&stag=3&cur={}&cxt=tabid%3D0%26sort%3D1%26pubfilter%3D0%26duration%3D3'.format(str(i)) for i in range(1, totalpage)]
        for page_url in page_Lst:
            get_page = requests.get(page_url)
            print(page_url)
            get_page.encoding = 'utf-8'
            page = get_page.text
            soup = BeautifulSoup(page, 'html.parser')
            tencent = soup.find_all("div", {"class": "result_item result_item_h _quickopen"})
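            # Keep only results whose release month is '02' (February) and whose
            # title span matches 看东方, then collect every episode link on that page.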
            for data_line in tencent:
                stepone = data_line.find('div', {'class': 'result_info'}).div.div
                steptwo = stepone.find('span', {'class': 'content'}).text
                stepthree = steptwo.split('-')[1]
                if stepthree == '02':
                    try:
                        ttt = data_line.find('span', {'title': '看东方'}).text
                    except AttributeError:
                        ttt = None
                    if ttt is None:
                        urls = None
                    else:
                        urls = data_line.h2.a['href']
                        get_page = requests.get(urls)
                        print(urls)
                        get_page.encoding = 'utf-8'
                        page = get_page.text
                        soup = BeautifulSoup(page, 'html.parser')
                        fff = soup.find_all('a', {'class': 'figure_detail'})
                        for zzz in fff:
                            urls1 = zzz['href']
                            urls2 = 'https://v.qq.com' + urls1
                            url_Lst.append(urls2)
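        # Visit every collected episode page and assemble the metadata records.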
        for url in url_Lst:
            dicdic = self.video_page(url)
            dicdic['url'] = url
            print(url)
            video_Lst.append(dicdic)
        return video_Lst


if __name__ == '__main__':
    v_qq_crawler = Crawler_v_qq_seeeast()
    #video_data=v_qq_crawler.search_page(keyword='国家相册')
    search_page = v_qq_crawler.search_page(totalpage=5)
    #eastnewFeb=pd.DataFrame(eastnewsFeb)
    #eastnewFeb.to_csv('东方新闻-二月',encoding='gb18030',index=False)
    #eastnewFeb['release_time_H']=eastnewFeb['release_time']-4*3600*1e3
    #eastnewFeb['real_time']=pd.to_datetime(eastnewFeb['release_time_H'],unit='ms')
    #counter=1
    #for url in url_Lst:
    #    dicdic = self.video_page(url)
    #    dicdic['url']=url
    #    video_Lst.append(dicdic)
    #    counter+=1
    #    print(counter)
    #    if counter%1000==0:
    #        with open('seeeast_data_Lst_pickle', 'ab') as pickleF:
    #            pickle.dump(video_Lst, pickleF)
    #        video_Lst.clear()
    #if len(video_Lst)>0:
    #    with open('seeeast_data_Lst_pickle', 'ab') as pickleF:
    #        pickle.dump(video_Lst, pickleF)
    seeeast = pd.DataFrame(search_page)
    # subtract 4 hours (in milliseconds) from release_time before converting to datetime
    seeeast['re'] = seeeast['release_time'] - 4 * 3600 * 1e3
    seeeast['real_time'] = pd.to_datetime(seeeast['re'], unit='ms')
    seeeast.to_csv('东方新闻xin-二月', encoding='gb18030', index=False)