# -*- coding: utf-8 -*-
"""
Created on Sun Apr 8 14:53:43 2018
@author: fangyucheng
"""
import datetime
import copy
import re
import json
from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import retry_get_url
from crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.util_logging import logged
class Crawler_pear():
def __init__(self, timeout=None, platform='pearvideo'):
if timeout == None:
self.timeout = 10
else:
self.timeout = timeout
self.platform = platform
std_fields = Std_fields_video()
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
# remove fields that crawled data don't have
pop_key_Lst = ['channel', 'describe', 'repost_count', 'isOriginal',
'video_id']
for popk in pop_key_Lst:
self.video_data.pop(popk)
self.legal_list_page_urls = []
self.legal_channels = []
Cookie = ('JSESSIONID=E546B64FEB7F2009C3D2D64887F4FD67,'
'PEAR_UUID=7693819d-8037-460d-9a04-15217b3ee67f; '
'PEAR_DEVICE_FLAG=true;Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1522723194;'
'UM_distinctid=1600b8f08667c5-0c3c0cc44f7afd8-173a7640-1fa400-1600b8f08677c7; '
'PV_APP=srv-pv-prod-portal1; __ads_session=Ai82CnAsFAn2IyZjGwA=')
self.headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Cache-Control':'max-age=0, no-cache',
'Connection':'keep-alive',
# Cookie is necessary to get results
'Cookie':Cookie,
'Host': 'app.pearvideo.com',
'Pragma':'no-cache',
'Upgrade-Insecure-Requests':'1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
self.category_corresponding = {'社会':'1', '世界':'2', '财富':'3', '娱乐':'4', '生活':'5',
'美食':'6', '搞笑':'7', '科技':'8', '体育':'9', '知新':'10',
'二次元':'17', '汽车':'31', '音乐':'59'}
def video_page(self, url):
find_video_id = ' '.join(re.findall('video_\d+', url))
video_id = ' '.join(re.findall('\d+', find_video_id))
real_url = 'http://app.pearvideo.com/clt/jsp/v3/content.jsp?contId='+video_id
get_page = retry_get_url(real_url, headers=self.headers)
get_page.encoding = 'utf-8'
page = get_page.text
page = page.replace('\n', '')
page = page.replace('\r', '')
page_dic = json.loads(page)
if page_dic['resultMsg'] == 'success':
self.video_data['title'] = page_dic['content']['name']
self.video_data['video_id'] = video_id
self.video_data['url'] = url
self.video_data['releaser'] = page_dic['content']['authors'][0]['nickname']
releaser_id = page_dic['content']['authors'][0]['userId']
self.video_data['releaser_id'] = releaser_id
self.video_data['releaserUrl'] = 'http://www.pearvideo.com/author_'+releaser_id
self.video_data['comment_count'] = page_dic['content']['commentTimes']
self.video_data['favorite_count'] = page_dic['content']['praiseTimes']
rt_time = page_dic['content']['pubTime']
self.video_data['release_time'] = int(datetime.datetime.strptime(rt_time,
'%Y-%m-%d %H:%M').timestamp()*1e3)
dura = page_dic['content']['duration']
dura = dura.replace('"', '')
dura_lst = dura.split("'")
self.video_data['duration'] = (int(dura_lst[0]))*60+int(dura_lst[1])
self.video_data['fetch_time'] = int(datetime.datetime.timestamp(datetime.datetime.now())*1e3)
video_info = copy.deepcopy(self.video_data)
else:
print('error'+url)
return video_info
def video_page_by_video_id(self, video_id):
real_url = 'http://app.pearvideo.com/clt/jsp/v3/content.jsp?contId='+video_id
get_page = retry_get_url(real_url, headers=self.headers)
get_page.encoding = 'utf-8'
page = get_page.text
page = page.replace('\n', '')
page = page.replace('\r', '')
page_dic = json.loads(page)
if page_dic['resultMsg'] == 'success':
self.video_data['title'] = page_dic['content']['name']
self.video_data['video_id'] = video_id
url = 'http://www.pearvideo.com/video_'+video_id
self.video_data['url'] = url
self.video_data['releaser'] = page_dic['content']['authors'][0]['nickname']
releaser_id = page_dic['content']['authors'][0]['userId']
self.video_data['releaser_id'] = releaser_id
self.video_data['releaserUrl'] = 'http://www.pearvideo.com/author_'+releaser_id
self.video_data['comment_count'] = page_dic['content']['commentTimes']
self.video_data['favorite_count'] = page_dic['content']['praiseTimes']
rt_time = page_dic['content']['pubTime']
self.video_data['release_time'] = int(datetime.datetime.strptime(rt_time,
'%Y-%m-%d %H:%M').timestamp()*1e3)
dura = page_dic['content']['duration']
dura = dura.replace('"', '')
dura_lst = dura.split("'")
self.video_data['duration'] = (int(dura_lst[0]))*60+int(dura_lst[1])
self.video_data['fetch_time'] = int(datetime.datetime.timestamp(datetime.datetime.now())*1e3)
video_info = copy.deepcopy(self.video_data)
else:
print('error'+url)
return video_info
@logged
def releaser_page(self, releaserUrl,
error_file=None,
output_to_file=False, filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
releaser_page_num_max=30):
result_Lst = []
find_releaser_id = ' '.join(re.findall('author_\d+', releaserUrl))
releaser_id = ' '.join(re.findall('\d+', find_releaser_id))
real_releaserUrl = ('http://app.pearvideo.com/clt/jsp/v3/userHome.jsp?userId='
+releaser_id+'&reqType=1&start=0')
count = 0
while real_releaserUrl != '' and count < releaser_page_num_max*10:
count += 10
get_page = retry_get_url(real_releaserUrl, headers=self.headers)
get_page.encoding='utf-8'
page = get_page.text
print('get page done')
page = page.replace('\n', '')
page = page.replace('\r', '')
page_dic = json.loads(page)
video_lst = page_dic['dataList']
real_releaserUrl = page_dic['nextUrl']
print('get next url:'+real_releaserUrl)
for line in video_lst:
try:
video_id = line['contInfo']['contId']
self.video_data['title'] = line['contInfo']['name']
self.video_data['video_id'] = video_id
self.video_data['url'] = 'http://www.pearvideo.com/video_'+video_id
self.video_data['releaser_id'] = releaser_id
self.video_data['releaserUrl'] = releaserUrl
self.video_data['favorite_count'] = line['contInfo']['praiseTimes']
rt_time = line['pubTime']
self.video_data['release_time'] = int(datetime.datetime.strptime(rt_time, '%Y-%m-%d %H:%M').timestamp()*1e3)
dura = line['contInfo']['duration']
dura = dura.replace('"', '')
dura_lst = dura.split("'")
self.video_data['duration'] = (int(dura_lst[0]))*60+int(dura_lst[1])
self.video_data['fetch_time'] = int(datetime.datetime.timestamp(datetime.datetime.now())*1e3)
video_info = copy.deepcopy(self.video_data)
print('get one video_info')
result_Lst.append(video_info)
if len(result_Lst) >= 100:
output_result(result_Lst, self.platform,
output_to_file=output_to_file,
filepath=filepath,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
)
result_Lst.clear()
except:
pass
if len(result_Lst) != []:
output_result(result_Lst, self.platform,
output_to_file=output_to_file,
filepath=filepath,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
)
result_Lst.clear()
def list_page(self, list_name,
error_file=None,
output_to_file=False,
filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
total_wanted_video_num=100):
category_id = self.category_corresponding[list_name]
result_Lst = []
count = 0
list_url = 'http://app.pearvideo.com/clt/jsp/v3/getCategoryConts.jsp?categoryId='+category_id+'&start=0'
while list_url != '' and count < total_wanted_video_num:
get_page = retry_get_url(list_url, headers=self.headers)
get_page.encoding = 'utf-8'
page = get_page.text
print('get page done')
page = page.replace('\n', '')
page = page.replace('\r', '')
page_dic = json.loads(page)
list_url = page_dic['nextUrl']
video_lst_hot = page_dic['rankList']
video_lst_gen = page_dic['contList']
if video_lst_hot != []:
for line in video_lst_hot:
try:
contid = line['contId']
line_dic = self.video_page_by_video_id(contid)
print('get one line done')
result_Lst.append(line_dic)
except:
pass
for line in video_lst_gen:
try:
contid = line['contId']
line_dic = self.video_page_by_video_id(contid)
print('get one line done')
result_Lst.append(line_dic)
except:
pass
count += len(video_lst_gen)+len(video_lst_hot)
print(count)
if len(result_Lst) >= 100:
output_result(result_Lst, self.platform,
output_to_file=output_to_file,
filepath=filepath,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
)
result_Lst.clear()
if len(result_Lst) != []:
output_result(result_Lst, self.platform,
output_to_file=output_to_file,
filepath=filepath,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
push_to_redis=push_to_redis,
)
result_Lst.clear()
def search_page(self, keyword=None, search_pages_max=30,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None):
list_page = []
count = 0
pages = 0
while pages < search_pages_max:
pages += 1
try:
url = ('http://app.pearvideo.com/clt/jsp/v3/search.jsp?k='
+ keyword
+ '&start='
+ str(count))
get_page = retry_get_url(url, headers=self.headers)
get_page.encoding = 'utf-8'
page = get_page.text
print('get page done')
page = page.replace('\n', '')
page = page.replace('\r', '')
page_dic = json.loads(page)
midstep = page_dic['searchList']
count += 10
for line in midstep:
try:
contid = line['contId']
line_dic = self.video_page_by_video_id(contid)
print('get one line done')
list_page.append(line_dic)
if len(list_page) >= 100:
output_result(result_Lst=list_page,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
list_page.clear()
except:
pass
except:
pass
if list_page != []:
output_result(result_Lst=list_page,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
return list_page
if __name__=='__main__':
pear = Crawler_pear()
# #videopage = pear.video_page(url='http://www.pearvideo.com/video_1400210')
# # releaserUrl = 'http://www.pearvideo.com/author_11406235'
# # pear.releaser_page(releaserUrl,
# # error_file=None,
# # output_to_file=False, filepath=None,
# # output_to_es_raw=False,
# # output_to_es_register=False,
# # push_to_redis=False,
# # releaser_page_num_max=30)
# #releaser2=pear.releaser_page(userid='10006693')
# listpage = pear.list_page(list_name='社会',
# error_file=None,
# output_to_file=False,
# filepath=None,
# output_to_es_raw=True,
# output_to_es_register='test2',
# push_to_redis='test_pear',
# total_wanted_video_num=30)
sr_pearv = pear.search_page(keyword='任正非 BBC', search_pages_max=4)