# crawler_pear.py (15.3 KB)
# -*- coding: utf-8 -*-
"""
Created on Sun Apr  8 14:53:43 2018

@author: fangyucheng
"""

import datetime
import copy
import re
import json
from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import retry_get_url
from crawler_sys.utils.output_results import output_result
from crawler.crawler_sys.utils.util_logging import logged


class Crawler_pear():
    """Crawler for pearvideo.com that reads the JSON endpoints under app.pearvideo.com.

    Every crawled video is returned as a dict that starts as a deep copy of
    the field template ``self.video_data`` and is then filled with the values
    found in the JSON answer. The template itself is never modified after
    ``__init__``, so values of one video cannot leak into the next one.
    """

    # Base address of the JSON endpoints (matches the 'Host' request header).
    _API_ROOT = 'http://app.pearvideo.com/clt/jsp/v3/'
    # Base address used to build the public video / author page urls.
    _SITE_ROOT = 'http://www.pearvideo.com/'
    # Layout of the 'pubTime' values delivered by the endpoints.
    _PUB_TIME_FORMAT = '%Y-%m-%d %H:%M'
    # Number of collected videos after which a batch is handed to output_result.
    _BATCH_SIZE = 100

    def __init__(self, timeout=None, platform='pearvideo'):
        """Prepare the field template, request headers and category table.

        Parameters
        ----------
        timeout : number, optional
            Stored as ``self.timeout`` (10 when omitted).
            NOTE(review): no request in this class passes it on to
            ``retry_get_url`` -- confirm whether it should.
        platform : str
            Platform name written into every record and passed to
            ``output_result``.
        """
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        # (a default is given so an already missing field is not an error)
        pop_key_Lst = ['channel', 'describe', 'repost_count', 'isOriginal',
                       'video_id']
        for popk in pop_key_Lst:
            self.video_data.pop(popk, None)
        self.legal_list_page_urls = []
        self.legal_channels = []
        # NOTE(review): the first pair is followed by ',' while all other
        # pairs are separated by ';' -- confirm this is intended.
        Cookie = ('JSESSIONID=E546B64FEB7F2009C3D2D64887F4FD67,'
                  'PEAR_UUID=7693819d-8037-460d-9a04-15217b3ee67f; '
                  'PEAR_DEVICE_FLAG=true;Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1522723194;'
                  'UM_distinctid=1600b8f08667c5-0c3c0cc44f7afd8-173a7640-1fa400-1600b8f08677c7; '
                  'PV_APP=srv-pv-prod-portal1; __ads_session=Ai82CnAsFAn2IyZjGwA=')
        self.headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding':'gzip, deflate',
                        'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                        'Cache-Control':'max-age=0, no-cache',
                        'Connection':'keep-alive',
                        # Cookie is necessary to get results
                        'Cookie':Cookie,
                        'Host': 'app.pearvideo.com',
                        'Pragma':'no-cache',
                        'Upgrade-Insecure-Requests':'1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
        # category name as accepted by list_page -> categoryId of the endpoint
        self.category_corresponding = {'社会':'1', '世界':'2', '财富':'3', '娱乐':'4', '生活':'5',
                                       '美食':'6', '搞笑':'7', '科技':'8', '体育':'9', '知新':'10',
                                       '二次元':'17', '汽车':'31', '音乐':'59'}

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _fetch_json(self, url, announce=False):
        """Request ``url`` with the crawler headers and decode the JSON body.

        When ``announce`` is true, 'get page done' is printed once the body
        text has been received (before it is decoded).
        """
        response = retry_get_url(url, headers=self.headers)
        response.encoding = 'utf-8'
        text = response.text
        if announce:
            print('get page done')
        # raw line breaks inside string values are rejected by json.loads,
        # so they are removed before decoding
        text = text.replace('\n', '').replace('\r', '')
        return json.loads(text)

    @staticmethod
    def _extract_id(prefix, url):
        """Return the digits that directly follow ``prefix`` in ``url``.

        Only the first occurrence is used; '' is returned when there is none.
        """
        found = re.search(re.escape(prefix) + r'(\d+)', url)
        return found.group(1) if found else ''

    @staticmethod
    def _parse_duration(dura):
        """Convert a duration written as minutes'seconds" into seconds."""
        minutes, seconds = dura.replace('"', '').split("'")[:2]
        return int(minutes)*60 + int(seconds)

    @classmethod
    def _to_epoch_ms(cls, time_str):
        """Convert a 'pubTime' string into a millisecond timestamp.

        The string carries no time zone, so ``datetime.timestamp`` reads it
        as local time.
        """
        parsed = datetime.datetime.strptime(time_str, cls._PUB_TIME_FORMAT)
        return int(parsed.timestamp()*1e3)

    @staticmethod
    def _now_ms():
        """Return the current time as a millisecond timestamp."""
        return int(datetime.datetime.now().timestamp()*1e3)

    def _video_info_from_content(self, content, video_id, url):
        """Build one record from the 'content' part of a content.jsp answer."""
        author = content['authors'][0]
        releaser_id = author['userId']
        video_info = copy.deepcopy(self.video_data)
        video_info['title'] = content['name']
        video_info['video_id'] = video_id
        video_info['url'] = url
        video_info['releaser'] = author['nickname']
        video_info['releaser_id'] = releaser_id
        video_info['releaserUrl'] = self._SITE_ROOT + 'author_' + releaser_id
        video_info['comment_count'] = content['commentTimes']
        video_info['favorite_count'] = content['praiseTimes']
        video_info['release_time'] = self._to_epoch_ms(content['pubTime'])
        video_info['duration'] = self._parse_duration(content['duration'])
        video_info['fetch_time'] = self._now_ms()
        return video_info

    def _video_info_from_list_entry(self, line, releaser_id, releaserUrl):
        """Build one record from an entry of a userHome.jsp 'dataList'."""
        cont_info = line['contInfo']
        video_id = cont_info['contId']
        video_info = copy.deepcopy(self.video_data)
        video_info['title'] = cont_info['name']
        video_info['video_id'] = video_id
        video_info['url'] = self._SITE_ROOT + 'video_' + video_id
        video_info['releaser_id'] = releaser_id
        video_info['releaserUrl'] = releaserUrl
        video_info['favorite_count'] = cont_info['praiseTimes']
        video_info['release_time'] = self._to_epoch_ms(line['pubTime'])
        video_info['duration'] = self._parse_duration(cont_info['duration'])
        video_info['fetch_time'] = self._now_ms()
        return video_info

    def _fetch_video_info(self, video_id, url):
        """Fetch content.jsp for ``video_id``; return the record or None.

        None is returned (after printing 'error' + url) when the answer does
        not report ``resultMsg == 'success'``.
        """
        page_dic = self._fetch_json(self._API_ROOT + 'content.jsp?contId=' + video_id)
        if page_dic.get('resultMsg') != 'success':
            print('error'+url)
            return None
        return self._video_info_from_content(page_dic['content'], video_id, url)

    def _flush_results(self, result_Lst, output_options):
        """Hand ``result_Lst`` to output_result and empty the list afterwards."""
        output_result(result_Lst, self.platform, **output_options)
        result_Lst.clear()

    # ------------------------------------------------------------------
    # public crawling methods
    # ------------------------------------------------------------------

    def video_page(self, url):
        """Crawl a single video given its page url (``.../video_<digits>``).

        Returns the record dict, or None when the url holds no video id or
        the endpoint does not answer with success (an 'error' line is printed
        in both cases). Network and JSON errors are not caught here.
        """
        video_id = self._extract_id('video_', url)
        if not video_id:
            print('error'+url)
            return None
        return self._fetch_video_info(video_id, url)

    def video_page_by_video_id(self, video_id):
        """Crawl a single video given its id (a string of digits).

        Returns the record dict, or None when the endpoint does not answer
        with success. Network and JSON errors are not caught here.
        """
        url = self._SITE_ROOT + 'video_' + video_id
        return self._fetch_video_info(video_id, url)

    @logged
    def releaser_page(self, releaserUrl,
                      error_file=None,
                      output_to_file=False, filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=30):
        """Crawl the videos listed on a releaser's home page.

        ``releaserUrl`` must contain ``author_<digits>``. Pages are followed
        through the 'nextUrl' of each answer until it is empty or
        ``releaser_page_num_max`` pages were read. Records are passed to
        ``output_result`` in batches; the ``output_*``, ``filepath`` and
        ``push_to_redis`` arguments are forwarded to it unchanged.
        ``error_file`` is accepted but not used. Nothing is returned.

        A list entry that cannot be processed is skipped with a printed
        message; a failing page request is not caught.
        """
        output_options = {'output_to_file': output_to_file,
                          'filepath': filepath,
                          'output_to_es_raw': output_to_es_raw,
                          'output_to_es_register': output_to_es_register,
                          'push_to_redis': push_to_redis}
        releaser_id = self._extract_id('author_', releaserUrl)
        if not releaser_id:
            print('error'+releaserUrl)
            return

        result_Lst = []
        real_releaserUrl = (self._API_ROOT + 'userHome.jsp?userId='
                            + releaser_id + '&reqType=1&start=0')
        count = 0
        # count advances by 10 per page, hence the factor in the limit
        while real_releaserUrl != '' and count < releaser_page_num_max*10:
            count += 10
            page_dic = self._fetch_json(real_releaserUrl, announce=True)
            video_lst = page_dic.get('dataList') or []
            # a missing or null 'nextUrl' ends the paging
            real_releaserUrl = page_dic.get('nextUrl') or ''
            print('get next url:'+real_releaserUrl)
            for line in video_lst:
                try:
                    video_info = self._video_info_from_list_entry(
                        line, releaser_id, releaserUrl)
                    print('get one video_info')
                    result_Lst.append(video_info)
                    if len(result_Lst) >= self._BATCH_SIZE:
                        self._flush_results(result_Lst, output_options)
                except Exception as err:
                    # best effort: one broken entry must not stop the crawl
                    print('skip one video: %r' % (err,))

        if result_Lst:
            self._flush_results(result_Lst, output_options)

    def list_page(self, list_name,
                  error_file=None,
                  output_to_file=False,
                  filepath=None,
                  output_to_es_raw=False,
                  output_to_es_register=False,
                  push_to_redis=False,
                  total_wanted_video_num=100):
        """Crawl the videos of one category list.

        ``list_name`` must be a key of ``self.category_corresponding``
        (KeyError otherwise). Pages are followed through 'nextUrl' until it
        is empty, a page lists no videos, or at least
        ``total_wanted_video_num`` entries were listed. Every listed video
        ('rankList' first, then 'contList') is fetched with
        ``video_page_by_video_id``; videos that fail are skipped. Records are
        passed to ``output_result`` in batches with the ``output_*``,
        ``filepath`` and ``push_to_redis`` arguments. ``error_file`` is
        accepted but not used. Nothing is returned.
        """
        category_id = self.category_corresponding[list_name]
        output_options = {'output_to_file': output_to_file,
                          'filepath': filepath,
                          'output_to_es_raw': output_to_es_raw,
                          'output_to_es_register': output_to_es_register,
                          'push_to_redis': push_to_redis}
        result_Lst = []
        count = 0
        list_url = (self._API_ROOT + 'getCategoryConts.jsp?categoryId='
                    + category_id + '&start=0')
        while list_url != '' and count < total_wanted_video_num:
            page_dic = self._fetch_json(list_url, announce=True)
            list_url = page_dic.get('nextUrl') or ''
            video_lst_hot = page_dic.get('rankList') or []
            video_lst_gen = page_dic.get('contList') or []
            for line in video_lst_hot + video_lst_gen:
                try:
                    line_dic = self.video_page_by_video_id(line['contId'])
                except Exception as err:
                    # best effort: one broken video must not stop the crawl
                    print('skip one video: %r' % (err,))
                    continue
                if line_dic is None:
                    continue
                print('get one line done')
                result_Lst.append(line_dic)

            # count listed entries, whether or not they could be fetched
            listed_num = len(video_lst_gen)+len(video_lst_hot)
            count += listed_num
            print(count)

            if len(result_Lst) >= self._BATCH_SIZE:
                self._flush_results(result_Lst, output_options)

            if listed_num == 0:
                # count can no longer advance; stop instead of looping forever
                break

        if result_Lst:
            self._flush_results(result_Lst, output_options)

    def search_page(self, keyword=None, search_pages_max=30,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    es_index=None,
                    doc_type=None):
        """Crawl the videos found by a keyword search.

        At most ``search_pages_max`` requests are made. A request that fails
        is reported and the same start offset is requested again on the next
        round; an empty 'searchList' ends the search. Every hit is fetched
        with ``video_page_by_video_id``; hits that fail are skipped. Records
        are passed to ``output_result`` in batches together with
        ``output_to_es_raw``, ``output_to_es_register``, ``es_index`` and
        ``doc_type``.

        Returns the records of the last batch only (those collected since
        the previous full batch was put out); [] when ``keyword`` is None.
        """
        result_Lst = []
        if keyword is None:
            return result_Lst
        count = 0
        pages = 0
        while pages < search_pages_max:
            pages += 1
            try:
                url = (self._API_ROOT + 'search.jsp?k='
                       + keyword
                       + '&start='
                       + str(count))
                page_dic = self._fetch_json(url, announce=True)
                midstep = page_dic['searchList']
                # the offset only advances once a page was read successfully
                count += 10
            except Exception as err:
                print('skip one search page: %r' % (err,))
                continue
            if not midstep:
                # no more hits
                break
            for line in midstep:
                try:
                    line_dic = self.video_page_by_video_id(line['contId'])
                    if line_dic is None:
                        continue
                    print('get one line done')
                    result_Lst.append(line_dic)
                    if len(result_Lst) >= self._BATCH_SIZE:
                        output_result(result_Lst=result_Lst,
                                      platform=self.platform,
                                      output_to_es_raw=output_to_es_raw,
                                      output_to_es_register=output_to_es_register,
                                      es_index=es_index,
                                      doc_type=doc_type)
                        result_Lst.clear()
                except Exception as err:
                    # best effort: one broken hit must not stop the crawl
                    print('skip one video: %r' % (err,))
        if result_Lst:
            output_result(result_Lst=result_Lst,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                          doc_type=doc_type)

        return result_Lst

if __name__ == '__main__':
    # Manual run: search for one keyword and read at most 4 result pages.
    crawler = Crawler_pear()
    search_arguments = {'keyword': '任正非 BBC', 'search_pages_max': 4}
    sr_pearv = crawler.search_page(**search_arguments)
    # Other entry points that can be tried by hand in the same way:
    #   crawler.video_page(url='http://www.pearvideo.com/video_1400210')
    #   crawler.releaser_page('http://www.pearvideo.com/author_11406235',
    #                         releaser_page_num_max=30)
    #   crawler.list_page(list_name='社会',
    #                     output_to_es_raw=True,
    #                     output_to_es_register='test2',
    #                     push_to_redis='test_pear',
    #                     total_wanted_video_num=30)