# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 14:05:18 2018

@author: fangyucheng

Crawler for the Tencent News (腾讯新闻) mobile API at ``r.inews.qq.com``.

The request parameters below (device ids, ``global_info`` blobs, ad request
payloads) are opaque values that are sent to the API verbatim; they must not
be edited or re-wrapped.
"""

import hashlib
import uuid
import copy
import urllib
import urllib.parse
import time
import datetime
import requests
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.site_crawler import crawler_v_qq
from crawler.crawler_sys.utils.output_results import output_result
import re
import json

try:
    from crawler_sys.framework.func_get_releaser_id import *
except ImportError:
    from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy


# Errors that mean "the HTTP call failed or the payload has an unexpected shape".
_FETCH_ERRORS = (requests.RequestException, ValueError, AttributeError, TypeError)


class Crawler_Tencent_News():
    """Crawl releaser pages, follower counts and channel list pages of Tencent News."""

    # Seconds to wait for any single HTTP call before giving up.
    REQUEST_TIMEOUT = 30

    # Static query parameters of the getSubItem endpoint ("chlid" is added per call).
    _SUB_ITEM_PARAMS = {
        "media_openid": "",
        "is_special_device": "0",
        "mid": "0",
        "dpi": "270.0",
        "is_chinamobile_oem": "0",
        "qqnetwork": "wifi",
        "rom_type": "V417IR release-keys",
        "real_device_width": "3.0",
        "net_proxy": "DIRECT@",
        "net_bssid": "01:80:c2:00:00:03",
        "currentChannelId": "news_video_child_newRecommend",
        "isElderMode": "0",
        "apptype": "android",
        "islite": "0",
        "hw": "Xiaomi_MINOTE3",
        "global_session_id": "1558345020297",
        "screen_width": "810",
        "omgbizid": "",
        "sceneid": "",
        "videoAutoPlay": "1",
        "imsi": "460063005313888",
        "fix_store": "",
        "isoem": "0",
        "currentTabId": "news_live",
        "lite_version": "",
        "net_slot": "0",
        "startTimestamp": "1558345020",
        "pagestartfrom": "icon",
        "mac": "mac unknown",
        "activefrom": "icon",
        "net_ssid": "NemuWiFi",
        "store": "10611",
        "screen_height": "1440",
        "extinfo": "",
        "real_device_height": "5.33",
        "origin_imei": "261721032526201",
        "network_type": "wifi",
        "origCurrentTab": "live",
        "global_info": "1|1|1|1|1|14|4|1|0|6|1|1|1||0|J309P000000000:J601P000000000:A401P000050901:J401P100000000:J304P000000000:J701P000000000:B703P000062002:B704P000064803:J702P000000000:B064P000065702:J267P000000000:B060P000066504:A403P000070903:J055P000000000:A402P000060701:B402P200065202:B054P100068903:A054P200070501:A054P600071201:A054P300068801:J054P000000000:J054P040000000|1414|0|1|0|0|0|0|0||3|3|1|1|1|1|1|1|-1|0|0|2|2|0|0|0|4|0|0|1|0|5|2|0|0|3|0|0|1|0|1|1|0|0|1|0|4|0|1|2|11|20|1|0|1|0|0|30|1|4|0|0|3|40|0|51|60|0|0|0|0|0",
        "imsi_history": "460063005313888",
        "net_apn": "0",
        "uid": "7313ae71df9e5367",
        "omgid": "",
        "trueVersion": "5.8.00",
        "qimei": "261721032526201",
        "devid": "7313ae71df9e5367",
        "appver": "23_android_5.8.00",
        "Cookie": "lskey=;skey=;uin=; luin=;logintype=0; main_login=",
    }

    # Endpoint, headers and query string shared by both list-page crawlers.
    _LIST_PAGE_URL = "https://r.inews.qq.com/getQQNewsUnreadList?"
    _LIST_PAGE_HEADERS = {
        "content-type": "application/x-www-form-urlencoded",
        "store": "1",
        "accept": "*/*",
        "idft": "CE1E8744-7BF9-4FDD-87A5-463C6B9A66E1",
        "idfa": "05571C2D-1C86-4B5B-87EF-E4B4DAF07DDB",
        "appver": "12.0.1_qqnews_5.7.22",
        "devid": "d605a70a-d084-487e-aaf1-8a057d40ef39",
        "devicetoken": "<f4b49138 3ca95e38 1519836e daefaab6 799b04da c164f7a7 4cb7d999 6e343393>",
        "accept-language": "zh-Hans-CN;q=1, en-CN;q=0.9",
        "referer": "http://inews.qq.com/inews/iphone/",
        "user-agent": "QQNews/5.7.22 (iPhone; iOS 12.0.1; Scale/2.00)",
        "content-length": "2169",
        "accept-encoding": "br, gzip, deflate",
        "cookie": "logintype=2",
        "qqnetwork": "wifi",
    }
    _LIST_PAGE_QUERY = {
        'appver': '12.0.1_qqnews_5.7.22',
        'pagestartfrom': 'icon',
        'page_type': 'timeline',
        'apptype': 'ios',
        'rtAd': '1',
        'imsi': '460-01',
        'screen_height': '667',
        'network_type': 'wifi',
        'startTimestamp': '1545835451',
        'store': '1',
        'deviceToken': '<f4b49138 3ca95e38 1519836e daefaab6 799b04da c164f7a7 4cb7d999 6e343393>',
        'global_info': '1|1|1|1|1|14|4|1|0|6|1|1|2|2|0|J267P000000000:J060P000000000:B054P000015802:J054P600000000|1421|0|1|0|0|0|0|0|1001|0|6|1|1|1|1|1|1|-1|0|0|0|2|1|1|0|0|2|0|1|0|4|0|0|0|3|0|0|0|0',
        'globalInfo': '1|1|1|1|1|14|4|1|0|6|1|1|2|2|0|J267P000000000:J060P000000000:B054P000015802:J054P600000000|1421|0|1|0|0|0|0|0|1001|0|6|1|1|1|1|1|1|-1|0|0|0|2|1|1|0|0|2|0|1|0|4|0|0|0|3|0|0|0|0',
        'screen_scale': '2',
        'activefrom': 'icon',
        'screen_width': '375',
        'isJailbreak': '0',
        'qqnews_refpage': 'QNCommonListController',
        'omgid': 'a305486b92cc9e48f90929497de4cb30dfde0010112206',
        'device_model': 'iPhone9,1',
        'pagestartFrom': 'icon',
        'device_appin': '6F0D5898-2C3A-46FE-9181-589BC52ED743',
        'devid': 'D605A70A-D084-487E-AAF1-8A057D40EF39',
        'omgbizid': '138dc6ef3ae8a24f7c897a9bbde8b9098f210060113210',
        'idfa': '05571C2D-1C86-4B5B-87EF-E4B4DAF07DDB',
    }

    def __init__(self, platform='腾讯新闻'):
        """Set up device identity, default headers and the video record template."""
        self.platform = platform
        self.devid = '008796749793280'
        # self.appver = '23_android_5.4.10'
        # self.devid = "7313ae71df9e5367",
        self.appver = "23_android_5.8.00"
        self.qnrid = str(uuid.uuid4())
        self.headers = {"Host": "r.inews.qq.com",
                        "Accept-Encoding": "gzip,deflate",
                        "Referer": "http://inews.qq.com/inews/android/",
                        "User-Agent": "%E8%85%BE%E8%AE%AF%E6%96%B0%E9%97%BB5410(android)",
                        "Cookie": "lskey=;luin=;skey=;uin=; logintype=0; main_login=qq;",
                        "Connection": "Keep-Alive"}
        self.video_data = Std_fields_video().video_data
        self.video_data['platform'] = self.platform
        untouch_key_lst = ['channel', 'describe', 'isOriginal', 'repost_count']
        for key in untouch_key_lst:
            # Tolerate a template that lacks one of these keys.
            self.video_data.pop(key, None)
        self.crawler_video_page = crawler_v_qq.Crawler_v_qq().video_page
        self.list_page_dict = {"体育": "8"}

    def get_releaser_id(self, releaserUrl):
        """Return the releaser id for ``releaserUrl`` via the shared module-level helper."""
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def forllower_num_to_int(self, num):
        """Convert a follower count to a number.

        Numbers are returned unchanged. Strings may be plain digits or carry
        the Chinese unit suffix "万" (1e4) or "亿" (1e8). Returns ``None``
        when the value is missing or cannot be parsed.
        """
        if isinstance(num, (int, float)):
            return num
        if num is None:
            return None
        text = str(num).strip()
        try:
            if text.endswith("万"):
                # round() avoids float truncation such as 4.35 * 1e4 -> 43499.
                return int(round(float(text[:-1]) * 1e4))
            if text.endswith("亿"):
                return int(round(float(text[:-1]) * 1e8))
            return int(float(text))
        except ValueError:
            return None

    def _fetch_sub_item(self, releaser_id):
        """Request getSubItem for ``releaser_id`` and return the decoded JSON."""
        params = {"chlid": releaser_id}
        params.update(self._SUB_ITEM_PARAMS)
        url = "https://r.inews.qq.com/getSubItem?%s" % urllib.parse.urlencode(params)
        res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return res.json()

    def get_releaser_follower_num(self, releaserUrl):
        """Return ``(follower_num, releaser_img)`` for a releaser.

        Returns ``None`` (after printing a message) when the request fails or
        the response does not contain ``channelInfo``.
        """
        releaser_id = self.get_releaser_id(releaserUrl)
        try:
            res_json = self._fetch_sub_item(releaser_id)
            follower_num = self.forllower_num_to_int(res_json.get("channelInfo").get("subCount"))
            releaser_img = res_json.get("channelInfo").get("icon")
        except _FETCH_ERRORS:
            print("can't find followers")
            return None
        print('%s follower number is %s' % (releaserUrl, follower_num))
        return follower_num, releaser_img

    def get_releaser_image(self, releaserUrl=None, data=None):
        """Return the releaser icon URL.

        With ``releaserUrl`` the getSubItem endpoint is queried; otherwise the
        icon is read from ``data``, an already-fetched getSubItem response.
        Returns ``None`` when the icon cannot be determined.
        """
        if releaserUrl:
            releaser_id = self.get_releaser_id(releaserUrl)
            try:
                res_json = self._fetch_sub_item(releaser_id)
                return res_json.get("channelInfo").get("icon")
            except _FETCH_ERRORS:
                print("can't get releaser_img_url")
                return None
        channel_info = data.get("channelInfo") if isinstance(data, dict) else None
        if not isinstance(channel_info, dict):
            return None
        return channel_info.get("icon")

    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None,
                      proxies_num=None):
        """Yield one video record per video published by a releaser.

        Pages of getUserVideoList are fetched until the API reports no more
        pages, a page comes back empty or undecodable, or
        ``releaser_page_num_max`` pages have been requested. The output_*,
        filepath, push_to_redis, es_index and doc_type parameters are accepted
        for interface compatibility and are not used here.
        """
        proxies = get_proxy(proxies_num)
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return
        # qnrid, qnsig = self.build_qnrid_and_qnsig(cgi="om")
        article_id = "ec8bb1459b9d84100312bf035bb43cd4d0_%s" % releaser_id
        # The POST body does not change between pages, so build it once.
        post_body = {
            "activefrom": "icon",
            "apptype": "android",
            "article_pos": "0",
            "articleID": article_id,
            "articlepage": "-1",
            "articletype": "509",
            "articleType": "509",
            "articleUUID": "7a7f71aff201a175cbb8b946b1a0ab3b",
            "cell_id": "normal_article_cell",
            "cityList": "news_news_bj",
            "coverType": "0",
            "currentChannelId": "news_news_top",
            "currentTabId": "news_news",
            "dpi": "270.0",
            "global_info": "1|1|1|1|1|14|4|1|0|6|1|1|1||0|J309P000000000:J902P000000000:J601P000000000:A601P400109701:A601P200096101:J601P500000000:J601P100000000:J601P600000000:J601P300000000:J603P000000000:J604P000000000:A401P000050901:J401P100000000:J602P000000000:J602P900000000:J304P000000000:J701P000000000:B703P000107302:J704P000000000:B702P000098602:A064P000117303:B085P000087702:B267P000118602:J267P100000000:B073P000120202:A060P000116701:J060P400000000:J060P100000000:J060P016000000:A403P100114101:J403P000000000:J055P000000000:J055P200000000:B402P100095203:J402P000000000:J402P013000000:A054P000101101:A054P600071201:J054P200000000:B901P000117402|1414|0|1|24|24|0|0|0||3|3|1|1|1|1|1|1|-1|0|0|2|2|0|0|0|4|0|0|1|2|5|2|0|0|3|0|0|1|0|1|1|0|0|1|0|4|0|1|1|11|20|1|0|1|1|0|0|1|4|0|1|1|40|0|51|60|0|0|0|0|0|4|0|0|0|0|0|0",
            "global_session_id": "1564032931171",
            "hasVideo": "0",
            "hw": "vivo_VIVOX20Plus",
            "id": article_id,
            "idStr": article_id,
            "imsi": "460073046925329",
            "imsi_history": "460073046925329",
            "is_chinamobile_oem": "0",
            "is_special_device": "0",
            "isAd": "0",
            "isCpFocus": "0",
            "isElderMode": "0",
            "isGifPlayed": "0",
            "isHotCommentLink": "0",
            "isHotNews": "0",
            "isIPSpecialVideo": "0",
            "islite": "0",
            "isoem": "0",
            "mac": "mac unknown",
            "mid": "0",
            "moduleArticlePos": "0",
            "net_apn": "0",
            "net_bssid": "49:4a:55:76:75:58",
            "net_proxy": "DIRECT@",
            "net_slot": "0",
            "net_ssid": "IJUvuXkoA8H",
            "network_type": "wifi",
            "newsID": article_id,
            "origCurrentTab": "top",
            "origin_imei": "287801615436009",
            "originPageType": "second_timeline",
            "page_type": "second_timeline",
            "pageIsIPSpecialVideo": "0",
            "pagestartfrom": "icon",
            "qqnetwork": "wifi",
            "real_device_height": "5.33",
            "real_device_width": "3.0",
            "realArticlePos": "0",
            "rom_type": "V417IR release-keys",
            "screen_height": "1440",
            "screen_width": "810",
            "startTimestamp": "0",
            "store": "10611",
            # "title": "看看新闻Knews",
            "userId": "ec8bb1459b9d84100312bf035bb43cd4d0",
            "userMediaId": releaser_id,
            "userVipType": "0",
            "videoAutoPlay": "1",
            "videoBlackBorder": "0",
            "videoShowType": "0",
        }
        post_url = "https://r.inews.qq.com/getUserVideoList?"
        request_kwargs = {"headers": self.headers,
                          "data": post_body,
                          "timeout": self.REQUEST_TIMEOUT}
        if proxies:
            request_kwargs["proxies"] = proxies

        has_more = True
        count = 1
        page_time = ""
        while has_more and count <= releaser_page_num_max:
            url_dic = {
                "chlid": releaser_id,
                "page_time": page_time,
                "coral_uin": "ec8bb1459b9d84100312bf035bb43cd4d0",
                "coral_uid": "",
                "type": "om",
                "uid": "7313ae71df9e5367",
                "omgid": "",
                "trueVersion": "5.8.00",
                "qimei": "287801615436009",
                # "devid" :"7313ae71df9e5367",
                # "appver" :"23_android_5.8.00",
                'devid': self.devid,
                'appver': self.appver,
            }
            url = post_url + urllib.parse.urlencode(url_dic)
            get_page = requests.post(url, **request_kwargs)
            try:
                page_dic = get_page.json()
            except ValueError:
                # Body was not JSON; treat it as an empty page.
                page_dic = {}
            if not isinstance(page_dic, dict):
                page_dic = {}

            data_list = page_dic.get('newslist')
            if not data_list:
                print("no data in releaser %s page %s" % (releaser_id, count))
                break
            has_more = page_dic.get('next') or False
            page_time = str(page_dic.get("last_time"))
            print("craw data in releaser %s page %s" % (releaser_id, count))
            count += 1

            for one_video in data_list:
                video_channel = one_video.get('video_channel')
                video_meta = video_channel.get("video") if isinstance(video_channel, dict) else None
                if not isinstance(video_meta, dict):
                    video_meta = {}

                video_dic = copy.deepcopy(self.video_data)
                video_dic['title'] = one_video.get('title')
                video_dic['url'] = one_video.get('url')
                video_dic['releaser'] = one_video.get('chlname')
                video_dic['releaserUrl'] = "https://view.inews.qq.com/media/%s" % releaser_id
                release_time = one_video.get('timestamp')
                try:
                    video_dic['release_time'] = int(release_time * 1e3)
                except TypeError:
                    # Timestamp missing or not numeric.
                    video_dic['release_time'] = None
                duration = int(self.t2s(one_video.get('videoTotalTime')))
                if not duration:
                    duration = int(self.t2s(video_meta.get("duration")))
                video_dic['duration'] = duration
                video_dic['play_count'] = video_meta.get("playcount")
                video_dic['repost_count'] = one_video.get('shareCount')
                video_dic['comment_count'] = one_video.get('comments')
                video_dic['favorite_count'] = one_video.get('likeInfo')
                video_dic['video_id'] = one_video.get('vid')
                video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3)
                video_dic['releaser_id_str'] = "腾讯新闻_%s" % releaser_id
                video_dic['video_img'] = one_video.get('pic_minivideo')
                yield video_dic

    @staticmethod
    def t2s(t):
        """Convert a clock string to seconds.

        Accepts "s", "m:s" or "h:m:s". Returns 0 for empty or unparseable
        input so callers can safely wrap the result in ``int()``.
        """
        if not t:
            return 0
        parts = str(t).strip().split(":")
        if len(parts) > 3:
            return 0
        seconds = 0.0
        try:
            for part in parts:
                seconds = seconds * 60 + float(part)
        except ValueError:
            return 0
        return seconds

    def get_playcnt(self, url):
        """Return ``(play_count, vid, info_source, data_source)`` for a news item.

        The play count is read from the news API response at ``url``; when it
        is absent and a video id is present, the v.qq.com video page is
        crawled as a fallback.
        """
        get_page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page_dic = get_page.json()
        try:
            play_count = page_dic['attribute']['VIDEO_0']['playcount']
        except (KeyError, TypeError):
            play_count = None
        try:
            vid = page_dic['attribute']['VIDEO_0']['vid']
        except (KeyError, TypeError):
            vid = None

        data_source = None
        if play_count is not None:
            info_source = 'tencent_news'
        else:
            info_source = 'tencent_video'
            if vid is not None:
                video_url = 'https://v.qq.com/x/page/%s.html' % vid
                added_dic = self.crawler_video_page(url=video_url)
                try:
                    play_count = added_dic['play_count']
                    data_source = added_dic['data_source']
                except (KeyError, TypeError):
                    play_count = None
                    data_source = None
        return play_count, vid, info_source, data_source

    def _parse_list_video(self, video_info, channel_id, title_key, read_count_key):
        """Build a video record from one list-page item, or return None to skip it."""
        if video_info.get('hasVideo') != 1 and video_info.get('video_channel') is None:
            return None
        try:
            title = video_info[title_key]
            url = video_info['url']
            release_time = int(video_info['timestamp'] * 1e3)
            comment_count = video_info['comments']
            favorite_count = video_info['likeInfo']
            article_id = video_info['id']
        except (KeyError, TypeError) as err:
            # One malformed item should not abort the whole crawl.
            print("skip malformed list item: %r" % (err,))
            return None

        video_dict = copy.deepcopy(self.video_data)
        video_dict['channel'] = channel_id
        video_dict['title'] = title
        print(video_dict['title'])
        video_dict['url'] = url
        try:
            dura_str = video_info['video_channel']['video']['duration']
            video_dict['duration'] = trans_duration(dura_str)
        except Exception:
            # trans_duration is a project helper whose failure modes are not
            # visible here, so any failure falls back to 0 as before.
            video_dict['duration'] = 0
        video_dict['releaser'] = video_info.get('chlname')
        try:
            video_dict['releaser_id'] = video_info['card']['uin']
        except (KeyError, TypeError):
            video_dict['releaser_id'] = None
        video_dict['release_time'] = release_time
        video_dict['read_count'] = video_info.get(read_count_key, 0)
        video_dict['comment_count'] = comment_count
        video_dict['favorite_count'] = favorite_count
        try:
            video_dict['play_count'] = video_info['video_channel']['video']['playcount']
        except (KeyError, TypeError):
            video_dict['play_count'] = 0
        video_dict['article_id'] = article_id
        video_dict['video_id_str'] = video_info.get('vid')
        video_dict['fetch_time'] = int(time.time() * 1e3)
        return video_dict

    def _crawl_list_pages(self, channel_id, post_dic, page_fields,
                          title_key, read_count_key, list_page_max, output_kwargs):
        """Fetch up to ``list_page_max`` list pages and flush records in batches of 100.

        ``page_fields`` maps a POST field name to the offset added to the
        zero-based page index before it is sent. Returns the records still
        buffered after the final flush, as the public list-page methods did.
        """
        query_dict = dict(self._LIST_PAGE_QUERY)
        result_list = []
        for page_index in range(list_page_max):
            for field, offset in page_fields.items():
                post_dic[field] = str(page_index + offset)
            query_dict['startTimestamp'] = int(time.time())
            url = self._LIST_PAGE_URL + urllib.parse.urlencode(query_dict)
            get_page = requests.post(url, data=post_dic,
                                     headers=dict(self._LIST_PAGE_HEADERS),
                                     timeout=self.REQUEST_TIMEOUT)
            try:
                page_dict = get_page.json()
            except ValueError:
                print("page %s is not valid JSON, skipped" % page_index)
                continue
            video_list = page_dict.get("newslist") if isinstance(page_dict, dict) else None
            if not video_list:
                continue
            print("get data at page %s" % str(page_index))
            for video_info in video_list:
                video_dict = self._parse_list_video(video_info, channel_id,
                                                    title_key, read_count_key)
                if video_dict is None:
                    continue
                result_list.append(video_dict)
                if len(result_list) >= 100:
                    output_result(result_Lst=result_list,
                                  platform=self.platform,
                                  **output_kwargs)
                    result_list.clear()
        if result_list:
            output_result(result_Lst=result_list,
                          platform=self.platform,
                          **output_kwargs)
        return result_list

    def list_page_for_main_page(self, channel_id,
                                output_to_file=False,
                                filepath=None,
                                list_page_max=30,
                                output_to_es_raw=False,
                                es_index=None,
                                doc_type=None,
                                output_to_es_register=False):
        """Crawl the main-page video feed of ``channel_id`` and output the records.

        Returns the records remaining in the buffer after the final flush.
        """
        post_dic = {'chlid': channel_id,
                    'forward': '2',
                    'uid': '6F0D5898-2C3A-46FE-9181-589BC52ED743',
                    'adReqData': '{"chid":2,"adtype":0,"pf":"iphone","launch":"0","ext":{"mob":{"mobstr":"Aejy45+NeSZw4VxymYnnIhMV+MEM+6sW9Rkw16FvkWGCz1rsPQflpTnsN+KnArzMwheqHiLErlbOlNWL0SoBI0lJtRh13iyR+LxSv3Y+hJrixm\\/Sxn\\/YhInAhlYioOjQ9cHGSSRmdgaDyqx2dDLZosKp+QSMqr649GGxQ36xbSdjbvZ3MGywBOsVNcf+EZkV+U9Q8LyDPc6PZ56b\\/GLGncf4XcrVFnKlUi+kebsg8DCD\\/nlvTDGSkWOtu33GJ4Ct\\/hfZ1c3UNHw5bRwHRM0L0+6QYANTrPzl2X6hZK3kijlJsub+RvcPNPNQGrhK3e4yYHJmspW19qE5mPgxd5lbwzJ8VQifTrjGeB+cdCcGmEPYBcZwHmxRhEAo7A0bJcSLK5KACWNsKw8I085yoKLCIE40\\/1J+umH8QsTU6K+wLdpjpaI6D3XMa\\/GZiguAcNB7HMSMpBFY6dq1saxz0u+6Ex2n2CwJlY4JYzf2S4r69t8J1WCQInAjIf\\/Io+ZVhXNnNUx3GVir\\/TaffnYpd\\/5ZvqdKtBIWXZFtXOoWC66tNBG\\/D+YAoY8\\/yVAQL7slsS1qbjdDqByVI2DMq299y6yAh0hejMouwaCGK2Q2OCMes5xrghJ1sotO5mSqioK23WbdF9GiQSVqmbE94wzpCwaPCwrEzkgKWHuPxh0UlqUs9QeGe30SHv4OOpqF9QOUeXYJ\\/Xkana90uC32g3LuM6jdPTv07qbyk1tX87pGdnyvjR9BBEhb0dyLUFi\\/Gx8t4T+yHLxt0X9yKsGKCJX1U8AdkTwLlJslIX9Rzqy+Yb1n9sg85KAS5yUsQZqSv9kKRuZpYsfj6LLaI\\/Bet9BUNtGu4hYuZBqKFWp34XegvS4d3M9U"}},"ver":"5.7.22","slot":[{"islocal":0,"orders_info":["67950414,6870997,0,1000,0,110,2","88685632,1266139,1761176905,19,101,110,3","48980066,1934913,3602870493,19,101,110,1"],"recent_rot":["1,2,3"],"refresh_type":0,"loid":"1,13,16,23","channel":"news_video_top"}],"appversion":"181210"}',
                    'kankaninfo': '{"gender":1,"lastExp":416,"refresh":0,"scene":2}',
                    'channelPosition': '1',
                    'rendType': 'kankan',
                    'page': '0'}
        output_kwargs = dict(output_to_file=output_to_file,
                             filepath=filepath,
                             output_to_es_raw=output_to_es_raw,
                             es_index=es_index,
                             doc_type=doc_type,
                             output_to_es_register=output_to_es_register)
        return self._crawl_list_pages(channel_id, post_dic,
                                      page_fields={'page': 0},
                                      title_key='longtitle',
                                      read_count_key='read_count',
                                      list_page_max=list_page_max,
                                      output_kwargs=output_kwargs)

    def list_page_for_special_area(self, channel_id,
                                   output_to_file=False,
                                   filepath=None,
                                   list_page_max=30,
                                   output_to_es_raw=False,
                                   es_index=None,
                                   doc_type=None,
                                   output_to_es_register=False):
        """Crawl a special-area (local channel) feed and output the records.

        Every page from 0 to ``list_page_max - 1`` is requested. Returns the
        records remaining in the buffer after the final flush.
        """
        post_dic = {
            'adReqData': '{"chid":2,"adtype":0,"pf":"iphone","launch":"0","ext":{"mob":{"mobstr":"Aejy45+NeSZw4VxymYnnIhMV+MEM+6sW9Rkw16FvkWGCz1rsPQflpTnsN+KnArzMwheqHiLErlbOlNWL0SoBI0lJtRh13iyR+LxSv3Y+hJrixm\\/Sxn\\/YhInAhlYioOjQ9cHGSSRmdgaDyqx2dDLZosKp+QSMqr649GGxQ36xbSdjbvZ3MGywBOsVNcf+EZkV+U9Q8LyDPc6PZ56b\\/GLGncf4XcrVFnKlUi+kebsg8DCD\\/nlvTDGSkWOtu33GJ4Ct\\/hfZ1c3UNHw5bRwHRM0L0+6QYANTrPzl2X6hZK3kijlJsub+RvcPNPNQGrhK3e4yYHJmspW19qE5mPgxd5lbwzLfC4rOa2XJGXs8Am8hxBVUQBrYaSX5y1D\\/H2H+\\/KuPjUhMtylfH4pqvrYmedw8h56zQLScQQ1xOMsiYtb72YRegl4pByfwExmmQ3L8EtBRDGoJznbwnCe863BRgZTCS9jQT0Wry6f1UGhpmH98UCfP\\/fzWCLOCPaXJCH5gxYdSIOc7u4nw7mBbPk\\/xhFWz7PDTCw9wxEwVpLBshqbxVfPQ9eTND\\/BEd9hrtE\\/ZVlJz+wIIaabOUgyMMEqGqUNPvI5Dt6JLD\\/s2yPA2zd8saGSjrLcBHzKfKEt4prtCjasz+\\/IK8eWT5QCrrJC9swLAAdUFKjX6mAcpR0ZF97ubI6I4rheTkfhfQ+5gX9Dm7ahfs6b4Fzk0ewwY9uim4BEVkzQqHeIejtVShVG8LoXuqqPsen4YS2QGhDvzfop6Usr4J8Eb\\/lFZREasEN1MRNC8FqcQoWQPc\\/BGyxU0viDeZKH3wtZ2jXhs7l8xqX9jbaON1nqgdayLVLQz+POAZnQz7iwTjrFX9A9mYM\\/NgUA32jQq"}},"ver":"5.7.22","slot":[{"loid":"1,13,16,23","channel":"news_news_sh","recent_rot":["1,2,3","4,5,6","4,7,6","8","9,10"],"refresh_type":2,"islocal":1,"seq_loid":"1,1,1,1,1","cur":58,"orders_info":["91987701,8122038,0,19,4201,110,1","91234216,9163219,537783065,19,4301,110,1","89033789,1252416,860268798,19,2804,110,1","91741757,7748442,3134180117,19,4301,110,1","82330504,3792964,3640746586,1000,101,110,2","89123229,8862275,3031448928,19,507,110,1","91890078,8651597,3360875772,19,4001,110,3","91961141,7311564,2009512134,19,4301,110,1","91639391,9117817,826327870,1000,4201,110,2","76056706,6378696,2577335544,19,4107,110,1"],"current_rot":"4,7,6,8,9","seq":"4,9,15,40,50"}],"appversion":"181210"}',
            'lc_ids': 'CELLSHC201504210000',
            'uid': '6F0D5898-2C3A-46FE-9181-589BC52ED743',
            'is_new_user': '0',
            # 'feedbackNewsId': '20181228A0XN2100|2|0,20181215V0I04Q00|4|0,20181228A15XAT00|2|0,20181224V0C58S00|4|0,20181228A1EQLH00|2|0',
            'chlid': channel_id,
            'channelType': channel_id,
            'newsTopPage': '0',
            'feedbackCur': '53',
            'channelPosition': '67',
            'page': '1',
            'forward': '1',
            'picType': '1,2,0,2,1,2,1,2,1,0,1,2,0,2,1,2,1,2,1,2',
            'cachedCount': '50'}
        output_kwargs = dict(output_to_file=output_to_file,
                             filepath=filepath,
                             output_to_es_raw=output_to_es_raw,
                             es_index=es_index,
                             doc_type=doc_type,
                             output_to_es_register=output_to_es_register)
        # 'newsTopPage' is zero-based while 'page' is one-based.
        return self._crawl_list_pages(channel_id, post_dic,
                                      page_fields={'newsTopPage': 0, 'page': 1},
                                      title_key='title',
                                      read_count_key='readCount',
                                      list_page_max=list_page_max,
                                      output_kwargs=output_kwargs)

    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
        """Yield releaser videos, stopping once too many fall before ``start_time``.

        Videos with ``start_time < release_time < end_time`` are yielded.
        Videos at or before ``start_time`` are still yielded but counted; the
        crawl stops once more than ``allow`` of them have been seen. Videos
        without a release time are skipped.

        ``kwargs`` may carry ``proxies_num`` and ``releaser_page_num_max``,
        which are forwarded to :meth:`releaser_page`.

        NOTE(review): the branch nesting was reconstructed from a source file
        whose indentation had been lost - confirm against the upstream copy.
        """
        page_kwargs = {"proxies_num": kwargs.get("proxies_num")}
        if kwargs.get("releaser_page_num_max") is not None:
            page_kwargs["releaser_page_num_max"] = kwargs["releaser_page_num_max"]

        count_false = 0
        for res in self.releaser_page(url, **page_kwargs):
            video_time = res["release_time"]
            if not video_time:
                continue
            if start_time < video_time:
                if video_time < end_time:
                    yield res
            else:
                count_false += 1
                if count_false > allow:
                    break
                yield res


if __name__ == "__main__":
    t = Crawler_Tencent_News()
    # t.get_releaser_follower_num("https://view.inews.qq.com/media/5498518?tbkt=I&uid=")
    url = "https://view.inews.qq.com/media/5196832"
    # t.releaser_page("https://view.inews.qq.com/media/5196832",output_to_es_raw=True, es_index='crawler-data-raw', doc_type='doc')
    # releaser_page_by_time is a generator: it must be iterated to do any work.
    # allow=20 is an arbitrary tolerance for this demo run.
    for video in t.releaser_page_by_time(1546272000000, 1564362018000, url,
                                         allow=20,
                                         output_to_es_raw=True,
                                         es_index='crawler-data-raw',
                                         doc_type='doc',
                                         releaser_page_num_max=4000):
        print(video)