# -*- coding: utf-8 -*- """ Created on Thu Mar 15 15:23:08 2018 @author: fangyucheng Edited by hanye on 2018-05-15 Edited by fangyucheng on 2018-05-17 Edited by litao on 2019-04-10 """ import copy import datetime import json import random import re import time import urllib try: from crawler_sys.framework.func_get_releaser_id import * except: from write_data_into_es.func_get_releaser_id import * import requests from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy from crawler.crawler_sys.framework.get_redirect_resp import get_redirected_resp # from crawler.crawler_sys.utils.get_toutiao_as_cp_signature import as_cp from crawler.crawler_sys.framework.video_fields_std import Std_fields_video from crawler.crawler_sys.site_crawler.toutiao_get_signature import getHoney from crawler.crawler_sys.utils.output_results import output_result from crawler.crawler_sys.utils.output_results import retry_get_url from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration from crawler.crawler_sys.utils import output_log from crawler.crawler_sys.utils.util_logging import logged from write_data_into_es.func_cal_doc_id import cal_doc_id class Crawler_toutiao(): def __init__(self, timeout=None, platform='toutiao'): if timeout is None: self.timeout = 10 else: self.timeout = timeout self.platform = platform std_fields = Std_fields_video() self.video_data = std_fields.video_data self.video_data['platform'] = self.platform # remove fields that crawled data don't have pop_key_Lst = ['channel', 'describe', 'isOriginal', "video_id"] for popk in pop_key_Lst: self.video_data.pop(popk) self.releaser_url_pattern = 'http://www.365yg.com/c/user/[RELEASER_ID]/' self.list_page_url_dict = {'all_channel': ( 'https://www.365yg.com/api/pc/feed/?max_behot_time=0' '&category=video_new&utm_source=toutiao')} self.legal_list_page_urls = [] self.legal_channels = [] self.api_list = [ "ic", "is", "api3-normal-c-hl", "ib", "api3-normal-c-lf", "id", "ie", "api3-normal-c-lq", "ii", "io", "it", "iu", "lf", "lg", "lh", ] for ch in self.list_page_url_dict: list_page_url = self.list_page_url_dict[ch] self.legal_list_page_urls.append(list_page_url) self.legal_channels.append(ch) # self.headers = { # "Connection": "keep-alive", # "User-Agent": "News 7.6.0 rv:7.6.0.14 (iPad; iOS 13.3.1; zh_CN) Cronet", # "sdk-version": "1", # "x-ss-sessionid": "", # "passport-sdk-version": "5.4.0", # "X-SS-DP": "13", # "Accept-Encoding": "gzip, deflate", # "X-Gorgon": "030000003005ed4d41d9d6e0dd5c9b0dbb917c401852122c282d", # # } # self.headers = { # "Connection": "keep-alive", # # "User-Agent": "News 7.6.0 rv:7.6.0.14 (iPad; iOS 13.3.1; zh_CN) Cronet", # "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 5.1.1; OPPO R11 Build/NMF26X) NewsArticle/7.1.5 cronet/TTNetVersion:2996a052 2019-08-12", # "sdk-version": "2", # # "x-ss-sessionid": "", # # "passport-sdk-version": "5.4.0", # # "X-SS-DP": "13", # "Accept-Encoding": "gzip, deflate", # "X-Gorgon": "030000003005ed4d41d9d6e0dd5c9b0dbb917c%sc282d" % random.randint(401800000, 401952122), # # } # self.headers = { # # # "Host": "api3-normal-c-lf.snssdk.com", # "Connection": "keep-alive", # "User-Agent": "Dalvik/2.1.0 (Linux; U; Android 5.1.1; OPPO R11 Build/NMF26X) NewsArticle/7.1.5 cronet/TTNetVersion:2996a052 2019-08-12", # "sdk-version": "1", # "Accept-Encoding": "gzip", # } self.headers = { "accept": "text/javascript, text/html, application/xml, text/xml, */*", "accept-encoding": "gzip, deflate", "accept-language": "zh,zh-CN;q=0.9", "content-type": "application/x-www-form-urlencoded", # "cookie": "gftoken=MjA4NTcyMDkyMXwxNTgyOTYxNjM3NjZ8fDAGBgYGBgY; SLARDAR_WEB_ID=9706fc8c-b8a6-4265-8a2e-e3f0739daaf2; UM_distinctid=1708fddb4c0466-04c756d28410e1-752c6c3c-51abc-1708fddb4c1790; CNZZDATA1274386066=608234173-1582960977-https%253A%252F%252Fwww.toutiao.com%252F%7C1582960977", # "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=50502346296&media_id=50502346296&request_source=1", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36", "x-requested-with": "XMLHttpRequest", } # log_path = '/home/hanye/crawlersNew/crawler/crawler_log' # current_day = str(datetime.datetime.now())[:10] # info_log_file = log_path + '/all_' + current_day + '.log' # info_name = self.platform + '_info' # error_log_file = log_path + '/error_' + current_day + '.log' # error_name = self.platform + '_error' # self.loggerinfo = output_log.init_logger(name=info_name, log_file=info_log_file) # self.loggererror = output_log.init_logger(name=error_name, log_file=error_log_file) def extract_field(self, raw_str, raw_field_name): try: field_value = (re.findall('%s:.+?,' % raw_field_name, raw_str)[0] .replace('%s:' % raw_field_name, '')[:-1]) # remove string start space and single quotation marks field_value_cleaned = re.sub('\'$', '', re.sub('^\s+?\'', '', field_value)) except: field_value_cleaned = None return field_value_cleaned def get_host_str(self, url): get_host_str = re.findall('://.+?/', url) if get_host_str != []: host_str = get_host_str[0].replace(':', '').replace('/', '') else: host_str = None return host_str def video_page_via_m(self, url): url_video_id_midstep = ' '.join(re.findall('com/.*', url)).replace('com/', '') url_video_id = re.findall('\d+', url_video_id_midstep)[0] # mobile_url = 'https://m.365yg.com/i'+url_video_id+'/info/?' mobile_url = 'https://m.365yg.com/i' + url_video_id get_page = retry_get_url(mobile_url) if get_page is not None: return None else: page = get_page.text page = page.replace('true', 'True') page = page.replace('false', 'False') page = page.replace('null', '"Null"') try: page_dic = eval(page) except: page_dic = None print('Failed to transfer text to dict on url: %s' % url) return None video_dict = copy.deepcopy(self.video_data) try: video_dic = page_dic['data'] title = video_dic['title'] releaser = video_dic['source'] releaser_id = video_dic['creator_uid'] releaserUrl = self.releaser_url_pattern.replace('[RELEASER_ID]', str(releaser_id)) play_count = video_dic['video_play_count'] comment_count = video_dic['comment_count'] release_time = int(video_dic['publish_time'] * 1e3) video_id = video_dic['video_id'] fetch_time = int(datetime.datetime.now().timestamp() * 1e3) # video_dict['title'] = title video_dict['url'] = url video_dict['play_count'] = play_count video_dict['comment_count'] = comment_count video_dict['video_id'] = video_id video_dict['releaser'] = releaser video_dict['releaser_id_str'] = str(releaser_id) video_dict['releaserUrl'] = releaserUrl video_dict['release_time'] = release_time video_dict['fetch_time'] = fetch_time except: print('Failed when extracting data from page of url: %s' % url) return video_dict def video_page(self, url): """ release_time is missing, should update this field when inserting into es. url such as 'http://toutiao.com/group/6532819433331622404/' can not get data now suggestion by fangyucheng is to rebuild url as www.365yg.com with video_id to sovle the problem """ if "item" in url or "group" in url: vid = re.findall("/(\d+)/", url)[0] elif "xigua" in url: vid = re.findall("/i(\d+)/", url)[0] else: print(url) return None headers = { "Accept-Encoding": "gzip", "X-SS-REQ-TICKET": str(int(datetime.datetime.now().timestamp()) * 1e3), "sdk-version": "1", # "Cookie": "qh[360]=1; install_id=85200129335; ttreq=1$e8e97b875965bf4af4b5dbaaba4d4a5ec3441e47; history=JM89SDpxGAfw5%2F%2Bo%2F7tEz15%2FZ0tbUEN7Q8FhEYQQIdJ2oNFBgpagCA7BIbUFNUT0NjSkRIvl2AveOdr2XEVUuDS0FFnQEETEo%2BOH5%2Fvj9%2F0WyqF4xphMZNLJeD6aSBmk15Tt4nTWSGUaEHR0e%2BG9aqGfPFOgOXrZ%2BtQBJVI6QXPA89R9dzs2QCqC6eil7H3eQhcFiJOXE4NLgDL9q7FscXLM78Qv62rk0GuiRN511vlNRZioEEArGesNaKhQXxBmHd1q7ic19JNcb90Cu1ELfdQz11KkY4Ob%2BWZYex%2BRPCfFK6uaO12GkJ%2FEN%2BtofMgAVEg8s0qbw2ehgkKiwToovMVNdJP4ai%2Fqvw4vjlLXFi%2BqefWmhTKpUvum%2FoR3VBIvYDrgeYT5YtpNksxJe6WeA3SReODW1diayV1cq%2FzDhf2%2FoqFMognaHwAAAP%2F%2F; odin_tt=8cd4f07f6dc385b01edd52312dd29fbe7fdbfa059194493779de3fe408b8836bb9265292bb9335bc976037dd93e5d131de7acf894a805930417b4d3be7f308e0", # "X-Gorgon": "0300ddd08400675de6e75ad03849011c863306ddae2b0eb3cec4", # "X-Khronos": str(int(datetime.datetime.now().timestamp())), # "Host": "xgapi.snssdk.com", "Connection": "Keep-Alive", "Authorization": "HMAC-SHA1:2.0:1573091168911407306:bab42eac5b9e4a8eb25a91fc371ad533:WTfDrhnIsymHfmHCgG9YvRSu2YY=", "User-Agent": "okhttp/3.10.0.1", "X-Pods": "", } print(vid) url_dic = { "group_id": vid, "item_id": vid, "aggr_type": 0, "context": 1, "flags": 64, # "iid": "77627602260", # "device_id": random.randint(50000000000,59999999999), "ac": "wifi", "channel": "update", "aid": "13", "app_name": "news_article", "version_code": "732", "version_name": "7.3.2", "device_platform": "android", "ab_version": "830855,947965,942635,662176,665176,674051,643894,919834,649427,677130,710077,801968,707372,661900,668775,990369,739390,662099,668774,765190,976875,857803,952277,757281,679101,660830,759657,661781,648315", "ab_group": "100168", "ab_feature": "94563,102749", "ssmix": "a", # "device_type": "oppo R11s Plus", # "device_brand": "OPPO", "language": "zh", "os_api": "23", "os_version": "9.0.1", # "uuid": "250129616283002", # "openudid": "7313ae71df9e5367", "manifest_version_code": "731", "resolution": "810*1440", "dpi": "270", "update_version_code": "75410", "_rticket": int(datetime.datetime.now().timestamp() * 1e3), # "rom_version": "coloros__v417ir release-keys", # "fp": "w2TZFzTqczmWFlwOLSU1J2xecSKO", "tma_jssdk_version": "1.24.0.1", # "pos": "5r_x8vP69Ono-fi_p6ysq7Opra2kr6ixv_H86fTp6Pn4v6eupLOkra6vpajg", # "plugin": "0", # "ts":int(datetime.datetime.now().timestamp()), # "as":"ab7f9fce505d1d7dbe7f9f", # "mas":"011993339399f959a359d379b98587814259a359d3997919d319b3" } url = 'http://xgapi.snssdk.com/video/app/article/information/v25/?%s' % ( urllib.parse.urlencode(url_dic)) # get_page = get_redirected_resp(url) res = retry_get_url(url, headers=headers, timeout=5, proxies=1) try: get_page = res.json() # print(get_page) except: return None if get_page is None: return None else: data = get_page.get("data") video_dict = copy.deepcopy(self.video_data) fetch_time = int(datetime.datetime.now().timestamp() * 1e3) video_dict['url'] = data.get("display_url") try: try: video_dict['title'] = data.get("h5_extra").get("title") except: video_dict['title'] = data.get("share_info").get("title") except: return None video_dict['play_count'] = data.get("video_watch_count") video_dict['favorite_count'] = data.get("digg_count") video_dict['comment_count'] = data.get("comment_count") if not video_dict['comment_count']: video_dict['comment_count'] = 0 video_dict['repost_count'] = data.get("repin_count") if not video_dict['repost_count']: video_dict['repost_count'] = 0 video_dict['video_id'] = vid try: video_dict['releaser'] = data.get("h5_extra").get("name") except: video_dict['releaser'] = data.get("source") try: video_dict['releaser_id_str'] = data.get("h5_extra").get("media_user_id") except: video_dict['releaser_id_str'] = data.get("user_info").get("user_id") video_dict['releaserUrl'] = "https://www.toutiao.com/c/user/%s/" % video_dict['releaser_id_str'] video_dict['releaser_id_str'] = "toutiao_%s" % video_dict['releaser_id_str'] video_dict['duration'] = data.get("video_duration") try: if type(video_dict['play_count']) != int: video_dict['play_count'] = data.get("video_detail_info").get("video_watch_count") except: return None try: if type(video_dict['duration']) != int: video_dict['duration'] = 0 except: return None video_dict['fetch_time'] = fetch_time try: video_dict['release_time'] = int(int(data.get("h5_extra").get("publish_stamp")) * 1e3) except: video_dict.pop("release_time") for k in video_dict: if type(video_dict[k]) == str or type(video_dict[k]) == int: pass else: return None return video_dict def search_page(self, keyword, search_pages_max=30, output_to_es_raw=False, output_to_es_register=False, es_index=None, doc_type=None): urls = [] for page_num in range(0, search_pages_max): page_num = page_num * 20 url = ('https://www.toutiao.com/search_content/?offset=' + str(page_num) + '&format=json&keyword=' + keyword + '&autoload=true&count=20&cur_tab=2&from=video&aid=24') urls.append(url) toutiao_Lst = [] for search_page_url in urls: get_page = requests.get(search_page_url) if get_page.status_code != 200: # retry once get_page = requests.get(search_page_url) if get_page.status_code != 200: continue page_dict = get_page.json() for one_line in page_dict['data']: try: title = one_line['title'] duration = one_line['video_duration'] url = one_line['share_url'] play_count = one_line['read_count'] comment_count = one_line['comment_count'] favorite_count = one_line['digg_count'] videoid = one_line['id'] releaser = one_line['media_name'] releaserUrl = one_line['media_url'] release_time = one_line['publish_time'] release_time = int(int(release_time) * 1e3) D0 = copy.deepcopy(self.video_data) D0['title'] = title D0['duration'] = duration D0['url'] = url D0['play_count'] = play_count D0['comment_count'] = comment_count D0['favorite_count'] = favorite_count D0['video_id'] = videoid D0['releaser'] = releaser D0['releaserUrl'] = releaserUrl D0['release_time'] = release_time toutiao_Lst.append(D0) except KeyError: # It's totally ok to drop the last return data value. # The search api just return something seems related to search continue if len(toutiao_Lst) >= 100: output_result(result_Lst=toutiao_Lst, platform=self.platform, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index, doc_type=doc_type) toutiao_Lst.clear() if toutiao_Lst != []: output_result(result_Lst=toutiao_Lst, platform=self.platform, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index, doc_type=doc_type) return toutiao_Lst def find_releaser_id(self, releaserUrl): return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) def releaser_page_via_pc(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=30, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxy_dic=None): """ If output_to_file=True is passed in, an absolute path string should also be passed to filepath parameter. filepath string tells where to put the output file. The output file name is assigned automatically, is not supported to assign by user. """ headers = {'Host': 'www.365yg.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0'} result_Lst = [] whether_continue = True behot_time = 0 page_count = 0 releaser_id = self.find_releaser_id(releaserUrl) if releaser_id is None: pass # self.loggererror.error("%s can't get releaser_id" % releaserUrl) else: print('releaser_id', releaser_id) # self.loggerinfo.info("process on releaser %s" % releaser_id) while whether_continue is True and page_count <= releaser_page_num_max: releaser_page_url = ('https://www.365yg.com/c/user/article/?user_id=' + releaser_id + '&max_behot_time=' + str(behot_time) + '&max_repin_time=0&count=20&page_type=0') # http://m.365yg.com/video/app/user/home/?to_user_id=73299297129&format=json&max_behot_time=0 get_page = retry_get_url(releaser_page_url, headers=headers, proxies=proxy_dic) if get_page is None: # self.loggererror.error("%s can't get page at page num %s" % (releaserUrl, page_count)) print("can't get page at %s" % page_count) continue else: page_count += 1 try: page_dic = get_page.json() except: page_dic = None # self.loggererror.error('Failed to transfer text to dict on json data url: %s' % releaser_page_url) print('Failed to transfer text to dict on ' 'json data url: %s' % releaser_page_url) continue video_dic = page_dic['data'] whether_continue = page_dic['has_more'] behot_time = page_dic['next']['max_behot_time'] for line in video_dic: video_dict = copy.deepcopy(self.video_data) # behot_time is different in every video item in the same # releaser page, and will generally descent. The one used # to get next page is the behot_time value from the last # video in the present releaser page. try: video_dict['release_time'] = int(line['behot_time'] * 1e3) video_dict['title'] = line['title'] video_dict['url'] = line['display_url'] video_dict['releaser'] = line['source'] video_id = line['group_id'] video_dict['video_id'] = video_id video_dict['url'] = 'http://www.365yg.com/a' + str(video_id) + '/' video_dict['comment_count'] = line['comments_count'] duration_str = line['video_duration_str'] video_dict['duration'] = trans_duration(duration_str) video_dict['play_count'] = line['video_watch_count'] except KeyError as except_msg: # self.loggererror.error('Got KeyError exception: %s at page %s' # % (except_msg, releaserUrl)) print('Got KeyError exception: %s at page %s' % ( except_msg, releaserUrl)) try: print(duration_str) except: print("can't print duration_str") continue fetch_time = int(datetime.datetime.now().timestamp() * 1e3) video_dict['platform'] = self.platform video_dict['releaser_id_str'] = str(releaser_id) video_dict['fetch_time'] = fetch_time video_dict['releaserUrl'] = releaserUrl result_Lst.append(video_dict) if len(result_Lst) % 100 == 0: output_result(result_Lst, self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, push_to_redis=push_to_redis, es_index=es_index, doc_type=doc_type) result_Lst.clear() print(behot_time) # self.loggerinfo.info("write %s 100 video_data_dicts into es %s, %s" # % (releaser_id, es_index, doc_type)) if result_Lst != []: output_result(result_Lst, self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, push_to_redis=push_to_redis, es_index=es_index, doc_type=doc_type) # self.loggerinfo.info("write %s %s video_data_dicts into es %s, %s" # % (releaser_id, len(result_Lst), es_index, doc_type)) def check_play_count_by_video_page(self, url): """ To check whether the play_count from releaser_page(www.365yg.com) is the real play_count show on video_page """ if "toutiao.com" in url: video_id_str = ' '.join(re.findall('/group/[0-9]+', url)) video_id = ' '.join(re.findall('\d+', video_id_str)) url = 'http://www.365yg.com/a' + video_id get_page = get_redirected_resp(url) if get_page is None: return None else: page = get_page.text find_play_count = re.findall('videoPlayCount: \d+,', page) if find_play_count != []: play_count = re.findall('\d+', find_play_count[0])[0] return int(play_count) else: print("can't get play_count") def get_releaser_follower_num(self, releaserUrl): headers = {'Host': 'www.toutiao.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0'} releaser_id = self.find_releaser_id(releaserUrl) releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/' get_page = retry_get_url(releaserUrl, headers=headers, proxies=1) page = get_page.text try: follower_num = int(re.findall('\d+', ' '.join(re.findall("fensi:'\d+'", page)))[0]) print('%s follower number is %s' % (releaserUrl, follower_num)) releaser_img = self.get_releaser_image(data=page) return follower_num, releaser_img except: print("can't find followers") def get_releaserUrl_from_video_page(self, url, proxy_dic=None): """ To get releaserUrl from video page """ if "toutiao.com" in url: video_id_str = ' '.join(re.findall('/group/[0-9]+', url)) video_id = ' '.join(re.findall('\d+', video_id_str)) url = 'http://www.365yg.com/a' + video_id get_page = retry_get_url(url, proxies=proxy_dic) if get_page is None: return None else: page = get_page.text find_releaser_id = re.findall("mediaId: '\d+',", page) if find_releaser_id != []: releaser_id = re.findall('\d+', ' '.join(find_releaser_id))[0] releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/' return releaserUrl else: return None def list_page(self, task_lst, output_to_file=False, filepath=None, page_num_max=20, output_to_es_raw=False, output_to_es_register=False, es_index='crawler-data-raw', doc_type='doc', proxy_dic=None): """ To get video data from list page, it can not be revised to async-crawler due to the next page depends on the previous page's data """ cookie = ('tt_webid=6553778542248003086;' 'CNZZDATA1259612802=1625670355-1516338642-%7C1527150919;' '_ga=GA1.2.1539151044.1516342895;' '__utma=91863192.1539151044.1516342895.1521092491.1521092491.1;' '__tea_sdk__user_unique_id=6553778542248003086;' '__tea_sdk__ssid=545b4c91-3bd3-4748-831b-6edbf9415b70;' 'CNZZDATA1262382642=810628165-1527124428-%7C1529484233;' '__tasessionId=ptjvpftsc1539054420281;' '_gid=GA1.2.1520435477.1539054422') headers = {'Host': 'www.365yg.com', 'User-Agent': self.random_useragent(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate, br', 'Cookie': cookie, 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0'} result_lst = [] max_behot_time = 0 page_num = 1 while page_num <= page_num_max: listurl = ('https://www.365yg.com/api/pc/feed/?max_behot_time=' + str(max_behot_time) + '&category=video_new&utm_source=toutiao') get_page = retry_get_url(listurl, headers=headers, proxies=proxy_dic) page_num += 1 try: page_dic = get_page.json() except: page_dic = {} if page_dic == {}: max_behot_time = 0 print("can't get list page") continue else: max_behot_time = page_dic['next']['max_behot_time'] video_info_lst = page_dic['data'] for line in video_info_lst: video_dic = copy.deepcopy(self.video_data) title = line['title'] video_dic['title'] = line['title'] video_dic['data_from'] = 'list_page' video_dic['url'] = 'https://www.365yg.com/a' + line['group_id'] try: dura_str = line['video_duration_str'] video_dic['duration'] = trans_duration(dura_str) except: video_dic['duration'] = 0 print("%s can't get duration" % title) video_dic['releaser'] = line['source'] video_dic['releaserUrl'] = 'https://www.365yg.com' + line['media_url'] video_dic['release_time'] = int(int(line['behot_time']) * 1e3) try: video_dic['describe'] = line['abstract'] except: video_dic['describe'] = '' print("%s can't get describe" % title) try: video_dic['play_count'] = line['video_play_count'] except: video_dic['play_count'] = 0 print("%s can't get play_count" % title) video_dic['fetch_time'] = int(time.time() * 1e3) result_lst.append(video_dic) if len(result_lst) >= 100: output_result(result_Lst=result_lst, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index, doc_type=doc_type) result_lst.clear() print(max_behot_time) if result_lst != []: output_result(result_Lst=result_lst, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, es_index=es_index, doc_type=doc_type) return result_lst def get_data_mediaid(self, releaserUrl, releaser_id): headers = { "Host": "m.toutiao.com", "Connection": "keep-alive", "Cache-Control": "max-age=0", "Upgrade-Insecure-Requests": "1", "User-Agent": self.random_useragent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9" } releaserUrl = "http://m.toutiao.com/profile/%s/#mid=%s" % (releaser_id, releaser_id) time.sleep(1) res = requests.get(releaserUrl, headers=headers, timeout=5) # cookie = requests.utils.dict_from_cookiejar(res.cookies) # print(cookie) try: data_mediaid = re.findall(r'data-mediaid="(\d+)"', res.text) except: data_mediaid = "" if data_mediaid: # print(data_mediaid) return data_mediaid[0] else: return False def get_releaser_image(self, releaserUrl=None, data=None): if releaserUrl: headers = {'Host': 'www.toutiao.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0'} proxies = get_proxy(1) releaser_id = self.find_releaser_id(releaserUrl) releaserUrl = 'https://www.toutiao.com/c/user/' + str(releaser_id) + '/' get_page = retry_get_url(releaserUrl, headers=headers, proxies=proxies) page = get_page.text try: releaser_img = re.findall("avtar_img:'(.*)'", data)[0] return "http:" + releaser_img except: print("can't get releaser_img") else: releaser_img = re.findall("avtar_img:'(.*)'", data)[0] return "http:" + releaser_img @staticmethod def get_video_image(data): video_image_url = "" if data.get("large_image_list"): video_image_url = data["large_image_list"][0]["url_list"][0]["url"] # height = data["large_image_list"][0]["height"] # width = data["large_image_list"][0]["width"] elif data.get("ugc_video_cover"): video_image_url = data["ugc_video_cover"]["url_list"][0]["url"] # height = data["ugc_video_cover"]["height"] # width = data["ugc_video_cover"]["width"] elif data.get("video_detail_info"): video_image_url = data["video_detail_info"]["detail_video_large_image"]["url_list"][0]["url"] # height = data["video_detail_info"]["detail_video_large_image"]["height"] # width = data["video_detail_info"]["detail_video_large_image"]["width"] return video_image_url def get_web_article_info(self,article_id,proxies_num=0): # headers = { # "Accept": "*/*", # "Accept-Encoding": "gzip, deflate", # "Accept-Language": "zh,zh-CN;q=0.9", # "Connection": "keep-alive", # # "Cookie": "tt_webid=6851461299689686542; SLARDAR_WEB_ID=568d391e-7f96-491b-9557-b045a55e9dd8", # "Host": "m.toutiao.com", # # "Referer": "https://m.toutiao.com/i6851146167279944199/", # "Sec-Fetch-Dest": "empty", # "Sec-Fetch-Mode": "cors", # "Sec-Fetch-Site": "same-origin", # "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1", # } # headers["Referer"] = "https://m.toutiao.com/i%s" % article_id headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Cookie": "tt_webid=6851788569271944719", "Host": "m.toutiao.com", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Sec-Fetch-User": "?1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36", } url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id,article_id) requests_res = retry_get_url(url,headers=headers,proxies=proxies_num) res_json = requests_res.json() res_dic = { "title":res_json["data"].get("title").replace("\r","").replace("\n",""), 'high_quality_flag':int(res_json["data"].get('high_quality_flag')), "play_count": int(res_json["data"].get('impression_count')), "comment_count": res_json["data"].get("comment_count"), "repost_count": res_json["data"].get("repost_count"), "favorite_count": res_json["data"].get("digg_count"), 'releaser_followers_count': res_json["data"].get("follower_count"), 'release_time': int(res_json["data"].get('publish_time')*1e3), "content":res_json["data"].get("content").replace("\r","").replace("\n",""), } return res_dic def web_releaser_page_article(self, releaserUrl, releaser_page_num_max=50000, proxies_num=None,**kwargs): result_list = [] has_more = True count = 1 releaser_id = self.find_releaser_id(releaserUrl) count_false = 0 offset = "0" headers = {"accept": "text/javascript, text/html, application/xml, text/xml, */*", "accept-encoding": "gzip, deflate", "accept-language": "zh,zh-CN;q=0.9", "content-type": "application/x-www-form-urlencoded", # "cookie": "gftoken=NDAxNzc3NjcyM3wxNTk1MjI3MTU0ODh8fDAGBgYGBgY; SLARDAR_WEB_ID=0ddc45df-54ce-42c5-8dfd-27403ea3319e; s_v_web_id=verify_kcu52781_yF9Mw8Pu_VGOQ_4R2p_8AeG_NwGKWAkt7YLl; ttcid=df5933a4926945c68dde9bf5e5542f9730; tt_scid=KlhjcsMcR9m7a1GIqnzjDfr.XZ0-jnU4X-ZPLZFZ51vyyv6FmjCdmDwYVWtjq2JO18fd", # "referer": "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id, "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1", "x-requested-with": "XMLHttpRequest"} # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999) # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999) # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999) # idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999) # iid = str(random.randint(104525900000, 104526000000)) while has_more and count <= releaser_page_num_max: # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format( # random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic)) # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset)) # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format( # random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset)) url = "https://profile.zjurl.cn/api/feed/profile/v2/?category=profile_article&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format( str(releaser_id), str(offset), str(releaser_id)) try: proxies = get_proxy(proxies_num) if proxies: get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10) else: get_page = requests.get(url, headers=self.headers, timeout=10) except: continue print("get_page %s on page %s" % (releaser_id, count)) page_dic = {} try: page_dic = get_page.json() if page_dic.get("message") != "success": count_false += 1 if count_false < 3: continue else: print("unknow error") break data_list = page_dic.get('data') has_more = page_dic.get('has_more') offset = str(page_dic.get("offset")) except: if not page_dic: count_false += 1 if count_false >= 3: break else: continue if data_list: data_list = page_dic.get('data') has_more = page_dic.get('has_more') else: data_list = [] has_more = False # offset = page_dic.get('offset') if has_more is None: has_more = False if not data_list: print("toutiao no data in releaser %s page %s" % (releaser_id, count)) # print(page_dic) # print(url) count_false += 1 proxies = get_proxy(1) if count_false >= 5: has_more = False break continue else: count_false = 0 count += 1 for one_video in data_list: # print(one_video) # info_str = one_video.get('content') info_dic = json.loads(one_video["content"]) video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['abstract'] = info_dic.get('abstract') video_dic['url'] = info_dic.get('share_url') video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl release_time = info_dic.get('publish_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = info_dic.get('video_duration') video_dic['play_count'] = info_dic.get('read_count') video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') video_dic['comment_count'] = info_dic.get('comment_count') video_dic['favorite_count'] = info_dic.get('digg_count') video_dic['article_id'] = info_dic.get('tag_id') video_dic['fetch_time'] = int(time.time() * 1e3) video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id video_dic['video_img'] = self.get_video_image(info_dic) video_dic['id'] = cal_doc_id(video_dic["platform"], url=video_dic["url"], doc_id_type='all-time-url', data_dict=video_dic) try: article_info = self.get_web_article_info(info_dic.get('tag_id'),proxies_num=proxies_num) video_dic.update(article_info) except Exception as e: print("method get_web_article_info error %s" %e) yield video_dic def App_releaser_page_video(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=50000, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxies_num=None): result_list = [] has_more = True count = 1 releaser_id = self.find_releaser_id(releaserUrl) count_false = 0 offset = "0" self.headers[ "referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999) # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999) # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999) # idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999) # iid = str(random.randint(104525900000, 104526000000)) while has_more and count <= releaser_page_num_max: # proxies = get_proxy(proxies_num) # print(str(releaser_id)+str(max_behot_time)) # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time))) # url_dic = { # "visited_uid": releaser_id, # "client_extra_params": r'{"playparam":"codec_type:0"}', # "count": "20", # "offset": offset, # "stream_api_version": "88", # "category": "profile_video", # "version_code": "7.6.0", # "app_name": "news_article", # "channel": "App%20Store", # "resolution": "1536*2048", # "aid": "35", # "ab_feature": "794528", # "ab_version": "765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395", # "ab_group": "794528", # "pos": r"5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==", # "update_version_code": "76014", # "ac": "WIFI", # "os_version": "13.3.1", # "ssmix": "a", # "device_platform": "ipad", # "iid": iid, # "ab_client": "a1,f2,f7,e1", # "device_type": "iPad6,11", # # "idfa": idfa, # # "iid": "95105525198", # # # "device_id": "70149736870", # # "device_id": random.randint(60000000000,80000000000), # } # url_dic = { # # "category": "profile_video", # "visited_uid": releaser_id, # "stream_api_version": "47", # "count": "20", # "offset": offset, # "client_extra_params": r'{}', # "iid": iid, # "ac": "wifi", # # "mac_address": "08:00:27:1F:7E:A0", # "channel": "wap_test_lite_1", # "aid": "35", # "app_name": "news_article_lite", # "version_code": "715", # "version_name": "7.1.5", # "device_platform": "android", # "ab_version": "668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,9289425", # "ab_client": "a1,c2,e1,f2,g2,f7", # "ab_feature": "z1", # "abflag": "3", # "ssmix": "a", # "device_type": "OPPO R11", # "device_brand": "OPPO", # "language": "zh", # "os_api": "22", # "os_version": "5.1.1", # "manifest_version_code": "715", # "resolution": "1920*1080", # "dpi": "401", # "update_version_code": "71504", # "sa_enable": "0", # "tma_jssdk_version": "1.25.4.2", # # "fp": "a_fake_fp", # "rom_version": "coloros__r11-user", # "plugin_state": "30631999", # "as": "ab9db50e485e50977f9db5", # "mas": "ab9db50e485e50977f9db5", # # } # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&{2}".format( # random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic)) # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_video&version_code=7.6.0&app_name=news_article&channel=App%20Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format(random.choice(self.api_list), random.randint(1, 1),str(releaser_id),str(offset)) # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_video&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format( # random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset)) url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_video&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format( str(releaser_id), str(offset), str(releaser_id)) try: proxies = get_proxy(proxies_num) if proxies: # proxies = { # "http": "http://127.0.0.1:80", # "https": "http://127.0.0.1:443" # } get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=10) else: get_page = requests.get(url, headers=self.headers, timeout=10) except: continue print("get_page %s on page %s" % (releaser_id, count)) page_dic = {} try: page_dic = get_page.json() if page_dic.get("message") != "success": count_false += 1 if count_false < 3: continue else: print("unknow error") break data_list = page_dic.get('data') has_more = page_dic.get('has_more') offset = str(page_dic.get("offset")) except: if not page_dic: count_false += 1 if count_false >= 3: break else: continue if data_list: data_list = page_dic.get('data') has_more = page_dic.get('has_more') else: data_list = [] has_more = False # offset = page_dic.get('offset') if has_more is None: has_more = False if not data_list: print("toutiao no data in releaser %s page %s" % (releaser_id, count)) # print(page_dic) # print(url) count_false += 1 proxies = get_proxy(1) if count_false >= 5: has_more = False break continue else: count_false = 0 count += 1 for one_video in data_list: # print(one_video) # info_str = one_video.get('content') info_dic = json.loads(one_video["content"]) video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['url'] = info_dic.get('share_url') video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl release_time = info_dic.get('publish_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = info_dic.get('video_duration') video_dic['play_count'] = info_dic.get('video_detail_info').get("video_watch_count") video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') video_dic['comment_count'] = info_dic.get('comment_count') video_dic['favorite_count'] = info_dic.get('digg_count') video_dic['video_id'] = info_dic.get('item_id') video_dic['fetch_time'] = int(time.time() * 1e3) video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id video_dic['video_img'] = self.get_video_image(info_dic) yield video_dic def App_releaser_page_all(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=500, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxy_dic=None, crawler_type="profile_all", proxies_num=None): result_list = [] has_more = True count = 1 releaser_id = self.find_releaser_id(releaserUrl) count_false = 0 count_no_data = 0 offset = "0" self.headers[ "referer"] = "https://profile.zjurl.cn/rogue/ugc/profile/?user_id=%s&request_source=1" % releaser_id # vid = "AB5483CA-FCDC-42F1-AFB1-077A1%sDA" % random.randint(100000, 999999) # ccid = "F153594D-1310-4984-A4C3-A679D4D%s" % random.randint(10000, 99999) # openudid = "5d44f2ea1b74e3731b27e5ed8039ac29f%s" % random.randint(1000000, 9999999) # idfa = "E3FC9054-384B-485F-9B4C-936F33D7D%s" % random.randint(100, 999) # iid = str(random.randint(104525900000, 104526000000)) while has_more and count <= releaser_page_num_max: count_no_data += 1 print("get %s article on page %s" % (releaser_id, count)) # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time))) # url_dic = { # # "category": crawler_type, # "visited_uid": releaser_id, # "client_extra_params": '{"playparam": "codec_type:0"}', # "count": "20", # "offset": offset, # "stream_api_version": "88", # "category": crawler_type, # "version_code": "7.6.0", # "app_name": "news_article", # "channel": "App%20Store", # "resolution": "1536*2048", # "aid": "13", # "ab_feature": "794528", # "ab_version": "765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395", # "ab_group": "794528", # "pos": "5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==", # "update_version_code": "76014", # "ac": "WIFI", # "os_version": "13.3.1", # "ssmix": "a", # "device_platform": "ipad", # "iid": iid, # "ab_client": "a1,f2,f7,e1", # "device_type": "iPad6,11", # } # url_dic = { # # "category": "profile_video", # "visited_uid": releaser_id, # "stream_api_version": "47", # "count": "20", # "offset": offset, # "client_extra_params": '{}', # "iid": iid, # "ac": "wifi", # # "mac_address": "08:00:27:1F:7E:A0", # "channel": "wap_test_lite_1", # "aid": "35", # "app_name": "news_article_lite", # "version_code": "715", # "version_name": "7.1.5", # "device_platform": "android", # "ab_version": "668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,9289425", # "ab_client": "a1,c2,e1,f2,g2,f7", # "ab_feature": "z1", # "abflag": "3", # "ssmix": "a", # "device_type": "OPPO R11", # "device_brand": "OPPO", # "language": "zh", # "os_api": "22", # "os_version": "5.1.1", # "manifest_version_code": "715", # "resolution": "1920*1080", # "dpi": "401", # "update_version_code": "71504", # # "sa_enable": "0", # # "fp": "a_fake_fp", # # "tma_jssdk_version": "1.25.4.2", # # "rom_version": "coloros__r11-user 5.1.1 nmf26x 500200210 release-keys", # # "plugin_state": "30631999", # # "as": "ab9db50e485e50977f9db5", # # "mas": "ab9db50e485e50977f9db5", # # } # url = """https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&visited_uid={2}&client_extra_params=&count=20&offset={3}&stream_api_version=88&category=profile_all&version_code=7.6.0&app_name=news_article&channel=App Store&resolution=1536*2048&aid=13&ab_feature=794528&ab_version=765192,857803,660830,1444046,1397712,1434498,662176,801968,1419045,668775,1462526,1190525,1489306,1493796,1439625,1469498,668779,1417599,662099,1477261,1484884,668774,1496422,1427395&ab_group=794528&pos=5pe9vb/x8v788cLx/On47unC7fLuv72nveaXvb29vb/ 8vLv fTz/On4y/zx6Pjuv72nveaXvb29vb29v/Hy8/r06ej5 L 9p72tsZe9vb29vb2/8fzp9Ono fi/vae9rZe9vb294Je9veCX4A==&update_version_code=76014&ac=WIFI&os_version=13.3.1&ssmix=a&device_platform=ipad&iid={4}&ab_client=a1,f2,f7,e1&device_type=iPad6,11""".format( # random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset), iid) # url = "https://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&visited_uid={2}&stream_api_version=47&count=20&offset={3}&ac=wifi&channel=wap_test_lite_1&aid=35&app_name=news_article_lite&version_code=715&version_name=7.1.5&device_platform=android&ab_version=668903,668905,668907,808414,772541,1378617,668908,668904,668906,1401332,1496418,928942&ab_client=a1,c2,e1,f2,g2,f7&ab_feature=z1&abflag=3&ssmix=a&device_type=OPPO R11&device_brand=OPPO&language=zh&os_api=22&os_version=5.1.1&manifest_version_code=715&resolution=900*1600&dpi=320&update_version_code=71504&sa_enable=0&fp=a_fake_fp&tma_jssdk_version=1.25.4.2&rom_version=coloros__r11-user 5.1.1 nmf26x 500200210 release-keys&plugin_state=30631999".format( # random.choice(self.api_list), random.randint(5, 10), str(releaser_id), str(offset)) url = "https://profile.zjurl.cn/api/feed/profile/v1/?category=profile_all&visited_uid={0}&stream_api_version=82&request_source=1&offset={1}&user_id={2}".format( str(releaser_id), str(offset), str(releaser_id)) # url = "http://{0}.snssdk.com/api/feed/profile/v{1}/?category=profile_all&{2}".format( # random.choice(self.api_list), random.randint(5, 10), urllib.parse.urlencode(url_dic)) try: proxies = get_proxy(proxies_num) if proxies: get_page = requests.get(url, headers=self.headers, proxies=proxies, timeout=8) else: get_page = requests.get(url, headers=self.headers, timeout=8) except: continue # print(get_page.text) # time.sleep(0.5) # format_json = re.match(r"jsonp\d+", get_page.text) page_dic = {} try: page_dic = get_page.json() if page_dic.get("message") != "success": count_false += 1 if count_false < 3: continue else: print("unknow error") break if count_no_data > 10: break data_list = page_dic.get('data') has_more = page_dic.get('has_more') offset = str(page_dic.get("offset")) except: if not page_dic: count_false += 1 if count_false >= 3: break else: continue if data_list: data_list = page_dic.get('data') has_more = page_dic.get('has_more') else: data_list = [] has_more = False # offset = page_dic.get('offset') if has_more is None: has_more = False if not data_list: print("no data in releaser %s page %s" % (releaser_id, count)) # print(page_dic) has_more = False continue else: count += 1 count_false = 0 try: for one_video in data_list: # info_str = one_video.get('content') info_dic = json.loads(one_video["content"]) video_url = None if info_dic.get("has_video"): video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['url'] = info_dic.get('share_url') if video_dic['url'] == "": video_dic['url'] = info_dic.get('url') video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl release_time = info_dic.get('publish_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = info_dic.get('video_duration') video_dic['play_count'] = info_dic.get("video_detail_info").get("video_watch_count") if not video_dic['play_count']: video_dic['play_count'] = info_dic.get("read_count") video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') video_dic['comment_count'] = info_dic.get('comment_count') video_dic['favorite_count'] = info_dic.get('digg_count') video_dic['video_id_str'] = info_dic.get('item_id') video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id video_dic['fetch_time'] = int(time.time() * 1e3) video_dic['video_img'] = self.get_video_image(info_dic) for v in video_dic: if video_dic[v] is not None: pass else: video_dic = self.video_page(video_dic['url']) if video_dic: pass else: continue count_no_data = 0 yield video_dic elif info_dic.get("abstract") == "": video_dic = copy.deepcopy(self.video_data) # print(info_dic) if info_dic.get('article_url'): video_url = info_dic.get('article_url') elif info_dic.get('display_url'): video_url = info_dic.get('display_url') elif info_dic.get('url'): video_url = info_dic.get('url') elif info_dic.get('share_url'): video_url = info_dic.get('share_url') elif info_dic.get("raw_data"): if info_dic.get("raw_data").get("origin_group"): video_url = info_dic.get("raw_data").get("origin_group").get('article_url') elif info_dic.get("raw_data").get("comment_base"): video_url = info_dic.get("raw_data").get("comment_base").get('share').get( 'share_url') elif info_dic.get("raw_data").get("action"): video_url = "https://m.toutiaoimg.cn/group/%s/" % info_dic.get("raw_data").get( 'group_id') video_dic['video_id'] = info_dic.get("raw_data").get('group_id') video_dic['play_count'] = info_dic.get("raw_data").get("action").get("play_count") video_dic['repost_count'] = info_dic.get("raw_data").get("action").get( "share_count") video_dic['comment_count'] = info_dic.get("raw_data").get("action").get( 'comment_count') video_dic['favorite_count'] = info_dic.get("raw_data").get("action").get( 'digg_count') video_dic['duration'] = info_dic.get('raw_data').get('video').get("duration") video_dic['title'] = info_dic.get('raw_data').get("title") video_dic['releaser'] = info_dic.get('raw_data').get("user").get("info").get("name") video_dic['releaserUrl'] = releaserUrl video_dic['url'] = video_url video_dic['releaser_id_str'] = "toutiao_%s" % releaser_id video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3) video_dic['video_img'] = "http://p1-tt.bytecdn.cn/large/" + info_dic.get( 'raw_data').get('video').get("origin_cover").get("uri") video_dic['release_time'] = int(info_dic.get("raw_data").get("create_time") * 1e3) video_url = None if video_url: video_page_dic = self.video_page(video_url) if video_page_dic: video_dic.update(video_page_dic) count_no_data = 0 yield video_dic except: continue def retry_url(self, url, headers, cookies): retry_count = 0 while retry_count < 10: # time.sleep(0.2) get_page = requests.get(url, headers=headers, allow_redirects=False, cookies=cookies) page_dic = get_page.json() data_list = page_dic.get('data') retry_count += 1 if data_list: return page_dic else: return False def releaser_page_test(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=30, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxy_dic=None): page_dic = {} result_list = [] has_more = True count = 1 releaser_id = self.find_releaser_id(releaserUrl) # print('releaser_id', releaser_id) max_behot_time = 0 data_count = 0 # print(as_cp_sign) headers = { "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "content-type": "application/x-www-form-urlencoded", "x-requested-with": "XMLHttpRequest", "Referer": "https://www.toutiao.com/c/user/%s/" % releaser_id, "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", # "cookie":'cookie: tt_webid=6673330506500982276; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=169c3156bb86b3-00d2e2a0ad50b2-7a1437-161398-169c3156bb9746; tt_webid=6673330506500982276; csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; CNZZDATA1259612802=281397494-1553752275-https%253A%252F%252Fwww.baidu.com%252F%7C1555306390', "cache-control": "max-age=0", "upgrade-insecure-requests": "1" } user_page_url = "https://www.toutiao.com/c/user/%s/" % releaser_id user_page = requests.get(user_page_url, headers=headers) cookies = user_page.cookies while has_more and count <= releaser_page_num_max: # print(str(releaser_id)+str(max_behot_time)) # js_head = json.loads(get_js(str(releaser_id)+str(max_behot_time))) get_as_cp_sign = requests.get( "http://127.0.0.1:3000/?id=%s&max_behot_time=%s" % (releaser_id, max_behot_time)) as_cp_sign = get_as_cp_sign.json() url_dic = {"page_type": "0", "user_id": releaser_id, "max_behot_time": max_behot_time, "count": "20", "as": as_cp_sign.get("as"), "cp": as_cp_sign.get("cp"), "_signature": as_cp_sign.get('_signature') } url = "https://www.toutiao.com/c/user/article/?%s" % urllib.parse.urlencode(url_dic) get_page = requests.get(url, headers=headers, allow_redirects=False, cookies=cookies) # time.sleep(0.5) # format_json = re.match(r"jsonp\d+", get_page.text) page_dic = {} try: page_dic = get_page.json() data_list = page_dic.get('data') has_more = page_dic.get('has_more') max_behot_time = str(page_dic.get("next")["max_behot_time"]) except: data_list = self.retry_url(url, headers, cookies) if data_lis: data_list = page_dic.get('data') has_more = page_dic.get('has_more') else: data_list = [] has_more = False # offset = page_dic.get('offset') if has_more is None: has_more = False if data_list == []: print("no data in releaser %s page %s" % (releaser_id, count)) # print(page_dic) print(url) count += 1 continue else: count += 1 for one_video in data_list: max_behot_time = str(page_dic.get("next")["max_behot_time"]) # info_str = one_video.get('content') info_dic = one_video video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['url'] = info_dic.get('source_url') video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl # release_time = info_dic.get('publish_time') # video_dic['release_time'] = int(release_time * 1e3) # video_dic['duration'] = self.t2s(info_dic.get('video_duration_str')) # video_dic['play_count'] = info_dic.get('detail_play_effective_count') # # video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') # video_dic['comment_count'] = info_dic.get('comment_count') # video_dic['favorite_count'] = info_dic.get('digg_count') # video_dic['video_id'] = info_dic.get('item_id') # video_dic['fetch_time'] = int(time.time() * 1e3) result_list.append(video_dic) if len(result_list) >= 100: data_count += len(result_list) print(result_list) # output_result(result_Lst=result_list, # platform=self.platform, # output_to_file=output_to_file, # filepath=filepath, # output_to_es_raw=output_to_es_raw, # es_index=es_index, # doc_type=doc_type, # output_to_es_register=output_to_es_register) result_list.clear() if result_list != []: data_count += len(result_list) # print(result_list) print(data_count) # output_result(result_Lst=result_list, # platform=self.platform, # output_to_file=output_to_file, # filepath=filepath, # output_to_es_raw=output_to_es_raw, # es_index=es_index, # doc_type=doc_type, # output_to_es_register=output_to_es_register) # @logged def releaser_page_old(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=30, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxy_dic=None): result_list = [] has_more = True offset = 0 count = 1 releaser_id = self.find_releaser_id(releaserUrl) headers = {'Host': 'is.snssdk.com', 'Connection': 'keep-alive', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0', 'Accept-Encoding': 'gzip, deflate, br', 'Upgrade-Insecure-Requests': '1', 'Cache-Control': 'max-age=0', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', # 'Cookie': 'odin_tt=36c67e1135e11981703d797f6246957dcc05964f811721d0c77bcf091fc7484db5698ef9ca767558ea9f1b643ad9e3ac; UM_distinctid=167c9e978c419c-0107b7bccfdb688-143a7540-1fa400-167c9e978c54a2; CNZZDATA1274386066=172017830-1545278343-%7C1545278343' } while has_more is True and count <= releaser_page_num_max and offset is not None: url_dic = {"category": "profile_video", "visited_uid": releaser_id, "offset": offset} url = urllib.parse.urlencode(url_dic) url = "https://i.snssdk.com/api/feed/profile/v1/?%s" % urllib.parse.urlencode(url_dic) print(url) get_page = requests.get(url, headers=headers) print("process on releaser %s page %s" % (releaser_id, offset)) page_dic = get_page.json() data_list = page_dic['data'] has_more = page_dic.get('has_more') offset = page_dic.get('offset') if has_more is None: has_more = False if data_list == []: print("no data in releaser %s page %s" % (releaser_id, count)) count += 1 continue else: count += 1 for one_video in data_list: info_str = one_video.get('content') info_dic = json.loads(info_str) video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['url'] = info_dic.get('display_url') video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl release_time = info_dic.get('publish_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = info_dic.get('video_duration') video_dic['play_count'] = info_dic.get('video_detail_info').get('video_watch_count') video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') video_dic['comment_count'] = info_dic.get('comment_count') video_dic['favorite_count'] = info_dic.get('digg_count') video_dic['video_id'] = info_dic.get('item_id') video_dic['fetch_time'] = int(time.time() * 1e3) result_list.append(video_dic) if len(result_list) >= 100: output_result(result_Lst=result_list, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, es_index=es_index, doc_type=doc_type, output_to_es_register=output_to_es_register) result_list.clear() if result_list != []: output_result(result_Lst=result_list, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, es_index=es_index, doc_type=doc_type, output_to_es_register=output_to_es_register) def web_releaser_page(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=30, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxy_dic=None): page_dic = {} result_list = [] has_more = 1 count = 1 releaser_id = self.find_releaser_id(releaserUrl) print('releaser_id', releaser_id) max_behot_time = "" count_callback = 3 data_count = 0 media_id = self.get_data_mediaid(releaserUrl, releaser_id) if not media_id: media_id = releaser_id headers = { "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", "upgrade-insecure-requests": "1", "Referer": "http://m.toutiao.com/profile/%s/" % releaser_id, "User-Agent": self.random_useragent(), } while has_more == 1 and count <= releaser_page_num_max: eas, ecp = getHoney() url_dic = {"page_type": "0", "uid": releaser_id, "max_behot_time": max_behot_time, "media_id": media_id, "output": "json", "is_json": "1", "count": "20", "from": "user_profile_app", "version": "2", "as": eas, "cp": ecp, "callback": "jsonp%s" % count_callback, } url = "https://www.toutiao.com/pgc/ma/?%s" % urllib.parse.urlencode(url_dic) get_page = requests.get(url, headers=headers, allow_redirects=False) # time.sleep(0.5) # format_json = re.match(r"jsonp\d+", get_page.text) page_dic = {} if get_page.text[0:4] == "json": get_page_text = (get_page.text[len(url_dic["callback"]) + 1:-1]) # print(res)0 try: page_dic = json.loads(r"%s" % get_page_text) data_list = page_dic.get('data') has_more = page_dic.get('has_more') max_behot_time = str(page_dic.get("next").get("max_behot_time")) except: print("json失败 ", get_page.text) data_list = [] has_more = False # offset = page_dic.get('offset') if has_more is None: has_more = False if data_list == []: print("no data in releaser %s page %s" % (releaser_id, count)) print(page_dic) print(url) count += 1 continue else: count += 1 count_callback += 1 for one_video in data_list: # info_str = one_video.get('content') info_dic = one_video video_dic = copy.deepcopy(self.video_data) video_dic['title'] = info_dic.get('title') video_dic['url'] = info_dic.get('source_url') print(info_dic.get('source_url')) video_dic['releaser'] = info_dic.get('source') video_dic['releaserUrl'] = releaserUrl release_time = info_dic.get('publish_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = self.t2s(info_dic.get('video_duration_str')) video_dic['play_count'] = info_dic.get('detail_play_effective_count') # video_dic['repost_count'] = info_dic.get('forward_info').get('forward_count') video_dic['comment_count'] = info_dic.get('comment_count') video_dic['favorite_count'] = info_dic.get('digg_count') video_dic['video_id'] = info_dic.get('item_id') video_dic['fetch_time'] = int(time.time() * 1e3) result_list.append(video_dic) if len(result_list) >= 100: # data_count += len(result_list) # print(result_list) # data_count += len(result_list) output_result(result_Lst=result_list, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, es_index=es_index, doc_type=doc_type, output_to_es_register=output_to_es_register) result_list.clear() if result_list != []: # data_count += len(result_list) # print(result_list) # print(data_count) print(result_list) output_result(result_Lst=result_list, platform=self.platform, output_to_file=output_to_file, filepath=filepath, output_to_es_raw=output_to_es_raw, es_index=es_index, doc_type=doc_type, output_to_es_register=output_to_es_register) # @logged def releaser_page(self, releaserUrl, output_to_file=False, filepath=None, releaser_page_num_max=10000, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, es_index=None, doc_type=None, proxies_num=None): for res in self.web_releaser_page_article(releaserUrl, output_to_file=output_to_file, filepath=filepath, releaser_page_num_max=releaser_page_num_max, output_to_es_raw=output_to_es_raw, output_to_es_register=output_to_es_register, push_to_redis=push_to_redis, es_index=es_index, doc_type=doc_type, proxies_num=proxies_num): # print(res) yield res # for res in self.App_releaser_page_all(releaserUrl, output_to_file=output_to_file, filepath=filepath, releaser_page_num_max=releaser_page_num_max, output_to_es_raw=output_to_es_raw, # output_to_es_register=output_to_es_register, push_to_redis=push_to_redis, es_index=es_index, doc_type=doc_type, proxies_num=proxies_num): # yield res @staticmethod def t2s(t): if t: if len(t) == 5: t = str(t) m, s = t.strip().split(":") return float(m) * 60 + float(s) elif len(t) == 8: t = str(t) h, m, s = t.strip().split(":") return float(h) * 3600 + float(m) * 60 + float(s) else: return 0 @staticmethod def random_useragent(): agent_lis = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60", "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36" ] return agent_lis[random.randrange(0, len(agent_lis))] def releaser_page_by_time(self, start_time, end_time, url, **kwargs): data_lis = [] count_false = 0 output_to_file = kwargs.get("output_to_file") filepath = kwargs.get("filepath") push_to_redis = kwargs.get("push_to_redis") output_to_es_register = kwargs.get("output_to_es_register") output_to_es_raw = kwargs.get("output_to_es_raw") es_index = kwargs.get("es_index") for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")): video_time = res["release_time"] # print(res) if video_time: if start_time < video_time: if video_time < end_time: # print(res) # try: # res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S') # res["release_time"] = datetime.datetime.fromtimestamp(res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S') # # except: # pass data_lis.append(res) if len(data_lis) >= 10: output_result(result_Lst=data_lis, platform=self.platform, output_to_file=output_to_file, filepath=filepath, push_to_redis=push_to_redis, output_to_es_register=output_to_es_register, output_to_es_raw=output_to_es_raw, es_index=es_index, ) data_lis.clear() else: count_false += 1 if count_false > 10: break else: continue count_false = 0 # for res in self.App_releaser_page_all(url, proxies_num=kwargs.get("proxies_num")): # video_time = res["release_time"] # print(video_time) # if video_time: # if start_time < video_time: # if video_time < end_time: # data_lis.append(res) # # if len(data_lis) >= 100: # output_result(result_Lst=data_lis, # platform=self.platform, # output_to_file=output_to_file, # filepath=filepath, # push_to_redis=push_to_redis, # output_to_es_register=output_to_es_register, # output_to_es_raw=output_to_es_raw, # es_index=es_index, # doc_type=doc_type) # data_lis.clear() # # else: # count_false += 1 # if count_false > 5: # break # else: # continue # # if data_lis != []: # output_result(result_Lst=data_lis, # platform=self.platform, # output_to_file=output_to_file, # filepath=filepath, # push_to_redis=push_to_redis, # output_to_es_register=output_to_es_register, # output_to_es_raw=output_to_es_raw, # es_index=es_index, # doc_type=doc_type) # import pandas as pd # data = pd.DataFrame(data_lis) # s = datetime.datetime.now() # ss = str(s)[0:19].replace(' ', '-').replace(':', '-') # res = data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030', # # columns=columns # ) if __name__ == '__main__': data_lis = [ # "https://www.toutiao.com/c/user/5839829632/#mid=5839829632", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=103497952048&media_id=1609675594821640&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=51218680623&media_id=51210905535&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=5547176384&media_id=5547176384&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=64781639962&media_id=1574600923716622&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=5784214021&media_id=5806157501&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=79111609720&media_id=1586021722311694&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", "https://profile.zjurl.cn/rogue/ugc/profile/?version_code=7.7.9&version_name=70709&user_id=5576607099&media_id=5575391553&request_source=1&active_tab=dongtai&device_id=65&app_name=news_article", ] # data_lis = ["https://www.toutiao.com/c/user/6911429466/#mid=6911254049"] # data_lis = ["https://www.toutiao.com/c/user/6113709817/#mid=6113817558","https://www.toutiao.com/c/user/3688888283/#mid=3689528443","https://www.toutiao.com/c/user/4188615746/#mid=4273783271"] # data_lis = ["https://www.toutiao.com/c/user/6027579671/#mid=6217730861","http://www.toutiao.com/c/user/61621115551/#mid=1569627833237506"] # data_lis = [ # # "https://www.toutiao.com/c/user/3383347912/#mid=3405329282", # "https://www.toutiao.com/c/user/64883978705/"] miaopai_list = [] test = Crawler_toutiao() # res = test.video_page("https://www.ixigua.com/i6701478014242259463/") # print(res) for url in data_lis: test.releaser_page_by_time(1595088000000, 1595319362610, url, output_to_es_raw=True, es_index='crawler-data-raw', releaser_page_num_max=2, proxies_num=1 ) # test.get_releaser_follower_num(url) # test.get_releaser_image(releaserUrl=data_lis[0])