# -*- coding:utf-8 -*- # @Time : 2020/3/2 16:37 # @Author : litao # -*- coding:utf-8 -*- # @Time : 2020/3/2 11:07 # @Author : litao # -*- coding:utf-8 -*- # @Time : 2020/2/28 12:09 # @Author : litao import requests import json, re, datetime, urllib from crawler.crawler_sys.utils.output_results import retry_get_url from crawler.crawler_sys.utils.output_results import hot_words_output_result, output_result from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp from write_data_into_es.func_cal_doc_id import * class Crawler_douyin(object): def __init__(self): self.platform = "抖音" timestamp = int(datetime.datetime.now().timestamp() * 1e3) self.headers = { "Host": "api3-normal-c-lf.amemv.com", "Connection": "keep-alive", # "Cookie": "d_ticket=38c841789e38ea43c6338910dac65ffe192e3; odin_tt=82086544bb9028f027b5aea78724ccf512dead26658f45321be33bade615793782bf6ac7fe0c18b73b9592f4284413d5300974810d439b42ef0b3eaa761b1640; msh=cakLg8lvbK5CxiSWkIbD2UInwAI; sid_guard=09fe3dfd89dfbc79f081fb2db9dd81ee%7C1581832192%7C5184000%7CThu%2C+16-Apr-2020+05%3A49%3A52+GMT; uid_tt=da0b53b7563eca87c47da41f5f17c30f; uid_tt_ss=da0b53b7563eca87c47da41f5f17c30f; sid_tt=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid_ss=09fe3dfd89dfbc79f081fb2db9dd81ee; install_id=104847319549; ttreq=1$51e484720311469c4b70f4754d730d538a074c4b", # "X-SS-REQ-TICKET": "1583137750770", # "X-Tt-Token": "0009fe3dfd89dfbc79f081fb2db9dd81ee013243f7134b3eb37249cc729a5276172df69a4391b56ae4bf253c3c6352322611", "sdk-version": "1", "X-SS-DP": "1128", # "x-tt-trace-id": "00-9a5d00730a107b4310780861c7c50468-9a5d00730a107b43-01", "User-Agent": "com.ss.android.ugc.aweme/990 (Linux; U; Android 5.1.1; zh_CN; OPPO R11; Build/NMF26X; Cronet/77.0.3844.0)", "Accept-Encoding": "gzip, deflate", # "X-Gorgon": "0401c0cd4001df62dd7cff2a7d35092b14b3f2264163368f7f19", # "X-Khronos": "1583137750", } def get_hot_words(self): bulk_list = [] url = "https://api3-normal-c-lf.amemv.com/aweme/v1/hot/search/list/?detail_list=1&mac_address=48%3AA4%3A72%3A58%3A86%3AD5&os_api=22&device_type=OPPO%20R11&ssmix=a&manifest_version_code=990&dpi=320&uuid=866174725888628&app_name=aweme&version_name=9.9.0&app_type=normal&ac=wifi&update_version_code=9902&channel=tengxun_new&device_platform=android&iid=104847319549&version_code=990&cdid=fce00742-ccef-4b14-943d-1f62b6d637b0&openudid=48a4725886d57203&device_id=70787469432&resolution=900*1600&os_version=5.1.1&language=zh&device_brand=OPPO&aid=1128&mcc_mnc=46007" page_res = retry_get_url(url, headers=self.headers, proxies=3, timeout=5) page_json = page_res.json() for data in page_json["data"]["word_list"]: title = data["word"] if title: dic = { "platform": self.platform, "title": title, "fetch_time": int(datetime.datetime.now().timestamp() * 1e3), "hot_value": data.get("hot_value"), "top": data.get("position"), } bulk_list.append(dic) hot_words_output_result(bulk_list) return True def search_page(self, title=None,**kwargs): data_list = [] headers = { "Host": "aweme.snssdk.com", "Connection": "keep-alive", # "Cookie": "d_ticket=38c841789e38ea43c6338910dac65ffe192e3; odin_tt=82086544bb9028f027b5aea78724ccf512dead26658f45321be33bade615793782bf6ac7fe0c18b73b9592f4284413d5300974810d439b42ef0b3eaa761b1640; msh=cakLg8lvbK5CxiSWkIbD2UInwAI; sid_guard=09fe3dfd89dfbc79f081fb2db9dd81ee%7C1581832192%7C5184000%7CThu%2C+16-Apr-2020+05%3A49%3A52+GMT; uid_tt=da0b53b7563eca87c47da41f5f17c30f; uid_tt_ss=da0b53b7563eca87c47da41f5f17c30f; sid_tt=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid=09fe3dfd89dfbc79f081fb2db9dd81ee; sessionid_ss=09fe3dfd89dfbc79f081fb2db9dd81ee; install_id=104847319549; ttreq=1$51e484720311469c4b70f4754d730d538a074c4b", # "X-SS-REQ-TICKET": "1583139618192", # "X-Tt-Token": "0009fe3dfd89dfbc79f081fb2db9dd81ee013243f7134b3eb37249cc729a5276172df69a4391b56ae4bf253c3c6352322611", "sdk-version": "1", # "x-tt-trace-id": "00-9a797f160a107b431078db3e93480468-9a797f160a107b43-01", "User-Agent": "com.ss.android.ugc.aweme/990 (Linux; U; Android 5.1.1; zh_CN; OPPO R11; Build/NMF26X; Cronet/77.0.3844.0)", "Accept-Encoding": "gzip, deflate", # "X-Gorgon": "0401a0514001f64964a8ebef9f4305ccbef2df1aa3c92fdf955a", # "X-Khronos": "1583139618", } url = "https://aweme.snssdk.com/aweme/v1/hot/search/video/list/?hotword={0}&offset=0&count=12&source=trending_page&is_ad=0&item_id_list&is_trending=0&os_api=22&device_type=OPPO%20R11&ssmix=a&manifest_version_code=990&dpi=320&uuid=866174725888628&app_name=aweme&version_name=9.9.0&ts=1583139619&app_type=normal&ac=wifi&update_version_code=9902&channel=tengxun_new&_rticket=1583139618192&device_platform=android&iid=104847319549&version_code=990&cdid=fce00742-ccef-4b14-943d-1f62b6d637b0&openudid=48a4725886d57203&device_id=70787469432&resolution=900*1600&os_version=5.1.1&language=zh&device_brand=OPPO&aid=1128&mcc_mnc=46007".format(title) res = retry_get_url(url, headers=headers, timeout=5, proxies=3) page_text = res.json() for one_video in page_text["aweme_list"]: video_dic = {} video_dic['title'] = one_video.get('desc') video_dic['url'] = one_video.get('share_url') releaser_id = one_video.get('author_user_id') video_dic['releaser'] = one_video.get('author').get("nickname") video_dic['releaserUrl'] = "https://www.iesdouyin.com/share/user/%s" % releaser_id release_time = one_video.get('create_time') video_dic['release_time'] = int(release_time * 1e3) video_dic['duration'] = int(one_video.get('duration') / 1000) video_dic['play_count'] = 0 video_dic['repost_count'] = one_video.get('statistics').get('share_count') video_dic['comment_count'] = one_video.get('statistics').get('comment_count') video_dic['favorite_count'] = one_video.get('statistics').get('digg_count') video_dic['video_id'] = one_video.get('aweme_id') video_dic['fetch_time'] = int(datetime.datetime.now().timestamp() * 1e3) video_dic['releaser_id_str'] = "抖音_%s" % releaser_id video_dic['platform'] = "抖音" video_dic['video_img'] = one_video.get('video').get('cover').get('url_list')[0] video_dic["is_hot"] = 1 video_dic["data_provider"] = "CCR" data_list.append(video_dic) output_result(result_Lst=data_list, platform=self.platform, output_to_es_raw=True, ) data_list.clear() ## sign和ts为加密字段 无法解决 def get_hot_videos(self,*args,**kwargs): self.search_page(**kwargs) if __name__ == "__main__": crawler = Crawler_douyin() crawler.get_hot_words() crawler.search_page("模仿刘柏辛的哼翻车")