# -*- coding:utf-8 -*-
# @Time : 2020/3/2 11:07
# @Author : litao
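"""Crawler for NetEase News (网易新闻).

Pulls the news app's hot-word list and keyword search results from
c.m.163.com and writes them to Elasticsearch through the shared output
helpers (hot_words_output_result / output_result).
"""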
import base64
import datetime

from crawler.crawler_sys.site_crawler.crawler_wangyi_news import Crawler_wangyi_news as Crawler_wy
from crawler.crawler_sys.utils.output_results import (
    hot_words_output_result,
    output_result,
    retry_get_url,
)

# Re-export the NetEase video-page crawler; the "qq" in the name looks like a
# copy-paste leftover but is kept so existing importers do not break.
crawler_qq_video_page = Crawler_wy().video_page
class Crawler_WangYi_News(object):
    def __init__(self):
        self.platform = "网易新闻"
        # Headers appear to be replayed from a captured NetEase News Android
        # client request; the millisecond timestamps are refreshed per instance.
        timestamp = int(datetime.datetime.now().timestamp() * 1e3)
        self.headers = {
            "data4-Sent-Millis": str(timestamp),
            "Add-To-Queue-Millis": str(timestamp),
            "User-D": "2zx5YfHmoBb72ayxYpQVUg==",
            "User-N": "HPcUw15+Yla9nvIP1c9vbqrHfvh/PCmpfK2DVDjsFFGhp4IV17bdU7hTwNc3Kfe3gVZiNSrnIe+bsZBFoMsbZQ==",
            "httpDNSIP": "101.71.145.130",
            "User-C": "5aS05p2h",
            "User-Agent": "NewsApp/32.1 Android/5.1.1 (OPPO/OPPO R11)",
            "X-NR-Trace-Id": "%s_355730601_866174725888628" % timestamp,
            "Host": "c.m.163.com",
            "Connection": "Keep-Alive",
            "Accept-Encoding": "gzip",
        }
    def get_hot_words(self):
        """Fetch the current hot-word list and push it to ES."""
        bulk_list = []
        url = "http://c.m.163.com/nc/search/hotWord.html"
        page_res = retry_get_url(url, headers=self.headers, proxies=3, timeout=5)
        page_json = page_res.json()
        for data in page_json["hotWordList"]:
            title = data["searchWord"]
            if title:
                dic = {
                    "platform": self.platform,
                    "title": title,
                    "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
                }
                bulk_list.append(dic)
        hot_words_output_result(bulk_list)
        return True
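    # Illustrative shape of the hotWord.html payload, inferred only from the
    # parsing in get_hot_words above (any additional fields are unknown):
    # {"hotWordList": [{"searchWord": "患者私自出院散步"}, ...]}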
    def search_page(self, title):
        """Search `title` under the video tab and write the hits to ES."""
        data_list = []
        # The keyword goes into the URL path base64-encoded.
        encodestr = base64.b64encode(title.encode("utf-8"))
        encodestr = str(encodestr, "utf-8")
        url = "http://c.m.163.com/search/comp2/Kg%3D%3D/20/{0}.html?".format(encodestr)
        # `sign` is replayed from a captured request; only `ts` is regenerated.
        para = "deviceId=2zx5YfHmoBb72ayxYpQVUg%3D%3D&version=newsclient.32.1.android&channel=VDEzNDg2NDc5MDkxMDc%3D&canal=bmV3c19sZl9jcGFfMg%3D%3D&dtype=0&tabname=shipin&position=5YiX6KGo6aG26YOo&ts={0}&sign=Di3opZw%2FFIPDdgreSK4VCKlnMSpm6FPoel5LeY88RgZ48ErR02zJ6%2FKXOnxX046I&spever=FALSE&open=scheme_%E9%BB%98%E8%AE%A4&openpath=/video/VT5O1KVCO".format(str(int(datetime.datetime.now().timestamp())))
        res = retry_get_url(url + para, headers=self.headers, timeout=5, proxies=3)
        page_text = res.json()
        for data in page_text["doc"]["result"]:
            print(data)
            data_list.append(data)
        output_result(result_Lst=data_list,
                      platform=self.platform,
                      output_to_es_raw=True,
                      )
        data_list.clear()
    # `sign` and `ts` are encrypted request fields we have not managed to
    # reproduce, so hot-video crawling is left unimplemented for now.
    def get_hot_videos(self, max_page=10, **kwargs):
        pass
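    # A minimal sketch of how get_hot_videos might look if the `sign`
    # algorithm were ever reverse-engineered; `compute_sign`, `BASE_PARAMS`,
    # and `hot_video_url` are hypothetical and exist nowhere in this codebase:
    #
    #     ts = str(int(datetime.datetime.now().timestamp()))
    #     para = BASE_PARAMS + "&ts={0}&sign={1}".format(ts, compute_sign(ts))
    #     res = retry_get_url(hot_video_url + para, headers=self.headers,
    #                         timeout=5, proxies=3)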
if __name__ == "__main__":
    crawler = Crawler_WangYi_News()
    crawler.get_hot_words()
    crawler.search_page("患者私自出院散步")
    # crawler.get_hot_videos("https://v.qq.com/x/search/?q=%E6%95%99%E8%82%B2%E9%83%A8%E5%9B%9E%E5%BA%94%E6%89%A9%E5%A4%A7%E7%A1%95%E5%A3%AB%E5%92%8C%E4%B8%93%E5%8D%87%E6%9C%AC%E6%8B%9B%E7%94%9F&stag=12", channel="教育部回应扩大硕士和专升本招生")