# -*- coding: utf-8 -*-
"""
Crawler for news search results on Shenma (sm.cn), the UC/Alibaba mobile
search engine: for each hit it records title, url, release time and source,
fetches the full article page, and, for Dayu-hosted articles, the clean
body text from the Dayu content API.

Created on Wed Aug 22 10:57:16 2018
@author: fangyucheng
"""
import re
import json

import requests
from bs4 import BeautifulSoup

from crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp

# Cookies and headers captured from a browser session against the mobile
# search API (api.m.sm.cn); they are session-bound and will eventually go stale.
cookie = ('sm_uuid=300c859169f7a0e16f6f3fd637b51b44%7C%7C%7C1534912939;'
          'sm_diu=300c859169f7a0e16f6f3fd637b51b44%7C%7C11eeeeee4a6df8bafe%7C1534912939;'
          'cna=T8v9EQPOimUCAXL/Az0YrDOB;'
          'isg=BEpKJ2iKJQYvZqlV7VhJwkckmDMsk__fIdGRvNSD9x0oh-tBvMtipV61kzX-bEYt;'
          'sm_sid=9a1582ab658abd059600560bb5d855a0;'
          'phid=9a1582ab658abd059600560bb5d855a0')

headers = {'Host': 'api.m.sm.cn',
           'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
           'Accept-Encoding': 'gzip, deflate, br',
           'Cookie': cookie,
           'Connection': 'keep-alive',
           'Upgrade-Insecure-Requests': '1',
           'Cache-Control': 'max-age=0'}

# Cookies and headers for fetching full article pages from mparticle.uc.cn.
video_page_cookie = ('Hm_lvt_c337010bc5a1154d2fb6741a4d77d226=1535364629;'
                     'Hm_lpvt_c337010bc5a1154d2fb6741a4d77d226=1535364629;'
                     'vpstoken=t7Kp5v8ulKpE3VrNXYWg6w%3D%3D;'
                     'cna=T8v9EQPOimUCAXL/Az0YrDOB; hasLoadCommentEmojiData=1;'
                     'isg=BOjoR1oOJwg5eguMtQXfZ9aDutU6uX1RiONAHqIZNGNW_YhnSiEcq34_8RUNVgTz;'
                     '_pk_id.070b5f1f4053.1564=84fc2996-3cae-4f2a-8df4-92f03a3ce790.1535371022.1.1535371040.1535371022.;'
                     '_pk_ses.070b5f1f4053.1564=*')

video_page_headers = {'Host': 'mparticle.uc.cn',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                      'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                      'Accept-Encoding': 'gzip, deflate, br',
                      'Cookie': video_page_cookie,
                      'Connection': 'keep-alive',
                      'Upgrade-Insecure-Requests': '1',
                      'Cache-Control': 'max-age=0'}

# Cookies and headers for the m.sm.cn feed API used by the non-headline
# categories ('足球' and 'NBA').
other_type_name_cookie = ('sm_uuid=300c859169f7a0e16f6f3fd637b51b44%7C%7C%7C1534912939;'
                          'sm_diu=300c859169f7a0e16f6f3fd637b51b44%7C%7C11eeeeee4a6df8bafe%7C1534912939;'
                          'cna=T8v9EQPOimUCAXL/Az0YrDOB;'
                          'isg=BKSkG1vmo5TKrNcnhx6PEP2KdqFWlfntaw-Pbr7FMW94aUUz5k2-NwSDLQdUqgD_;'
                          'sm_sid=254b6b0e0ceded0fc0e605dd15979af4;'
                          'phid=254b6b0e0ceded0fc0e605dd15979af4')

other_type_name_headers = {'Host': 'm.sm.cn',
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
                           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                           'Accept-Encoding': 'gzip, deflate, br',
                           'Cookie': other_type_name_cookie,
                           'Connection': 'keep-alive',
                           'Upgrade-Insecure-Requests': '1',
                           'Cache-Control': 'max-age=0'}
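
# A quick sanity check (a sketch, not part of the original script): the canned
# cookies above may have gone stale, so it can be worth probing the search API
# before a long crawl, e.g.
#   probe = requests.get('https://api.m.sm.cn/rest?q=NBA'
#                        '&method=Highquality.quality&page_num=1',
#                        headers=headers)
#   probe.json()  # expected to contain the 'error' and 'data' keys used below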

# Maps a human-readable category to the method name of the sm.cn search API:
# '足球' (football) -> 'Football', 'NBA' -> 'Nbanew',
# '精选头条' (featured headlines) -> 'Highquality.quality'.
keyword_type_dic = {'足球': 'Football',
                    'NBA': 'Nbanew',
                    '精选头条': 'Highquality.quality'}


def get_whole_page_from_article_url(url, whole_page_text):
    """Fetch the clean article body from the Dayu content API.

    Raises when the page is not a Dayu article; callers treat that as
    "no detail available" and keep the raw page text instead.
    """
    # The article page embeds the publisher id as "biz_org_id":<digits>,
    biz_id_str = re.findall(r'"biz_org_id":\d+,', whole_page_text)[0]
    biz_id = re.findall(r'\d+', biz_id_str)[0]
    # The article id travels in a 'wm_aid=<id>' segment of the '!'-joined URL.
    get_aid = None
    for segment in url.split('!'):
        if 'wm_aid' in segment:
            get_aid = segment
    article_id = get_aid.split('=')[-1]
    article_url = ('https://ff.dayu.com/contents/origin/' + article_id
                   + '?biz_id=' + biz_id)
    get_article_page_dic = requests.get(article_url).json()
    whole_page_text = get_article_page_dic['body']['text']
    print('get whole page from %s' % article_url)
    return whole_page_text
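
# A minimal usage sketch (hypothetical values, for illustration only):
#   raw_html = requests.get(article_page_url, headers=video_page_headers).text
#   body_text = get_whole_page_from_article_url(article_page_url, raw_html)
# where article_page_url is a Dayu article link returned by the search API.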


def shenma_info_page_highquality(keyword, max_page_num, kw_type='精选头条'):
    """Crawl up to max_page_num pages of the JSON search API and return one
    dict per article (title, url, release_time, source, whole_page_text)."""
    keyword_type = keyword_type_dic[kw_type]
    page_num = 1
    result_lst = []
    while page_num <= max_page_num:
        search_url = ('https://api.m.sm.cn/rest?q=' + keyword
                      + '&method=' + keyword_type
                      + '&page_num=' + str(page_num))
        page_dic = requests.get(search_url, headers=headers).json()
        if page_dic['error'] == 1:
            print('there is no content about %s' % keyword)
            return result_lst
        info_lst = page_dic['data']['content']
        print('get page at page %s' % page_num)
        page_num += 1
        for line in info_lst:
            # The API wraps the matched keyword in <em> highlight tags.
            title = line['title'].replace('<em>', '').replace('</em>', '')
            url = line['url']
            release_time = trans_strtime_to_timestamp(line['time'])
            source = line['source']
            try:
                whole_page_text = requests.get(url, headers=video_page_headers).text
                try:
                    # Prefer the clean body text when the article is hosted on Dayu.
                    whole_page_text = get_whole_page_from_article_url(url, whole_page_text)
                except Exception:
                    print("don't get article detail from dayu")
            except Exception:
                print('get whole page process error %s' % title)
                whole_page_text = 'missing'
            info_dic = {'title': title,
                        'url': url,
                        'release_time': release_time,
                        'source': source,
                        'whole_page_text': whole_page_text}
            result_lst.append(info_dic)
    return result_lst
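

# A minimal persistence sketch (an addition, not part of the original crawler):
# append each result dict as one JSON line, keeping Chinese text unescaped.
# The default file name is an arbitrary choice.
def save_result_lst(result_lst, file_name='shenma_news.jsonl'):
    with open(file_name, 'a', encoding='utf-8') as out_file:
        for info_dic in result_lst:
            out_file.write(json.dumps(info_dic, ensure_ascii=False) + '\n')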


def shenma_info_page_other_type_name(keyword, max_page_num, kw_type='NBA'):
    """Same as shenma_info_page_highquality, but the feed API used for '足球'
    and 'NBA' returns rendered HTML, so items are parsed with BeautifulSoup."""
    keyword_type = keyword_type_dic[kw_type]
    page_num = 1
    result_lst = []
    while page_num <= max_page_num:
        search_url = ('https://m.sm.cn/api/rest?method=' + keyword_type
                      + '.feed&format=json&q=' + keyword
                      + '&uads=&page=' + str(page_num) + '&ps_index=30')
        page_dic = requests.get(search_url, headers=other_type_name_headers).json()
        if page_dic['error'] == 1:
            print('there is no content about %s' % keyword)
            return result_lst
        # Each <li class="y-feed-item"> in the rendered feed is one news item.
        soup = BeautifulSoup(page_dic['data']['feed_html'], 'html.parser')
        info_lst = soup.find_all('li', {'class': 'y-feed-item'})
        print('get page at page %s' % page_num)
        page_num += 1
        for line in info_lst:
            title = line.find('p', {'class': 'y-feed-title'}).text
            url = line.a['href']
            releaser_time_str = line.find('span', {'class': 'y-feed-desc-time'}).text
            release_time = trans_strtime_to_timestamp(releaser_time_str)
            source = line.find('span', {'class': 'y-feed-desc-source'}).text
            try:
                whole_page_text = requests.get(url).text
                print('get news %s' % title)
            except Exception:
                print('get whole page process error %s' % title)
                whole_page_text = 'missing'
            info_dic = {'title': title,
                        'url': url,
                        'release_time': release_time,
                        'source': source,
                        'whole_page_text': whole_page_text}
            result_lst.append(info_dic)
    return result_lst


if __name__ == '__main__':
    keyword = '独行侠'  # the Dallas Mavericks
    max_page_num = 10
    result2 = shenma_info_page_other_type_name(keyword, max_page_num, kw_type='NBA')
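    # A usage sketch for the headline crawler as well (the keyword below is an
    # illustrative choice, not from the original script); uncomment to run:
    # result1 = shenma_info_page_highquality('世界杯', max_page_num,
    #                                        kw_type='精选头条')
    # save_result_lst(result1 + result2)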