1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
"""
Created on Mon May 28 10:29:57 2018
@author: fangyucheng
"""
import requests
import json
import datetime
import re
from framework.video_fields_std import Std_fields_video
#from . import bulk_write_into_es
import js2py
import hashlib
import time
from selenium import webdriver
class Crawler_Watermelon(Std_fields_video):
    """Crawler for the video stream listing served by ic.snssdk.com.

    The listing endpoint is queried for the ``subv_xg_society`` category.
    Each listed video's page is then opened in Chrome to read the play
    count shown on the page, and one dict per video is collected.
    """

    def write_into_file(self, data_dict, file_obj):
        """Write *data_dict* to *file_obj* as one JSON line and flush.

        Args:
            data_dict: Any JSON-serialisable object (``output_result``
                passes the whole result list).
            file_obj: A writable text file object.
        """
        file_obj.write(json.dumps(data_dict))
        file_obj.write('\n')
        file_obj.flush()

    def feed_url_into_redis(self, dict_Lst):
        """Placeholder hook; currently does nothing."""
        pass

    def output_result(self, result_Lst, output_to_file=False, filepath=None):
        """Dispatch crawled results to the configured sinks.

        Args:
            result_Lst: List of video dicts produced by ``get_list_video``.
            output_to_file: When truthy (and *filepath* is given), append
                the results to a dated file under *filepath*.
            filepath: Directory that receives the output file.
        """
        # NOTE: writing into the es crawler-raw index (bulk_write_into_es)
        # is currently disabled in this module.
        # feed url into redis
        self.feed_url_into_redis(result_Lst)
        # output into file according to passed in parameters
        if output_to_file and filepath is not None:
            output_fn = 'crawler_watermelon_%s_json' % datetime.datetime.now().isoformat()[:10]
            # The context manager closes the handle even if serialisation
            # fails; the original left the file open.
            with open(filepath + '/' + output_fn, 'a', encoding='utf-8') as output_f:
                self.write_into_file(result_Lst, output_f)

    def _scrape_play_count(self, browser, url):
        """Open *url* in *browser* and return the digits shown in the 'num' element.

        The digit groups found in the element text are joined with single
        spaces, e.g. ``'12 345'``.
        """
        browser.get(url)
        # find_element(by, value) exists in both Selenium 3 and 4, whereas
        # find_element_by_class_name was removed in Selenium 4.
        pc_midstep = browser.find_element('class name', 'num').text
        return ' '.join(re.findall(r'\d+', pc_midstep))

    def _build_video_record(self, video_dic, browser):
        """Build the output dict for one decoded video entry.

        Raises ``KeyError``/``TypeError`` when an expected field is missing
        or null, and whatever Selenium raises when the page cannot be
        scraped; the caller treats any of these as "skip this video".
        """
        url = video_dic['display_url']
        return {
            'title': video_dic['title'],
            'url': url,
            # publish_time is multiplied by 1e3 and truncated to int.
            'release_time': int(video_dic['publish_time'] * 1e3),
            'releaser': video_dic['media_name'],
            'play_count': self._scrape_play_count(browser, url),
            'video_id': video_dic['item_id'],
            'releaser_id': video_dic['user_info']['user_id'],
            'fetch_time': int(datetime.datetime.now().timestamp() * 1e3),
            'play_count2': video_dic['read_count'],
        }

    def get_list_video(self, output_to_file=False, filepath=None, page_count=1):
        """Crawl the video listing and return one dict per video found.

        Args:
            output_to_file: Forwarded to ``output_result``.
            filepath: Forwarded to ``output_result``.
            page_count: Number of listing pages to request. Defaults to 1,
                which matches the previous hard-coded behaviour; a value
                of 0 or less requests nothing.

        Returns:
            list[dict]: Records with the keys ``title``, ``url``,
            ``release_time``, ``releaser``, ``play_count``, ``video_id``,
            ``releaser_id``, ``fetch_time`` and ``play_count2``.

        Raises:
            ValueError: If the listing response is not valid JSON.
            KeyError: If the listing response has no ``data`` field.
        """
        result_Lst = []
        max_behot_time = 0
        headers = {'Host': 'ic.snssdk.com',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                   'Accept-Encoding': 'gzip, deflate',
                   'Cookie': 'odin_tt=5b54e47f71b1963502fe03c4028f5672c887a0b739ce2302481beda2a4388a0a538ade820b54b4589da13d18dde9d245',
                   'Connection': 'keep-alive',
                   'Upgrade-Insecure-Requests': '1',
                   'Cache-Control': 'max-age=0'}
        # One browser is created lazily and shared by every video of this
        # call; it is always shut down in the ``finally`` below.
        browser = None
        try:
            for _ in range(page_count):
                time_now = int(time.time())
                listurl = ('http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&max_behot_time='
                           + str(max_behot_time)
                           + '&list_entrance=main_tab&last_refresh_sub_entrance_interval='
                           + str(time_now))
                # Full request URL kept for reference:
                #http://ic.snssdk.com/video/app/stream/v51/?category=subv_xg_society&refer=1&count=20&list_entrance=main_tab&last_refresh_sub_entrance_interval=1527473360&loc_mode=5&tt_from=refresh_auto&play_param=codec_type%3A0&iid=33815381012&device_id=52965120460&ac=wifi&channel=wandoujia&aid=32&app_name=video_article&version_code=653&version_name=6.5.3&device_platform=android&ab_version=359940%2C344692%2C353539%2C356329%2C361439%2C324397%2C361311%2C358091%2C358364%2C356602%2C350431%2C354439%2C325211%2C346575%2C342302%2C361530%2C320651%2C361551&ssmix=a&device_type=MuMu&device_brand=Android&language=zh&os_api=19&os_version=4.4.4&uuid=008796749793280&openudid=54767d8bf41ac9a4&manifest_version_code=253&resolution=1280*720&dpi=240&update_version_code=65307&_rticket=1527473360674&rom_version=cancro-eng+4.4.4+V417IR+eng.root.20180201.174500+release-keys&fp=i2T_FYmuPzL5Fl4ZcrU1FYFeL2FW
                get_page = requests.get(listurl, headers=headers)
                # Parse the body as JSON. The previous eval() executed
                # whatever the server returned, and its true/false/null
                # text substitution also mangled titles containing those
                # words.
                page_dic = json.loads(get_page.text)
                video_agg = page_dic['data']
                for line in video_agg:
                    try:
                        # 'content' is itself a JSON document stored as a string.
                        video_dic = json.loads(line['content'])
                        if not video_dic['has_video']:
                            continue
                        if browser is None:
                            browser = webdriver.Chrome()
                        D0 = self._build_video_record(video_dic, browser)
                        # Cursor sent as max_behot_time on the next page request.
                        max_behot_time = video_dic['behot_time']
                    except Exception as err:
                        # Best effort: one malformed entry or failed page
                        # load must not abort the crawl, but say why it
                        # was skipped instead of hiding it.
                        print('skip one video: %r' % (err,))
                        continue
                    result_Lst.append(D0)
                    print ('get one video')
        finally:
            if browser is not None:
                # quit() ends the whole driver session, not just the window.
                browser.quit()
        self.output_result(result_Lst, output_to_file=output_to_file, filepath=filepath)
        return result_Lst
if __name__ == '__main__':
    # Manual run: crawl the listing once and append the results to a dated
    # file under the directory below.
    filepath = 'D:/CSM3.0/爬虫结果/watermelon'
    output_to_file = True
    test = Crawler_Watermelon()
    gogogo = test.get_list_video(
        output_to_file=output_to_file,
        filepath=filepath,
    )