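# Tudou (new_tudou) video crawler: walks a releaser's video listing pages with
# Selenium, converts the relative publish-time strings to datetimes, keeps the
# videos that fall inside a given time range, and writes one CSV per releaser
# plus a summary CSV of video counts.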
from selenium import webdriver
import datetime
import re
import pandas as pd


class Crawler_tudou(object):
    def __init__(self):
        # Disable image loading so the listing pages render faster
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
    @staticmethod
    def video_time(time_str):
        """Convert a Tudou publish-time string (e.g. "5分钟前", "昨天 12:30") to a datetime."""
        now = datetime.datetime.now()
        if "分钟前" in time_str:  # "N minutes ago"
            min_str = re.findall(r"(\d+)分钟前", time_str)[0]
            videotime = now - datetime.timedelta(minutes=int(min_str))
        elif "小时前" in time_str:  # "N hours ago"
            hour_str = re.findall(r"(\d+)小时前", time_str)[0]
            videotime = now - datetime.timedelta(hours=int(hour_str))
        elif "昨天" in time_str:  # "yesterday HH:MM"
            hours, mins = time_str.split(" ")[1].split(":")
            last_day = now - datetime.timedelta(days=1)
            videotime = datetime.datetime(year=last_day.year, month=last_day.month,
                                          day=last_day.day, hour=int(hours), minute=int(mins))
        elif "前天" in time_str:  # "the day before yesterday HH:MM"
            hours, mins = time_str.split(" ")[1].split(":")
            last_day = now - datetime.timedelta(days=2)
            videotime = datetime.datetime(year=last_day.year, month=last_day.month,
                                          day=last_day.day, hour=int(hours), minute=int(mins))
        elif "天前" in time_str:  # "N days ago"
            day_str = re.findall(r"(\d+)天前", time_str)[0]
            videotime = now - datetime.timedelta(days=int(day_str))
        elif "刚刚" in time_str:  # "just now"
            videotime = now
        else:
            if str(now.year) in time_str:
                # Absolute timestamp that already carries the year; format assumed to be "YYYY-MM-DD HH:MM"
                videotime = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M")
            else:
                # "MM-DD HH:MM" within the current year
                month, days = time_str.split(" ")[0].split("-")
                hours, mins = time_str.split(" ")[1].split(":")
                videotime = datetime.datetime(year=now.year, month=int(month), day=int(days),
                                              hour=int(hours), minute=int(mins))
        return videotime
    def time_range_video_num(self, start_time, end_time, url_list):
        """For each releaser page, save the videos published between start_time and end_time."""
        info_lis = []
        for dic in url_list:
            data_lis = []
            for res in self.get_page(dic["url"]):
                title, link, video_time = res
                print(res)
                if start_time < video_time < end_time:
                    data_lis.append((title, link, video_time, dic["url"]))
                else:
                    # Listings are assumed newest-first, so stop at the first video outside the range
                    break
            csv_save = pd.DataFrame(data_lis, columns=["title", "link", "video_time", "url"])
            csv_save.to_csv("%s.csv" % (dic["platform"] + "_" + dic["releaser"]), encoding="GBK")
            info_lis.append([dic["platform"], dic["releaser"], len(data_lis)])
        csv_save = pd.DataFrame(info_lis, columns=["platform", "releaser", "video_num"])
        csv_save.to_csv("%s.csv" % datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"), encoding="GBK")
        self.driver.quit()
    def get_page(self, url):
        """Yield (title, link, publish_time) for each video, paging with the "next" button."""
        page_num = -1
        try:
            self.driver.get(url)
            while True:
                page_num += 1
                if page_num != 0:
                    # Move to the next listing page; raises once the button no longer exists
                    self.driver.find_element_by_class_name("next").click()
                video_lis = self.driver.find_elements_by_xpath(
                    "/html/body/div[2]/div/div[3]/div/div/div/div[2]/div/div/div/div[1]/div")
                for v in video_lis:
                    v_a = v.find_element_by_xpath("./div[2]/a")
                    title = v_a.get_attribute("title")
                    link = v_a.get_attribute("href")
                    video_time = self.video_time(v.find_element_by_class_name("v-publishtime").text)
                    yield (title, link, video_time)
        except Exception as e:
            # Reaching the last page (or any element lookup failure) ends the generator
            print(e)
            print("page %s has no more data" % page_num)
if __name__ == "__main__":
    test = Crawler_tudou()
    url_lis = [
        {"platform": "new_tudou",
         "url": "https://id.tudou.com/i/UNTk2NjE0MDM4NA==/videos?",
         "releaser": "酷娱文化先锋"},
        {"platform": "new_tudou",
         "url": "https://id.tudou.com/i/UMTQ3MDM0MjAw/videos?",
         "releaser": "酷娱文化先锋"}
    ]
    start_time = datetime.datetime(year=2019, month=6, day=6)
    end = datetime.datetime.now()
    test.time_range_video_num(start_time, end, url_lis)