1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 14 16:12:57 2019
@author: zhouyujiang
查找切片中头条发布者+发布时间+duration相同的数据
"""
import pandas as pd
import datetime
import elasticsearch
from elasticsearch.helpers import scan
from crawler_url_video_info import get_target_video_info
# Elasticsearch connection settings.
# NOTE(review): the credentials below are hard-coded in source; consider
# loading them from the environment or an untracked config file instead.
hosts = '192.168.17.11'
port = 80
user = 'zhouyujiang'
passwd = '8tM9JDN2LVxM'
http_auth = (user, passwd)
es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

# Composite keys that have been seen on more than one document.
zyj_set = set()
# Composite key -> list of every `_source` document carrying that key,
# in the order the scan yielded them.
zyj_dict = {}

index = 'short-video-weekly'
doc_type = 'daily-url-2019_w07_s1'
# Half-open release_time window [re_s_t, re_e_t) used by the range filter.
# Read as epoch milliseconds these are 2019-02-09T16:00:00Z and
# 2019-02-16T16:00:00Z, i.e. a 7-day span -- TODO confirm the field's unit.
re_s_t = 1549728000000
re_e_t = 1550332800000
# Number of documents scanned so far; stays 0 if the scan yields nothing.
count = 0

# Only Toutiao documents released inside the window are scanned.
sacn_body = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"platform.keyword": 'toutiao'}},
                {"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
            ]
        }
    }
}
scan_re = scan(client=es, index=index, doc_type=doc_type,
               query=sacn_body, scroll='3m')

# Group every scanned document under its composite key and remember which
# keys occur more than once.
for count, one in enumerate(scan_re, 1):
    if count % 1000 == 0:
        # Progress indicator every 1000 documents.
        print(count)
    line = one['_source']
    releaser = line['releaser']
    release_time = line['release_time']
    duration = line['duration']
    # Composite key: releaser followed by release time and duration.
    zyj_id = releaser + str(release_time) + str(duration)
    if zyj_id in zyj_dict:
        # Second or later occurrence: this key is a duplicate.
        zyj_set.add(zyj_id)
    zyj_dict.setdefault(zyj_id, []).append(line)

# Collect every document that belongs to a duplicated key.  Walking the dict
# (insertion-ordered on Python 3.7+) rather than the set keeps the groups in
# the order the scan first produced them, so the CSV row order no longer
# depends on string-hash ordering of a set.
re_list = []
for one_key, one_group in zyj_dict.items():
    if one_key not in zyj_set:
        continue
    for one_value in one_group:
        # Re-crawling the current play count is disabled for now:
        # url = one_value['url']
        # new_playcount = get_target_video_info(url=url, platform='toutiao')
        # one_value['new_playcount'] = new_playcount
        re_list.append(one_value)

# Dump all duplicated documents to a CSV file in the working directory.
data = pd.DataFrame(re_list)
data.to_csv('头条7zhou重复数据重新抓取播放量.csv')