1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 11:57:40 2018
@author: fangyucheng
"""
import elasticsearch
import json
import time
from crawler_sys.utils.releaser_url_check import test_releaserUrl
from crawler_sys.utils import trans_format
# Connection settings for the Elasticsearch cluster that receives the records.
# NOTE(review): the credentials below are hard-coded in source; consider
# loading them from the environment or an untracked config file.
hosts = '192.168.17.11'
port = 80
user_id = 'fangyucheng'
password = 'VK0FkWf1fV8f'
http_auth = (user_id, password)
lose_re_url = []  # not referenced anywhere else in this section
es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

# Read the input rows and keep only those that actually carry a releaserUrl.
test_lst = trans_format.csv_to_lst_with_headline('F:/add_target_releaser/album_playcnt/album_playcnt_002.csv')
task_lst = [line for line in test_lst if line['releaserUrl'] is not None]

bulk_all_body = ''
poster = 'fangyucheng'
test_re = test_releaserUrl(task_lst)
for one_re in test_re:
    # Only records that test_releaserUrl marked with True_or_False == 1
    # are written to the index; everything else is skipped.
    if one_re['True_or_False'] != 1:
        continue

    # Take a single clock reading so post_time and timestamp always agree.
    now_ms = int(time.time() * 1000)
    post_by = poster
    post_time = now_ms
    timestamp = now_ms
    releaserUrl = one_re['releaserUrl']
    platform = one_re['platform']
    releaser = one_re['releaser']

    # album_play_count is optional; only a missing key is tolerated here,
    # any other error is allowed to surface.
    try:
        album_play_count = one_re['album_play_count']
    except KeyError:
        album_play_count = None

    # The document id is "<platform>_<releaser>". The action line is built
    # with json.dumps so that quotes or backslashes inside the id are
    # escaped and the bulk payload stays valid NDJSON.
    _id = platform + '_' + releaser
    bulk_head = json.dumps({'index': {'_id': _id}}, ensure_ascii=False)

    line_dic = {
        'is_valid': True,
        'platform': platform,
        'post_by': post_by,
    }
    if album_play_count is not None:
        line_dic['album_play_count'] = album_play_count
    line_dic['post_time'] = post_time
    line_dic['releaser'] = releaser
    line_dic['releaserUrl'] = releaserUrl
    line_dic['timestamp'] = timestamp

    data_str = json.dumps(line_dic, ensure_ascii=False)
    bulk_one_body = bulk_head + '\n' + data_str + '\n'
    bulk_all_body += bulk_one_body

    # The buffer is sent and cleared for every accepted record, so each
    # request carries exactly one index action.
    # NOTE(review): batching several records per request would reduce
    # round trips - confirm that per-record sending is intentional.
    es.bulk(index='target_releasers', doc_type='doc',
            body=bulk_all_body, request_timeout=200)
    bulk_all_body = ''
    print('success')