1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 4 14:00:03 2018
@author: fangyucheng
"""
import argparse
import configparser
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
def _str2bool(value):
    """Convert a command-line token into a real ``bool`` for argparse.

    Without a converter argparse stores the raw string, so an invocation
    such as ``-w False`` produced the truthy string ``'False'``.

    Args:
        value: the raw option value (a ``bool`` is passed through unchanged).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised
            spelling of true/false; argparse reports it as a usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % (value,))


# Command-line interface of the search-page crawler script.
PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
#                                              '/crawler_sys/framework/config'
#                                              '/search_keywords.ini'),
#                     help=('config file absolute path'))
# Repeatable options: each occurrence appends one platform name to the list.
PARSER.add_argument('-p', '--platform', default=[], action='append',
                    help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
                    help=('key_word_legal platform name is required'))
# Boolean switches take an explicit value (e.g. ``-w true``); the converter
# makes sure "False"/"0"/"no" really end up falsy.
PARSER.add_argument('-w', '--output_to_es_raw', default=False, type=_str2bool,
                    help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=True, type=_str2bool,
                    help=('output to es register'))
# Numeric options are converted so that values given on the command line have
# the same type as their integer defaults.
PARSER.add_argument('-n', '--maxpage', default=20, type=int,
                    help=('maxpage'))
PARSER.add_argument('-px', '--proxies_num', default=3, type=int,
                    help=('proxies_num'))
# Parse the command line once, at import time; the module-level names below
# expose the parsed values to the rest of the script.
ARGS = PARSER.parse_args()
# for platform in PLATFORM_LIST:
#     if platform not in legal_platform_name:
#         print("%s is not a legal platform name, "
#               "program will exit" % platform)
#         sys.exit(0)
PLATFORM_LIST = ARGS.platform
proxies_num = ARGS.proxies_num
OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER = (
    ARGS.output_to_es_raw,
    ARGS.output_to_es_register,
)
#
# def func_search_keywordlist(platform):
# search_body = {"query": {"bool": {"filter": []}}}
# search_resp = es_framework.search(index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# body=search_body,
# size=0,
# request_timeout=100)
# total_hit = search_resp['hits']['total']
# releaser_dic = {}
# if total_hit > 0:
# print('Got %d releaser for platform %s.' % (total_hit, platform))
# scan_resp = scan(client=es_framework, query=search_body,
# index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# request_timeout=200)
# for line in scan_resp:
# try:
# title = line['_source']['title']
# page = line['_source']['page']
# releaser_dic[title] = page
# except:
# print('error in :', line)
# continue
# else:
# print('Got zero hits.')
# return releaser_dic
def func_search_keywordlist(platform):
    """Return the built-in search keywords, each mapped to the value 1.

    ``platform`` is accepted but not consulted: the same keyword table is
    returned whatever is passed in. A fresh dict is built on every call, and
    its keys keep the order of the table below.
    """
    keywords = (
        "比基尼线脱毛",
        "嗨体泪沟",
        "根据脸型选发型",
        "圆脸适合什么发型",
        "5热玛吉",
        "耳软骨假体鼻综合",
        "肉毒素去法令纹",
        "吸脂瘦腹部",
        "嗨体填充泪沟",
        "6d小脸针",
        "水剥离",
        "嗨体去颈纹",
        "胶原蛋白填充泪沟",
        "吸脂瘦全身",
        "肉毒素去狐臭",
        "吸脂瘦腰部",
        "fotona4d",
        "嘴综合",
        "胸部下垂矫正",
        "5g天使光雕",
        "唇综合",
        "SVF-gel脂肪胶",
        "嘴角上扬术",
        "嗨体注射",
        "脂肪填充修复",
        "比基尼脱毛",
        "lams吸脂",
        "脂肪填充面部年轻化",
        "嗨体",
        "吸脂祛副乳",
        "m22",
        "胸部提升",
        "fotona",
        "O型腿矫正",
        "肋骨鼻",
        "欣颜",
        "唯颜",
        "垫眉骨",
        "咬肌切除",
        "背部吸脂",
        "m22王者之冠",
        "bbl",
        "胶原蛋白填充祛黑眼圈",
        "热玛吉",
        "热玛吉5代",
    )
    # Every keyword gets the same value, so fromkeys builds the mapping directly.
    return dict.fromkeys(keywords, 1)
# Index name handed to every search_page call as ``es_index``.
ES_INDEX = 'crawler-data-raw'
print(ES_INDEX)
# NOTE(review): ``pages`` (from --maxpage) is not referenced again in this part
# of the script; the per-keyword value from the keyword table is used instead.
pages = ARGS.maxpage

for platform in PLATFORM_LIST:
    search_pages = []
    # get_crawler returns a callable; calling it yields the crawler object.
    initialize_crawler = get_crawler(platform)
    crawler = initialize_crawler()
    KEYWORD_dic = func_search_keywordlist(platform)
    for keyword, page_count in KEYWORD_dic.items():
        print("search keyword '%s' on platform %s" % (keyword, platform))
        search_pages = int(page_count)
        search_options = {
            'keyword': keyword,
            'search_pages_max': search_pages,
            'output_to_es_raw': OUTPUT_TO_ES_RAW,
            'output_to_es_register': OUTPUT_TO_ES_REGISTER,
            'es_index': ES_INDEX,
            'proxies_num': proxies_num,
        }
        try:
            crawler.search_page(**search_options)
        except Exception as err:
            # Best effort: report the failure and carry on with the next keyword.
            print(err)
# config file absolute path on the server:
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'