Commit 4362749f authored by litaolemo

update

parent fee41916
......@@ -8,11 +8,16 @@ import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
def redis_path(redis_type=""):
if redis_type == "on_line":
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, password='ReDis!GmTx*0aN12')
else:
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
return rds
def feed_url_into_redis(dict_Lst, expire=0,
):
def feed_url_into_redis(dict_Lst, expire=0,rds=redis_path):
"""
release_time_lower_bdr must be an int value representing a
timestamp in milliseconds, if given.
......
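Note on the new signature: the rds default in feed_url_into_redis is the redis_path function object itself, so callers are expected to resolve a client and pass it in explicitly. A minimal sketch of that wiring, assuming the hosts shown in this hunk; the url dict shape is only illustrative:

# Sketch only: hosts/passwords are the ones visible in this hunk, the url dicts are placeholders.
from crawler_sys.framework.redis_interact import redis_path, feed_url_into_redis

rds = redis_path("on_line")          # anything other than "on_line" returns the test instance
url_dicts = [{"url": "https://example.com/a", "platform": "weibo"}]
feed_url_into_redis(url_dicts, expire=3600, rds=rds)   # pass the client, not the function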
......@@ -16,20 +16,18 @@ PARSER = argparse.ArgumentParser(description='video platform search page crawler
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao","腾讯新闻", "腾讯视频", "new_tudou"], action='append',
PARSER.add_argument('-p', '--platform', default=["toutiao","weibo", "zhihu"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
PARSER.add_argument('-w', '--output_to_es_raw', default=False,
help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
PARSER.add_argument('-g', '--output_to_es_register', default=True,
help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
help=('maxpage'))
ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
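One argparse subtlety in the '-p' change above: with action='append' and a non-empty default, values supplied on the command line are appended to the default list rather than replacing it. A quick standalone illustration:

import argparse

p = argparse.ArgumentParser()
p.add_argument('-p', '--platform', default=["toutiao", "weibo", "zhihu"], action='append')

print(p.parse_args([]).platform)                 # ['toutiao', 'weibo', 'zhihu']
print(p.parse_args(['-p', 'douban']).platform)   # ['toutiao', 'weibo', 'zhihu', 'douban']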
......@@ -41,45 +39,92 @@ if ARGS.platform != []:
# "program will exit" % platform)
# sys.exit(0)
# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
#
# def func_search_keywordlist(platform):
# search_body = {"query": {"bool": {"filter": []}}}
# search_resp = es_framework.search(index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# body=search_body,
# size=0,
# request_timeout=100)
# total_hit = search_resp['hits']['total']
# releaser_dic = {}
# if total_hit > 0:
# print('Got %d releaser for platform %s.' % (total_hit, platform))
# scan_resp = scan(client=es_framework, query=search_body,
# index=index_target_releaser,
# doc_type=doc_type_target_releaser,
# request_timeout=200)
# for line in scan_resp:
# try:
# title = line['_source']['title']
# page = line['_source']['page']
# releaser_dic[title] = page
# except:
# print('error in :', line)
# continue
# else:
# print('Got zero hits.')
# return releaser_dic
def func_search_keywordlist(platform):
search_body = {"query": {"bool": {"filter": []}}}
search_resp = es_framework.search(index=index_target_releaser,
doc_type=doc_type_target_releaser,
body=search_body,
size=0,
request_timeout=100)
total_hit = search_resp['hits']['total']
releaser_dic = {}
if total_hit > 0:
print('Got %d releaser for platform %s.' % (total_hit, platform))
scan_resp = scan(client=es_framework, query=search_body,
index=index_target_releaser,
doc_type=doc_type_target_releaser,
request_timeout=200)
for line in scan_resp:
try:
title = line['_source']['title']
page = line['_source']['page']
releaser_dic[title] = page
except:
print('error in :', line)
continue
else:
print('Got zero hits.')
return releaser_dic
if OUTPUT_TO_ES_RAW is True:
ES_INDEX = 'test2'
DOC_TYPE = 'doc'
print(ES_INDEX, DOC_TYPE)
res_dic = {}
res_list = ["比基尼线脱毛",
"嗨体泪沟",
"根据脸型选发型",
"圆脸适合什么发型",
"5热玛吉",
"耳软骨假体鼻综合",
"肉毒素去法令纹",
"吸脂瘦腹部",
"嗨体填充泪沟",
"6d小脸针",
"水剥离",
"嗨体去颈纹",
"胶原蛋白填充泪沟",
"吸脂瘦全身",
"肉毒素去狐臭",
"吸脂瘦腰部",
"fotona4d",
"嘴综合",
"胸部下垂矫正",
"5g天使光雕",
"唇综合",
"SVF-gel脂肪胶",
"嘴角上扬术",
"嗨体注射",
"脂肪填充修复",
"比基尼脱毛",
"lams吸脂",
"脂肪填充面部年轻化",
"嗨体",
"吸脂祛副乳",
"m22",
"胸部提升",
"fotona",
"O型腿矫正",
"肋骨鼻",
"欣颜",
"唯颜",
"垫眉骨",
"咬肌切除",
"背部吸脂",
"m22王者之冠",
"bbl",
"胶原蛋白填充祛黑眼圈",
"热玛吉",
"热玛吉5代",
]
for l in res_list:
res_dic[l] = 1
return res_dic
ES_INDEX = 'crawler-data-raw'
print(ES_INDEX)
pages = ARGS.maxpage
for platform in PLATFORM_LIST:
......@@ -92,12 +137,11 @@ for platform in PLATFORM_LIST:
print("search keyword '%s' on platform %s" % (keyword, platform))
search_pages = int(KEYWORD_dic[keyword])
try:
if platform != "腾讯新闻":
crawler.search_page(keyword=keyword,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,)
crawler.search_page(keyword=keyword,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,)
except Exception as e:
print(e)
......
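For context, the loop above treats func_search_keywordlist(platform) as a {keyword: search_pages} mapping, and the hard-coded version sets every value to 1, i.e. one search page per keyword. A self-contained restatement of that contract, with an abbreviated keyword list:

# Minimal sketch of the keyword-dict contract; the keyword list is abbreviated.
def func_search_keywordlist(platform):
    res_list = ["比基尼线脱毛", "嗨体泪沟", "热玛吉"]
    return {keyword: 1 for keyword in res_list}

for keyword, pages in func_search_keywordlist("weibo").items():
    print("search keyword '%s' for %d page(s)" % (keyword, int(pages)))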
......@@ -10,7 +10,7 @@
"""
import redis, random
import kdl, requests
import sys
# from redis.sentinel import Sentinel
# sentinel = Sentinel([('192.168.17.65', 26379),
......@@ -23,9 +23,23 @@ import kdl, requests
# slave = sentinel.discover_slaves('ida_redis_master')
# # Connect to the database
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True, password='ReDis!GmTx*0aN12')
def func_get_redis():
    # Default to the on-line instance; a Windows path on sys.path marks a local test run.
    stats = "on_line"
    for p in sys.path:
        if "C:\\" in p:
            stats = "test"
            break
if stats == "on_line":
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=18, decode_responses=True, password='ReDis!GmTx*0aN12')
else:
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
return rds
rds = func_get_redis()
def get_proxy_from_redis():
try:
one_proxy = rds.randomkey()
......
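The environment switch above keys off sys.path: any entry containing a Windows drive prefix marks the process as a local test run; otherwise the on-line Redis instance is used. A minimal sketch of the same idea, with the hosts taken from this hunk and illustrative helper names:

# Sketch of the detection used by func_get_redis; detect_env/get_rds are illustrative names.
import sys
import redis

def detect_env():
    # A Windows-style path on sys.path is taken to mean a developer machine.
    return "test" if any("C:\\" in p for p in sys.path) else "on_line"

def get_rds():
    if detect_env() == "on_line":
        return redis.StrictRedis(host='172.16.40.164', port=6379, db=18,
                                 decode_responses=True, password='ReDis!GmTx*0aN12')
    return redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)

rds = get_rds()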
......@@ -20,11 +20,11 @@ parser.add_argument('-d', '--days_from_now', default=30, type=int,
'default 30.'))
args = parser.parse_args()
def redis_url_batch_gen(platform, batch_str, release_time_lower_bdr):
url_Lst = pull_url_from_es(platform, release_time_lower_bdr)
if url_Lst != []:
redis_list_name, push_counter = feed_url_into_redis(url_Lst, platform,
batch_str=batch_str)
redis_list_name, push_counter = feed_url_into_redis(url_Lst, platform,)
return (redis_list_name, push_counter)
else:
return (None, None)
......
......@@ -343,8 +343,7 @@ class Crawler_toutiao():
def search_page_old(self, keyword, search_pages_max=12,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None,proxies_num=0):
es_index=None,proxies_num=0):
headers_search = {
"accept": "application/json, text/javascript",
"accept-encoding": "gzip, deflate",
......@@ -428,9 +427,10 @@ class Crawler_toutiao():
print("method get_web_article_info error %s" % e)
print(D0)
toutiao_Lst.append(D0)
except KeyError:
except Exception as e:
# It's totally OK to drop the last returned data value.
# The search API just returns something loosely related to the search.
print(e)
continue
else:
break
......@@ -440,7 +440,7 @@ class Crawler_toutiao():
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
)
toutiao_Lst.clear()
if toutiao_Lst != []:
......@@ -449,7 +449,7 @@ class Crawler_toutiao():
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
)
return toutiao_Lst
......@@ -461,7 +461,7 @@ class Crawler_toutiao():
self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type, proxies_num=proxies_num)
proxies_num=proxies_num)
def find_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
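With doc_type dropped from the toutiao signatures above, a search call now looks roughly like this; the index name and proxy count are placeholders:

# Illustrative call after the doc_type removal; es_index and proxies_num are placeholders.
test = Crawler_toutiao()
test.search_page("热玛吉",
                 search_pages_max=2,
                 output_to_es_raw=False,
                 output_to_es_register=True,
                 es_index="crawler-data-raw",
                 proxies_num=0)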
......@@ -1799,4 +1799,4 @@ if __name__ == '__main__':
# doc_type='doc',
# releaser_page_num_max=3, proxies_num=1))
# test.releaser_page(u)
test.search_page("热玛吉五代")
test.search_page("比基尼线脱毛")
......@@ -432,6 +432,7 @@ class Crawler_weibo():
video_dic["releaserUrl"] = data["userinfo"].get('url')
video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
video_dic["img_list"] = re.findall('img src="(.*?)"',data["content"])
video_dic["mid"] = article_id
return video_dic
except Exception as e:
print("single data row formate error %s" % e)
......@@ -442,6 +443,7 @@ class Crawler_weibo():
output_to_es_register=False,
es_index=None,
doc_type=None, proxies_num=0):
count_false = 0
headers_search = {
"Accept": "application/json, text/plain, */*",
"MWeibo-Pwa": "1",
......@@ -463,6 +465,13 @@ class Crawler_weibo():
if get_page.status_code != 200:
continue
page_dict = get_page.json()
            while page_dict['data'].get("msg") == '这里还没有内容':
                # "这里还没有内容" means "no content here yet"; retry, but give up after 3 attempts.
                get_page = retry_get_url(search_page_url, headers=headers_search)
                page_dict = get_page.json()
                count_false += 1
                if count_false >= 3:
                    break
if page_dict['data'].get("cards")[0].get("card_group"):
for one_line in page_dict['data'].get("cards")[0].get("card_group"):
try:
......@@ -488,7 +497,7 @@ class Crawler_weibo():
# D0['play_count'] = play_count
# D0['comment_count'] = comment_count
# D0['favorite_count'] = favorite_count
D0['article_id'] = article_id
D0['mid'] = article_id
# D0['releaser'] = releaser
# D0['releaserUrl'] = releaserUrl
# D0['release_time'] = release_time
......@@ -501,6 +510,7 @@ class Crawler_weibo():
D0.update(article_info)
except Exception as e:
print("method get_web_article_info error %s" % e)
continue
print(D0)
weibo_Lst.append(D0)
except KeyError:
......@@ -850,5 +860,5 @@ if __name__ == '__main__':
# test_search2 = weibo.search_page(keyword, user_name, password)
# test_repost = weibo.repost_page(weibo_id, user_name, password)
# user_page = weibo.user_page(user_id, user_name, password)
weibo.search_page("迪丽热巴")
weibo.search_page("迪丽热巴",output_to_es_register=True,es_index="crawler-data-raw",search_pages_max=1)
# print(user_page)
......@@ -24,8 +24,8 @@ import requests
# import execjs
import hashlib
import requests
from bs4 import BeautifulSoup
import execjs
# from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
......@@ -48,6 +48,9 @@ class Crawler_zhihu():
self.video_data['platform'] = self.platform
# remove fields that crawled data don't have
pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
self.exec_js = execjs.compile(js)
for popk in pop_key_Lst:
self.video_data.pop(popk)
......@@ -71,7 +74,7 @@ class Crawler_zhihu():
requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
tres_json_test = requests_res.text
res_json = json.loads(re.findall('<script id="js-initialData" type="text/json">(.*?)</script>',tres_json_test)[0])
print(res_json)
# print(res_json)
data = res_json["initialState"]
video_dic = {}
video_dic["url"] = url
......@@ -131,7 +134,7 @@ class Crawler_zhihu():
pass
return res_dict
def search_article_page(self, keyword, search_pages_max=12,
def search_article_page(self, keyword, search_pages_max=10,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
......@@ -151,7 +154,7 @@ class Crawler_zhihu():
"x-app-za": "OS=Web",
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": "1.0_a_Yy6euBS_xfbM28ZhtycHU8gG2XoHtyGTxqHve8rXtY",
"x-zse-86": None,
"referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)),
}
......@@ -163,6 +166,10 @@ class Crawler_zhihu():
url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(
urllib.parse.quote(keyword))
offset = 0
f = "+".join(["3_2.0", url.replace("https://www.zhihu.com",""), headers_search["referer"], cookies_dict["d_c0"]])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b",fmd5)
res_list = []
while offset <= search_pages_max * 20:
offset += 20
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
......@@ -176,7 +183,6 @@ class Crawler_zhihu():
# print(get_page.cookies.get_dict())
cookies_dict.update(get_page.cookies.get_dict())
headers_search.pop("x-zse-86", 0)
res_list = []
if page_dict.get("data"):
for one_line in page_dict['data']:
try:
......@@ -191,7 +197,7 @@ class Crawler_zhihu():
D0.update(res_data)
except Exception as e:
print("method get_web_article_info error %s" % e)
print(D0)
# print(D0)
res_list.append(D0)
except KeyError:
# It's totally ok to drop the last return data value.
......@@ -231,137 +237,6 @@ class Crawler_zhihu():
es_index=es_index,
doc_type=doc_type, proxies_num=proxies_num)
def repost_page(self, weibo_id, user_name, password):
total_page = 0
result_lst = []
cookie = self.manipulate_login(user_name=user_name,
password=password)
# cookie = self.test_cookie(get_cookie)
if cookie is not None:
current_time = int(time.time() * 1000)
repost_url = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id + '&max_id=0&page=1&__rnd=' + str(
current_time)
get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
get_page.encoding = 'utf-8'
try:
page_dic = get_page.json()
total_page = page_dic['data']['page']['totalpage']
repost_info = page_dic['data']['html']
repost_soup = BeautifulSoup(repost_info, 'html.parser')
repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
for line in repost_agg:
try:
one_repost = self.get_repost_info(line)
result_lst.append(one_repost)
print('get one repost')
except:
print('one repost data error')
print(one_repost)
except:
print("can't get repost data")
time.sleep(6)
if cookie is not None and total_page != 0:
for page_num in range(1, total_page + 1):
current_time = int(time.time() * 1000)
repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id +
'&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
time.sleep(3)
get_page.encoding = 'utf-8'
try:
page_dic = get_page.json()
total_page = page_dic['data']['page']['totalpage']
repost_info = page_dic['data']['html']
repost_soup = BeautifulSoup(repost_info, 'html.parser')
repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
for line in repost_agg:
one_repost = self.get_repost_info(line)
result_lst.append(one_repost)
print('get one repost at %s' % page_num)
print(one_repost)
except:
print("can't get repost data")
if result_lst != []:
return result_lst
else:
print("can't get repost data")
return None
def user_page(self, user_id, user_name, password):
result_lst = []
cookie_pool = open('cookie_pool',
'r', encoding='utf-8')
for coo in cookie_pool:
print(coo)
cookie = json.loads(coo)
# cookie = self.manipulate_login(user_name=user_name,password=password)
# cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
if cookie is not None:
for page_num in range(1, 3):
first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
'&profile_ftype=1&page=' + str(page_num) + '#feedtop')
get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
get_page.encoding = 'utf-8'
page = get_page.text
soup = BeautifulSoup(page, 'html.parser')
sfa = soup.find_all('script')
find_content = ''
for line in sfa:
if 'Pl_Official_MyProfileFeed__' in str(line):
find_content = str(line)
find_content = find_content.replace('<script>FM.view(', '').replace(')</script>', '')
# print(find_content)
find_content_dic = json.loads(find_content)
content_for_soup = find_content_dic['html']
soup_content = BeautifulSoup(content_for_soup, 'html.parser')
weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
# time.sleep(15)
for line_count, line in enumerate(weibo_lst):
weibo_info = self.get_user_weibo_info(line, cookie)
weibo_info['user_id'] = user_id
weibo_info['user_url'] = 'https://weibo.com/' + user_id
result_lst.append(weibo_info)
print('get data at element page:%s pagebar:%s' % (page_num, line_count))
get_parameter = soup.find_all('script', {'type': 'text/javascript'})
for line in get_parameter:
if 'pid' in str(line) and 'oid' in str(line):
parameter_str = str(line)
parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
domain = re.findall('\d+', ''.join(re.findall("pid]=\d+", parameter_str)))[0]
special_id = re.findall('\d+', ''.join(re.findall("page_id]=\d+", parameter_str)))[0]
current_time = int(time.time() * 1000)
for pagebar in [0, 1]:
user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain='
+ domain + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar) +
'&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id +
'&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num) + '&pre_page=1'
'&domain_op=' + domain + '&__rnd=' + str(
current_time))
get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
get_page.encoding = 'utf-8'
try:
page_dic = get_page.json()
user_weibo_str = page_dic['data']
user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
# time.sleep(15)
for line in user_weibo_agg:
try:
weibo_info = self.get_user_weibo_info(line, cookie)
weibo_info['user_id'] = user_id
weibo_info['user_url'] = 'https://weibo.com/' + user_id
result_lst.append(weibo_info)
print('get data at ajax page page_num:%s pagebar:%s'
% (page_num, pagebar))
except:
print('one weibo_info error')
except:
print('page error at page_num:%s pagebar:%s' % (page_num, pagebar))
if result_lst != []:
return result_lst
else:
print("can't get repost data")
return None
@staticmethod
def get_single_page(mid):
......@@ -530,40 +405,11 @@ class Crawler_zhihu():
if __name__ == '__main__':
zhihu = Crawler_zhihu()
import os
# import PyV8
import execjs
os.environ["EXECJS_RUNTIME"] = 'Node'
# print(execjs.get().name )
# os.environ["EXECJS_RUNTIME"] = 'Node'
# print(execjs.get().name )
# zhihu.get_serach_page_cookies("热玛吉")
# zhihu.search_page("比基尼线脱毛")
zhihu.search_page("双眼皮",search_pages_max=1,output_to_es_register=True)
# zhihu.get_single_answer_page("325099876","1209953121")
# print(user_page)
if True:
# with PyV8.JSContext() as ctx:
url = "api/v4/search_v3?t=general&q=%E7%83%AD%E7%8E%9B%E5%90%89&correction=1&offset=20&limit=20&lc_idx=25&show_all_topics=0&search_hash_id=12d60c255d0be17b9830355a0d04de5b&vertical_info=0%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1"
referer = "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89"
f = "+".join(["3_2.0", url, referer, '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"'])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
# with open('./zhihu_js.js', 'r') as f:
# # print(f.read())
# ctx.eval(f.read())
# encrypt_str = ctx.locals.add('b',fmd5)
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
encrypt_str = execjs.compile(js).call('b', fmd5)
headers = {
"referer": referer,
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"cookie": 'd_c0="AACSLMY7lBGPTo9fXdy2pmiGQ4ZVVUcqzC4=|1594785557";',
"x-api-version": "3.0.91",
"x-zse-83": "3_2.0",
"x-zse-86": "1.0_%s" % encrypt_str,
}
print(headers)
r = requests.get("https://www.zhihu.com" + url, headers=headers)
print(r.text)
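The x-zse-86 header that search_article_page now computes inline follows the same recipe as the scratch block above: join "3_2.0", the request path, the referer and the d_c0 cookie with '+', md5 the result, and feed that through the b() function in zhihu.js. A hedged sketch of that step in isolation; './zhihu.js' and the d_c0 value are assumptions taken from this file:

# Sketch of the x-zse-86 signing step; zhihu.js and the cookie value are assumptions.
import hashlib
import urllib.parse
import execjs

keyword = "热玛吉"
path = ("/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20"
        "&lc_idx=0&show_all_topics=0").format(urllib.parse.quote(keyword))
referer = "https://www.zhihu.com/search?type=content&q=" + urllib.parse.quote(keyword)
d_c0 = '"AACSLMY7lBGPTo9fXdy2pmiGQ4ZVVUcqzC4=|1594785557"'   # d_c0 cookie value

f = "+".join(["3_2.0", path, referer, d_c0])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()

with open('./zhihu.js', 'r', encoding='utf-8') as fh:
    signer = execjs.compile(fh.read())
print("x-zse-86:", "1.0_" + signer.call("b", fmd5))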
......@@ -15,7 +15,7 @@ import pymysql
import requests
from elasticsearch.exceptions import TransportError
from crawler_sys.framework.redis_interact import feed_url_into_redis
from crawler_sys.framework.redis_interact import rds
from crawler_sys.framework.redis_interact import redis_path
from crawler_sys.framework.es_ccr_index_defination import es_framework as es_site_crawler
from crawler_sys.framework.es_ccr_index_defination import index_url_register
from crawler_sys.framework.es_ccr_index_defination import doc_type_url_register
......@@ -296,7 +296,9 @@ def output_result(result_Lst, platform,
push_to_redis=False,
output_to_test_mysql=False,
output_to_mimas_mysql=False,
es_index=index_site_crawler, **kwargs):
es_index=index_site_crawler,
rds_path="on_line",
**kwargs):
# write data into es crawler-raw index
if output_to_es_raw:
bulk_write_into_es(result_Lst, es_index)
......@@ -308,14 +310,16 @@ def output_result(result_Lst, platform,
index=es_index,
construct_id=True,
platform=platform,
)
if output_to_test_mysql:
pass
# feed url into redis
if push_to_redis:
rds = redis_path(rds_path)
feed_url_into_redis(
result_Lst, expire=kwargs.get("expire"))
result_Lst, expire=kwargs.get("expire"),rds=rds)
# output into file according to passed in parameters
if output_to_file is True and filepath is not None:
......@@ -451,7 +455,7 @@ def load_json_file_into_dict_Lst(filename, path):
return data_Lst
def crawl_a_url_and_update_redis(url, platform, urlhash, processID=-1):
def crawl_a_url_and_update_redis(url, platform, urlhash, rds,processID=-1,):
# find crawler
# perform crawling, get the data
# write es or output to files
......@@ -469,7 +473,7 @@ def crawl_batch_task(url_Lst):
url_info['urlhash'])
def scan_redis_to_crawl():
def scan_redis_to_crawl(rds):
batch_size = 1000
cur = 0
task_batchs = []
......@@ -491,13 +495,13 @@ def scan_redis_to_crawl():
'urlhash': urlhash})
if len(task_batchs) == batch_size:
# multi-processing here
crawl_batch_task(task_batchs)
crawl_batch_task(rds,task_batchs)
task_batchs.clear()
if cur == 0:
break
def remove_fetched_url_from_redis(remove_interval=10):
def remove_fetched_url_from_redis(rds,remove_interval=10):
time.sleep(remove_interval)
cur = 0
delete_counter = 0
......
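Since output_result can now resolve its own client from rds_path while scan_redis_to_crawl and remove_fetched_url_from_redis receive the client directly, callers are expected to obtain it once via redis_path and thread it through. A small sketch of that call pattern; the import path follows the one used elsewhere in this diff:

# Sketch of threading the Redis client through the reworked helpers.
from crawler_sys.framework.redis_interact import redis_path
from crawler.crawler_sys.utils.output_results import (scan_redis_to_crawl,
                                                      remove_fetched_url_from_redis)

rds = redis_path("on_line")

# output_result(result_Lst, platform, push_to_redis=True, rds_path="on_line")
scan_redis_to_crawl(rds)
remove_fetched_url_from_redis(rds, remove_interval=10)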
git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
# git+ssh://git@git.wanmeizhensuo.com/backend/gm-types.git@master
lxml==4.5.1
requests==2.23.0
tqdm==4.46.1
......
......@@ -5,7 +5,7 @@
# @author : litao
def calculate_douban_id(data_dic):
def calculate_zhihu_id(data_dic):
if "answer" in data_dic["url"]:
return data_dic["_id"].replace("zhihu_","")
else:
......
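A quick, hypothetical usage example for the renamed helper; the import path matches the one added in this commit:

# Hypothetical answer-type document; only the branch visible in this hunk is exercised.
from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_zhihu_id

doc = {"url": "https://www.zhihu.com/question/399906323/answer/1269348512",
       "_id": "zhihu_1269348512"}
print(calculate_zhihu_id(doc))   # -> "1269348512"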
......@@ -17,6 +17,7 @@ from write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id import ca
from write_data_into_es.calculate_doc_id.func_calculate_douyin_id import calculate_douyin_id
from write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id import calculate_haokan_id
from write_data_into_es.calculate_doc_id.func_calculate_weibo_id import calculate_weibo_id
from write_data_into_es.calculate_doc_id.func_calculate_zhihu_id import calculate_zhihu_id
from write_data_into_es.calculate_doc_id.func_calculate_douban_id import calculate_douban_id
......@@ -32,7 +33,7 @@ def vid_cal_func(platform):
"haokan":calculate_haokan_id,
"weibo":calculate_weibo_id,
"douban":calculate_douban_id,
"zhihu":calculate_douban_id,
"zhihu":calculate_zhihu_id,
}
def general_vid_cal_func(url):
......