Commit 667bc377 authored by litaolemo's avatar litaolemo

update

parent b81bb38b
......@@ -10,16 +10,17 @@ from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
hosts = '172.18.52.14'
port = 9200
es_framework = Elasticsearch(hosts='172.16.32.37', port=9200)
HTTP_AUTH = ("elastic", "gm_test")
es_framework = Elasticsearch(hosts=hosts, port=port, http_auth=HTTP_AUTH)
index_target_releaser = 'target_releasers'
doc_type_target_releaser = 'doc'
def bulk_write_target_releasers(dict_Lst,
index=index_target_releaser,
doc_type=doc_type_target_releaser):
index=index_target_releaser,):
bulk_write_body=''
write_counter=0
for line in dict_Lst:
......@@ -28,8 +29,8 @@ def bulk_write_target_releasers(dict_Lst,
releaser=line['releaser']
platform=line['platform']
doc_id_releaser='%s_%s' % (platform, releaser)
action_str=('{ "index" : { "_index" : "%s", "_type" : "%s","_id" : "%s" } }'
% (index_target_releaser, doc_type_target_releaser, doc_id_releaser) )
action_str=('{ "index" : { "_index" : "%s","_id" : "%s" } }'
% (index_target_releaser, doc_id_releaser))
data_str=json.dumps(line, ensure_ascii=False)
line_body = action_str + '\n' + data_str + '\n'
bulk_write_body += line_body
......@@ -58,7 +59,7 @@ def get_releaserUrls_from_es(platform,
# search_body['query']['bool']['filter'].append(frequency_dict)
# print(target_index,doc_type_target_releaser,search_body)
print(search_body)
search_resp=es_framework.search(index=target_index,
search_resp= es_framework.search(index=target_index,
body=search_body,
size=0,
request_timeout=100)
......
......@@ -34,8 +34,8 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='172.16.40.164', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
# rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('Pass platform names, they will be assembled in python list.'))
......
......@@ -21,7 +21,7 @@ from crawler.gm_upload.gm_upload import upload, upload_file
from selenium.webdriver import ActionChains
from selenium import webdriver
try:
from crawler_sys.framework.func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
......
#!/bin/bash
#sudo su - gmuser
#source /root/anaconda3/bin/activate
crawler-ops
#conda activate crawler_env
#/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5 > /data/log/crawler/write_task.log &
/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p douban -d 1 -proxies 5 > /data/log/crawler/write_task.log &
python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5 > /data/log/crawler/write_task.log &
#/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p douban -d 1 -proxies 5 > /data/log/crawler/write_task.log &
......@@ -281,6 +281,20 @@ def douban(releaserUrl,**kwargs):
releaser_id = re.findall(r"people/(.*)", releaserUrl)[0]
return releaser_id
def xiaohongshu(releaserUrl, **kwargs):
    """Extract the user id from a Xiaohongshu profile URL.

    Everything from the first "?" onward is discarded, then the text that
    follows "user/profile/" is returned as-is.

    Args:
        releaserUrl: Profile URL string.
        **kwargs: Accepted and ignored.

    Returns:
        The text after "user/profile/", or None when the URL contains no
        such segment.
    """
    url_without_query = releaserUrl.split("?")[0]
    matches = re.findall(r"user/profile/(.*)", url_without_query)
    return matches[0] if matches else None
def zhihu(releaserUrl, **kwargs):
    """Extract the user slug from a Zhihu profile URL.

    Everything from the first "?" onward is discarded, then the first path
    segment that follows "people/" is returned (any trailing sub-path such
    as "/answers" is dropped).

    Args:
        releaserUrl: Profile URL string.
        **kwargs: Accepted and ignored.

    Returns:
        The path segment after "people/", or None when the URL contains no
        "people/<id>" segment.
    """
    url_without_query = releaserUrl.split("?")[0]
    matches = re.findall(r"people/(.+)", url_without_query)
    if not matches:
        # Return None instead of raising IndexError on a non-profile URL,
        # matching how xiaohongshu() reports "no id found".
        return None
    # split() leaves the string unchanged when there is no "/", so no
    # separate membership check is needed.
    return matches[0].split("/")[0]
plantform_func = {
"toutiao": toutiao,
......@@ -297,7 +311,9 @@ plantform_func = {
"weixin":weixin,
"weibo":weibo,
"pearvideo":pearvideo,
"douban":douban
"douban":douban,
"zhihu":zhihu,
"xiaohongshu":xiaohongshu
}
......@@ -335,4 +351,4 @@ if __name__ == "__main__":
# print(get_releaser_id(platform=platform,releaserUrl=releaserUrl))
# print(releaser_id)
# print(weibo("https://weibo.com/1656058115"))
\ No newline at end of file
print(zhihu("https://www.zhihu.com/people/kokokou/jkh?!23"))
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment