Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
667bc377
Commit
667bc377
authored
Jan 15, 2021
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
b81bb38b
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
330 additions
and
480 deletions
+330
-480
es_target_releasers.py
crawler_sys/framework/es_target_releasers.py
+8
-7
func_get_releaser_id.py
crawler_sys/framework/func_get_releaser_id.py
+287
-287
write_releasers_to_redis.py
crawler_sys/framework/write_releasers_to_redis.py
+2
-2
crawler_xiaohongshu.py
crawler_sys/site_crawler/crawler_xiaohongshu.py
+1
-1
run.sh
run.sh
+3
-3
func_get_releaser_id.py
write_data_into_es/func_get_releaser_id.py
+19
-3
target_releaser_add.py
write_data_into_es/target_releaser_add.py
+10
-177
No files found.
crawler_sys/framework/es_target_releasers.py
View file @
667bc377
...
...
@@ -10,16 +10,17 @@ from elasticsearch import Elasticsearch
from
elasticsearch.helpers
import
scan
# rds=redis.StrictRedis(host='192.168.17.26',port=6379,db=0)
hosts
=
'172.18.52.14'
port
=
9200
es_framework
=
Elasticsearch
(
hosts
=
'172.16.32.37'
,
port
=
9200
)
HTTP_AUTH
=
(
"elastic"
,
"gm_test"
)
es_framework
=
Elasticsearch
(
hosts
=
hosts
,
port
=
port
,
http_auth
=
HTTP_AUTH
)
index_target_releaser
=
'target_releasers'
doc_type_target_releaser
=
'doc'
def
bulk_write_target_releasers
(
dict_Lst
,
index
=
index_target_releaser
,
doc_type
=
doc_type_target_releaser
):
index
=
index_target_releaser
,):
bulk_write_body
=
''
write_counter
=
0
for
line
in
dict_Lst
:
...
...
@@ -28,8 +29,8 @@ def bulk_write_target_releasers(dict_Lst,
releaser
=
line
[
'releaser'
]
platform
=
line
[
'platform'
]
doc_id_releaser
=
'
%
s_
%
s'
%
(
platform
,
releaser
)
action_str
=
(
'{ "index" : { "_index" : "
%
s",
"_type" : "
%
s",
"_id" : "
%
s" } }'
%
(
index_target_releaser
,
doc_
type_target_releaser
,
doc_id_releaser
)
)
action_str
=
(
'{ "index" : { "_index" : "
%
s","_id" : "
%
s" } }'
%
(
index_target_releaser
,
doc_
id_releaser
)
)
data_str
=
json
.
dumps
(
line
,
ensure_ascii
=
False
)
line_body
=
action_str
+
'
\n
'
+
data_str
+
'
\n
'
bulk_write_body
+=
line_body
...
...
@@ -58,7 +59,7 @@ def get_releaserUrls_from_es(platform,
# search_body['query']['bool']['filter'].append(frequency_dict)
# print(target_index,doc_type_target_releaser,search_body)
print
(
search_body
)
search_resp
=
es_framework
.
search
(
index
=
target_index
,
search_resp
=
es_framework
.
search
(
index
=
target_index
,
body
=
search_body
,
size
=
0
,
request_timeout
=
100
)
...
...
crawler_sys/framework/func_get_releaser_id.py
View file @
667bc377
# -*- coding:utf-8 -*-
# @Time : 2019/5/30 11:01
# @Author : litao
import
re
,
requests
try
:
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
except
:
pass
def toutiao(releaserUrl, **kwargs):
    """Extract the releaser id from a Toutiao / 365yg URL.

    The host found in ``releaserUrl`` selects which patterns are tried,
    in order:

    * ``www.toutiao.com`` or ``www.365yg.com``: ``user/<digits>``, then
      ``to_user_id=<digits>``, then ``/m<digits>``.
    * ``m.toutiao.com``: ``profile/<digits>``.
    * ``m.365yg.com``: ``to_user_id=<digits>``.
    * any other URL containing ``user_id``: ``user_id=<digits>``.
    * anything else: the first run of digits in the URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored, so all extractors share one call
            signature.

    Returns:
        The id as a string, or None when none of the selected patterns match.
    """
    if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
        patterns = (r'user/([0-9]+)', r'to_user_id=([0-9]+)', r'/m(\d+)')
    elif 'm.toutiao.com' in releaserUrl:
        # This branch used to return an unbound local (UnboundLocalError)
        # when the pattern did not match; it now yields None like the others.
        patterns = (r'profile/([0-9]+)',)
    elif 'm.365yg.com' in releaserUrl:
        patterns = (r'to_user_id=([0-9]+)',)
    elif "user_id" in releaserUrl:
        patterns = (r"user_id=(\d+)",)
    else:
        patterns = (r"(\d+)",)

    # First pattern that matches wins; the captured group is the id.
    for pattern in patterns:
        found = re.search(pattern, releaserUrl)
        if found:
            return found.group(1)
    return None
def haokan(releaserUrl, **kwargs):
    """Extract the releaser id from a Haokan URL.

    * URL contains ``app_id=``: every run of digits found after the first
      ``app_id=`` is collected and joined with single spaces (normally this
      is just the id itself).
    * URL contains ``app_id`` without ``=``: the digits enclosed in
      ``%22...%22`` are used, falling back to digits enclosed in
      double quotes.
    * Otherwise: the first run of digits anywhere in the URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored, so all extractors share one call
            signature.

    Returns:
        The id as a string. The last two cases return None when no digits
        are found (they previously raised IndexError).
    """
    if "app_id=" in releaserUrl:
        tail = ' '.join(re.findall(r'app_id=.*', releaserUrl))
        return ' '.join(re.findall(r'\d+', tail))

    if "app_id" in releaserUrl:
        # URL-encoded quotes first, then literal quotes.
        patterns = (r"%22(\d+)%22", r'"(\d+)"')
    else:
        patterns = (r'(\d+)',)

    for pattern in patterns:
        found = re.search(pattern, releaserUrl)
        if found:
            return found.group(1)
    return None
def tengxunshipin(releaserUrl, is_qq=False, **kwargs):
    """Resolve releaser information for a Tencent Video URL.

    With ``is_qq`` false, the text after ``vplus/`` is tried as the id:
    as-is when it is 32 characters long, then with anything from ``#``
    onward removed, then with anything from ``/videos`` onward removed
    (lengths 32 or 16 are accepted in the last two steps). If none of these
    qualifies, the page is downloaded through a proxy and the id is read
    from its ``var USER_INFO = {...}`` block. Any failure yields None.

    With ``is_qq`` true, the page is always downloaded and name, id and
    number are read from ``USER_INFO``.

    Args:
        releaserUrl: URL string to inspect or fetch.
        is_qq: Selects the dict-returning mode described above.
        **kwargs: Accepted and ignored.

    Returns:
        ``is_qq`` false: the id string, or None.
        ``is_qq`` true: a dict with keys ``releaser``, ``releaser_id`` and
        ``number_id``, or None when the page cannot be parsed. Download
        errors are not caught in this mode.
    """
    if is_qq:
        proxy_cfg = get_proxy(1)
        response = requests.get(releaserUrl, timeout=2, proxies=proxy_cfg)
        response.encoding = 'utf-8'
        html = response.text
        try:
            info = re.findall("var USER_INFO = ({.*?})", html, flags=re.DOTALL)[0]
            return {
                'releaser': re.findall("name: '(.*)',", info)[0],
                'releaser_id': re.findall("id: '(.*)',", info)[0],
                "number_id": re.findall("number: '(.*)',", info)[0],
            }
        except:
            return None

    try:
        candidate = re.findall("vplus/(.*)", releaserUrl)[0]
        if len(candidate) == 32:
            return candidate
        if "#" in candidate:
            candidate = candidate.split("#")[0]
        if len(candidate) in (32, 16):
            return candidate
        if "/videos" in candidate:
            candidate = candidate.split("/videos")[0]
        if len(candidate) in (32, 16):
            return candidate

        # The URL alone was not enough: fall back to scraping the page.
        proxy_cfg = get_proxy(1)
        response = requests.get(releaserUrl, timeout=5, proxies=proxy_cfg)
        response.encoding = 'utf-8'
        html = response.text
        info = re.findall("var USER_INFO = ({.*?})", html, flags=re.DOTALL)[0]
        return re.findall("id: '(.*)',", info)[0]
    except:
        return None
def new_tudou(releaserUrl, **kwargs):
    """Extract the releaser id from a Tudou URL.

    The query string (everything from the first ``?``) and every ``=`` are
    removed first. Then:

    * URL contains ``videos``: the path segment following ``i/`` is returned.
    * URL ends with ``/``: everything after ``i/`` (trailing slash dropped).
    * Otherwise: the last ``/``-separated segment.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string, or None if extraction raises.
    """
    if "?" in releaserUrl:
        releaserUrl = releaserUrl.split("?")[0]
    if "=" in releaserUrl:
        releaserUrl = releaserUrl.replace("=", "")

    try:
        if 'videos' in releaserUrl:
            joined = ' '.join(re.findall('i/.*/videos', releaserUrl))
            return joined.split('/')[1]
        # Indexing (not endswith) so that an empty string lands in the
        # except clause below.
        if releaserUrl[-1] == "/":
            trimmed = releaserUrl[0:-1]
            return ''.join(re.findall('i/(.*)', trimmed))
        return releaserUrl.split("/")[-1]
    except:
        return None
def douyin(releaserUrl, **kwargs):
    """Extract the numeric id that follows ``user/`` in a Douyin URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string. When nothing can be extracted the URL is printed
        and None is returned.
    """
    try:
        # A failed search gives None, whose .group raises and lands below.
        return re.search("user/(\\d+)", releaserUrl).group(1)
    except:
        print(releaserUrl)
        return None
def tencent_news(releaserUrl, **kwargs):
    """Extract the releaser id from a Tencent News URL.

    * URL contains ``media/``: the digits following ``media/``.
    * Otherwise: the digits following ``chlid=``.

    Args:
        releaserUrl: URL to inspect; it is converted with ``str()`` first.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string. When nothing matches, False is returned for
        ``media/`` URLs and None for all others (both kept as-is for
        existing callers).
    """
    releaserUrl = str(releaserUrl)
    try:
        if "media/" in releaserUrl:
            # The former second attempt with 'media/[0-9]+' could never
            # succeed once this pattern had failed, so it was removed.
            found = re.findall(r"media/(\d+)", releaserUrl)
            return found[0] if found else False
        found = re.findall(r"chlid=(\d+)", releaserUrl)
        return found[0] if found else None
    except Exception:
        return False
def miaopai(releaserUrl, **kwargs):
    """Extract the releaser id from an ``n.miaopai.com`` URL.

    The id is the last ``/``-separated segment with ``.html`` / ``.htm``
    removed.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string, or None (after printing a message) when the URL
        does not contain ``n.miaopai.com``.
    """
    if 'n.miaopai.com' not in releaserUrl:
        print("input illegal releaserUrl %s" % releaserUrl)
        return None

    last_segment = releaserUrl.split('/')[-1]
    # Strip the longer suffix first. The original applied the second replace
    # to the untouched segment, turning "abc.html" into "abcl".
    return last_segment.replace('.html', '').replace('.htm', '')
def kwai(releaserUrl, **kwargs):
    """Extract the releaser id from a Kwai URL.

    * URL contains ``profile``: everything after ``/profile/``.
    * URL contains ``/u/``: the text between ``/u/`` and the last ``/``
      that follows it.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string, ``""`` when the selected pattern does not match,
        or None when the URL contains neither marker.
    """
    if "profile" in releaserUrl:
        pattern = r"/profile/(.+)"
    elif "/u/" in releaserUrl:
        pattern = r"/u/(.+)/"
    else:
        return None

    hit = re.search(pattern, releaserUrl)
    if hit is None:
        return ""
    return hit.group(1)
def wangyi_news(releaserUrl, **kwargs):
    """Extract the releaser id from a NetEase News URL.

    The first marker contained in the URL decides which pattern is used:

    * ``/sub/``  -> text between ``/sub/`` and ``.html``
    * ``video``  -> text between ``/list/`` and ``/video``
    * ``all``    -> text between ``/list/`` and ``/all``

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string, or None when no marker is present or the
        selected pattern does not match.
    """
    rules = (
        ("/sub/", r"/sub/(.+)\.html"),
        ("video", r"/list/(.+)/video"),
        ("all", r"/list/(.+)/all"),
    )
    for marker, pattern in rules:
        if marker in releaserUrl:
            # Only the first applicable rule is ever consulted.
            hits = re.findall(pattern, releaserUrl)
            return hits[0] if hits else None
    return None
def xiaohongshu(releaserUrl, **kwargs):
    """Extract the releaser id from a Xiaohongshu profile URL.

    The query string is discarded, then everything following
    ``user/profile/`` is taken as the id.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string (possibly empty), or None when the URL has no
        ``user/profile/`` part.
    """
    base_url = releaserUrl.split("?")[0]
    hit = re.search(r"user/profile/(.*)", base_url)
    if hit is None:
        return None
    return hit.group(1)
# Dispatch table: platform name -> function that extracts a releaser id from
# a releaser URL. get_releaser_id looks platforms up here; names that are not
# listed resolve to None there. The identifier keeps its existing spelling
# ("plantform") because get_releaser_id refers to it by this name.
plantform_func = {
    "toutiao": toutiao,
    "haokan": haokan,
    "腾讯视频": tengxunshipin,  # Tencent Video
    "new_tudou": new_tudou,
    "腾讯新闻": tencent_news,  # Tencent News
    "miaopai": miaopai,
    "kwai": kwai,
    "网易新闻": wangyi_news,  # NetEase News
    "抖音": douyin,  # Douyin
    "xiaohongshu": xiaohongshu
}
def get_releaser_id(platform=None, releaserUrl=None, is_qq=False):
    """Resolve the releaser id for ``releaserUrl`` on ``platform``.

    The platform name is looked up in ``plantform_func`` and the matching
    extractor is called with ``releaserUrl`` and ``is_qq``.

    Args:
        platform: Platform name; must be a key of ``plantform_func``.
        releaserUrl: URL of the releaser's page.
        is_qq: Forwarded to the extractor as a keyword argument.

    Returns:
        Whatever truthy value the extractor produced, or None when either
        argument is empty, the platform is unknown, or the extractor returns
        a falsy value or raises. Both failure cases print a diagnostic line.
    """
    if not (platform and releaserUrl):
        return None
    if platform not in plantform_func:
        return None

    extractor = plantform_func[platform]
    try:
        # The call itself is what can fail (e.g. IndexError on an unexpected
        # URL). The original try block only wrapped the truthiness test
        # below, which cannot raise, so extractor errors escaped.
        res = extractor(releaserUrl, is_qq=is_qq)
    except Exception as exc:
        print(platform, releaserUrl, "can't git releaser_id", repr(exc))
        return None

    if res:
        return res
    print(platform, releaserUrl, "can't git releaser_id")
    return None
# Manual smoke test, run only when the module is executed as a script:
# resolves one hard-coded Tencent News URL and prints the extracted id.
if __name__ == "__main__":
    # Disabled batch mode left in place: it read releaser rows from a CSV
    # file and printed the id resolved for each row.
    # file = r'D:\work_file\发布者账号\SMG.csv'
    # with open(file, 'r')as f:
    #     head = f.readline()
    #     head_list = head.strip().split(',')
    #     for i in f:
    #         line_list = i.strip().split(',')
    #         line_dict = dict(zip(head_list, line_list))
    #         platform = line_dict['platform']
    #         releaser = line_dict['releaser']
    #         try:
    #             releaserUrl = line_dict['releaserUrl']
    #             if platform == 'new_tudou':
    #                 if releaserUrl[-2:] == '==':
    #                     releaserUrl = releaserUrl + '/videos'
    #                     line_dict['releaserUrl'] = releaserUrl
    #         except:
    #             pass
    #         releaser_id = get_releaser_id(platform=platform, releaserUrl=releaserUrl)
    #         print(platform, releaserUrl, releaser_id)
    # This URL carries the id in its "chlid" query parameter.
    releaser_id = get_releaser_id("腾讯新闻", "https://r.inews.qq.com/getUserVideoList?chlid=5362294&page_time=&coral_uin=ec8bb1459b9d84100312bf035bb43cd4d0&coral_uid=&type=om&uid=7313ae71df9e5367&omgid=&trueVersion=5.8.00&qimei=287801615436009&devid=008796749793280&appver=23_android_5.8.00&qn-rid=9ec6d3f9-d341-4138-b4e2-6b2ed4b98b5b&qn-sig=891289f9217ec9623723c024dd00eaf5")
    print(releaser_id)
\ No newline at end of file
# # -*- coding:utf-8 -*-
# # @Time : 2019/5/30 11:01
# # @Author : litao
# import re, requests
# try:
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# except:
# pass
#
#
# def toutiao(releaserUrl,**kwargs):
# if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
# pattern = 'user/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# else:
# pattern = 'to_user_id=[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('=')[1]
# else:
# re_find = re.findall("/m(\d+)", releaserUrl)
# if re_find:
# return re_find[0]
# else:
# releaser_id = None
# return releaser_id
#
# elif 'm.toutiao.com' in releaserUrl:
# pattern = 'profile/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# return releaser_id
#
# elif 'm.365yg.com' in releaserUrl:
# pattern = 'to_user_id=[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('=')[1]
# else:
# releaser_id = None
# return releaser_id
# elif "user_id" in releaserUrl:
# re_find = re.findall("user_id=(\d+)",releaserUrl)
# if re_find:
# return re_find[0]
# else:
# return None
# else:
# re_find = re.findall("(\d+)", releaserUrl)
# if re_find:
# return re_find[0]
# else:
# return None
# def haokan(releaserUrl,**kwargs):
# if "app_id=" in releaserUrl:
# releaser_id_str = ' '.join(re.findall('app_id=.*', releaserUrl))
# releaser_id = ' '.join(re.findall('\d+', releaser_id_str))
# return releaser_id
# elif "app_id" in releaserUrl:
# try:
# releaser_id_str = re.findall("%22(\d+)%22", releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
# except:
# releaser_id_str = re.findall('"(\d+)"', releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
# else:
# releaser_id_str = re.findall('(\d+)', releaserUrl)[0]
# if releaser_id_str:
# return releaser_id_str
#
#
# def tengxunshipin(releaserUrl,is_qq=False,**kwargs):
# if not is_qq:
# try:
# releaser_id = re.findall("vplus/(.*)", releaserUrl)[0]
# if len(releaser_id) == 32:
# return releaser_id
# else:
# if "#" in releaser_id:
# releaser_id = releaser_id.split("#")[0]
# if len(releaser_id) == 32 or len(releaser_id) == 16:
# return releaser_id
# if "/videos" in releaser_id:
# releaser_id = releaser_id.split("/videos")[0]
# if len(releaser_id) == 32 or len(releaser_id) == 16:
# return releaser_id
# proxies = get_proxy(1)
# get_page = requests.get(releaserUrl, timeout=5,proxies=proxies)
# get_page.encoding = 'utf-8'
# page = get_page.text
# try:
# USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# # releaser = re.findall("name: '(.*)',", USER_INFO)[0]
# releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# # number_id = re.findall("number: '(.*)',", USER_INFO)[0]
# except:
# return None
# return releaser_id
# except:
# return None
# else:
# proxies = get_proxy(1)
# get_page = requests.get(releaserUrl, timeout=2,proxies=proxies)
# get_page.encoding = 'utf-8'
# page = get_page.text
# try:
# USER_INFO = re.findall("var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]
# releaser = re.findall("name: '(.*)',", USER_INFO)[0]
# releaser_id = re.findall("id: '(.*)',", USER_INFO)[0]
# number_id = re.findall("number: '(.*)',", USER_INFO)[0]
# except:
# return None
# D0 = {'releaser': releaser,
# 'releaser_id': releaser_id,
# "number_id": number_id}
# return D0
#
#
#
# def new_tudou(releaserUrl,**kwargs):
# if "?" in releaserUrl:
# releaserUrl = releaserUrl.split("?")[0]
# if "=" in releaserUrl:
# releaserUrl = releaserUrl.replace("=","")
# try:
# if 'videos' in releaserUrl:
# releaser_id_str = ' '.join(re.findall('i/.*/videos', releaserUrl))
# releaser_id = releaser_id_str.split('/')[1]
# return releaser_id
# elif releaserUrl[-1] == "/":
# releaserUrl = releaserUrl[0:-1]
# releaser_id_str = ''.join(re.findall('i/(.*)', releaserUrl))
# releaser_id = releaser_id_str
# return releaser_id
# else:
# releaser_id = releaserUrl.split("/")[-1]
# return releaser_id
# except:
# return None
#
# def douyin(releaserUrl,**kwargs):
# try:
# releaser_id = re.findall("user/(\d+)",releaserUrl)[0]
# except:
# print(releaserUrl)
# return None
#
# return releaser_id
#
#
# def tencent_news(releaserUrl,**kwargs):
# releaserUrl = str(releaserUrl)
# try:
# if "media/" in releaserUrl:
# res = re.findall(r"media/(\d+)", releaserUrl)
# if res:
# return res[0]
# else:
# pattern = 'media/[0-9]+'
# re_find = re.findall(pattern, releaserUrl)
# if re_find != []:
# releaser_id = re_find[0].split('/')[1]
# else:
# releaser_id = False
# return releaser_id
# else:
# res = re.findall(r"chlid=(\d+)", releaserUrl)
# if res:
# return res[0]
# except:
# return False
#
#
# def miaopai(releaserUrl,**kwargs):
# if 'n.miaopai.com' in releaserUrl:
# releaser_id_str = releaserUrl.split('/')[-1]
# releaser_id = releaser_id_str.replace('.html', '')
# releaser_id = releaser_id_str.replace('.htm', '')
# return releaser_id
# else:
# print("input illegal releaserUrl %s" % releaserUrl)
# return None
#
#
# def kwai(releaserUrl,**kwargs):
# if "profile" in releaserUrl:
# res = re.findall(r"/profile/(.+)", releaserUrl)
# if res:
# return res[0]
# else:
# return ""
# elif "/u/" in releaserUrl:
# res = re.findall(r"/u/(.+)/", releaserUrl)
# if res:
# return res[0]
# else:
# return ""
#
#
# def wangyi_news(releaserUrl,**kwargs):
# if "/sub/" in releaserUrl:
# res = re.findall(r"/sub/(.+)\.html", releaserUrl)
# if res:
# return res[0]
# else:
# return None
# elif "video" in releaserUrl:
# res = re.findall(r"/list/(.+)/video", releaserUrl)
# if res:
# return res[0]
# else:
# return None
# elif "all" in releaserUrl:
# res = re.findall(r"/list/(.+)/all", releaserUrl)
# if res:
# return res[0]
# else:
# return None
#
# def xiaohongshu(releaserUrl,**kwargs):
# releaserUrl = releaserUrl.split("?")[0]
# res = re.findall(r"user/profile/(.*)", releaserUrl)
# if res:
# return res[0]
# else:
# return None
#
# plantform_func = {
# "toutiao": toutiao,
# "haokan": haokan,
# "腾讯视频": tengxunshipin,
# "new_tudou": new_tudou,
# "腾讯新闻": tencent_news,
# "miaopai": miaopai,
# "kwai": kwai,
# "网易新闻": wangyi_news,
# "抖音":douyin,
# "xiaohongshu":xiaohongshu
# }
#
#
# def get_releaser_id(platform=None, releaserUrl=None,is_qq=False):
# if platform and releaserUrl:
# if platform in plantform_func:
# func = plantform_func[platform]
# res = func(releaserUrl,is_qq=is_qq)
# try:
# if res:
# return res
# else:
# print(platform, releaserUrl, "can't git releaser_id")
# return None
# except:
# return None
# else:
# # print(plantform," not in target list")
# return None
#
#
# if __name__ == "__main__":
# # file = r'D:\work_file\发布者账号\SMG.csv'
# # with open(file, 'r')as f:
# # head = f.readline()
# # head_list = head.strip().split(',')
# # for i in f:
# # line_list = i.strip().split(',')
# # line_dict = dict(zip(head_list, line_list))
# # platform = line_dict['platform']
# # releaser = line_dict['releaser']
# # try:
# # releaserUrl = line_dict['releaserUrl']
# # if platform == 'new_tudou':
# # if releaserUrl[-2:] == '==':
# # releaserUrl = releaserUrl + '/videos'
# # line_dict['releaserUrl'] = releaserUrl
# # except:
# # pass
# # releaser_id = get_releaser_id(platform=platform, releaserUrl=releaserUrl)
# # print(platform, releaserUrl, releaser_id)
# releaser_id= get_releaser_id("腾讯新闻","https://r.inews.qq.com/getUserVideoList?chlid=5362294&page_time=&coral_uin=ec8bb1459b9d84100312bf035bb43cd4d0&coral_uid=&type=om&uid=7313ae71df9e5367&omgid=&trueVersion=5.8.00&qimei=287801615436009&devid=008796749793280&appver=23_android_5.8.00&qn-rid=9ec6d3f9-d341-4138-b4e2-6b2ed4b98b5b&qn-sig=891289f9217ec9623723c024dd00eaf5")
# print(releaser_id)
\ No newline at end of file
crawler_sys/framework/write_releasers_to_redis.py
View file @
667bc377
...
...
@@ -34,8 +34,8 @@ from redis.sentinel import Sentinel
# 连接数据库
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds
=
redis
.
StrictRedis
(
host
=
'172.16.40.164
'
,
port
=
6379
,
db
=
19
,
decode_responses
=
True
,
password
=
'ReDis!GmTx*0aN12'
)
# rds = redis.StrictRedis(host='172.18.51.10
', port=6379, db=19, decode_responses=True, password='ReDis!GmTx*0aN12')
rds
=
redis
.
StrictRedis
(
host
=
'172.18.51.10'
,
port
=
6379
,
db
=
19
,
decode_responses
=
True
)
parser
=
argparse
.
ArgumentParser
(
description
=
'Specify a platform name.'
)
parser
.
add_argument
(
'-p'
,
'--platform'
,
default
=
[],
action
=
'append'
,
help
=
(
'Pass platform names, they will be assembled in python list.'
))
...
...
crawler_sys/site_crawler/crawler_xiaohongshu.py
View file @
667bc377
...
...
@@ -21,7 +21,7 @@ from crawler.gm_upload.gm_upload import upload, upload_file
from
selenium.webdriver
import
ActionChains
from
selenium
import
webdriver
try
:
from
crawler_sys.framework
.func_get_releaser_id
import
*
from
write_data_into_es
.func_get_releaser_id
import
*
except
:
from
func_get_releaser_id
import
*
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
...
...
run.sh
View file @
667bc377
#!/bin/bash
#sudo su - gmuser
#source /root/anaconda3/bin/activate
crawler-ops
#conda activate crawler_env
#/home/gmuser/.virtualenvs/litao/bin/python3
/srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 5 > /data/log/crawler/write_task.log &
/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py
-p
douban
-d
1
-proxies
5
>
/data/log/crawler/write_task.log &
python
/srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py
-p
weibo
-d
1
-proxies
5
>
/data/log/crawler/write_task.log &
#
/home/gmuser/.virtualenvs/litao/bin/python3 /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p douban -d 1 -proxies 5 > /data/log/crawler/write_task.log &
write_data_into_es/func_get_releaser_id.py
View file @
667bc377
...
...
@@ -281,6 +281,20 @@ def douban(releaserUrl,**kwargs):
releaser_id
=
re
.
findall
(
r"people/(.*)"
,
releaserUrl
)[
0
]
return
releaser_id
def xiaohongshu(releaserUrl, **kwargs):
    """Return the part of a Xiaohongshu URL that follows ``user/profile/``.

    Anything from the first ``?`` onward is dropped before matching.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The id as a string (possibly empty), or None when ``user/profile/``
        does not occur in the URL.
    """
    without_query = releaserUrl.split("?")[0]
    matches = re.findall(r"user/profile/(.*)", without_query)
    return matches[0] if matches else None
def zhihu(releaserUrl, **kwargs):
    """Extract the user slug that follows ``people/`` in a Zhihu URL.

    The query string is dropped first; of the text after ``people/`` only
    the part before the next ``/`` is kept.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        The slug as a string, or None when the URL has no ``people/`` part
        (this case previously raised IndexError).
    """
    base_url = releaserUrl.split("?")[0]
    found = re.search(r"people/(.+)", base_url)
    if found is None:
        return None
    # split() returns the whole string when there is no further "/".
    return found.group(1).split("/")[0]
plantform_func
=
{
"toutiao"
:
toutiao
,
...
...
@@ -297,7 +311,9 @@ plantform_func = {
"weixin"
:
weixin
,
"weibo"
:
weibo
,
"pearvideo"
:
pearvideo
,
"douban"
:
douban
"douban"
:
douban
,
"zhihu"
:
zhihu
,
"xiaohongshu"
:
xiaohongshu
}
...
...
@@ -335,4 +351,4 @@ if __name__ == "__main__":
# print(get_releaser_id(platform=platform,releaserUrl=releaserUrl))
# print(releaser_id)
# print(weibo("https://weibo.com/1656058115"))
\ No newline at end of file
print
(
zhihu
(
"https://www.zhihu.com/people/kokokou/jkh?!23"
))
\ No newline at end of file
write_data_into_es/target_releaser_add.py
View file @
667bc377
...
...
@@ -18,9 +18,11 @@ from write_data_into_es.func_get_releaser_id import get_releaser_id
import
redis
import
hashlib
hosts
=
'172.1
6.32.37
'
hosts
=
'172.1
8.52.14
'
port
=
9200
es
=
Elasticsearch
(
hosts
=
hosts
,
port
=
port
)
HTTP_AUTH
=
(
"elastic"
,
"gm_test"
)
es
=
Elasticsearch
(
hosts
=
hosts
,
port
=
port
,
http_auth
=
HTTP_AUTH
)
# pool = redis.ConnectionPool(host='192.168.17.60', port=6379, db=2, decode_responses=True)
# rds = redis.Redis(connection_pool=pool)
...
...
@@ -113,9 +115,6 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
if
extra_dic
:
line_dict
.
update
(
extra_dic
)
# import pdb;
# pdb.set_trace()
# print(str(get_releaser_id(platform=platform, releaserUrl=releaserUrl)))
line_dict
[
"releaser_id"
]
=
get_releaser_id
(
platform
=
platform
,
releaserUrl
=
releaserUrl
)
if
line_dict
[
"releaser_id"
]:
...
...
@@ -123,46 +122,12 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
else
:
doc_id
=
platform
+
'_'
+
line_dict
[
'releaser'
]
err_id_line
+=
str
(
line
+
2
)
+
","
find_exist
=
{
"query"
:
{
"bool"
:
{
"filter"
:
[
{
"term"
:
{
"_id"
:
doc_id
}}
]
}
}
}
if
not
extra_dic
.
get
(
"project_tags"
):
extra_dic
.
pop
(
"project_tags"
,
0
)
if
not
extra_dic
.
get
(
"department_tags"
):
extra_dic
.
pop
(
"department_tags"
,
0
)
# search_re = es.search(index='target_releasers', doc_type='doc', body=find_exist)
# if search_re['hits']['total'] > 0:
# search_source = search_re['hits']['hits'][0]['_source']
# # print(search_source)
# if search_source.get("project_tags"):
# try:
# # print(kwargs.get("extra_dic"))
# line_dict["project_tags"].extend(search_source.get("project_tags"))
# line_dict["project_tags"] = list(set(line_dict["project_tags"]))
# search_source.pop("project_tags", 0)
# except Exception as e:
# pass
# # print("project_tags error", e)
# if search_source.get("department_tags"):
# try:
# # print(kwargs.get("extra_dic"))
# line_dict["department_tags"].extend(search_source.get("department_tags"))
# line_dict["department_tags"] = list(set(line_dict["department_tags"]))
# search_source.pop("department_tags", 0)
# except Exception as e:
# pass
# # print("project_tags error", e)
# if update:
# line_dict.update(search_source)
# line_dict["post_time"] = search_source.get("post_time")
if
line_dict
.
get
(
"post_time"
):
pass
else
:
...
...
@@ -182,24 +147,7 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
line_dict
[
"project_tags"
]
=
[]
if
not
line_dict
.
get
(
"department_tags"
):
line_dict
[
"department_tags"
]
=
[]
if
line_dict
.
get
(
"add_departments"
):
line_dict
[
"department_tags"
]
.
extend
(
line_dict
.
get
(
"add_departments"
))
line_dict
[
"department_tags"
]
=
list
(
set
(
line_dict
[
"department_tags"
]))
if
line_dict
.
get
(
"del_departments"
):
for
key
in
line_dict
.
get
(
"del_departments"
):
try
:
line_dict
[
"department_tags"
]
.
remove
(
key
)
except
:
continue
if
line_dict
.
get
(
"add_project_tags"
):
line_dict
[
"project_tags"
]
.
extend
(
line_dict
.
get
(
"add_project_tags"
))
line_dict
[
"project_tags"
]
=
list
(
set
(
line_dict
[
"project_tags"
]))
if
line_dict
.
get
(
"del_project_tags"
):
for
key
in
line_dict
.
get
(
"del_project_tags"
):
try
:
line_dict
[
"project_tags"
]
.
remove
(
key
)
except
:
continue
bulk_dic
=
{
"releaser"
:
line_dict
.
get
(
"releaser"
),
"releaserUrl"
:
line_dict
.
get
(
"releaserUrl"
),
...
...
@@ -211,12 +159,11 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
"frequency"
:
3
if
line_dict
.
get
(
"project_tags"
)
else
1
,
"key_releaser"
:
line_dict
.
get
(
"key_releaser"
),
"is_valid"
:
line_dict
.
get
(
"is_valid"
),
"has_data"
:
line_dict
.
get
(
"has_data"
)
if
line_dict
.
get
(
"has_data"
)
else
0
,
#
"has_data": line_dict.get("has_data") if line_dict.get("has_data") else 0,
"project_tags"
:
line_dict
.
get
(
"project_tags"
),
"department_tags"
:
line_dict
.
get
(
"department_tags"
),
'timestamp'
:
int
(
datetime
.
datetime
.
timestamp
(
datetime
.
datetime
.
now
())
*
1000
),
'media_type'
:
line_dict
.
get
(
"media_type"
)
if
line_dict
.
get
(
"media_type"
)
else
""
,
'releaser_type'
:
line_dict
.
get
(
"releaser_type"
)
if
line_dict
.
get
(
"releaser_type"
)
else
""
,
}
...
...
@@ -251,127 +198,13 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
if
__name__
==
"__main__"
:
data_list
=
[
{
"releaserUrl"
:
"https://weibo.com/u/1764615662"
,
"releaser"
:
"娱乐圈贵妃"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3662247177"
,
"releaser"
:
"捞娱君"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2378564111"
,
"releaser"
:
"娱乐扒皮"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2983578965"
,
"releaser"
:
"娱乐圈小青年"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3938976579"
,
"releaser"
:
"娱乐捞饭"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6511177474"
,
"releaser"
:
"小组吃瓜蜀黍"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6343916471"
,
"releaser"
:
"圈内老顽童"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6511177474"
,
"releaser"
:
"八组吃瓜蜀黍"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2921603920"
,
"releaser"
:
"娱乐圈新鲜事"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6470919752"
,
"releaser"
:
"伊丽莎白骨精啊"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1"
,
"releaser"
:
"娱乐榜姐"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3115996363?is_hot=1"
,
"releaser"
:
"娱乐星事"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"星探扒皮"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3926129482"
,
"releaser"
:
"星闻追踪"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5509337969?is_hot=1"
,
"releaser"
:
"卦哥娱乐"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5477320351"
,
"releaser"
:
"圈内扒爷"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"圈八戒 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6511173721"
,
"releaser"
:
"圈内课代表"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱闻少女"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3193443435"
,
"releaser"
:
"圈太妹"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2022990945"
,
"releaser"
:
"圈内狙击手"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1809782810?is_all=1"
,
"releaser"
:
"全娱乐爆料"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5157190426?is_all=1"
,
"releaser"
:
"娱乐扒少"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2125613987?is_all=1"
,
"releaser"
:
"圈内一把手 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005051948622644/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"影视圈扒姐 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2611791490"
,
"releaser"
:
"娱评八公"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1652840683"
,
"releaser"
:
"追星"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5086098727?is_hot=1"
,
"releaser"
:
"闻娱教主"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5101787982?is_all=1"
,
"releaser"
:
"扒婆说"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5101844765?is_hot=1"
,
"releaser"
:
"星娱客 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052115034114/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱乐明星团 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6473952993?is_hot=1"
,
"releaser"
:
"偶像日报"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5106602573?is_hot=1"
,
"releaser"
:
"八哥"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5909342713?"
,
"releaser"
:
"圈内教父"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3200673035?"
,
"releaser"
:
"扒圈老鬼"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005055965621313/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"圈内师爷"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1915749764?is_hot=1"
,
"releaser"
:
"迷妹速报"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1002061836328652/home?from=page_100206&mod=TAB#place"
,
"releaser"
:
"前线娱乐"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5896207859?is_hot=1"
,
"releaser"
:
"娱记者"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5717515328?is_hot=1"
,
"releaser"
:
"娱老汉"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005051795994180/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱乐News"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5978818414?is_hot=1"
,
"releaser"
:
"娱圈蜀黍"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2489917511?is_hot=1"
,
"releaser"
:
"芒果捞扒婆 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5279487569?is_hot=1"
,
"releaser"
:
"娱姐速报 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5106602573?is_hot=1"
,
"releaser"
:
"八哥 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5323541229?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"国内外白富美揭秘 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1003062512591982/home?from=page_100306&mod=TAB#place"
,
"releaser"
:
"圈少爷"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2821843050?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"圈内老鬼"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3028215832?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"娱扒爷"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5336756846?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"兔兔热议"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005051844235935/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱乐圈外汉"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052586409491/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱乐圈吃瓜指南 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5255814135"
,
"releaser"
:
"八组兔区爆料"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2871033210?is_hot=1"
,
"releaser"
:
"八组兔区热议 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052813285937/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"八组兔区娱乐圈"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052831749482/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"八组兔区揭秘"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2709814831"
,
"releaser"
:
"娱大蜀黍"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5634795408"
,
"releaser"
:
"圈八戒"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5176743404"
,
"releaser"
:
"瓜瓜搬运机"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5039775130"
,
"releaser"
:
"娱乐揭秘蜀黍"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/7123521074"
,
"releaser"
:
"饭圈日报"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1746658980"
,
"releaser"
:
"饭圈阿姨"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052453653365/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"圈内星探"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6311417880?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"星扒婆 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1420816495?profile_ftype=1&is_all=1#_0"
,
"releaser"
:
"娱尾纹"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1974754790"
,
"releaser"
:
"教父娱乐"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1818950785?refer_flag=1028035010_&is_hot=1"
,
"releaser"
:
"扒圈有鱼"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1893711543"
,
"releaser"
:
"娱乐有饭"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1002061653255165/home?from=page_100206&mod=TAB#place"
,
"releaser"
:
"娱乐日爆社"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052391322817/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"小娱乐家"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1003061994712500/home?from=page_100306&mod=TAB#place"
,
"releaser"
:
"星扒客push"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5700087877"
,
"releaser"
:
"毒舌八卦"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3779202361"
,
"releaser"
:
"西皮娱乐"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1632619962"
,
"releaser"
:
"瓜组新鲜事"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052103460752/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱嬷嬷 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/5874584452"
,
"releaser"
:
"吃瓜鹅每日搬"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005052397961280/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱大白"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005053246379064/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"娱乐圈扒姐 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/1830483711"
,
"releaser"
:
"娱乐女记"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005053847401640/home?from=page_100505&mod=TAB#place"
,
"releaser"
:
"吃瓜爆料每日搬 "
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://www.douban.com/people/hot_tag"
,
"releaser"
:
"hot_tag"
,
"platform"
:
"douban"
},
{
"releaserUrl"
:
"https://www.douban.com/people/new_tag"
,
"releaser"
:
"new_tag"
,
"platform"
:
"douban"
}
]
# Path to a CSV file on a local Windows drive; it is the first argument of the
# second write_to_es call below.
file = r"D:\work_file\gengmei\litao\temp.csv"

# Extra fields forwarded to write_to_es through its extra_dic keyword.
# The literal previously listed "department_tags" twice (["策略组"] and then
# ["运营"]). Python keeps only the last value for a repeated key, so the first
# entry never took effect; it is removed here and the effective value is the same.
extra_dic = {
    "department_tags": ["运营"],
    'key_releaser': True,
    'frequency': 3,
}
# csv_type = {"SMG": [], "an_hui": [], "ronghe": [], "su_zhou": []}
#ronghe_releaser_write_es(file, post_by="litao")

# NOTE(review): write_to_es is defined outside this chunk. It is called once
# with the in-memory data_list and once with the CSV path -- confirm that both
# calls are intended and that it accepts either input type.
write_to_es(data_list, post_by="litao", extra_dic=extra_dic, push_to_redis=False)
write_to_es(file, post_by="litao", extra_dic=extra_dic, push_to_redis=False)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment