Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chengyang Zhong
crawler
Commits
f1e00560
Commit
f1e00560
authored
Jul 24, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update 更新doc_id生成逻辑
parent
8ebe168b
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
64 additions
and
198 deletions
+64
-198
crawler_douban.py
crawler_sys/site_crawler_test/crawler_douban.py
+0
-0
crawler_weibo.py
crawler_sys/site_crawler_test/crawler_weibo.py
+5
-5
output_results.py
crawler_sys/utils/output_results.py
+0
-0
__init__.py
write_data_into_es/calculate_doc_id/__init__.py
+6
-0
func_calculate_douban_id.py
...data_into_es/calculate_doc_id/func_calculate_douban_id.py
+18
-0
func_calculate_v_qq_video_id.py
..._into_es/calculate_doc_id/func_calculate_v_qq_video_id.py
+4
-3
func_calculate_weibo_id.py
..._data_into_es/calculate_doc_id/func_calculate_weibo_id.py
+13
-0
func_cal_doc_id.py
write_data_into_es/func_cal_doc_id.py
+14
-13
func_calculate_douyin_id.py
write_data_into_es/func_calculate_douyin_id.py
+0
-30
func_calculate_haokan_video_id.py
write_data_into_es/func_calculate_haokan_video_id.py
+0
-23
func_calculate_kwai_video_id_by_data.py
write_data_into_es/func_calculate_kwai_video_id_by_data.py
+0
-26
func_calculate_kwai_video_id_by_url.py
write_data_into_es/func_calculate_kwai_video_id_by_url.py
+0
-22
func_calculate_newTudou_video_id.py
write_data_into_es/func_calculate_newTudou_video_id.py
+0
-16
func_calculate_toutiao_video_id.py
write_data_into_es/func_calculate_toutiao_video_id.py
+0
-19
func_calculate_txxw_video_id.py
write_data_into_es/func_calculate_txxw_video_id.py
+0
-13
func_calculate_wangyi_news_id.py
write_data_into_es/func_calculate_wangyi_news_id.py
+0
-26
target_releaser_add.py
write_data_into_es/target_releaser_add.py
+4
-2
No files found.
crawler_sys/site_crawler_test/crawler_douban.py
0 → 100644
View file @
f1e00560
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_test/crawler_weibo.py
View file @
f1e00560
...
@@ -23,10 +23,10 @@ from crawler.crawler_sys.utils.output_results import output_result
...
@@ -23,10 +23,10 @@ from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
# from crawler.crawler_sys.utils import output_log
from
crawler.crawler_sys.utils.trans_str_play_count_to_int
import
trans_play_count
from
crawler.crawler_sys.utils.trans_str_play_count_to_int
import
trans_play_count
from
crawler.crawler_sys.utils.trans_strtime_to_timestamp
import
weibo_parse_time
,
trans_strtime_to_timestamp
from
crawler.crawler_sys.utils.trans_strtime_to_timestamp
import
weibo_parse_time
,
trans_strtime_to_timestamp
from
crawler.crawler_sys.utils
import
connect_with_redis
#
from crawler.crawler_sys.utils import connect_with_redis
from
crawler.crawler_sys.utils.trans_duration_str_to_second
import
trans_duration
#
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from
crawler.crawler_sys.utils.util_logging
import
logged
#
from crawler.crawler_sys.utils.util_logging import logged
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
#
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from
crawler.crawler_sys.utils.html_to_str
import
dehtml
from
crawler.crawler_sys.utils.html_to_str
import
dehtml
from
write_data_into_es.func_get_releaser_id
import
*
from
write_data_into_es.func_get_releaser_id
import
*
...
@@ -112,7 +112,7 @@ class Crawler_weibo():
...
@@ -112,7 +112,7 @@ class Crawler_weibo():
doc_type
=
None
,
proxies_num
=
None
):
doc_type
=
None
,
proxies_num
=
None
):
print
(
'Processing releaserUrl
%
s'
%
releaserUrl
)
print
(
'Processing releaserUrl
%
s'
%
releaserUrl
)
result_Lst
=
[]
result_Lst
=
[]
releaser_id
,
containerid
=
self
.
get_releaser_id
(
releaserUrl
)
releaser_id
=
self
.
get_releaser_id
(
releaserUrl
)
# xsrf_token,url_extr = self.get_weibo_info(releaser_id)
# xsrf_token,url_extr = self.get_weibo_info(releaser_id)
headers
=
{
headers
=
{
"accept"
:
"application/json, text/plain, */*"
,
"accept"
:
"application/json, text/plain, */*"
,
...
...
crawler_sys/utils/output_results.py
View file @
f1e00560
write_data_into_es/calculate_doc_id/__init__.py
0 → 100644
View file @
f1e00560
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:56
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
\ No newline at end of file
write_data_into_es/calculate_doc_id/func_calculate_douban_id.py
0 → 100644
View file @
f1e00560
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:59
# @File : func_calculate_douban_id.py
# @email : litao@igengmei.com
# @author : litao
import
re
def calculate_douban_id(data_dic):
    """Return the bare identifier used to build a douban document id.

    The "mid" value wins when it is present and truthy. Otherwise the
    first run of digits found in the "url" value is used, and the url
    itself is the fallback when it contains no digits.

    Args:
        data_dic: Mapping describing one crawled item; only the "mid"
            and "url" keys are read.

    Returns:
        The "mid" value, or the first digit run of the url, or the "url"
        value unchanged. A missing "url" key yields None instead of
        raising KeyError from inside the fallback path.
    """
    mid = data_dic.get("mid")
    if mid:
        return mid

    url = data_dic.get("url")
    if not isinstance(url, str):
        # Nothing searchable (missing key, None, ...): hand the value back
        # as-is, which is what the old catch-all fallback produced.
        return url

    digits = re.search(r"\d+", url)
    return digits.group(0) if digits else url
\ No newline at end of file
write_data_into_es/func_calculate_v_qq_video_id.py
→
write_data_into_es/
calculate_doc_id/
func_calculate_v_qq_video_id.py
View file @
f1e00560
...
@@ -6,10 +6,11 @@ Created on Mon Nov 6 09:54:09 2017
...
@@ -6,10 +6,11 @@ Created on Mon Nov 6 09:54:09 2017
"""
"""
import
re
import
re
def
calculate_v_qq_video_id
(
v_qq_page_url
):
def
calculate_v_qq_video_id
(
data_dic
):
find_vid
=
re
.
findall
(
'/[0-9a-zA-Z]+.html'
,
v_qq_page_url
)
url
=
data_dic
.
get
(
"url"
)
find_vid
=
re
.
findall
(
'/[0-9a-zA-Z]+.html'
,
url
)
if
find_vid
!=
[]:
if
find_vid
!=
[]:
vid
=
find_vid
[
0
]
.
split
(
'/'
)[
1
]
.
split
(
'.'
)[
0
]
vid
=
find_vid
[
0
]
.
split
(
'/'
)[
1
]
.
split
(
'.'
)[
0
]
else
:
else
:
vid
=
v_qq_page_
url
vid
=
url
return
vid
return
vid
write_data_into_es/calculate_doc_id/func_calculate_weibo_id.py
0 → 100644
View file @
f1e00560
# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 17:59
# @File : func_calculate_weibo_id.py
# @email : litao@igengmei.com
# @author : litao
def calculate_weibo_id(data_dic):
    """Return the bare identifier used to build a weibo document id.

    Args:
        data_dic: Mapping describing one crawled item; only the "mid"
            and "url" keys are read.

    Returns:
        The "mid" value when it is truthy, otherwise whatever is stored
        under "url" (None when that key is absent).
    """
    return data_dic.get("mid") or data_dic.get("url")
\ No newline at end of file
write_data_into_es/func_cal_doc_id.py
View file @
f1e00560
...
@@ -7,15 +7,17 @@ Created on Wed Jun 20 09:19:12 2018
...
@@ -7,15 +7,17 @@ Created on Wed Jun 20 09:19:12 2018
import
hashlib
import
hashlib
from
write_data_into_es.func_calculate_toutiao_video_id
import
calculate_toutiao_video_id
from
write_data_into_es.
calculate_doc_id.
func_calculate_toutiao_video_id
import
calculate_toutiao_video_id
from
write_data_into_es.func_calculate_newTudou_video_id
import
calculate_newTudou_video_id
from
write_data_into_es.
calculate_doc_id.
func_calculate_newTudou_video_id
import
calculate_newTudou_video_id
from
write_data_into_es.func_calculate_v_qq_video_id
import
calculate_v_qq_video_id
from
write_data_into_es.
calculate_doc_id.
func_calculate_v_qq_video_id
import
calculate_v_qq_video_id
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
#from func_calculate_kwai_video_id_by_data import calculate_kwai_video_id_by_data
from
write_data_into_es.func_calculate_kwai_video_id_by_url
import
calculate_kwai_video_id_by_data_by_url
from
write_data_into_es.calculate_doc_id.func_calculate_kwai_video_id_by_url
import
calculate_kwai_video_id_by_data_by_url
from
write_data_into_es.func_calculate_txxw_video_id
import
calculate_txxw_video_id
from
write_data_into_es.calculate_doc_id.func_calculate_txxw_video_id
import
calculate_txxw_video_id
from
write_data_into_es.func_calculate_wangyi_news_id
import
calculate_wangyi_news_id
from
write_data_into_es.calculate_doc_id.func_calculate_wangyi_news_id
import
calculate_wangyi_news_id
from
write_data_into_es.func_calculate_douyin_id
import
calculate_douyin_id
from
write_data_into_es.calculate_doc_id.func_calculate_douyin_id
import
calculate_douyin_id
from
write_data_into_es.func_calculate_haokan_video_id
import
calculate_haokan_id
from
write_data_into_es.calculate_doc_id.func_calculate_haokan_video_id
import
calculate_haokan_id
from
write_data_into_es.calculate_doc_id.func_calculate_weibo_id
import
calculate_weibo_id
from
write_data_into_es.calculate_doc_id.func_calculate_douban_id
import
calculate_douban_id
def
vid_cal_func
(
platform
):
def
vid_cal_func
(
platform
):
...
@@ -27,7 +29,9 @@ def vid_cal_func(platform):
...
@@ -27,7 +29,9 @@ def vid_cal_func(platform):
'腾讯新闻'
:
calculate_txxw_video_id
,
'腾讯新闻'
:
calculate_txxw_video_id
,
"网易新闻"
:
calculate_wangyi_news_id
,
"网易新闻"
:
calculate_wangyi_news_id
,
"抖音"
:
calculate_douyin_id
,
"抖音"
:
calculate_douyin_id
,
"haokan"
:
calculate_haokan_id
"haokan"
:
calculate_haokan_id
,
"weibo"
:
calculate_weibo_id
,
"douban"
:
calculate_douban_id
,
}
}
def
general_vid_cal_func
(
url
):
def
general_vid_cal_func
(
url
):
...
@@ -69,10 +73,7 @@ def cal_doc_id(platform, url=None,
...
@@ -69,10 +73,7 @@ def cal_doc_id(platform, url=None,
except
:
except
:
url
=
None
url
=
None
if
platform
==
'腾讯新闻'
or
platform
==
'haokan'
:
vid_bare
=
vid_cal_func
(
platform
)(
data_dict
)
vid_bare
=
vid_cal_func
(
platform
)(
data_dict
)
else
:
vid_bare
=
vid_cal_func
(
platform
)(
url
)
if
doc_id_type
==
'daily-url'
:
if
doc_id_type
==
'daily-url'
:
if
fetch_day_str
!=
None
:
if
fetch_day_str
!=
None
:
if
platform
==
'toutiao'
:
if
platform
==
'toutiao'
:
...
@@ -118,7 +119,7 @@ def cal_doc_id(platform, url=None,
...
@@ -118,7 +119,7 @@ def cal_doc_id(platform, url=None,
elif
platform
==
'网易新闻'
:
elif
platform
==
'网易新闻'
:
vid
=
'wyxw_
%
s'
%
(
vid_bare
)
vid
=
'wyxw_
%
s'
%
(
vid_bare
)
else
:
else
:
vid
=
'
%
s
'
%
(
vid_bare
)
vid
=
'
%
s
_
%
s'
%
(
platform
,
vid_bare
)
elif
doc_id_type
==
'time-track'
:
elif
doc_id_type
==
'time-track'
:
if
fetch_time_ts
!=
None
:
if
fetch_time_ts
!=
None
:
if
platform
==
'toutiao'
:
if
platform
==
'toutiao'
:
...
...
write_data_into_es/func_calculate_douyin_id.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding:utf-8 -*-
# @Time : 2019/7/16 16:08
# @Author : litao
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import
re
def calculate_douyin_id(url):
    """Normalise a douyin share url into the value used as its video id.

    Args:
        url: The share url of the video.

    Returns:
        * the part of the url before the first "?" when a query string
          is present;
        * otherwise, when the url contains "/video/<id>/", the canonical
          "https://www.iesdouyin.com/share/video/<id>/" form;
        * otherwise the url unchanged.
    """
    # A query string takes precedence over every other rule.
    if "?" in url:
        return url.split("?")[0]

    if "video" not in url:
        return url

    video_ids = re.findall('/video/(.*?)/', url)
    if not video_ids:
        return url
    return "https://www.iesdouyin.com/share/video/%s/" % video_ids[0]
if __name__ == '__main__':
    # Manual smoke check: print the id computed for two sample share urls
    # (the first one deliberately spells the path segment "vido").
    sample_urls = (
        "https://www.iesdouyin.com/share/vido/6688242923181591821/?mid=6688519042262665996",
        "https://www.iesdouyin.com/share/video/6689249077596671245/?mid=6689052145968450308",
    )
    for sample_url in sample_urls:
        print(calculate_douyin_id(sample_url))
\ No newline at end of file
write_data_into_es/func_calculate_haokan_video_id.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding:utf-8 -*-
# @Time : 2019/8/27 16:24
# @Author : litao
import
re
def calculate_haokan_id(data_dic):
    """Extract the haokan video id from the item's url.

    Two url shapes are recognised, tried in this order:

    * a numeric "id=<digits>" parameter (this also matches "vid=<digits>"),
    * a percent-encoded ``context={"nid":"sv_<id>"}`` parameter.

    Args:
        data_dic: Mapping describing one crawled item; only the "url"
            key is read.

    Returns:
        The extracted id string, or the url unchanged when neither shape
        matches. A missing or non-string url is returned as-is (None
        when the key is absent) instead of raising.
    """
    # NOTE: a shortcut returning data_dic["video_id"] used to sit here but
    # was disabled in the original code; it stays disabled.
    url = data_dic.get("url")
    if not isinstance(url, str):
        return url

    patterns = (
        r'id=(\d+)',
        r'context=%7B%22nid%22%3A%22sv_(.+)%22%7D',
    )
    for pattern in patterns:
        found = re.search(pattern, url)
        if found:
            return found.group(1)
    # The marker text may be present without a usable id (e.g. "id=abc");
    # fall back to the url rather than failing with IndexError.
    return url
if __name__ == '__main__':
    # Manual smoke check covering both recognised url shapes.
    sample_items = (
        {"url": "https://sv.baidu.com/videoui/page/videoland?context=%7B%22nid%22%3A%22sv_5091548046938576131%22%7D"},
        {"url": "https://haokan.baidu.com/v?vid=4596161678511752193"},
    )
    for sample_item in sample_items:
        print(calculate_haokan_id(sample_item))
\ No newline at end of file
write_data_into_es/func_calculate_kwai_video_id_by_data.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 3 15:12:09 2018
@author: hanye
"""
import
hashlib
def calculate_kwai_video_id_by_data(kwai_video_dict):
    """Derive a kwai video id from the item's own data.

    The id is the MD5 hex digest of "<title>_<releaser>_<release_time>",
    where spaces, carriage returns, newlines and tabs are removed from
    the title first. When those fields are missing or unusable, the
    item's "url" value is used instead.

    Args:
        kwai_video_dict: Mapping describing one crawled video; reads the
            "title", "releaser", "release_time" and "url" keys.

    Returns:
        The MD5 hex digest, or the "url" value, or None when neither can
        be produced.
    """
    try:
        title_c = kwai_video_dict['title']
        for blank in (' ', '\r', '\n', '\t'):
            title_c = title_c.replace(blank, '')
        kwai_key = '_'.join((
            title_c,
            kwai_video_dict['releaser'],
            str(kwai_video_dict['release_time']),
        ))
        return hashlib.md5(kwai_key.encode('utf-8')).hexdigest()
    except (LookupError, TypeError, AttributeError, UnicodeError):
        # Missing key or non-string field: fall through to the url.
        # Unlike a bare except this lets KeyboardInterrupt/SystemExit
        # propagate.
        pass

    try:
        return kwai_video_dict['url']
    except (LookupError, TypeError):
        return None
write_data_into_es/func_calculate_kwai_video_id_by_url.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 6 15:53:10 2018
@author: zhouyujiang
"""
import
re
def calculate_kwai_video_id_by_data_by_url(kwai_url):
    """Derive a kwai video id from the video's url.

    Takes whatever follows the first "/u/" or "/photo/" segment and
    replaces every "/" in it with "_".

    Args:
        kwai_url: The url of the video page.

    Returns:
        The derived id string, or None when the url has neither segment
        or nothing follows it.
    """
    located = re.search(r"/u/(.+)?|/photo/(.+)?", kwai_url)
    if located is None:
        return None
    # Only one alternative can have captured text; use the first that did.
    for captured in located.groups():
        if captured:
            return captured.replace('/', '_')
    return None
if __name__ == '__main__':
    # Manual smoke check with one sample profile/video url.
    sample_url = 'https://www.kuaishou.com/u/143139353/5601747480'
    print(calculate_kwai_video_id_by_data_by_url(sample_url))
write_data_into_es/func_calculate_newTudou_video_id.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 16:40:20 2017
@author: hanye
"""
import
re
def calculate_newTudou_video_id(newTudou_url):
    """Extract the video id from a new-tudou video url.

    The url is cut at the first ".html"; the id is everything after the
    first "/<single word character>/" segment of what remains.

    Args:
        newTudou_url: The url of the video page.

    Returns:
        The id string, or None when the url is not a string or has no
        such segment.
    """
    try:
        before_suffix = newTudou_url.split('.html')[0]
    except (AttributeError, TypeError):
        # Not a str (e.g. None): no id can be derived. Catching only
        # these lets KeyboardInterrupt/SystemExit propagate.
        return None

    found = re.findall(r"/\w/(.+)?", before_suffix)
    return found[0] if found else None
write_data_into_es/func_calculate_toutiao_video_id.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 6 09:54:09 2017
@author: hanye
"""
import
re
def calculate_toutiao_video_id(toutiao_url):
    """Extract the numeric video id from a toutiao url.

    The id is the first run of digits that is directly followed by "/";
    a trailing "/" is appended first so an id at the very end of the url
    is found too.

    Args:
        toutiao_url: The url of the video page.

    Returns:
        The digit string, or None when the url is empty, not a string,
        or contains no such digit run.
    """
    # Guard first: indexing an empty string raised IndexError and None
    # raised TypeError in the previous implementation.
    if not toutiao_url or not isinstance(toutiao_url, str):
        return None

    if not toutiao_url.endswith('/'):
        toutiao_url += '/'

    found = re.search(r'[0-9]+/', toutiao_url)
    if found is None:
        return None
    # Drop the "/" that terminated the match.
    return found.group(0)[:-1]
write_data_into_es/func_calculate_txxw_video_id.py
deleted
100644 → 0
View file @
8ebe168b
from
write_data_into_es.func_get_releaser_id
import
get_releaser_id
def calculate_txxw_video_id(data_dict):
    """Build the Tencent News ("腾讯新闻") video id for one item.

    Args:
        data_dict: Mapping describing one crawled video; reads the
            "releaserUrl" and "video_id" keys.

    Returns:
        "<video_id>_<releaser_id>" when get_releaser_id yields a truthy
        releaser id, the bare "video_id" value otherwise, or None when
        anything fails (the offending item is printed).
    """
    try:
        releaser_id = get_releaser_id(platform="腾讯新闻",
                                      releaserUrl=data_dict["releaserUrl"])
        video_id = data_dict['video_id']
        if releaser_id:
            return video_id + "_" + releaser_id
        return video_id
    except Exception:
        # Best-effort by design: report the item and carry on. Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        print('error in :', data_dict)
        return None
write_data_into_es/func_calculate_wangyi_news_id.py
deleted
100644 → 0
View file @
8ebe168b
# -*- coding:utf-8 -*-
# @Time : 2019/5/5 14:38
# @Author : litao
import
re
def calculate_wangyi_news_id(url):
    """Extract the NetEase News ("网易新闻") id from an article/video url.

    Args:
        url: The page url; ids are taken from ".../sub/<id>.html" or
            ".../v/<id>.html", with "/sub/" checked first.

    Returns:
        The extracted id, or the url unchanged when it has neither
        segment or does not end the segment with ".html". A non-string
        url is returned as-is instead of raising TypeError.
    """
    if not isinstance(url, str):
        return url

    # Raw strings: "\." in a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    if "/sub/" in url:
        pattern = r'/sub/(.+)\.html'
    elif "/v/" in url:
        pattern = r'/v/(.+)\.html'
    else:
        return url

    found = re.search(pattern, url)
    return found.group(1) if found else url
if __name__ == '__main__':
    # Manual smoke check covering the "/v/" and "/sub/" url shapes.
    sample_urls = (
        "https://c.m.163.com/news/v/VA9LBOJ7S.html",
        "https://c.m.163.com/news/sub/T1539761239294.html",
    )
    for sample_url in sample_urls:
        print(calculate_wangyi_news_id(sample_url))
\ No newline at end of file
write_data_into_es/target_releaser_add.py
View file @
f1e00560
...
@@ -251,7 +251,8 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
...
@@ -251,7 +251,8 @@ def write_to_es(file, push_to_redis=True, update=True, key_releaser=False, updat
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
data_list
=
[{
"releaserUrl"
:
"https://weibo.com/u/1764615662"
,
"releaser"
:
"娱乐圈贵妃"
,
"platform"
:
"weibo"
},
data_list
=
[
{
"releaserUrl"
:
"https://weibo.com/u/1764615662"
,
"releaser"
:
"娱乐圈贵妃"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3662247177"
,
"releaser"
:
"捞娱君"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/3662247177"
,
"releaser"
:
"捞娱君"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2378564111"
,
"releaser"
:
"娱乐扒皮"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2378564111"
,
"releaser"
:
"娱乐扒皮"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2983578965"
,
"releaser"
:
"娱乐圈小青年"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/2983578965"
,
"releaser"
:
"娱乐圈小青年"
,
"platform"
:
"weibo"
},
...
@@ -273,7 +274,8 @@ if __name__ == "__main__":
...
@@ -273,7 +274,8 @@ if __name__ == "__main__":
"platform"
:
"weibo"
},
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6511173721"
,
"releaser"
:
"圈内课代表"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/u/6511173721"
,
"releaser"
:
"圈内课代表"
,
"platform"
:
"weibo"
},
{
"releaserUrl"
:
"https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place"
,
{
"releaserUrl"
:
"https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place"
,
"releaser"
:
"娱闻少女"
,
"platform"
:
"weibo"
}]
"releaser"
:
"娱闻少女"
,
"platform"
:
"weibo"
}
]
extra_dic
=
{
extra_dic
=
{
"department_tags"
:[
"策略组"
],
"department_tags"
:[
"策略组"
],
'key_releaser'
:
True
,
'key_releaser'
:
True
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment