Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Chengyang Zhong
crawler
Commits
d1db5b38
Commit
d1db5b38
authored
Jul 22, 2020
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update 增加微博爬虫,测试ok
parent
aa4469d5
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
47 additions
and
12 deletions
+47
-12
README.md
README.md
+2
-1
func_get_proxy_form_kuaidaili.py
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py
+1
-1
crawler_toutiao.py
crawler_sys/site_crawler_test/crawler_toutiao.py
+1
-1
crawler_v_qq.py
crawler_sys/site_crawler_test/crawler_v_qq.py
+2
-2
crawler_weibo.py
crawler_sys/site_crawler_test/crawler_weibo.py
+0
-0
trans_strtime_to_timestamp.py
crawler_sys/utils/trans_strtime_to_timestamp.py
+28
-1
requirements.txt
requirements.txt
+7
-0
test_read_config.py
test/test_read_config.py
+1
-1
func_get_releaser_id.py
write_data_into_es/func_get_releaser_id.py
+5
-5
No files found.
README.md
View file @
d1db5b38
# crawler
1.
部署在BJ-
PaaS-test-nvwa
001/srv/apps/
1.
部署在BJ-
GM-Prod-Cos-faiss
001/srv/apps/
2.
创建虚拟环境 conda activate crawler_env/conda deactivate
\ No newline at end of file
crawler_sys/proxy_pool/func_get_proxy_form_kuaidaili.py
View file @
d1db5b38
...
...
@@ -41,7 +41,7 @@ def get_proxy_from_redis():
def
func_get_proxy_to_redis
():
# chance = random.random()
auth
=
kdl
.
Auth
(
"99
7803479675913"
,
"er2siw6i58c61s387sqljvovoz8zybaq
"
)
auth
=
kdl
.
Auth
(
"99
0866563045611"
,
"quxguz4hwm9cxnx6wpjhkokx04klpr8v
"
)
client
=
kdl
.
Client
(
auth
)
# 获取订单到期时间, 返回时间字符串
...
...
crawler_sys/site_crawler_test/crawler_toutiao.py
View file @
d1db5b38
...
...
@@ -1792,7 +1792,7 @@ class Crawler_toutiao():
# pass
data_lis
.
append
(
res
)
if
len
(
data_lis
)
>=
10
:
if
len
(
data_lis
)
>=
10
0
:
output_result
(
result_Lst
=
data_lis
,
platform
=
self
.
platform
,
output_to_file
=
output_to_file
,
...
...
crawler_sys/site_crawler_test/crawler_v_qq.py
View file @
d1db5b38
...
...
@@ -27,7 +27,7 @@ import requests
import
re
import
datetime
import
json
import
aiohttp
#
import aiohttp
import
random
from
bs4
import
BeautifulSoup
from
multiprocessing
import
Pool
...
...
@@ -45,7 +45,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
try
:
from
crawler_sys.framework.func_get_releaser_id
import
*
except
:
from
func_get_releaser_id
import
*
from
write_data_into_es.
func_get_releaser_id
import
*
class
Crawler_v_qq
():
...
...
crawler_sys/site_crawler_test/crawler_weibo.py
0 → 100644
View file @
d1db5b38
This diff is collapsed.
Click to expand it.
crawler_sys/utils/trans_strtime_to_timestamp.py
View file @
d1db5b38
...
...
@@ -130,7 +130,7 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
year
=
func_inyear
(
input_time
)
if
year
!=
str
(
0
):
input_time
=
year
+
'-'
+
input_time
real_time
=
real_time
=
int
(
datetime
.
datetime
.
strptime
(
input_time
,
real_time
=
int
(
datetime
.
datetime
.
strptime
(
input_time
,
'
%
Y-
%
m-
%
d'
)
.
timestamp
()
*
1e3
)
else
:
print
(
'error in {input_time}'
.
format
(
input_time
=
input_time
))
...
...
@@ -145,3 +145,29 @@ def trans_strtime_to_timestamp(input_time, missing_year=False):
return
real_time
def weibo_parse_time(publish_time):
    """Normalise a Weibo display time and convert it with trans_strtime_to_timestamp.

    Everything from the first '来自' marker onwards is discarded, then the
    remaining text is rewritten according to the first rule that matches:

    * contains '刚刚'  -> the current local time, formatted to the minute
    * contains '分钟'  -> the current local time minus the integer that
      precedes '分钟'
    * contains '今天'  -> today's date followed by the text from index 3 on
    * contains '月'    -> current year plus month/day/clock taken from fixed
      character positions (presumably a zero-padded 'MM月DD日 HH:MM' layout;
      verify against the crawler's real input)
    * otherwise        -> passed through unchanged

    Args:
        publish_time: time text as shown on a Weibo post.

    Returns:
        Whatever trans_strtime_to_timestamp returns for the normalised string.
    """
    minute_format = '%Y-%m-%d %H:%M'
    # Keep only the part before the '来自' marker.
    text = publish_time.split('来自')[0]
    now = datetime.datetime.now()

    if '刚刚' in text:
        normalized = now.strftime(minute_format)
    elif '分钟' in text:
        # The number of minutes is whatever precedes '分钟'.
        minutes_ago = int(text[:text.find('分钟')])
        posted_at = now - datetime.timedelta(minutes=minutes_ago)
        normalized = posted_at.strftime(minute_format)
    elif '今天' in text:
        # Skip the first three characters and keep the rest as the clock part.
        normalized = now.strftime('%Y-%m-%d') + ' ' + text[3:]
    elif '月' in text:
        # The displayed text carries no year, so the current year is used.
        date_part = '-'.join((now.strftime('%Y'), text[0:2], text[3:5]))
        normalized = date_part + ' ' + text[7:12]
    else:
        normalized = text

    return trans_strtime_to_timestamp(normalized)
if __name__ == "__main__":
    # Manual smoke check: convert a month-day string that carries no year.
    sample_result = trans_strtime_to_timestamp("06-03")
    print(sample_result)
\ No newline at end of file
requirements.txt
0 → 100644
View file @
d1db5b38
lxml
==4.5.1
requests
==2.23.0
tqdm
==4.46.1
absl-py
==0.9.0
dkl
==0.2.15
redis
==3.5.3
elasticsearch
==7.8.0
test/test_read_config.py
View file @
d1db5b38
...
...
@@ -43,7 +43,7 @@ def write_es(target_index,platform="short_video"):
'abstract'
:
'《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数。'
,
'article_id'
:
6851819988165394958
,
'releaser_id_str'
:
'toutiao_103497952048'
,
'video_img'
:
'http://p26-tt.byteimg.com/img/pgc-image/7448338f2712460d968e65062b249a40~720x380_cs.webp'
,
'id'
:
'toutiao_6851819988165394958'
,
'high_quality_flag'
:
0
,
'releaser_followers_count'
:
264759
,
'
a
id'
:
'toutiao_6851819988165394958'
,
'high_quality_flag'
:
0
,
'releaser_followers_count'
:
264759
,
'content'
:
'<h1 class="pgc-h-arrow-right">饿了吗?戳右边关注我们,每天给您送上最新出炉的娱乐硬核大餐!</h1><p>上周六(7月18日)《这就是街舞》火热开播,引来无数热爱街舞的小伙伴前来观看,除了选手们精彩的表演,最令人期待的还是四位导师的表现。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/1cde4effb617411fb66467228f5fc20e" img_width="494" img_height="585" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>《这街》由王嘉尔、钟汉良、王一博、张艺兴四位男艺人担当导师,首期节目四人带来个人开场秀进行Battle,用来确定到手毛巾数(选手晋级票)。</p><p>最终,张艺兴以第一名的成绩拿到最多毛巾,而45岁的钟汉良力压90后王一博成为倒数第二。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f7d980bc497a4f98963d76dbd5ec08cc" img_width="1582" img_height="887" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>7月21日,有网友发帖质疑,是因为王一博年纪最小,红的时间最短所以用他挽尊吗?并质疑钟汉良是否暗箱操作。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/ba8cd99c5cb2429aa46d99c5a5f62918" img_width="636" img_height="126" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>该网友提出两个疑点,一是投票阶段的视频虽然经过剪辑,却能看出最开始的时候都没人投钟汉良,最后是怎么比王一博多的?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/e0a38a7a89bc4763b551b5011def58f1" img_width="619" img_height="403" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>二是投票结束时钟汉良表情凝重,而王一博表情轻松,看起来很有信心。钟汉良有在节目里说能用耳朵听见有多少人投自己。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/546810450d7b4f0c929479eea207b3ad" img_width="855" img_height="465" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实网友有这样的猜想并不奇怪,在节目播出时便有圈内人发博表示搞不懂为什么王一博会垫底。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/0eca18a8e15243e0b03ff1e8cd90f0af" img_width="919" img_height="384" inline="0" 
alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>而对钟汉良是否能胜任《这街》导师身份的话题也持续讨论未断过。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/8831a69ccbef4b8fae65497356fedfcb" img_width="775" img_height="323" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>在大多数人心中,张艺兴、王嘉尔、王一博三个年轻人都是在街舞方面有所造诣的唱跳艺人。</p><p>钟汉良不是演员吗?</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/f55235255cee43dd8fce9f091f987176" img_width="879" img_height="518" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>诚然,如今已有45岁的钟汉良在跳舞方面已经不如年轻人有爆发力,但节目中也可以看出他有跳街舞的基础。</p><p>许多人不知道的是,他出道前其实是无线的舞蹈艺人,没靠脸吃饭的时候,他也是拥有一身舞艺,不一定比王一博差。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/7448338f2712460d968e65062b249a40" img_width="759" img_height="502" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1993年钟汉良加入TVB的舞蹈艺员训练班,学习现代舞、民族舞、芭蕾舞,而他最擅长的是拉丁舞,当时曾为梅艳芳、张国荣等天王巨星担任伴舞,更有一个当芭蕾舞者的梦想。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a5d6f1c59bad422eb4631b5d6a8feadc" img_width="506" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>但因为他生的白净,身高183cm,长相帅气,于是很快被调至艺员部拍电视剧。那段时间他连续出演《少年五虎》、《刑事侦缉档案》、《第三类法庭》为自己积累了不少观众缘。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/a05a5086d9164c909058276ce4a9f3f5" img_width="594" img_height="281" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>1995年,他获台湾唱作人邰正宵发掘,赴台湾发展并成为唱跳歌手,并出了自己的首张专辑《OREA》。</p><div class="pgc-img"><img 
src="https://p3.pstatp.com/large/pgc-image/59f1d258c5274cfebe109a2917d4b2e8" img_width="400" img_height="262" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>钟汉良当了5年的歌手都没有什么水花,直到2000年后,他开始转向内地拍剧,才开始小有名气。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/73663a4cf05841d694c42b3cb9d91be3" img_width="524" img_height="345" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>其实这期间他并不是没有尝试再回到舞台上,2006年他就参加了《舞林大会》,但这时候大家已经开始淡忘他曾是一位唱跳歌手。</p><div class="pgc-img"><img src="https://p3.pstatp.com/large/pgc-image/dcc4137db0a24d28b5595ca771a2e8f5" img_width="1311" img_height="684" inline="0" alt="45岁钟汉良Battle战胜王一博,两个细节被疑是暗箱操作?你怎么看" onerror="javascript:errorimg.call(this);"><p class="pgc-img-caption"></p></div><p>直到2015年,他成立自己的工作室,再次重拾歌手的身份,除了出个人音乐专辑外,还开过演唱会,如今加盟《这就是街舞》,可以尽情展示他的舞技,也算是一场多年来当舞者的夙愿。</p><p>他在街舞也确实玩的很开心。虽然依然有质疑的声音出现,但相信在后面的节目中能在他身上看到更多惊喜。</p><p>#钟汉良暗箱操作#、#钟汉良跳舞#、#钟汉良王一博#</p><p>作者:每天都想吃榴莲</p><p>责编:阿叉</p>'
}
_id
=
"test0"
...
...
write_data_into_es/func_get_releaser_id.py
View file @
d1db5b38
...
...
@@ -256,21 +256,21 @@ def pearvideo(releaserUrl,**kwargs):
def weibo(releaserUrl, **kwargs):
    """Extract the releaser id and container id from a Weibo profile URL.

    Supported inputs, checked in this order:

    * a URL containing ``/u/<digits>``: both ids are that digit run;
    * a URL containing ``/p/<digits>``: the container id is the full digit
      run; when the run is 15 or more characters long the releaser id is the
      run with its first six characters removed, otherwise it is the run
      itself;
    * any other string containing ``/``: both ids are the first digit run
      found in the string;
    * a bare numeric string: both ids are that string, stripped of
      surrounding whitespace.

    Args:
        releaserUrl: profile URL or bare numeric id.
        **kwargs: accepted for signature compatibility; not used here.

    Returns:
        A ``(releaser_id, containerid)`` tuple of strings, or ``None`` when
        no id can be extracted.
    """
    try:
        if "/u/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/u/(\d+)", releaserUrl)[0]
        elif "/p/" in releaserUrl:
            releaser_id = containerid = re.findall(r"/p/(\d+)", releaserUrl)[0]
            if len(releaser_id) >= 15:
                # Long /p/ ids: the releaser id is what follows the first six
                # characters; the container id keeps the full value.
                releaser_id = releaser_id[6:]
        elif "/" in releaserUrl:
            releaser_id = containerid = re.findall(r"(\d+)", releaserUrl)[0]
        else:
            # Bare id. The previous code stored the parsed value under a
            # misspelled name (``releaserid``), leaving ``releaser_id``
            # unbound, so every bare numeric id silently produced None.
            bare_id = releaserUrl.strip()
            if not re.fullmatch(r"\d+", bare_id):
                return None
            releaser_id = containerid = bare_id
        return releaser_id, containerid
    except (TypeError, AttributeError, IndexError):
        # TypeError/AttributeError: input is not a string.
        # IndexError: the URL contained no digit run to extract.
        return None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment