Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
62991f3f
Commit
62991f3f
authored
4 years ago
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
6a26b3f6
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
212 additions
and
2 deletions
+212
-2
crawler_baijiahao.py
crawler_sys/site_crawler/crawler_baijiahao.py
+1
-1
crawler_weibo.py
crawler_sys/site_crawler_by_redis/crawler_weibo.py
+210
-0
crawler_v_qq.py
crawler_sys/site_crawler_test/crawler_v_qq.py
+1
-1
No files found.
crawler_sys/site_crawler/crawler_baijiahao.py
View file @
62991f3f
...
...
@@ -28,7 +28,7 @@ from crawler.crawler_sys.utils.util_logging import logged
try
:
from
crawler_sys.framework.func_get_releaser_id
import
*
except
:
from
func_get_releaser_id
import
*
from
write_data_into_es.
func_get_releaser_id
import
*
from
crawler.crawler_sys.utils.trans_duration_str_to_second
import
*
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_by_redis/crawler_weibo.py
0 → 100644
View file @
62991f3f
# -*- coding:UTF-8 -*-
# @Time : 2020/7/23 17:40
# @File : crawler_weibo.py
# @email : litao@igengmei.com
# @author : litao
import
copy
import
requests
import
re
import
datetime
,
time
import
json
import
random
# from bs4 import BeautifulSoup
from
crawler.crawler_sys.framework.video_fields_std
import
Std_fields_video
from
crawler.crawler_sys.utils.output_results
import
retry_get_url
from
crawler.crawler_sys.utils.trans_str_play_count_to_int
import
trans_play_count
from
crawler.crawler_sys.utils.trans_strtime_to_timestamp
import
weibo_parse_time
,
trans_strtime_to_timestamp
from
crawler.crawler_sys.utils.trans_duration_str_to_second
import
trans_duration
from
crawler.crawler_sys.utils.util_logging
import
logged
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
from
crawler.crawler_sys.utils.html_to_str
import
dehtml
from
write_data_into_es.func_get_releaser_id
import
*
class Crawler_weibo():
    """Crawler for Weibo releaser (user) feeds via the m.weibo.cn mobile API.

    ``releaser_page`` walks a releaser's post list page by page and yields one
    dict per post; ``releaser_page_by_time`` filters that stream to a
    ``[start_time, end_time)`` window of release timestamps.
    """

    def __init__(self, timeout=None, platform='weibo'):
        """Set up HTTP timeout, platform tag and the standard field template.

        timeout  -- per-request timeout in seconds; defaults to 10 when None.
        platform -- platform tag written into every output dict.
        """
        # PEP 8: compare to None with `is`, not `==`.
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # Remove fields that crawled weibo data doesn't have.
        pop_key_Lst = ['describe', 'repost_count', 'isOriginal', 'video_id']
        for popk in pop_key_Lst:
            self.video_data.pop(popk)

    @staticmethod
    def get_single_page(mid):
        """Fetch one post's detail page (needed for "long text" posts).

        The HTML embeds its data as ``var $render_data = [...][0]``; the JSON
        list is extracted with a regex and parsed.

        Returns a tuple (text, repost_count, comment_count, favorite_count).
        """
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # NOTE: a cookie header was used during development but the page
            # is fetched anonymously here.
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        # DOTALL lets `.*` span the multi-line embedded JSON blob.
        page_json_context = re.findall(r"render_data = (.*)\[0\]",
                                       page_res.text, flags=re.DOTALL)[0]
        page_json = json.loads(page_json_context)
        status = page_json[0]["status"]
        text = dehtml(status["text"])
        repost_count = trans_play_count(status["reposts_count"])
        comment_count = trans_play_count(status["comments_count"])
        favorite_count = trans_play_count(status["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

    def get_releaser_id(self, releaserUrl):
        """Resolve a releaser page URL to its platform releaser id.

        Delegates to the star-imported module-level ``get_releaser_id``
        helper (this method intentionally shadows it on the instance).
        """
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=10000,
                      es_index=None,
                      doc_type=None,
                      proxies_num=None):
        """Generator: yield one dict per post on the releaser's feed.

        Pages through ``m.weibo.cn/api/container/getIndex`` using the
        ``since_id`` cursor returned by each response, up to
        ``releaser_page_num_max`` pages or until the API stops returning
        cards.  Output-routing parameters (es/file/redis flags) are accepted
        for interface compatibility with sibling crawlers but unused here.

        proxies_num -- when truthy, passed through to retry_get_url as the
                       proxy pool size; otherwise no proxy is used.
        """
        print('Processing releaserUrl %s' % releaserUrl)
        # NOTE(review): assumes the releaser-id helper returns a 2-tuple
        # (releaser_id, containerid) for weibo URLs — confirm against
        # func_get_releaser_id; containerid is not used below.
        releaser_id, containerid = self.get_releaser_id(releaserUrl)
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "mweibo-pwa": "1",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
            # "x-xsrf-token" / "cookie" headers were not needed for the
            # anonymous container API and are omitted.
        }
        pagenum = 0
        has_more = True
        since_id = 0
        if releaser_id:
            while pagenum <= releaser_page_num_max and has_more:
                pagenum += 1
                time.sleep(0.5)  # be polite to the endpoint
                # containerid 107603<uid> is the mobile-web "posts" tab.
                url = ("https://m.weibo.cn/api/container/getIndex"
                       "?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}"
                       ).format(releaser_id, releaser_id, releaser_id, since_id)
                # NOTE(review): referer looks malformed (should probably be
                # ".../u/{0}?uid={0}&t=0"); kept as-is since the API accepts it.
                headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id)
                print('Page number: %d' % pagenum)
                try:
                    if proxies_num:
                        get_page = retry_get_url(url, headers=headers,
                                                 timeout=self.timeout,
                                                 proxies=proxies_num)
                    else:
                        get_page = retry_get_url(url, headers=headers,
                                                 timeout=self.timeout)
                except Exception:
                    # Request exhausted its retries — stop paging.
                    get_page = None
                    has_more = False
                if get_page and get_page.status_code == 200:
                    try:
                        page_json = get_page.json()
                        # NOTE(review): `total` is the post count reported by
                        # the API; comparing the *page* number against it is
                        # how the original stops — confirm intended.
                        total = page_json["data"]["cardlistInfo"]["total"]
                        if pagenum > total:
                            break
                        since_id = page_json["data"]["cardlistInfo"]["since_id"]
                        page_dic = page_json["data"].get("cards")
                    except Exception as e:
                        print("load data error %s" % e)
                        continue
                    if page_dic:
                        for one in page_dic:
                            # Per-card try/except: one malformed card must not
                            # kill the whole page.
                            try:
                                mblog = one.get("mblog")
                                mid = mblog.get("mid")
                                forward_text = ""
                                forward_user = ""
                                if one.get("source") == "绿洲":
                                    text_type = "绿洲"
                                elif mblog.get("retweeted_status"):
                                    text_type = "转发"
                                    forward_text = mblog.get("retweeted_status").get("raw_text")
                                    forward_user = mblog.get("retweeted_status").get("user").get("screen_name")
                                else:
                                    text_type = one.get("source")
                                if mblog.get("isLongText"):
                                    # Long posts are truncated in the feed;
                                    # fetch the detail page for the full text.
                                    # NOTE(review): the counts returned here
                                    # are discarded — res_dic below uses the
                                    # (possibly stale) feed counts.
                                    text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
                                else:
                                    text = mblog["raw_text"]
                                res_dic = {
                                    "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
                                    "url": one["scheme"],
                                    "releaser": mblog["user"]["screen_name"],
                                    "repost_count": trans_play_count(mblog["reposts_count"]),
                                    "comment_count": trans_play_count(mblog["comments_count"]),
                                    "favorite_count": trans_play_count(mblog["attitudes_count"]),
                                    # strip zero-width spaces weibo embeds
                                    "title": text.replace("\u200b", ""),
                                    "wb_type": text_type,
                                    "forward_user": forward_user,
                                    "forward_text": forward_text,
                                    "mid": mid,
                                    "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
                                    "releaser_id_str": "weibo_%s" % releaser_id,
                                }
                                yield res_dic
                            except Exception as e:
                                print(mblog)
                                print("row format error %s" % e)
                                continue

    def get_releaser_follower_num(self, releaserUrl):
        """Not implemented for weibo (interface parity with sibling crawlers)."""
        pass

    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
        """Generator: posts from ``releaser_page`` filtered to a time window.

        start_time/end_time -- millisecond timestamps bounding release_time.
        allow -- number of posts older than start_time tolerated (e.g. pinned
                 posts) before the crawl is aborted.
        kwargs -- only ``proxies_num`` is forwarded to releaser_page.
        """
        count_false = 0
        for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            if video_time:
                if start_time < video_time:
                    if video_time < end_time:
                        yield res
                else:
                    # Post predates the window: tolerate up to `allow` of
                    # them (still yielded), then stop crawling backwards.
                    count_false += 1
                    if count_false > allow:
                        break
                    else:
                        yield res
if __name__ == '__main__':
    test = Crawler_weibo()
    url_list = [
        # "https://weibo.com/u/1764615662",
        # "https://weibo.com/u/3662247177",
        # "https://weibo.com/u/2378564111",
        # "https://weibo.com/u/2983578965",
        # "https://weibo.com/u/3938976579",
        # "https://weibo.com/u/6511177474",
        "https://weibo.com/u/6511173721",
        # "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place",
    ]
    for u in url_list:
        # BUGFIX: releaser_page_by_time requires the positional `allow`
        # argument (the original call omitted it, raising TypeError), and it
        # is a generator — it does nothing unless iterated.
        for res in test.releaser_page_by_time(1590940800000, 1595468554268, u, 10,
                                              output_to_es_raw=False,
                                              es_index='crawler-data-raw',
                                              doc_type='doc',
                                              releaser_page_num_max=4000):
            print(res)
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_test/crawler_v_qq.py
View file @
62991f3f
...
...
@@ -1011,7 +1011,7 @@ class Crawler_v_qq():
count
=
0
pid
=
os
.
getpid
()
redis_key
=
connect_with_redis
.
platform_redis_lst_reg
[
self
.
platform
]
while
connect_with_redis
.
length_of_lst
(
lst_
key
=
redis_key
)
>
0
:
while
connect_with_redis
.
length_of_lst
(
key
=
redis_key
)
>
0
:
resp_str
=
connect_with_redis
.
retrieve_video_html_from_redis_renew
(
self
.
platform
)
video_dic
=
self
.
parse_video_html_to_renew_play_count
(
resp_str
)
if
video_dic
is
not
None
:
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment