Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
f247fe38
Commit
f247fe38
authored
4 years ago
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
d817d81c
mr/develop/xiaohongshu
litao
No related merge requests found
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 11 additions and 8 deletions
+11
-8
xiaohongshu_to_rpc.py
crawler_sys/scheduler/xiaohongshu_to_rpc.py
+2
-1
crawler_xiaohongshu.py
crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
+9
-7
No files found.
crawler_sys/scheduler/xiaohongshu_to_rpc.py
View file @
f247fe38
...
...
@@ -5,6 +5,7 @@
# @author : litao
import
copy
import
datetime
import
hashlib
import
random
import
time
...
...
@@ -153,7 +154,7 @@ def xiaohongshu_xiaochengxu(res_json):
# print(res_json["NoteView"].get("data"))
for
comment
in
res_json
[
"data"
][
"commentList"
]:
video_dic
[
"content"
]
=
comment
[
'content'
]
video_dic
[
"platform_id"
]
=
comment
[
'id'
]
video_dic
[
"platform_id"
]
=
hashlib
.
md5
(
comment
[
'user'
][
'id'
]
+
comment
[
'content'
]
.
encode
(
"utf8"
))
.
hexdigest
()
comment_id_list_copy
=
copy
.
deepcopy
(
majiayonghu_list
)
comment_id
=
random
.
choice
(
comment_id_list_copy
)
video_dic
[
"user_id"
]
=
comment_id
...
...
This diff is collapsed.
Click to expand it.
crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
View file @
f247fe38
...
...
@@ -141,7 +141,7 @@ class Crawler_xiaohongshu():
releaser_id
=
self
.
get_releaser_id
(
releaserUrl
)
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
while
count
<=
releaser_page_num_max
and
count
<=
1
:
while
count
<=
releaser_page_num_max
:
releaserUrl
=
"https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/user/%s/notes?page=%s&page_size=15"
%
(
releaser_id
,
str
(
count
))
sign
=
releaserUrl
.
replace
(
"https://www.xiaohongshu.com"
,
""
)
+
"WSUDD"
pid
=
"X"
+
hashlib
.
md5
(
sign
.
encode
(
"utf8"
))
.
hexdigest
()
...
...
@@ -165,7 +165,7 @@ class Crawler_xiaohongshu():
time
.
sleep
(
random
.
randint
(
1
,
2
))
data_list
=
res
.
json
()
if
data_list
[
"code"
]
!=
0
or
not
data_list
[
"success"
]:
if
not
data_list
[
"data"
]:
break
if
data_list
:
print
(
"get data at releaser:
%
s page:
%
s"
%
(
releaser_id
,
count
))
...
...
@@ -179,8 +179,7 @@ class Crawler_xiaohongshu():
time_ts
=
datetime
.
datetime
.
strptime
(
info_dic
[
"time"
],
'
%
Y-
%
m-
%
d
%
H:
%
M'
)
.
timestamp
()
page_data
=
self
.
get_one_page_xiaochengxu
(
page_id
,
proxies
=
proxies_num
)
# print(page_data)
rds
.
hset
(
"xiaohongshu"
,
key
=
pid
,
value
=
json
.
dumps
(
page_data
))
rds
.
hset
(
"xiaohongshu"
,
key
=
page_id
,
value
=
json
.
dumps
(
page_data
))
yield
page_data
def
releaser_page_by_pc
(
self
,
releaserUrl
,
...
...
@@ -284,7 +283,7 @@ class Crawler_xiaohongshu():
continue
if
rds
.
hexists
(
"xiaohongshu"
,
pid
):
continue
rds
.
hset
(
"xiaohongshu"
,
key
=
pid
,
value
=
json
.
dumps
(
page_data
))
rds
.
hset
(
"xiaohongshu"
,
key
=
page_id
,
value
=
json
.
dumps
(
page_data
))
yield
page_data
# break
...
...
@@ -313,7 +312,7 @@ if __name__ == '__main__':
test
=
Crawler_xiaohongshu
()
releaserurl
=
'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list
=
[
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae"
,
#
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
"https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3"
,
"https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86"
,
"https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740"
,
...
...
@@ -418,8 +417,11 @@ if __name__ == '__main__':
'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07'
,
'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65'
,
]
count
=
0
for
url
in
url_list
:
print
(
url
)
res
=
test
.
releaser_page
(
url
,
proxies_num
=
0
)
for
r
in
res
:
print
(
r
)
count
+=
1
print
(
count
)
# pass
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment