Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
C
crawler
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
backend
crawler
Commits
0aaa57e6
Commit
0aaa57e6
authored
4 years ago
by
litaolemo
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
5223e9cc
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
11 additions
and
10 deletions
+11
-10
crawler_xiaohongshu.py
crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
+11
-10
No files found.
crawler_sys/site_crawler_by_redis/crawler_xiaohongshu.py
View file @
0aaa57e6
...
...
@@ -16,7 +16,6 @@ import requests
import
json
import
datetime
import
re
# from . import bulk_write_into_es
import
hashlib
import
time
from
selenium
import
webdriver
...
...
@@ -29,12 +28,11 @@ from selenium import webdriver
try
:
from
write_data_into_es.func_get_releaser_id
import
*
except
:
from
func_get_releaser_id
import
*
from
crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili
import
get_proxy
from
crawler.write_data_into_es.
func_get_releaser_id
import
*
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from
crawler.crawler_sys.utils.trans_str_play_count_to_int
import
trans_play_count
import
random
,
urllib
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
rds
=
redis
.
StrictRedis
(
host
=
'172.18.51.10'
,
port
=
6379
,
db
=
20
,
decode_responses
=
True
)
class
Crawler_xiaohongshu
():
...
...
@@ -69,16 +67,16 @@ class Crawler_xiaohongshu():
# self.chrome_options.add_argument('sec-fetch-user="?1"')
# self.chrome_options.add_argument('upgrade-insecure-requests="1"')
self
.
chrome_options
.
add_experimental_option
(
'excludeSwitches'
,
[
'enable-automation'
])
#
self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
self
.
timestamp
=
str
(
datetime
.
datetime
.
now
()
.
timestamp
()
*
1e3
)
prefs
=
{
"profile.managed_default_content_settings.images"
:
2
}
self
.
chrome_options
.
add_experimental_option
(
"prefs"
,
prefs
)
#
self.chrome_options.add_experimental_option("prefs", prefs)
# self.driver = webdriver.Chrome(options=self.chrome_options)
def
__exit__
(
self
):
self
.
driver
.
close
()
#
self.driver.close()
pass
def
get_one_page_xiaochengxu
(
self
,
page_id
,
proxies
=
0
):
url
=
"https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/
%
s/single_feed"
%
page_id
...
...
@@ -178,8 +176,10 @@ class Crawler_xiaohongshu():
continue
time_ts
=
datetime
.
datetime
.
strptime
(
info_dic
[
"time"
],
'
%
Y-
%
m-
%
d
%
H:
%
M'
)
.
timestamp
()
page_data
=
self
.
get_one_page_xiaochengxu
(
page_id
,
proxies
=
proxies_num
)
page_data
[
'release_time'
]
=
int
(
time_ts
*
1e3
)
page_data
[
'platform'
]
=
'xiaohongshu'
# print(page_data)
rds
.
hset
(
"xiaohongshu"
,
key
=
page_id
,
value
=
json
.
dumps
(
page_data
))
#
rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
yield
page_data
def
releaser_page_by_pc
(
self
,
releaserUrl
,
...
...
@@ -312,7 +312,7 @@ if __name__ == '__main__':
test
=
Crawler_xiaohongshu
()
releaserurl
=
'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list
=
[
#
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae"
,
"https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3"
,
"https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86"
,
"https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740"
,
...
...
@@ -417,6 +417,7 @@ if __name__ == '__main__':
'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07'
,
'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65'
,
]
print
(
len
(
url_list
))
count
=
0
for
url
in
url_list
:
print
(
url
)
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment