Commit 0aaa57e6 authored by litaolemo's avatar litaolemo

update

parent 5223e9cc
......@@ -16,7 +16,6 @@ import requests
import json
import datetime
import re
# from . import bulk_write_into_es
import hashlib
import time
from selenium import webdriver
......@@ -29,12 +28,11 @@ from selenium import webdriver
try:
from write_data_into_es.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.write_data_into_es.func_get_releaser_id import *
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random, urllib
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
class Crawler_xiaohongshu():
......@@ -69,16 +67,16 @@ class Crawler_xiaohongshu():
# self.chrome_options.add_argument('sec-fetch-user="?1"')
# self.chrome_options.add_argument('upgrade-insecure-requests="1"')
self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
prefs = {"profile.managed_default_content_settings.images": 2}
self.chrome_options.add_experimental_option("prefs", prefs)
# self.chrome_options.add_experimental_option("prefs", prefs)
# self.driver = webdriver.Chrome(options=self.chrome_options)
def __exit__(self):
    """Teardown hook for the crawler's Selenium driver.

    NOTE(review): this text is a flattened diff view — both the live
    ``self.driver.close()`` line and its commented-out twin are shown
    with the +/- prefixes stripped. Judging by the other hunks in this
    commit (which comment things out), the committed version most
    likely has the close() call disabled, leaving this a no-op — but
    confirm against the actual file.

    NOTE(review): the context-manager protocol expects the signature
    ``__exit__(self, exc_type, exc_value, traceback)``; as written this
    would raise TypeError if used in a ``with`` block. Possibly it is
    only ever called manually — verify callers.
    """
    self.driver.close()
    # self.driver.close()
    pass
def get_one_page_xiaochengxu(self, page_id, proxies=0):
url = "https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/note/%s/single_feed" % page_id
......@@ -178,8 +176,10 @@ class Crawler_xiaohongshu():
continue
time_ts = datetime.datetime.strptime(info_dic["time"], '%Y-%m-%d %H:%M').timestamp()
page_data = self.get_one_page_xiaochengxu(page_id, proxies=proxies_num)
page_data['release_time'] = int(time_ts*1e3)
page_data['platform'] = 'xiaohongshu'
# print(page_data)
rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
# rds.hset("xiaohongshu", key=page_id, value=json.dumps(page_data))
yield page_data
def releaser_page_by_pc(self, releaserUrl,
......@@ -312,7 +312,7 @@ if __name__ == '__main__':
test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
url_list = [
# "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
"https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
"https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
"https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
"https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
......@@ -417,6 +417,7 @@ if __name__ == '__main__':
'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
]
print(len(url_list))
count =0
for url in url_list:
print(url)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment