Commit e8a80929 authored by litaolemo

update

parent 1d640c63
...@@ -17,7 +17,8 @@ from crawler.crawler_sys.utils.output_results import retry_get_url ...@@ -17,7 +17,8 @@ from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler_sys.framework.video_fields_std import Std_fields_video from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import output_result from crawler_sys.utils.output_results import output_result
from crawler.gm_upload.gm_upload import upload, upload_file from crawler.gm_upload.gm_upload import upload, upload_file
from selenium.webdriver import ActionChains
from selenium import webdriver
try: try:
from crawler_sys.framework.func_get_releaser_id import * from crawler_sys.framework.func_get_releaser_id import *
except: except:
...@@ -62,18 +63,31 @@ class Crawler_xiaohongshu(): ...@@ -62,18 +63,31 @@ class Crawler_xiaohongshu():
std_fields = Std_fields_video() std_fields = Std_fields_video()
self.video_data = std_fields.video_data self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform self.video_data['platform'] = self.platform
unused_key_list = ['channel', 'describe', 'repost_count', 'isOriginal'] self.chrome_options = webdriver.ChromeOptions()
for key in unused_key_list: # self.chrome_options.add_argument('--headless')
self.video_data.pop(key) self.chrome_options.add_argument('--disable-gpu')
# self.chrome_options.add_argument("--start-maximized")
self.chrome_options.add_argument("--no-sandbox")
self.chrome_options.add_argument(
'User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"')
self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
prefs = {"profile.managed_default_content_settings.images": 2}
self.chrome_options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(options=self.chrome_options)
def get_one_page(self, page_id, proxies=0): def __exit__(self):
self.driver.close()
def get_one_page(self, page_id, proxies=0,cookies={}):
url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br", "accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9", "accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"cookie": "timestamp2=202101062497d4bed842476b2618e0ea;", # "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
"referer": "https://www.xiaohongshu.com/explore", "referer": "https://www.xiaohongshu.com/explore",
"sec-fetch-dest": "document", "sec-fetch-dest": "document",
"sec-fetch-mode": "navigate", "sec-fetch-mode": "navigate",
...@@ -82,7 +96,7 @@ class Crawler_xiaohongshu(): ...@@ -82,7 +96,7 @@ class Crawler_xiaohongshu():
"upgrade-insecure-requests": "1", "upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1", "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
} }
res = retry_get_url(url, headers=headers, proxies=proxies) res = retry_get_url(url, headers=headers, proxies=proxies,cookies=cookies)
res_text = res.text res_text = res.text
res_json_text = re.findall('__INITIAL_SSR_STATE__=(.*?)</script>',res_text, flags=re.DOTALL)[0] res_json_text = re.findall('__INITIAL_SSR_STATE__=(.*?)</script>',res_text, flags=re.DOTALL)[0]
# scope = {} # scope = {}
...@@ -116,12 +130,22 @@ class Crawler_xiaohongshu(): ...@@ -116,12 +130,22 @@ class Crawler_xiaohongshu():
doc_type=None, doc_type=None,
output_to_es_register=False, output_to_es_register=False,
push_to_redis=False, proxies_num=None, **kwargs): push_to_redis=False, proxies_num=None, **kwargs):
self.driver.get(releaserUrl)
self.driver.implicitly_wait(1)
self.driver.get(releaserUrl)
self.driver.implicitly_wait(1)
cookie = self.driver.get_cookies()
# print(self.driver.get_log("performance"))
cookie_dic={}
for k in cookie:
cookie_dic[k["name"]] = k["value"]
headers = { headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate", "accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9", "accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0", "cache-control": "max-age=0",
"cookie": "timestamp2=202101062497d4bed842476b2618e0ea;", # "cookie": "timestamp2=202101062497d4bed842476b2618e0ea;",
"referer": releaserUrl, "referer": releaserUrl,
"sec-fetch-dest": "document", "sec-fetch-dest": "document",
"sec-fetch-mode": "navigate", "sec-fetch-mode": "navigate",
...@@ -144,7 +168,7 @@ class Crawler_xiaohongshu(): ...@@ -144,7 +168,7 @@ class Crawler_xiaohongshu():
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'} # proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
while count <= releaser_page_num_max and count <= 1000: while count <= releaser_page_num_max and count <= 1000:
try: try:
res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num) res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num,cookies=cookie_dic)
except: except:
continue continue
# print(get_page.content) # print(get_page.content)
...@@ -166,7 +190,7 @@ class Crawler_xiaohongshu(): ...@@ -166,7 +190,7 @@ class Crawler_xiaohongshu():
time_ts = datetime.datetime.strptime(info_dic["time"],'%Y-%m-%d %H:%M').timestamp() time_ts = datetime.datetime.strptime(info_dic["time"],'%Y-%m-%d %H:%M').timestamp()
if info_dic["type"] != "normal": if info_dic["type"] != "normal":
continue continue
page_data = self.get_one_page(page_id,proxies=proxies_num) page_data = self.get_one_page(page_id,proxies=proxies_num,cookies=cookie)
print(page_data) print(page_data)
title = title title = title
anwser = desc anwser = desc
......
...@@ -42,6 +42,6 @@ def post_data(data_dict:typing.Dict,rpc_type:str) -> typing.Dict: ...@@ -42,6 +42,6 @@ def post_data(data_dict:typing.Dict,rpc_type:str) -> typing.Dict:
data = { data = {
'requests': '[{"params": {"replies": [{%s}]}, "method": "%s, "timeout": 120}]' % (str(data_dict),rpc_type) 'requests': '[{"params": {"replies": [{%s}]}, "method": "%s, "timeout": 120}]' % (str(data_dict),rpc_type)
} }
response = requests.post('http://127.0.0.1:8003/v1/batch', headers=headers, data=data) response = requests.post('http://cims-qa.paas-develop.env/v1/batch', headers=headers, data=data)
print(response.text) print(response.text)
return response.json() return response.json()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment