Commit bc83352f authored by litaolemo

update

parent 8ae3640f
......@@ -42,7 +42,7 @@ rds = func_get_redis()
def get_proxy_from_redis():
try:
one_proxy = rds.randomkey()
proxies = {"http": "http://{}".format(one_proxy), "https": "https://{}".format(one_proxy)}
proxies = {"http": "http://{}".format(one_proxy), "https": "http://{}".format(one_proxy)}
return proxies
except Exception as e:
print(e)
......@@ -100,7 +100,7 @@ def func_get_proxy_to_redis():
rds.set(key, key, ex=int(seconds[key]) - 3)
except Exception as e:
print(e)
return ips[0]
return {"http": "http://{}".format(ips[0]), "https": "http://{}".format(ips[0])}
def proxy_test(proxies):
......
......@@ -27,7 +27,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random, urllib
from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
user_id_list = [29865245,
36426151,
......@@ -64,7 +64,7 @@ class Crawler_xiaohongshu():
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
self.chrome_options = webdriver.ChromeOptions()
self.chrome_options.add_argument('--headless')
# self.chrome_options.add_argument('--headless')
# self.chrome_options.add_argument('--disable-gpu')
# self.chrome_options.add_argument("--no-sandbox")
self.chrome_options.add_argument('User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"')
......@@ -82,7 +82,7 @@ class Crawler_xiaohongshu():
self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
prefs = {"profile.managed_default_content_settings.images": 2}
self.chrome_options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(options=self.chrome_options)
# self.driver = webdriver.Chrome(options=self.chrome_options)
def __exit__(self):
self.driver.close()
......@@ -137,30 +137,30 @@ class Crawler_xiaohongshu():
doc_type=None,
output_to_es_register=False,
push_to_redis=False, proxies_num=None, **kwargs):
self.driver.get("https://www.xiaohongshu.com/")
self.driver.implicitly_wait(2)
time.sleep(1)
self.driver.get(releaserUrl)
self.driver.implicitly_wait(2)
time.sleep(1)
self.driver.refresh()
page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
print(page_element)
cookie = self.driver.get_cookies()
# self.driver.get("https://www.xiaohongshu.com/")
# self.driver.implicitly_wait(2)
# time.sleep(1)
# self.driver.get(releaserUrl)
# self.driver.implicitly_wait(2)
# time.sleep(1)
#
#
# self.driver.refresh()
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# print(page_element)
# cookie = self.driver.get_cookies()
# print(self.driver.get_log("performance"))
cookie_dic={}
for k in cookie:
cookie_dic[k["name"]] = k["value"]
print(cookie_dic)
# cookie_dic={}
# for k in cookie:
# cookie_dic[k["name"]] = k["value"]
# print(cookie_dic)
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": "xhsTracker=url=user-profile&xhsshare=CopyLink; xhsTrackerId=38ec6dcb-d882-45e8-c539-834386696a14; xhsuid=Dv4OTnGbDg9LivGu; timestamp2=202101062497d4bed842476b2618e0ea; timestamp2.sig=-Jax1vd_iNZtToaWYMOMoFUmCJwojKQPnfP8iMeOpAc; extra_exp_ids=gif_clt1,ques_clt1; xhs_spses.5dde=*; xhs_spid.5dde=59a50d47116c4333.1609921946.2.1609997747.1609922354.16a344a9-3e6e-43d8-b894-33a7205e181f",
# "cookie": "xhsTracker=url=user-profile&xhsshare=CopyLink; xhsTrackerId=38ec6dcb-d882-45e8-c539-834386696a14; xhsuid=Dv4OTnGbDg9LivGu; timestamp2=202101062497d4bed842476b2618e0ea; timestamp2.sig=-Jax1vd_iNZtToaWYMOMoFUmCJwojKQPnfP8iMeOpAc; extra_exp_ids=gif_clt1,ques_clt1; xhs_spses.5dde=*; xhs_spid.5dde=59a50d47116c4333.1609921946.2.1609997747.1609922354.16a344a9-3e6e-43d8-b894-33a7205e181f",
"referer": releaserUrl,
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
......@@ -178,12 +178,12 @@ class Crawler_xiaohongshu():
releaserUrl = 'https://www.xiaohongshu.com/user/profile/%s' % releaser_id
self.video_data['releaserUrl'] = releaserUrl
pcursor = 0
cookie_dic = {'timestamp2.sig': 'QaPtkKr8VeAbx324ZSJgUSeLhjE2Lj1kDhdmZReaewo', 'timestamp2': '20210108b8c577995da3b1aa5e9a7392', 'xhsuid': 'cqq3glNpFsMgH50j', 'xhs_spses.5dde': '*', 'xhs_spid.5dde': 'fa1043ce96194610.1610072893.1.1610072895.1610072893.3536bab9-1e85-4a3a-8a46-37e694100de1', 'extra_exp_ids': 'gif_clt1,ques_clt1', 'xhsTrackerId': '591fba69-1884-4ab2-ca05-9ae70ab77d2e'}
# print(proxies)
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
while count <= releaser_page_num_max and count <= 1000:
try:
res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num)
res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num,cookies=cookie_dic)
except:
continue
# print(get_page.content)
......@@ -216,11 +216,11 @@ class Crawler_xiaohongshu():
# video_dic["content"] = anwser
video_dic["user_id"] = random.choice(user_id_list)
video_dic["create_time"] = time_ts
rpc_res = post_data(video_dic,"cims/question/batch_create")
# rpc_res = post_data(video_dic,"cims/question/batch_create")
print(res)
break
if __name__ == '__main__':
test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
res = test.releaser_page(releaserurl,proxies_num=0)
res = test.releaser_page(releaserurl,proxies_num=1)
......@@ -338,6 +338,7 @@ def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
try:
if proxies:
proxies_dic = get_proxy(proxies)
print(proxies_dic)
if not proxies_dic:
get_resp = requests.get(url, timeout=timeout, **kwargs)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment