# -*- coding:UTF-8 -*-
# @Time  : 2021/1/4 13:39
# @File  : crawler_xiaohongshu.py
# @email : litao@igengmei.com
# @author : litao
"""Crawl xiaohongshu profile pages and cache note detail pages in Redis.

For every profile URL the crawler downloads the profile page, reads the
``__INITIAL_SSR_STATE__`` JSON embedded in the HTML, fetches the detail page
of every note whose type is ``"normal"`` and stores the detail JSON in the
Redis hash ``"xiaohongshu"`` keyed by the note id.
"""
import copy
import redis
import requests
import json
import datetime
import re
# from . import bulk_write_into_es
import hashlib
import time
from selenium import webdriver
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler_sys.framework.video_fields_std import Std_fields_video
from crawler_sys.utils.output_results import output_result
from crawler.gm_upload.gm_upload import upload, upload_file
from selenium.webdriver import ActionChains

try:
    from crawler_sys.framework.func_get_releaser_id import *
except ImportError:
    from func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random
import urllib
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data

rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)

user_id_list = [
    29865245, 36426151, 36426142, 36427666, 36427661,
    36427657, 36427655, 36427634, 33524762, 33524779,
    33524697, 30963358, 31293584, 31358392, 31358396,
    31358397, 31358419, 31358448, 31358610, 31358658,
]

# Patterns locating the server-side-rendered state embedded in each page.
# They are intentionally kept exactly as the two call sites used them: the
# note page pattern spans newlines, the profile page pattern does not.
_NOTE_STATE_RE = re.compile('__INITIAL_SSR_STATE__=(.*?)</script>', flags=re.DOTALL)
_PROFILE_STATE_RE = re.compile("window.__INITIAL_SSR_STATE__=(.*?)</script>")

# Upper bound on consecutive failed profile downloads before giving up, so a
# dead profile can no longer spin the crawl loop forever.
_MAX_PROFILE_FETCH_ATTEMPTS = 3


def _load_ssr_state(pattern, page_text):
    """Return the JSON state embedded in *page_text*.

    Args:
        pattern: compiled regex whose first group captures the state text.
        page_text: HTML source of the downloaded page.

    Returns:
        The decoded JSON value, or ``None`` when the page carries no state
        or the captured text is not valid JSON.
    """
    match = pattern.search(page_text)
    if match is None:
        return None
    # The embedded object is JavaScript, not JSON: `undefined` is not a JSON
    # token. NOTE: this is a plain text replacement, so the word "undefined"
    # inside note text is rewritten as well.
    try:
        return json.loads(match.group(1).replace("undefined", "null"))
    except ValueError:
        return None


class Crawler_xiaohongshu():
    """Crawler for xiaohongshu profile pages and note detail pages."""

    def __init__(self, timeout=None, platform='xiaohongshu'):
        """Prepare crawler state and headless Chrome options.

        No browser is started here; only the options object is built.

        Args:
            timeout: request timeout stored on the instance; defaults to 10.
            platform: platform name written into ``video_data``.
        """
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        self.TotalVideo_num = None
        self.midstepurl = None
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform

        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
        # Content-setting value 2 for images.
        # NOTE(review): presumably this blocks image loading - confirm.
        prefs = {"profile.managed_default_content_settings.images": 2}
        self.chrome_options.add_experimental_option("prefs", prefs)
        # No driver is created by default; __exit__ copes with that.
        self.driver = None

    def __enter__(self):
        """Allow use as a context manager; returns the crawler itself."""
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Close the browser driver if one was attached.

        The three exception arguments are optional so that the historical
        zero-argument call ``crawler.__exit__()`` keeps working. Exceptions
        raised inside the ``with`` body are not suppressed.
        """
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.close()

    def get_one_page(self, page_id, proxies=0, cookies=None):
        """Download one note detail page and return its embedded state.

        Args:
            page_id: note id appended to the discovery item URL.
            proxies: forwarded unchanged to ``retry_get_url``.
            cookies: cookie dict forwarded to ``retry_get_url``; an empty
                dict is used when omitted.

        Returns:
            The decoded ``__INITIAL_SSR_STATE__`` value, or ``{}`` when the
            page does not contain a parsable state.
        """
        # A fresh dict per call instead of a shared mutable default.
        cookies = {} if cookies is None else cookies
        url = "https://www.xiaohongshu.com/discovery/item/%s" % page_id
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "referer": "https://www.xiaohongshu.com/explore",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        res = retry_get_url(url, headers=headers, proxies=proxies, cookies=cookies)
        json_data = _load_ssr_state(_NOTE_STATE_RE, res.text)
        return {} if json_data is None else json_data

    def get_releaser_id(self, releaserUrl):
        """Return the releaser id for *releaserUrl* via the shared helper."""
        return get_releaser_id(platform="xiaohongshu", releaserUrl=releaserUrl)

    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      releaser_page_num_max=30,
                      output_to_es_raw=False,
                      es_index=None,
                      doc_type=None,
                      output_to_es_register=False,
                      push_to_redis=False,
                      proxies_num=None,
                      **kwargs):
        """Crawl one profile and cache its not-yet-seen notes in Redis.

        Only the first profile page is processed (the loop is capped at one
        page and always requests the same URL). Notes whose ``type`` is not
        ``"normal"`` are skipped, as are notes already present in the Redis
        hash ``"xiaohongshu"``.

        Args:
            releaserUrl: profile URL; only the releaser id is taken from it.
            releaser_page_num_max: page limit; values below 1 crawl nothing.
            proxies_num: forwarded as ``proxies`` to ``retry_get_url``.
            output_to_file, filepath, output_to_es_raw, es_index, doc_type,
            output_to_es_register, push_to_redis, **kwargs: accepted for
                signature compatibility; not used by this method.

        Returns:
            None.
        """
        headers = {
            "host":"www.xiaohongshu.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-ch-ua": '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
        }
        releaser_id = self.get_releaser_id(releaserUrl)
        releaserUrl = 'http://www.xiaohongshu.com/user/profile/%s' % releaser_id
        cookie_dic = {'timestamp2': '2021010899964852bd70ca4c0c991c6c'}

        count = 1
        failed_attempts = 0
        while count <= releaser_page_num_max and count <= 1:
            print(releaserUrl)
            try:
                res = retry_get_url(releaserUrl, headers=headers,
                                    proxies=proxies_num, cookies=cookie_dic)
            except Exception as err:
                # Previously this retried forever; now it is bounded.
                failed_attempts += 1
                if failed_attempts >= _MAX_PROFILE_FETCH_ATTEMPTS:
                    print("give up on releaser %s: request failed (%s)" % (releaser_id, err))
                    return
                continue

            data_json = _load_ssr_state(_PROFILE_STATE_RE, res.text)
            if not data_json:
                # Missing or empty state: retry a bounded number of times
                # instead of looping forever or raising IndexError.
                failed_attempts += 1
                if failed_attempts >= _MAX_PROFILE_FETCH_ATTEMPTS:
                    print("give up on releaser %s: no page state found" % releaser_id)
                    return
                continue

            print("get data at releaser: %s page: %s" % (releaser_id, count))
            count += 1

            try:
                notes = data_json["Main"]["notesDetail"]
            except (KeyError, TypeError):
                print("no notes found for releaser %s" % releaser_id)
                return

            for info_dic in notes:
                if info_dic["type"] != "normal":
                    continue
                try:
                    page_data = self.get_one_page(info_dic["id"], proxies=proxies_num,
                                                  cookies=cookie_dic)
                except Exception as err:
                    # One unreachable note must not abort the whole profile.
                    print("get note page error %s" % err)
                    continue
                print(page_data)
                try:
                    pid = page_data["NoteView"]["content"]["id"]
                except (KeyError, TypeError):
                    print("get pid error")
                    continue
                if rds.hexists("xiaohongshu", pid):
                    continue
                rds.hset("xiaohongshu", key=pid, value=json.dumps(page_data))
                # NOTE(review): the original indentation of this print was
                # lost; it is assumed to run once per newly stored note.
                print(res)


if __name__ == '__main__':
    test = Crawler_xiaohongshu()
    url_list = [
        "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
        "https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
        "https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
        "https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
        "https://www.xiaohongshu.com/user/profile/5c4140500000000006006cb7",
        "https://www.xiaohongshu.com/user/profile/5bd2beff7da0890001b5408a",
        "https://www.xiaohongshu.com/user/profile/5b5edc5211be1044bcce7824",
        "https://www.xiaohongshu.com/user/profile/5b35cce84eacab52fbe15c0b",
        "https://www.xiaohongshu.com/user/profile/5efec35c000000000101d75a",
        "https://www.xiaohongshu.com/user/profile/5f91428a000000000101d909",
        "https://www.xiaohongshu.com/user/profile/5ed49f1200000000010017f0",
        "https://www.xiaohongshu.com/user/profile/5ae3f47b11be105fae4b854c",
        "https://www.xiaohongshu.com/user/profile/5a9e10fb11be1006adc5b9d5",
        "https://www.xiaohongshu.com/user/profile/5d0c3b900000000012013409",
        "https://www.xiaohongshu.com/user/profile/5f1013a70000000001005b16",
        "https://www.xiaohongshu.com/user/profile/5f5c6d860000000001001787",
        "https://www.xiaohongshu.com/user/profile/5eeb18e600000000010062b6",
        "https://www.xiaohongshu.com/user/profile/5bab62e9ee80fc0001505980",
        "https://www.xiaohongshu.com/user/profile/5f262a610000000001004ea9",
        "https://www.xiaohongshu.com/user/profile/5eb6779300000000010045f5",
        "https://www.xiaohongshu.com/user/profile/5c855374000000001202ef0c",
        "https://www.xiaohongshu.com/user/profile/5ecb6d7300000000010016a4",
        "https://www.xiaohongshu.com/user/profile/5f100b2d000000000100138d",
        "https://www.xiaohongshu.com/user/profile/5c14ae400000000006016f5d",
        "https://www.xiaohongshu.com/user/profile/5bbd28de4c26220001881cbd",
        "https://www.xiaohongshu.com/user/profile/5f86b6fc000000000100a5d8",
        "https://www.xiaohongshu.com/user/profile/5db16ca20000000001004c02",
        "https://www.xiaohongshu.com/user/profile/5ad553bb4eacab34ee9f7d4a",
        "https://www.xiaohongshu.com/user/profile/5f12cffd000000000101da61",
        "https://www.xiaohongshu.com/user/profile/596d7e4f5e87e722ff1bfd32",
        "https://www.xiaohongshu.com/user/profile/5ef17ad00000000001005e1c",
        "https://www.xiaohongshu.com/user/profile/5f75a5700000000001007679",
        "https://www.xiaohongshu.com/user/profile/5c639f59000000001000c731",
        "https://www.xiaohongshu.com/user/profile/5f865cbd0000000001002f01",
        "https://www.xiaohongshu.com/user/profile/5eccc58f000000000100753e",
        "https://www.xiaohongshu.com/user/profile/5fbe05b4000000000101c88d",
        "https://www.xiaohongshu.com/user/profile/5b7d1da7e8ac2b471ee6fef3",
        "https://www.xiaohongshu.com/user/profile/5a11b22211be101018ba7125",
        "https://www.xiaohongshu.com/user/profile/5a76c3c611be107f08bd35b3",
        "https://www.xiaohongshu.com/user/profile/5ecb6d7300000000010016a4",
        "https://www.xiaohongshu.com/user/profile/5f2539e80000000001009d9e",
        "https://www.xiaohongshu.com/user/profile/561b1fd8e4b1cf0295755d05",
        "https://www.xiaohongshu.com/user/profile/5beeba1ff7e8b93bc0405234",
        "https://www.xiaohongshu.com/user/profile/5c87785f000000001000ed51",
        "https://www.xiaohongshu.com/user/profile/5efdba65000000000101c79c",
        "https://www.xiaohongshu.com/user/profile/5507e7dfa46e9616260827f6",
        "https://www.xiaohongshu.com/user/profile/567573470bf90c27957dd73c",
        "https://www.xiaohongshu.com/user/profile/5fd1821b000000000100381a",
        "https://www.xiaohongshu.com/user/profile/5f5f6b1c00000000010064dc",
        "https://www.xiaohongshu.com/user/profile/5aea4d31e8ac2b4a44e1d2d4",
        "https://www.xiaohongshu.com/user/profile/5f39eabb00000000010076ca",
        "https://www.xiaohongshu.com/user/profile/5cda11d7000000001703780c",
        "https://www.xiaohongshu.com/user/profile/5cbc3e9f000000001701d7bf",
        "https://www.xiaohongshu.com/user/profile/5e7886930000000001003f7f",
        "https://www.xiaohongshu.com/user/profile/566fbc3550c4b435f51f637b",
        "https://www.xiaohongshu.com/user/profile/5e86cb34000000000100a223",
        "https://www.xiaohongshu.com/user/profile/558e15b2f5a263490c65cdaa",
        "https://www.xiaohongshu.com/user/profile/5d9eef320000000001001615",
        "https://www.xiaohongshu.com/user/profile/5a6ba3214eacab4eee8e627a",
        "https://www.xiaohongshu.com/user/profile/5f58cacb000000000100bdf5",
        "https://www.xiaohongshu.com/user/profile/5f954030000000000100780c",
        "https://www.xiaohongshu.com/user/profile/5f5745bf000000000100351d",
        "https://www.xiaohongshu.com/user/profile/5c74a2b9000000001002e667",
        "https://www.xiaohongshu.com/user/profile/595ee5b882ec397553103dd3",
        "https://www.xiaohongshu.com/user/profile/5a5e20324eacab30f03654fb",
        "https://www.xiaohongshu.com/user/profile/55743bedc2bdeb1a16844741",
        "https://www.xiaohongshu.com/user/profile/5f0d523800000000010056de",
        "https://www.xiaohongshu.com/user/profile/59d5b03e44363b61a050532f",
        "https://www.xiaohongshu.com/user/profile/5ebdd5f40000000001002a67",
        "https://www.xiaohongshu.com/user/profile/5f1c1b7b0000000001006cbf",
        "https://www.xiaohongshu.com/user/profile/5ae404944eacab794dfb95b1",
        "https://www.xiaohongshu.com/user/profile/5d26276a0000000012017538",
        "https://www.xiaohongshu.com/user/profile/5ed5aa8f0000000001001f1e",
        "https://www.xiaohongshu.com/user/profile/5f92cf4f000000000100a846",
        "https://www.xiaohongshu.com/user/profile/5a75d42011be10344b917ffe",
        "https://www.xiaohongshu.com/user/profile/5ccea0ff000000001002b753",
        "https://www.xiaohongshu.com/user/profile/5c4418750000000005006717",
        "https://www.xiaohongshu.com/user/profile/5ec582d60000000001005315",
        "https://www.xiaohongshu.com/user/profile/594a93835e87e72f3e2ded11",
        "https://www.xiaohongshu.com/user/profile/5b8ab07606311b000184195a",
        "https://www.xiaohongshu.com/user/profile/54e7413ea46e96122dab7674",
        "https://www.xiaohongshu.com/user/profile/5f3657900000000001002181",
        "https://www.xiaohongshu.com/user/profile/5a65d6554eacab6864e2749e",
        "https://www.xiaohongshu.com/user/profile/5a745dc911be101d9ceab748",
        "https://www.xiaohongshu.com/user/profile/59b2033550c4b45e5d43c3d9",
        "https://www.xiaohongshu.com/user/profile/59a97aaa5e87e760e012dcd0",
        "https://www.xiaohongshu.com/user/profile/5a5de03611be100219719b0f",
        "https://www.xiaohongshu.com/user/profile/5f40a5170000000001008577",
        "https://www.xiaohongshu.com/user/profile/597e82aa5e87e73c4915db81",
        "https://www.xiaohongshu.com/user/profile/580e0bc36a6a69043935369d",
        "https://www.xiaohongshu.com/user/profile/5d1a17670000000012021d8e",
        "https://www.xiaohongshu.com/user/profile/59a830be82ec39155146f421",
        "https://www.xiaohongshu.com/user/profile/55efc1b73397db0e969c8fbd",
        "https://www.xiaohongshu.com/user/profile/5c8c55220000000010005810",
        "https://www.xiaohongshu.com/user/profile/5f337df2000000000101e2b2",
        "https://www.xiaohongshu.com/user/profile/5f2111500000000001009b7b",
        "https://www.xiaohongshu.com/user/profile/59c840ff44363b497f335cd4",
        "https://www.xiaohongshu.com/user/profile/5f8e8508000000000101d70e",
        "https://www.xiaohongshu.com/user/profile/5a163e3511be10234e1abffd",
        "https://www.xiaohongshu.com/user/profile/5e71f6870000000001005e52",
        'https://www.xiaohongshu.com/user/profile/5cca9b3700000000120314c9',
        'https://www.xiaohongshu.com/user/profile/5aa0f7bae8ac2b65bfcdaf0e',
        'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
        'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
    ]
    # The list contains a repeated profile; dict.fromkeys keeps first-seen
    # order while dropping repeats so no profile is crawled twice per run.
    for url in dict.fromkeys(url_list):
        print(url)
        test.releaser_page(url, proxies_num=0)