Commit eddb0d24 authored by litaolemo

update

parent 7183d859
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
# @author : litao # @author : litao
import copy import copy
import redis
import requests import requests
import json import json
import datetime import datetime
...@@ -28,7 +29,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro ...@@ -28,7 +29,7 @@ from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_pro
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
import random, urllib import random, urllib
# from crawler.crawler_sys.utils.rpc_data_to_answer import post_data # from crawler.crawler_sys.utils.rpc_data_to_answer import post_data
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=17, decode_responses=True)
user_id_list = [29865245, user_id_list = [29865245,
36426151, 36426151,
36426142, 36426142,
...@@ -67,11 +68,11 @@ class Crawler_xiaohongshu(): ...@@ -67,11 +68,11 @@ class Crawler_xiaohongshu():
self.chrome_options.add_argument('--headless') self.chrome_options.add_argument('--headless')
self.chrome_options.add_argument('--disable-gpu') self.chrome_options.add_argument('--disable-gpu')
self.chrome_options.add_argument("--no-sandbox") self.chrome_options.add_argument("--no-sandbox")
proxies_dic = get_proxy(1) # proxies_dic = get_proxy(1)
proxies_dic_list = proxies_dic["http"].split(":") # proxies_dic_list = proxies_dic["http"].split(":")
proxy_server = '--proxy-server=http://{ip}:{port}'.format(ip=proxies_dic_list[1].replace("/",""), port=int(proxies_dic_list[2])) # proxy_server = '--proxy-server=http://{ip}:{port}'.format(ip=proxies_dic_list[1].replace("/",""), port=int(proxies_dic_list[2]))
print(proxy_server) # print(proxy_server)
self.chrome_options.add_argument(proxy_server) # self.chrome_options.add_argument(proxy_server)
# self.chrome_options.add_argument('User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"') # self.chrome_options.add_argument('User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"')
# self.chrome_options.add_argument('accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"') # self.chrome_options.add_argument('accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"')
# self.chrome_options.add_argument('accept-encoding="gzip, deflate"') # self.chrome_options.add_argument('accept-encoding="gzip, deflate"')
...@@ -87,7 +88,7 @@ class Crawler_xiaohongshu(): ...@@ -87,7 +88,7 @@ class Crawler_xiaohongshu():
self.timestamp = str(datetime.datetime.now().timestamp() * 1e3) self.timestamp = str(datetime.datetime.now().timestamp() * 1e3)
prefs = {"profile.managed_default_content_settings.images": 2} prefs = {"profile.managed_default_content_settings.images": 2}
self.chrome_options.add_experimental_option("prefs", prefs) self.chrome_options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(options=self.chrome_options) # self.driver = webdriver.Chrome(options=self.chrome_options)
def __exit__(self): def __exit__(self):
...@@ -143,33 +144,33 @@ class Crawler_xiaohongshu(): ...@@ -143,33 +144,33 @@ class Crawler_xiaohongshu():
doc_type=None, doc_type=None,
output_to_es_register=False, output_to_es_register=False,
push_to_redis=False, proxies_num=None, **kwargs): push_to_redis=False, proxies_num=None, **kwargs):
self.driver.get("https://www.xiaohongshu.com/") # self.driver.get("https://www.xiaohongshu.com/")
time.sleep(1)
self.driver.implicitly_wait(2)
self.driver.add_cookie(cookie_dict={'name': 'timestamp2', 'value': '2021010899964852bd70ca4c0c991c6c'})
page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
print(page_element)
self.driver.get("https://www.xiaohongshu.com/explore")
self.driver.implicitly_wait(2)
time.sleep(1)
page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
print(page_element)
self.driver.find_element_by_xpath("/html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]").click() # time.sleep(1)
self.driver.implicitly_wait(2) # self.driver.implicitly_wait(2)
time.sleep(1) # self.driver.add_cookie(cookie_dict={'name': 'timestamp2', 'value': '2021010899964852bd70ca4c0c991c6c'})
page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML") # page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
print(page_element) # # print(page_element)
# self.driver.get("https://www.xiaohongshu.com/explore")
# self.driver.implicitly_wait(2)
# time.sleep(1)
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# # print(page_element)
#
# self.driver.find_element_by_xpath("/html[1]/body[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]").click()
# self.driver.implicitly_wait(2)
# time.sleep(1)
# page_element = self.driver.find_element_by_xpath("//*").get_attribute("outerHTML")
# print(page_element)
cookie = self.driver.get_cookies() # cookie = self.driver.get_cookies()
# #
# return True # return True
# print(self.driver.get_log("performance")) # print(self.driver.get_log("performance"))
cookie_dic={} # cookie_dic={}
for k in cookie: # for k in cookie:
cookie_dic[k["name"]] = k["value"] # cookie_dic[k["name"]] = k["value"]
print(cookie_dic) # print(cookie_dic)
headers = { headers = {
"host":"www.xiaohongshu.com", "host":"www.xiaohongshu.com",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
...@@ -196,10 +197,10 @@ class Crawler_xiaohongshu(): ...@@ -196,10 +197,10 @@ class Crawler_xiaohongshu():
releaser_id = self.get_releaser_id(releaserUrl) releaser_id = self.get_releaser_id(releaserUrl)
releaserUrl = 'http://www.xiaohongshu.com/user/profile/%s' % releaser_id releaserUrl = 'http://www.xiaohongshu.com/user/profile/%s' % releaser_id
pcursor = 0 pcursor = 0
# cookie_dic = {'timestamp2.sig': 'QaPtkKr8VeAbx324ZSJgUSeLhjE2Lj1kDhdmZReaewo', 'timestamp2': '20210108b8c577995da3b1aa5e9a7392', 'xhsuid': 'cqq3glNpFsMgH50j', 'xhs_spses.5dde': '*', 'xhs_spid.5dde': 'fa1043ce96194610.1610072893.1.1610072895.1610072893.3536bab9-1e85-4a3a-8a46-37e694100de1', 'extra_exp_ids': 'gif_clt1,ques_clt1', 'xhsTrackerId': '591fba69-1884-4ab2-ca05-9ae70ab77d2e'} cookie_dic = {'timestamp2': '2021010899964852bd70ca4c0c991c6c', 'xhsuid': 'cqq3glNpFsMgH50j', 'xhs_spses.5dde': '*', 'xhs_spid.5dde': 'fa1043ce96194610.1610072893.1.1610072895.1610072893.3536bab9-1e85-4a3a-8a46-37e694100de1', 'extra_exp_ids': 'gif_clt1,ques_clt1', 'xhsTrackerId': '591fba69-1884-4ab2-ca05-9ae70ab77d2e'}
# print(proxies) # print(proxies)
# proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'} # proxies = {'http': 'http://hanye:i9mmu0a3@58.55.159.141:16085/', 'https': 'http://hanye:i9mmu0a3@58.55.159.141:16085/'}
while count <= releaser_page_num_max and count <= 1000: while count <= releaser_page_num_max and count <= 1:
try: try:
print(releaserUrl) print(releaserUrl)
res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num, cookies=cookie_dic) res = retry_get_url(releaserUrl, headers=headers, proxies=proxies_num, cookies=cookie_dic)
...@@ -209,7 +210,7 @@ class Crawler_xiaohongshu(): ...@@ -209,7 +210,7 @@ class Crawler_xiaohongshu():
# time.sleep(random.randint(1, 2)) # time.sleep(random.randint(1, 2))
page_text = res.text page_text = res.text
print(page_text) # print(page_text)
data_list = re.findall("window.__INITIAL_SSR_STATE__=(.*?)</script>", page_text)[0] data_list = re.findall("window.__INITIAL_SSR_STATE__=(.*?)</script>", page_text)[0]
# print(data_list) # print(data_list)
data_json = json.loads(data_list.replace("undefined","null")) data_json = json.loads(data_list.replace("undefined","null"))
...@@ -229,18 +230,133 @@ class Crawler_xiaohongshu(): ...@@ -229,18 +230,133 @@ class Crawler_xiaohongshu():
print(page_data) print(page_data)
title = title title = title
anwser = desc anwser = desc
pid = page_data["NoteView"]["content"]["id"] try:
video_dic["platform"] = "9" pid = page_data["NoteView"]["content"]["id"]
video_dic["platform_id"] = pid except:
video_dic["title"] = title print("get pid error")
# video_dic["content"] = anwser continue
video_dic["user_id"] = random.choice(user_id_list) if rds.hexists("xiaohongshu",pid):
video_dic["create_time"] = time_ts continue
rds.hset("xiaohongshu",key=pid,value=json.dumps(page_data))
# video_dic["platform"] = "9"
# video_dic["platform_id"] = pid
# video_dic["title"] = title
# # video_dic["content"] = anwser
# video_dic["user_id"] = random.choice(user_id_list)
# video_dic["create_time"] = time_ts
# rpc_res = post_data(video_dic,"cims/question/batch_create") # rpc_res = post_data(video_dic,"cims/question/batch_create")
print(res) print(res)
break # break
if __name__ == '__main__': if __name__ == '__main__':
test = Crawler_xiaohongshu() test = Crawler_xiaohongshu()
releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae' releaserurl = 'https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae'
res = test.releaser_page(releaserurl,proxies_num=1) url_list =[
# "https://www.xiaohongshu.com/user/profile/5abbb57211be1027a0c880ae",
# "https://www.xiaohongshu.com/user/profile/5ea6909900000000010057a3",
# "https://www.xiaohongshu.com/user/profile/5a03b1f4b1da1412dd070a86",
# "https://www.xiaohongshu.com/user/profile/5b6e76419276ee0001bd5740",
# "https://www.xiaohongshu.com/user/profile/5c4140500000000006006cb7",
# "https://www.xiaohongshu.com/user/profile/5bd2beff7da0890001b5408a",
# "https://www.xiaohongshu.com/user/profile/5b5edc5211be1044bcce7824",
# "https://www.xiaohongshu.com/user/profile/5b35cce84eacab52fbe15c0b",
# "https://www.xiaohongshu.com/user/profile/5efec35c000000000101d75a",
# "https://www.xiaohongshu.com/user/profile/5f91428a000000000101d909",
# "https://www.xiaohongshu.com/user/profile/5ed49f1200000000010017f0",
# "https://www.xiaohongshu.com/user/profile/5ae3f47b11be105fae4b854c",
# "https://www.xiaohongshu.com/user/profile/5a9e10fb11be1006adc5b9d5",
# "https://www.xiaohongshu.com/user/profile/5d0c3b900000000012013409",
# "https://www.xiaohongshu.com/user/profile/5f1013a70000000001005b16",
# "https://www.xiaohongshu.com/user/profile/5f5c6d860000000001001787",
# "https://www.xiaohongshu.com/user/profile/5eeb18e600000000010062b6",
# "https://www.xiaohongshu.com/user/profile/5bab62e9ee80fc0001505980",
# "https://www.xiaohongshu.com/user/profile/5f262a610000000001004ea9",
# "https://www.xiaohongshu.com/user/profile/5eb6779300000000010045f5",
# "https://www.xiaohongshu.com/user/profile/5c855374000000001202ef0c",
# "https://www.xiaohongshu.com/user/profile/5ecb6d7300000000010016a4",
# "https://www.xiaohongshu.com/user/profile/5f100b2d000000000100138d",
# "https://www.xiaohongshu.com/user/profile/5c14ae400000000006016f5d",
# "https://www.xiaohongshu.com/user/profile/5bbd28de4c26220001881cbd",
# "https://www.xiaohongshu.com/user/profile/5f86b6fc000000000100a5d8",
# "https://www.xiaohongshu.com/user/profile/5db16ca20000000001004c02",
# "https://www.xiaohongshu.com/user/profile/5ad553bb4eacab34ee9f7d4a",
# "https://www.xiaohongshu.com/user/profile/5f12cffd000000000101da61",
# "https://www.xiaohongshu.com/user/profile/596d7e4f5e87e722ff1bfd32",
# "https://www.xiaohongshu.com/user/profile/5ef17ad00000000001005e1c",
# "https://www.xiaohongshu.com/user/profile/5f75a5700000000001007679",
# "https://www.xiaohongshu.com/user/profile/5c639f59000000001000c731",
# "https://www.xiaohongshu.com/user/profile/5f865cbd0000000001002f01",
# "https://www.xiaohongshu.com/user/profile/5eccc58f000000000100753e",
# "https://www.xiaohongshu.com/user/profile/5fbe05b4000000000101c88d",
# "https://www.xiaohongshu.com/user/profile/5b7d1da7e8ac2b471ee6fef3",
# "https://www.xiaohongshu.com/user/profile/5a11b22211be101018ba7125",
# "https://www.xiaohongshu.com/user/profile/5a76c3c611be107f08bd35b3",
# "https://www.xiaohongshu.com/user/profile/5ecb6d7300000000010016a4",
# "https://www.xiaohongshu.com/user/profile/5f2539e80000000001009d9e",
# "https://www.xiaohongshu.com/user/profile/561b1fd8e4b1cf0295755d05",
# "https://www.xiaohongshu.com/user/profile/5beeba1ff7e8b93bc0405234",
# "https://www.xiaohongshu.com/user/profile/5c87785f000000001000ed51",
# "https://www.xiaohongshu.com/user/profile/5efdba65000000000101c79c",
# "https://www.xiaohongshu.com/user/profile/5507e7dfa46e9616260827f6",
# "https://www.xiaohongshu.com/user/profile/567573470bf90c27957dd73c",
# "https://www.xiaohongshu.com/user/profile/5fd1821b000000000100381a",
# "https://www.xiaohongshu.com/user/profile/5f5f6b1c00000000010064dc",
# "https://www.xiaohongshu.com/user/profile/5aea4d31e8ac2b4a44e1d2d4",
# "https://www.xiaohongshu.com/user/profile/5f39eabb00000000010076ca",
# "https://www.xiaohongshu.com/user/profile/5cda11d7000000001703780c",
# "https://www.xiaohongshu.com/user/profile/5cbc3e9f000000001701d7bf",
# "https://www.xiaohongshu.com/user/profile/5e7886930000000001003f7f",
# "https://www.xiaohongshu.com/user/profile/566fbc3550c4b435f51f637b",
# "https://www.xiaohongshu.com/user/profile/5e86cb34000000000100a223",
# "https://www.xiaohongshu.com/user/profile/558e15b2f5a263490c65cdaa",
# "https://www.xiaohongshu.com/user/profile/5d9eef320000000001001615",
# "https://www.xiaohongshu.com/user/profile/5a6ba3214eacab4eee8e627a",
# "https://www.xiaohongshu.com/user/profile/5f58cacb000000000100bdf5",
# "https://www.xiaohongshu.com/user/profile/5f954030000000000100780c",
# "https://www.xiaohongshu.com/user/profile/5f5745bf000000000100351d",
# "https://www.xiaohongshu.com/user/profile/5c74a2b9000000001002e667",
# "https://www.xiaohongshu.com/user/profile/595ee5b882ec397553103dd3",
# "https://www.xiaohongshu.com/user/profile/5a5e20324eacab30f03654fb",
# "https://www.xiaohongshu.com/user/profile/55743bedc2bdeb1a16844741",
# "https://www.xiaohongshu.com/user/profile/5f0d523800000000010056de",
# "https://www.xiaohongshu.com/user/profile/59d5b03e44363b61a050532f",
# "https://www.xiaohongshu.com/user/profile/5ebdd5f40000000001002a67",
# "https://www.xiaohongshu.com/user/profile/5f1c1b7b0000000001006cbf",
# "https://www.xiaohongshu.com/user/profile/5ae404944eacab794dfb95b1",
# "https://www.xiaohongshu.com/user/profile/5d26276a0000000012017538",
# "https://www.xiaohongshu.com/user/profile/5ed5aa8f0000000001001f1e",
# "https://www.xiaohongshu.com/user/profile/5f92cf4f000000000100a846",
# "https://www.xiaohongshu.com/user/profile/5a75d42011be10344b917ffe",
# "https://www.xiaohongshu.com/user/profile/5ccea0ff000000001002b753",
# "https://www.xiaohongshu.com/user/profile/5c4418750000000005006717",
# "https://www.xiaohongshu.com/user/profile/5ec582d60000000001005315",
# "https://www.xiaohongshu.com/user/profile/594a93835e87e72f3e2ded11",
# "https://www.xiaohongshu.com/user/profile/5b8ab07606311b000184195a",
# "https://www.xiaohongshu.com/user/profile/54e7413ea46e96122dab7674",
# "https://www.xiaohongshu.com/user/profile/5f3657900000000001002181",
# "https://www.xiaohongshu.com/user/profile/5a65d6554eacab6864e2749e",
# "https://www.xiaohongshu.com/user/profile/5a745dc911be101d9ceab748",
# "https://www.xiaohongshu.com/user/profile/59b2033550c4b45e5d43c3d9",
# "https://www.xiaohongshu.com/user/profile/59a97aaa5e87e760e012dcd0",
# "https://www.xiaohongshu.com/user/profile/5a5de03611be100219719b0f",
# "https://www.xiaohongshu.com/user/profile/5f40a5170000000001008577",
# "https://www.xiaohongshu.com/user/profile/597e82aa5e87e73c4915db81",
# "https://www.xiaohongshu.com/user/profile/580e0bc36a6a69043935369d",
# "https://www.xiaohongshu.com/user/profile/5d1a17670000000012021d8e",
# "https://www.xiaohongshu.com/user/profile/59a830be82ec39155146f421",
# "https://www.xiaohongshu.com/user/profile/55efc1b73397db0e969c8fbd",
# "https://www.xiaohongshu.com/user/profile/5c8c55220000000010005810",
# "https://www.xiaohongshu.com/user/profile/5f337df2000000000101e2b2",
# "https://www.xiaohongshu.com/user/profile/5f2111500000000001009b7b",
# "https://www.xiaohongshu.com/user/profile/59c840ff44363b497f335cd4",
# "https://www.xiaohongshu.com/user/profile/5f8e8508000000000101d70e",
# "https://www.xiaohongshu.com/user/profile/5a163e3511be10234e1abffd",
# "https://www.xiaohongshu.com/user/profile/5e71f6870000000001005e52",
'https://www.xiaohongshu.com/user/profile/5cca9b3700000000120314c9',
'https://www.xiaohongshu.com/user/profile/5aa0f7bae8ac2b65bfcdaf0e',
'https://www.xiaohongshu.com/user/profile/5c20dd200000000007027c07',
'https://www.xiaohongshu.com/user/profile/5fe1c1ba0000000001006e65',
]
for url in url_list:
print(url)
res = test.releaser_page(url,proxies_num=0)
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# @File : rpc_data_to_answer.py # @File : rpc_data_to_answer.py
# @email : litao@igengmei.com # @email : litao@igengmei.com
# @author : litao # @author : litao
import copy
import requests import requests
import typing import typing
...@@ -36,16 +37,34 @@ data_type_dict = { ...@@ -36,16 +37,34 @@ data_type_dict = {
"cims/reply/batch_create": ["platform","platform_id","platform_answer_id","content","user_id","create_time","is_online"] "cims/reply/batch_create": ["platform","platform_id","platform_answer_id","content","user_id","create_time","is_online"]
} }
def post_muilty_data(data_list:typing.List,rpc_type:str) -> typing.Dict:
    """Send a batch of records to the RPC endpoint named by ``rpc_type``.

    Each record is reduced to the fields listed for ``rpc_type`` in
    ``data_type_dict`` and the whole batch is submitted in one call under
    the ``"questions"`` keyword.

    The caller's dictionaries are left untouched: filtered copies are
    built instead of popping keys in place. The previous in-place
    ``pop`` inside ``for key in data_dict`` raised
    ``RuntimeError: dictionary changed size during iteration`` as soon as
    a record contained a field that was not allowed.

    Args:
        data_list: Records (dicts) to submit.
        rpc_type: Endpoint name; must be a key of ``data_type_dict``.

    Returns:
        The object returned by the invoker call.
        NOTE(review): the annotation says ``typing.Dict``, but the value is
        whatever the invoker returns (it exposes ``unwrap()``) -- confirm.

    Raises:
        KeyError: If ``rpc_type`` is not present in ``data_type_dict``.
    """
    # Look the whitelist up once instead of once per key.
    allowed_keys = data_type_dict[rpc_type]
    filtered_list = [
        {key: value for key, value in data_dict.items() if key in allowed_keys}
        for data_dict in data_list
    ]
    dic = {"questions": filtered_list}
    invoker = create_default_invoker(debug=True).with_config(dump_curl=True)
    res = invoker[rpc_type](**dic)
    print(res)
    print(res.unwrap())
    return res
def post_single_data(data_dict:typing.Dict,rpc_type:str) -> typing.Dict: def post_single_data(data_dict:typing.Dict,rpc_type:str) -> typing.Dict:
headers = { headers = {
'X-GAIA-HELIOS-VERSION': '0.7.5', 'X-GAIA-HELIOS-VERSION': '0.7.5',
} }
for key in data_dict: data_dict_copy = copy.deepcopy(data_dict)
for key in data_dict_copy:
if key not in data_type_dict[rpc_type]: if key not in data_type_dict[rpc_type]:
data_dict.pop(key) data_dict_copy.pop(key)
print(data_dict) print(data_dict_copy)
dic = {"questions":[data_dict]} dic = {"questions":[data_dict_copy]}
invoker = create_default_invoker(debug=True).with_config(dump_curl=True) invoker = create_default_invoker(debug=True).with_config(dump_curl=True)
res = invoker[rpc_type](**dic) res = invoker[rpc_type](**dic)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment