Commit 882a255e authored by litaolemo

update

parent 859aedb1
@@ -30,6 +30,8 @@ from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_tim
 # from bs4 import BeautifulSoup
 from write_data_into_es.func_get_releaser_id import *
 from write_data_into_es.func_cal_doc_id import cal_doc_id
+import hmac
+import base64
 class CrawlerDouban():
@@ -47,25 +49,26 @@ class CrawlerDouban():
                          'video_id']
             for popk in pop_key_Lst:
                 self.video_data.pop(popk)
-        self.sig_list = [
-            "aOI2VYvkFvPfUngaeoz%2BNYQ7MQM%3D",
-            "Glc52sbPO46I%2FR%2FOCjl%2BGwKo94I%3D",
-            "l9oVu%2FYau2UwMyhc5m8ldALp5eU%3D",
-            "tL36trbi73v7Y057K10%2FQ9fdCiA%3D"
-        ]
         self.headers = {
-            "User-Agent": "api-client/1 com.douban.frodo/10.39.0(189) Android/23 product/cancro vendor/Netease model/Miui rom/android network/wifi platform/AndroidPad",
-            "Host": "frodo.douban.com",
-            "Connection": "Keep-Alive",
-            "Accept-Encoding": "gzip",
+            "User-Agent": "api-client/1 com.douban.frodo/6.42.2(194) Android/22 product/shamu vendor/OPPO model/OPPO R11 Plus rom/android network/wifi platform/mobile nd/1",
+            # "Host": "frodo.douban.com",
+            # "Connection": "Keep-Alive",
+            # "Accept-Encoding": "gzip",
+            # "Authorization": "Bearer ee99197a01a77702cbcb4c6e04f66506",
         }
+        self.b_str = "74CwfJd4+7LYgFhXi1cx0IQC35UQqYVFycCE+EVyw1E="
+        self.c_str = "bHUvfbiVZUmm2sQRKwiAcw=="
+        self.package_sign = "bf7dddc7c9cfe6f7"

     def get_single_page(self,mid,proxies):
         count_true = 0
         while count_true <= 5:
             try:
                 count_true += 1
-                url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(mid,random.randint(10000,99999),random.choice(self.sig_list))
+                url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&udid=dc18733e9f33c54b4bb579c49100b6f2cc0dc5cc&_sig={1}&_ts=1598339497".format(
+                    mid, 'lgrIVA7Zvp7r0+WuOe4APb9EL0A=')
                 page_res = retry_get_url(url,headers=self.headers,proxies=proxies)
                 page_json = page_res.json()
                 # content = dehtml(page_json["content"])
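
The hardcoded `_sig`/`_ts` pair above was presumably captured from one real request, so it can only match the exact topic path and timestamp it was generated for. A sketch of computing the pair per request with the `get_sig` helper added below (illustrative, and assuming the pinned-timestamp debug line in `get_sig` is removed):

    # Illustrative only, not part of the commit.
    path = "/api/v2/group/topic/%s" % mid
    ts, sig = self.get_sig(path)  # sig comes back already URL-quoted
    url = ("https://frodo.douban.com" + path +
           "?event_source=search&os_rom=android"
           "&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market"
           "&udid=dc18733e9f33c54b4bb579c49100b6f2cc0dc5cc"
           "&_sig=" + sig + "&_ts=" + ts)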
@@ -100,6 +103,25 @@ class CrawlerDouban():
         return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

+    def get_sig(self,url):
+        sign = "bf7dddc7c9cfe6f7"
+        url_limit = url.split("?")[0].replace("http://frodo.douban.com",'')
+        url_limit = urllib.parse.quote(url_limit,safe='')
+        ts = str(int(datetime.datetime.now().timestamp()))
+        ts = '1600423244'  # NOTE: debug leftover, pins the timestamp and shadows the live value above
+        url_str = 'GET&%s&%s' % (url_limit,ts)
+        # HMAC-SHA1 over "GET&<quoted path>&<ts>" keyed with the package sign
+        sig_sha1 = hmac.new(sign.encode('utf-8'), url_str.encode('utf-8'), digestmod='SHA1')
+        sig_sha1 = sig_sha1.hexdigest().upper()
+        bytes_arry = bytearray.fromhex(sig_sha1)
+        # base64-encode the raw digest and strip the trailing newline
+        sig = bytes.decode(base64.encodebytes(bytes_arry)).replace('\n','')
+        print(urllib.parse.quote(sig,safe=''))
+        return ts, urllib.parse.quote(sig,safe='')

     def gooseneck(self,releaserUrl,output_to_file=False, filepath=None,
                   output_to_es_raw=False,
                   output_to_es_register=False,
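
The method above boils down to HMAC-SHA1 over the string "GET&<url-encoded path>&<timestamp>" keyed with the package sign, base64-encoded and then URL-quoted; the hexdigest/fromhex round-trip is equivalent to taking the raw digest directly. A self-contained sketch of the same scheme, using a live timestamp instead of the pinned one:

    import base64, datetime, hmac, urllib.parse

    def douban_sig(path, key="bf7dddc7c9cfe6f7"):
        # Sign "GET&<quoted path>&<ts>" with HMAC-SHA1, as get_sig does.
        ts = str(int(datetime.datetime.now().timestamp()))
        payload = "GET&%s&%s" % (urllib.parse.quote(path, safe=""), ts)
        digest = hmac.new(key.encode("utf-8"), payload.encode("utf-8"),
                          digestmod="SHA1").digest()
        # base64 of the raw digest, URL-quoted for use as a query parameter.
        return ts, urllib.parse.quote(base64.b64encode(digest).decode(), safe="")

    # e.g.: ts, sig = douban_sig("/api/v2/group/248952/topics")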
@@ -109,29 +131,30 @@ class CrawlerDouban():
         page = 0
         has_more = True
+        ts,sig = self.get_sig('/api/v2/group/248952/topics')
         url_dic = {
             "start": None,
             "count": "20",
             "sortby": "new",
-            "apple": "389276ed556d40cada2e208482b51cd7",
-            "icecream": "ffd8f7d71419a98e48819cbac587ebbd",
-            "mooncake": "0f607264fc6318a92b9e13c65db7cd3c",
-            "webview_ua": "Mozilla%2F5.0%20%28Linux%3B%20Android%2010.0.1%3B%20Miui%20Build%2FV417IR%3B%20wv%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Version%2F4.0%20Chrome%2F52.0.2743.100%20Mobile%20Safari%2F537.36",
-            "screen_width": "810",
-            "screen_height": "1440",
-            "sugar": "0",
-            "longitude": "0",
-            "latitude": "0",
-            "os_rom": "android",
+            # "apple": "389276ed556d40cada2e208482b51cd7",
+            # "icecream": "7b92c1aa7b531d1500c6e4905de2ca76",
+            # "mooncake": "0f607264fc6318a92b9e13c65db7cd3c",
+            # "webview_ua": "Mozilla/5.0 (Linux; Android 6.0.1; oppo R11s Plus Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36",
+            # "screen_width": "1080",
+            # "screen_height": "1920",
+            # "sugar": "460000",
+            # "longitude": "0.0",
+            # "latitude": "0.0",
+            # "os_rom": "android",
             "apikey": "0dad551ec0f84ed02907ff5c42e8ec70",
-            "channel": "Baidu_Market",
-            "udid": "dc{0}e9f33c54b4bb579c49100b6f2cc0dc5cc".format(random.randint(10000,99999)),
-            "_sig": random.choice(self.sig_list),
-            "_ts": None,
+            # "channel": "Baidu_Market",
+            # "udid": "dc18733e9f33c54b4bb579c49100b6f2cc0dc5cc",
+            "_sig": sig,
+            "_ts": ts,
         }
         while page <= releaser_page_num_max and has_more:
-            url_dic["_ts"] = int(datetime.datetime.now().timestamp())
+            # url_dic["_ts"] = int(datetime.datetime.now().timestamp())
             url_dic["start"] = str(page * 20)
             if "hot_tag" in releaserUrl:
                 url_dic["sortby"] = "hot"
@@ -234,7 +257,7 @@ if __name__ == '__main__':
     # for r in res:
     #     print(r)
     for u in url_list:
-        ttt = test.releaser_page_by_time(1595755100232, 1595906959333, u, output_to_es_register=True,
+        ttt = test.releaser_page_by_time(1595755100232, 1595906959333, u, output_to_es_register=False,
                                          es_index='crawler-data-raw',
                                          doc_type='doc', releaser_page_num_max=4000,allow=20)
         for t in ttt: