# -*- coding:UTF-8 -*-
# @Time : 2020/7/24 16:07
# @File : crawler_douban.py
# @email : litao@igengmei.com
# @author : litao
import os
import copy
import requests
import re
import datetime, time
import json
import urllib
import random
# from bs4 import BeautifulSoup
# from multiprocessing import Pool
# from multiprocessing import Process
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.output_results import output_result
# from crawler.crawler_sys.utils import output_log
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
# from crawler.crawler_sys.utils import connect_with_redis
# from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
# from crawler.crawler_sys.utils.util_logging import logged
# from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
# from crawler.crawler_sys.utils.html_to_str import dehtml
# from bs4 import BeautifulSoup
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
import hmac
import base64
class CrawlerDouban():
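    """Crawler for Douban group topics via the frodo.douban.com mobile API.

    Topic lists are paged through gooseneck(); per-topic detail and engagement
    counts come from get_single_page().  Every request carries an _ts/_sig
    pair produced by get_sig().

    A minimal usage sketch (mirrors the __main__ block at the bottom):

        crawler = CrawlerDouban()
        for item in crawler.releaser_page("https://www.douban.com/people/new_tag",
                                          releaser_page_num_max=10, proxies_num=0):
            print(item["title"])
    """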
def __init__(self, timeout=None, platform='douban'):
        if timeout is None:
self.timeout = 10
else:
self.timeout = timeout
self.platform = platform
std_fields = Std_fields_video()
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
        # remove fields that the crawled Douban data does not provide
pop_key_Lst = ['describe', 'repost_count', 'isOriginal',
'video_id']
for popk in pop_key_Lst:
            self.video_data.pop(popk, None)
self.headers = {
"User-Agent": 'api-client/1 com.douban.frodo/6.42.2(194) Android/22 product/shamu vendor/OPPO model/OPPO R11 Plus rom/android network/wifi platform/mobile nd/1',
# "Host": "frodo.douban.com",
# "Connection": "Keep-Alive",
# "Accept-Encoding": "gzip",
# "Authorization": "Bearer ee99197a01a77702cbcb4c6e04f66506",
}
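        # The User-Agent mimics the Douban Android (frodo) client.  b_str and
        # c_str are constants carried over from the app but are not used in this
        # module; package_sign is the HMAC key consumed by get_sig().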
self.b_str = "74CwfJd4+7LYgFhXi1cx0IQC35UQqYVFycCE+EVyw1E="
self.c_str = "bHUvfbiVZUmm2sQRKwiAcw=="
self.package_sign = "bf7dddc7c9cfe6f7"
def get_single_page(self,mid,proxies):
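        """Fetch one topic's detail page and return its engagement counts.

        Makes up to six attempts; on success returns a dict with the raw
        content, repost/comment/favorite/collection counts and the image URLs
        embedded in the content, otherwise returns None.
        """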
count_true = 0
while count_true <= 5:
try:
ts,sig = self.get_sig('/api/v2/group/topic/{0}'.format(mid))
count_true += 1
url = "https://frodo.douban.com/api/v2/group/topic/{mid}?event_source=search&apikey=0dad551ec0f84ed02907ff5c42e8ec70&_sig={sig}&_ts={ts}".format(
mid=mid,ts=ts,sig=sig)
page_res = retry_get_url(url,headers=self.headers,proxies=proxies)
page_json = page_res.json()
# content = dehtml(page_json["content"])
if page_json.get('localized_message'):
continue
# content_html = """
Title%s""" % page_json["content"]
# bs = BeautifulSoup(content_html, "html.parser")
# content = bs.textarea.get_text()
content = page_json["content"]
repost_count = trans_play_count(page_json["reshares_count"])
comment_count = trans_play_count(page_json["comments_count"])
favorite_count = trans_play_count(page_json["like_count"])
collection_count = trans_play_count(page_json["collections_count"])
                # collect image URLs ending in .jpg or .webp from the content HTML
                img_list = re.findall(r'"(https?://[^"]+?\.(?:jpg|webp))"', content)
dic = {
"content":content,
"repost_count":repost_count,
"comment_count":comment_count,
"favorite_count":favorite_count,
"collection_count":collection_count,
"img_list":img_list,
}
return dic
except Exception as e:
print("single page error %s"% e)
continue
print("single page error")
return None
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
def get_sig(self,url):
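        """Build the _ts/_sig pair required by the frodo API.

        The signature is HMAC-SHA1 over "GET&<url-quoted path>&<timestamp>",
        keyed with the package sign and base64-encoded with newlines stripped.
        """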
sign = "bf7dddc7c9cfe6f7"
url_limit = url.split("?")[0].replace("http://frodo.douban.com",'')
url_limit = urllib.parse.quote(url_limit,safe='')
ts = str(int(datetime.datetime.now().timestamp()))
# ts = '1600650372'
url_str = 'GET&%s&%s' % (url_limit,ts)
# print(url_str)
sig_sha1 = hmac.new(sign.encode('utf-8'), url_str.encode('utf-8'), digestmod='SHA1')
sig_sha1 = sig_sha1.hexdigest().upper()
# bytes_arry = bytearray.fromhex(sig_sha1)
# print([x for x in bytearray(bytes_arry,'utf_8')])
# print(bytearray(sig_sha1.hexdigest()))
# print(''.join(map(lambda x: chr(x % 256), bytearray(sig_sha1.hexdigest()))))
sig = bytes.decode(base64.encodebytes(bytearray.fromhex(sig_sha1))).replace('\n','')
# print(urllib.parse.quote(sig,safe=''))
return ts, sig
def gooseneck(self,releaserUrl,output_to_file=False, filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
releaser_page_num_max=10000,
es_index=None,proxies_num=None):
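        """Yield one normalized dict per topic of Douban group 248952.

        Pages through /api/v2/group/248952/topics twenty items at a time
        ("start" is page * 20) and enriches every topic with the counts from
        get_single_page().  The sort order comes from releaserUrl: "hot_tag"
        selects sortby=hot, otherwise "new_tag" (and the default) is sortby=new.
        """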
page = 0
has_more = True
ts,sig = self.get_sig('/api/v2/group/248952/topics')
url_dic = {
# "start": None,
"count": "20",
"sortby": "new",
# "apple": "389276ed556d40cada2e208482b51cd7",
# "icecream": "7b92c1aa7b531d1500c6e4905de2ca76",
# "mooncake": "0f607264fc6318a92b9e13c65db7cd3c",
# "webview_ua": "Mozilla/5.0 (Linux; Android 6.0.1; oppo R11s Plus Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36",
# "screen_width": "1080",
# "screen_height": "1920",
# "sugar": "460000",
# "longitude": "0.0",
# "latitude": "0.0",
# "os_rom": "android",
"apikey": "0dad551ec0f84ed02907ff5c42e8ec70",
# "channel": "Baidu_Market",
# "udid": "dc18733e9f33c54b4bb579c49100b6f2cc0dc5cc",
"_sig": sig,
"_ts": ts,
}
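        # _sig/_ts are computed once for the list endpoint; inside the loop only
        # the "start" offset advances (sortby is re-derived from releaserUrl but
        # does not change).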
while page <= releaser_page_num_max and has_more:
# url_dic["_ts"] = int(datetime.datetime.now().timestamp())
if page:
url_dic["start"] = str(page * 20)
if "hot_tag" in releaserUrl:
url_dic["sortby"] = "hot"
elif "new_tag" in releaserUrl:
url_dic["sortby"] = "new"
url = "https://frodo.douban.com/api/v2/group/248952/topics?%s" % urllib.parse.urlencode(url_dic)
try:
if proxies_num:
get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout, proxies=proxies_num)
else:
get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout)
            except Exception:
                continue
if get_page and get_page.status_code == 200:
try:
page_json = get_page.json()
total = page_json["total"]
page += 1
if page > total:
break
page_dic = page_json["topics"]
except Exception as e:
print("load data error %s" % e)
continue
if page_dic:
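                    # map each topic onto the standard result dict and attach a
                    # doc_id derived from the platform and the topic url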
for one in page_dic:
try:
releaser_id = one["author"]["id"]
mid = one["id"]
if True:
# try:
res_dic = {
"release_time": trans_strtime_to_timestamp(one["create_time"]),
"fetch_time": int(datetime.datetime.now().timestamp()*1e3),
"url": one["url"],
"releaser": one["author"]["name"],
"repost_count": None,
"comment_count": trans_play_count(one["comments_count"]),
"favorite_count": None,
"title": one["title"],
"releaserUrl": "https://www.douban.com/people/%s" % releaser_id,
"releaser_id_str": "douban_%s" % releaser_id,
'video_img':one["cover_url"],
"mid":mid,
"platform":"douban",
"article_type": "article"
# "doc_id":doc_id
}
doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,
doc_id_type="all-time-url")
res_dic["doc_id"] = doc_id
                                single_info = self.get_single_page(mid, proxies_num)
                                if single_info:
                                    res_dic.update(single_info)
# print(res_dic)
yield res_dic
except Exception as e:
print("single data parse error %s " %e)
# except Exception as e:
# print(one)
# print("row formate error %s" % e)
# continue
# @logged
def releaser_page(self, releaserUrl,
**kwargs):
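        """Thin wrapper that forwards all keyword arguments to gooseneck()."""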
return self.gooseneck(releaserUrl,**kwargs)
def get_releaser_follower_num(self, releaserUrl):
pass
def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
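        """Filter releaser_page() output by release_time (millisecond
        timestamps): items between start_time and end_time are yielded and
        reset the failure counter; after more than `allow` consecutive items
        newer than end_time, iteration stops.
        """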
count_false = 0
for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
video_time = res["release_time"]
# print(res)
if video_time:
if start_time < video_time:
if video_time < end_time:
count_false = 0
yield res
else:
count_false += 1
if count_false > allow:
break
else:
yield res
if __name__ == '__main__':
test = CrawlerDouban()
# releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
url_list = [
"https://www.douban.com/people/new_tag"
]
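    # These entries only act as sortby hints for gooseneck() ("new_tag"/"hot_tag");
    # the group id 248952 itself is hard-coded in the crawler.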
# res = test.releaser_page(url, output_to_es_raw=True,
# es_index='crawler-data-raw',
# releaser_page_num_max=400,proxies_num=0)
# for r in res:
# print(r)
for u in url_list:
ttt = test.releaser_page_by_time(1600531200000, 1600660917502, u, output_to_es_register=False,
es_index='crawler-data-raw',
doc_type='doc', releaser_page_num_max=4000,allow=20)
for t in ttt:
print(t)
# test.get_single_page(4524055937468233)