# -*- coding:UTF-8 -*- # @Time : 2020/7/24 16:07 # @File : crawler_douban.py # @email : litao@igengmei.com # @author : litao import os import copy import requests import re import datetime ,time import json import urllib import random # from bs4 import BeautifulSoup # from multiprocessing import Pool # from multiprocessing import Process from crawler.crawler_sys.framework.video_fields_std import Std_fields_video from crawler.crawler_sys.utils.output_results import retry_get_url from crawler.crawler_sys.utils.output_results import output_result # from crawler.crawler_sys.utils import output_log from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp # from crawler.crawler_sys.utils import connect_with_redis # from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration # from crawler.crawler_sys.utils.util_logging import logged # from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy from crawler.crawler_sys.utils.html_to_str import dehtml from bs4 import BeautifulSoup from write_data_into_es.func_get_releaser_id import * from write_data_into_es.func_cal_doc_id import cal_doc_id class Crawler_douban(): def __init__(self, timeout=None, platform='weibo'): if timeout == None: self.timeout = 10 else: self.timeout = timeout self.platform = platform std_fields = Std_fields_video() self.video_data = std_fields.video_data self.video_data['platform'] = self.platform # remove fields that crawled data don't have pop_key_Lst = ['describe', 'repost_count', 'isOriginal', 'video_id'] for popk in pop_key_Lst: self.video_data.pop(popk) self.sig_list = [ # "aOI2VYvkFvPfUngaeoz%2BNYQ7MQM%3D", # "Glc52sbPO46I%2FR%2FOCjl%2BGwKo94I%3D", # "l9oVu%2FYau2UwMyhc5m8ldALp5eU%3D", # "tL36trbi73v7Y057K10%2FQ9fdCiA%3D", "vu4h6fzkqrvpNxWOYee95RPPV04=" ] self.headers = { "User-Agent": "api-client/1 com.douban.frodo/6.39.0(189) Android/23 product/oppo R11s Plus vendor/OPPO model/oppo R11s Plus rom/android network/wifi platform/AndroidPad", "Host": "frodo.douban.com", "Connection": "Keep-Alive", "Accept-Encoding": "gzip", "Authorization": "Bearer ee99197a01a77702cbcb4c6e04f66506", } def get_single_page(self,mid,proxies): count_true = 0 while count_true <= 3: try: count_true += 1 url = "https://frodo.douban.com/api/v2/group/topic/{0}?event_source=search&os_rom=android&apikey=0dad551ec0f84ed02907ff5c42e8ec70&channel=Baidu_Market&_sig={2}&udid=dc{1}e9f33c54b4bb579c49100b6f2cc0dc5cc&_ts=1598339497".format(mid,random.randint(10000,99999),random.choice(self.sig_list)) page_res = retry_get_url(url,headers=self.headers,proxies=proxies) page_json = page_res.json() # content = dehtml(page_json["content"]) if page_json.get('localized_message'): continue # content_html = """<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Title</title></head><body>%s</body></html>""" % page_json["content"] # bs = BeautifulSoup(content_html, "html.parser") # content = bs.textarea.get_text() content = page_json["content"] repost_count = trans_play_count(page_json["reshares_count"]) comment_count = trans_play_count(page_json["comments_count"]) favorite_count = trans_play_count(page_json["like_count"]) collection_count = trans_play_count(page_json["collections_count"]) img_list = re.findall('img src="(.*?)"',content) dic = { "content":content, "repost_count":repost_count, "comment_count":comment_count, "favorite_count":favorite_count, "collection_count":collection_count, "img_list":img_list, } return dic except Exception as e: print("single page error %s"% e) continue def get_releaser_id(self, releaserUrl): return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl) def gooseneck(self,releaserUrl,output_to_file=False, filepath=None, output_to_es_raw=False, output_to_es_register=False, push_to_redis=False, releaser_page_num_max=10000, es_index=None,proxies_num=None): page = 0 has_more = True url_dic = { "start": None, "count": "20", "sortby": "new", "apple": "389276ed556d40cada2e208482b51cd7", "icecream": "7b92c1aa7b531d1500c6e4905de2ca76", "mooncake": "0f607264fc6318a92b9e13c65db7cd3c", "webview_ua": "Mozilla/5.0 (Linux; Android 6.0.1; oppo R11s Plus Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36", "screen_width": "1080", "screen_height": "1920", "sugar": "460000", "longitude": "0.0", "latitude": "0.0", "os_rom": "android", "apikey": "0dad551ec0f84ed02907ff5c42e8ec70", "channel": "Baidu_Market", "udid": "dc18733e9f33c54b4bb579c49100b6f2cc0dc5cc", "_sig": random.choice(self.sig_list), "_ts": 1598337519, } while page <= releaser_page_num_max and has_more: # url_dic["_ts"] = int(datetime.datetime.now().timestamp()) url_dic["start"] = str(page * 20) url = "http://frodo.douban.com/api/v2/group/248952/topics?%s" % urllib.parse.urlencode(url_dic) try: if proxies_num: get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout, proxies=proxies_num) else: get_page = retry_get_url(url, headers=self.headers, timeout=self.timeout) except: get_page = None has_more = False if get_page and get_page.status_code == 200: try: page_json = get_page.json() total = page_json["total"] page += 1 if page > total: break page_dic = page_json["topics"] except Exception as e: print("load data error %s" % e) continue if page_dic: for one in page_dic: releaser_id = one["author"]["id"] mid = one["id"] doc_id = cal_doc_id(platform=res_dic["platform"], url=res_dic["url"], data_dict=res_dic,doc_id_type="all-time-url") try: res_dic = { "release_time": trans_strtime_to_timestamp(one["create_time"]), "url": one["url"], "releaser": one["author"]["name"], "repost_count": None, "comment_count": trans_play_count(one["comments_count"]), "favorite_count": None, "title": one["title"], "releaserUrl": "https://www.douban.com/people/%s" % releaser_id, "releaser_id_str": "douban_%s" % releaser_id, 'video_img':one["cover_url"], "mid":mid, "platform":"douban", "doc_id":doc_id } res_dic.update(self.get_single_page(mid,proxies_num)) print(res_dic) yield res_dic except Exception as e: print(one) print("row formate error %s" % e) continue # @logged def releaser_page(self, releaserUrl, **kwargs): return self.gooseneck(releaserUrl,**kwargs) def get_releaser_follower_num(self, releaserUrl): pass def releaser_page_by_time(self, start_time, end_time, url,**kwargs): data_lis = [] count_false = 0 output_to_file = kwargs.get("output_to_file") filepath = kwargs.get("filepath") push_to_redis = kwargs.get("push_to_redis") output_to_es_register = kwargs.get("output_to_es_register") output_to_es_raw = kwargs.get("output_to_es_raw") es_index = kwargs.get("es_index") for res in self.releaser_page(url,proxies_num=kwargs.get("proxies_num")): video_time = res["release_time"] # print(res) if video_time: if start_time <= video_time: if video_time < end_time: try: # res["fetch_time"] = datetime.datetime.fromtimestamp(res.get("fetch_time") / 1000).strftime('%Y-%m-%d %H:%M:%S') res["release_time"] = datetime.datetime.fromtimestamp(res.get("release_time") / 1000).strftime('%Y-%m-%d %H:%M:%S') except: pass data_lis.append(res) if len(data_lis) >= 100: output_result(result_Lst=data_lis, platform=self.platform, output_to_file=output_to_file, filepath=filepath, push_to_redis=push_to_redis, output_to_es_register=output_to_es_register, output_to_es_raw=output_to_es_raw, es_index=es_index, ) data_lis.clear() else: count_false += 1 if count_false > 10: break else: continue # if data_lis != []: # output_result(result_Lst=data_lis, # platform=self.platform, # output_to_file=output_to_file, # filepath=filepath, # push_to_redis=push_to_redis, # output_to_es_register=output_to_es_register, # output_to_es_raw=output_to_es_raw, # es_index=es_index, # ) import pandas as pd data = pd.DataFrame(data_lis) s = datetime.datetime.now() ss = str(s)[0:19].replace(' ', '-').replace(':', '-') res = data.to_csv('%s%sall_s1.csv' % ("all_", ss), encoding='gb18030', # columns=columns ) data_lis.clear() if __name__ == '__main__': test = Crawler_douban() url = 'https://weibo.com/p/1644114654/home?from=page_100306&mod=TAB#place' # releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d' url_list = [ # "https://weibo.com/u/1764615662", # "https://weibo.com/u/3662247177", # "https://weibo.com/u/2378564111", # "https://weibo.com/u/2983578965", # "https://weibo.com/u/3938976579", # "https://weibo.com/u/6511177474", # "https://weibo.com/u/6343916471", # "https://weibo.com/u/6511177474", # "https://weibo.com/u/2921603920", # "https://weibo.com/u/6470919752", # "https://weibo.com/u/2653906910?refer_flag=1001030103_&is_hot=1", # "https://weibo.com/u/3115996363?is_hot=1", # "https://weibo.com/p/1005053212093237/home?from=page_100505&mod=TAB#place", # "https://weibo.com/u/3926129482", # "https://weibo.com/u/5509337969?is_hot=1", # "https://weibo.com/u/5477320351", # "https://weibo.com/p/1005055634795408/home?from=page_100505&mod=TAB#place", "https://weibo.com/u/6511173721", # "https://weibo.com/p/1005055471534537/home?from=page_100505&mod=TAB&is_hot=1#place", ] # res = test.releaser_page(url, output_to_es_raw=True, # es_index='crawler-data-raw', # releaser_page_num_max=400,proxies_num=0) # for r in res: # print(r) for u in url_list: test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_register=True, es_index='crawler-data-raw', doc_type='doc', releaser_page_num_max=4000,proxies_num=1) # test.get_single_page(4524055937468233)