# -*- coding:UTF-8 -*-
# @Time : 2020/7/23 17:40
# @File : crawler_weibo.py
# @email : litao@igengmei.com
# @author : litao

import copy
import requests
import re
import datetime
import time
import json
import random
# from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id


class Crawler_weibo():
    """Crawl the posts of a Weibo account through the m.weibo.cn endpoints."""

    def __init__(self, timeout=None, platform='weibo'):
        """Store request settings and prepare the standard field template.

        :param timeout: request timeout passed to ``retry_get_url``;
            falls back to 10 when ``None``.
        :param platform: platform name written into ``video_data`` and used
            when resolving releaser ids.
        """
        self.timeout = 10 if timeout is None else timeout
        self.platform = platform
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have; a field that is already
        # absent from the template is simply ignored
        for popk in ('describe', 'repost_count', 'isOriginal', 'video_id'):
            self.video_data.pop(popk, None)

    @staticmethod
    def get_single_page(mid):
        """Fetch the mobile status page of one post and read its full text.

        The page is requested from ``https://m.weibo.cn/status/<mid>`` and the
        JSON found between ``render_data = `` and the trailing ``[0]`` is
        parsed.

        :param mid: id of the post.
        :return: tuple ``(text, repost_count, comment_count, favorite_count)``.
        :raises IndexError: when the ``render_data`` marker is not in the page.
        """
        url = "https://m.weibo.cn/status/%s" % mid
        # No cookie header is sent; the sample cookie of the original request
        # was left disabled.
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        status = json.loads(page_json_context)[0]["status"]
        text = dehtml(status["text"])
        repost_count = trans_play_count(status["reposts_count"])
        comment_count = trans_play_count(status["comments_count"])
        favorite_count = trans_play_count(status["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

    def get_releaser_id(self, releaserUrl):
        """Resolve the releaser id for ``releaserUrl`` on this crawler's platform."""
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    @staticmethod
    def get_img(data):
        """Collect the image URLs listed under ``data["pics"]``.

        The ``large`` variant is preferred; when it cannot be read the
        picture's own ``url`` is used instead and the problem is printed.

        :param data: post dict (``mblog``) that may carry a ``pics`` list.
        :return: list of image URLs, empty when there are no pictures.
        """
        img_list = []
        for one in data.get("pics") or []:
            try:
                img_list.append(one["large"]["url"])
            except Exception as e:
                img_list.append(one["url"])
                print("add img error %s" % e)
        return img_list

    def _build_record(self, one, mblog, releaser_id):
        """Turn one timeline card and its ``mblog`` into the flat result dict."""
        mid = mblog.get("mid")
        forward_text = ""
        forward_user = ""
        retweeted = mblog.get("retweeted_status")
        if one.get("source") == "绿洲":  # literally "oasis"
            text_type = "绿洲"
        elif retweeted:
            text_type = "转发"  # "repost"
            forward_text = retweeted.get("raw_text")
            forward_user = retweeted.get("user").get("screen_name")
        else:
            text_type = one.get("source")

        if mblog.get("isLongText"):
            # Only the text of the detail page is needed here; the counters
            # in the record are taken from the timeline entry below.
            text = self.get_single_page(mid)[0]
        else:
            try:
                text = mblog["raw_text"]
            except KeyError:
                text = mblog["text"]

        page_info = mblog.get("page_info")
        article_type = page_info.get("type") if page_info else "article"

        clean_text = text.replace("\u200b", "")
        res_dic = {
            "release_time": trans_strtime_to_timestamp(mblog["created_at"]),
            "fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
            "url": one["scheme"],
            "releaser": mblog["user"]["screen_name"],
            "repost_count": trans_play_count(mblog["reposts_count"]),
            "comment_count": trans_play_count(mblog["comments_count"]),
            "favorite_count": trans_play_count(mblog["attitudes_count"]),
            "title": clean_text,
            "content": clean_text,
            "wb_type": text_type,
            "forward_user": forward_user,
            "forward_text": forward_text,
            "mid": mid,
            "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
            "releaser_id_str": "weibo_%s" % releaser_id,
            "img_list": self.get_img(mblog),
            "platform": "weibo",
            "article_type": article_type,
        }
        res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,
                                       doc_id_type="all-time-url")
        return res_dic

    def releaser_page(self, releaserUrl,
                      output_to_file=False,
                      filepath=None,
                      output_to_es_raw=False,
                      output_to_es_register=False,
                      push_to_redis=False,
                      releaser_page_num_max=200,
                      es_index=None,
                      doc_type=None,
                      proxies_num=None):
        """Yield one result dict per post of the account behind ``releaserUrl``.

        Pages are requested from the ``getIndex`` container endpoint, using the
        ``since_id`` returned by each page as the cursor for the next one.
        Paging stops when ``releaser_page_num_max`` pages were requested, when
        the page counter exceeds ``cardlistInfo.total``, when a request raises,
        or when a page carries no ``since_id``.

        :param releaserUrl: profile URL of the account.
        :param releaser_page_num_max: maximum number of pages to request.
        :param proxies_num: forwarded to ``retry_get_url`` as ``proxies`` when
            truthy.
        :param output_to_file, filepath, output_to_es_raw,
            output_to_es_register, push_to_redis, es_index, doc_type:
            accepted for interface compatibility; not used by this method.
        :return: generator of result dicts (see ``_build_record``).
        """
        print('Processing releaserUrl %s' % releaserUrl)
        releaser_id = self.get_releaser_id(releaserUrl)
        if not releaser_id:
            return
        # xsrf_token,url_extr = self.get_weibo_info(releaser_id)
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "mweibo-pwa": "1",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
            # "x-xsrf-token": xsrf_token,
        }
        # Same shape as a browser-sent referer, e.g.
        # https://m.weibo.cn/u/1669879400?uid=1669879400&t=0
        headers["referer"] = "https://m.weibo.cn/u/{0}?uid={0}&t=0".format(releaser_id)

        pagenum = 0
        has_more = True
        since_id = 0
        # `<` (not `<=`): pagenum is incremented before the request, so this
        # requests at most releaser_page_num_max pages.
        while pagenum < releaser_page_num_max and has_more:
            pagenum += 1
            time.sleep(0.5)
            # sample query string of a browser request:
            # ?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429
            url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={0}&containerid=107603{0}&since_id={1}".format(
                releaser_id, since_id)
            print('Page number: %d' % pagenum)
            try:
                if proxies_num:
                    get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num)
                else:
                    get_page = retry_get_url(url, headers=headers, timeout=self.timeout)
            except Exception:
                get_page = None
                has_more = False
            # print(get_page.text)
            if not (get_page and get_page.status_code == 200):
                continue

            try:
                page_json = get_page.json()
                card_info = page_json["data"]["cardlistInfo"]
                if pagenum > card_info["total"]:
                    break
                # Read leniently: a page without a cursor must still have its
                # cards processed instead of being discarded and re-requested.
                next_since_id = card_info.get("since_id")
                page_dic = page_json["data"].get("cards")
            except Exception as e:
                print("load data error %s" % e)
                continue

            for one in page_dic or []:
                mblog = None  # reset so the error handler never prints a stale post
                try:
                    mblog = one.get("mblog")
                    if not mblog:
                        # card without a post payload: nothing to extract
                        continue
                    yield self._build_record(one, mblog, releaser_id)
                except Exception as e:
                    print(json.dumps(mblog))
                    print("row formate error %s" % e)
                    continue

            if next_since_id:
                since_id = next_since_id
            else:
                # no cursor for a following page: treat as end of the timeline
                has_more = False

    def get_releaser_follower_num(self, releaserUrl):
        """Placeholder; not implemented, always returns ``None``."""
        pass

    def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
        """Yield posts of ``url`` filtered by their ``release_time``.

        Posts with ``start_time < release_time < end_time`` are yielded. A post
        that is not newer than ``start_time`` increases a miss counter; it is
        still yielded until the counter exceeds ``allow``, at which point the
        iteration stops. Posts with a falsy ``release_time`` are skipped.

        :param start_time: lower bound, compared directly with ``release_time``.
        :param end_time: upper bound, compared directly with ``release_time``.
        :param url: profile URL handed to ``releaser_page``.
        :param allow: number of too-old posts tolerated before stopping.
        :param kwargs: only ``proxies_num`` is read.
        """
        # NOTE(review): the original indentation of this method was lost; the
        # nesting below pairs the first `else` with the start_time check and
        # the final `else` with the allowance check - confirm against the
        # upstream file.
        count_false = 0
        for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
            video_time = res["release_time"]
            # print(res)
            if video_time:
                if start_time < video_time:
                    if video_time < end_time:
                        yield res
                else:
                    count_false += 1
                    if count_false > allow:
                        break
                    else:
                        yield res


if __name__ == '__main__':
    test = Crawler_weibo()
    # releaserUrl = 'http://v.qq.com/vplus/cfa34d96d1b6609f1dccdea65b26b83d'
    url_list = [
        "https://weibo.com/u/3934754081?is_hot=1",
        "https://weibo.com/u/1752877052?is_hot=1",
        "https://weibo.com/u/5395999826?is_hot=1",
        "https://weibo.com/u/6426118092?is_hot=1",
        "https://weibo.com/u/5597352985?is_hot=1",
        "https://weibo.com/u/5536360057?is_hot=1",
        "https://weibo.com/u/6574937525",
        "https://weibo.com/u/7396392576?is_hot=1",
        "https://weibo.com/p/1005051719151460/home",
        "https://weibo.com/u/6514156406?refer_flag=1005050006_",
        "https://weibo.com/p/1005051922120917/home",
        "https://weibo.com/p/1005051922120917/home",
        "https://weibo.com/u/5268223514?is_hot=1",
        "https://weibo.com/u/2950468700?is_hot=1",
        "https://weibo.com/u/7171118361?is_hot=1",
        "https://weibo.com/u/2259870914?is_all=1",
        "https://weibo.com/u/7054003977",
        "https://weibo.com/u/5311113459",
        "https://weibo.com/u/2903761483?refer_flag=1001030103_&is_hot=1",
        "https://weibo.com/u/5061685077?refer_flag=1001030103_&is_hot=1",
        "https://weibo.com/u/7125942835?is_hot=1",
        "https://weibo.com/p/1005055040459465/home",
        "https://weibo.com/u/6346951781",
        "https://weibo.com/p/1005057224900173/home",
        "https://weibo.com/u/6831245172?is_hot=1",
        "https://weibo.com/u/2868511894?is_hot=1",
        "https://weibo.com/u/6082896987?is_hot=1",
        "https://weibo.com/p/1005052198383217/home",
        "https://weibo.com/p/1005055305217578/home",
        "https://weibo.com/270055196?is_hot=1",
        "https://weibo.com/u/5193254387?is_hot=1",
        "https://weibo.com/u/6433308372?is_hot=1",
        "https://weibo.com/u/6165106408?is_hot=1",
        "https://weibo.com/p/1005051918721053/home",
        "https://weibo.com/611678617?is_hot=1",
        "https://weibo.com/u/6899857321?is_hot=1",
        "https://weibo.com/u/3607024971?is_hot=1",
        "https://weibo.com/u/2913046461?is_hot=1",
        "https://weibo.com/u/2674747062?is_hot=1",
        "https://weibo.com/u/2412955604?topnav=1&wvr=6&topsug=1",
        "https://weibo.com/u/6614002566?is_hot=1",
        "https://weibo.com/u/3976738742?is_hot=1",
        "https://weibo.com/p/1005052056164927/home",
        "https://weibo.com/p/1005055241775605/home",
        "https://weibo.com/u/6369481847?is_hot=1",
        "https://weibo.com/u/2764013814",
        "https://weibo.com/p/1005055119549629/home",
        "https://weibo.com/u/6874207129?is_hot=1",
        "https://weibo.com/u/1856570454?is_all=1",
        "https://weibo.com/u/6433256474?is_hot=1",
        "https://weibo.com/u/6854091345?is_hot=1",
        "https://weibo.com/p/1005055706502018/home",
        "https://weibo.com/u/3916296864?is_hot=1",
        "https://weibo.com/u/6758410336?is_hot=1",
        "https://weibo.com/u/6095636964?is_hot=1",
        "https://weibo.com/u/5627013949?is_all=1",
        "https://weibo.com/u/3607576112?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/p/1005052004301745/home",
        "https://weibo.com/p/1005051891287691/home",
        "https://weibo.com/u/1874437190?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/3307996457?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/2636564637?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/6852655070?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/5589574529?is_hot=1",
        "https://weibo.com/p/1005051861506152/home",
        "https://weibo.com/u/2135443220?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/p/1005055407180113/home",
        "https://weibo.com/u/7343693352?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/p/1005056064161260/home",
        "https://weibo.com/p/1005056347228074/home",
        "https://weibo.com/u/1293633660?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/6567254124?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/3934401200?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/p/1005055270614973/home",
        "https://weibo.com/u/6807978304?refer_flag=1001030103_&is_hot=1#_0",
        "https://weibo.com/u/1931193225?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/2501778585?refer_flag=1001030103_&is_hot=1",
        "https://weibo.com/u/6857168590?display=0&retcode=6102&sudaref=passport.weibo.com&is_hot=1",
        "https://weibo.com/u/2307946431?is_hot=1",
        "https://weibo.com/p/1005056228248712/home",
        "https://weibo.com/u/2189983640?refer_flag=1001030103_&is_hot=1",
        "https://weibo.com/p/1005051868643982/home",
        "https://weibo.com/u/6343475414?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/3912360020?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/1697399444?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/6962283304?is_hot=1",
        "https://weibo.com/u/3937105056?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/p/1005055903923679/home",
        "https://weibo.com/u/2267627614?profile_ftype=1&is_all=1#_0",
        "https://weibo.com/u/2918298864?is_hot=1#1609235796088",
        "https://weibo.com/p/1005052154909694/home",
        "https://weibo.com/u/3854719915?is_hot=1",
        "https://weibo.com/p/1005056323607436/home",
        "https://weibo.com/u/6076861345?is_hot=1",
        "https://weibo.com/u/6130461637?is_hot=1",
        "https://weibo.com/p/1005053602744927/home",
        "https://weibo.com/u/5884212886?is_hot=1",
        "https://weibo.com/u/1843074012?from=page_100505_profile&wvr=6&mod=myfollowhisfan&refer_flag=1005050010_&is_all=1",
        "https://weibo.com/u/5356845303?is_hot=1",
        "https://weibo.com/u/5623052931?refer_flag=1001030103_&is_hot=1#1609745544373",
        "https://weibo.com/u/3236957071?topnav=1&wvr=6&topsug=1&is_hot=1",
        "https://weibo.com/u/5147711482?refer_flag=1005050006_",
        "https://weibo.com/u/6628617667?refer_flag=1005050006_",
        "https://weibo.com/u/6855680230?refer_flag=1005050006_",
        "https://weibo.com/u/5836153857?refer_flag=1005050006_&is_hot=1",
        "https://weibo.com/u/5145935171?refer_flag=1005050006_",
        "https://weibo.com/u/5143063731?refer_flag=1005050006_",
        "https://weibo.com/u/5144888803?refer_flag=1005050006_",
        "https://weibo.com/u/6431905918?refer_flag=1005050006_",
        "https://weibo.com/u/7048594049",
    ]
    import redis

    rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=20, decode_responses=True)
    for url in url_list:
        res = test.releaser_page(url, output_to_es_raw=True,
                                 es_index='crawler-data-raw',
                                 releaser_page_num_max=20, proxies_num=0)
        for r in res:
            print(r)
            # store each record in the "weibo" hash under its doc_id; the
            # result is not bound to `res`, which is still being iterated
            rds.hset("weibo", key=r["doc_id"], value=json.dumps(r))
    # for u in url_list:
    #     test.releaser_page_by_time(1590940800000, 1595468554268, u, output_to_es_raw=False,
    #                                es_index='crawler-data-raw',
    #                                doc_type='doc', releaser_page_num_max=4000)
    # test.get_single_page(4524055937468233)