# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 20:13:21 2018

@author: fangyucheng
"""

import copy
import re
# import rsa
import time
import json
import urllib.parse
import base64
import binascii
import datetime
import requests
from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id


class Crawler_weibo():

    def __init__(self, timeout=None, platform='weibo'):
        self.platform = "weibo"
        # store the timeout so releaser_page can pass it to retry_get_url
        self.timeout = timeout
        self.session = requests.Session()
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                        'Connection': 'keep-alive',
                        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                       '(KHTML, like Gecko) Chrome/57.0.2128.59 Safari/537.36')}
        std_fields = Std_fields_video()
        self.video_data = std_fields.video_data
        self.video_data['platform'] = self.platform
        # remove fields that crawled data don't have
        pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
        for popk in pop_key_Lst:
            self.video_data.pop(popk)

#    def manipulate_login(self, user_name, password):
#        # append the obtained cookie to the cookie_pool file
#        cookie_pool = open('cookie_pool', 'a', encoding='utf-8')
#
#        # encode the user name
#        user_name_quote = urllib.parse.quote_plus(user_name)
#        user_name_base64 = base64.b64encode(user_name_quote.encode('utf-8'))
#        user_name_b64 = user_name_base64.decode('utf-8')
#
#        # fetch the four parameters: servertime, pubkey, rsakv, nonce
#        current_time = int(time.time() * 1000)
#        login_url_first_part = 'http://login.sina.com.cn/sso/prelogin.php?'
#        login_url_dic = {'entry': 'weibo',
#                         'callback': 'sinaSSOController.preloginCallBack',
#                         'su': user_name_b64,
#                         'rsakt': 'mod',
#                         'checkpin': '1',
#                         'client': 'ssologin.js(v1.4.18)',
#                         '_': current_time}
#        login_url_second_part = urllib.parse.urlencode(login_url_dic)
#        login_url = login_url_first_part + login_url_second_part
#        get_page = requests.get(login_url)
#        get_page.encoding = 'utf-8'
#        page = get_page.text
#        page_rep = page.replace('sinaSSOController.preloginCallBack', '')
#        page_dic = eval(page_rep)
#        pubkey = page_dic['pubkey']
#        servertime = page_dic['servertime']
#        rsakv = page_dic['rsakv']
#        nonce = page_dic['nonce']
#
#        # build the rsa-encrypted password
#        rsa_pubkey = int(pubkey, 16)
#        key = rsa.PublicKey(rsa_pubkey, 65537)
#        message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
#        message = message.encode("utf-8")
#        password_rsa = rsa.encrypt(message, key)
#        password_bi = binascii.b2a_hex(password_rsa)
#
#        # log in via POST and obtain the cookie
#        post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
#        post_data_dic = {'encoding': 'UTF-8',
#                         'entry': 'weibo',
#                         'from': '',
#                         'gateway': '1',
#                         'nonce': nonce,
#                         'pagerefer': "",
#                         'prelt': 67,
#                         'pwencode': 'rsa2',
#                         "returntype": "META",
#                         'rsakv': rsakv,
#                         'savestate': '7',
#                         'servertime': servertime,
#                         'service': 'miniblog',
#                         'sp': password_bi,
#                         'sr': '1920*1080',
#                         'su': user_name_b64,
#                         'useticket': '1',
#                         'vsnf': '1',
#                         'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&display=0&'}
#
#        logining_page = self.session.post(post_url, data=post_data_dic, headers=self.headers)
#        login_loop = logining_page.content.decode("GBK")
#
#        if '正在登录' in login_loop or 'Signing in' in login_loop:
#            cookie = logining_page.cookies.get_dict()
#            print(cookie, type(cookie))
#            current_time = int(time.time() * 1000)
#            cookie_dic = {'cookie': cookie,
#                          'current_time': current_time}
#            cookie_json = json.dump(cookie_dic, cookie_pool)
#            print('got cookie in login process')
#            return cookie
#        else:
#            print('post failed, suggest to login again')

    def test_cookie(self, test_url=None, cookie=None, user_name=None, password=None):
        if test_url is None:
            test_url = 'https://weibo.com/1188203673/GuV3o9VYt'
        get_page = requests.get(test_url, cookies=cookie)
        page = get_page.text
        length = len(page)
        if length > 20000:
            print("page length is %s, cookie is valid" % length)
            return cookie
        else:
            print("invalid cookie, page length is only %s" % length)
            return None

    def get_weibo_info_from_search_page(self, retrieve_soup, cookie):
        try:
            weibo_id = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['mid']
            user_id_str = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['tbinfo']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            try:
                weibo_id_str = retrieve_soup.find('a', {'action-type': 'fl_menu'})['action-data']
                weibo_id = re.findall(r'\d+', weibo_id_str)[0]
                user_id_str = retrieve_soup.find('a', {'class': 'name_txt'})['usercard']
                user_id = re.findall(r'\d+', ' '.join(re.findall(r'id=\d+', user_id_str)))[0]
            except:
                weibo_id = None
                user_id = None
                print('id_error')
        try:
            user_url = retrieve_soup.find('div', {'class': 'face'}).a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'face'}).a['title']
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('p', {'class': 'comment_txt'}).text
            weibo_content = weibo_content.strip('\n').strip()
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # TODO: super-topic posts are not distinguished, emoji are not extracted,
        # and images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'W_textb'})['date']
        except:
            try:
                release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
            except:
                release_time = None
                print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'class': 'W_textb'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        try:
            repost_count = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).em.text
        except:
            repost_count = 0
        try:
            comment_count = retrieve_soup.find('a', {'action-type': 'feed_list_comment'}).em.text
        except:
            comment_count = 0
        try:
            favorite_count = retrieve_soup.find('a', {'action-type': 'feed_list_like'}).em.text
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_id': user_id,
                      'user_url': user_url,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'come_from': come_from,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info

    def get_repost_info(self, retrieve_soup):
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_id_str = retrieve_soup.div.a['usercard']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            user_id = None
            print('user_id error')
        try:
            user_url = retrieve_soup.div.a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'WB_text'}).a.text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('span', {'node-type': 'text'}).text
            weibo_content = weibo_content.strip('\n').strip()
        except:
            weibo_content = None
            print('weibo_content error')
        if weibo_content is not None and '//' in weibo_content:
            try:
                parent_lst = weibo_content.split('//')
                pattern = '@.*:'
                parent_weibo = re.findall(pattern, parent_lst[1])[0].replace(':', '').replace('@', '')
            except:
                # the original fallback repeated the same regex and would fail again;
                # treat a failed match as "no parent weibo found"
                parent_weibo = None
        else:
            parent_weibo = None
        # TODO: super-topic posts are not distinguished, emoji are not extracted,
        # and images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            repost_count_str = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).text
            repost_count_lst = re.findall(r'\d+', repost_count_str)
            if repost_count_lst != []:
                repost_count = repost_count_lst[0]
            else:
                repost_count = 0
        except:
            repost_count = 0
        try:
            favorite_count_str = retrieve_soup.find('span', {'node-type': 'like_status'}).text
            favorite_count_str = favorite_count_str.replace('ñ', '')
            try:
                favorite_count_lst = re.findall(r'\d+', favorite_count_str)
                if favorite_count_lst != []:
                    favorite_count = favorite_count_lst[0]
                else:
                    favorite_count = 0
            except:
                favorite_count = 0
                print('favorite_count is zero')
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        repost_info = {'weibo_id': weibo_id,
                       'user_id': user_id,
                       'user_url': user_url,
                       'user_nickname': user_nickname,
                       'weibo_content': weibo_content,
                       'parent_weibo': parent_weibo,
                       'release_time': release_time,
                       'weibo_url': weibo_url,
                       'repost_count': repost_count,
                       'favorite_count': favorite_count,
                       'fetch_time': fetch_time}
        return repost_info

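    # get_user_weibo_info below parses one feed_list_item block from a user's
    # desktop profile page; like the two parsers above it returns a plain dict
    # and falls back to None / 0 for any field it cannot extract.
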
    def get_user_weibo_info(self, retrieve_soup, cookie):
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_nickname = retrieve_soup.find('a', {'class': 'W_f14'}).text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('div', {'class': 'WB_text'}).text
            weibo_content = weibo_content.strip('\n').strip()
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # TODO: super-topic posts are not distinguished, emoji are not extracted,
        # and images are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'S_txt2'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            # the href is site-relative, so prepend the host; the original code used
            # str.join here by mistake, which interleaved "weibo.com" between every
            # character of the href
            weibo_url = 'https://weibo.com' + retrieve_soup.find('a', {'class': 'S_txt2'})['href']
            print(weibo_url)
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        try:
            repost_count_lst = retrieve_soup.find('span', {'node-type': 'forward_btn_text'}).find_all('em')
            for line in repost_count_lst:
                try:
                    repost_count = int(line.text)
                except:
                    repost_count = 0
        except:
            repost_count = 0
        try:
            comment_count_lst = retrieve_soup.find('span', {'node-type': 'comment_btn_text'}).find_all('em')
            for line in comment_count_lst:
                try:
                    comment_count = int(line.text)
                except:
                    comment_count = 0
        except:
            comment_count = 0
        try:
            favorite_count_lst = retrieve_soup.find('span', {'node-type': 'like_status'}).find_all('em')
            for line in favorite_count_lst:
                try:
                    favorite_count = int(line.text)
                except:
                    favorite_count = 0
        except:
            favorite_count = 0
            print('favorite_count is zero')
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'come_from': come_from,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info

    def get_longtext(self, weibo_id, cookie):
        current_time = int(time.time() * 1000)
        longtext_url = ('https://weibo.com/p/aj/mblog/getlongtext?ajwvr=6&mid=' + weibo_id
                        + '&is_settop&is_sethot&is_setfanstop&is_setyoudao&__rnd=' + str(current_time))
        get_page = requests.get(longtext_url, headers=self.headers, cookies=cookie)
        try:
            page_dic = get_page.json()
            wait_for_soup = page_dic['data']['html']
            soup = BeautifulSoup(wait_for_soup, 'html.parser')
            longtext = soup.text
            return longtext
        except:
            print("can't get longtext")
            return ''

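    # The two methods below handle Weibo articles. get_single_article_page calls
    # card.weibo.com/article/m/aj/detail for one article id and maps the JSON
    # fields it observes (target_url, title, create_at, read_count, content,
    # userinfo) into this crawler's dict format; search_article_page walks the
    # m.weibo.cn search container API and feeds every article id it finds into
    # get_single_article_page.
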
    def get_single_article_page(self, article_id, keyword, proxies=0):
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            # "Cookie": "SINAGLOBAL=565010119549.1364.1559571258394; login_sid_t=85753e367d54782a25518436f329cfa0; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5712976583220.359.1595386386561; ULV=1595386386575:2:1:1:5712976583220.359.1595386386561:1592884354178; UOR=,,login.sina.com.cn; SSOLoginState=1595829153; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ46TE-isMWEvFjmXZnGFZ5JpX5KMhUgL.Fo2Re0zpShqfSoe2dJLoI7_e9gfadcvadcvad7tt; ALF=1627695088; SCF=AlrGNPCzM_VX3PzgxftYKkUv6Gj7FjmOVVbH8EpsTADeRxEeW-7_ipW8LVV7sGN-t7JJA-VwFKC2Ot0ZkHwHstE.; SUB=_2A25yJwQhDeRhGedG6FAQ9CjJzT-IHXVRVXLprDV8PUNbmtAKLRPmkW9NUVHbR2NjdmB2ZEtnFBK75m3CwwTzeqTJ; SUHB=08J6qQipU2qH8A; CARD-MAIN=cfec82595a1164dea323b2fb276c823f",
            "Host": "card.weibo.com",
            "Referer": "https://card.weibo.com/article/m/show/id/{0}?_wb_client_=1&open_source=weibo_search&luicode=10000011&lfid=100103type%3D21%26q%3D{1}%26t%3D0".format(article_id, urllib.parse.quote(keyword)),
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }
        url = "https://card.weibo.com/article/m/aj/detail?id={0}&_t={1}".format(
                article_id, int(datetime.datetime.now().timestamp() * 1e3))
        try:
            requests_res = retry_get_url(url, headers=headers, proxies=proxies)
            res_json = requests_res.json()
            # print(res_json)
            data = res_json["data"]
            video_dic = {}
            video_dic["url"] = data["target_url"]
            video_dic["title"] = data["title"]
            video_dic["fetch_time"] = int(datetime.datetime.now().timestamp() * 1e3)
            video_dic["release_time"] = trans_strtime_to_timestamp(data["create_at"])
            video_dic["play_count"] = trans_play_count(data["read_count"])
            video_dic["content"] = data["content"]
            video_dic["releaser"] = data["userinfo"].get('screen_name')
            video_dic["releaser_id"] = str(data["userinfo"].get('id'))
            video_dic["releaserUrl"] = data["userinfo"].get('url')
            video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
            video_dic["img_list"] = re.findall('img src="(.*?)"', data["content"])
            video_dic["mid"] = article_id
            return video_dic
        except Exception as e:
            print("single data row format error %s" % e)

    def search_article_page(self, keyword, search_pages_max=12,
                            output_to_es_raw=False,
                            output_to_es_register=False,
                            es_index=None,
                            doc_type=None, proxies_num=0):
        count_false = 0
        headers_search = {
            "Accept": "application/json, text/plain, */*",
            "MWeibo-Pwa": "1",
            "Referer": "https://m.weibo.cn/search?containerid=100103type=1&q={0}".format(urllib.parse.quote(keyword)),
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
            "X-Requested-With": "XMLHttpRequest",
            "X-XSRF-TOKEN": "65d497"
        }
        urls = []
        for page_num in range(0, search_pages_max):
            url = ('https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D21%26q%3D{0}%26t%3D0'
                   '&page_type=searchall&page={1}').format(urllib.parse.quote(keyword), page_num + 1)
            urls.append(url)
        weibo_Lst = []
        for search_page_url in urls:
            get_page = retry_get_url(search_page_url, headers=headers_search)
            if get_page.status_code != 200:
                # retry once
                get_page = requests.get(search_page_url)
                if get_page.status_code != 200:
                    continue
            page_dict = get_page.json()
            while page_dict['data'].get("msg") == '这里还没有内容':
                get_page = retry_get_url(search_page_url, headers=headers_search)
                page_dict = get_page.json()
                count_false += 1
                if count_false >= 3:
                    # give up on this page instead of retrying forever
                    break
            cards = page_dict['data'].get("cards")
            if cards and cards[0].get("card_group"):
                for one_line in cards[0].get("card_group"):
                    try:
                        title = one_line['title_sub']
                        # abstract = one_line['abstract']
                        # url = one_line['article_url']
                        # play_count = one_line['read_count']
                        # comment_count = one_line['comment_count']
                        # favorite_count = one_line['digg_count']
                        article_id = re.findall(r"(\d+)", one_line['scheme'])[0]
                        # releaser = one_line['media_name']
                        # uid = one_line['user_id']
                        # releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
                        # release_time = one_line['publish_time']
                        # release_time = int(int(release_time) * 1e3)
                        fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
                        # releaser_id = self.get_releaser_id(releaserUrl)
                        D0 = copy.deepcopy(self.video_data)
                        D0['title'] = title
                        # D0['abstract'] = abstract
                        # D0['url'] = url
                        # D0['play_count'] = play_count
                        # D0['comment_count'] = comment_count
                        # D0['favorite_count'] = favorite_count
                        D0['mid'] = article_id
                        # D0['releaser'] = releaser
                        # D0['releaserUrl'] = releaserUrl
                        # D0['release_time'] = release_time
                        # D0['releaser_id_str'] = "toutiao_%s" % releaser_id
                        D0['fetch_time'] = fetch_time
                        D0['search_word'] = keyword
                        D0["type"] = "article"
                        try:
                            article_info = self.get_single_article_page(article_id, keyword,
                                                                        proxies=proxies_num)
                            D0.update(article_info)
                        except Exception as e:
                            print("method get_single_article_page error %s" % e)
                            continue
                        # print(D0)
                        weibo_Lst.append(D0)
                    except KeyError:
                        # It is fine to drop this card; the search API sometimes
                        # returns loosely related results that lack these fields.
                        continue
            else:
                break
            if len(weibo_Lst) >= 100:
                output_result(result_Lst=weibo_Lst,
                              platform=self.platform,
                              output_to_es_raw=output_to_es_raw,
                              output_to_es_register=output_to_es_register,
                              es_index=es_index,
                              doc_type=doc_type)
                weibo_Lst.clear()
        if weibo_Lst != []:
            output_result(result_Lst=weibo_Lst,
                          platform=self.platform,
                          output_to_es_raw=output_to_es_raw,
                          output_to_es_register=output_to_es_register,
                          es_index=es_index,
                          doc_type=doc_type)
        return weibo_Lst

    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    def search_page(self, keyword, search_pages_max=30,
                    output_to_es_raw=False,
                    output_to_es_register=False,
                    es_index=None,
                    doc_type=None, proxies_num=0):
        self.search_article_page(keyword, search_pages_max=search_pages_max,
                                 output_to_es_raw=output_to_es_raw,
                                 output_to_es_register=output_to_es_register,
                                 es_index=es_index,
                                 doc_type=doc_type,
                                 proxies_num=proxies_num)

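    # repost_page and user_page below still rely on a logged-in desktop-site
    # cookie. manipulate_login is currently commented out, so repost_page will
    # fail unless a cookie source is restored (for example by re-enabling
    # manipulate_login, or by reading a stored cookie from the cookie_pool file
    # the way user_page does).
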
    def repost_page(self, weibo_id, user_name, password):
        total_page = 0
        result_lst = []
        cookie = self.manipulate_login(user_name=user_name, password=password)
        # cookie = self.test_cookie(get_cookie)
        if cookie is not None:
            current_time = int(time.time() * 1000)
            repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id
                          + '&max_id=0&page=1&__rnd=' + str(current_time))
            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
            get_page.encoding = 'utf-8'
            try:
                page_dic = get_page.json()
                total_page = page_dic['data']['page']['totalpage']
                repost_info = page_dic['data']['html']
                repost_soup = BeautifulSoup(repost_info, 'html.parser')
                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                for line in repost_agg:
                    try:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost')
                    except:
                        print('one repost data error')
                        print(one_repost)
            except:
                print("can't get repost data")
            time.sleep(6)
        if cookie is not None and total_page != 0:
            for page_num in range(1, total_page + 1):
                current_time = int(time.time() * 1000)
                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=' + weibo_id
                              + '&max_id=0&page=' + str(page_num) + '&__rnd=' + str(current_time))
                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
                time.sleep(3)
                get_page.encoding = 'utf-8'
                try:
                    page_dic = get_page.json()
                    total_page = page_dic['data']['page']['totalpage']
                    repost_info = page_dic['data']['html']
                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                    for line in repost_agg:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost at %s' % page_num)
                        print(one_repost)
                except:
                    print("can't get repost data")
        if result_lst != []:
            return result_lst
        else:
            print("can't get repost data")
            return None

    def user_page(self, user_id, user_name, password):
        result_lst = []
        cookie_pool = open('cookie_pool', 'r', encoding='utf-8')
        for coo in cookie_pool:
            print(coo)
            cookie = json.loads(coo)
            # cookie = self.manipulate_login(user_name=user_name, password=password)
            # cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
            if cookie is not None:
                for page_num in range(1, 3):
                    first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
                                 '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
                    get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
                    get_page.encoding = 'utf-8'
                    page = get_page.text
                    soup = BeautifulSoup(page, 'html.parser')
                    sfa = soup.find_all('script')
                    find_content = ''
                    for line in sfa:
                        if 'Pl_Official_MyProfileFeed__' in str(line):
                            find_content = str(line)
                            # keep only the JSON payload inside the FM.view( ... )
                            # script wrapper so that json.loads can parse it
                            fm_view_match = re.findall(r'FM\.view\((.*)\)', find_content, flags=re.DOTALL)
                            if fm_view_match:
                                find_content = fm_view_match[0]
                    # print(find_content)
                    find_content_dic = json.loads(find_content)
                    content_for_soup = find_content_dic['html']
                    soup_content = BeautifulSoup(content_for_soup, 'html.parser')
                    weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
                    # time.sleep(15)
                    for line_count, line in enumerate(weibo_lst):
                        weibo_info = self.get_user_weibo_info(line, cookie)
                        weibo_info['user_id'] = user_id
                        weibo_info['user_url'] = 'https://weibo.com/' + user_id
                        result_lst.append(weibo_info)
                        print('get data at element page:%s pagebar:%s' % (page_num, line_count))
                    get_parameter = soup.find_all('script', {'type': 'text/javascript'})
                    for line in get_parameter:
                        if 'pid' in str(line) and 'oid' in str(line):
                            parameter_str = str(line)
                            parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
                            domain = re.findall(r'\d+', ''.join(re.findall(r"pid]=\d+", parameter_str)))[0]
                            special_id = re.findall(r'\d+', ''.join(re.findall(r"page_id]=\d+", parameter_str)))[0]
                    current_time = int(time.time() * 1000)
                    for pagebar in [0, 1]:
                        user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=' + domain
                                    + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar)
                                    + '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id
                                    + '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num)
                                    + '&pre_page=1&domain_op=' + domain + '&__rnd=' + str(current_time))
                        get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
                        get_page.encoding = 'utf-8'
                        try:
                            page_dic = get_page.json()
                            user_weibo_str = page_dic['data']
                            user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
                            user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
                            # time.sleep(15)
                            for line in user_weibo_agg:
                                try:
                                    weibo_info = self.get_user_weibo_info(line, cookie)
                                    weibo_info['user_id'] = user_id
                                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
                                    result_lst.append(weibo_info)
                                    print('get data at ajax page page_num:%s pagebar:%s' % (page_num, pagebar))
                                except:
                                    print('one weibo_info error')
                        except:
                            print('page error at page_num:%s pagebar:%s' % (page_num, pagebar))
        if result_lst != []:
            return result_lst
        else:
            print("can't get user page data")
            return None

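    # get_single_page below fetches the mobile detail page m.weibo.cn/status/<mid>
    # and pulls the full text plus repost/comment/attitude counts out of the
    # embedded "render_data" JSON; releaser_page uses it for long posts whose
    # raw_text is truncated in the feed response.
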
    @staticmethod
    def get_single_page(mid):
        url = "https://m.weibo.cn/status/%s" % mid
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            # "cookie": "_T_WM=68345544646; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=fd1a69; M_WEIBOCN_PARAMS=oid%3D4523948446845543%26luicode%3D20000061%26lfid%3D4528703037509890%26uicode%3D20000061%26fid%3D4523948446845543",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "same-origin",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        page_res = retry_get_url(url, headers=headers, proxies=0)
        page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
        page_json = json.loads(page_json_context)
        text = dehtml(page_json[0]["status"]["text"])
        repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
        comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
        favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
        return text, repost_count, comment_count, favorite_count

    def get_releaser_id(self, releaserUrl):
        return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)

    @staticmethod
    def get_img(data):
        img_list = []
        if data.get("pics"):
            for one in data.get("pics"):
                try:
                    img_list.append(one["large"]["url"])
                except Exception as e:
                    img_list.append(one["url"])
                    print("add img error %s" % e)
        return img_list

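    # releaser_page below pages through the m.weibo.cn container API
    # (containerid 107603<uid>): each response carries a "since_id" in
    # cardlistInfo which is fed back as a query parameter to fetch the next
    # page, and long posts (isLongText) are re-fetched through get_single_page.
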
"?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429" url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format( releaser_id, releaser_id, releaser_id, since_id) headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id) print('Page number: %d' % pagenum) try: if proxies_num: get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num) else: get_page = retry_get_url(url, headers=headers, timeout=self.timeout) except: get_page = None has_more = False if get_page and get_page.status_code == 200: try: page_json = get_page.json() total = page_json["data"]["cardlistInfo"]["total"] if pagenum > total: break since_id = page_json["data"]["cardlistInfo"]["since_id"] page_dic = page_json["data"].get("cards") except Exception as e: print("load data error %s" % e) continue if page_dic: for one in page_dic: try: mblog = one.get("mblog") mid = mblog.get("mid") forward_text = "" forward_user = "" if one.get("source") == "绿洲": text_type = "绿洲" elif mblog.get("retweeted_status"): text_type = "转发" forward_text = mblog.get("retweeted_status").get("raw_text") forward_user = mblog.get("retweeted_status").get("user").get("screen_name") else: text_type = one.get("source") if mblog.get("isLongText"): text, repost_count, comment_count, favorite_count = self.get_single_page(mid) else: text = mblog["raw_text"] res_dic = { "release_time": trans_strtime_to_timestamp(mblog["created_at"]), "fetch_time": int(datetime.datetime.now().timestamp() * 1e3), "url": one["scheme"], "releaser": mblog["user"]["screen_name"], "repost_count": trans_play_count(mblog["reposts_count"]), "comment_count": trans_play_count(mblog["comments_count"]), "favorite_count": trans_play_count(mblog["attitudes_count"]), "title": text.replace("\u200b", ""), "wb_type": text_type, "forward_user": forward_user, "forward_text": forward_text, "mid": mid, "releaserUrl": "https://www.weibo.com/u/%s" % releaser_id, "releaser_id_str": "weibo_%s" % releaser_id, "img_list": self.get_img(mblog), "platform": "weibo", # "doc_id":doc_id } res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic, doc_id_type="all-time-url") yield res_dic except Exception as e: print(json.dumps(mblog)) print("row formate error %s" % e) continue def get_releaser_follower_num(self, releaserUrl): pass def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs): count_false = 0 for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")): video_time = res["release_time"] # print(res) if video_time: if start_time < video_time: if video_time < end_time: yield res else: count_false += 1 if count_false > allow: break else: yield res if __name__ == '__main__': weibo = Crawler_weibo() # user_name = '7255925880' # password = 'Lemo1995' # keyword = '罗奕佳' # user_id = 'jianchuan' # weibo_id = '4273575663592672' # user_id = '1788283193' # test_search2 = weibo.search_page(keyword, user_name, password) # test_repost = weibo.repost_page(weibo_id, user_name, password) # user_page = weibo.user_page(user_id, user_name, password) weibo.search_page("迪丽热巴",output_to_es_register=True,es_index="crawler-data-raw",search_pages_max=1) # print(user_page)