# -*- coding: utf-8 -*-
"""
Created on Tue Aug 14 20:13:21 2018

@author: fangyucheng
"""

import re
import time
import json
import base64
import binascii
import urllib.parse

import rsa
import requests
from bs4 import BeautifulSoup


class Crawler_weibo():

    def __init__(self, timeout=None, platform='weibo'):
        self.session = requests.Session()
        self.headers = {'Accept': ('text/html,application/xhtml+xml,'
                                   'application/xml;q=0.9,*/*;q=0.8'),
                        'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                        'Connection': 'keep-alive',
                        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                                       'Chrome/57.0.2128.59 Safari/537.36')}

    def manipulate_login(self, user_name, password):
        """Log in through the Sina SSO endpoint and return the cookie dict.

        A successful cookie is also appended to the 'cookie_pool' file,
        one JSON object per line.
        """
        cookie_pool = open('cookie_pool', 'a', encoding='utf-8')

        # encode the user name: URL-quote, then base64
        user_name_quote = urllib.parse.quote_plus(user_name)
        user_name_base64 = base64.b64encode(user_name_quote.encode('utf-8'))
        user_name_b64 = user_name_base64.decode('utf-8')

        # fetch the four prelogin parameters: servertime, pubkey, rsakv, nonce
        current_time = int(time.time() * 1000)
        login_url_first_part = 'http://login.sina.com.cn/sso/prelogin.php?'
        login_url_dic = {'entry': 'weibo',
                         'callback': 'sinaSSOController.preloginCallBack',
                         'su': user_name_b64,
                         'rsakt': 'mod',
                         'checkpin': '1',
                         'client': 'ssologin.js(v1.4.18)',
                         '_': current_time}
        login_url = login_url_first_part + urllib.parse.urlencode(login_url_dic)
        get_page = requests.get(login_url)
        get_page.encoding = 'utf-8'
        page = get_page.text
        # the response is JSONP; drop the callback wrapper and parse the JSON
        page_rep = page.replace('sinaSSOController.preloginCallBack', '')
        page_dic = json.loads(page_rep.strip(' ();\r\n'))
        pubkey = page_dic['pubkey']
        servertime = page_dic['servertime']
        rsakv = page_dic['rsakv']
        nonce = page_dic['nonce']

        # build the password field: RSA over "servertime\tnonce\npassword",
        # then hex-encode the ciphertext
        rsa_pubkey = int(pubkey, 16)
        key = rsa.PublicKey(rsa_pubkey, 65537)
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
        message = message.encode('utf-8')
        password_rsa = rsa.encrypt(message, key)
        password_bi = binascii.b2a_hex(password_rsa)

        # POST the login form and collect the cookie
        post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
        post_data_dic = {'encoding': 'UTF-8',
                         'entry': 'weibo',
                         'from': '',
                         'gateway': '1',
                         'nonce': nonce,
                         'pagerefer': '',
                         'prelt': 67,
                         'pwencode': 'rsa2',
                         'returntype': 'META',
                         'rsakv': rsakv,
                         'savestate': '7',
                         'servertime': servertime,
                         'service': 'miniblog',
                         'sp': password_bi,
                         'sr': '1920*1080',
                         'su': user_name_b64,
                         'useticket': '1',
                         'vsnf': '1',
                         'url': ('http://weibo.com/ajaxlogin.php?framelogin=1&callback='
                                 'parent.sinaSSOController.feedBackUrlCallBack&display=0&')}
        logining_page = self.session.post(post_url, data=post_data_dic,
                                          headers=self.headers)
        login_loop = logining_page.content.decode('GBK')
        # the redirect stub reads '正在登录' ('Signing in') on success
        if '正在登录' in login_loop or 'Signing in' in login_loop:
            cookie = logining_page.cookies.get_dict()
            print(cookie, type(cookie))
            current_time = int(time.time() * 1000)
            cookie_dic = {'cookie': cookie, 'current_time': current_time}
            json.dump(cookie_dic, cookie_pool)
            cookie_pool.write('\n')
            cookie_pool.close()
            print('got cookie in login process')
            return cookie
        else:
            cookie_pool.close()
            print('post failed, suggest logging in again')

    def test_cookie(self, test_url=None, cookie=None,
                    user_name=None, password=None):
        """Heuristic cookie check: a logged-in page is far longer than the
        login redirect stub, so the page length serves as the signal."""
        if test_url is None:
            test_url = 'https://weibo.com/1188203673/GuV3o9VYt'
        get_page = requests.get(test_url, cookies=cookie)
        page = get_page.text
        length = len(page)
        if length > 20000:
            print("page length is %s, so the cookie works" % length)
            return cookie
        else:
            print("invalid cookie: page length is only %s" % length)
            return None
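    # A minimal standalone sketch of the 'sp' field built in manipulate_login,
    # assuming the ssologin.js v1.4.18 scheme used above: RSA-encrypt
    # servertime, nonce and the password joined by a tab and a newline, then
    # hex-encode the ciphertext. Not wired into the login flow.
    @staticmethod
    def encrypt_password_sketch(password, servertime, nonce, pubkey_hex):
        """Illustrative only; pubkey_hex comes from the prelogin response."""
        key = rsa.PublicKey(int(pubkey_hex, 16), 65537)  # exponent is fixed
        message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
        return binascii.b2a_hex(rsa.encrypt(message.encode('utf-8'), key))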
    def get_weibo_info_from_search_page(self, retrieve_soup, cookie):
        """Parse one result card from the search page into a dict."""
        try:
            weibo_id = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['mid']
            user_id_str = retrieve_soup.find('div', {'action-type': 'feed_list_item'})['tbinfo']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            try:
                weibo_id_str = retrieve_soup.find('a', {'action-type': 'fl_menu'})['action-data']
                weibo_id = re.findall(r'\d+', weibo_id_str)[0]
                user_id_str = retrieve_soup.find('a', {'class': 'name_txt'})['usercard']
                user_id = re.findall(r'\d+', ' '.join(re.findall(r'id=\d+', user_id_str)))[0]
            except:
                weibo_id = None
                user_id = None
                print('id_error')
        try:
            user_url = retrieve_soup.find('div', {'class': 'face'}).a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'face'}).a['title']
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('p', {'class': 'comment_txt'}).text
            weibo_content = weibo_content.strip('\n').strip()
            # truncated posts carry a '展开全文' ('show full text') link;
            # fetch the full body through the longtext endpoint
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # TODO: super-topic posts are not detected
        # TODO: emoji are not extracted
        # TODO: pictures are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'W_textb'})['date']
        except:
            try:
                release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
            except:
                release_time = None
                print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'class': 'W_textb'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        try:
            repost_count = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).em.text
        except:
            repost_count = 0
        try:
            comment_count = retrieve_soup.find('a', {'action-type': 'feed_list_comment'}).em.text
        except:
            comment_count = 0
        try:
            favorite_count = retrieve_soup.find('a', {'action-type': 'feed_list_like'}).em.text
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_id': user_id,
                      'user_url': user_url,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'come_from': come_from,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info
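    # A hypothetical convenience wrapper (not wired in above) showing the
    # lookup pattern the parsers repeat: find a node, read one attribute,
    # and fall back to a default when the tag or the attribute is missing.
    @staticmethod
    def _safe_attr_sketch(soup, name, attrs, key, default=None):
        """Illustrative only: e.g. _safe_attr_sketch(card, 'a',
        {'class': 'W_textb'}, 'date') mirrors the try/except blocks above."""
        try:
            return soup.find(name, attrs)[key]
        except (TypeError, KeyError):  # find() returned None / attr missing
            return default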
    def get_repost_info(self, retrieve_soup):
        """Parse one repost item from the repost list into a dict."""
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_id_str = retrieve_soup.div.a['usercard']
            user_id = re.findall(r'\d+', user_id_str)[0]
        except:
            user_id = None
            print('user_id error')
        try:
            user_url = retrieve_soup.div.a['href']
        except:
            user_url = None
            print('user_url error')
        try:
            user_nickname = retrieve_soup.find('div', {'class': 'WB_text'}).a.text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('span', {'node-type': 'text'}).text
            weibo_content = weibo_content.strip('\n').strip()
        except:
            weibo_content = None
            print('weibo_content error')
        # a repost chain looks like 'comment//@parent:original text';
        # the '@...:' fragment after the first '//' names the parent post
        if weibo_content is not None and '//' in weibo_content:
            try:
                parent_lst = weibo_content.split('//')
                parent_weibo = (re.findall('@.*:', parent_lst[1])[0]
                                .replace(':', '').replace('@', ''))
            except:
                parent_weibo = None
                print('parent_weibo error')
        else:
            parent_weibo = None
        # TODO: super-topic posts are not detected
        # TODO: emoji are not extracted
        # TODO: pictures are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            weibo_url = retrieve_soup.find('a', {'node-type': 'feed_list_item_date'})['href']
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            repost_count_str = retrieve_soup.find('a', {'action-type': 'feed_list_forward'}).text
            repost_count_lst = re.findall(r'\d+', repost_count_str)
            if repost_count_lst:
                repost_count = repost_count_lst[0]
            else:
                repost_count = 0
        except:
            repost_count = 0
        try:
            favorite_count_str = retrieve_soup.find('span', {'node-type': 'like_status'}).text
            # drop the 'ñ' glyph the like widget renders before the number
            favorite_count_str = favorite_count_str.replace('ñ', '')
            favorite_count_lst = re.findall(r'\d+', favorite_count_str)
            if favorite_count_lst:
                favorite_count = favorite_count_lst[0]
            else:
                favorite_count = 0
                print('favorite_count is zero')
        except:
            favorite_count = 0
        fetch_time = int(time.time() * 1000)
        repost_info = {'weibo_id': weibo_id,
                       'user_id': user_id,
                       'user_url': user_url,
                       'user_nickname': user_nickname,
                       'weibo_content': weibo_content,
                       'parent_weibo': parent_weibo,
                       'release_time': release_time,
                       'weibo_url': weibo_url,
                       'repost_count': repost_count,
                       'favorite_count': favorite_count,
                       'fetch_time': fetch_time}
        return repost_info

    def get_user_weibo_info(self, retrieve_soup, cookie):
        """Parse one post from a user's profile feed into a dict."""
        try:
            weibo_id = retrieve_soup['mid']
        except:
            weibo_id = None
            print('weibo_id error')
        try:
            user_nickname = retrieve_soup.find('a', {'class': 'W_f14'}).text
        except:
            user_nickname = None
            print('user_nickname error')
        try:
            weibo_content = retrieve_soup.find('div', {'class': 'WB_text'}).text
            weibo_content = weibo_content.strip('\n').strip()
            if '展开全文' in weibo_content:
                weibo_content = self.get_longtext(weibo_id, cookie)
        except:
            weibo_content = None
            print('weibo_content error')
        # TODO: super-topic posts are not detected
        # TODO: emoji are not extracted
        # TODO: pictures are not downloaded
        try:
            release_time = retrieve_soup.find('a', {'class': 'S_txt2'})['date']
        except:
            release_time = None
            print('release_time error')
        try:
            # the href is site-relative, so prefix the host
            weibo_url = 'https://weibo.com' + retrieve_soup.find('a', {'class': 'S_txt2'})['href']
            print(weibo_url)
        except:
            weibo_url = None
            print('weibo_url error')
        try:
            come_from = retrieve_soup.find('a', {'rel': 'nofollow'}).text
        except:
            come_from = None
            print("can't find come_from")
        # each button span holds several <em> tags (icon plus number);
        # keep the last one that parses as an integer, defaulting to 0
        repost_count = 0
        try:
            for em in retrieve_soup.find('span', {'node-type': 'forward_btn_text'}).find_all('em'):
                try:
                    repost_count = int(em.text)
                except ValueError:
                    pass
        except:
            repost_count = 0
        comment_count = 0
        try:
            for em in retrieve_soup.find('span', {'node-type': 'comment_btn_text'}).find_all('em'):
                try:
                    comment_count = int(em.text)
                except ValueError:
                    pass
        except:
            comment_count = 0
        favorite_count = 0
        try:
            for em in retrieve_soup.find('span', {'node-type': 'like_status'}).find_all('em'):
                try:
                    favorite_count = int(em.text)
                except ValueError:
                    pass
        except:
            favorite_count = 0
            print('favorite_count is zero')
        fetch_time = int(time.time() * 1000)
        weibo_info = {'weibo_id': weibo_id,
                      'user_nickname': user_nickname,
                      'weibo_content': weibo_content,
                      'come_from': come_from,
                      'release_time': release_time,
                      'weibo_url': weibo_url,
                      'repost_count': repost_count,
                      'comment_count': comment_count,
                      'favorite_count': favorite_count,
                      'fetch_time': fetch_time}
        return weibo_info
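    # A hypothetical helper (not wired in above) condensing the counter
    # parsing that get_user_weibo_info repeats three times: scan a button's
    # <em> children and keep the last one that parses as an integer.
    @staticmethod
    def _parse_count_sketch(button_span):
        """Illustrative only: return the count shown in a feed button."""
        count = 0
        if button_span is not None:
            for em in button_span.find_all('em'):
                try:
                    count = int(em.text)
                except ValueError:
                    pass  # icon <em> tags carry no number
        return count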
    def get_longtext(self, weibo_id, cookie):
        """Fetch the full text of a truncated post from the longtext endpoint."""
        current_time = int(time.time() * 1000)
        longtext_url = ('https://weibo.com/p/aj/mblog/getlongtext?ajwvr=6&mid='
                        + weibo_id + '&is_settop&is_sethot&is_setfanstop&'
                        'is_setyoudao&__rnd=' + str(current_time))
        get_page = requests.get(longtext_url, headers=self.headers, cookies=cookie)
        try:
            page_dic = get_page.json()
            wait_for_soup = page_dic['data']['html']
            soup = BeautifulSoup(wait_for_soup, 'html.parser')
            longtext = soup.text
            return longtext
        except:
            print("can't get longtext")
            return ''

    def search_page(self, keyword, user_name, password):
        """Crawl the search result pages for a keyword."""
        result_lst = []
        openfile = open('D:/python_code/crawler/crawler_sys/site_crawler/crawler_weibo/error5',
                        'a', encoding='utf-8')
        cookie = self.manipulate_login(user_name=user_name, password=password)
        # cookie = self.test_cookie(get_cookie)
        if cookie is None:
            print('no valid cookie')
            openfile.close()
            return ''
        for page_num in range(1, 2):
            search_url = 'http://s.weibo.com/weibo/' + keyword + '?b=1&page=' + str(page_num)
            get_page = requests.get(search_url, headers=self.headers, cookies=cookie)
            get_page.encoding = 'utf-8'
            page = get_page.text
            print(len(page))
            time.sleep(10)
            soup = BeautifulSoup(page, 'html.parser')
            sfa = soup.find_all('script')
            find_content = ''
            for line in sfa:
                if '"pid":"pl_weibo_direct"' in str(line):
                    find_content = str(line)
            if find_content != '':
                # the script body is a JS call whose single argument is the
                # JSON payload; slice from the first '{' to the last '}' to
                # recover it before parsing
                find_content = find_content[find_content.find('{'):find_content.rfind('}') + 1]
                find_content_dic = json.loads(find_content)
                content = find_content_dic['html']
                content_soup = BeautifulSoup(content, 'html.parser')
                weibo_lst = content_soup.find_all('div', {'class': 'WB_cardwrap'})
                for line in weibo_lst:
                    try:
                        weibo_info = self.get_weibo_info_from_search_page(line, cookie)
                        print('get_one_weibo_info')
                        result_lst.append(weibo_info)
                    except:
                        # keep the card that failed to parse for later debugging
                        openfile.write(str(line))
                        openfile.write('\n')
                        openfile.flush()
                        print('error')
        openfile.close()
        return result_lst

    def repost_page(self, weibo_id, user_name, password):
        """Crawl the repost list of one post, page by page."""
        total_page = 0
        result_lst = []
        cookie = self.manipulate_login(user_name=user_name, password=password)
        # cookie = self.test_cookie(get_cookie)
        if cookie is not None:
            # the first request mainly discovers totalpage
            current_time = int(time.time() * 1000)
            repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id='
                          + weibo_id + '&max_id=0&page=1&__rnd=' + str(current_time))
            get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
            get_page.encoding = 'utf-8'
            try:
                page_dic = get_page.json()
                total_page = page_dic['data']['page']['totalpage']
                repost_info = page_dic['data']['html']
                repost_soup = BeautifulSoup(repost_info, 'html.parser')
                repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                for line in repost_agg:
                    try:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost')
                    except:
                        print('one repost data error')
            except:
                print("can't get repost data")
            time.sleep(6)
        if cookie is not None and total_page != 0:
            # re-walk every page (page 1 included) now that totalpage is known
            for page_num in range(1, total_page + 1):
                current_time = int(time.time() * 1000)
                repost_url = ('https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id='
                              + weibo_id + '&max_id=0&page=' + str(page_num)
                              + '&__rnd=' + str(current_time))
                get_page = requests.get(repost_url, headers=self.headers, cookies=cookie)
                time.sleep(3)
                get_page.encoding = 'utf-8'
                try:
                    page_dic = get_page.json()
                    total_page = page_dic['data']['page']['totalpage']
                    repost_info = page_dic['data']['html']
                    repost_soup = BeautifulSoup(repost_info, 'html.parser')
                    repost_agg = repost_soup.find_all('div', {'action-type': 'feed_list_item'})
                    for line in repost_agg:
                        one_repost = self.get_repost_info(line)
                        result_lst.append(one_repost)
                        print('get one repost at %s' % page_num)
                        print(one_repost)
                except:
                    print("can't get repost data")
        if result_lst:
            return result_lst
        print("can't get repost data")
        return None
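    # Response shape read by repost_page above (a sketch of only the fields
    # the code consumes, not the full payload):
    #
    #     {"data": {"page": {"totalpage": <int>},
    #               "html": "<div action-type=\"feed_list_item\">...</div>"}}
    #
    # 'totalpage' drives the pagination loop; 'html' is handed to
    # BeautifulSoup and split into individual repost items.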
    def user_page(self, user_id, user_name, password):
        """Crawl the first pages of a user's profile feed.

        Cookies are read back from the 'cookie_pool' file, one JSON object
        per line as written by manipulate_login.
        """
        result_lst = []
        cookie = None
        cookie_pool = open('cookie_pool', 'r', encoding='utf-8')
        for coo in cookie_pool:
            print(coo)
            # each line holds {'cookie': {...}, 'current_time': ...};
            # keep the cookie dict from the last (newest) line
            cookie = json.loads(coo)['cookie']
        cookie_pool.close()
        # cookie = self.manipulate_login(user_name=user_name, password=password)
        # cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
        if cookie is not None:
            for page_num in range(1, 3):
                # the server-rendered part of the feed comes with the page itself
                first_url = ('https://weibo.com/u/' + user_id
                             + '?visible=0&is_all=1&is_tag=0'
                             '&profile_ftype=1&page=' + str(page_num) + '#feedtop')
                get_page = requests.get(first_url, headers=self.headers, cookies=cookie)
                get_page.encoding = 'utf-8'
                page = get_page.text
                soup = BeautifulSoup(page, 'html.parser')
                sfa = soup.find_all('script')
                find_content = ''
                for line in sfa:
                    if 'Pl_Official_MyProfileFeed__' in str(line):
                        find_content = str(line)
                # the feed is embedded as a JS call whose argument is the JSON
                # payload; slice from the first '{' to the last '}' to recover it
                find_content = find_content[find_content.find('{'):find_content.rfind('}') + 1]
                # print(find_content)
                find_content_dic = json.loads(find_content)
                content_for_soup = find_content_dic['html']
                soup_content = BeautifulSoup(content_for_soup, 'html.parser')
                weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
                # time.sleep(15)
                for line_count, line in enumerate(weibo_lst):
                    weibo_info = self.get_user_weibo_info(line, cookie)
                    weibo_info['user_id'] = user_id
                    weibo_info['user_url'] = 'https://weibo.com/' + user_id
                    result_lst.append(weibo_info)
                    print('get data at element page:%s pagebar:%s' % (page_num, line_count))
                # the rest of the feed loads lazily; read domain and page_id
                # out of the page config to call the mbloglist AJAX endpoint
                get_parameter = soup.find_all('script', {'type': 'text/javascript'})
                for line in get_parameter:
                    if 'pid' in str(line) and 'oid' in str(line):
                        parameter_str = str(line)
                        parameter_str = parameter_str.replace('\r', '').replace('\n', '').replace("\'", '')
                        domain = re.findall(r'\d+', ''.join(re.findall(r'pid]=\d+', parameter_str)))[0]
                        special_id = re.findall(r'\d+', ''.join(re.findall(r'page_id]=\d+', parameter_str)))[0]
                current_time = int(time.time() * 1000)
                for pagebar in [0, 1]:
                    user_url = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=' + domain
                                + '&profile_ftype=1&is_all=1&pagebar=' + str(pagebar)
                                + '&pl_name=Pl_Official_MyProfileFeed__22&id=' + special_id
                                + '&script_uri=/' + user_id + '&feed_type=0&page=' + str(page_num)
                                + '&pre_page=1&domain_op=' + domain
                                + '&__rnd=' + str(current_time))
                    get_page = requests.get(user_url, headers=self.headers, cookies=cookie)
                    get_page.encoding = 'utf-8'
                    try:
                        page_dic = get_page.json()
                        user_weibo_str = page_dic['data']
                        user_weibo_soup = BeautifulSoup(user_weibo_str, 'html.parser')
                        user_weibo_agg = user_weibo_soup.find_all('div', {'action-type': 'feed_list_item'})
                        # time.sleep(15)
                        for line in user_weibo_agg:
                            try:
                                weibo_info = self.get_user_weibo_info(line, cookie)
                                weibo_info['user_id'] = user_id
                                weibo_info['user_url'] = 'https://weibo.com/' + user_id
                                result_lst.append(weibo_info)
                                print('get data at ajax page page_num:%s pagebar:%s' % (page_num, pagebar))
                            except:
                                print('one weibo_info error')
                    except:
                        print('page error at page_num:%s pagebar:%s' % (page_num, pagebar))
        if result_lst:
            return result_lst
        print("can't get user data")
        return None
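# A hypothetical module-level helper (not used by the class) showing how to
# read back the newline-delimited JSON that manipulate_login appends to
# 'cookie_pool': each line holds {"cookie": {...}, "current_time": <ms>}.
def load_cookies_sketch(path='cookie_pool'):
    """Illustrative only: yield each stored cookie dict, oldest first."""
    with open(path, 'r', encoding='utf-8') as pool:
        for line in pool:
            line = line.strip()
            if line:
                yield json.loads(line)['cookie']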
print("can't get repost data") return None if __name__ == '__main__': weibo = Crawler_weibo() user_name = '7255925880' password = 'Lemo1995' # keyword = '罗奕佳' # user_id = 'jianchuan' # weibo_id = '4273575663592672' user_id = '1788283193' # test_search2 = weibo.search_page(keyword, user_name, password) # test_repost = weibo.repost_page(weibo_id, user_name, password) user_page = weibo.user_page(user_id, user_name, password) print(user_page)