# -*- coding: utf-8 -*-
"""
Created on Wed Aug 22 09:30:20 2018

@author: fangyucheng
"""

import time

import requests
from bs4 import BeautifulSoup

from crawler.crawler_sys.utils.trans_strtime_to_timestamp import trans_strtime_to_timestamp

cookie = ('YYID=2FFBDAA6D4FBA37438F4067C8123E98B; IMEVER=8.5.0.1322;'
          'SUID=3D03FF723865860A59795A5F000BB71F;'
          'SUV=00C039A172FF033D5993ADBD770E7410; usid=lF0F7il0yWbXF5c9;'
          'IPLOC=CN1100; sct=11; SMYUV=1512954490386200;'
          'ad=19fxxkllll2zKxvnlllllVHr6$UllllltsDRlyllll9llllljgDll5@@@@@@@@@@;'
          'SNUID=D0DE5A671A1E68C31FB628911B8277A5; wuid=AAGPcSphIAAAAAqLE2OSTQgAGwY=;'
          'UM_distinctid=16449b02797449-0c5d9293f4a833-143f7040-1fa400-16449b02799881;'
          'CXID=794EC592A14CE76F5DF3F3A3BDDDD787;'
          'ld=Kyllllllll2bWX10QTIdJOHDsvSbWX1uK94Vhkllll9lllllVklll5@@@@@@@@@@;'
          'cd=1534754086&17502a3f56c02f72dfd43a17cbb19663;'
          'rd=Vyllllllll2bBEqoQLWCNCHfKv2bWX1uzX0atkllllwllllRVllll5@@@@@@@@@@;'
          'LSTMV=173%2C72; LCLKINT=1570')

headers = {'Host': 'news.sogou.com',
           'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) '
                          'Gecko/20100101 Firefox/61.0'),
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
           'Accept-Encoding': 'gzip, deflate',
           'Cookie': cookie,
           'Connection': 'keep-alive',
           'Upgrade-Insecure-Requests': '1',
           'Cache-Control': 'max-age=0'}


def sogou_info_page(keyword):
    """Crawl the first 10 result pages of Sogou news search for the given keyword.

    Returns a list of dicts, one per news item, containing title, url, source,
    release_time, fetch_time, content, similar_news and the search keyword.
    """
    result_lst = []
    for page_num in range(1, 11):
        search_url = ('http://news.sogou.com/news?&query=' + keyword
                      + '&page=' + str(page_num))
        get_page = requests.get(search_url, headers=headers)
        page = get_page.text
        soup = BeautifulSoup(page, 'html.parser')
        # each search result sits in a div with class 'vrwrap'
        news_lst = soup.find_all('div', {'class': 'vrwrap'})
        for line in news_lst:
            try:
                title = line.div.h3.a.text
                url = line.div.h3.a['href']
                # source and release time share one <p>, separated by non-breaking spaces
                source_and_release_time = line.find('p', {'class': 'news-from'}).text
                source_and_release_time_lst = source_and_release_time.split('\xa0')
                source = source_and_release_time_lst[0]
                release_time_str = source_and_release_time_lst[-1]
                release_time = trans_strtime_to_timestamp(release_time_str)
                try:
                    content = line.find('span').text
                except AttributeError:
                    print('no content at %s' % title)
                    content = 'missing'
                fetch_time = int(time.time() * 1000)
                try:
                    similar_news = line.find('a', {'id': 'news_similar'}).text
                except AttributeError:
                    print('no similar news at %s' % title)
                    similar_news = 'missing'
                news_info = {'title': title,
                             'url': url,
                             'source': source,
                             'release_time': release_time,
                             'fetch_time': fetch_time,
                             'content': content,
                             'similar_news': similar_news,
                             'keyword': keyword}
                result_lst.append(news_info)
                print('get data at page %s' % page_num)
            except Exception:
                print('the error occurred at position %s' % news_lst.index(line))
    return result_lst


if __name__ == '__main__':
    keyword = '中超'
    test_sogou = sogou_info_page(keyword)