Commit ffd0efe4 authored by litaolemo

update

parent d3373a11
# crawler
## Releaser page crawler
1. Deployed on BJ-GM-Prod-Cos-faiss001 under /srv/apps/
2. Switch to the service user: sudo su - gmuser
3. Activate conda: source /root/anaconda3/bin/activate
4. Enter / leave the virtual environment: conda activate crawler_env / conda deactivate
5. Fetch task: nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &
6. Write crawl URLs to redis: python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
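
Taken together, steps 2-6 correspond roughly to the following shell session (a sketch: it assumes the crawler_env conda environment already exists and that gmuser can write to /data/log/):

```bash
# become the service user and activate the crawler environment
sudo su - gmuser
source /root/anaconda3/bin/activate
conda activate crawler_env

# start the fetch workers in the background, logging to the path above
nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &

# write crawl URLs into redis for the weibo platform (flags as listed in step 6)
python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2

# leave the environment when the session is done
conda deactivate
```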
## Search page crawler
@@ -401,4 +401,11 @@ def task_main():
if __name__ == "__main__":
task_main()
from concurrent.futures import ProcessPoolExecutor
executor = ProcessPoolExecutor(max_workers=4)
futures = []
# submit four parallel task_main workers to the pool
for process_num in range(4):
future = executor.submit(task_main)
futures.append(future)
print('Process %s start' % process_num)
executor.shutdown(True)
@@ -19,10 +19,7 @@ import re
import time
import urllib
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
import requests
@@ -155,52 +152,6 @@ class Crawler_toutiao():
host_str = None
return host_str
def video_page_via_m(self, url):
url_video_id_midstep = ' '.join(re.findall('com/.*', url)).replace('com/', '')
url_video_id = re.findall('\d+', url_video_id_midstep)[0]
# mobile_url = 'https://m.365yg.com/i'+url_video_id+'/info/?'
mobile_url = 'https://m.365yg.com/i' + url_video_id
get_page = retry_get_url(mobile_url)
if get_page is None:
return None
else:
page = get_page.text
page = page.replace('true', 'True')
page = page.replace('false', 'False')
page = page.replace('null', '"Null"')
try:
page_dic = eval(page)
except:
page_dic = None
print('Failed to transfer text to dict on url: %s' % url)
return None
video_dict = copy.deepcopy(self.video_data)
try:
video_dic = page_dic['data']
title = video_dic['title']
releaser = video_dic['source']
releaser_id = video_dic['creator_uid']
releaserUrl = self.releaser_url_pattern.replace('[RELEASER_ID]',
str(releaser_id))
play_count = video_dic['video_play_count']
comment_count = video_dic['comment_count']
release_time = int(video_dic['publish_time'] * 1e3)
video_id = video_dic['video_id']
fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
#
video_dict['title'] = title
video_dict['url'] = url
video_dict['play_count'] = play_count
video_dict['comment_count'] = comment_count
video_dict['video_id'] = video_id
video_dict['releaser'] = releaser
video_dict['releaser_id_str'] = str(releaser_id)
video_dict['releaserUrl'] = releaserUrl
video_dict['release_time'] = release_time
video_dict['fetch_time'] = fetch_time
except:
print('Failed when extracting data from page of url: %s' % url)
return video_dict
def video_page(self, url):
"""
@@ -340,34 +291,97 @@ class Crawler_toutiao():
return None
return video_dict
def get_web_article_info(self, article_id, proxies_num=0):
# headers = {
# "Accept": "*/*",
# "Accept-Encoding": "gzip, deflate",
# "Accept-Language": "zh,zh-CN;q=0.9",
# "Connection": "keep-alive",
# # "Cookie": "tt_webid=6851461299689686542; SLARDAR_WEB_ID=568d391e-7f96-491b-9557-b045a55e9dd8",
# "Host": "m.toutiao.com",
# # "Referer": "https://m.toutiao.com/i6851146167279944199/",
# "Sec-Fetch-Dest": "empty",
# "Sec-Fetch-Mode": "cors",
# "Sec-Fetch-Site": "same-origin",
# "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
# }
# headers["Referer"] = "https://m.toutiao.com/i%s" % article_id
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"Cookie": "tt_webid=6851788569271944719",
"Host": "m.toutiao.com",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
}
url = "https://m.toutiao.com/i{0}/info/?i={1}".format(article_id, article_id)
requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
res_json = requests_res.json()
res_dic = {
"title": res_json["data"].get("title").replace("\r", "").replace("\n", ""),
'high_quality_flag': int(res_json["data"].get('high_quality_flag')),
"play_count": int(res_json["data"].get('impression_count')),
"comment_count": res_json["data"].get("comment_count"),
"repost_count": res_json["data"].get("repost_count"),
"favorite_count": res_json["data"].get("digg_count"),
'releaser_followers_count': res_json["data"].get("follower_count"),
'release_time': int(res_json["data"].get('publish_time') * 1e3),
"content": res_json["data"].get("content").replace("\r", "").replace("\n", ""),
"img_list": re.findall('img src=".*?"', res_json["data"].get("content"))
}
return res_dic
def search_page_old(self, keyword, search_pages_max=12,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None):
doc_type=None,proxies_num=0):
headers_search = {
"accept": "application/json, text/javascript",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh,zh-CN;q=0.9",
"sec-fetch-mode": "cors",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
"content-type": "application/x-www-form-urlencoded",
"cookie": r'csrftoken=301d4862d95090ad520f8a54ae360b93; uuid="w:79cdae1ec41c48c9b9cd21255077f629"; _ga=GA1.2.161734802.1557472235; sid_guard="135fa6481579978c777871e6fd64388b|1561603571|15552000|Tue\054 24-Dec-2019 02:46:11 GMT"; tt_webid=6722360764079998478; CNZZDATA1259612802=1042288349-1556155453-%7C1565168219; _ba=BA0.2-20191118-51299-hQirR6SUpyfp1u9bfDab; tt_webid=6722360764079998478; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16eef18c3b72df-0f53ff0c8ec3e8-72256d3c-5c030-16eef18c3b8634; s_v_web_id=335bc73471816523ac088af5f1424861; __tasessionId=dofpfb6fv1579159599726',
"referer": "https://www.toutiao.com/search/?keyword=%s" % urllib.parse.quote(keyword.encode('utf-8')),
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
# "cookie": "csrftoken=37420c8aa08013294281c3f0b053377d; WEATHER_CITY=%E5%8C%97%E4%BA%AC; SLARDAR_WEB_ID=7a399564-d37b-40eb-ad7e-bd04a4b7a43c; ttcid=01d94567f6f644248be6cfba11f23d8640; s_v_web_id=verify_kcvob60z_zV3lvb5j_dO3z_42Np_A2aO_h2hUwNM55Jt0; tt_webid=6855452651430823432; __tasessionId=xghnx07iq1596159468729; tt_webid=6855452651430823432; tt_scid=8KvMkZv-mN4OUclzXS7.9-pl0T409L4rqvI.Y2c0gwRvwMdRjm7SczvFbCGtzdgAcc7e",
"referer": "https://www.toutiao.com/search/?keyword=%s" % keyword.encode('utf-8'),
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"x-requested-with": "XMLHttpRequest",
}
urls = []
for page_num in range(0, search_pages_max):
page_num = page_num * 20
url = (
'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={0}&format=json&keyword={1}&autoload=true&count=20&en_qc=1&cur_tab=2&from=video&pd=video&timestamp={2}'.format(
page_num, urllib.parse.quote(keyword.encode('utf-8')),
int(datetime.datetime.now().timestamp())))
query_dic = {
"aid": "24",
"app_name": "web_search",
"offset": str(page_num),
"format": "json",
"keyword": keyword.encode('utf-8'),
"autoload": "true",
"count": "20",
"en_qc": "1",
"cur_tab": "1",
"from": "search_tab",
"pd": "synthesis",
"timestamp": int(datetime.datetime.now().timestamp()*1e3),
# "_signature": "b.FccAAgEBCppuX9t-890W.wHWAADDV3aS3k4oUj4uf89BAU.AlKwpVJ5sJqJx5vRyWYy6hHnm9HqZSc0oNQfDNbq5oqJlQQvxS1Qb2pSamjLMSecFu67csOzqT88Nn13wX",
}
url = 'https://www.toutiao.com/api/search/content/?{0}'.format(urllib.parse.urlencode(query_dic))
urls.append(url)
toutiao_Lst = []
for search_page_url in urls:
get_page = requests.get(search_page_url, headers=headers_search)
get_page = retry_get_url(search_page_url, headers=headers_search)
if get_page.status_code != 200:
# retry once
get_page = requests.get(search_page_url)
@@ -379,12 +393,12 @@ class Crawler_toutiao():
try:
title = one_line['title']
duration = one_line['video_duration']
abstract = one_line['abstract']
url = one_line['article_url']
play_count = one_line['read_count']
comment_count = one_line['comment_count']
favorite_count = one_line['digg_count']
videoid = one_line['id']
article_id = one_line['id']
releaser = one_line['media_name']
uid = one_line['user_id']
releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
@@ -394,18 +408,24 @@ class Crawler_toutiao():
releaser_id = self.find_releaser_id(releaserUrl)
D0 = copy.deepcopy(self.video_data)
D0['title'] = title
D0['duration'] = duration
D0['abstract'] = abstract
D0['url'] = url
D0['play_count'] = play_count
D0['comment_count'] = comment_count
D0['favorite_count'] = favorite_count
D0['video_id'] = videoid
D0['article_id'] = article_id
D0['releaser'] = releaser
D0['releaserUrl'] = releaserUrl
D0['release_time'] = release_time
D0['releaser_id_str'] = "toutiao_%s" % releaser_id
D0['fetch_time'] = fetch_time
D0['search_word'] = keyword
D0["type"] = "article"
try:
article_info = self.get_web_article_info(article_id, proxies_num=proxies_num)
D0.update(article_info)
except Exception as e:
print("method get_web_article_info error %s" % e)
print(D0)
toutiao_Lst.append(D0)
except KeyError:
@@ -437,11 +457,11 @@ class Crawler_toutiao():
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None):
doc_type=None, proxies_num=1):
self.search_page_old(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
doc_type=doc_type, proxies_num=proxies_num)
def find_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
@@ -1768,15 +1788,15 @@ if __name__ == '__main__':
# test.releaser_page(data_lis[0])
# res = test.video_page("https://www.ixigua.com/i6701478014242259463/")
# print(res)
for u in data_lis:
# for u in data_lis:
# # # test.releaser_page(u)
# test.App_releaser_page_video(u, output_to_es_raw=True, es_index='crawler-data-raw',
# doc_type='doc',
# releaser_page_num_max=3, proxies_num=0)
test.get_releaser_page(u)
# test.get_releaser_page(u)
# test.App_releaser_page_all(u, output_to_es_raw=True, es_index='crawler-data-raw',
# doc_type='doc',
# releaser_page_num_max=3, proxies_num=1))
# test.releaser_page(u)
# test.search_page("北京国安")
test.search_page("热玛吉五代")
# -*- coding:UTF-8 -*-
# @Time : 2020/7/31 11:32
# @File : __init__.py
# @email : litao@igengmei.com
# @author : litao
@@ -4,106 +4,124 @@ Created on Tue Aug 14 20:13:21 2018
@author: fangyucheng
"""
import copy
import re
import rsa
# import rsa
import time
import json
import urllib
import base64
import binascii
import datetime
import requests
from bs4 import BeautifulSoup
from crawler.crawler_sys.framework.video_fields_std import Std_fields_video
from crawler.crawler_sys.utils.output_results import retry_get_url, output_result
from crawler.crawler_sys.utils.trans_str_play_count_to_int import trans_play_count
from crawler.crawler_sys.utils.trans_strtime_to_timestamp import weibo_parse_time, trans_strtime_to_timestamp
from crawler.crawler_sys.utils.trans_duration_str_to_second import trans_duration
from crawler.crawler_sys.utils.util_logging import logged
from crawler.crawler_sys.proxy_pool.func_get_proxy_form_kuaidaili import get_proxy
from crawler.crawler_sys.utils.html_to_str import dehtml
from write_data_into_es.func_get_releaser_id import *
from write_data_into_es.func_cal_doc_id import cal_doc_id
class Crawler_weibo():
def __init__(self, timeout=None, platform='weibo'):
self.platform = "weibo"
self.session = requests.Session()
self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2128.59 Safari/537.36'}
def manipulate_login(self, user_name, password):
# write the cookie into cookie_pool
cookie_pool = open('cookie_pool',
'a', encoding='utf-8')
# encode the user name
user_name_quote = urllib.parse.quote_plus(user_name)
user_name_base64 = base64.b64encode(user_name_quote.encode('utf-8'))
user_name_b64 = user_name_base64.decode('utf-8')
# get the four parameters: servertime, pubkey, rsakv, nonce
current_time = int(time.time() * 1000)
login_url_first_part = 'http://login.sina.com.cn/sso/prelogin.php?'
login_url_dic = {'entry': 'weibo',
'callback': 'sinaSSOController.preloginCallBack',
'su': user_name_b64,
'rsakt': 'mod',
'checkpin': '1',
'client': 'ssologin.js(v1.4.18)',
'_': current_time}
login_url_second_part = urllib.parse.urlencode(login_url_dic)
login_url = login_url_first_part + login_url_second_part
get_page = requests.get(login_url)
get_page.encoding = 'utf-8'
page = get_page.text
page_rep = page.replace('sinaSSOController.preloginCallBack', '')
page_dic = eval(page_rep)
pubkey = page_dic['pubkey']
servertime = page_dic['servertime']
rsakv = page_dic['rsakv']
nonce = page_dic['nonce']
std_fields = Std_fields_video()
self.video_data = std_fields.video_data
self.video_data['platform'] = self.platform
# remove fields that crawled data don't have
pop_key_Lst = ['channel', 'describe', 'isOriginal', "repost_count", "video_id"]
for popk in pop_key_Lst:
self.video_data.pop(popk)
# def manipulate_login(self, user_name, password):
# # write the cookie into cookie_pool
# cookie_pool = open('cookie_pool',
# 'a', encoding='utf-8')
#
# # encode the user name
# user_name_quote = urllib.parse.quote_plus(user_name)
# user_name_base64 = base64.b64encode(user_name_quote.encode('utf-8'))
# user_name_b64 = user_name_base64.decode('utf-8')
#
# # get the four parameters: servertime, pubkey, rsakv, nonce
# current_time = int(time.time() * 1000)
# login_url_first_part = 'http://login.sina.com.cn/sso/prelogin.php?'
# login_url_dic = {'entry': 'weibo',
# 'callback': 'sinaSSOController.preloginCallBack',
# 'su': user_name_b64,
# 'rsakt': 'mod',
# 'checkpin': '1',
# 'client': 'ssologin.js(v1.4.18)',
# '_': current_time}
# login_url_second_part = urllib.parse.urlencode(login_url_dic)
# login_url = login_url_first_part + login_url_second_part
# get_page = requests.get(login_url)
# get_page.encoding = 'utf-8'
# page = get_page.text
# page_rep = page.replace('sinaSSOController.preloginCallBack', '')
# page_dic = eval(page_rep)
# pubkey = page_dic['pubkey']
# servertime = page_dic['servertime']
# rsakv = page_dic['rsakv']
# nonce = page_dic['nonce']
# build the encrypted password
rsa_pubkey = int(pubkey, 16)
key = rsa.PublicKey(rsa_pubkey, 65537)
message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
message = message.encode("utf-8")
password_rsa = rsa.encrypt(message, key)
password_bi = binascii.b2a_hex(password_rsa)
# rsa_pubkey = int(pubkey, 16)
# key = rsa.PublicKey(rsa_pubkey, 65537)
# message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
# message = message.encode("utf-8")
# password_rsa = rsa.encrypt(message, key)
# password_bi = binascii.b2a_hex(password_rsa)
# log in via POST to obtain the cookie
post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
post_data_dic = {'encoding': 'UTF-8',
'entry': 'weibo',
'from': '',
'gateway': '1',
'nonce': nonce,
'pagerefer': "",
'prelt': 67,
'pwencode': 'rsa2',
"returntype": "META",
'rsakv': rsakv,
'savestate': '7',
'servertime': servertime,
'service': 'miniblog',
'sp': password_bi,
'sr': '1920*1080',
'su': user_name_b64,
'useticket': '1',
'vsnf': '1',
'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&display=0&'}
logining_page = self.session.post(post_url, data=post_data_dic, headers=self.headers)
login_loop = logining_page.content.decode("GBK")
if '正在登录' in login_loop or 'Signing in' in login_loop:
cookie = logining_page.cookies.get_dict()
print(cookie,type(cookie))
current_time = int(time.time() * 1000)
cookie_dic = {'cookie': cookie,
'current_time': current_time}
cookie_json = json.dump(cookie_dic,cookie_pool)
print('got cookie in login process')
return cookie
else:
print('post failed, suggest to login again')
# post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
# post_data_dic = {'encoding': 'UTF-8',
# 'entry': 'weibo',
# 'from': '',
# 'gateway': '1',
# 'nonce': nonce,
# 'pagerefer': "",
# 'prelt': 67,
# 'pwencode': 'rsa2',
# "returntype": "META",
# 'rsakv': rsakv,
# 'savestate': '7',
# 'servertime': servertime,
# 'service': 'miniblog',
# 'sp': password_bi,
# 'sr': '1920*1080',
# 'su': user_name_b64,
# 'useticket': '1',
# 'vsnf': '1',
# 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&display=0&'}
#
# logining_page = self.session.post(post_url, data=post_data_dic, headers=self.headers)
# login_loop = logining_page.content.decode("GBK")
#
# if '正在登录' in login_loop or 'Signing in' in login_loop:
# cookie = logining_page.cookies.get_dict()
# print(cookie,type(cookie))
# current_time = int(time.time() * 1000)
# cookie_dic = {'cookie': cookie,
# 'current_time': current_time}
# cookie_json = json.dump(cookie_dic,cookie_pool)
# print('got cookie in login process')
# return cookie
# else:
# print('post failed, suggest to login again')
def test_cookie(self, test_url=None,
@@ -381,49 +399,147 @@ class Crawler_weibo():
print("can't get longtext")
return ''
def search_page(self, keyword, user_name, password):
result_lst = []
openfile = open('D:/python_code/crawler/crawler_sys/site_crawler/crawler_weibo/error5',
'a', encoding='utf-8')
cookie = self.manipulate_login(user_name=user_name,
password=password)
# cookie = self.test_cookie(get_cookie)
if cookie is not None:
for page_num in range(1, 2):
search_url = 'http://s.weibo.com/weibo/' + keyword + '?b=1&page=' + str(page_num)
get_page = requests.get(search_url, headers=self.headers, cookies=cookie)
get_page.encoding = 'utf-8'
page = get_page.text
print(len(page))
time.sleep(10)
soup = BeautifulSoup(page, 'html.parser')
sfa = soup.find_all('script')
find_content = ''
for line in sfa:
if '"pid":"pl_weibo_direct"' in str(line):
find_content = str(line)
if find_content != '':
find_content1 = find_content.replace('<script>STK && STK.pageletM && STK.pageletM.view(', '')
find_content2 = find_content1.replace(')</script>', '')
find_content_dic = json.loads(find_content2)
content = find_content_dic['html']
content_soup = BeautifulSoup(content, 'html.parser')
weibo_lst = content_soup.find_all('div', {'class': 'WB_cardwrap'})
for line in weibo_lst:
def get_single_article_page(self,article_id,keyword,proxies=0):
headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
# "Cookie": "SINAGLOBAL=565010119549.1364.1559571258394; login_sid_t=85753e367d54782a25518436f329cfa0; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5712976583220.359.1595386386561; ULV=1595386386575:2:1:1:5712976583220.359.1595386386561:1592884354178; UOR=,,login.sina.com.cn; SSOLoginState=1595829153; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ46TE-isMWEvFjmXZnGFZ5JpX5KMhUgL.Fo2Re0zpShqfSoe2dJLoI7_e9gfadcvadcvad7tt; ALF=1627695088; SCF=AlrGNPCzM_VX3PzgxftYKkUv6Gj7FjmOVVbH8EpsTADeRxEeW-7_ipW8LVV7sGN-t7JJA-VwFKC2Ot0ZkHwHstE.; SUB=_2A25yJwQhDeRhGedG6FAQ9CjJzT-IHXVRVXLprDV8PUNbmtAKLRPmkW9NUVHbR2NjdmB2ZEtnFBK75m3CwwTzeqTJ; SUHB=08J6qQipU2qH8A; CARD-MAIN=cfec82595a1164dea323b2fb276c823f",
"Host": "card.weibo.com",
"Referer": "https://card.weibo.com/article/m/show/id/{0}?_wb_client_=1&open_source=weibo_search&luicode=10000011&lfid=100103type%3D21%26q%3D{1}%26t%3D0".format(article_id,urllib.parse.quote(keyword)),
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}
url = "https://card.weibo.com/article/m/aj/detail?id={0}&_t={1}".format(article_id,int(datetime.datetime.now().timestamp()*1e3))
try:
requests_res = retry_get_url(url,headers=headers,proxies=proxies)
res_json = requests_res.json()
# print(res_json)
data = res_json["data"]
video_dic = {}
video_dic["url"] = data["target_url"]
video_dic["title"] = data["title"]
video_dic["fetch_time"] = int(datetime.datetime.now().timestamp()*1e3)
video_dic["release_time"] = trans_strtime_to_timestamp(data["create_at"])
video_dic["play_count"] = int(data["read_count"])
video_dic["content"] = data["content"]
video_dic["releaser"] = data["userinfo"].get('screen_name')
video_dic["releaser_id"] = str(data["userinfo"].get('id'))
video_dic["releaserUrl"] = data["userinfo"].get('url')
video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
video_dic["img_list"] = re.findall('img src="(.*?)"',data["content"])
return video_dic
except Exception as e:
print("single data row formate error %s" % e)
def search_article_page(self, keyword, search_pages_max=12,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None, proxies_num=0):
headers_search = {
"Accept": "application/json, text/plain, */*",
"MWeibo-Pwa": "1",
"Referer": "https://m.weibo.cn/search?containerid=100103type=1&q={0}".format(urllib.parse.quote(keyword)),
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
"X-Requested-With": "XMLHttpRequest",
"X-XSRF-TOKEN": "65d497"
}
urls = []
for page_num in range(0, search_pages_max):
url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D21%26q%3D{0}%26t%3D0&page_type=searchall&page={1}'.format(urllib.parse.quote(keyword),page_num + 1)
urls.append(url)
weibo_Lst = []
for search_page_url in urls:
get_page = retry_get_url(search_page_url, headers=headers_search)
if get_page.status_code != 200:
# retry once
get_page = requests.get(search_page_url)
if get_page.status_code != 200:
continue
page_dict = get_page.json()
if page_dict['data'].get("cards")[0].get("card_group"):
for one_line in page_dict['data'].get("cards")[0].get("card_group"):
try:
title = one_line['title_sub']
# abstract = one_line['abstract']
# url = one_line['article_url']
# play_count = one_line['read_count']
# comment_count = one_line['comment_count']
# favorite_count = one_line['digg_count']
article_id = re.findall("(\d+)",one_line['scheme'])[0]
# releaser = one_line['media_name']
# uid = one_line['user_id']
# releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
# release_time = one_line['publish_time']
# release_time = int(int(release_time) * 1e3)
fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
# releaser_id = self.get_releaser_id(releaserUrl)
D0 = copy.deepcopy(self.video_data)
D0['title'] = title
# D0['abstract'] = abstract
# D0['url'] = url
# D0['play_count'] = play_count
# D0['comment_count'] = comment_count
# D0['favorite_count'] = favorite_count
D0['article_id'] = article_id
# D0['releaser'] = releaser
# D0['releaserUrl'] = releaserUrl
# D0['release_time'] = release_time
# D0['releaser_id_str'] = "toutiao_%s" % releaser_id
D0['fetch_time'] = fetch_time
D0['search_word'] = keyword
D0["type"] = "article"
try:
weibo_info = self.get_weibo_info_from_search_page(line, cookie)
print('get_one_weibo_info')
result_lst.append(weibo_info)
except:
openfile.write(str(line))
openfile.write('\n')
openfile.flush()
print('error')
return result_lst
else:
print('no valid cookie')
return ''
openfile.close()
article_info = self.get_single_article_page(article_id,keyword, proxies=proxies_num)
D0.update(article_info)
except Exception as e:
print("method get_web_article_info error %s" % e)
print(D0)
weibo_Lst.append(D0)
except KeyError:
# It's totally ok to drop the last return data value.
# The search api just return something seems related to search
continue
else:
break
if len(weibo_Lst) >= 100:
output_result(result_Lst=weibo_Lst,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
weibo_Lst.clear()
if weibo_Lst != []:
output_result(result_Lst=weibo_Lst,
platform=self.platform,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
return weibo_Lst
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
def search_page(self, keyword, search_pages_max=30,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None, proxies_num=0):
self.search_article_page(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type, proxies_num=proxies_num)
def repost_page(self, weibo_id, user_name, password):
total_page = 0
@@ -557,16 +673,182 @@ class Crawler_weibo():
print("can't get repost data")
return None
@staticmethod
def get_single_page(mid):
url = "https://m.weibo.cn/status/%s" % mid
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
# "cookie": "_T_WM=68345544646; WEIBOCN_FROM=1110006030; MLOGIN=0; XSRF-TOKEN=fd1a69; M_WEIBOCN_PARAMS=oid%3D4523948446845543%26luicode%3D20000061%26lfid%3D4528703037509890%26uicode%3D20000061%26fid%3D4523948446845543",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "same-origin",
"sec-fetch-site": "same-origin",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
}
page_res = retry_get_url(url, headers=headers, proxies=0)
page_json_context = re.findall(r"render_data = (.*)\[0\]", page_res.text, flags=re.DOTALL)[0]
page_json = json.loads(page_json_context)
text = dehtml(page_json[0]["status"]["text"])
repost_count = trans_play_count(page_json[0]["status"]["reposts_count"])
comment_count = trans_play_count(page_json[0]["status"]["comments_count"])
favorite_count = trans_play_count(page_json[0]["status"]["attitudes_count"])
return text, repost_count, comment_count, favorite_count
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
@staticmethod
def get_img(data):
img_list = []
if data.get("pics"):
for one in data.get("pics"):
try:
img_list.append(one["large"]["url"])
except Exception as e:
img_list.append(one["url"])
print("add img error %s" % e)
return img_list
def releaser_page(self, releaserUrl,
output_to_file=False, filepath=None,
output_to_es_raw=False,
output_to_es_register=False,
push_to_redis=False,
releaser_page_num_max=10000,
es_index=None,
doc_type=None, proxies_num=None):
print('Processing releaserUrl %s' % releaserUrl)
result_Lst = []
releaser_id = self.get_releaser_id(releaserUrl)
# xsrf_token,url_extr = self.get_weibo_info(releaser_id)
headers = {
"accept": "application/json, text/plain, */*",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
# "cookie": "_T_WM=30976479190; XSRF-TOKEN=9e4bb8; WEIBOCN_FROM=1110006030; MLOGIN=0; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E8%25BF%25AA%25E4%25B8%25BD%25E7%2583%25AD%25E5%25B7%25B4%26fid%3D1076031669879400%26uicode%3D10000011",
"mweibo-pwa": "1",
# "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4",
# "referer": "https://m.weibo.cn/u/1669879400?uid=1669879400&t=0",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"x-requested-with": "XMLHttpRequest",
# "x-xsrf-token": xsrf_token,
}
pagenum = 0
has_more = True
since_id = 0
if releaser_id:
while pagenum <= releaser_page_num_max and has_more:
pagenum += 1
time.sleep(0.5)
"?uid=1669879400&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%BF%AA%E4%B8%BD%E7%83%AD%E5%B7%B4&type=uid&value=1669879400&containerid=1076031669879400&since_id=451822205602429"
url = "https://m.weibo.cn/api/container/getIndex?uid={0}&t=0&type=uid&value={1}&containerid=107603{2}&since_id={3}".format(
releaser_id, releaser_id, releaser_id, since_id)
headers["referer"] = "https://m.weibo.cn/u/uid={0}&t=0".format(releaser_id)
print('Page number: %d' % pagenum)
try:
if proxies_num:
get_page = retry_get_url(url, headers=headers, timeout=self.timeout, proxies=proxies_num)
else:
get_page = retry_get_url(url, headers=headers, timeout=self.timeout)
except:
get_page = None
has_more = False
if get_page and get_page.status_code == 200:
try:
page_json = get_page.json()
total = page_json["data"]["cardlistInfo"]["total"]
if pagenum > total:
break
since_id = page_json["data"]["cardlistInfo"]["since_id"]
page_dic = page_json["data"].get("cards")
except Exception as e:
print("load data error %s" % e)
continue
if page_dic:
for one in page_dic:
try:
mblog = one.get("mblog")
mid = mblog.get("mid")
forward_text = ""
forward_user = ""
if one.get("source") == "绿洲":
text_type = "绿洲"
elif mblog.get("retweeted_status"):
text_type = "转发"
forward_text = mblog.get("retweeted_status").get("raw_text")
forward_user = mblog.get("retweeted_status").get("user").get("screen_name")
else:
text_type = one.get("source")
if mblog.get("isLongText"):
text, repost_count, comment_count, favorite_count = self.get_single_page(mid)
else:
text = mblog["raw_text"]
res_dic = {
"release_time": trans_strtime_to_timestamp(mblog["created_at"]),
"fetch_time": int(datetime.datetime.now().timestamp() * 1e3),
"url": one["scheme"],
"releaser": mblog["user"]["screen_name"],
"repost_count": trans_play_count(mblog["reposts_count"]),
"comment_count": trans_play_count(mblog["comments_count"]),
"favorite_count": trans_play_count(mblog["attitudes_count"]),
"title": text.replace("\u200b", ""),
"wb_type": text_type,
"forward_user": forward_user,
"forward_text": forward_text,
"mid": mid,
"releaserUrl": "https://www.weibo.com/u/%s" % releaser_id,
"releaser_id_str": "weibo_%s" % releaser_id,
"img_list": self.get_img(mblog),
"platform": "weibo",
# "doc_id":doc_id
}
res_dic["doc_id"] = cal_doc_id(platform="weibo", url=one["scheme"], data_dict=res_dic,
doc_id_type="all-time-url")
yield res_dic
except Exception as e:
print(json.dumps(mblog))
print("row formate error %s" % e)
continue
def get_releaser_follower_num(self, releaserUrl):
pass
def releaser_page_by_time(self, start_time, end_time, url, allow, **kwargs):
count_false = 0
for res in self.releaser_page(url, proxies_num=kwargs.get("proxies_num")):
video_time = res["release_time"]
# print(res)
if video_time:
if start_time < video_time:
if video_time < end_time:
yield res
else:
count_false += 1
if count_false > allow:
break
else:
yield res
if __name__ == '__main__':
weibo = Crawler_weibo()
user_name = '7255925880'
password = 'Lemo1995'
# user_name = '7255925880'
# password = 'Lemo1995'
# keyword = '罗奕佳'
# user_id = 'jianchuan'
# weibo_id = '4273575663592672'
user_id = '1788283193'
# user_id = '1788283193'
# test_search2 = weibo.search_page(keyword, user_name, password)
# test_repost = weibo.repost_page(weibo_id, user_name, password)
user_page = weibo.user_page(user_id, user_name, password)
print(user_page)
# user_page = weibo.user_page(user_id, user_name, password)
weibo.search_page("迪丽热巴")
# print(user_page)
@@ -81,7 +81,7 @@ class Crawler_douban():
comment_count = trans_play_count(page_json["comments_count"])
favorite_count = trans_play_count(page_json["like_count"])
collection_count = trans_play_count(page_json["collections_count"])
img_list = re.findall('img src=".*?"',content)
img_list = re.findall('img src="(.*?)"',content)
dic = {
"content":content,
"repost_count":repost_count,
# coding=utf-8
import pymysql
from elasticsearch import Elasticsearch
import smtplib
import xlwt
import logging
import traceback
import datetime
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.utils import formataddr
es = Elasticsearch([
{
'host': '172.16.31.17',
'port': 9200,
}, {
'host': '172.16.31.11',
'port': 9200,
}])
def send_email_tome():
try:
date = datetime.datetime.now().date() - datetime.timedelta(days=1)
fromaddr = 'litao@igengmei.com'
password = 'hTx9kAikArsSNsDr'
# toaddrs = "lixiaofang@igengmei.com"
# toaddrs1 = "duanyingrong@igengmei.com"
# toaddrs2 = "dengguangyu@igengmei.com"
# toaddrs3 = "wangxin@igengmei.com"
# toaddrs4 ="hezijun@igengmei.com"
# toaddrs5 = "malinxi@igengmei.com"
toaddrs6 = "litao@igengmei.com"
content = 'hi all:附件为' + str(date) + '的搜索词数据统计结果以及近一周的数据统计结果,请查收!'
textApart = MIMEText(content)
zipFile = str(date)+".xls"
#zipFile = '昨日数据统计结果.xls'
zipApart = MIMEApplication(open(zipFile, 'rb').read())
zipApart.add_header('Content-Disposition', 'attachment', filename=zipFile)
zipFile_week = '近一周数据统计结果.xls'
zipApart_week = MIMEApplication(open(zipFile_week, 'rb').read())
zipApart_week.add_header('Content-Disposition', 'attachment', filename=zipFile_week)
m = MIMEMultipart()
m.attach(textApart)
m.attach(zipApart_week)
m.attach(zipApart)
m['From'] = formataddr(["黎涛", toaddrs6])
# m["To"] = formataddr(["李小芳", toaddrs])
# m["To"] = formataddr(["段英荣", toaddrs1])
# m["To"] = formataddr(["邓光宇", toaddrs2])
# m["To"] = formataddr(["王昕", toaddrs3])
# m["To"] = formataddr(["赫梓君", toaddrs4])
m["To"] = formataddr(["黎涛", toaddrs6])
m['Subject'] = '每日搜索词结果统计'
try:
server = smtplib.SMTP_SSL('smtp.exmail.qq.com', 465)
server.login(fromaddr, password)
server.sendmail(fromaddr, [toaddrs6], m.as_string())
print('success')
server.quit()
except smtplib.SMTPException as e:
print('error', e)
except Exception:
logging.error("catch exception,main:%s" % traceback.format_exc())
def get_es_word(word):
###answer
results = es.search(
index='gm-dbmw-answer-read',
doc_type='answer',
timeout='10s',
size=0,
body={
"query": {
"bool": {
"minimum_should_match": 1,
"should": [{"match_phrase": {"title": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"desc": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}}],
"must": [{"term": {"is_online": True}}]
}
},
}
)
answer_content_num = results["hits"]["total"]
# tractate
results = es.search(
index='gm-dbmw-tractate-read',
doc_type='tractate',
timeout='10s',
size=0,
body={
"query": {
"bool": {
"minimum_should_match": 1,
"should": [{"match_phrase": {"content": {"query": word, "analyzer": "gm_default_index"}}}, {
"match_phrase": {"tractate_tag_name": {"query": word, "analyzer": "gm_default_index"}}}, {
"match_phrase": {"tractate_tag_name_content": {"query": word,
"analyzer": "gm_default_index"}}}],
"must": [{"term": {"is_online": True}}]
}
},
}
)
tractate_content_num = results["hits"]["total"]
###diary
results = es.search(
index='gm-dbmw-diary-read',
doc_type='diary',
timeout='10s',
size=0,
body={
"query": {
"bool": {
"minimum_should_match": 1,
"should": [{"match_phrase": {"tags": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"answer": {"query": word, "analyzer": "gm_default_index"}}},
{"match_phrase": {"service.name": {"query": word, "analyzer": "gm_default_index"}}}],
"must": [{"term": {"is_online": True}}, {"range": {"content_level": {"gte": "3"}}}]
}
},
}
)
diary_content_num = results["hits"]["total"]
return answer_content_num, tractate_content_num, diary_content_num
class WritrExcel():
def set_style(self, name, height, bold=False):
style = xlwt.XFStyle()  # initialize the style
font = xlwt.Font()  # create a font for the style
font.name = name
font.bold = bold
font.color_index = 4
font.height = height
style.font = font
return style
# write rows into an Excel file
def write_excel(self, path, rows):
# create the workbook
workbook = xlwt.Workbook(encoding='utf-8')
# create the sheet
data_sheet = workbook.add_sheet('Sheet1')
# define the style outside the loop
default = self.set_style('Times New Roman', 220, True)
j = k = 0
# iterate over the rows and write each one into the sheet
for row in rows[:65530]:
for i in range(len(row)):
try:
# write the cell
data_sheet.write((j + k), i, row[i], default)
except:
print(i)
raise
# data_sheet.write(1, i, row1[i], self.set_style('Times New Roman', 220, True))
k = k + 1
workbook.save(path)
print("写入文件成功,共" + str(k) + "行数据")
if __name__ == "__main__":
tag_names_list = []
tag_names_list_week = []
all_data_day = []
all_data_week = []
db_zhengxing_eagle = pymysql.connect(host="172.16.30.136", port=3306, user="doris",
password="o5gbA27hXHHm",
db="doris_prod",
charset='utf8',
cursorclass=pymysql.cursors.DictCursor)
zhengxing_cursor = db_zhengxing_eagle.cursor()
date = datetime.datetime.now().date() - datetime.timedelta(days=1)
sql = 'select keywords,sum(sorted) as nums,uv from api_search_words where is_delete = 0 and create_time = "' + str(
date) + '" group by keywords order by nums desc'
print(sql)
zhengxing_cursor.execute("set names 'UTF8'")
zhengxing_cursor.execute(sql)
data = zhengxing_cursor.fetchall()
tup_title = ("关键词", "搜索次数","uv", "日记数量", "回答数量", "帖子数量")
for name in list(data):
word = name.get("keywords", None)
num = name.get("nums", 0)
uv = name.get("uv",0)
answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
tag_names_list.append([word, num,uv, diary_content_num, answer_content_num, tractate_content_num])
all_data_day.append(tup_title)
for item in tag_names_list:
all_data_day.append(tuple(item))
path = str(date)+".xls"
WritrExcel().write_excel(path, tuple(all_data_day))
print(u'创建demo.xls文件成功')
date = datetime.datetime.now().date() - datetime.timedelta(days=7)
sql = 'select keywords,sum(sorted) as nums,sum(uv) as uvs from api_search_words where is_delete = 0 and create_time >= "' + str(
date) + '" group by keywords order by nums desc'
print(sql)
zhengxing_cursor.execute("set names 'UTF8'")
zhengxing_cursor.execute(sql)
data = zhengxing_cursor.fetchall()
tup_title = ("关键词", "搜索次数", "uv","日记数量", "回答数量", "帖子数量")
for name in list(data):
word = name.get("keywords", None)
sorteds = name.get("nums", 0)
uv = name.get("uvs",0)
answer_content_num, tractate_content_num, diary_content_num = get_es_word(word)
tag_names_list_week.append([word, sorteds,uv, diary_content_num, answer_content_num, tractate_content_num])
all_data_week.append(tup_title)
for item in tag_names_list_week:
all_data_week.append(tuple(item))
path = "近一周数据统计结果.xls"
WritrExcel().write_excel(path, tuple(all_data_week))
print(u'创建demo.xls文件成功')
send_email_tome()
@@ -174,7 +174,7 @@ def bulk_write_into_es(dict_Lst,
)
except TransportError:
print("output to es register error")
write_str_into_file(file_path='/home/fangyucheng/',
write_str_into_file(file_path='/home/',
file_name='debug',
var=bulk_write_body)
return retry_counter_for_UnicodeEncodeError