Commit 81cd3b51 authored by litaolemo

update

parent 233c49d3
# crawler
## Releaser page crawler
1. Deployed on BJ-GM-Prod-Cos-faiss001 under /srv/apps/, scheduled via crontab -e (see the sketch below)
2. Switch to the service account: sudo su - gmuser
3. Load conda: source /root/anaconda3/bin/activate
4. Enter / leave the virtual environment: conda activate crawler_env / conda deactivate
5. Fetch worker: nohup python /srv/apps/crawler/crawler_sys/framework/update_data_in_target_releasers_multi_process_by_date_from_redis.py > /data/log/fect_task.log &
6. Write crawl URLs to Redis: python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2
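
For reference, a minimal crontab sketch for step 6 (the hourly schedule, log path, and conda python path are assumptions; the command itself is the one listed above):

```
# crontab -e as gmuser: push weibo releaser URLs into Redis once an hour
0 * * * * /root/anaconda3/envs/crawler_env/bin/python /srv/apps/crawler/crawler_sys/framework/write_releasers_to_redis.py -p weibo -d 1 -proxies 2 >> /data/log/write_releasers.log 2>&1
```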
## Search page crawler
pass
......
# -*- coding:utf-8 -*-
# @Time : 2019/11/1 15:55
# @Author : litao
import requests
import datetime
from elasticsearch.helpers import scan
from elasticsearch import Elasticsearch
import redis,time
import logging
import oss2
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
# import torch.multiprocessing
# torch.multiprocessing.set_sharing_strategy('file_system')
try:
    from write_data_into_es.func_cal_doc_id import *
except ImportError:
    from write_data_into_es_new.func_cal_doc_id import *
hosts = '192.168.17.11'
port = 80
user = 'litao'
passwd = 'lQSmSEnGZZxl'
http_auth = (user, passwd)
es = Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)
pool = redis.ConnectionPool(host='192.168.17.60', port=6379, db=8)
rds = redis.Redis(connection_pool=pool)
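# Scan the ronghe_recommend_staging / ronghe_recommend indices for items released in
# the last 7 days that carry a video_img, and upload each cover image to OSS under
# media/data/video-title-images/.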
def get_short_video_image_hot(AccessKeyId, AccessKeySecret, endpoint, region):
platform_list = ["抖音","toutiao","new_tudou","kwai","腾讯视频","腾讯新闻","haokan","网易新闻"]
for platform in platform_list:
search_body = {
"query": {
"bool": {
"filter": [
{
"term": {
"platform.keyword": platform
}
},
{"range": {"release_time": {"gte": int((datetime.datetime.now() + datetime.timedelta(days=-7)).timestamp()*1e3)}}},
#{"range": {"play_count": {"gte": 1000}}}
], "must": [{"exists": {"field": "video_img"}}]
}
}}
scan_res = scan(es,query=search_body,index="ronghe_recommend_staging,ronghe_recommend",raise_on_error=False,scroll='50m',
request_timeout=300)
# scan_res = scan(es, query=search_body,
# index="short-video-all-time-url",
# doc_type="all-time-url",
# scroll='50m',
# raise_on_error=False,
# request_timeout=300)
for res in scan_res:
_id = cal_doc_id(platform=res["_source"]["platform"], url=res["_source"]["url"], doc_id_type="all-time-url",
data_dict=res["_source"])
image_url = res["_source"].get("video_img")
if not image_url:
continue
date_str = datetime.datetime.fromtimestamp(res["_source"]["release_time"] / 1e3).strftime("%Y/%m/%d")
path_name = "media/data/video-title-images/%s/%s/%s/%s.jpg" % (
res["_source"]["platform"], res["_source"]["releaser_id_str"], date_str, _id)
put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name, image_url, _id)
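# Same scan as get_short_video_image_hot, but against the short-video-production
# index (doc_type daily-url) instead of the ronghe_recommend indices.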
def get_short_video_image(AccessKeyId, AccessKeySecret, endpoint, region):
platform_list = ["抖音","toutiao","new_tudou","kwai","腾讯视频","腾讯新闻","haokan","网易新闻"]
for platform in platform_list:
search_body = {
"query": {
"bool": {
"filter": [
{
"term": {
"platform.keyword": platform
}
},
{"range": {"release_time": {"gte": int((datetime.datetime.now() + datetime.timedelta(days=-7)).timestamp()*1e3)}}},
#{"range": {"play_count": {"gte": 1000}}}
], "must": [{"exists": {"field": "video_img"}}]
}
}}
scan_res = scan(es,query=search_body,index="short-video-production", doc_type="daily-url", raise_on_error=False,scroll='50m',
request_timeout=300)
# scan_res = scan(es, query=search_body,
# index="short-video-all-time-url",
# doc_type="all-time-url",
# scroll='50m',
# raise_on_error=False,
# request_timeout=300)
for res in scan_res:
_id = cal_doc_id(platform=res["_source"]["platform"], url=res["_source"]["url"], doc_id_type="all-time-url",
data_dict=res["_source"])
image_url = res["_source"].get("video_img")
if not image_url:
continue
date_str = datetime.datetime.fromtimestamp(res["_source"]["release_time"] / 1e3).strftime("%Y/%m/%d")
path_name = "media/data/video-title-images/%s/%s/%s/%s.jpg" % (
res["_source"]["platform"], res["_source"]["releaser_id_str"], date_str, _id)
put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name, image_url, _id)
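# Scan ronghe_weibo_monthly for posts from the last 7 days that have a wb_pic,
# take the first picture URL, and upload it to OSS under media/data/weibo-title-images/.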
def get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region):
search_body = {
"query": {
"bool": {
"filter": [
{"range": {"release_time": {"gte": int(
(datetime.datetime.now() + datetime.timedelta(days=-7)).timestamp() * 1e3)}}}
]
, "must": [{"exists": {"field": "wb_pic"}}]
}
}}
scan_res = scan(es, query=search_body, index="ronghe_weibo_monthly", doc_type="doc", raise_on_error=False,
scroll='50m',
request_timeout=300)
for res in scan_res:
_id = res["_source"]["wb_bowen_id"]
image_url = res["_source"]["wb_pic"]
if "," in image_url:
image_url = image_url.split(",")[0]
date_str = datetime.datetime.fromtimestamp(res["_source"]["release_time"] / 1e3).strftime("%Y/%m/%d")
path_name = "media/data/weibo-title-images/%s/%s/%s/%s.jpg" % (
res["_source"]["platform"], res["_source"]["UID"], date_str, _id)
put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name, image_url, _id)
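# Scan releaser_fans for records fetched in the last 7 days and upload each releaser's
# avatar (wb_touxiang_url for weibo, releaser_img otherwise) to media/data/releasers-avatar/.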
def get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region):
search_body = {
"query": {
"bool": {
"filter": [
{"range": {"fetch_time": {"gte": int(
(datetime.datetime.now() + datetime.timedelta(
days=-7)).timestamp() * 1e3)}}}
]
}}
}
scan_res = scan(es, query=search_body, index="releaser_fans", doc_type="doc", raise_on_error=False,
scroll='50m',
request_timeout=300)
for res in scan_res:
platform = res["_source"]["platform"]
try:
if res["_source"]["platform"] == "weibo":
image_url = res["_source"]["wb_touxiang_url"]
_id = res["_source"]["UID"]
else:
image_url = res["_source"]["releaser_img"]
_id = res["_source"]["releaser_id_str"]
        except KeyError:
            # skip records that are missing the avatar url or id fields
            continue
path_name = "media/data/releasers-avatar/%s/%s.jpg" % (
platform, _id)
put_obj(AccessKeyId, AccessKeySecret, endpoint, region, path_name, image_url, _id)
# access_key_id = settings.access_key_id
# access_key_secret = settings.access_key_secret
# bucket_name = settings.bucket_name
# endpoint = settings.endpoint
# sts_role_arn = settings.sts_role_arn
# region = settings.live_record_region
# url_expries = settings.oss_url_expries
# The AliYun primary-account AccessKey grants access to every API and is high risk.
# Aliyun strongly recommends creating and using a RAM sub-account for API access and
# daily operations: https://ram.console.aliyun.com
# auth = oss2.Auth(access_key_id, access_key_secret)
# The endpoint below is only an example; fill in the actual region.
# bucket = oss2.Bucket(auth, 'http://oss-cn-shanghai.aliyuncs.com', bucket_name)
# Example: sign a URL that stays valid for 3600 seconds.
# print('= ' * 20)
# print(bucket.sign_url('GET', '***.txt', 3600))
log_file_path = "oss2.log"
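# Return a temporary signed GET URL for an object in the bucket (default expiry 3600 s).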
def get_obj_url(access_key_id, access_key_secret, region, name, bucket_name, expires=3600):
oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR)
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, region, bucket_name)
# object_meta = bucket.get_object_meta('object')
return bucket.sign_url('GET', name, int(expires))
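# Download the image at fileId and upload it to OSS as object_name.
# A Redis SETNX key with a 10-day expiry de-duplicates uploads across workers;
# on failure the key is deleted so the image can be retried later.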
def put_obj(access_key_id, access_key_secret, region, bucket_name, object_name, fileId, _id):
res = rds.set(_id, 1, ex=864000, nx=True)
try:
if res:
reqObj = requests.get(fileId, timeout=3)
fileobj_content = reqObj.content
reqObj.close()
# oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR)
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, region, bucket_name)
bucket.put_object(object_name, fileobj_content)
print("get %s img down"% _id)
else:
print("Already get image %s" % _id)
except:
rds.delete(_id)
# object_meta = bucket.get_object_meta('object')
def put_obj_from_file(access_key_id, access_key_secret, region, bucket_name, local_file, target,
mime_type='image/jpeg'):
oss2.set_file_logger(log_file_path, 'oss2', logging.ERROR)
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, region, bucket_name)
bucket.put_object_from_file(target, local_file)
# object_meta = bucket.get_object_meta('object')
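# Entry point: load the OSS AccessKey from the crawler_sys config module, then run
# each image-sync job (hot short videos, short videos, avatars, weibo pictures) in
# its own process pool, one after another.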
if __name__ == '__main__':
from crawler.crawler_sys.framework.config.oss_keyword import AccessKeySecret,AccessKeyId
endpoint = "oss-cn-beijing.aliyuncs.com"
# region = "v-plus-scope.oss-cn-beijing.aliyuncs.com"
region = "v-plus-scope"
oss_url_expries = 3600
access_key_id = AccessKeyId
access_key_secret = AccessKeySecret
bucket_name = "v-plus-scope"
auth = oss2.Auth(access_key_id, access_key_secret)
bucket = oss2.Bucket(auth, endpoint, bucket_name)
    # Upload a string: the object name is motto.txt, the content is a quote.
    # bucket.put_object(u'motto.txt', u'Never give up. - Jack Ma')
    # Download an object to a local file
    # bucket.get_object_to_file(u'motto.txt', u'localfile.txt')
    # Delete the object named motto.txt
    # bucket.delete_object(u'motto.txt')
    # # Remove the local file
    # os.remove(u'localfile.txt')
from multiprocessing import Process
# p = Process(target=get_image,args=(AccessKeyId, AccessKeySecret, endpoint, region,))
# p.start()
# p.join()
# print('主', p)
# print('主线程/主进程')
# print('* ' * 20)
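    # Each job is submitted 5 times to a 6-process pool; the Redis de-dup key set in
    # put_obj keeps the parallel workers from uploading the same image twice.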
res_list = []
executor = ProcessPoolExecutor(6)
for task in range(5):
# get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region)
res = executor.submit(get_short_video_image_hot,AccessKeyId, AccessKeySecret, endpoint, region)
res_list.append(res)
executor.shutdown(True)
    res_list = []
    executor = ProcessPoolExecutor(6)
for task in range(5):
# get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region)
res = executor.submit(get_short_video_image,AccessKeyId, AccessKeySecret, endpoint, region)
res_list.append(res)
executor.shutdown(True)
res_list = []
executor = ProcessPoolExecutor(6)
for task in range(5):
# get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region)
res = executor.submit(get_avatar_image,AccessKeyId, AccessKeySecret, endpoint, region)
res_list.append(res)
executor.shutdown(True)
res_list = []
executor = ProcessPoolExecutor(6)
for task in range(5):
# get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region)
res = executor.submit(get_weibo_image, AccessKeyId, AccessKeySecret, endpoint, region)
res_list.append(res)
executor.shutdown(True)
# get_weibo_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_avatar_image(AccessKeyId, AccessKeySecret, endpoint, region)
# get_short_video_image(AccessKeyId, AccessKeySecret, endpoint, region)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 10 17:07:09 2018
@author: fangyucheng
"""
import sys
import argparse
import configparser
from multiprocessing import Pool
from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from crawler.crawler_sys.framework.platform_crawler_register import platform_crawler_reg
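# Key-customer releaser-page crawler: for each requested platform, read the releaser
# URLs from the .ini config file and crawl every releaser page in a 20-process pool,
# writing results to the crawler-data-raw ES index.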
parser = argparse.ArgumentParser(description='a special crawler framework for key customer')
parser.add_argument('-p', '--platform', default=[], action='append',
help=('legal platform name is required'))
parser.add_argument('-c', '--conf', default='/home/hanye/crawlersNew/crawler/crawler_sys/framework/config/high_fre.ini',
help=('absolute path of config file'))
parser.add_argument('-num', '--page_num', default=20, type=int,
help=('the number of scrolling page'))
args = parser.parse_args()
if args.platform != []:
platform_list = args.platform
for platform in platform_list:
if platform not in platform_crawler_reg:
print("%s is not a legal platform name" % platform)
sys.exit(0)
config_file_path = args.conf
config = configparser.ConfigParser()
config.sections()
config.read(config_file_path)
releaser_page_num_max = args.page_num
ARGS_DICT = {"releaser_page_num_max": releaser_page_num_max,
"output_to_es_raw": True,
"output_es_index": "crawler-data-raw",
"output_doc_type": "doc",
"output_to_es_register": True}
for platform in platform_list:
crawler_initialization = get_crawler(platform)
crawler = crawler_initialization().releaser_page
get_task_list = config[platform]
TASK_LIST = []
for key, value in get_task_list.items():
TASK_LIST.append(value)
pool = Pool(processes=20)
for releaserUrl in TASK_LIST:
pool.apply_async(func=crawler, args=(releaserUrl,), kwds=ARGS_DICT)
pool.close()
pool.join()
print('Multiprocessing done for platform %s' % platform)
......@@ -18,24 +18,28 @@ from crawler.crawler_sys.site_crawler import (crawler_toutiao,
crawler_mango,
crawler_wangyi_news,
crawler_kwai,
crawler_douyin
crawler_douyin,
crawler_zhihu,
)
from crawler.crawler_sys.site_crawler.crawler_weibo.crawler_weibo import Crawler_weibo
platform_crawler_reg = {
'toutiao': crawler_toutiao.Crawler_toutiao,
'腾讯视频': crawler_v_qq.Crawler_v_qq,
'iqiyi': crawler_iqiyi.Crawler_iqiyi,
'youku': crawler_youku.Crawler_youku,
'new_tudou': crawler_tudou.Crawler_tudou,
'haokan': crawler_haokan.Crawler_haokan,
'腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
'miaopai': crawler_miaopai.Crawler_miaopai,
'pearvideo': crawler_pear.Crawler_pear,
'bilibili': crawler_bilibili.Crawler_bilibili,
'Mango': crawler_mango,
"网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
"kwai": crawler_kwai.Crawler_kwai,
'抖音': crawler_douyin.Crawler_douyin,
# '腾讯视频': crawler_v_qq.Crawler_v_qq,
# 'iqiyi': crawler_iqiyi.Crawler_iqiyi,
# 'youku': crawler_youku.Crawler_youku,
# 'new_tudou': crawler_tudou.Crawler_tudou,
# 'haokan': crawler_haokan.Crawler_haokan,
# '腾讯新闻': crawler_tencent_news.Crawler_Tencent_News,
# 'miaopai': crawler_miaopai.Crawler_miaopai,
# 'pearvideo': crawler_pear.Crawler_pear,
# 'bilibili': crawler_bilibili.Crawler_bilibili,
# 'Mango': crawler_mango,
# "网易新闻": crawler_wangyi_news.Crawler_wangyi_news,
# "kwai": crawler_kwai.Crawler_kwai,
# '抖音': crawler_douyin.Crawler_douyin,
"zhihu":crawler_zhihu.Crawler_zhihu,
"weibo":Crawler_weibo
}
......
......@@ -8,7 +8,7 @@ import redis, json
from crawler_sys.framework.platform_redis_register import get_redis_list_name
from crawler_sys.framework.es_crawler import scan_crawler_url_register
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
def feed_url_into_redis(dict_Lst, expire=0,
......
......@@ -10,17 +10,14 @@ from crawler.crawler_sys.framework.platform_crawler_register import get_crawler
from multiprocessing import Pool
PARSER = argparse.ArgumentParser(description='video platform search page crawler')
# PARSER.add_argument('-c', '--conf', default=('/home/hanye/crawlersNew/crawler'
# '/crawler_sys/framework/config'
# '/search_keywords.ini'),
# help=('config file absolute path'))
PARSER.add_argument('-p', '--platform', default=["toutiao", "腾讯新闻", "腾讯视频", "new_tudou"], action='append',
PARSER.add_argument('-p', '--platform', default=["zhihu","weibo", "toutiao"], action='append',
help=('legal platform name is required'))
PARSER.add_argument('-k', '--key_word_platform', default=[], action='append',
help=('key_word_legal platform name is required'))
PARSER.add_argument('-w', '--output_to_es_raw', default=True,
PARSER.add_argument('-w', '--output_to_es_raw', default=False,
help=('output to es raw'))
PARSER.add_argument('-g', '--output_to_es_register', default=False,
PARSER.add_argument('-g', '--output_to_es_register', default=True,
help=('output to es register'))
PARSER.add_argument('-n', '--maxpage', default=20,
help=('maxpage'))
......@@ -38,14 +35,8 @@ es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
if ARGS.platform != []:
PLATFORM_LIST = ARGS.platform
# for platform in PLATFORM_LIST:
# if platform not in legal_platform_name:
# print("%s is not a legal platform name, "
# "program will exit" % platform)
# sys.exit(0)
# CONFIG = configparser.ConfigParser()
# CONFIG.read(ARGS.conf, encoding='utf-8')
OUTPUT_TO_ES_RAW = ARGS.output_to_es_raw
OUTPUT_TO_ES_REGISTER = ARGS.output_to_es_register
......@@ -96,6 +87,8 @@ def func_search_keywordlist(platform):
"m22王者之冠",
"bbl",
"胶原蛋白填充祛黑眼圈",
"热玛吉",
"热玛吉5代",
]
for l in res_list:
res_dic[l] = 10
......@@ -132,7 +125,6 @@ def func_search_keywordlist(platform):
if OUTPUT_TO_ES_RAW is True:
ES_INDEX = 'crawler-data-raw'
# ES_INDEX = 'test2'
DOC_TYPE = 'doc'
print(ES_INDEX, DOC_TYPE)
pages = ARGS.maxpage
......@@ -140,8 +132,7 @@ pages = ARGS.maxpage
def search_page_task(platform, output_to_es_raw,
output_to_es_register,
es_index,
doc_type):
es_index):
search_pages = []
initialize_crawler = get_crawler(platform)
crawler = initialize_crawler()
......@@ -154,33 +145,30 @@ def search_page_task(platform, output_to_es_raw,
search_pages_max=search_pages,
output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type)
es_index=es_index)
except Exception as e:
print(e)
continue
ES_INDEX = "crawler-data-raw"
result = []
kwargs_dict = {
'output_to_es_raw': OUTPUT_TO_ES_RAW,
'output_to_es_register': OUTPUT_TO_ES_REGISTER,
'es_index': ES_INDEX,
'doc_type': DOC_TYPE,
}
pool = Pool(processes=4)
# pool = Pool(processes=4)
for platform in PLATFORM_LIST:
res = pool.apply_async(func=search_page_task,
args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX, DOC_TYPE))
result.append(res)
pool.close()
pool.join()
search_page_task(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX)
# res = pool.apply_async(func=search_page_task,
# args=(platform, OUTPUT_TO_ES_RAW, OUTPUT_TO_ES_REGISTER, ES_INDEX))
# result.append(res)
# pool.close()
# pool.join()
print('=================')
for i in result:
print(i.get())
# config file absolute path in serve
# '/home/hanye/crawlers/crawler_sys/framework/config/search_keywords.ini'
......@@ -31,8 +31,7 @@ ARGS = PARSER.parse_args()
es_framework = Elasticsearch(hosts='192.168.17.11', port=80,
http_auth=('crawler', 'XBcasfo8dgfs'))
index_target_releaser = 'search_keywords'
doc_type_target_releaser = 'doc'
if ARGS.platform != []:
PLATFORM_LIST = ARGS.platform
......@@ -98,15 +97,8 @@ for platform in PLATFORM_LIST:
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
else:
crawler.search_video_page(keyword, None,
search_pages_max=search_pages,
output_to_es_raw=OUTPUT_TO_ES_RAW,
output_to_es_register=OUTPUT_TO_ES_REGISTER,
es_index=ES_INDEX,
doc_type=DOC_TYPE)
es_index=ES_INDEX,)
except Exception as e:
print(e)
continue
......
......@@ -44,7 +44,7 @@ from redis.sentinel import Sentinel
# # connect to the database
# rds_1 = sentinel.master_for('ida_redis_master', socket_timeout=1, db=1, decode_responses=True)
rds_1 = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
rds_1 = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-n', '--max_page', default=30, type=int,
......
......@@ -34,7 +34,7 @@ from redis.sentinel import Sentinel
# connect to the database
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=1, decode_responses=True)
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19, decode_responses=True)
parser = argparse.ArgumentParser(description='Specify a platform name.')
parser.add_argument('-p', '--platform', default=[], action='append',
......
......@@ -23,7 +23,7 @@ import kdl, requests
# slave = sentinel.discover_slaves('ida_redis_master')
# # connect to the database
# rds = sentinel.master_for('ida_redis_master', socket_timeout=0.5, db=7, decode_responses=True)
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=18, decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=18, decode_responses=True)
def get_proxy_from_redis():
......
......@@ -26,7 +26,7 @@ from lxml.html.clean import Cleaner
import random
# from mistune import Renderer, InlineGrammar, InlineLexer, Markdown, escape
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19)
# conn = pymysql.connect(host='bj-cdb-6slgqwlc.sql.tencentcdb.com', port=62120, user='work', passwd='Gengmei1',
# db='mimas_test', charset='utf8')
......
......@@ -12,13 +12,13 @@ import time
import copy
import requests
import datetime
import aiohttp
# import aiohttp
import urllib
try:
from crawler_sys.framework.func_get_releaser_id import *
except:
from func_get_releaser_id import *
from write_data_into_es.func_get_releaser_id import *
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing import Process
......
......@@ -20,7 +20,9 @@ import urllib
import base64
import binascii
import datetime
import requests
# import execjs
import hashlib
import requests
from bs4 import BeautifulSoup
......@@ -49,45 +51,47 @@ class Crawler_zhihu():
for popk in pop_key_Lst:
self.video_data.pop(popk)
def get_single_article_page(self,article_id,keyword,proxies=0):
def get_single_answer_page(self, question_id,answer_id, proxies_num):
headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
# "Cookie": "SINAGLOBAL=565010119549.1364.1559571258394; login_sid_t=85753e367d54782a25518436f329cfa0; cross_origin_proto=SSL; _s_tentry=www.baidu.com; Apache=5712976583220.359.1595386386561; ULV=1595386386575:2:1:1:5712976583220.359.1595386386561:1592884354178; UOR=,,login.sina.com.cn; SSOLoginState=1595829153; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ46TE-isMWEvFjmXZnGFZ5JpX5KMhUgL.Fo2Re0zpShqfSoe2dJLoI7_e9gfadcvadcvad7tt; ALF=1627695088; SCF=AlrGNPCzM_VX3PzgxftYKkUv6Gj7FjmOVVbH8EpsTADeRxEeW-7_ipW8LVV7sGN-t7JJA-VwFKC2Ot0ZkHwHstE.; SUB=_2A25yJwQhDeRhGedG6FAQ9CjJzT-IHXVRVXLprDV8PUNbmtAKLRPmkW9NUVHbR2NjdmB2ZEtnFBK75m3CwwTzeqTJ; SUHB=08J6qQipU2qH8A; CARD-MAIN=cfec82595a1164dea323b2fb276c823f",
"Host": "card.weibo.com",
"Referer": "https://card.weibo.com/article/m/show/id/{0}?_wb_client_=1&open_source=weibo_search&luicode=10000011&lfid=100103type%3D21%26q%3D{1}%26t%3D0".format(article_id,urllib.parse.quote(keyword)),
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}
url = "https://card.weibo.com/article/m/aj/detail?id={0}&_t={1}".format(article_id,int(datetime.datetime.now().timestamp()*1e3))
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
# "cookie": '_zap=20547721-b576-4409-95c1-000c6f20517b; d_c0="AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"; __gads=ID=bdc51df6433d4288:T=1562072932:S=ALNI_MbUwg2TeI33p4EnEYpHr8bAKBUiNQ; _ga=GA1.2.929365035.1592357886; tst=r; q_c1=e59a45f95396455e871eb111bdd827e1|1596185954000|1562072927000; _gid=GA1.2.544062079.1596418493; capsion_ticket="2|1:0|10:1596418535|14:capsion_ticket|44:MmJhMzEyNzYzNzE5NDAyOTg3ZGQzNDFmYTFlYjJmMjE=|facc3f88969d538b60f0530ff9bbdb74aa1bb7012584b9dfd2a5f3a3c1fb9726"; z_c0="2|1:0|10:1596418574|4:z_c0|92:Mi4xSDJLUUhRQUFBQUFBZ083dl9NYXNEeVlBQUFCZ0FsVk5EYmdVWUFDcDlBZjhBb0stY3RHTnhNS013YXItcko0VXFn|73520023927845cb04e21a4a1fbfae5d25088de4ffae91090d55cf7a5ba5b008"; _xsrf=MuvTOIUy5KNtEQCk76uG0nAbiqt6IyKS; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1596418492,1596419419,1596517065,1596593781; SESSIONID=6vXjlwH0fidkMDZqOt89HQlyXmxlz1J4fckhELGprA4; JOID=Vl4UCkslT8WtTsL2TigpEe-nCIZaET6Vzia3uXZHJKbOCoKuOFdW7v9Ix_lPjZ6PEBBE8JId1q13KhAqfRCRrDg=; osd=VlATBkIlQcKhR8L4SSQgEeGgBI9aHzmZxya5vnpOJKjJBouuNlBa5_9GwPVGjZCIHBlE_pUR3615LRwjfR6WoDE=; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1596596132; KLBRSID=e42bab774ac0012482937540873c03cf|1596596133|1596593779',
"referer": "https://www.zhihu.com/",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
}
url = "https://www.zhihu.com/question/{0}/answer/{1}".format(question_id,answer_id)
try:
requests_res = retry_get_url(url,headers=headers,proxies=proxies)
res_json = requests_res.json()
# print(res_json)
data = res_json["data"]
requests_res = retry_get_url(url, headers=headers, proxies=proxies_num)
tres_json_test = requests_res.text
res_json = json.loads(re.findall('<script id="js-initialData" type="text/json">(.*?)</script>',tres_json_test)[0])
print(res_json)
data = res_json["initialState"]
video_dic = {}
video_dic["url"] = data["target_url"]
video_dic["title"] = data["title"]
video_dic["fetch_time"] = int(datetime.datetime.now().timestamp()*1e3)
video_dic["release_time"] = trans_strtime_to_timestamp(data["create_at"])
video_dic["play_count"] = int(data["read_count"])
video_dic["content"] = data["content"]
video_dic["releaser"] = data["userinfo"].get('screen_name')
video_dic["releaser_id"] = str(data["userinfo"].get('id'))
video_dic["releaserUrl"] = data["userinfo"].get('url')
video_dic["releaser_id_str"] = "weibo_" + str(video_dic["releaser_id"])
video_dic["img_list"] = re.findall('img src="(.*?)"',data["content"])
video_dic["url"] = url
video_dic["title"] = data["entities"]["answers"][answer_id]["question"]["title"]
video_dic["fetch_time"] = int(datetime.datetime.now().timestamp() * 1e3)
video_dic["release_time"] = int(data["entities"]["answers"][answer_id]["createdTime"]*1e3)
video_dic["voteup_count"] = data["entities"]["answers"][answer_id]["voteupCount"]
video_dic["comment_count"] = data["entities"]["answers"][answer_id]["commentCount"]
video_dic["content"] = data["entities"]["answers"][answer_id]["content"]
video_dic["releaser"] = data["entities"]["answers"][answer_id]["author"]["name"]
video_dic["releaser_id"] = data["entities"]["answers"][answer_id]["author"]["urlToken"]
video_dic["releaserUrl"] = "https://www.zhihu.com/people/%s" %video_dic["releaser_id"]
video_dic["releaser_id_str"] = "zhihu_" + str(video_dic["releaser_id"])
video_dic["img_list"] = re.findall('img src="(.*?)"', video_dic["content"])
video_dic["_id"] = 'zhihu_%s_%s' % (question_id, answer_id)
return video_dic
except Exception as e:
print("single data row formate error %s" % e)
def get_serach_page_cookies(self,keyword):
def get_serach_page_cookies(self, keyword):
url = "https://www.zhihu.com/search?type=content&q=%s" % keyword
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
......@@ -103,30 +107,35 @@ class Crawler_zhihu():
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
}
requests_res = retry_get_url(url,headers=headers)
requests_res = retry_get_url(url, headers=headers)
print(requests_res.cookies.get_dict())
return requests_res.cookies.get_dict()
def parse_sigle_page(self,aid,data_dict,article_type):
if article_type =="knowledge_ad":
def parse_sigle_page(self,article_type,data_dict,proxies_num):
res_dict = {}
if article_type == "knowledge_ad":
pass
elif article_type == "zvideo":
pass
elif article_type == "search_result":
elif article_type == 'search_result':
            article_type = data_dict["object"]["type"]
url = data_dict["object"]["type"]
url = "https://www.zhihu.com/question/{0}/answer/{1}".format(data_dict["object"]["question"]["id"],
data_dict["object"]["id"])
res_dict = self.get_single_answer_page(data_dict["object"]["question"]["id"],data_dict["object"]["id"],proxies_num)
elif article_type == "search_club":
pass
elif article_type == "relevant_query":
pass
else:
pass
return res_dict
def search_article_page(self, keyword, search_pages_max=12,
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None, proxies_num=0):
output_to_es_raw=False,
output_to_es_register=False,
es_index=None,
doc_type=None, data_dict=0,proxies_num=0):
res_cookies_dict = self.get_serach_page_cookies(keyword=keyword)
headers_search = {
......@@ -142,20 +151,21 @@ class Crawler_zhihu():
"x-app-za": "OS=Web",
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": "1.0_a0Oy67L8cXYxFgtBK8FBo6r0NCxxgBN009tBk4Lq2XFY",
"x-zse-86": "1.0_a_Yy6euBS_xfbM28ZhtycHU8gG2XoHtyGTxqHve8rXtY",
"referer": "https://www.zhihu.com/search?type=content&q={0}".format(urllib.parse.quote(keyword)),
}
}
cookies_dict = {
"d_c0":'"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"',
"KLBRSID":None
"d_c0": '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"',
"KLBRSID": None
}
cookies_dict.update(res_cookies_dict)
url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(urllib.parse.quote(keyword))
url = "https://www.zhihu.com/api/v4/search_v3?t=general&q={0}&correction=1&offset=0&limit=20&lc_idx=0&show_all_topics=0".format(
urllib.parse.quote(keyword))
offset = 0
while offset <= search_pages_max*20:
while offset <= search_pages_max * 20:
offset += 20
get_page = retry_get_url(url, headers=headers_search,cookies=cookies_dict)
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict,proxies=proxies_num)
if get_page.status_code != 200:
# retry once
get_page = requests.get(url)
......@@ -163,43 +173,22 @@ class Crawler_zhihu():
continue
page_dict = get_page.json()
url = page_dict["paging"]["next"]
print(get_page.cookies.get_dict())
# print(get_page.cookies.get_dict())
cookies_dict.update(get_page.cookies.get_dict())
headers_search.pop("x-zse-86",0)
headers_search.pop("x-zse-86", 0)
res_list = []
if page_dict.get("data"):
continue
for one_line in page_dict['data']:
try:
article_type = one_line['knowledge_ad']
title = one_line['object']["body"]["title"]
url = one_line['object']["url"]
article_id = re.findall("(\d+)",one_line['scheme'])[0]
releaser = one_line['object']["body"]["authors"][0]["name"]
uid = one_line['object']["body"]["authors"][0]["url_token"]
# releaserUrl = "https://www.toutiao.com/c/user/%s/" % uid
# release_time = one_line['publish_time']
# release_time = int(int(release_time) * 1e3)
fetch_time = int(datetime.datetime.now().timestamp() * 1e3)
# releaser_id = self.get_releaser_id(releaserUrl)
article_type = one_line['type']
res_data = self.parse_sigle_page(article_type,one_line,proxies_num)
if not res_data:
continue
D0 = copy.deepcopy(self.video_data)
D0['title'] = title
# D0['abstract'] = abstract
# D0['url'] = url
# D0['play_count'] = play_count
# D0['comment_count'] = comment_count
# D0['favorite_count'] = favorite_count
D0['article_id'] = article_id
# D0['releaser'] = releaser
# D0['releaserUrl'] = releaserUrl
# D0['release_time'] = release_time
# D0['releaser_id_str'] = "toutiao_%s" % releaser_id
D0['fetch_time'] = fetch_time
D0['search_word'] = keyword
D0["type"] = "article"
try:
article_info = self.get_single_article_page(article_id,keyword, proxies=proxies_num)
D0.update(article_info)
D0.update(res_data)
except Exception as e:
print("method get_web_article_info error %s" % e)
print(D0)
......@@ -238,9 +227,9 @@ class Crawler_zhihu():
es_index=None,
doc_type=None, proxies_num=0):
self.search_article_page(keyword, search_pages_max=search_pages_max, output_to_es_raw=output_to_es_raw,
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type, proxies_num=proxies_num)
output_to_es_register=output_to_es_register,
es_index=es_index,
doc_type=doc_type, proxies_num=proxies_num)
def repost_page(self, weibo_id, user_name, password):
total_page = 0
......@@ -305,8 +294,8 @@ class Crawler_zhihu():
for coo in cookie_pool:
print(coo)
cookie = json.loads(coo)
#cookie = self.manipulate_login(user_name=user_name,password=password)
#cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
# cookie = self.manipulate_login(user_name=user_name,password=password)
# cookie = {"ALC": "ac%3D2%26bt%3D1561705868%26cv%3D5.0%26et%3D1593241868%26ic%3D-621306587%26login_time%3D1561705868%26scf%3D%26uid%3D7211103954%26vf%3D0%26vs%3D0%26vt%3D0%26es%3Db91c9d11ca009f8c4f48080505ae615b", "LT": "1561705868", "tgc": "TGT-NzIxMTEwMzk1NA==-1561705868-tc-6005B5FEAADCEB07A63BA0D6D544CF92-1", "ALF": "1593241868", "SCF": "Ah7YtXJ_s6ue4BJWekcj8HMaZEYi3Kel5243tYoDHC9y0TD9y7MYKIhYu7fV0_BEaPmgGpFKmkyz-WA-cF6-Vgc.", "SUB": "_2A25wEc3cDeRhGeFM6lMQ8C3FzjiIHXVTZrgUrDV_PUNbm9AKLULSkW9NQP7JKShhH9bCX9VIpjzhPXX89XiDiHbj", "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFmSG3DWrqckklXmwYD.UNJ5NHD95QNeo2peK501K-XWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNeKM7eKM0SheX15tt", "sso_info": "v02m6alo5qztKWRk5yljpOQpZCToKWRk5iljoOgpZCjnLGNs4CxjbOMtIyzkLiJp5WpmYO0t4yjhLGMk4CzjpOUtA==", "login": "609423641c81693ee710ee69b0d0e34c"}
if cookie is not None:
for page_num in range(1, 3):
first_url = ('https://weibo.com/u/' + user_id + '?visible=0&is_all=1&is_tag=0'
......@@ -327,12 +316,12 @@ class Crawler_zhihu():
soup_content = BeautifulSoup(content_for_soup, 'html.parser')
weibo_lst = soup_content.find_all('div', {'action-type': 'feed_list_item'})
# time.sleep(15)
for line_count,line in enumerate(weibo_lst):
for line_count, line in enumerate(weibo_lst):
weibo_info = self.get_user_weibo_info(line, cookie)
weibo_info['user_id'] = user_id
weibo_info['user_url'] = 'https://weibo.com/' + user_id
result_lst.append(weibo_info)
print('get data at element page:%s pagebar:%s' % (page_num,line_count))
print('get data at element page:%s pagebar:%s' % (page_num, line_count))
get_parameter = soup.find_all('script', {'type': 'text/javascript'})
for line in get_parameter:
if 'pid' in str(line) and 'oid' in str(line):
......@@ -400,8 +389,6 @@ class Crawler_zhihu():
def get_releaser_id(self, releaserUrl):
return get_releaser_id(platform=self.platform, releaserUrl=releaserUrl)
@staticmethod
def get_img(data):
img_list = []
......@@ -542,6 +529,41 @@ class Crawler_zhihu():
if __name__ == '__main__':
zhihu = Crawler_zhihu()
import os
# import PyV8
import execjs
os.environ["EXECJS_RUNTIME"] = 'Node'
# zhihu.get_serach_page_cookies("热玛吉")
zhihu.search_page("热玛吉")
# zhihu.search_page("比基尼线脱毛")
# zhihu.get_single_answer_page("325099876","1209953121")
# print(user_page)
if True:
# with PyV8.JSContext() as ctx:
url = "api/v4/search_v3?t=general&q=%E7%83%AD%E7%8E%9B%E5%90%89&correction=1&offset=20&limit=20&lc_idx=25&show_all_topics=0&search_hash_id=12d60c255d0be17b9830355a0d04de5b&vertical_info=0%2C1%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C1"
referer = "https://www.zhihu.com/search?type=content&q=%E7%83%AD%E7%8E%9B%E5%90%89"
f = "+".join(["3_2.0", url, referer, '"AIDu7_zGrA-PToWVy-siVNLS835i5YXmFCQ=|1562072925"'])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
# with open('./zhihu_js.js', 'r') as f:
# # print(f.read())
# ctx.eval(f.read())
# encrypt_str = ctx.locals.add('b',fmd5)
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
encrypt_str = execjs.compile(js).call('b', fmd5)
headers = {
"referer": referer,
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"cookie": 'd_c0="AACSLMY7lBGPTo9fXdy2pmiGQ4ZVVUcqzC4=|1594785557";',
"x-api-version": "3.0.91",
"x-zse-83": "3_2.0",
"x-zse-86": "1.0_%s" % encrypt_str,
}
print(headers)
r = requests.get("https://www.zhihu.com" + url, headers=headers)
print(r.text)
......@@ -4,7 +4,7 @@
import redis,time,json,datetime,sys
from maintenance.func_send_email_with_file import send_file_email
rds = redis.StrictRedis(host='192.144.194.190', port=6379, db=19,decode_responses=True)
rds = redis.StrictRedis(host='172.18.51.10', port=6379, db=19,decode_responses=True)
def write_email_task_to_redis(task_name=None,file_path=None, data_str=None, email_group=[],
......
# -*- coding:UTF-8 -*-
# @Time : 2020/8/5 14:29
# @File : func_calculate_zhihu_id.py
# @email : litao@igengmei.com
# @author : litao
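# Shared by douban and zhihu in vid_cal_func: zhihu answer docs use _id with the
# "zhihu_" prefix stripped; anything else falls back to the raw url.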
def calculate_douban_id(data_dic):
if "answer" in data_dic["url"]:
return data_dic["_id"].replace("zhihu_","")
else:
return data_dic["url"]
\ No newline at end of file
......@@ -32,6 +32,7 @@ def vid_cal_func(platform):
"haokan":calculate_haokan_id,
"weibo":calculate_weibo_id,
"douban":calculate_douban_id,
"zhihu":calculate_douban_id,
}
def general_vid_cal_func(url):
......