Commit d1b63291 authored by haowang's avatar haowang

modify

parent 607c82f7
.idea
__pycache__
env
from zhihu_captcha import zhihu_captcha
import tensorflow as tf
from zhihu_captcha import utils
from tensorflow.python.training import moving_averages
FLAGS = utils.FLAGS
num_classes = utils.num_classes
class LSTMOCR(object):
def __init__(self, mode):
self.mode = mode
# image input
self.inputs = tf.placeholder(tf.float32, [None, FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
# ctc_loss expects the labels as a sparse tensor
self.labels = tf.sparse_placeholder(tf.int32)
# 1-D array of size [batch_size]
self.seq_len = tf.placeholder(tf.int32, [None])
self._extra_train_ops = [] # ops that update the batch-norm moving mean and moving variance
def build_graph(self):
self._build_model()
self._build_train_op()
self.merged_summay = tf.summary.merge_all()
def _build_model(self):
"""
构建模型,前两个卷积的卷积核size 分别是7,5 是很重要的,换成其他的效果会差很多
"""
filters = [32, 64, 128, 128, FLAGS.max_stepsize]
strides = [1, 2]
with tf.variable_scope('cnn'):
with tf.variable_scope('unit-0'):
x = self._conv2d(self.inputs, 'cnn-0', 7, 1, filters[0], strides[0]) # convolution
x = self._batch_norm('bn0', x) # batch normalization
x = self._leaky_relu(x, 0.01) # non-linear activation
x = self._max_pool(x, 2, strides[0]) # pooling
with tf.variable_scope('unit-1'):
x = self._conv2d(x, 'cnn-1', 5, filters[0], filters[1], strides[0])
x = self._batch_norm('bn1', x)
x = self._leaky_relu(x, 0.01)
x = self._max_pool(x, 2, strides[1])
with tf.variable_scope('unit-2'):
x = self._conv2d(x, 'cnn-2', 3, filters[1], filters[2], strides[0])
x = self._batch_norm('bn2', x)
x = self._leaky_relu(x, 0.01)
x = self._max_pool(x, 2, strides[1])
with tf.variable_scope('unit-3'):
x = self._conv2d(x, 'cnn-3', 3, filters[2], filters[3], strides[0])
x = self._batch_norm('bn3', x)
x = self._leaky_relu(x, 0.01)
x = self._max_pool(x, 2, strides[1])
with tf.variable_scope('unit-4'):
x = self._conv2d(x, 'cnn-4', 3, filters[3], filters[4], strides[0])
x = self._batch_norm('bn4', x)
x = self._leaky_relu(x, 0.01)
x = self._max_pool(x, 2, strides[1])
with tf.variable_scope('lstm'):
shp = x.get_shape().as_list()
x = tf.reshape(x, [-1, filters[4], shp[1]*shp[2]])
# tf.nn.rnn_cell.RNNCell, tf.nn.rnn_cell.GRUCell
cell = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
if self.mode == 'train':
cell = tf.contrib.rnn.DropoutWrapper(cell=cell, output_keep_prob=0.8)
cell1 = tf.contrib.rnn.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
if self.mode == 'train':
cell1 = tf.contrib.rnn.DropoutWrapper(cell=cell1, output_keep_prob=0.8)
# stack the RNN into two layers
stack = tf.contrib.rnn.MultiRNNCell([cell, cell1], state_is_tuple=True)
# outputs holds the results of all steps; state is only the last step's state and is not needed here
outputs, _ = tf.nn.dynamic_rnn(stack, x, self.seq_len, dtype=tf.float32)
# reshape so that every time step becomes one row
outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])
W = tf.get_variable(name='W',
shape=[FLAGS.num_hidden, num_classes],
dtype=tf.float32,
initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable(name='b',
shape=[num_classes],
dtype=tf.float32,
initializer=tf.constant_initializer())
self.logits = tf.matmul(outputs, W) + b
# reshape so that the last dimension is num_classes
shape = tf.shape(x)
self.logits = tf.reshape(self.logits, [shape[0], -1, num_classes])
# Time major
self.logits = tf.transpose(self.logits, (1, 0, 2))
def _build_train_op(self):
self.global_step = tf.Variable(0, trainable=False)
# CTC loss, computed with the forward-backward algorithm and maximum likelihood
self.loss = tf.nn.ctc_loss(labels=self.labels,
inputs=self.logits,
sequence_length=self.seq_len)
self.cost = tf.reduce_mean(self.loss)
tf.summary.scalar('cost', self.cost)
self.lrn_rate = tf.train.exponential_decay(FLAGS.initial_learning_rate,
self.global_step,
FLAGS.decay_steps,
FLAGS.decay_rate,
staircase=True)
# self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.lrn_rate,
# momentum=FLAGS.momentum).minimize(self.cost,
# global_step=self.global_step)
# self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.lrn_rate,
# momentum=FLAGS.momentum,
# use_nesterov=True).minimize(self.cost,
# global_step=self.global_step)
self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.initial_learning_rate,
beta1=FLAGS.beta1,
beta2=FLAGS.beta2).minimize(self.loss,
global_step=self.global_step)
train_ops = [self.optimizer] + self._extra_train_ops
self.train_op = tf.group(*train_ops)
# Option 2: tf.contrib.ctc.ctc_beam_search_decoder
# (it's slower but you'll get better results)
# decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len,merge_repeated=False)
self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder(self.logits,
self.seq_len,
merge_repeated=False) # beam search for the best path
self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1) # decode to a dense tensor
# convolution
def _conv2d(self, x, name, filter_size, in_channels, out_channels, strides):
with tf.variable_scope(name):
kernel = tf.get_variable(name='DW',
shape=[filter_size, filter_size, in_channels, out_channels],
dtype=tf.float32,
initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable(name='bais',
shape=[out_channels],
dtype=tf.float32,
initializer=tf.constant_initializer())
con2d_op = tf.nn.conv2d(x, kernel, [1, strides, strides, 1], padding='SAME')
return tf.nn.bias_add(con2d_op, b) # add the bias and return
# batch normalization
def _batch_norm(self, name, x):
"""Batch normalization."""
with tf.variable_scope(name):
params_shape = [x.get_shape()[-1]] # last dimension of the tensor; the mean and variance below have this shape
# after normalizing to zero mean and unit variance, apply the affine adjustment x = x*gamma + beta
# gamma and beta are learned continuously during training
beta = tf.get_variable(
'beta', params_shape, tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32))
gamma = tf.get_variable(
'gamma', params_shape, tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32))
# during training, keep updating the moving mean and moving variance
# at inference time, the restored moving mean and variance from training are used for normalization
if self.mode == 'train':
mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments') # batch mean and variance, shape = [last dimension]
# the names moving_mean / moving_variance must be identical in training and inference, otherwise the trained values cannot be restored
moving_mean = tf.get_variable(
'moving_mean', params_shape, tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32),
trainable=False)
moving_variance = tf.get_variable(
'moving_variance', params_shape, tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32),
trainable=False)
self._extra_train_ops.append(moving_averages.assign_moving_average(
moving_mean, mean, 0.9))
self._extra_train_ops.append(moving_averages.assign_moving_average(
moving_variance, variance, 0.9))
else:
# the name must match the one used in training: moving_mean
mean = tf.get_variable(
'moving_mean', params_shape, tf.float32,
initializer=tf.constant_initializer(0.0, tf.float32),
trainable=False)
# the name must match the one used in training: moving_variance
variance = tf.get_variable(
'moving_variance', params_shape, tf.float32,
initializer=tf.constant_initializer(1.0, tf.float32),
trainable=False)
# visualization
tf.summary.histogram(mean.op.name, mean)
tf.summary.histogram(variance.op.name, variance)
# apply the normalization; the last argument is epsilon, which is usually a small value
x_bn = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
x_bn.set_shape(x.get_shape())
return x_bn
# Leaky ReLU, a ReLU variant
# ReLU is simple, effective, and easy to differentiate
# its constant gradient on the non-negative side helps mitigate vanishing gradients
def _leaky_relu(self, x, leakiness=0.0):
return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
def _max_pool(self, x, ksize, strides):
return tf.nn.max_pool(x,
ksize=[1, ksize, ksize, 1],
strides=[1, strides, strides, 1],
padding='SAME',
name='max_pool')
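# A minimal inference sketch, assuming a trained checkpoint exists under
# FLAGS.checkpoint_dir and that 'sample.png' is a 60x150 grayscale captcha;
# both names are placeholders, not part of this repo.
if __name__ == '__main__':
    import numpy as np
    from PIL import Image
    model = LSTMOCR('infer')
    model.build_graph()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        im = np.array(Image.open('sample.png').convert('L')).astype(np.float32) / 255.
        im = np.reshape(im, [1, FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
        feed = {model.inputs: im,
                model.seq_len: np.array([FLAGS.max_stepsize], dtype=np.int32)}
        # dense_decoded has shape [1, label_len]; -1 marks padded positions
        print(sess.run(model.dense_decoded, feed))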
import os
import numpy as np
import tensorflow as tf
#import cv2
from PIL import Image
# 10 digits + 26 lowercase letters + space + CTC blank
num_classes = 38
maxPrintLen = 100
tf.app.flags.DEFINE_boolean('restore', True, 'whether to restore from the latest checkpoint')
tf.app.flags.DEFINE_string('checkpoint_dir', './checkpoint/', 'the checkpoint dir')
tf.app.flags.DEFINE_float('initial_learning_rate', 1e-3, 'initial lr')
tf.app.flags.DEFINE_integer('image_height', 60, 'image height')
tf.app.flags.DEFINE_integer('image_width', 150, 'image width')
tf.app.flags.DEFINE_integer('image_channel', 1, 'image channels as input')
tf.app.flags.DEFINE_integer('max_stepsize', 64, 'max stepsize in lstm, as well as '
'the output channels of last layer in CNN')
tf.app.flags.DEFINE_integer('num_hidden', 128, 'number of hidden units in lstm')
tf.app.flags.DEFINE_integer('num_epochs', 1000, 'maximum epochs')
tf.app.flags.DEFINE_integer('batch_size', 128, 'the batch_size')
tf.app.flags.DEFINE_integer('save_steps', 500, 'the step to save checkpoint')
tf.app.flags.DEFINE_integer('validation_steps', 500, 'the step interval for validation')
tf.app.flags.DEFINE_float('decay_rate', 0.98, 'the lr decay rate')
tf.app.flags.DEFINE_float('beta1', 0.9, 'parameter of adam optimizer beta1')
tf.app.flags.DEFINE_float('beta2', 0.999, 'adam parameter beta2')
tf.app.flags.DEFINE_integer('decay_steps', 1000, 'the lr decay_step for optimizer')
tf.app.flags.DEFINE_float('momentum', 0.9, 'the momentum')
tf.app.flags.DEFINE_string('train_dir', './imgs/train/', 'the train data dir')
tf.app.flags.DEFINE_string('val_dir', './imgs/val/', 'the val data dir')
tf.app.flags.DEFINE_string('infer_dir', './imgs/infer/', 'the infer data dir')
tf.app.flags.DEFINE_string('logs_dir', './log', 'the logging dir')
tf.app.flags.DEFINE_string('mode', 'train', 'train, val or infer')
tf.app.flags.DEFINE_integer('num_gpus', 1, 'num of gpus')
FLAGS = tf.app.flags.FLAGS
# num_batches_per_epoch = int(num_train_samples/FLAGS.batch_size)
import string
charset = string.digits + string.ascii_lowercase  # 0-9 and a-z
encode_maps = {}
decode_maps = {}
for i, char in enumerate(charset, 1):
encode_maps[char] = i
decode_maps[i] = char
SPACE_INDEX = 0
SPACE_TOKEN = ''
encode_maps[SPACE_TOKEN] = SPACE_INDEX
decode_maps[SPACE_INDEX] = SPACE_TOKEN
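# Example of the mapping defined above (values follow from the charset order):
#   [encode_maps[c] for c in 'a1']            -> [11, 2]
#   ''.join(decode_maps[i] for i in [11, 2])  -> 'a1'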
class DataIterator:
def __init__(self, data_dir):
self.image = []  # all images are kept in memory so that batches can be sliced quickly
self.labels = []
for root, sub_folder, file_list in os.walk(data_dir):
for file_path in file_list:
image_name = os.path.join(root, file_path)
# im = np.array(Image.open(image_name)).astype(np.float32)/255.
im = np.array(Image.open(image_name).convert("L")).astype(np.float32)/255.
# im = np.array(Image.open(image_name).convert("L").point(lambda x: 0 if x < 150 else 1)).astype(np.float32)
# im = cv2.imread(image_name, 0).astype(np.float32)/255.
# resize to same height, different width will consume time on padding
# im = cv2.resize(im, (image_width, image_height))
im = np.reshape(im, [FLAGS.image_height, FLAGS.image_width, FLAGS.image_channel])
self.image.append(im)
# image is named as /.../<folder>/00000_abcd.png
code = image_name.split(os.sep)[-1].split('_')[1].split('.')[0] # code is the captcha text
code = [SPACE_INDEX if code == SPACE_TOKEN else encode_maps[c] for c in list(code)] # convert the text into a list of character codes, e.g. [1, 2, 3, 4]
self.labels.append(code)
# expose size as a property so callers can write self.size instead of self.size()
@property
def size(self):
return len(self.labels)
# given a list of indices, fetch the corresponding labels
def the_label(self, indexs):
labels = []
for i in indexs:
labels.append(self.labels[i])
return labels
# given a list of indices, build one batch of training data
def input_index_generate_batch(self, index=None):
if index:
image_batch = [self.image[i] for i in index]
label_batch = [self.labels[i] for i in index]
else:
image_batch = self.image
label_batch = self.labels
def get_input_lens(sequences):
# sequence length of each sample; captcha images all have the same length
# (unlike sentences of varying length), so every entry is identical
lengths = np.asarray([FLAGS.max_stepsize for _ in sequences], dtype=np.int64)
return sequences, lengths
batch_inputs, batch_seq_len = get_input_lens(np.array(image_batch))
batch_labels = sparse_tuple_from_label(label_batch) # convert to a sparse tuple
return batch_inputs, batch_seq_len, batch_labels
# compare the decoded labels against the ground-truth labels and compute the accuracy
def accuracy_calculation(original_seq, decoded_seq, ignore_value=-1, isPrint=False):
if len(original_seq) != len(decoded_seq):
print('original_seq length differs from decoded_seq, please check again')
return 0
count = 0
for i, origin_label in enumerate(original_seq):
decoded_label = [j for j in decoded_seq[i] if j != ignore_value]
if isPrint and i < maxPrintLen:
# print('seq{0:4d}: origin: {1} decoded:{2}'.format(i, origin_label, decoded_label))
with open('./test.csv', 'w') as f:
f.write(str(origin_label) + '\t' + str(decoded_label))
f.write('\n')
if origin_label == decoded_label:
count += 1
return count * 1.0 / len(original_seq)
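# For example, accuracy_calculation([[1, 2, 3]], [[1, 2, 3, -1, -1]]) returns 1.0:
# the ignore_value entries (-1, the CTC padding) are stripped before comparison.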
def sparse_tuple_from_label(sequences, dtype=np.int32):
"""
根据[[1,2,3,4],[5,2,6,5],...] 这种形式,生成稀疏矩阵
稀疏矩阵由三个元素的tuple组成,即(indices, values, shape)
indices和values的都是一个列表,列表元素刚好一一对应,
一个代表坐标位置,一个代表这个位置的值,其中indices是一个
[(0,1),(0,2),...(10,3),(10,4),...]这样的形式的列表,指示了
对应的values的值在密集矩阵的坐标,values 是[1,2,3,...,100,...]
这样的形式,最后一个shape描述密集矩阵的shape
示例:
indices = [(0,1),(0,2),(0,3),(1,1),(1,3),(2,2)]
values = [1,2,3,4,5,6]
shape = [4,3]
则对应的密集矩阵就是
0 1 2 3
0 4 0 5
0 0 6 0
参数:
sequences: 一个列表,列表里面是每个验证码的码字列表
返回:
(indices, values, shape)
"""
indices = []
values = []
for n, seq in enumerate(sequences):
indices.extend(zip([n] * len(seq), range(len(seq))))
values.extend(seq)
indices = np.asarray(indices, dtype=np.int64)
values = np.asarray(values, dtype=dtype)
shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)
return indices, values, shape
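# Worked example of the construction above:
#   sparse_tuple_from_label([[1, 2, 3, 4], [5, 2, 6]]) returns
#   indices = [[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2]]
#   values  = [1, 2, 3, 4, 5, 2, 6]
#   shape   = [2, 4]   # 2 sequences, and the largest column index is 3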
import requests
import time
import json
import os
import sys
from bs4 import BeautifulSoup as BS
import urllib.parse
import webbrowser
from io import BytesIO
from zhihu_captcha import utils
from zhihu_captcha import orcmodel
import tensorflow as tf
from PIL import Image
import numpy as np
try:
type (eval('model'))
except:
model = orcmodel.LSTMOCR('infer')
model.build_graph()
config = tf.ConfigProto(allow_soft_placement=True)
checkpoint_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoint")
class ZhihuCaptcha():
def __init__(self, username=None, password=None):
if sys.path[0]: os.chdir(sys.path[0]) # use the script's directory as the working directory
# restore the trained weights
self.__sess = self.__restoreSess(checkpoint_dir)
# restore the trained weights
def __restoreSess(self, checkpoint=checkpoint_dir):
sess=tf.Session(config=config)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
ckpt = tf.train.latest_checkpoint(checkpoint)
if ckpt:
# restore the weights; note that global_step is loaded as well
saver.restore(sess, ckpt)
# print('restore from the checkpoint{0}'.format(ckpt))
print('Loaded checkpoint {0}'.format(ckpt))
else:
print('Warning: no checkpoint was loaded')
print('If this is not what you expected, make sure a usable checkpoint exists in:\n{0}'.format(checkpoint_dir))
return sess
def recgImg(self, img):
"""
可以在线测试验证码识别功能
参数:
img 一个 (60, 150) 的图片
"""
im = np.array(img.convert("L")).astype(np.float32)/255.
im = np.reshape(im, [60, 150, 1])
inp=np.array([im])
seq_len_input=np.array([np.array([64 for _ in inp], dtype=np.int64)])
#seq_len_input = np.asarray(seq_len_input)
seq_len_input = np.reshape(seq_len_input, [-1])
imgs_input = np.asarray([im])
feed = {model.inputs: imgs_input,model.seq_len: seq_len_input}
dense_decoded_code = self.__sess.run(model.dense_decoded, feed)
expression = ''
for i in dense_decoded_code[0]:
if i == -1:
expression += ''
else:
expression += utils.decode_maps[i]
return expression
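# Usage sketch, assuming a usable checkpoint in the directory above; the import
# path follows the package layout suggested by the imports in this repo, and
# 'captcha.png' is a placeholder:
#
#   from PIL import Image
#   from zhihu_captcha.zhihu_captcha import ZhihuCaptcha
#   recognizer = ZhihuCaptcha()
#   # resize((150, 60)) yields the (60, 150) image recgImg expects
#   print(recognizer.recgImg(Image.open('captcha.png').resize((150, 60))))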
import pymysql
import hashlib
import requests
import execjs
import os
import re
import sys
import time
from datetime import datetime
import kdl
HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
# JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'
def login():
url = 'https://www.zhihu.com'
loginUrl = 'https://www.zhihu.com/login/email'
headers = {
# "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
"Referer": "http://www.zhihu.com/",
'Host': 'www.zhihu.com',
'rememberme': "true"
}
data = {
'email': 'yousangdandan@yeah.net',
'password': '5358569'
}
global session
session = requests.session()
login_req = session.post(loginUrl, data=data,headers=headers)
print('loginReq:{}'.format(login_req.status_code))
return login_req
def get_proxy():
auth = kdl.Auth("990866563045611", APIKEY)
client = kdl.Client(auth)
ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
print("dps proxy: ", ips, client.get_proxy_authorization())
# return { "http": "http://{}".format(ips[0]), "https": "https://{}".format(ips[0]), }, client.get_proxy_authorization()
return { "http": "http://{}".format('171.35.213.172:9999'), "https": "https://{}".format('171.35.213.172:9999'), }, client.get_proxy_authorization()
def retry_get_url(url, retrys=5, headers={}, timeout=10, **kwargs):
retry_c = 0
while retry_c < retrys:
time.sleep(3)
try:
# proxies, proxy_authorization = get_proxy()
# headers.update(proxy_authorization)
# get_resp = requests.get(url, headers=headers, timeout=timeout, proxies=proxies, **kwargs)
get_resp = requests.get(url, headers=headers, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
print(e)
# proxies, proxy_authorization = get_proxy()
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
return None
class Spider(object):
def __init__(self, spider_url):
'''
Initialize the database connection and set up the JS rules.
'''
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
db=DB, charset='utf8')
self.cur = self.conn.cursor()
self.page_count = 1000
self.use_proxy = True
self.spider_url = spider_url
detail_url = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={}&limit=20&sort_by=created'
self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + detail_url
os.environ["EXECJS_RUNTIME"] = 'Node'
try:
with open('./zhihu.js', 'r', encoding='utf-8') as f:
js = f.read()
except:
with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
js = f.read()
# self.exec_js = execjs.compile(js)
self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
def headers_handle(self, url):
'''
Disguise the request headers for the URL.
'''
'''
{
'Server': 'CLOUD ELB 1.0.0',
'Date': 'Fri, 01 Jan 2021 08:36:59 GMT',
'Content-Type': 'text/html; charset=utf-8',
'Vary': 'Accept-Encoding',
'content-security-policy': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net", 'x-content-security-policy': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net", 'x-webkit-csp': "default-src * blob:; img-src * data: blob: resource: t.captcha.qq.com cstaticdun.126.net necaptcha.nosdn.127.net; connect-src * wss: blob: resource:; frame-src 'self' *.zhihu.com mailto: tel: weixin: *.vzuu.com mo.m.taobao.com getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 zhihujs: captcha.guard.qcloud.com pos.baidu.com dup.baidustatic.com openapi.baidu.com wappass.baidu.com passport.baidu.com *.cme.qcloud.com vs-cdn.tencent-cloud.com t.captcha.qq.com c.dun.163.com; script-src 'self' blob: *.zhihu.com g.alicdn.com qzonestyle.gtimg.cn res.wx.qq.com open.mobile.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com resource: captcha.gtimg.com captcha.guard.qcloud.com pagead2.googlesyndication.com cpro.baidustatic.com pos.baidu.com dup.baidustatic.com i.hao61.net 'nonce-8555a150-24a4-490b-9a1e-d48bdb590dfe' hm.baidu.com zz.bdstatic.com b.bdstatic.com imgcache.qq.com vs-cdn.tencent-cloud.com 
gw.alipayobjects.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net; style-src 'self' 'unsafe-inline' *.zhihu.com unicom.zhimg.com resource: captcha.gtimg.com ssl.captcha.qq.com t.captcha.qq.com cstaticdun.126.net c.dun.163.com ac.dun.163.com/ acstatic-dun.126.net",
'x-frame-options': 'SAMEORIGIN',
'strict-transport-security': 'max-age=15552000; includeSubDomains',
'surrogate-control': 'no-store',
'cache-control': 'no-cache, no-store, must-revalidate, private, max-age=0',
'pragma': 'no-cache',
'expires': '0',
'x-content-type-options': 'nosniff',
'x-xss-protection': '1; mode=block',
'X-Backend-Response': '0.022',
'Referrer-Policy': 'no-referrer-when-downgrade',
'X-SecNG-Response': '0.023000001907349',
'x-lb-timing': '0.023',
'x-idc-id': '2',
'Set-Cookie': 'KLBRSID=fb3eda1aa35a9ed9f88f346a7a3ebe83|1609490219|1609490131; Path=/',
'X-Cache-Lookup': 'Cache Miss, Cache Miss',
'Content-Encoding': 'gzip',
'Transfer-Encoding': 'chunked',
'X-NWS-LOG-UUID': '16179100128830453442',
'Connection': 'keep-alive',
'x-edge-timing': '0.061',
'x-cdn-provider': 'tencent'}
'''
'''
html
cache-control: no-cache, no-store, must-revalidate, private, max-age=0
content-encoding: gzip
content-type: application/json
date: Fri, 01 Jan 2021 07:01:52 GMT
etag: W/"6a7ddf80b3ab19ba789d570163ac1eacb4bde53e"
expires: Fri, 02 Jan 2000 00:00:00 GMT
pragma: no-cache
referrer-policy: no-referrer-when-downgrade
server: CLOUD ELB 1.0.0
set-cookie: KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1609484511|1609484497; Path=/
vary: Accept-Encoding
x-backend-response: 0.573
x-cache-lookup: Cache Miss
x-cdn-provider: tencent
x-edge-timing: 0.634
x-idc-id: 2
x-lb-timing: 0.599
x-nws-log-uuid: 12448536375904178345
x-secng-response: 0.59800004959106
x-udid: AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=
'''
res_cookies_dict = self.get_serach_page_cookies()
referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
headers_search = {
"accept": "*/*",
"accept-encoding": "gzip, deflate",
"accept-language": "zh-CN,zh;q=0.9",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
"x-api-version": "3.0.91",
"x-app-za": "OS=Web",
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": None,
"referer": referer + "/answers?page=1",
}
cookies_dict = {
"d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
"KLBRSID": '5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497',
}
cookies_dict.update(res_cookies_dict)
f = "+".join(
["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
fmd5 = hashlib.new('md5', f.encode()).hexdigest()
headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
return headers_search, cookies_dict
def get_serach_page_cookies(self):
'''
Refresh the cookies.
'''
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": '_xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609484506; KLBRSID=5430ad6ccb1a51f38ac194049bce5dfe|1609484506|1609484497',
"referer": self.spider_url,
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
}
requests_res = retry_get_url(self.spider_url, headers=headers)
return requests_res.cookies.get_dict()
def update_page_count(self, answer_count):
count = int(answer_count / 20)
temp = int(answer_count % 20)
if temp > 0:
count += 1
self.page_count = count
def check_data_exist(self, data_dict, mark):
'''
Check whether the record already exists before inserting.
'''
sql = "select id from {table} where answer_id = {id_}"
exist = None
if mark == 0:
select_sql = sql.format(table='zhihu_answer', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if mark == 1:
select_sql = sql.format(table='zhihu_article', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if exist:
return True
return False
def parse_sigle_page(self, data_dict, mark):
'''
Insert the main content data and the image URLs.
'''
if not self.check_data_exist(data_dict, mark):
if mark == 0:
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
data_dict["comment_count"], data_dict["content"])
elif mark == 1:
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (
data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
data_dict["content"])
print(data_dict["question"]["title"])
self.cur.execute(into, values)
self.conn.commit()
return
def search_page(self, mark, page_max, start_page=0, need_commend=False):
'''
Main entry point.
params:
mark 0 answer, 1 article, 2 thought
'''
offset = start_page
for i in range(page_max):
if i > self.page_count - 1:
break
if mark == 0:
self.search_answer_article_page(offset, 0, 0)
elif mark == 1:
self.search_answer_article_page(offset, 1, 0)
elif mark == 2:
self.search_thought_page(offset)
offset = offset + 20
time.sleep(10)
self.conn.close()
return
def get_page_data(self, url, headers_search, cookies_dict):
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict)
if get_page.status_code != 200:
# retry once
get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict)
if get_page.status_code != 200:
print("article_error, url : ", url, " status_code: ", get_page.status_code)
try:
page_dict = get_page.json()
print('get page json data success! {}'.format(url))
return page_dict
except:
print('retry get page data : {}'.format(url))
return self.get_page_data(url, headers_search, cookies_dict)
def search_answer_article_page(self, offset, mark, proxies_num=0):
'''
Request the answer and article data pages.
'''
offset = str(offset)
if mark == 0:
url = self.ANSWER_URL.format(offset)
elif mark == 1:
url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
page_dict = self.get_page_data(url, headers_search, cookies_dict)
if page_dict.get("data"):
print(self.page_count)
if self.page_count == 1000:
self.update_page_count(page_dict["paging"].get("totals", 0))
for one_line in page_dict['data']:
try:
if one_line["content"] != None:
self.parse_sigle_page(one_line, mark)
print("finshed_crawler " + offset)
except KeyError:
# It's totally ok to drop the last return data value.
# The search api just return something seems related to search
print('page data error')
continue
else:
print("article_data_error, offset: ", offset, " url: ", url)
self.use_proxy = True
self.search_answer_article_page(offset=offset, mark=mark)
return
if __name__ == '__main__':
'''
python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
python script_file_path mark(content type: 0 answer, 1 article, 2 thought) max_page(maximum number of pages) start_page(starting page, 0-based) spider_url(the Zhihu profile URL of the user to crawl)
'''
mark = int(sys.argv[1])
max_page = int(sys.argv[2])
start_page = int(sys.argv[3])
spider_url = sys.argv[4]
# spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
print(datetime.now())
spider = Spider(spider_url=spider_url)
if mark == 0:
spider.search_page(mark, max_page, start_page)
elif mark == 1:
spider.search_page(mark, max_page, start_page)
elif mark == 2:
spider.search_page(mark, max_page, start_page)
print(datetime.now())
@@ -7,43 +7,18 @@ import re
 import sys
 import time
 from datetime import datetime
-import kdl
+import base64
+from requests_html import HTMLSession
+from PIL import Image
+from captcha.zhihu_captcha import ZhihuCaptcha
 HOST = '172.18.51.14'
 PORT = 3306
 USER = 'spider'
 PASSWD = 'Gengmei123'
 DB = 'spider'
-# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
+JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
-JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
+# JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
-APIKEY = 'quxguz4hwm9cxnx6wpjhkokx04klpr8v'
-def get_proxy():
-auth = kdl.Auth("990866563045611", APIKEY)
-client = kdl.Client(auth)
-ips = client.get_dps(1, sign_type='hmacsha1', format='json', area='北京,上海,广东')
-print("dps proxy: ", ips)
-return { "http": "http://{}".format(ips[0]), "https": "https://{}".format(ips[0]), }
-def retry_get_url(url, retrys=5, timeout=10, proxies=None, **kwargs):
-retry_c = 0
-while retry_c < retrys:
-try:
-if proxies:
-get_resp = requests.get(url, timeout=timeout, proxies=proxies, **kwargs)
-else:
-get_resp = requests.get(url, timeout=timeout, **kwargs)
-return get_resp
-except Exception as e:
-retry_c += 1
-time.sleep(3)
-print(e)
-proxies = get_proxy()
-print('Failed to get page %s after %d retries, %s'
-% (url, retrys, datetime.now()))
-return None
 class Spider(object):
@@ -52,6 +27,7 @@ class Spider(object):
 '''
 Initialize the database connection and set up the JS rules.
 '''
+self.login_req = self._login()
 self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
 passwd=PASSWD,
 db=DB, charset='utf8')
@@ -71,9 +47,77 @@ class Spider(object):
 except:
 with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
 js = f.read()
-self.exec_js = execjs.compile(js)
+# self.exec_js = execjs.compile(js)
-# self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
+self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
+self.session = requests.session()
+self.HTMLSession = HTMLSession()
+def _login(self):
+url = 'https://www.zhihu.com'
+loginUrl = 'https://www.zhihu.com/login/email'
+headers = {
+'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36',
+"Referer": "http://www.zhihu.com/",
+'Host': 'www.zhihu.com',
+'rememberme': "true"
+}
+self.headers = headers
+data = {
+'email': 'yousangdandan@yeah.net',
+'password': '5358569'
+}
+login_req = self.session.post(loginUrl, data=data, headers=headers)
+print('loginReq:{}'.format(login_req.status_code))
+self.set_cookie(self.session.cookies)
+return login_req
+def set_cookies(self, cookies):
+self.HTMLSession.cookies = cookies
+self.session.cookies = cookies
+def appeal(self, url):
+self.captcha_model = ZhihuCaptcha()
+r = self.HTMLSession.get('https://www.zhihu.com/api/v4/anticrawl/captcha_appeal')
+captchaUrl = r.json()['img_base64']
+captchaUrl = re.sub('\n', '', captchaUrl)
+with open('cache/captcha2.png', 'wb') as f:
+img_base64 = base64.b64decode(captchaUrl.strip('data:image/png;base64,').strip())
+print(img_base64)
+f.write(img_base64)
+im = Image.open('cache/captcha2.png')
+captcha = self.captcha_model.recgImg(im)
+print(captcha)
+r = self.HTMLSession.post('https://www.zhihu.com/api/v4/anticrawl/captcha_appeal',
+data=json.dumps({"captcha": captcha}),
+headers={"User-Agent": user_agent,
+"referer": 'https://www.zhihu.com/account/unhuman?type=unhuman&message=%E7%B3%BB%E7%BB%9F%E7%9B%91%E6%B5%8B%E5%88%B0%E6%82%A8%E7%9A%84%E7%BD%91%E7%BB%9C%E7%8E%AF%E5%A2%83%E5%AD%98%E5%9C%A8%E5%BC%82%E5%B8%B8%EF%BC%8C%E4%B8%BA%E4%BF%9D%E8%AF%81%E6%82%A8%E7%9A%84%E6%AD%A3%E5%B8%B8%E8%AE%BF%E9%97%AE%EF%BC%8C%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81%E8%BF%9B%E8%A1%8C%E9%AA%8C%E8%AF%81%E3%80%82&need_login=false',
+'Content-Type': 'application/json',
+'x-xsrftoken': self.HTMLSession.cookies._cookies['.zhihu.com']['/']['_xsrf'].value})
+return self.HTMLSession.get(url, allow_redirects=False)
+def retry_get_url(self, url, retrys=5, timeout=10, **kwargs):
+retry_c = 0
+while retry_c < retrys:
+time.sleep(5)
+try:
+import pdb; pdb.set_trace()
+get_resp = self.session.get(url, headers=self.headers, timeout=timeout, **kwargs)
+if get_resp.status_code == 403:
+get_resp = self.appeal(url)
+return get_resp
+except Exception as e:
+retry_c += 1
+print(e)
+print('Failed to get page %s after %d retries, %s'
+% (url, retrys, datetime.now()))
+return None
 def update_page_count(self, answer_count):
 count = int(answer_count / 20)
 temp = int(answer_count % 20)
@@ -81,28 +125,6 @@ class Spider(object):
 count += 1
 self.page_count = count
-def get_serach_page_cookies(self):
-'''
-Refresh the cookies.
-'''
-headers = {
-"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-"accept-encoding": "gzip, deflate, br",
-"accept-language": "zh-CN,zh;q=0.9",
-"cache-control": "max-age=0",
-"cookie": 'SESSIONID=UIZ9mtCMPNttU11zx8a9e5eJcTm92PhBGKiz9oqWgDr; JOID=UlsQAkk86vULooRFHj177i1UY9UKHM_SK4OkYDkdytAsgqVlOzFzH1WjgUcZ-R9yCKLnnTcSKj5UlS_DhJu9iUI=; osd=VFoXB0462_IOpYJEGTh86CxTZtIMHcjXLIWlZzwazNErh6JjOjZ2GFOihkIe_x51DaXhnDAXLThVkirEgpq6jEU=; SESSIONID=rsVkcWbq9ESuP7O4fOw4qdMJdkNGnCFu59zCNAAkoIL; JOID=VV4TCkoAD-uttc-DPQ6Y9IZDJxUtIizHhpDtoBElLciBnuqhHkmjAfyyzow6twj5biJFaHi7j_WoTqKkbWlN0QI=; osd=UFkVCk4FCO2tscqEOw6c8YFFJxEoJSrHgpXqphEhKM-Hnu6kGU-jBfm1yIw-sg__biZAb367i_CvSKKgaG5L0QY=; _xsrf=vTWamiEoaOQszAl6fjdlxOtqyhDvOen9; d_c0="AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"; q_c1=3e9e37a46b1d4bfd87f7d1fcb084daad|1545899267000|1545899267000; _ga=GA1.2.929033900.1582626815; capsion_ticket="2|1:0|10:1608602928|14:capsion_ticket|44:MmRhNDdmYWJhZjU3NGQ4ODg3NDAzNGIwNDNiMTdlNDE=|7924fa0d0e36d3ed2a4af65dafa4684c9b36a70d586ec3adb1963c8df5f55e81"; _zap=6fd2d768-daa1-4be1-9a96-43d86c1bbc75; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1609325059,1609337218,1609401296,1609405637; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1609410807; KLBRSID=0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296',
-"referer": self.spider_url,
-"sec-fetch-dest": "document",
-"sec-fetch-mode": "navigate",
-"sec-fetch-site": "same-origin",
-"sec-fetch-user": "?1",
-"upgrade-insecure-requests": "1",
-"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-}
-requests_res = retry_get_url(self.spider_url, headers=headers)
-return requests_res.cookies.get_dict()
 def check_data_exist(self, data_dict, mark):
 '''
 Check whether the record already exists before inserting.
@@ -124,7 +146,7 @@ class Spider(object):
 def parse_sigle_page(self, data_dict, mark):
 '''
-Insert the main content data and the image URLs, and look for comments.
+Insert the main content data and the image URLs.
 '''
 if not self.check_data_exist(data_dict, mark):
@@ -141,8 +163,6 @@ class Spider(object):
 print(data_dict["question"]["title"])
 self.cur.execute(into, values)
 self.conn.commit()
-return
 def search_page(self, mark, page_max, start_page=0, need_commend=False):
 '''
@@ -167,8 +187,21 @@ class Spider(object):
 time.sleep(10)
 self.conn.close()
-return
+def get_page_data(self, url):
+get_page = self.retry_get_url(url)
+if get_page.status_code != 200:
+# retry once
+get_page = self.retry_get_url(url)
+if get_page.status_code != 200:
+print("article_error, url : ", url, " status_code: ", get_page.status_code)
+try:
+page_dict = get_page.json()
+print('get page json data success! {}', offset)
+except:
+print('retry get page data : {}', offset)
+self.get_page_data(url)
 def search_answer_article_page(self, offset, mark, proxies_num=0):
 '''
 Request the answer and article data pages.
@@ -178,19 +211,8 @@ class Spider(object):
 url = self.ANSWER_URL.format(offset)
 elif mark == 1:
 url = ARTICLE_URL.format(offset)
-[headers_search, cookies_dict] = self.headers_handle(url)
-proxies = None
+page_dict = self.get_page_data(url)
-if self.use_proxy:
-proxies = get_proxy()
-get_page = retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=proxies)
-if get_page.status_code != 200:
-# retry once
-time.sleep(3)
-get_page = retry_get_url(url, proxies=proxies)
-if get_page.status_code != 200:
-print("article_error, url : ", url, " status_code: ", get_page.status_code)
-page_dict = get_page.json()
 if page_dict.get("data"):
 print(self.page_count)
 if self.page_count == 1000:
@@ -201,54 +223,15 @@ class Spider(object):
 self.parse_sigle_page(one_line, mark)
 print("finshed_crawler " + offset)
 except KeyError:
-# It's totally ok to drop the last return data value.
-# The search api just return something seems related to search
+print('page data error')
 continue
 else:
 print("article_data_error, offset: ", offset, " url: ", url)
 self.use_proxy = True
-time.sleep(3)
 self.search_answer_article_page(offset=offset, mark=mark)
 return
-def headers_handle(self, url):
-'''
-Disguise the request headers for the URL.
-'''
-res_cookies_dict = self.get_serach_page_cookies()
-referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
-headers_search = {
-"accept": "*/*",
-"accept-encoding": "gzip, deflate",
-"accept-language": "zh-CN,zh;q=0.9",
-"sec-fetch-dest": "empty",
-"sec-fetch-mode": "cors",
-"sec-fetch-site": "same-origin",
-"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-"x-ab-param": "li_yxzl_new_style_a=1;se_major=0;se_adsrank=4;se_hi_trunc=0;se_v053=1;li_panswer_topic=0;zr_test_aa1=1;pf_noti_entry_num=2;zr_search_sim2=2;zr_slotpaidexp=2;se_zp_boost=0;tp_club_entrance=1;pf_profile2_tab=0;ug_newtag=1;li_answer_card=0;ls_recommend_test=5;qap_labeltype=1;zr_rec_answer_cp=open;se_sug_term=0;tp_topic_tab=0;ge_ge01=5;se_wil_act=0;se_videobox=0;tsp_ad_cardredesign=0;qap_question_visitor= 0;zr_slot_training=2;tp_clubhyb=0;li_ebook_gen_search=2;se_v_v005=0;zw_sameq_sorce=999;ge_ge02=6;se_mobilecard=0;se_auth_src=0;tp_header_style=1;tp_flow_ctr=0;pf_creator_card=1;li_viptab_name=0;zr_intervene=0;se_bert128=1;se_ffzx_jushen1=0;top_v_album=1;se_preset=0;tp_discover=1;ls_fmp4=0;tp_club_top=0;top_universalebook=1;li_svip_cardshow=1;li_paid_answer_exp=0;tp_topic_style=0;zr_art_rec=base;se_colorfultab=1;se_auth_src2=0;tp_club_qa_entrance=1;tp_club__entrance2=1;tsp_hotlist_ui=3;li_svip_tab_search=1;se_entity22=1;tp_meta_card=0;tp_topic_tab_new=0-0-0;tp_zrec=0;top_ebook=0;pf_adjust=1;qap_question_author=0;zr_topic_rpc=0;se_topicfeed=0;tp_club_feed=0;tsp_ioscard2=0;zr_rel_search=base;se_recommend=1;se_usercard=0;tp_club_fdv4=0;tp_m_intro_re_topic=1;pf_foltopic_usernum=0;li_vip_verti_search=0;zr_training_boost=false;se_v054=0;tp_contents=1;soc_feed_intelligent=3;tsp_ios_cardredesign=0;pf_fuceng=1;pf_newguide_vertical=0;ug_follow_topic_1=2;ls_video_commercial=0;li_car_meta=1;se_sug_dnn=0;tp_fenqu_wei=0;li_catalog_card=1;top_quality=0;se_click_v_v=1;se_aa_base=1;se_club_ui=0;se_return_1=0;soc_notification=1;zr_ans_rec=gbrank;zr_search_paid=1;zr_expslotpaid=3;zr_rerank=0;se_college=default;se_whitelist=1;top_root=0;li_yxxq_aut=A1;tsp_adcard2=0;ls_videoad=2;se_col_boost=1;li_edu_page=old;zr_training_first=false;se_t2sug=1;se_vbert3=0;se_merge=0;li_video_section=1;zr_km_answer=open_cvr;zr_sim3=0;se_v_v006=0;tp_dingyue_video=0;li_topics_search=0;se_searchwiki=0;se_guess=0;se_major_v2=0;tp_club_bt=0;tp_sft=a;top_test_4_liguangyi=1",
-"x-api-version": "3.0.91",
-"x-app-za": "OS=Web",
-"x-requested-with": "fetch",
-"x-zse-83": "3_2.0",
-"x-zse-86": None,
-"referer": referer + "/answers?page=1",
-}
-cookies_dict = {
-"d_c0": '"AADj9V7Fuw6PTkTnxjUXHGA8UqXz_bkbAN4=|1545899265"',
-"KLBRSID": '0a401b23e8a71b70de2f4b37f5b4e379|1609410806|1609401296'
-}
-cookies_dict.update(res_cookies_dict)
-f = "+".join(
-["3_2.0", url.replace("https://www.zhihu.com", ""), headers_search["referer"], cookies_dict["d_c0"]])
-fmd5 = hashlib.new('md5', f.encode()).hexdigest()
-headers_search["x-zse-86"] = "1.0_" + self.exec_js.call("b", fmd5)
-return headers_search, cookies_dict
 if __name__ == '__main__':
 '''
...