Commit 9bde750f authored by haowang

modify zhihu spider

parent 7c2c462d
@@ -24,6 +24,7 @@ class RefreshContent(object):
Initialize the database and adjust the JS rules
'''
self.update_error_content_id = []
self.update_error_url_content_id = {}
self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
passwd=PASSWD,
@@ -53,49 +54,75 @@ class RefreshContent(object):
def create_new_content(self, content_id, content, pic_dict):
content = self.replace_html_image_to_url(content)
rich_obj = BeautifulSoup(content, features="html.parser")
update_error = False
for item in rich_obj.find_all("img"):
url = item.get("src")
url = item.get("src")[23:]
new_url = pic_dict.get(url)
if not new_url:
self.update_error_content_id.append({content_id: url})
if content_id not in self.update_error_content_id:
self.update_error_content_id.append(content_id)
self.update_error_url_content_id[url] = content_id
print({content_id: url})
update_error = True
continue
item['src'] = new_url + '-w'
return rich_obj.decode()
new_content = rich_obj.decode()
return escape_string(new_content), update_error
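# A minimal standalone sketch of the image-src rewrite that create_new_content
# performs above, assuming Zhihu image URLs carry a 23-character host prefix
# such as 'https://pic1.zhimg.com/' (which is what the url[23:] slice strips).
# The sample HTML and replacement URL below are hypothetical.
from bs4 import BeautifulSoup
sample_html = '<p><img src="https://pic1.zhimg.com/v2-abc.jpg"></p>'
sample_pic_dict = {'v2-abc.jpg': 'https://cdn.example.com/v2-abc.jpg'}
soup = BeautifulSoup(sample_html, features='html.parser')
for img in soup.find_all('img'):
    key = img.get('src')[23:]  # drop scheme + host, keep the file name
    if key in sample_pic_dict:
        img['src'] = sample_pic_dict[key] + '-w'  # same '-w' suffix as above
print(soup.decode())  # -> <p><img src="https://cdn.example.com/v2-abc.jpg-w"/></p>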
def get_all_content_ids(self, table, key_id):
sql = """select distinct {} from {}""".format(key_id, table)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
if res:
return [item[0] for item in res]
return None
def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
if offset == 0:
sql = """select distinct {} from {}""".format(key_id, pic_table)
print(sql)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
if res:
return [item[0] for item in res]
return None
else:
sql = """select answer_id {} from {} limit {}, {}""".format(key_id, table, offset, count)
print(sql)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
if res:
return [item[0] for item in res]
return None
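# A hypothetical batch driver for the paged branch above, a sketch only: a
# non-zero offset selects the limit/offset query, and iteration stops once a
# window comes back empty. It re-queries the ids per window for simplicity.
def refresh_in_batches(refresher, table, pic_table, key_id, start=10, count=10):
    offset = start
    while True:
        ids = refresher.get_all_content_ids(table, pic_table, key_id, offset, count)
        if not ids:
            break
        refresher.refresh_content(table, pic_table, key_id, offset, count)
        offset += count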
def refresh_content(self, table, pic_table, key_id):
def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
'''
Replace URLs and update the answer content
'''
content_ids = self.get_all_content_ids(pic_table, key_id)
content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count)
for content_id in content_ids:
print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(pic_table, key_id, content_id)
sql = """select content from {} where {} = {} and is_new = 0""".format(table, key_id, content_id)
print(sql)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
pic_dict = {item[0]: item[1] for item in res}
if not res:
continue
content = res[0][0]
sql = """select content from {} where {} = {}""".format(table, key_id, content_id)
sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(
pic_table, key_id, content_id)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
content = res[0][0]
pic_dict = {
item[0][23:]: item[1] for item in res}
new_content = self.create_new_content(content_id, content, pic_dict)
new_content, update_error = self.create_new_content(content_id, content, pic_dict)
update_code = 1 if not update_error else 0
sql = """update {} set new_content = '{}' WHERE {} = '{}' """.format(table, new_content, key_id, content_id)
sql = """update {} set new_content = '{}', is_new = {} WHERE {} = '{}' """.format(
table, new_content, update_code, key_id, content_id)
self.cur.execute(sql)
self.conn.commit()
print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
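# The UPDATE above splices new_content into the SQL text, which is why
# escape_string is required. A hedged alternative sketch (function name
# hypothetical) using pymysql parameter binding, which escapes values
# automatically; table and column names still have to be trusted, since
# placeholders only bind values:
def update_new_content(cur, conn, table, key_id, content_id, new_content, update_code):
    sql = "update {} set new_content = %s, is_new = %s where {} = %s".format(table, key_id)
    cur.execute(sql, (new_content, update_code, content_id))
    conn.commit()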
@@ -104,16 +131,19 @@ class RefreshContent(object):
if __name__ == '__main__':
""" python script_file mark """
print('Number of arguments:', len(sys.argv))
print('Argument list:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1])
print('Argument list:', type(sys.argv[0]), sys.argv[0], type(sys.argv[1]), sys.argv[1], type(sys.argv[2]), sys.argv[2], type(sys.argv[3]), sys.argv[3])
mark = int(sys.argv[1])
offset = int(sys.argv[2])
count = int(sys.argv[3])
print(datetime.now())
refresh = RefreshContent()
if mark == 0:
refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id')
refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
elif mark == 1:
refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id')
elif mark == 2:
refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id')
print(refresh.update_error_content_id)
print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
print('update_error_content_ids : ', refresh.update_error_content_id)
print(datetime.now())
# import rsa
# import os, sys
# base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("/Users/xuwei")
# sys.path.append("/Users/xuwei/crawler")
# sys.path.append("/Users/xuwei/crawler/crawler_sys")
import os, sys
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("/Users/haowei")
sys.path.append("/Users/haowei/workspace/gm/crawler")
sys.path.append("/Users/haowei/workspace/gm/crawler/crawler_sys")
import pymysql
import hashlib
@@ -12,7 +12,7 @@ import execjs
import os
import re
import sys
from crawler_sys.utils.output_results import retry_get_url, retry_get_url_no_proxies
import time
from datetime import datetime
HOST = '172.18.51.14'
@@ -20,21 +20,53 @@ PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
# JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'
JS_FILE_PATH = '/Users/haowei/workspace/gm/crawler/crawler_sys/site_crawler/zhihu.js'
# SPIDER_URL = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1'
# ANSWER_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
# ARTICLE_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={0}&limit=20&sort_by=created'
# THOUGHT_URL = 'https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/pins?offset={0}&limit=20&includes=data%5B*%5D.upvoted_followees%2Cadmin_closed_comment'
# ANSWER_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/answers/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
# ARTICLE_ROOT_COMMENT_URL = 'https://www.zhihu.com/api/v4/articles/{0}/root_comments?order=normal&limit=20&offset={1}&status=open'
# CHILD_COMMENT_START_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments'
# CHILD_COMMENT_OFFSET_URL = 'https://www.zhihu.com/api/v4/comments/{0}/child_comments?limit=20&offset={1}'
# THOUGHT_COMMENT_URL = 'https://www.zhihu.com/api/v4/pins/{0}/comments?order=normal&limit=20&offset={1}&status=open'
def retry_get_url_no_proxies(url, retrys=3, timeout=10, **kwargs):
retry_c = 0
while retry_c < retrys:
try:
get_resp = requests.get(url, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
return None
def retry_get_url(url, retrys=3, timeout=10, **kwargs):
retry_c = 0
while retry_c < retrys:
try:
get_resp = requests.get(url, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
return None
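# Note: the two retry helpers above are currently identical; the _no_proxies
# name seems kept for existing call sites. A hypothetical usage sketch
# (example.com is a placeholder URL):
resp = retry_get_url('https://example.com', retrys=3, timeout=10,
                     headers={'user-agent': 'Mozilla/5.0'})
if resp is not None:
    print(resp.status_code)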
class Spider(object):
def __init__(self):
def __init__(self, spider_url):
'''
Initialize the database and adjust the JS rules
'''
@@ -43,6 +75,9 @@ class Spider(object):
db=DB, charset='utf8')
self.cur = self.conn.cursor()
self.spider_url = spider_url
self.ANSWER_URL = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members") + '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cexcerpt%2Cis_labeled%2Clabel_info%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics%3Bdata%5B*%5D.question.has_publishing_draft%2Crelationship&offset={0}&limit=20&sort_by=created'
os.environ["EXECJS_RUNTIME"] = 'Node'
try:
with open('./zhihu.js', 'r', encoding='utf-8') as f:
@@ -57,14 +92,13 @@ class Spider(object):
Refresh cookies
'''
url = "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1"
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": '_SESSIONID=rvJz2mbRjCqJFwvD79ADGb8gpdpRkWXAsdVDqOVALgh; JOID=UVkSBEtHLEUgV_KGUUMd3fULiLJHZglmBHfTo3JnDWADc9KndNqP8XtW9oBWoLo5jsAnAcPG0-JwbPp8rD1JsFI=; osd=W1gdB05NLUojUviHXkAY1_QEi7dNZwZlAX3SrHFiB2EMcNetddWM9HFX-YNTqrs2jcUtAMzF1uhxY_l5pjxGs1c=; _zap=cc1330a6-d6e9-4b25-8232-0b0481e37ea2; _xsrf=4798d610-1db7-489f-8c14-d608c0bbcf08; d_c0="AOCcdS0CNBKPToRkxgVd-8qBKKgTDM9yF4Y=|1605507877"; capsion_ticket="2|1:0|10:1605684439|14:capsion_ticket|44:MzdiN2JlOTZlNWE2NGU5Zjg0MDNiODFlN2ViMjQzNGU=|b84d77e03b5e9447a3740a614d208ca5f3bc1e4fe15fe9b46f1a2a5702da5f99"; SESSIONID=nplCMucg2EXp8xNQAz74jmaLovUS9CG4rVtcFY4jWLT; JOID=V1sUBk8zRBeITtMgLDNzj10QrRI6WzZ_ygKgQU8AB038d6pof-COptdJ1yYpGa7oR9-1bgHql0Hfs4FvUF_YW-A=; osd=Ul4XBkI2QRSIQ9YlLzN-ilgTrR8_XjV_xwelQk8NAkj_d6dteuOOq9JM1CYkHKvrR9KwawLqmkTasIFiVVrbW-0=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1605608394,1605684435,1605684439,1605684455; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1605684522; KLBRSID=fe0fceb358d671fa6cc33898c8c48b48|1605685245|1605683234',
"referer": "https://www.zhihu.com/people/geng-mei-suo-chang/answers?page=1",
"referer": self.spider_url,
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
@@ -72,31 +106,55 @@ class Spider(object):
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36",
}
requests_res = retry_get_url(url, headers=headers)
requests_res = retry_get_url(self.spider_url, headers=headers)
return requests_res.cookies.get_dict()
def parse_sigle_page(self, data_dict, mark):
def check_data_exist(self, data_dict, mark):
'''
Insert the main content and picture URLs, then fetch comments
数据插入前检测
'''
sql = "select id from {table} where id = {id_}"
exist = None
if mark == 0:
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
data_dict["comment_count"], data_dict["content"])
elif mark == 1:
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (
data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
data_dict["content"])
self.cur.execute(into, values)
self.conn.commit()
offset = 0
if data_dict["comment_count"] != 0:
next = 1
while next == 1:
next = self.search_root_comment(data_dict["id"], offset, mark)
offset = offset + 20
select_sql = sql.format(table='zhihu_answer', key='answer_id', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if mark == 1:
select_sql = sql.format(table='zhihu_article', key='article_id', id_=data_dict["id"])
self.cur.execute(select_sql)
exist = self.cur.fetchone()
if exist:
return True
return False
def parse_sigle_page(self, data_dict, mark, need_comment=False):
'''
Insert the main content and picture URLs, then fetch comments
'''
if not self.check_data_exist(data_dict, mark):
if mark == 0:
into = "insert into zhihu_answer(title, content, answer_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (data_dict["question"]["title"], data_dict["content"], data_dict["id"], data_dict["created_time"],
data_dict["comment_count"], data_dict["content"])
elif mark == 1:
into = "insert into zhihu_article(title, content, article_id, created_time, comment_count, new_content) value(%s, %s, %s, %s, %s, %s)"
values = (
data_dict["title"], data_dict["content"], data_dict["id"], data_dict["created"], data_dict["comment_count"],
data_dict["content"])
print(data_dict["question"]["title"])
self.cur.execute(into, values)
self.conn.commit()
if need_comment:
offset = 0
if data_dict["comment_count"] != 0:
has_next = 1
while has_next == 1:
has_next = self.search_root_comment(data_dict["id"], offset, mark)
offset = offset + 20
# patt = r'%s(.+?)%s' % ("<noscript><img src=\"", "\" data-caption")
# pattern = re.compile(patt)
@@ -112,7 +170,7 @@ class Spider(object):
return
def search_page(self, mark, page_max, start_page=0):
def search_page(self, mark, page_max, start_page=0, need_comment=False):
'''
Main entry point
@@ -123,9 +181,9 @@ class Spider(object):
for i in range(page_max):
if mark == 0:
self.search_answer_article_page(offset, 0)
self.search_answer_article_page(offset, 0, 0, need_comment)
elif mark == 1:
self.search_answer_article_page(offset, 1)
self.search_answer_article_page(offset, 1, 0, need_comment)
elif mark == 2:
self.search_thought_page(offset)
@@ -134,13 +192,13 @@ class Spider(object):
self.conn.close()
return
def search_answer_article_page(self, offset, mark, proxies_num=0):
def search_answer_article_page(self, offset, mark, proxies_num=0, need_comment=False):
'''
Request the answer and article data payloads
'''
offset = str(offset)
if mark == 0:
url = ANSWER_URL.format(offset)
url = self.ANSWER_URL.format(offset)
elif mark == 1:
url = ARTICLE_URL.format(offset)
[headers_search, cookies_dict] = self.headers_handle(url)
@@ -156,8 +214,8 @@ class Spider(object):
for one_line in page_dict['data']:
try:
if one_line["content"] != None:
self.parse_sigle_page(one_line, mark)
print("finshed_article" + offset)
self.parse_sigle_page(one_line, mark, need_comment=need_comment)
print("finshed_crawler " + offset)
except KeyError:
# It's totally ok to drop the last return data value.
# The search api just return something seems related to search
@@ -279,6 +337,7 @@ class Spider(object):
Header spoofing for URL requests
'''
res_cookies_dict = self.get_serach_page_cookies()
referer = self.spider_url.replace("https://www.zhihu.com/people", "https://www.zhihu.com/api/v4/members")
headers_search = {
"accept": "*/*",
@@ -294,7 +353,7 @@ class Spider(object):
"x-requested-with": "fetch",
"x-zse-83": "3_2.0",
"x-zse-86": None,
"referer": "https://www.zhihu.com/api/v4/members/geng-mei-suo-chang/answers?page=1",
"referer": referer + "/answers?page=1",
}
cookies_dict = {
@@ -410,11 +469,18 @@ class Spider(object):
if __name__ == '__main__':
'''
python tasks/zhihu/spider.py 0 1 0 'https://www.zhihu.com/people/taoxi-1130'
python script_file_path mark (content type: 0 = answers, 1 = articles, 2 = thoughts) max_page (maximum page count) start_page (starting page, from 0) spider_url (Zhihu profile URL of the user to crawl)
'''
mark = int(sys.argv[1])
max_page = int(sys.argv[2])
start_page = int(sys.argv[3])
spider_url = sys.argv[4]
# spider_url = 'https://www.zhihu.com/people/geng-mei-suo-chang/answers'
print(datetime.now())
spider = Spider()
spider = Spider(spider_url=spider_url)
if mark == 0:
spider.search_page(mark, max_page, start_page)
elif mark == 1:
......
@@ -10,6 +10,7 @@ import execjs
from datetime import datetime
from image_qiniu import upload_file, IMG_TYPE
from bs4 import BeautifulSoup
# pip3 install "requests[security]" -i https://pypi.tuna.tsinghua.edu.cn/simple
# DATA_OS_PATH = '/data'
@@ -109,15 +110,16 @@ class UploadImage(object):
return headers_search, cookies_dict
@staticmethod
def retry_get_url(url, retrys=3, proxies=None, timeout=10, **kwargs):
def retry_get_url(url, retrys=5, proxies=None, timeout=10, **kwargs):
retry_c = 0
while retry_c < retrys:
try:
get_resp = requests.get(url, timeout=timeout, **kwargs)
requests.packages.urllib3.disable_warnings()
get_resp = requests.get(url, verify=False, timeout=timeout, **kwargs)
return get_resp
except Exception as e:
retry_c += 1
time.sleep(1)
time.sleep(2)
print(e)
print('Failed to get page %s after %d retries, %s'
% (url, retrys, datetime.now()))
@@ -142,9 +144,9 @@ class UploadImage(object):
Crop and download article images
'''
def _deal_image_by_path(file_path, old_url):
def _deal_image_by_path(res, file_path, old_url, i):
img = cv2.imread(file_path)
if img:
if img is not None:
high, width = img.shape[:2]
cropped = img[0:int(high / 10 * 9), 0:width]
pathes = new_path + "num" + str(i) + ".jpg"
@@ -152,7 +154,7 @@ class UploadImage(object):
new_url = self.upload_image_with_path(pathes)
sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
table, str(new_url), str(tuple[i][1]))
table, str(new_url), str(res[i][1]))
self.cur.execute(sql)
self.conn.commit()
else:
@@ -167,36 +169,44 @@ class UploadImage(object):
self.cur.execute(sql)
self.conn.commit()
def _download_picture():
sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
self.cur.execute(sql)
res = self.cur.fetchall()
self.conn.commit()
gif_patt = r'gif'
for i in range(len(res)):
mark = re.search(gif_patt, res[i][1])
url = res[i][1]
[headers_search, cookies_dict] = self.headers_handle(url)
r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
if not r:
continue
# try:
if mark:
pathes = path + str('num') + str(i) + '.gif'
with open(pathes, 'wb') as f:  # open the target path in binary write mode; handle is f
f.write(r.content)  # write the response body bytes to f
f.close()
new_url = self.upload_image_with_path(pathes)
sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
table, str(new_url), str(url))
self.cur.execute(sql)
self.conn.commit()
else:
pathes = path + str('num') + str(i) + '.jpg'
with open(pathes, 'wb') as f:  # open the target path in binary write mode; handle is f
f.write(r.content)  # write the response body bytes to f
f.close()
_deal_image_by_path(res, pathes, url, i)
# except Exception as e:
# print(e)
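# A standalone sketch of the crop step in _deal_image_by_path above (file
# names hypothetical): the slice keeps the top nine tenths of the image,
# i.e. it trims a strip off the bottom, presumably a watermark.
import cv2
img = cv2.imread('num0.jpg')  # returns None when the file cannot be read
if img is not None:
    high, width = img.shape[:2]
    cropped = img[0:int(high / 10 * 9), 0:width]  # keep the top 90%
    cv2.imwrite('num0_cropped.jpg', cropped)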
urls = self.find_all_url(content)
self.insert_picture_urls(table, urls, content_id, key_id)
sql = """select {}, url from {} where {} = {} and new_url is null""".format(key_id, table, key_id, content_id)
self.cur.execute(sql)
tuple = self.cur.fetchall()
self.conn.commit()
gif_patt = r'gif'
for i in range(len(tuple)):
mark = re.search(gif_patt, tuple[i][1])
url = tuple[i][1]
[headers_search, cookies_dict] = self.headers_handle(url)
r = self.retry_get_url(url, headers=headers_search, cookies=cookies_dict, proxies=0)
if mark:
pathes = path + str('num') + str(i) + '.gif'
with open(pathes, 'wb') as f:  # open the target path in binary write mode; handle is f
f.write(r.content)  # write the response body bytes to f
f.close()
new_url = self.upload_image_with_path(pathes)
sql = """UPDATE {} SET new_url = "{}" WHERE url = "{}" """.format(
table, str(new_url), str(url))
self.cur.execute(sql)
self.conn.commit()
else:
pathes = path + str('num') + str(i) + '.jpg'
with open(pathes, 'wb') as f:  # open the target path in binary write mode; handle is f
f.write(r.content)  # write the response body bytes to f
f.close()
_deal_image_by_path(pathes, url)
_download_picture()
_download_picture()
def picture_process(self, path, new_path, table, pic_table, key_id, offset=0, count=10):
content_dict = self.gets_content_dict(table, key_id, offset, count)
......
CREATE TABLE `zhihu_answer` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`title` varchar(40) DEFAULT NULL COMMENT 'title',
`content` mediumtext COMMENT 'content',
`answer_id` int(11) DEFAULT NULL COMMENT 'id',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
`new_content` mediumtext COMMENT 'new content',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether already refreshed',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answers';
alter table `zhihu_answer` add column `is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether already refreshed';
CREATE TABLE `zhihu_answer_root_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
`author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
`content` mediumtext COMMENT 'content',
`answerid` int(11) DEFAULT NULL COMMENT 'answer id',
`child_comment_count` int(11) DEFAULT NULL COMMENT 'child comment count',
`featured` varchar(5) DEFAULT NULL COMMENT 'whether a featured comment',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answer root comments';
CREATE TABLE `zhihu_article` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`title` varchar(40) DEFAULT NULL COMMENT 'title',
`content` mediumtext COMMENT 'content',
`article_id` int(11) DEFAULT NULL COMMENT 'id',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
`new_content` mediumtext COMMENT 'new content',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether already refreshed',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu articles';
CREATE TABLE `zhihu_child_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
`author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
`content` mediumtext COMMENT 'content',
`reply_name` varchar(40) DEFAULT NULL COMMENT 'name of the user replied to',
`child_comment_id` int(11) DEFAULT NULL COMMENT 'child comment id',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu child comments';
CREATE TABLE `zhihu_article_root_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`root_comment_id` int(11) DEFAULT NULL COMMENT 'parent comment id',
`author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
`content` mediumtext COMMENT 'content',
`answerid` int(11) DEFAULT NULL COMMENT 'article id',
`child_comment_count` int(11) DEFAULT NULL COMMENT 'child comment count',
`featured` varchar(5) DEFAULT NULL COMMENT 'whether a featured comment',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu article root comments';
CREATE TABLE `zhihu_answer_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`answer_id` int(11) DEFAULT NULL COMMENT 'answer id',
`url` mediumtext COMMENT 'url',
`new_url` mediumtext COMMENT 'new url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu answer pictures';
CREATE TABLE `zhihu_article_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`article_id` int(11) DEFAULT NULL COMMENT 'article id',
`url` mediumtext COMMENT 'url',
`new_url` mediumtext COMMENT 'new url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu article pictures';
CREATE TABLE `zhihu_thought` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`thought_id` varchar(50) DEFAULT NULL COMMENT 'thought id',
`content` text COMMENT 'content',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`comment_count` int(11) DEFAULT NULL COMMENT 'comment count',
`new_content` mediumtext COMMENT 'new content',
`is_new` tinyint(1) NOT NULL DEFAULT 0 COMMENT 'whether already refreshed',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thoughts';
CREATE TABLE `zhihu_thought_comment` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`thought_comment_id` int(11) DEFAULT NULL COMMENT 'thought comment id',
`author_name` varchar(40) DEFAULT NULL COMMENT 'author name',
`content` text COMMENT 'content',
`answerid` varchar(50) DEFAULT NULL COMMENT 'thought id',
`created_time` int(11) DEFAULT NULL COMMENT 'creation time',
`author_id` varchar(50) DEFAULT NULL COMMENT 'author id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thought comments';
CREATE TABLE `zhihu_thought_picture_url` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
`thought_id` varchar(50) DEFAULT NULL COMMENT 'thought ID',
`url` text COMMENT 'url',
`new_url` text COMMENT 'new url',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Zhihu thought pictures';
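-- A hedged example of the bookkeeping the refresh script relies on (values
-- illustrative): is_new = 0 marks rows whose new_content still has to be
-- rebuilt, and the UPDATE in refresh_content flips it to 1 on success.
select answer_id from zhihu_answer where is_new = 0 limit 0, 10;
update zhihu_answer set new_content = '<p>...</p>', is_new = 1 where answer_id = 123456;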