Commit 3bd3a181 authored by zhangmeng

Tidy up the Zhihu crawler code

parent dc9ebe26
@@ -21,3 +21,9 @@ python /srv/apps/crawler/crawler_sys/framework/search_page_single_process.py
1. Pull the litao branch from GitHub onto the spider-prod-001 server.
2. Configure the environment: activate the env -> python -> import sys -> sys.path -> go into the site-packages directory listed there -> vim mypath.pth -> set it to your own project path, so runs no longer fail with "crawler package not found" (see the sketch after this list).
3. To export Xiaohongshu data to txt, use the script in the temfile directory under maintenance, replacing the email and password with your own.
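
A minimal sketch of the .pth trick from step 2 (paths here are assumptions; adjust to your checkout). A one-line mypath.pth inside site-packages puts the project root on sys.path, so `import crawler` resolves without installing the package:

```python
# Hypothetical helper: write mypath.pth into the active env's site-packages.
import pathlib
import site

PROJECT_ROOT = '/srv/apps/crawler'  # assumption: path of your litao-branch clone

site_packages = pathlib.Path(site.getsitepackages()[0])
(site_packages / 'mypath.pth').write_text(PROJECT_ROOT + '\n')
print('wrote', site_packages / 'mypath.pth')
```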
## Zhihu crawling
1. Deploy the project on spider-001-prod.
2. Crawl a specific profile: python tasks/zhihu/spider.py 0 119 58 'https://www.zhihu.com/people/zhaotianqiang'
3. Clean the images and store them: python tasks/zhihu/upload_picture.py 0 0 84297 0
4. In the mimas project, run python django_manage.py qa_insert_by_spider level=0 offset=0 count=322784 to import the data into your own database.
\ No newline at end of file
# coding=utf-8
import os
import sys
from datetime import datetime

import execjs
import pymysql
from bs4 import BeautifulSoup
from pymysql.converters import escape_string  # moved here in PyMySQL >= 1.0; was `from pymysql import escape_string`

HOST = '172.18.51.14'
PORT = 3306
USER = 'spider'
PASSWD = 'Gengmei123'
DB = 'spider'
JS_FILE_PATH = '/srv/apps/crawler/crawler_sys/site_crawler/zhihu.js'

class RefreshContent(object):
    def __init__(self, is_online=0):
        '''
        Set up the database connection and compile the JS rules.
        '''
        self.update_error_content_id = []
        self.update_error_url_content_id = {}
        self.conn = pymysql.connect(host=HOST, port=PORT, user=USER,
                                    passwd=PASSWD,
                                    db=DB, charset='utf8')
        self.cur = self.conn.cursor()
        os.environ["EXECJS_RUNTIME"] = 'Node'
        # Prefer a zhihu.js next to the script; fall back to the deployed path.
        try:
            with open('./zhihu.js', 'r', encoding='utf-8') as f:
                js = f.read()
        except (IOError, OSError):
            with open(JS_FILE_PATH, 'r', encoding='utf-8') as f:
                js = f.read()
        self.exec_js = execjs.compile(js, cwd='/home/gmuser/node_modules')
        self.is_online = is_online
    @staticmethod
    def replace_html_image_to_url(content):
        """Replace each <figure> wrapper with a bare <img> that keeps only the src."""
        rich_obj = BeautifulSoup(content, features="html.parser")
        for item in rich_obj.find_all("figure"):
            image_obj = item.find("img")
            if image_obj is None:  # figure without an image: drop it
                item.decompose()
                continue
            new_rich_obj = rich_obj.new_tag(name="img")
            new_rich_obj["src"] = image_obj.get("src", "")
            item.replace_with(new_rich_obj)
        return rich_obj.decode()
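    # Example (hypothetical markup):
    #   '<figure><img src="https://pic1.zhimg.com/abc.jpg"/><figcaption>x</figcaption></figure>'
    # comes back as:
    #   '<img src="https://pic1.zhimg.com/abc.jpg"/>'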
    def create_new_content(self, content_id, content, pic_dict):
        """Swap every image src for its re-hosted URL from pic_dict; flag misses."""
        content = self.replace_html_image_to_url(content)
        rich_obj = BeautifulSoup(content, features="html.parser")
        update_error = False
        for item in rich_obj.find_all("img"):
            # Strip the 23-char host prefix ("https://picX.zhimg.com/") so the
            # key matches pic_dict, whose keys are trimmed the same way.
            url = item.get("src")[23:]
            new_url = pic_dict.get(url)
            if not new_url:
                # No replacement URL yet: remember the miss and keep the old src.
                if content_id not in self.update_error_content_id:
                    self.update_error_content_id.append(content_id)
                self.update_error_url_content_id[url] = content_id
                print({content_id: url})
                update_error = True
                continue
            item['src'] = new_url + '-w'
        new_content = rich_obj.decode()
        return escape_string(new_content), update_error
    def get_all_content_ids(self, table, pic_table, key_id, offset=0, count=10):
        """Return the ids to process: all distinct ids from the picture table,
        or a limit/offset window over the content table."""
        if offset == 0:
            sql = """select distinct {} from {}""".format(key_id, pic_table)
        else:
            # was "select answer_id {} from {}", which aliased the wrong column
            # for the article/thought tables; select the key column itself
            sql = """select {} from {} limit {}, {}""".format(key_id, table, offset, count)
        print(sql)
        self.cur.execute(sql)
        res = self.cur.fetchall()
        self.conn.commit()
        if res:
            return [item[0] for item in res]
        return None
    def refresh_content(self, table, pic_table, key_id, offset=0, count=10):
        '''
        Replace the picture urls and write the updated content back.
        '''
        content_ids = self.get_all_content_ids(table, pic_table, key_id, offset, count) or []
        for content_id in content_ids:
            print('start deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
            sql = """select content from {} where {} = {} and is_new = 0 and is_online = {}""".format(
                table, key_id, content_id, self.is_online)
            print(sql)
            self.cur.execute(sql)
            res = self.cur.fetchall()
            self.conn.commit()
            if not res:
                continue
            content = res[0][0]
            # Map trimmed old urls (host prefix stripped) to their re-hosted urls.
            sql = """select url, new_url from {} where {} = {} and new_url is not null""".format(
                pic_table, key_id, content_id)
            self.cur.execute(sql)
            res = self.cur.fetchall()
            self.conn.commit()
            pic_dict = {item[0][23:]: item[1] for item in res}
            new_content, update_error = self.create_new_content(content_id, content, pic_dict)
            # is_new = 1 only when every image was replaced successfully.
            update_code = 1 if not update_error else 0
            sql = """update {} set new_content = '{}', is_new = {} WHERE {} = '{}' """.format(
                table, new_content, update_code, key_id, content_id)
            self.cur.execute(sql)
            self.conn.commit()
            print('end deal table: {}, content_id: {},'.format(table, content_id), datetime.now())
if __name__ == '__main__':
    """ python script_file mark offset count is_online """
    print('number of arguments:', len(sys.argv))
    print('argument list:', sys.argv[1:])
    mark = int(sys.argv[1])
    offset = int(sys.argv[2])
    count = int(sys.argv[3])
    # is_online is optional and defaults to 0
    is_online = int(sys.argv[4]) if len(sys.argv) > 4 else 0
    print(datetime.now())
    refresh = RefreshContent(is_online=is_online)
    if mark == 0:
        refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url', 'answer_id', offset, count)
    elif mark == 1:
        refresh.refresh_content('zhihu_article', 'zhihu_article_picture_url', 'article_id', offset, count)
    elif mark == 2:
        refresh.refresh_content('zhihu_thought', 'zhihu_thought_picture_url', 'thought_id', offset, count)
    print('update_error_url_content_ids : ', refresh.update_error_url_content_id)
    print('update_error_content_ids : ', refresh.update_error_content_id)
    print(datetime.now())
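
A minimal programmatic sketch of the same flow, assuming this module is importable and the zhihu_answer / zhihu_answer_picture_url tables from the README exist; it mirrors what `mark = 0` does above:

```python
# Hypothetical driver; equivalent to running the script as:
#   python <this_file> 0 0 100 0
refresh = RefreshContent(is_online=0)
refresh.refresh_content('zhihu_answer', 'zhihu_answer_picture_url',
                        'answer_id', offset=0, count=100)
```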
import os
import time
from datetime import datetime

if __name__ == '__main__':
    plans = [
        #"python tasks/zhihu/spider_self.py 0 169 0 'https://www.zhihu.com/people/bai-fu-mei-yan-jiu-zhong-xin' 1",
        "python tasks/zhihu/spider_self.py 0 169 66 'https://www.zhihu.com/people/geng-mei-suo-chang' 1",
    ]
    for plan in plans:
        print('start plan ', plan, ' at ', datetime.now())
        os.system(plan)
        print('end plan ', plan, ' at ', datetime.now())
        time.sleep(10)
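
This loop ignores the exit status that os.system returns, so a failed crawl is silently skipped; a sketch of the same loop using subprocess (same commands assumed) that aborts on the first non-zero exit:

```python
import subprocess
import time
from datetime import datetime

for plan in plans:  # `plans` as defined above
    print('start plan', plan, 'at', datetime.now())
    # check=True raises CalledProcessError when the spider exits non-zero.
    subprocess.run(plan, shell=True, check=True)
    print('end plan', plan, 'at', datetime.now())
    time.sleep(10)
```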