# -*- coding:UTF-8 -*-
# @Time  : 2020/8/11 10:03
# @File  : trans_qiniu_img.py
# @email : litao@igengmei.com
# @author : litao
"""Re-host crawled post images and store the cleaned posts in MySQL.

The entry point is :func:`write_data_into_mysql`.  It downloads every image
referenced by a crawled post, re-uploads it through ``gm_upload.upload``
(Qiniu, judging by the file and variable names), rewrites the post HTML to
point at the new URLs, sanitises the HTML and finally inserts the post and
its images into the ``api_tractate`` / ``api_tractate_images`` tables.
"""
import copy
import datetime
import os
import random
import re
from typing import Dict, List, Optional

# import HTMLParser
import pymysql
from lxml import html
from lxml.html.clean import Cleaner

from crawler.crawler_sys.utils.output_results import retry_get_url
from crawler.gm_upload.gm_upload import upload, upload_file
from maintenance.func_send_email_with_file import send_file_email


class TRACTATE_PLATFORM():
    """Publishing source of a new tractate (post).

    Every member is a ``(value, label)`` tuple; the labels are runtime data
    and are intentionally left untranslated.
    """
    GM = ("1", u"更美")
    HERA = ("2", u"HERA后台")
    DOCTOR = ("3", u"医生端")
    XIAOHONGSHU = ("4", u"小红书")
    WEIBO = ("5", u"微博")
    SOYOUNG = ("6", u"新氧")
    MARK = ("7", u"站内打卡活动")
    VARIETY_SHOW_YOUNG = ("8", "选秀节目(少年之名)打榜活动")
    GROUP_DETAIL = ("9", "普通小组")
    GROUP_TOPIC_DETAIL = ("10", "普通小组话题")
    STRATEGY_WEIBO_HOTSPOT = ("11", "策略微博热点")
    STRATEGY_DOUBAN_HOTSPOT = ("12", "策略豆瓣鹅组热点")
    STRATEGY_TOUTIAO = ("13", "策略头条文章")
    STRATEGY_ZHIHU = ("14", "策略知乎文章")
    STRATEGY_XIAOHONGSHU = ("15", "策略小红书文章")
    STRATEGY_SOYOUNG = ("16", "策略新氧文章")
    STRATEGY_WEIBO = ("17", "策略微博文章")


# Image format name (upper case) -> numeric image type stored in the database.
img_type = {
    "OTHER": 1,  # any other image
    "GIF": 2,  # animated GIF
    "JPG": 3,  # JPG image
    "JPEG": 4,  # JPEG image
    "PNG": 5,  # PNG image
    "BMP": 6,  # BMP bitmap
    "WEBP": 7,  # WEBP image
    "TIFF": 8,  # TIFF image
}

# HTML tags that survive sanitising in gm_convert_html_tags.
WHITE_TAGS = {
    # tentatively used for the mini program and for crawled data
    "basic": ["div", "p", "span", "img", "br", "video", 'a'],
    # every whitelisted tag that may be displayed
    "all": [
        "div", "p", "span", "img", "br", "video", "audio", "a", "b", "strong",
        "i", "ul", "ol", "li", "em", "h1", "h2", "h3", "h4", "h5", "h6",
        "iframe",
    ],
}

# Crawler platform name -> value written to api_tractate.platform.  The
# numbers are the ones the original if/elif chain used and line up with the
# STRATEGY_* members of TRACTATE_PLATFORM (e.g. "xinyang" -> STRATEGY_SOYOUNG).
_PLATFORM_IDS = {
    "weibo": 17,
    "douban": 12,
    "zhihu": 14,
    "toutiao": 13,
    "xiaohongshu": 15,
    "xinyang": 16,
}

# Parameterized statements: values are passed separately to cursor.execute so
# quotes inside crawled content can neither break nor inject into the SQL.
_INSERT_TRACTATE_SQL = (
    "insert into api_tractate "
    "(user_id,content,is_online,status,platform,content_level,is_excellent,"
    "create_time,last_modified,user_del,low_quality,low_quality_deal,"
    "platform_id,pgc_type,title) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)
_INSERT_IMAGE_SQL = (
    "insert into api_tractate_images "
    "(tractate_id,image_url,width,height,image_url_source,image_type,"
    "create_time,update_time) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s)"
)
_INSERT_WEBP_IMAGE_SQL = (
    "insert into api_tractate_images "
    "(tractate_id,image_url,width,height,image_webp,image_url_source,"
    "image_type,create_time,update_time) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def gm_convert_html_tags(rich_text, all_tags=False, remove_tags=None):
    """Sanitise rich text and strip everything that is not whitelisted.

    :param rich_text: HTML rich text
    :param all_tags: use the full ``WHITE_TAGS["all"]`` whitelist instead of
        ``WHITE_TAGS["basic"]``
    :param remove_tags: whitelisted tags that must be stripped anyway
    :return: the cleaned HTML, or ``""`` for empty / whitespace-only input
    """
    # Whitespace-only input would make the HTML parser fail on an empty document.
    if not rich_text or not rich_text.strip():
        return ""

    # Tag cleaning: build the Cleaner arguments.
    tags = WHITE_TAGS["all"] if all_tags else WHITE_TAGS["basic"]
    if remove_tags:
        tags = [tag for tag in tags if tag not in remove_tags]

    kw = {
        "remove_unknown_tags": False,
        "allow_tags": tags,
        "safe_attrs": ["src"],
    }
    # NOTE(review): "a" is in both whitelists, so the ``elif`` branch only
    # runs when the caller removes "a" -- confirm ``elif`` was not meant to be
    # a plain ``if``.  Behaviour is left unchanged.
    if "a" in tags:
        kw["safe_attrs"].append("href")
    elif all_tags:
        kw["safe_attrs"].extend(["class", "style"])

    if "iframe" in kw["allow_tags"]:
        kw["embedded"] = False

    rich_text = Cleaner(**kw).clean_html(rich_text)

    # Add presentation attributes.
    element_obj = html.fromstring(rich_text)
    for element in element_obj.xpath(u"//img|//video"):
        if not all_tags:
            # mini program, normal users, crawled data: stretch media to 100%
            element.attrib["width"] = "100%"
        if element.tag == "video" and all_tags:
            element.attrib["class"] = "js_richtext_video"

    # Remove <a> elements whose href does not start with "gengmei://".
    for item in element_obj.xpath("//a[not(starts-with(@href, 'gengmei://'))]"):
        if item.getparent() is None:
            # The whole document is one disallowed link: nothing is left.
            return ""
        # drop_tree() keeps the text that FOLLOWS the link (its tail);
        # parent.remove(item) would silently delete that text as well.
        item.drop_tree()

    # Style the remaining links.
    for item in element_obj.xpath("//a"):
        item.attrib["style"] = 'color:#3FB5AF'  # link colour

    return html.tostring(element_obj, encoding="unicode")


def push_data_to_user(res_data: Dict) -> Dict:
    """Re-host the images of a crawled post and clean its HTML content.

    ``res_data`` is modified in place and returned: image URLs inside
    ``content`` are replaced by the uploaded ones, ``qiniu_img_list`` is set
    to a list of ``(uploaded_url, image_info_json)`` tuples, ``content`` is
    passed through :func:`gm_convert_html_tags` and ``level`` defaults to 1.

    :param res_data: crawled post; reads ``img_list``, ``content``, ``level``
    :return: the updated ``res_data``, or ``{}`` if any image could not be
        downloaded / uploaded
    """
    qiniu_img_dict = {}
    for img_url in res_data.get("img_list") or []:
        # Best effort by design: any failure while fetching or uploading an
        # image abandons the whole post, so the broad except is deliberate.
        try:
            if not img_url.startswith("http"):
                # Not a fetchable URL: just drop the reference from the content.
                res_data["content"] = res_data["content"].replace(img_url, "")
                continue
            img_wb = retry_get_url(img_url).content
            res = upload(img_wb)
            print(res)
            img_info_json = retry_get_url(res + "-imageinfo").json()
            qiniu_img_dict[img_url] = (res + "-w", img_info_json)
        except Exception as e:
            print("down load img error %s" % e)
            return {}

    # Replace the original image URLs with the uploaded ones.
    content = res_data.get("content")
    if content:
        for img_url, (new_url, _info) in qiniu_img_dict.items():
            content = content.replace(img_url, new_url)
        res_data["qiniu_img_list"] = list(qiniu_img_dict.values())
        res_data["content"] = content

    # Normalise the HTML (a missing "content" key yields "").
    res_data["content"] = gm_convert_html_tags(res_data.get("content"), all_tags=True)
    if not res_data.get("level"):
        res_data["level"] = 1
    return res_data


def _db_settings() -> Dict:
    """Return the pymysql connection arguments.

    Every value can be overridden through a ``MIMAS_DB_*`` environment
    variable; the fallbacks are the values that used to be hard-coded.
    """
    # NOTE(review): credentials do not belong in source control.  Rotate this
    # password and drop the fallbacks once the environment variables are set
    # wherever this script is deployed.
    return {
        "host": os.environ.get("MIMAS_DB_HOST", '172.16.30.138'),
        "port": int(os.environ.get("MIMAS_DB_PORT", 3306)),
        "user": os.environ.get("MIMAS_DB_USER", 'mimas'),
        "passwd": os.environ.get("MIMAS_DB_PASSWORD", 'GJL3UJe1Ck9ggL6aKnZCq4cRvM'),
        "db": os.environ.get("MIMAS_DB_NAME", 'mimas_prod'),
    }


def _insert_tractate(conn, cur, data: Dict, user_id_list: List,
                     platform_value: int, now_str: str) -> int:
    """Insert the post row and return its id, or 0 when the insert failed."""
    try:
        params = (
            random.choice(user_id_list),  # user_id
            data["content"],
            0,  # is_online
            2,  # status
            platform_value,
            data["level"],  # content_level
            0,  # is_excellent
            now_str,  # create_time
            now_str,  # last_modified
            0,  # user_del
            0,  # low_quality
            0,  # low_quality_deal
            data["doc_id"],  # platform_id
            0,  # pgc_type
            data["title"],
        )
        res = cur.execute(_INSERT_TRACTATE_SQL, params)
        tractate_id = int(conn.insert_id())
        if res:
            conn.commit()
        return tractate_id
    except Exception as e:
        print("commit error %s" % e)
        print(data)
        conn.rollback()
        return 0


def _insert_image(conn, cur, tractate_id: int, img_info, content: str,
                  now_str: str) -> None:
    """Insert one ``(url, image_info)`` pair for ``tractate_id`` (best effort)."""
    image_url = img_info[0]
    # NOTE(review): presumably 2 = image embedded in the content and
    # 3 = image only attached to the post -- confirm against the schema enum.
    image_url_source = 2 if image_url in content else 3

    meta = img_info[1]
    try:
        # Unknown formats fall back to OTHER instead of writing NULL/None.
        image_type = img_type.get(meta["format"].upper(), img_type["OTHER"])
    except (KeyError, TypeError, AttributeError):
        image_type = img_type["OTHER"]
    try:
        width, height = meta["width"], meta["height"]
    except (KeyError, TypeError):
        width = height = 0

    try:
        if image_type == img_type["WEBP"]:
            # NOTE(review): assumes api_tractate_images has an image_webp
            # column, as the original (never executed) statement implied.
            sql_query = _INSERT_WEBP_IMAGE_SQL
            params = (tractate_id, image_url, width, height, image_url,
                      image_url_source, image_type, now_str, now_str)
        else:
            sql_query = _INSERT_IMAGE_SQL
            params = (tractate_id, image_url, width, height,
                      image_url_source, image_type, now_str, now_str)
        if cur.execute(sql_query, params):
            conn.commit()
    except Exception as e:
        print("commit error %s" % e)
        conn.rollback()


def write_data_into_mysql(res_data: Dict, user_id_list: List) -> Optional[int]:
    """Store a crawled post and its images in MySQL.

    The post is first prepared by :func:`push_data_to_user` (before the
    database connection is opened, so no connection is held during the image
    downloads).  Nothing is written when the prepared post has no content, no
    uploaded images or an unknown ``platform``; image rows are only written
    after the post row was inserted successfully.

    :param res_data: crawled post; reads ``platform``, ``doc_id``, ``title``
        plus everything :func:`push_data_to_user` needs
    :param user_id_list: candidate author ids, one is picked at random
    :return: id of the new ``api_tractate`` row, or ``None`` if nothing was
        inserted
    """
    # Clean the data into a storable format.
    data = push_data_to_user(res_data)
    if not data.get("content") or not data.get("qiniu_img_list"):
        return None

    # Resolve the platform id.
    platform_value = _PLATFORM_IDS.get(data.get("platform"))
    if platform_value is None:
        print("unknown platform %s" % data.get("platform"))
        return None

    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conn = pymysql.connect(charset='utf8mb4', **_db_settings())
    try:
        cur = conn.cursor()
        try:
            tractate_id = _insert_tractate(
                conn, cur, data, user_id_list, platform_value, now_str)
            if not tractate_id:
                # Without a post row the images would be orphaned (id 0).
                return None
            for img_info in data["qiniu_img_list"]:
                _insert_image(conn, cur, tractate_id, img_info,
                              data["content"], now_str)
        finally:
            cur.close()
    finally:
        conn.close()
    return tractate_id