# coding:utf8
"""Celery tasks that export diary data as sitemap XML files for search-engine sync.

Entry points:

* ``diary_all``           -- full export of every online diary with valid topics.
* ``diary_modify_record`` -- bookkeeping task recording modified diary ids.
* ``diary_increase``      -- incremental export: new/changed diaries, plus a
  "dec" (delete) sitemap for diaries that went offline.

Each generated XML file is uploaded via ``upload_file`` and the resulting
URLs are mailed to the maintainers through the internal email RPC.
"""
import datetime
import time
import xml.dom.minidom as Dom

from celery import shared_task
from django.db.models import Q
from gm_upload import upload_file, IMG_TYPE

from qa.cache import diary_record_cache
from qa.models import es_query
from qa.utils.image import get_full_path
from talos.models.diary import Diary, DiaryTag
from talos.models.diary.diary import DiaryRecord
from talos.models.topic import Problem, TopicImage
from talos.rpc import get_current_rpc_invoker
from utils.rpc import rpc_client, logging_exception

# Recipients of the data-sync notification mails.
MAINTAINER_EMAILS = ["songzhenqi@igengmei.com", "sunyinghe@igengmei.com"]


def _upload_sitemap_index(urls, path):
    """Build a <sitemapindex> document pointing at *urls*, write it to *path*,
    upload it and return the public URL of the uploaded file.

    :param urls: iterable of already-uploaded sitemap file URLs.
    :param path: local file path to write the index XML to.
    :return: public URL returned by ``upload_file``.
    """
    sitemap_doc = Dom.Document()
    index_node = sitemap_doc.createElement("sitemapindex")
    sitemap_doc.appendChild(index_node)
    lastmod = datetime.datetime.now().strftime("%Y-%m-%d")
    for url in urls:
        sitemap_node = sitemap_doc.createElement("sitemap")
        create_node(["loc", "lastmod"], {"loc": url, "lastmod": lastmod},
                    sitemap_doc, sitemap_node)
        index_node.appendChild(sitemap_node)
    with open(path, 'wb') as f:
        f.write(sitemap_doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
    return upload_file(path, IMG_TYPE.POST)


def diary_all(step, amount):
    """Full diary export: build sitemap files for all online diaries.

    :param step: number of diaries per generated XML file.
    :param amount: optional cap on the number of diaries (falsy = no cap).
    """
    # Restrict to diary books that have at least one valid (online) diary topic.
    diary_topics = Problem.objects.using('slave').filter(
        is_online=1, flag='n', topic_type__in=['0', '1', '2'],
        diary_id__isnull=False)
    diary_ids = list(diary_topics.values_list('diary_id', flat=True))
    if amount:
        diarys = Diary.objects.using('slave').filter(
            id__in=diary_ids, is_online=1).order_by('id')[:amount]
    else:
        diarys = Diary.objects.using("slave").filter(
            id__in=diary_ids, is_online=1).order_by('id')
    length = diarys.count()
    urls = create_xml(step, length, "full", diarys)
    sitemap_url = _upload_sitemap_index(urls, './sitemap.xml')
    rpc_client['internal_email/send'](
        to=MAINTAINER_EMAILS,
        subject="日记同步数据链接",
        text="sitemap:{}".format(sitemap_url)
    ).unwrap()


def create_node(keys, items, doc, parent_note):
    """Append one child element per key in *keys* to *parent_note*.

    A text node is attached only when the value is truthy or is a literal
    zero ('0' or 0); missing/empty values still produce an empty element.

    :param keys: element tag names to create, in order.
    :param items: mapping of tag name -> text value (may omit keys).
    :param doc: owner ``xml.dom.minidom.Document``.
    :param parent_note: element the new children are appended to.
    """
    for key in keys:
        child_note = doc.createElement(key)
        # Single lookup instead of the original's three .get() calls.
        val = items.get(key)
        if val or val in {'0', 0}:
            child_note.appendChild(doc.createTextNode(str(val)))
        parent_note.appendChild(child_note)


def get_diaries_score(diary_list):
    """Fetch recommend scores for the given diary ids from Elasticsearch.

    :param diary_list: list of diary ids to look up.
    :return: list of ``{"diary_id": id, "score": value}`` dicts ordered by
        recommend_score desc (then last_topic_add_time desc); diaries whose
        hit lacks ``recommend_score`` get a score of 0.
    """
    data = []
    if not diary_list:
        return data
    q = {
        "_source": {'include': ['id', 'recommend_score']},
        "query": {"bool": {"must": [{'terms': {'id': diary_list}}]}},
        "sort": [{'recommend_score': {'order': 'desc'}},
                 {'last_topic_add_time': {'order': 'desc'}}],
    }
    res = es_query('diary', q, 0, len(diary_list))
    if res and "hits" in res and res["hits"]["total"] > 0:
        for hit in res["hits"]["hits"]:
            source = hit['_source']
            data.append({"diary_id": source['id'],
                         "score": source.get('recommend_score', 0)})
    return data


@shared_task
def diary_modify_record(ids):
    """Record the given diary ids as modified (one DiaryRecord row per id).

    :param ids: list of diary ids.
    """
    for diary_id in ids:
        DiaryRecord.objects.update_or_create(diary_id=diary_id)


def _append_offline_diary_node(doc, root_node, diary, loc_url):
    """Append a <url> entry (with an empty <data><display/></data> payload)
    describing an offline diary to *root_node*."""
    url_node = doc.createElement("url")
    root_node.appendChild(url_node)
    keys = ["loc", "lastmod", "changefreq", "data"]
    items = {"loc": loc_url.format(diary.id),
             "lastmod": diary.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
             "changefreq": "weekly",
             "data": ""}
    create_node(keys, items, doc, url_node)
    data_node = url_node.getElementsByTagName("data")[0]
    data_node.appendChild(doc.createElement("display"))


@shared_task
def diary_increase(step):
    """Incremental diary sync.

    To avoid missing data, candidate diaries are selected by creation time,
    modification time AND the explicit DiaryRecord modification log.

    :param step: number of diaries per generated XML file (online diaries).
    """
    start_time = diary_record_cache.get('diary_record_time')
    if not start_time:
        start_time = '2020-05-13'
    end_time = datetime.datetime.now().strftime('%Y-%m-%d')
    diary_records = DiaryRecord.objects.using("slave").all()
    recorded_ids = set(diary_records.values_list("diary_id", flat=True))
    diaries = Diary.objects.using("slave").filter(
        Q(created_time__range=[start_time, end_time]) |
        Q(last_modified__range=[start_time, end_time]) |
        Q(id__in=recorded_ids))
    # Drop diary books that no longer have any valid (online) diary topic.
    diary_ids = list(diaries.values_list("id", flat=True))
    diary_topics = Problem.objects.using("slave").filter(
        is_online=1, flag='n', topic_type__in=['0', '1', '2'],
        diary_id__in=diary_ids)
    diary_ids = list(diary_topics.values_list("diary_id", flat=True))
    diaries = Diary.objects.using("slave").filter(id__in=diary_ids)

    # --- online diaries ---
    online_diaries = diaries.filter(is_online=True).order_by('id')
    length = online_diaries.count()
    urls = create_xml(step, length, "inc", online_diaries)
    if len(urls) > 1:
        sitemap_url = _upload_sitemap_index(urls, './increase_sitemap.xml')
        rpc_client['internal_email/send'](
            to=MAINTAINER_EMAILS,
            subject="日记增量同步数据链接",
            text="sitemap:{}".format(sitemap_url)
        ).unwrap()
    else:
        # 0 or 1 file: mail the raw url list instead of a sitemap index.
        rpc_client['internal_email/send'](
            to=MAINTAINER_EMAILS,
            subject="日记增量同步数据链接",
            text='url:{}'.format(urls)
        ).unwrap()

    # --- offline diaries ("dec"/delete sitemap) ---
    not_online_diaries = diaries.filter(is_online=False)
    loc_url = 'https://backend.igengmei.com/api/diary/topic/{}/_data'
    offline_total = not_online_diaries.count()
    if offline_total < 25000:
        doc = Dom.Document()
        root_node = doc.createElement("urlset")
        root_node.setAttribute("content_method", "dec")
        doc.appendChild(root_node)
        for diary in not_online_diaries:
            _append_offline_diary_node(doc, root_node, diary, loc_url)
        with open('./delete_diary.xml', 'wb') as f:
            f.write(doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
        url = upload_file('./delete_diary.xml', IMG_TYPE.POST)
        rpc_client['internal_email/send'](
            to=MAINTAINER_EMAILS,
            subject="日记增量(删除)同步数据链接",
            text='url:{}'.format(url)
        ).unwrap()
    else:
        chunk_size = 25000  # do not clobber the `step` parameter like the original did
        count = 0
        urls = []
        while count < offline_total:
            # BUG FIX: the original reused a single Document across chunks,
            # so every uploaded chunk file accumulated all previous chunks'
            # <url> nodes.  Build a fresh document per chunk instead.
            doc = Dom.Document()
            root_node = doc.createElement("urlset")
            root_node.setAttribute("content_method", "dec")
            doc.appendChild(root_node)
            for diary in not_online_diaries[count: count + chunk_size]:
                _append_offline_diary_node(doc, root_node, diary, loc_url)
            with open('./delete_diary.xml', 'wb') as f:
                f.write(doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
            urls.append(upload_file('./delete_diary.xml', IMG_TYPE.POST))
            count += chunk_size
        sitemap_url = _upload_sitemap_index(urls, './delete_sitemap_diary.xml')
        rpc_client['internal_email/send'](
            to=MAINTAINER_EMAILS,
            subject="日记增量(删除)同步数据链接",
            text="sitemap:{}".format(sitemap_url)
        ).unwrap()

    # Processed: clear the modification log and remember where this run
    # ended so the next run picks up from here (TTL = 14 days).
    diary_records.delete()
    diary_record_cache.setex('diary_record_time', 1209600, end_time)
    return


def create_xml(step, length, type, diarys):
    """Serialize *diarys* into one or more <urlset> XML files of *step* entries.

    Each batch is written to ./diary.xml, uploaded, and the uploaded URLs
    are returned in order.

    :param step: batch size (diaries per XML file).
    :param length: total number of diaries to process.
    :param type: value of the ``content_method`` urlset attribute
        ("full" / "inc").  (Parameter name kept for caller compatibility.)
    :param diarys: sliceable Diary queryset/sequence.
    :return: list of uploaded file URLs.
    """
    loc_url = 'https://www.igengmei.com/diary_book/{}/'
    data_murl = 'https://m.igengmei.com/diary_book/{}/'
    data_url = 'https://www.igengmei.com/diary_book/{}/'
    count = 0
    urls = []
    while count < length:
        doc = Dom.Document()
        root_node = doc.createElement("urlset")
        root_node.setAttribute("content_method", type)
        doc.appendChild(root_node)

        # Batch-collect doctor/author/hospital/tag ids for this chunk and
        # resolve them in a single RPC against gaia.
        request_data = {}
        es_data = []
        for obj in diarys[count:count + step]:
            tag_ids = list(DiaryTag.objects.using("slave").filter(
                diary_id=obj.id).values_list('tag_id', flat=True))
            request_data['{}'.format(obj.id)] = {
                'doctor_id': obj.doctor_id,
                'author': obj.user_id,
                'tag_ids': tag_ids,
                'hospital_id': obj.hospital_id,
            }
            es_data.append(obj.id)

        # Do not shadow the imported `rpc_client` (the original did).
        rpc_invoker = get_current_rpc_invoker()
        # BUG FIX: `response` was unbound (NameError) when all retries
        # failed; start from an empty dict so lookups below fail per-diary
        # with KeyError and are skipped, matching the intended behaviour.
        response = {}
        for _ in range(10):
            try:
                response = rpc_invoker['api/diary/extra_info'](data=request_data).unwrap()
                break
            except Exception:
                logging_exception()
                time.sleep(12)

        diaries_recommend_score = {}
        for score in get_diaries_score(es_data):
            diaries_recommend_score[score['diary_id']] = score['score']

        for obj in diarys[count:count + step]:
            try:
                result = response[str(obj.id)]
            except Exception:
                logging_exception()
                continue
            obj_topics = Problem.objects.using("slave").filter(
                is_online=1, flag='n', topic_type__in=['0', '1', '2'],
                diary_id=obj.id).order_by('-created_time')
            first_topic = obj_topics.first()
            if first_topic is None:
                # No valid topic left: nothing to describe, skip the diary
                # (the original crashed on `.first().answer` here).
                continue

            url_node = doc.createElement("url")
            root_node.appendChild(url_node)
            # "priority" has no value in `items`, so it renders as an empty
            # element -- preserved from the original output.
            keys = ['loc', "lastmod", "changefreq", "priority"]
            items = {"loc": loc_url.format(obj.id),
                     "lastmod": obj.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
                     "changefreq": "weekly"}
            create_node(keys, items, doc, url_node)

            # <data><display>...</display></data> payload.
            data_root = doc.createElement("data")
            url_node.appendChild(data_root)
            display_node = doc.createElement("display")
            data_root.appendChild(display_node)

            keys = ["id", "title", "description", "url", "mUrl", "firstTime",
                    "updateTime", "imageBefore", "imageAfter"]
            items = {'id': str(obj.id),
                     'title': obj.title,
                     'url': data_url.format(obj.id),
                     'mUrl': data_murl.format(obj.id),
                     'firstTime': str(obj.created_time.timestamp()),
                     'updateTime': str(obj.created_time.timestamp()),
                     'description': first_topic.answer}
            # Before/after cover images with a shared placeholder fallback.
            imageAfter = imageBefore = 'https://heras.igengmei.com/service_home/2020/03/17/a99b291047'
            for image in obj.cover:
                if image.get('desc') == 'After':
                    imageAfter = image.get('image') or imageAfter
                else:
                    imageBefore = image.get('image') or imageBefore
            items.update(imageAfter=imageAfter, imageBefore=imageBefore)
            create_node(keys, items, doc, display_node)

            # Hospital / project / satisfaction ("satisfaction" stays empty
            # since the dict never carries it -- preserved behaviour).
            tags_info = result['tags_info']
            doctor_info = result['doctor_info']
            create_node(['projectName', "satisfaction", 'hospital'],
                        {'projectName': tags_info.get('projectName'),
                         'hospital': doctor_info.get('hospital') or obj.raw_hospital},
                        doc, display_node)

            # Doctor info.
            doctor_node = doc.createElement("doctor")
            display_node.appendChild(doctor_node)
            create_node(['name', "url", 'mUrl'], doctor_info, doc, doctor_node)

            # Region info.
            area_info = result['area_info']
            addressSpecification_node = doc.createElement('addressSpecification')
            display_node.appendChild(addressSpecification_node)
            create_node(['country', 'province', 'city', "area"],
                        area_info, doc, addressSpecification_node)

            # Category levels plus view/like/comment counters + recommend score.
            items = {'viewCount': int(obj.view_num),
                     'likeCount': obj.like_num,
                     'commentCount': obj.reply_num,
                     "recommendScore": str(diaries_recommend_score.get(obj.id, 0))}
            items.update(**tags_info)
            create_node(['categoryFirstName', 'categorySecondName',
                         'categoryThirdName', 'viewCount', 'likeCount',
                         'commentCount', 'recommendScore'],
                        items, doc, display_node)

            # One <tags> element per tag name.
            for tag_name in tags_info.get('tag_names', []):
                tag_node = doc.createElement("tags")
                tag_node.appendChild(doc.createTextNode(tag_name))
                display_node.appendChild(tag_node)

            # Consultation links ("tel" intentionally left empty).
            consultation_node = doc.createElement("consultation")
            consult_url = "https://gyfk12.kuaishang.cn/bs/mim/119017/116478/870671/sText_内投-APP首页.htm"
            create_node(["tel", "url", "mUrl"],
                        {"url": consult_url, "mUrl": consult_url},
                        doc, consultation_node)
            display_node.appendChild(consultation_node)

            # Author info.
            author_node = doc.createElement("author")
            create_node(['id', 'name', 'image', 'url', 'mUrl'],
                        result['author_info'], doc, author_node)
            display_node.appendChild(author_node)

            # Up to 10 newest diary topics.  Use a distinct loop variable --
            # the original reused `obj`, clobbering the diary object.
            keys = ['id', 'number', 'updateTime', 'day', 'viewCount',
                    'likeCount', 'commentCount', 'description', 'imageAfter',
                    'url', 'mUrl']
            topic_murl = 'https://m.igengmei.com/topic/{}'
            topic_url = 'https://www.igengmei.com/topic/{}/'
            for i, topic in enumerate(obj_topics[:10]):
                updateTime = str(topic.created_time.timestamp()) if topic.last_modified else ''
                start_day = topic.diary.operation_time if topic.diary.operation_time else topic.diary.created_time
                end_day = topic.operation_date or topic.created_time
                day = (end_day - start_day).days
                topic_image = TopicImage.objects.filter(
                    topic_id=topic.id).order_by('-id').first()
                if topic_image:
                    image_url = topic_image.get_image_data().get("image", "")
                else:
                    image_url = get_full_path(topic.post_operation_image) if topic.post_operation_image else ''
                items = {'id': topic.id, 'number': i, 'updateTime': updateTime,
                         'day': day, 'viewCount': topic.view_amount,
                         'likeCount': topic.vote_amount,
                         'commentCount': topic.reply_num,
                         'description': topic.answer,
                         'imageAfter': image_url,
                         'url': topic_url.format(topic.id),
                         'mUrl': topic_murl.format(topic.id)}
                diaryList_node = doc.createElement("diaryList")
                create_node(keys, items, doc, diaryList_node)
                display_node.appendChild(diaryList_node)

            # Provider (company) info.
            provider_node = doc.createElement("provider")
            create_node(["brand", "logo", "showurl"],
                        {"brand": "更美",
                         "logo": "https://heras.igengmei.com/2020/02/26/4dc1224831",
                         "showurl": "www.igengmei.com"},
                        doc, provider_node)
            display_node.appendChild(provider_node)

        with open('./diary.xml', 'wb') as f:
            f.write(doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
        count += step
        urls.append(upload_file('./diary.xml', IMG_TYPE.POST))
    return urls