import logging
import time
import datetime
import re
import traceback

import redis
import jieba.posseg as pseg
from django.conf import settings

from gm_types.mimas import TRACTATE_STATUS, PGC_TYPE
from gm_types.gaia import DOCTOR_TYPE, TAG_V3_TYPE, TAG_TYPE
from gm_types.doris import SELECTED_CONTENT_TYPE
from talos.models.tractate.tractate import Tractate
from talos.models.tractate.vote import TractateVote
from talos.models.tractate.reply import TractateReply
from talos.services.user import UserService
from talos.services.tractate.tractate import TractateService
from talos.services.convert_service.user_convert_service import UserConvertService
from data_sync.utils import to_epoch, tzlc
from qa.models.toutiao import (by_content_type_id_get_keywords, get_content_star_keywords,
                               get_content_title_keywords, get_content_star_first_keyword, has_service)
from tags.services.tag import (get_tagv3_analysis_info, get_tag_v3_operators_tags, get_tag_v3_anecdote_tags,
                               get_tag_v3_anecdote_tag_ids, get_tag_v3_channel_tags_tags,
                               get_tag_v3_names_by_tag_v3_ids, get_gossip_tags, get_tag_v3_gossip_tag_ids)
from algorithm.text_classifical.base import model as alo_model

# Strips HTML tags.  Compiled once at import time instead of per call / per
# loop iteration.
_HTML_TAG_RE = re.compile(r"<[^>]+>", re.S)

# Tag names that force a tractate out of the index feed (show_by_index == 2).
# Duplicates from the original inline list are harmless — it is only ever used
# as a set.
_FILTER_TAG_NAMES = frozenset([
    "斩男心机妆", "今日look打卡", "少女心未泯", "颜值高光时刻", "美妆", "Get漫画迷人眼",
    "最显白口红推荐", "口红试色", "穿搭技巧", "眼妆教程", "腮红", "氧气笑容练成记",
    "口红试色", "假体隆胸", "自体脂肪隆胸", "胶原蛋白填充隆胸", "玻尿酸填充隆胸",
    "胸部假体取出", "胸部失败修复", "胸部下垂矫正", "胸部修复", "胸形美化",
    "丰胸(隆胸)", "胸部塑身", "玻尿酸隆胸", "假体隆胸", "胶原蛋白隆胸", "埋线隆胸",
    "自体脂肪隆胸", "胸部注射物取出", "胸部假体取出", "胸部假体取出", "胸部修复",
    "脂肪胶隆胸", "胸部整形", "隆胸", "胸部美化", "缩胸", "胸部提升", "生胸毛",
    "胸部护理", "综合隆胸", "植胸毛", "胸部修复", "隆胸修复", "胸部护理", "硅胶隆胸",
    "胸部护理", "胸部修复", "胸部手术", "胸部提升", "生胸毛", "美胸", "胸毛", "胸部",
    "胸形", "胸部假体", "胸部kyc", "自体脂肪丰胸", "泰国隆胸", "泰国假体隆胸",
    "脂肪丰胸", "复合式隆胸", "隆胸假体", "丰胸(隆胸)", "美胸养成",
])

# Item keys written by the tag-v3 analysis step, in the same order as the
# value lists unpacked from get_tagv3_analysis_info() below.
_TAGV3_ITEM_KEYS = (
    "tags_v3", "first_demands", "second_demands", "first_solutions",
    "second_solutions", "positions", "second_positions", "tagv3_ids",
    "first_demands_ids", "second_demands_ids", "first_solutions_ids",
    "second_solutions_ids", "first_positions_ids", "second_positions_ids",
    "first_classify_ids", "first_classify_names", "second_classify_ids",
    "second_classify_names",
)

# content_level -> exposure count below which the content still needs
# guaranteed traffic ("保量").  Levels missing from the map never qualify.
_EXPOSURE_THRESHOLDS = {"6": 1000, "5": 800, "4": 500, "3.5": 300, "3": 200}


def get_tractate(pks):
    """Build search/Doris sync documents for the tractates with ids *pks*.

    Besides assembling one dict per tractate, this also maintains the
    "doris:tractate:community:update_time" zset and reads the exposure hash in
    Redis, so it is not side-effect free.

    :param pks: iterable of tractate primary keys.
    :return: list of per-tractate dicts; [] if anything raises (logged).
    """
    try:
        tractates = Tractate.objects.filter(id__in=pks)
        data = []
        user_ids = list(set(tractates.values_list("user_id", flat=True)))
        user_infos = UserService.get_users_by_user_ids(user_ids)
        # NOTE(review): this list accumulates tag names across ALL tractates in
        # the batch, so one filtered tag on an early tractate also flags every
        # later one as show_by_index == 2.  That looks unintentional (a per-
        # tractate reset would be expected) — confirm before changing.
        tractate_all_tag_names = []
        # One shared client instead of reconnecting twice per loop iteration.
        redis_client = redis.StrictRedis.from_url(settings.DORIS_URL)
        for tractate in tractates:
            item = {}
            user = user_infos.get(tractate.user_id, None)
            user_name = user.nickname if user and user.nickname else ""
            # Nicknames like "xxx 医生" / "xxx 机构" keep only the first token.
            if any(marker in user_name for marker in (" 医生", " 机构")):
                user_name = user_name.split()[0]
            item['id'] = tractate.id
            item["user_id"] = tractate.user_id
            # utf-16 round-trip normalizes lone surrogates, then drop HTML tags.
            sss = tractate.content.encode('utf-16', 'surrogatepass').decode('utf-16')
            item["content"] = _HTML_TAG_RE.sub("", sss)
            item['keynote_sentence'] = get_keynote_sentence(tractate.content) if int(
                float(tractate.content_level)) >= 3 else ""
            item["content_pre"] = ""
            item["is_online"] = tractate.is_online
            item["status"] = tractate.status
            item["platform"] = tractate.platform
            item["content_level"] = tractate.content_level
            item["is_excellent"] = tractate.is_excellent
            item["pgc_type"] = tractate.pgc_type
            item["create_time"] = tzlc(tractate.create_time)
            item['update_time_stratific'] = tractate.get_update_time_stratific(tractate.create_time)
            item["create_time_epoch"] = to_epoch(tzlc(tractate.create_time))
            item["last_modified"] = tzlc(tractate.last_modified)
            # Keep the community update-time zset in sync: online community
            # content at level >= 3 is (re)scored, everything else is removed.
            redis_name = "doris:tractate:community:update_time"
            if tractate.pgc_type == PGC_TYPE.COMMUNITY \
                    and item["is_online"] \
                    and int(item["content_level"]) >= 3:
                # NOTE(review): legacy redis-py (<3.0) zadd signature
                # (name, score, member); redis-py >= 3.0 requires a mapping —
                # confirm the installed client version before upgrading.
                redis_client.zadd(redis_name, float(time.mktime(item["last_modified"].timetuple())), item["id"])
            else:
                redis_client.zrem(redis_name, item["id"])
            item["audit_time"] = tzlc(tractate.audit_time)
            item["tractate_score"] = tractate.get_tractate_score
            item['good_click'] = tractate.get_goodclick
            item['goodclick_rate_30'] = tractate.con_goodclick
            item["good_click_tractate_score"] = tractate.get_good_click_tractate_score
            tractate_tag_list = tractate.get_tag_list
            item["tractate_tag_list"] = tractate_tag_list
            item["post_time"] = tzlc(tractate.audit_time) if tractate.status == TRACTATE_STATUS.AUDIT_SUCCESS else None
            item["author"] = user_name
            tag_list = tractate.get_tag_list
            item["tractate_tag"] = tractate.get_tag(tag_list)
            item["tractate_tag_name"] = tractate.get_tag_names(tag_list)
            # Portrait tags: body parts, their sub-items and wiki items only.
            item['portrait_tag_name'] = [
                t.get("tag_name", None) for t in item["tractate_tag"]
                if t.get("tag_type", 0) in (TAG_TYPE.BODY_PART, TAG_TYPE.BODY_PART_SUB_ITEM, TAG_TYPE.ITEM_WIKI)
            ]
            item["is_video"] = tractate.get_is_video
            item["tractate_tag_name_content"] = tractate.get_tag_names_content(tag_list)
            item["hot_score"] = tractate.get_hot_score()
            item["vote_num"] = TractateVote.objects.filter(tractate_id=tractate.id, is_online=True).count()
            item["reply_num"] = TractateReply.objects.filter(tractate_id=tractate.id, is_online=True).count()
            item['reply_vote_radd'] = item["vote_num"] * 13 + item["reply_num"] * 17
            item["content_simi_bol_show"] = tractate.get_show()
            # Fresh ("new") tag system fields.
            fresh_tag_list = tractate.get_fresh_tag_list
            item["fresh_tractate_tag_list"] = fresh_tag_list
            item["fresh_tractate_tag_name"] = tractate.get_fresh_tag_names(fresh_tag_list)
            item["fresh_tractate_tag_name_content"] = tractate.get_tag_names_content(fresh_tag_list)
            item["last_any_reply_time"] = tzlc(tractate.get_tractate_last_any_reply_time())
            item["is_office"] = tractate.get_user_info_office()
            content_keyword = by_content_type_id_get_keywords(id=tractate.id, content_type="usertopic")
            item["content_keyword"] = content_keyword
            item["content_star_keyword"] = get_content_star_keywords(id=tractate.id, content_type="usertopic")
            item["content_star_first_keyword"] = get_content_star_first_keyword(id=tractate.id,
                                                                                content_type="usertopic")
            item["has_service"] = has_service(tractate_tag_list, content_keyword)
            item["user_type"] = get_user_type(tractate.user_id)
            item['has_picture'] = tractate.get_tractate_image
            # Tag-v3 analysis; the unpack order below is fixed by the callee.
            (need_refresh_data, second_demands_list, second_solutions_list, second_positions_list,
             second_demands_ids_list, second_solutions_ids_list, second_positions_ids_list,
             first_demands_ids_list, first_solutions_ids_list, first_positions_ids_list,
             first_demands_list, first_solutions_list, first_positions_list,
             project_tags_list, project_tags_ids_list,
             first_classify_ids_list, first_classify_names_list,
             second_classify_ids_list, second_classify_names_list) = get_tagv3_analysis_info(
                content_id=item["id"], content_type="tractate")
            if need_refresh_data:
                # Order matches _TAGV3_ITEM_KEYS exactly.
                tagv3_values = (project_tags_list, first_demands_list, second_demands_list,
                                first_solutions_list, second_solutions_list, first_positions_list,
                                second_positions_list, project_tags_ids_list, first_demands_ids_list,
                                second_demands_ids_list, first_solutions_ids_list, second_solutions_ids_list,
                                first_positions_ids_list, second_positions_ids_list, first_classify_ids_list,
                                first_classify_names_list, second_classify_ids_list, second_classify_names_list)
                for key, value in zip(_TAGV3_ITEM_KEYS, tagv3_values):
                    item[key] = list(value)
            else:
                for key in _TAGV3_ITEM_KEYS:
                    item[key] = []
            # Extra fields syncing the new-tag-system operator tags.
            item['operators_add_tags'] = get_tag_v3_operators_tags(content_id=tractate.id, content_type="tractate")
            item['channel_tags'] = get_tag_v3_channel_tags_tags(content_id=tractate.id, content_type="tractate")
            item['channel_tags_names'] = get_tag_v3_channel_tags_tags(content_id=tractate.id,
                                                                      content_type="tractate", get_names=True)
            item['anecdote_tags'] = get_tag_v3_anecdote_tags(content_id=tractate.id, content_type="tractate")
            item['anecdote_tag_ids'] = get_tag_v3_anecdote_tag_ids(content_id=tractate.id, content_type="tractate")
            item['new_smr'] = tractate.get_tractate_new_smart_rank_score(tractate_id=tractate.id)
            score = tractate.get_search_tractate_new_smart_rank_score(tractate_id=tractate.id)
            item['search_new_smr'] = score.get("smart_rank_score", 0)
            item['new_goodclicks'] = score.get('new_goodclick', 0)
            item["latest_interaction_time"] = tractate.get_tractate_latest_interaction_time(
                item["is_online"], item["content_level"], item["last_modified"])
            item['latest_create_or_reply_time'] = int(time.mktime(item['create_time'].timetuple()))
            if item['last_any_reply_time'] and item['last_any_reply_time'] > item["create_time"]:
                item['latest_create_or_reply_time'] = int(time.mktime(item['last_any_reply_time'].timetuple()))
            # Guaranteed-exposure ("内容保量") flag.
            days_past_value = (tzlc(datetime.datetime.today()) - item['create_time']).days
            redis_exposure_val = redis_client.hget("doris:content_exposure:tractate", item["id"])
            if redis_exposure_val:
                redis_exposure_val = int(redis_exposure_val)
                threshold = _EXPOSURE_THRESHOLDS.get(str(item["content_level"]))
                item["is_need_guarantee"] = threshold is not None and redis_exposure_val < threshold
            elif days_past_value <= 1:
                # Treated as same-day new content that has produced no exposure yet.
                item["is_need_guarantee"] = True
            # NOTE(review): with no exposure record and age > 1 day the key is
            # deliberately left unset (the original ">= 7 days forces False"
            # branch was already commented out).
            operators_add_tags_names = get_tag_v3_names_by_tag_v3_ids(item['operators_add_tags'])
            tractate_all_tag_names.extend(
                item["tractate_tag_name"] + item["tractate_tag_name_content"]
                + item["fresh_tractate_tag_name"] + item["fresh_tractate_tag_name_content"]
                + item["tags_v3"] + item["first_demands"] + item["second_demands"]
                + item["first_solutions"] + item["second_solutions"]
                + item["positions"] + item["second_positions"]
                + item['channel_tags_names'] + item['anecdote_tags']
                + item["first_classify_names"] + item["second_classify_names"]
                + operators_add_tags_names)
            # Set intersection instead of the original per-element membership
            # scan that rebuilt the filter set for every tag name.
            if _FILTER_TAG_NAMES.intersection(tractate_all_tag_names):
                item['show_by_index'] = 2  # 2: carries a tag from the filter list
            else:
                item['show_by_index'] = tractate.get_show_by_index(tractate.id)
            item["has_video"] = TractateService.has_video(tractate.id)
            item["is_gif"] = tractate.cover_is_dynamic  # animated (GIF) cover?
            # Gossip flag: any operator tag that is also a gossip tag.
            gossip_tag_names = get_gossip_tags()
            item["is_gossip"] = bool(set(operators_add_tags_names).intersection(gossip_tag_names))
            item["gossip_tag_ids"] = get_tag_v3_gossip_tag_ids(content_id=tractate.id, content_type="tractate")
            item["gossip_tags"] = get_tag_v3_gossip_tag_ids(content_id=tractate.id, content_type="tractate",
                                                            get_names=True)
            # Home-page selection: classify content into star / celebrity lists.
            tagv4 = tractate.get_tractate_tagv4_names(tractate_id=tractate.id)
            if tagv4:
                tags_info = tractate.get_tag(list(tagv4))
                item['selected_stars'] = []
                item['selected_internet_celebrity'] = []
                all_tags = []
                for tag in tags_info:
                    all_tags.append(tag.get("id", None))
                    tag_type = tag.get("tag_type", 0)
                    if tag_type == TAG_TYPE.STAR:
                        item['selected_stars'].append(tag.get("name", None))
                    elif tag_type == TAG_TYPE.INFLUENCER:
                        item['selected_internet_celebrity'].append(tag.get("name", None))
                    if tag.get("id") == 14288:
                        item['operators_add_tags'].append(14288)
                # Hard-coded tag ids drive the selected-content classification.
                if 15928 in all_tags:
                    item['selected_content_type'] = SELECTED_CONTENT_TYPE.BEAUTY_STAR
                elif 10682 in all_tags:
                    item['selected_content_type'] = SELECTED_CONTENT_TYPE.STAR_GOSSIP
                elif 15930 in all_tags:
                    item['selected_content_type'] = SELECTED_CONTENT_TYPE.BEAUTY_CELEBRITY
                elif 10683 in all_tags:
                    item['selected_content_type'] = SELECTED_CONTENT_TYPE.CELEBRITY_GOSSIP
                else:
                    item['selected_content_type'] = -1
            else:
                # No tag-v4 names: fall back to the text-classification model.
                # Best effort — any model failure leaves the fields unset.
                try:
                    result_data = alo_model.run(tractate.content)
                    item['selected_content_type'] = int(result_data.get("content_type", 0))
                    item['selected_internet_celebrity'] = [list(entry.keys())[0]
                                                           for entry in result_data.get("celebrity", [])]
                    item['selected_stars'] = [list(entry.keys())[0] for entry in result_data.get("star", [])]
                    projects = [list(entry.keys())[0] for entry in result_data.get("projects", [])]
                    item['portrait_tag_name'].extend(projects)
                except Exception:
                    pass
            # Operator tag 3315 switches to the new-user smart-rank score.
            if "operators_add_tags" in item and 3315 in item['operators_add_tags']:
                item['new_smr'] = tractate.get_tractate_newuser_smr(tractate_id=tractate.id)
            data.append(item)
        logging.info("get data:%s" % data)
        return data
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []


def get_user_type(user_id):
    """Classify a user by account kind.

    0: doctor, 1: office/hospital, 2: daren (paying member), 3: ordinary user,
    4: lookup failed.

    :param user_id: user id understood by UserConvertService.
    :return: int in {0, 1, 2, 3, 4}.
    """
    try:
        user_info = UserConvertService.get_user_info_by_user_id(user_id=user_id)
        if user_info.get("doctor_id", None) and user_info["doctor_type"] == DOCTOR_TYPE.DOCTOR:
            return 0
        if user_info.get("hospital_id", None) and user_info["doctor_type"] == DOCTOR_TYPE.OFFICER:
            return 1
        if "membership_level" in user_info and user_info["membership_level"] != "0":
            return 2
        return 3
    except Exception:
        return 4


def get_keynote_sentence(content):
    """Split *content* (may contain HTML) into sentences.

    HTML tags are stripped first; then newlines are inserted after sentence
    terminators (keeping a trailing closing quote attached to its sentence),
    and leading punctuation is stripped from each resulting sentence.

    :param content: raw tractate content string.
    :return: list of sentence strings; [] on any error (logged).
    """
    try:
        # utf-16 round-trip normalizes lone surrogates, then drop HTML tags.
        ss = content.encode('utf-16', 'surrogatepass').decode('utf-16')
        str_re = _HTML_TAG_RE.sub("", ss)
        para = re.sub('([;。!?\?])([^”’])', r"\1\n\2", str_re)  # single-char sentence terminators
        para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
        para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
        para = re.sub('([;。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)  # terminator followed by closing quote
        para = para.rstrip()  # drop trailing newline(s) at the end of the text
        # str.lstrip treats this as a plain set of characters to strip (the
        # literal "[", "]" and "+" are members too) — it is NOT a regex.
        leading_punct = '[’!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~。?,]+'
        return [sentence.lstrip(leading_punct) for sentence in para.split("\n")]
    except Exception:
        logging.error("catch exception,logins:%s" % traceback.format_exc())
        return []