# -*- coding:UTF-8 -*- # @Time : 2020/9/11 13:51 # @File : func_from_es_get_article.py # @email : litao@igengmei.com # @author : litao import pymysql from elasticsearch import Elasticsearch exists_es_dic = {} es = Elasticsearch([ { 'host': '172.16.31.17', 'port': 9200, }, { 'host': '172.16.31.11', 'port': 9200, }]) def con_sql_jerry_prod(sql): # 从数据库的表里获取数据 db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='aqpuBLYzEV7tML5RPsN1pntUzFy', db='jerry_prod') cursor = db.cursor() cursor.execute(sql) result = cursor.fetchall() db.close() return result def con_sql_doris_prod(sql): # 从数据库的表里获取数据 db = pymysql.connect(host='172.16.30.136', port=3306, user='doris', passwd='o5gbA27hXHHm', db='doris_prod') cursor = db.cursor() cursor.execute(sql) result = cursor.fetchall() db.close() return result def get_device_num_from_es(word): results = es.search( index='gm-dbmw-device', doc_type='doc', timeout='10s', size=0, body={ "query": { "bool": { "should": [ { "nested": { "path": "first_demands", "query": { "bool": { "must": [ { "terms": { "first_demands.name": [ word ] } } ] } } } }, { "nested": { "path": "second_demands", "query": { "bool": { "must": [ { "terms": { "second_demands.name": [ word ] } } ] } } } }, { "nested": { "path": "first_solutions", "query": { "bool": { "must": [ { "terms": { "first_solutions.name": [ word ] } } ] } } } }, { "nested": { "path": "second_solutions", "query": { "bool": { "must": [ { "terms": { "second_solutions.name": [ word ] } } ] } } } }, { "nested": { "path": "first_positions", "query": { "bool": { "must": [ { "terms": { "first_positions.name": [ word ] } } ] } } } }, { "nested": { "path": "second_positions", "query": { "bool": { "must": [ { "terms": { "second_positions.name": [ word ] } } ] } } } }, { "nested": { "path": "projects", "query": { "bool": { "must": [ { "terms": { "projects.name": [ word ] } } ] } } } } ], "minimum_should_match": 1 } } } ) tractate_content_num = results["hits"]["total"] return tractate_content_num def get_tractate_tags_from_es(doc_id,id_type='tractate'): # print(doc_id) body = { "query": { "bool": { "filter": [ {"term": {"_id": doc_id}} ] } } } if id_type == 'tractate': # tractate results = es.search( index='gm-dbmw-tractate-read', doc_type='tractate', timeout='10s', body=body ) # print(results) elif id_type == 'diary': results = es.search( index='gm-dbmw-diary-read', doc_type='diary', timeout='10s', body=body ) return results["hits"]['hits'][0] def get_es_article_num(tag_dict, allow_tag=["first_demands", "second_demands", "first_solutions", "second_solutions", "positions", "second_positions", "tags_v3"]): article_dict = { "first_demands": [], "second_demands": [], "first_solutions": [], "second_solutions": [], "positions": [], "second_positions": [], "tags_v3": [], } for key in list(article_dict.keys()): if key not in allow_tag: article_dict.pop(key) for tag_type in tag_dict: for tag_name in tag_dict[tag_type]: if tag_name in exists_es_dic: article_dict[tag_type].append(exists_es_dic[tag_name]) continue body = { "query": { "bool": { "minimum_should_match": 1, "should": [], "must": [ { "term": { "is_online": True } }, { "terms": { "content_level": [6, 5, 4, 3.5, 3] } }, { "range": { "content_length": { "gte": 30 } } }], } }, } body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}}) results = es.search( index='gm-dbmw-answer-read', doc_type='answer', timeout='10s', size=0, body=body ) answer_content_num = results["hits"]["total"] body = { "query": { "bool": { "minimum_should_match": 1, "should": [], "must": [{"term": {"is_online": True}}, { "terms": {"content_level": [6, 5, 4, 3.5, 3]} }] } } } body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}}) # tractate results = es.search( index='gm-dbmw-tractate-read', doc_type='tractate', timeout='10s', size=0, body=body ) tractate_content_num = results["hits"]["total"] body = { "query": { "bool": { "minimum_should_match": 1, "should": [], "must": [{"term": {"is_online": True}}, { "term": { "has_cover": True } }, {"term": { "is_sink": False } }, { "term": { "has_after_cover": True } }, { "term": { "has_before_cover": True } }, {"range": {"content_level": {"gte": "3"}}}, { "term": { "content_simi_bol_show": 0 } } ] } }, } body["query"]["bool"]["must"].append({"term": {tag_type: tag_name}}) ### diary 日记 results = es.search( index='gm-dbmw-diary-read', doc_type='diary', timeout='10s', size=0, body=body ) diary_content_num = results["hits"]["total"] total_num = answer_content_num + tractate_content_num + diary_content_num data_dic = {tag_name: (answer_content_num, tractate_content_num, diary_content_num, total_num)} # print(data_dic) exists_es_dic[tag_name] = data_dic article_dict[tag_type].append(data_dic) return article_dict def get_user_post_from_mysql(sql): second_demands_card_id_list = [] tags_v3_card_id_list = [] second_demands_count_dict = {} tags_v3_count_dict = {} second_demands_tractate_dict = {} tags_v3_tractate_dict = {} # print(sql) sql_res = con_sql_doris_prod(sql) for card_id in sql_res: # print(type(card_id[0]),card_id) es_res = get_tractate_tags_from_es(card_id[0],id_type='tractate') second_demands_tractate_dict[card_id[0]] = es_res["_source"]["second_demands"] tags_v3_tractate_dict[card_id[0]] = es_res["_source"]["tags_v3"] for position in es_res["_source"]["second_demands"]: if position in second_demands_count_dict: second_demands_count_dict[position] += 1 else: second_demands_count_dict[position] = 1 second_demands_card_id_list.append(int(card_id[0])) for tag in es_res["_source"]["tags_v3"]: if tag in tags_v3_count_dict: tags_v3_count_dict[tag] += 1 else: tags_v3_count_dict[tag] = 1 tags_v3_card_id_list.append(int(card_id[0])) # print(card_id_list) return second_demands_count_dict, tags_v3_count_dict, second_demands_card_id_list,tags_v3_card_id_list,second_demands_tractate_dict,tags_v3_tractate_dict def get_diary_from_mysql(sql,id_type='tractate'): second_demands_card_id_list = [] tags_v3_card_id_list = [] second_demands_count_dict = {} tags_v3_count_dict = {} second_demands_tractate_dict = {} tags_v3_tractate_dict = {} sql_res = con_sql_doris_prod(sql) for card_id in sql_res: # print(type(card_id[0]),card_id) es_res = get_tractate_tags_from_es(card_id[0],id_type=id_type) second_demands_tractate_dict[card_id[0]] = es_res["_source"]["second_demands"] tags_v3_tractate_dict[card_id[0]] = es_res["_source"]["tags_v3"] for position in es_res["_source"]["second_demands"]: if position in second_demands_count_dict: second_demands_count_dict[position] += 1 else: second_demands_count_dict[position] = 1 second_demands_card_id_list.append(int(card_id[0])) for tag in es_res["_source"]["tags_v3"]: if tag in tags_v3_count_dict: tags_v3_count_dict[tag] += 1 else: tags_v3_count_dict[tag] = 1 tags_v3_card_id_list.append(int(card_id[0])) # print(card_id_list) return second_demands_count_dict, tags_v3_count_dict, second_demands_card_id_list,tags_v3_card_id_list,second_demands_tractate_dict,tags_v3_tractate_dict