# -*- coding: utf-8 -*-
"""Derive short hospital names by stripping generic words from full names.

Run as a script, this reads hospital and city names from MySQL, removes
generic organisation/location words and city names from each hospital name,
and writes the (old_name, new_name) pairs to a CSV file.
"""
import json
import re

import pandas as pd
import pymysql
import redis

# NOTE(review): connection settings (including the password) are hard-coded
# in source control; they should be moved to external configuration.
_DB_SETTINGS = {
    "host": "172.16.30.143",
    "port": 3306,
    "user": "work",
    "passwd": "BJQaT9VzDcuPBqkd",
    "db": "zhengxing",
}

# Generic organisation words.  Order matters: words are removed one after
# another, so a longer word must precede any word it contains
# ("门诊部" before "门诊").
_ORG_WORDS = (
    "门诊部", "医疗", "门诊", "研究所", "有限", "公司", "医学", "诊所", "中心",
    "医美", "集团", "卫生", "机构", "专业", "光学", "国际", "连锁", "综合", "专科",
)

_LOCATION_WORDS = ("街道", "社区")

# Treatment/project tags.  "丰胸" and "美胸" are listed before "胸" so that
# they are not made unmatchable by the shorter word being removed first.
_PROJECT_TAGS = (
    "口腔", "植发", "牙", "皮肤", "眼", "外科", "美容", "整形", "烧伤",
    "丰胸", "美胸", "胸", "祛痘", "祛斑", "脱毛", "创伤", "除疤", "半永久",
    "纹绣", "纹眉",
)

# Parenthesised segments, applied in this order.  The full-width characters
# are written as escapes (U+FF08 / U+FF09) so they cannot be silently
# normalised to their ASCII look-alikes.
_BRACKET_PATTERNS = (
    re.compile(r"\uff08.*?\uff09"),  # full-width (Chinese) pair
    re.compile(r"\(.*?\)"),          # ASCII pair
    re.compile(r"\(.*?\uff09"),      # ASCII opening, full-width closing
)


def _fetch_first_column(sql):
    """Run *sql* and return the first column of every row as a list.

    The cursor and the connection are closed even when the query fails.
    An empty result set yields an empty list.
    """
    db = pymysql.connect(**_DB_SETTINGS)
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        db.close()
    return [row[0] for row in rows]


def _strip_words(name, words):
    """Remove every occurrence of each word in *words* from *name*, in order.

    Words are treated as literal text, not regular expressions, so a word
    containing regex metacharacters cannot raise or over-match.  Empty or
    missing words are skipped.
    """
    for word in words:
        if word:
            name = name.replace(word, "")
    return name


def get_city_names():
    """Return all city names from the ``api_city`` table.

    The first ten names are printed as a quick sanity check.
    """
    name_list = _fetch_first_column("select name from api_city;")
    print(name_list[:10])
    return name_list


def name_short(limit=20, output_path="/home/gmuser/12.csv"):
    """Shorten hospital names and write old/new name pairs to a CSV file.

    Args:
        limit: Maximum number of hospital names to process; ``None``
            processes all of them.
        output_path: Destination of the CSV file with the columns
            ``old_name`` and ``new_name`` (no index column).
    """
    name_list = _fetch_first_column("select name from api_hospital;")
    if limit is not None:
        name_list = name_list[:limit]
    print(name_list[:10])

    # City names come last so the generic words are removed before them.
    stop_words = list(_ORG_WORDS) + list(_LOCATION_WORDS) + get_city_names()

    new_names = [
        _strip_words(name, stop_words).replace("\t", "")  # also drop tabs
        for name in name_list
    ]

    df = pd.DataFrame({"old_name": name_list, "new_name": new_names})
    print(df.head(6))
    df.to_csv(output_path, index=False)


def name_process(name):
    """Return *name* with tags, generic words, brackets and digits removed.

    Removal happens in this order: project tags, organisation words and
    location words; parenthesised segments (full-width pair, ASCII pair,
    then ASCII-open/full-width-close); digits; tab characters.

    Args:
        name: The full hospital name.

    Returns:
        The shortened name as a string.
    """
    name = _strip_words(name, _PROJECT_TAGS + _ORG_WORDS + _LOCATION_WORDS)
    for pattern in _BRACKET_PATTERNS:
        name = pattern.sub("", name)
    # Remove digits.
    name = re.sub(r"\d", "", name)
    # Remove tab characters.
    return name.replace("\t", "")


if __name__ == '__main__':
    name_short()