import pandas as pd import pymysql import json from sqlalchemy import create_engine def con_sql(db,sql): cursor = db.cursor() try: cursor.execute(sql) result = cursor.fetchall() df = pd.DataFrame(list(result)) except Exception: print("发生异常", Exception) df = pd.DataFrame() finally: db.close() return df def is_json(myjson): try: json.loads(myjson) except ValueError: return False return True def json_format(x): a = json.loads(x) return set([x["appName"] for x in a]) def sort_app(): db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod') sql = "select device_id,app_list,stat_date from device_id_applist" df = con_sql(db, sql).dropna() df = df.rename(columns={0: "device_id", 1: "app_list",2:"stat_date"}) print(df.shape) df = df.sort_values(by="stat_date",ascending=False) df = df.drop("stat_date",axis=1) df = df.drop_duplicates("device_id") print(df.shape) df = df.loc[df["app_list"].apply(is_json)] category = {"competitor":{"新氧美容","悦美","美呗整形","悦美微整形","如丽美容","医美咖","整形去哪儿","美黛拉","整形思密达","美芽"}, "dianshang":{"京东","淘宝","唯品会","天猫","苏宁易购","国美","当当","亚马逊","网易严选","小米有品"}, "kuajing_dianshang": {"小红书", "网易考拉", "洋码头", "达令全球好货", "海狐海淘", "HIG0", "豌豆公主", "尚品网", "丰趣海淘", "比呀比海外购"}, "zhibo": {"YY直播", "映客直播", "花椒直播", "NOW直播", "小米直播", "一直播", "KK直播", "酷狗直播", "来疯直播", "喵播"}, "youxizhibo": {"虎牙直播", "斗鱼直播", "熊猫直播", "触手直播", "企鹅电竞", "龙珠直播", "战旗直播", "全民直播", "CC直播", "火猫直播"}, "short_video": {"抖音短视频", "快手", "西瓜视频", "火山小视频", "秒拍", "快视频", "小影", "蛙趣视频", "最右", "小咖秀"}, "meitu": {"美图秀秀", "美颜相机", "天天P图", "Faceu激萌", "B612咔叽", "in", "相机360", "潮自拍", "玩图", "nice","轻颜相机","无他相机"}, "tiyu": {"直播吧", "腾讯体育", "新浪体育", "虎扑体育", "懂球帝", "CCTV5", "疯狂体育", "球探体育比分", "PP体育", "A8体育直播"}, "read":{"掌阅", "QQ阅读", "咪咕阅读", "书旗小说", "多看阅读", "追书神器", "搜狗阅读", "微信读书", "起点小说", "宜搜小说"}, "finance": {"21财经", "华尔街见闻", "新浪财经", "时代财经", "和讯财经", "第一财经", "FT中文网", "财经杂志", "财新", "央视财经"}, "fashion_clothes": {"蘑菇街", "聚美优品", "美丽说", "楚楚街", "穿衣助手", "有货", "优品惠", "优购时尚商城", "走秀奢侈品"}, "muying": {"贝贝网", "蜜芽", "孩子王", "妈妈100", "大V店", "宝贝格子", "乐友", "母婴之家", "国际妈咪海淘母婴商城", "美囤妈妈", "妈妈网孕育", "宝宝树孕育", "辣妈帮", "亲宝宝", "宝宝知道", "妈妈社区", "妈妈帮", "柚宝宝", "育儿宝"}, "fresh": {"每日优鲜", "京东到家", "天天果园", "中粮我买网", "本来生活", "手机惠农", "盒马", "顺丰优选", "百果园", "易果生鲜"}, "bijia": {"美团", "拼多多", "折800", "返利网", "卷皮折扣", "淘粉吧", "聚划算", "一淘", "网购联盟", "返利淘联盟", "什么值得买", "花生日记"}, "travel": {"携程旅行", "去哪儿旅行", "同程旅游", "途牛旅游", "飞猪", "马蜂窝旅游", "艺龙旅行", "驴妈妈旅游", "TripAdvisor猫途鹰", "美团旅行"}, "airplane": {"航班管家", "飞常准", "航旅纵横", "春秋航空", "南方航空", "中国国航", "东方航空", "海南航空", "深圳航空", "四川航空"}, "love": {"百合婚恋", "世纪佳缘", "珍爱网", "牵手婚恋", "探探", "热恋", "有缘网", "约会吧", "约爱", "快约爱"}, "stock": {"同花顺炒股票", "大智慧", "涨乐财富通", "腾讯自选股", "广发证券易淘金", "金太阳", "国泰君安君弘", "海通e海通财", "平安证券", "同花顺"}, "car": {"平安好车主", "途虎养车", "车主无忧", "汽车超人", "车e族", "汽修宝", "车点点", "汽车大师", "乐车邦", "车享家"}, "child": {"小伴龙", "儿歌多多", "宝宝巴士奇妙屋", "智慧树", "贝瓦儿歌", "儿歌点点", "宝贝听听", "宝宝小厨房", "宝宝游乐园", "叽里呱啦"}, "homework": {"作业帮", "小猿搜题", "一起作业学生端", "学霸君", "互动作业", "猿题库", "纳米盒", "阿凡题", "洋葱数学"}, "work": {"钉钉", "企业微信", "移动彩云", "云之家", "今目标", "口袋助理", "推事本", "奇鱼微办公", "工作圈", "明道"}, "job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"} } df["app_list"] = df["app_list"].apply(json_format) n = df.shape[0] df["sum"] = 0 for i in category.keys(): df[i] = df["app_list"].apply(lambda x: 1 if len(x & category[i]) > 0 else 0) df["sum"] = df["sum"]+df[i] # print(i) # print(df.loc[df[i]==1].shape[0]/n) df = df.drop("app_list",axis=1) # for i in df["sum"].unique(): # print(i) # a = df.loc[df["sum"] == i].shape[0]/n # print(a) yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') print(df.shape) n = 200000 for i in range(0,df.shape[0],n): print(i) if i == 0: temp = df.iloc[0:n] elif i+n > df.shape[0]: temp = df.iloc[i:] else: temp = df.loc[i:i+n] pd.io.sql.to_sql(temp, "app_list_sort", yconnect, schema='jerry_test', if_exists='append', index=False) print("insert done") if __name__ == "__main__": sort_app()