import pandas as pd
import pymysql
import json
from sqlalchemy import create_engine

def con_sql(db,sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("发生异常", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df

def is_json(myjson):
    try:
        json.loads(myjson)
    except ValueError:
        return False
    return True

def json_format(x):
    a = json.loads(x)
    return set([x["appName"] for x in a])

def sort_app():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select device_id,app_list,stat_date from device_id_applist"
    df = con_sql(db, sql).dropna()
    df = df.rename(columns={0: "device_id", 1: "app_list",2:"stat_date"})
    print(df.shape)
    df = df.sort_values(by="stat_date",ascending=False)
    df = df.drop("stat_date",axis=1)
    df = df.drop_duplicates("device_id")
    print(df.shape)
    df = df.loc[df["app_list"].apply(is_json)]


    category = {"competitor":{"新氧美容","悦美","美呗整形","悦美微整形","如丽美容","医美咖","整形去哪儿","美黛拉","整形思密达","美芽"},
                "dianshang":{"京东","淘宝","唯品会","天猫","苏宁易购","国美","当当","亚马逊","网易严选","小米有品"},
                "kuajing_dianshang": {"小红书", "网易考拉", "洋码头", "达令全球好货", "海狐海淘",
                                      "HIG0", "豌豆公主", "尚品网", "丰趣海淘", "比呀比海外购"},
                "zhibo": {"YY直播", "映客直播", "花椒直播", "NOW直播", "小米直播", "一直播", "KK直播", "酷狗直播",
                          "来疯直播", "喵播"},
                "youxizhibo": {"虎牙直播", "斗鱼直播", "熊猫直播", "触手直播", "企鹅电竞", "龙珠直播", "战旗直播",
                               "全民直播", "CC直播", "火猫直播"},
                "short_video": {"抖音短视频", "快手", "西瓜视频", "火山小视频", "秒拍", "快视频", "小影", "蛙趣视频",
                                "最右", "小咖秀"},
                "meitu": {"美图秀秀", "美颜相机", "天天P图", "Faceu激萌", "B612咔叽", "in", "相机360", "潮自拍",
                          "玩图", "nice","轻颜相机","无他相机"},
                "tiyu": {"直播吧", "腾讯体育", "新浪体育", "虎扑体育", "懂球帝", "CCTV5", "疯狂体育",
                         "球探体育比分", "PP体育", "A8体育直播"},
                "read":{"掌阅", "QQ阅读", "咪咕阅读", "书旗小说", "多看阅读", "追书神器", "搜狗阅读", "微信读书",
                        "起点小说", "宜搜小说"},
                "finance": {"21财经", "华尔街见闻", "新浪财经", "时代财经", "和讯财经", "第一财经", "FT中文网", "财经杂志", "财新", "央视财经"},
                "fashion_clothes": {"蘑菇街", "聚美优品", "美丽说", "楚楚街", "穿衣助手", "有货", "优品惠", "优购时尚商城", "走秀奢侈品"},
                "muying": {"贝贝网", "蜜芽", "孩子王", "妈妈100", "大V店", "宝贝格子", "乐友", "母婴之家", "国际妈咪海淘母婴商城", "美囤妈妈",
                           "妈妈网孕育", "宝宝树孕育", "辣妈帮", "亲宝宝", "宝宝知道", "妈妈社区", "妈妈帮", "柚宝宝", "育儿宝"},
                "fresh": {"每日优鲜", "京东到家", "天天果园", "中粮我买网", "本来生活", "手机惠农", "盒马", "顺丰优选", "百果园", "易果生鲜"},
                "bijia": {"美团", "拼多多", "折800", "返利网", "卷皮折扣", "淘粉吧", "聚划算", "一淘", "网购联盟", "返利淘联盟", "什么值得买", "花生日记"},
                "travel": {"携程旅行", "去哪儿旅行", "同程旅游", "途牛旅游", "飞猪", "马蜂窝旅游", "艺龙旅行", "驴妈妈旅游",
                           "TripAdvisor猫途鹰", "美团旅行"},
                "airplane": {"航班管家", "飞常准", "航旅纵横", "春秋航空", "南方航空", "中国国航", "东方航空", "海南航空", "深圳航空", "四川航空"},
                "love": {"百合婚恋", "世纪佳缘", "珍爱网", "牵手婚恋", "探探", "热恋", "有缘网", "约会吧", "约爱", "快约爱"},
                "stock": {"同花顺炒股票", "大智慧", "涨乐财富通", "腾讯自选股", "广发证券易淘金", "金太阳", "国泰君安君弘", "海通e海通财", "平安证券", "同花顺"},
                "car": {"平安好车主", "途虎养车", "车主无忧", "汽车超人", "车e族", "汽修宝", "车点点", "汽车大师", "乐车邦", "车享家"},
                "child": {"小伴龙", "儿歌多多", "宝宝巴士奇妙屋", "智慧树", "贝瓦儿歌", "儿歌点点", "宝贝听听", "宝宝小厨房", "宝宝游乐园", "叽里呱啦"},
                "homework": {"作业帮", "小猿搜题", "一起作业学生端", "学霸君", "互动作业", "猿题库", "纳米盒", "阿凡题", "洋葱数学"},
                "work": {"钉钉", "企业微信", "移动彩云", "云之家", "今目标", "口袋助理", "推事本", "奇鱼微办公", "工作圈", "明道"},
                "job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
                 }
    df["app_list"] = df["app_list"].apply(json_format)
    n = df.shape[0]
    df["sum"] = 0
    for i in category.keys():
        df[i] = df["app_list"].apply(lambda x: 1 if len(x & category[i]) > 0 else 0)
        df["sum"] = df["sum"]+df[i]
        # print(i)
        # print(df.loc[df[i]==1].shape[0]/n)
    df = df.drop("app_list",axis=1)

    # for i in df["sum"].unique():
    #     print(i)
    #     a = df.loc[df["sum"] == i].shape[0]/n
    #     print(a)

    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    print(df.shape)
    n = 200000
    for i in range(0,df.shape[0],n):
        print(i)
        if i == 0:
            temp = df.iloc[0:n]
        elif i+n > df.shape[0]:
            temp = df.iloc[i:]
        else:
            temp = df.loc[i:i+n]
        pd.io.sql.to_sql(temp, "app_list_sort", yconnect, schema='jerry_test', if_exists='append', index=False)
        print("insert done")



if __name__ == "__main__":
    sort_app()



