# func_get_releaser_id.py (10.7 KB)
# -*- coding:utf-8 -*-
# @Time : 2019/5/30 11:01 
# @Author : litao
import re, requests


def toutiao(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Toutiao-family URL.

    The URL is classified by substring; only the first family that applies
    is consulted:

    * ``www.toutiao.com`` / ``www.365yg.com``: ``user/<digits>``, then
      ``to_user_id=<digits>``, then ``/m<digits>``.
    * ``m.toutiao.com``: ``profile/<digits>``.
    * ``m.365yg.com``: ``to_user_id=<digits>``.
    * any URL containing ``user_id``: ``user_id=<digits>``.
    * any URL containing ``ixigua``: the first ``/<digits>`` path segment.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The id as a string of digits, or ``None`` when nothing matches.
    """

    def first_group(pattern):
        # First capture group of the leftmost match, or None.
        found = re.search(pattern, releaserUrl)
        return found.group(1) if found else None

    if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
        # Desktop hosts accept three URL shapes; try them in priority order.
        for pattern in (r'user/([0-9]+)', r'to_user_id=([0-9]+)', r'/m(\d+)'):
            releaser_id = first_group(pattern)
            if releaser_id is not None:
                return releaser_id
        return None

    if 'm.toutiao.com' in releaserUrl:
        return first_group(r'profile/([0-9]+)')

    if 'm.365yg.com' in releaserUrl:
        return first_group(r'to_user_id=([0-9]+)')

    if "user_id" in releaserUrl:
        return first_group(r"user_id=(\d+)")

    if "ixigua" in releaserUrl:
        return first_group(r"/(\d+)")

    return None

def haokan(releaserUrl, **kwargs):
    """Extract the releaser id from a Haokan URL.

    Three URL shapes are recognised:

    * ``app_id=`` present: every digit run found from ``app_id=`` to the end
      of the line is returned, joined by single spaces (an empty string if
      there are no digits).  This legacy behaviour is kept unchanged.
    * ``app_id`` present without ``=`` (e.g. inside an encoded JSON blob):
      the first digit run wrapped in ``%22`` is returned, falling back to the
      first digit run wrapped in literal double quotes.
    * otherwise: the first digit run anywhere in the URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The id as a string, or ``None`` when the second or third shape finds
        no digits (previously this raised ``IndexError``).
    """
    if "app_id=" in releaserUrl:
        tail = ' '.join(re.findall(r'app_id=.*', releaserUrl))
        return ' '.join(re.findall(r'\d+', tail))

    if "app_id" in releaserUrl:
        # URL-encoded quotes first, then raw quotes.
        patterns = (r"%22(\d+)%22", r'"(\d+)"')
    else:
        patterns = (r'(\d+)',)

    for pattern in patterns:
        found = re.search(pattern, releaserUrl)
        if found:
            return found.group(1)
    return None

def tengxunshipin(releaserUrl, is_qq=False, **kwargs):
    """Extract the releaser id from a Tencent Video URL.

    With ``is_qq`` false the id is taken from the URL when possible:

    * if the URL contains ``id=``, the text after it up to the next ``&``;
    * otherwise the text after ``vplus/`` when it is 16 or 32 characters
      long, also tried after cutting a ``#...`` fragment and then a
      ``/videos...`` suffix;
    * as a last resort the page is downloaded and the ``id`` field of its
      ``var USER_INFO = {...}`` script literal is returned.

    With ``is_qq`` true the page is always downloaded and the ``name``,
    ``id`` and ``number`` fields of ``USER_INFO`` are returned together.

    Args:
        releaserUrl: URL string to inspect (and possibly download).
        is_qq: Selects the dict-returning page-scrape mode described above.
        **kwargs: Ignored.

    Returns:
        ``is_qq`` false: the id string, or ``None`` on any failure
        (including network errors and timeouts).
        ``is_qq`` true: ``{'releaser', 'releaser_id', 'number_id'}`` dict, or
        ``None`` when the page lacks the expected fields.

    Raises:
        requests.RequestException: only in ``is_qq`` mode, when the download
            itself fails; that call was never guarded and still is not.
    """
    id_lengths = (16, 32)
    # Bound the fallback download so a stalled server cannot hang the caller;
    # the original call had no timeout at all.
    page_timeout = 10

    def download(timeout):
        response = requests.get(releaserUrl, timeout=timeout)
        response.encoding = 'utf-8'
        return response.text

    def user_info_block(page):
        # Raises IndexError when the page has no USER_INFO literal.
        return re.findall(r"var USER_INFO = ({.*?})", page, flags=re.DOTALL)[0]

    if is_qq:
        page = download(timeout=1)
        try:
            user_info = user_info_block(page)
            return {
                'releaser': re.findall(r"name: '(.*)',", user_info)[0],
                'releaser_id': re.findall(r"id: '(.*)',", user_info)[0],
                "number_id": re.findall(r"number: '(.*)',", user_info)[0],
            }
        except IndexError:
            return None

    try:
        if "id=" in releaserUrl:
            return re.findall(r"id=(.*)", releaserUrl)[0].split("&")[0]

        releaser_id = re.findall(r"vplus/(.*)", releaserUrl)[0]
        if len(releaser_id) in id_lengths:
            return releaser_id
        if "#" in releaser_id:
            releaser_id = releaser_id.split("#")[0]
            if len(releaser_id) in id_lengths:
                return releaser_id
        if "/videos" in releaser_id:
            releaser_id = releaser_id.split("/videos")[0]
            if len(releaser_id) in id_lengths:
                return releaser_id

        # The URL did not contain a usable id; scrape it from the page.
        user_info = user_info_block(download(timeout=page_timeout))
        return re.findall(r"id: '(.*)',", user_info)[0]
    except Exception:
        # Best-effort lookup: malformed URLs, network failures and unexpected
        # page layouts all mean "no id".
        return None



def new_tudou(releaserUrl, **kwargs):
    """Extract the releaser id from a Tudou channel URL.

    The query string (everything from the first ``?``) is dropped and every
    ``=`` character is removed before parsing.  Then:

    * if the URL contains ``videos``, the segment between ``i/`` and
      ``/videos`` is returned;
    * if it ends with ``/``, everything after ``i/`` (without the trailing
      slash) is returned;
    * otherwise the last ``/``-separated segment is returned.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The id string, or ``None`` when the URL is empty or has the
        ``videos`` marker without an ``i/.../videos`` path.
    """
    path = releaserUrl.split("?")[0].replace("=", "")
    try:
        if 'videos' in path:
            matched = ' '.join(re.findall(r'i/.*/videos', path))
            # An empty match has no second segment -> IndexError -> None.
            return matched.split('/')[1]
        if path[-1] == "/":
            return ''.join(re.findall(r'i/(.*)', path[:-1]))
        return path.split("/")[-1]
    except (IndexError, TypeError):
        return None

def douyin(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Douyin ``user/<digits>`` URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The digits following ``user/`` as a string.  When the URL has no such
        segment (or is not a string) the input is printed and ``None`` is
        returned.
    """
    try:
        found = re.search(r"user/(\d+)", releaserUrl)
    except TypeError:
        # Non-string input is treated like a URL without an id.
        found = None

    if found is None:
        print(releaserUrl)
        return None
    return found.group(1)


def tencent_news(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Tencent News URL.

    The input is converted with ``str()`` first.  The first applicable rule
    decides the result:

    * URL contains ``media/``: digits following ``media/``.
    * URL contains ``people/``: digits following ``people/``.
    * otherwise: digits following ``chlid=``.

    Args:
        releaserUrl: URL (any object; it is stringified).
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The id as a string of digits.  On failure the historical sentinels
        are kept: ``False`` when a ``media/`` or ``people/`` URL has no
        digits, ``None`` when no ``chlid=<digits>`` is found.
    """
    releaserUrl = str(releaserUrl)

    # The former ``[0-9]+`` retry after a failed ``\d+`` search could never
    # succeed, so a single search per marker is sufficient.
    for marker in ("media/", "people/"):
        if marker in releaserUrl:
            found = re.search(marker + r"(\d+)", releaserUrl)
            return found.group(1) if found else False

    found = re.search(r"chlid=(\d+)", releaserUrl)
    return found.group(1) if found else None


def miaopai(releaserUrl,**kwargs):
    if 'n.miaopai.com' in releaserUrl:
        releaser_id_str = releaserUrl.split('/')[-1]
        releaser_id = releaser_id_str.split('.htm')[0]
        return releaser_id
    else:
        print("input illegal releaserUrl %s" % releaserUrl)
        return None

def kwai(releaserUrl, **kwargs):
    """Extract the releaser id from a Kwai URL containing ``3x``.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        * ``None`` when the URL lacks ``3x`` or has neither ``profile`` nor
          ``/u/``.
        * For ``profile`` URLs, everything after ``/profile/``.
        * For ``/u/`` URLs, the text between ``/u/`` and the last following
          ``/``.
        * ``""`` when the marker is present but its pattern does not match.
    """
    if "3x" not in releaserUrl:
        return None

    if "profile" in releaserUrl:
        pattern = r"/profile/(.+)"
    elif "/u/" in releaserUrl:
        pattern = r"/u/(.+)/"
    else:
        return None

    found = re.search(pattern, releaserUrl)
    return found.group(1) if found else ""


def wangyi_news(releaserUrl, **kwargs):
    """Extract the releaser id from a NetEase News URL.

    The first marker found in the URL selects the pattern; later markers are
    not consulted even if that pattern fails:

    * ``/sub/``  -> text between ``/sub/`` and ``.html``
    * ``video``  -> text between ``/list/`` and ``/video``
    * ``all``    -> text between ``/list/`` and ``/all``

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The captured id string, or ``None`` when no marker is present or the
        selected pattern does not match.
    """
    rules = (
        ("/sub/", r"/sub/(.+)\.html"),
        ("video", r"/list/(.+)/video"),
        ("all", r"/list/(.+)/all"),
    )
    for marker, pattern in rules:
        if marker in releaserUrl:
            found = re.search(pattern, releaserUrl)
            return found.group(1) if found else None
    return None


def weixin(releaserUrl, **kwargs):
    """Extract the ``biz`` value from a WeChat URL.

    Args:
        releaserUrl: URL string, or an already-extracted id.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The text following the first ``biz=`` up to the next ``&`` or the end
        of the string.  When there is no ``biz=`` parameter the input is
        returned unchanged.
    """
    # ``[^&]*`` also accepts ``biz=`` as the final parameter; the previous
    # pattern required a trailing ``&`` and raised IndexError without one.
    found = re.search(r"biz=([^&]*)", releaserUrl)
    if found:
        return found.group(1)
    return releaserUrl


def pearvideo(releaserUrl, **kwargs):
    """Extract the numeric releaser id from a Pear Video ``author_<digits>`` URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        The digits following ``author_`` as a string, or ``None`` when the
        URL has no such segment (previously a URL containing ``author``
        without ``author_<digits>`` raised ``IndexError``).
    """
    found = re.search(r"author_(\d+)", releaserUrl)
    return found.group(1) if found else None


def weibo(releaserUrl, **kwargs):
    """Extract the releaser id from a Weibo URL or bare id.

    Rules, first applicable wins:

    * ``/u/<digits>``: the digits.
    * ``/p/<digits>``: the digits; when there are 15 or more of them the
      first six are dropped.
    * any other string containing ``/``: the first digit run.
    * a string without ``/``: parsed with ``int()``.

    Args:
        releaserUrl: URL string or bare numeric id.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        A string of digits for URL inputs, an ``int`` for bare ids (the
        historical return type of that branch is preserved), or ``None``
        when nothing can be extracted.
    """
    try:
        if "/u/" in releaserUrl:
            return re.findall(r"/u/(\d+)", releaserUrl)[0]

        if "/p/" in releaserUrl:
            page_id = re.findall(r"/p/(\d+)", releaserUrl)[0]
            return page_id[6:] if len(page_id) >= 15 else page_id

        if "/" in releaserUrl:
            return re.findall(r"(\d+)", releaserUrl)[0]

        return int(releaserUrl)
    except (IndexError, TypeError, ValueError):
        # No digits found, non-string input, or a non-numeric bare id.
        return None


def douban(releaserUrl, **kwargs):
    """Extract the releaser id from a Douban ``people/`` URL.

    Args:
        releaserUrl: URL string to inspect.
        **kwargs: Ignored; the dispatcher passes ``is_qq`` to every extractor.

    Returns:
        Everything after the first ``people/`` up to the end of the line
        (a trailing ``/`` is kept, as before), or ``None`` when the URL has
        no ``people/`` segment (previously this raised
        ``UnboundLocalError``).
    """
    found = re.search(r"people/(.*)", releaserUrl)
    return found.group(1) if found else None


# Dispatch table: platform label -> function that pulls the releaser id out
# of a URL for that platform.  Some platforms are registered under more than
# one label (English and Chinese) and share a single extractor.
plantform_func = {
    "toutiao": toutiao,
    "西瓜": toutiao,  # Xigua
    "haokan": haokan,
    "腾讯视频": tengxunshipin,  # Tencent Video
    "new_tudou": new_tudou,
    "腾讯新闻": tencent_news,  # Tencent News
    "miaopai": miaopai,
    "kwai": kwai,
    "快手": kwai,  # Kuaishou
    "网易新闻": wangyi_news,  # NetEase News
    "抖音": douyin,  # Douyin
    "weixin": weixin,
    "weibo": weibo,
    "pearvideo": pearvideo,
    "douban": douban,
}


def get_releaser_id(platform=None, releaserUrl=None, is_qq=False):
    """Look up the extractor for *platform* and run it on *releaserUrl*.

    Args:
        platform: Key into ``plantform_func``.
        releaserUrl: URL (or raw id) handed to the extractor.
        is_qq: Forwarded to the extractor as a keyword argument.

    Returns:
        The extractor's result when it is truthy.  ``None`` when either
        argument is falsy, the platform is not registered, the extractor
        returns a falsy value, or the extractor raises; the last two cases
        also print a diagnostic line.
    """
    if not (platform and releaserUrl):
        return None

    extractor = plantform_func.get(platform)
    if extractor is None:
        return None

    try:
        res = extractor(releaserUrl, is_qq=is_qq)
    except Exception:
        # Extractors are best-effort; any failure means "no id".  Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        print(platform, releaserUrl, "can't find releaser_id")
        return None

    if res:
        return res
    print(platform, releaserUrl, "can't get releaser_id")
    return None


if __name__ == "__main__":
    # Running this module directly is a no-op.  The commented-out lines that
    # follow are a leftover manual driver that read platform/releaserUrl
    # pairs from a CSV file and printed get_releaser_id() for each row.
    pass
    # file = r'D:\wxfile\WeChat Files\litaolemo\FileStorage\File\2019-11\微博微信账号查找情况.csv'
    # with open(file, 'r')as f:
    #     head = f.readline()
    #     head_list = head.strip().split(',')
    #     for i in f:
    #         line_list = i.strip().split(',')
    #         line_dict = dict(zip(head_list, line_list))
    #         platform = line_dict['platform']
    #         releaserUrl = line_dict['releaserUrl']
    #         print(get_releaser_id(platform=platform,releaserUrl=releaserUrl))
    # print(releaser_id)

    # print(weibo("https://weibo.com/1656058115"))