# -*- coding:utf-8 -*-
# @Time : 2019/5/30 11:01
# @Author : litao
"""Extract releaser (publisher/account) IDs from platform-specific URLs.

Each platform function takes a ``releaserUrl`` and returns the ID parsed out
of it, or a falsy value when no ID can be found.  ``get_releaser_id`` is the
entry point: it dispatches on a platform name through ``plantform_func``.
"""
import re

try:
    import requests
except ImportError:
    # Only the Tencent Video page lookup needs the network; every other
    # parser is pure string handling and stays usable without ``requests``.
    requests = None

# Tencent Video releaser IDs observed in URLs are 16 or 32 characters long.
_TENCENT_ID_LENGTHS = (16, 32)
# Seconds to wait for the Tencent Video page on the non-QQ path, so that a
# stalled server cannot hang the caller forever.
_PAGE_TIMEOUT = 10


def _first_match(pattern, text):
    """Return the first ``re.findall`` result of *pattern* in *text*, or None."""
    found = re.findall(pattern, text)
    return found[0] if found else None


def _fetch_user_info(url, timeout):
    """Download *url* and return the text of its ``var USER_INFO = {...}`` block.

    Returns None when ``requests`` is unavailable, the request fails, or the
    page does not contain the block.
    """
    if requests is None:
        return None
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    response.encoding = 'utf-8'
    return _first_match(r"var USER_INFO = ({.*?})", response.text) if False else (
        (re.findall(r"var USER_INFO = ({.*?})", response.text, flags=re.DOTALL) or [None])[0]
    )


def toutiao(releaserUrl, **kwargs):
    """Return the numeric user ID from a Toutiao / 365yg / Xigua URL, or None."""
    if 'www.toutiao.com' in releaserUrl or 'www.365yg.com' in releaserUrl:
        # Try the known URL shapes in order of preference.
        return (_first_match(r"user/([0-9]+)", releaserUrl)
                or _first_match(r"to_user_id=([0-9]+)", releaserUrl)
                or _first_match(r"/m(\d+)", releaserUrl))
    if 'm.toutiao.com' in releaserUrl:
        # Previously an unmatched URL hit an unbound local; now it yields None.
        return _first_match(r"profile/([0-9]+)", releaserUrl)
    if 'm.365yg.com' in releaserUrl:
        return _first_match(r"to_user_id=([0-9]+)", releaserUrl)
    if "user_id" in releaserUrl:
        return _first_match(r"user_id=(\d+)", releaserUrl)
    if "ixigua" in releaserUrl:
        return _first_match(r"/(\d+)", releaserUrl)
    return None


def haokan(releaserUrl, **kwargs):
    """Return the numeric app ID from a Haokan URL, or None if absent."""
    if "app_id=" in releaserUrl:
        # First digit run after "app_id=" only, so that digits in later
        # query parameters are not glued onto the ID.
        return _first_match(r"app_id=\D*(\d+)", releaserUrl)
    if "app_id" in releaserUrl:
        # JSON-style parameter: the ID is quoted, either URL-encoded (%22)
        # or with literal double quotes.
        return (_first_match(r"%22(\d+)%22", releaserUrl)
                or _first_match(r'"(\d+)"', releaserUrl))
    return _first_match(r"(\d+)", releaserUrl)


def tengxunshipin(releaserUrl, is_qq=False, **kwargs):
    """Return the releaser ID for a Tencent Video URL.

    With ``is_qq`` false, the ID is parsed from the URL when possible and the
    page is fetched only as a last resort; the result is a string or None.

    With ``is_qq`` true, the page is always fetched and a dict with the keys
    ``releaser``, ``releaser_id`` and ``number_id`` is returned, or None when
    the page cannot be fetched or parsed.
    """
    if is_qq:
        user_info = _fetch_user_info(releaserUrl, timeout=1)
        if user_info is None:
            return None
        releaser = _first_match(r"name: '(.*)',", user_info)
        releaser_id = _first_match(r"id: '(.*)',", user_info)
        number_id = _first_match(r"number: '(.*)',", user_info)
        if releaser is None or releaser_id is None or number_id is None:
            return None
        return {'releaser': releaser, 'releaser_id': releaser_id, "number_id": number_id}

    if not isinstance(releaserUrl, str):
        return None
    if "id=" in releaserUrl:
        releaser_id = _first_match(r"id=(.*)", releaserUrl)
        if releaser_id is None:
            return None
        # Drop any query parameters that follow the ID.
        return releaser_id.split("&")[0]

    candidate = _first_match(r"vplus/(.*)", releaserUrl)
    if candidate is None:
        return None
    if len(candidate) in _TENCENT_ID_LENGTHS:
        return candidate
    # Strip a fragment and then a "/videos" suffix, re-checking after each.
    for separator in ("#", "/videos"):
        if separator in candidate:
            candidate = candidate.split(separator)[0]
            if len(candidate) in _TENCENT_ID_LENGTHS:
                return candidate
    # The URL holds a vanity name rather than an ID: read it from the page.
    user_info = _fetch_user_info(releaserUrl, timeout=_PAGE_TIMEOUT)
    if user_info is None:
        return None
    return _first_match(r"id: '(.*)',", user_info)


def new_tudou(releaserUrl, **kwargs):
    """Return the releaser ID from a Tudou channel URL, or None on failure.

    The query string and every ``=`` character are removed before parsing.
    """
    if "?" in releaserUrl:
        releaserUrl = releaserUrl.split("?")[0]
    if "=" in releaserUrl:
        releaserUrl = releaserUrl.replace("=", "")
    try:
        if 'videos' in releaserUrl:
            channel_path = ' '.join(re.findall(r'i/.*/videos', releaserUrl))
            return channel_path.split('/')[1]
        if releaserUrl[-1] == "/":
            return ''.join(re.findall(r'i/(.*)', releaserUrl[0:-1]))
        return releaserUrl.split("/")[-1]
    except IndexError:
        # Empty URL, or a "videos" URL without the expected "i/<id>/videos".
        return None


def douyin(releaserUrl, **kwargs):
    """Return the numeric user ID from a Douyin URL; print the URL and return None if absent."""
    try:
        matches = re.findall(r"user/(\d+)", releaserUrl)
    except TypeError:
        matches = []
    if not matches:
        print(releaserUrl)
        return None
    return matches[0]


def tencent_news(releaserUrl, **kwargs):
    """Return the numeric ID from a Tencent News URL.

    ``media/`` and ``people/`` URLs yield the ID or False; any other URL
    yields the ``chlid`` query value or None.
    """
    releaserUrl = str(releaserUrl)
    if "media/" in releaserUrl:
        return _first_match(r"media/(\d+)", releaserUrl) or False
    if "people/" in releaserUrl:
        return _first_match(r"people/(\d+)", releaserUrl) or False
    return _first_match(r"chlid=(\d+)", releaserUrl)


def miaopai(releaserUrl, **kwargs):
    """Return the last path segment (without ``.htm``) of an n.miaopai.com URL, else None."""
    if 'n.miaopai.com' not in releaserUrl:
        print("input illegal releaserUrl %s" % releaserUrl)
        return None
    last_segment = releaserUrl.split('/')[-1]
    return last_segment.split('.htm')[0]


def kwai(releaserUrl, **kwargs):
    """Return the profile ID from a Kuaishou URL containing ``3x``.

    Returns "" when the URL has ``profile`` or ``/u/`` but the pattern does
    not match, and None otherwise.
    """
    if "3x" not in releaserUrl:
        return None
    if "profile" in releaserUrl:
        return _first_match(r"/profile/(.+)", releaserUrl) or ""
    if "/u/" in releaserUrl:
        return _first_match(r"/u/(.+)/", releaserUrl) or ""
    return None


def wangyi_news(releaserUrl, **kwargs):
    """Return the subscription/list ID from a NetEase News URL, or None."""
    if "/sub/" in releaserUrl:
        return _first_match(r"/sub/(.+)\.html", releaserUrl)
    if "video" in releaserUrl:
        return _first_match(r"/list/(.+)/video", releaserUrl)
    if "all" in releaserUrl:
        return _first_match(r"/list/(.+)/all", releaserUrl)
    return None


def weixin(releaserUrl, **kwargs):
    """Return the ``biz`` value from a WeChat URL.

    A value without ``biz`` is returned unchanged.  If ``biz`` is present but
    has no ``biz=`` assignment, None is returned.
    """
    if "biz" not in releaserUrl:
        return releaserUrl
    # Stop at "&" or "#" so the ID is also found when biz is the last parameter.
    return _first_match(r"biz=([^&#]*)", releaserUrl)


def pearvideo(releaserUrl, **kwargs):
    """Return the numeric author ID from a Pear Video URL, or None."""
    if "author" not in releaserUrl:
        return None
    return _first_match(r"author_(\d+)", releaserUrl)


def weibo(releaserUrl, **kwargs):
    """Return the user ID from a Weibo URL or bare numeric ID, or None.

    URL forms return the ID as a string; a bare numeric string (no ``/``) is
    returned as an ``int``, as callers may rely on that.
    """
    if not isinstance(releaserUrl, str):
        return None
    if "/u/" in releaserUrl:
        return _first_match(r"/u/(\d+)", releaserUrl)
    if "/p/" in releaserUrl:
        page_id = _first_match(r"/p/(\d+)", releaserUrl)
        # Page IDs of 15+ digits carry a 6-digit prefix before the user ID.
        if page_id is not None and len(page_id) >= 15:
            page_id = page_id[6:]
        return page_id
    if "/" in releaserUrl:
        return _first_match(r"(\d+)", releaserUrl)
    try:
        return int(releaserUrl)
    except ValueError:
        return None


def douban(releaserUrl, **kwargs):
    """Return everything after ``people/`` in a Douban URL, or None.

    NOTE(review): a trailing slash is kept as part of the result; confirm
    with callers before trimming it.
    """
    if "people/" in releaserUrl:
        return _first_match(r"people/(.*)", releaserUrl)
    return None


# Platform name -> parser.  The misspelled name is kept for compatibility.
plantform_func = {
    "toutiao": toutiao,
    "西瓜": toutiao,
    "haokan": haokan,
    "腾讯视频": tengxunshipin,
    "new_tudou": new_tudou,
    "腾讯新闻": tencent_news,
    "miaopai": miaopai,
    "kwai": kwai,
    "快手": kwai,
    "网易新闻": wangyi_news,
    "抖音": douyin,
    "weixin": weixin,
    "weibo": weibo,
    "pearvideo": pearvideo,
    "douban": douban,
}


def get_releaser_id(platform=None, releaserUrl=None, is_qq=False):
    """Return the releaser ID for *releaserUrl* on *platform*, or None.

    Returns None when either argument is empty, the platform is unknown, the
    parser raises, or the parser returns a falsy value.  The last two cases
    also print a diagnostic line.  ``is_qq`` is forwarded to the parser and
    is only used by the Tencent Video one.
    """
    if not (platform and releaserUrl):
        return None
    func = plantform_func.get(platform)
    if func is None:
        return None
    try:
        res = func(releaserUrl, is_qq=is_qq)
    except Exception:
        # Boundary handler: a malformed URL must not crash the caller.
        print(platform, releaserUrl, "can't find releaser_id")
        return None
    if res:
        return res
    print(platform, releaserUrl, "can't get releaser_id")
    return None


if __name__ == "__main__":
    # Example: print(weibo("https://weibo.com/1656058115"))
    pass