diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3da5f8020..3c535c32c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1069,7 +1069,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE -from .rumble import RumbleEmbedIE +from .rumble import RumbleIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1439,6 +1439,7 @@ from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE from .usatoday import USATodayIE +from .usawatchdog import UsaWatchdogIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( UstudioIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b01900afa..de84d2b80 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -132,6 +132,7 @@ from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE +from .rumble import RumbleIE class GenericIE(InfoExtractor): @@ -3518,6 +3519,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + rumble_urls = RumbleIE.rumble_embedded_id(webpage) + if rumble_urls is not None: + return self.playlist_result(rumble_urls) if len(rumble_urls) > 1 else rumble_urls[0] + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py index 4a0225109..8a18d12b7 100644 --- a/youtube_dl/extractor/rumble.py +++ b/youtube_dl/extractor/rumble.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..compat import compat_str @@ -8,31 +9,100 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + ExtractorError, ) -class RumbleEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' - _TESTS = [{ - 'url': 'https://rumble.com/embed/v5pv5f', - 'md5': '36a18a049856720189f30977ccbb2c34', - 'info_dict': { - 'id': 'v5pv5f', - 'ext': 'mp4', - 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', - 'timestamp': 1571611968, - 'upload_date': '20191020', - } - }, { - 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', - 'only_matching': True, - }] +class RumbleIE(InfoExtractor): - def _real_extract(self, url): - video_id = self._match_id(url) + RE_DICT = { + 'iframe_url': { + 're': r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)', + 'compiled': None}, + 'jscript_url': { + 're': r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html', + 'compiled': None}, + 'list_url': { + 're': r'https?://rumble.com/(?:c|user)/(?P[^/]+)', + 'compiled': None}, + 'jscript_id': { + 're': r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P[^"\']+)', + 'compiled': None} + } + + _TESTS = [ + { + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, + { + 'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, + { + 'url': 'https://rumble.com/c/PeakProsperity', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'PeakProsperity', + } + }, + { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + } + ] + + @classmethod + def get_re(cls, tag): + if cls.RE_DICT[tag]['compiled'] is None: + cls.RE_DICT[tag]['compiled'] = re.compile(cls.RE_DICT[tag]['re']) + return cls.RE_DICT[tag]['compiled'] + + @classmethod + def suitable(cls, url): + return (cls.get_re('jscript_url').match(url) is not None + or cls.get_re('list_url').match(url) is not None + or cls.get_re('iframe_url').match(url) is not None) + + @staticmethod + def rumble_embedded_id(page_data): + '''For use by extractors of sites which use emedded Rumble videos. Given + a webpage as a string returns a list of url result dicts for each embedded + rumble video found. None is returned if no embeds were found. Duplicates + are not removed''' + + embeds = [] + # The JS embeds + for mobj in RumbleIE.get_re('jscript_id').finditer(page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id'))) + + # The iframes embeds + for mobj in RumbleIE.get_re('iframe_url').finditer(page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id'))) + + return embeds if embeds else None + + def rumble_video_info(self, video_id): video = self._download_json( 'https://rumble.com/embedJS/', video_id, query={'request': 'video', 'v': video_id}) + if not video: + raise ExtractorError('Unable to locate video information.', expected=True) + title = video['title'] formats = [] @@ -65,3 +135,23 @@ class RumbleEmbedIE(InfoExtractor): 'channel_url': author.get('url'), 'duration': int_or_none(video.get('duration')), } + + def _real_extract(self, url): + if self.get_re('jscript_url').match(url) is not None: + page = self._download_webpage(url, 'Rumble Page') + video_id = self._search_regex(self.get_re('jscript_id'), page, "id") + return self.rumble_video_info(video_id) + + mobj = self.get_re('list_url').match(url) + if mobj is not None: + urls = [] + id = mobj.group('id') + page = self._download_webpage(url, id) + for mobj in re.finditer(r'[a-zA-Z0-9\-.]+)>', page): + urls.append('https://rumble.com/' + mobj.group('href')) + + return self.playlist_from_matches(urls, id) + + mobj = self.get_re('iframe_url').match(url) + if mobj is not None: + return self.rumble_video_info(mobj.group('id')) diff --git a/youtube_dl/extractor/usawatchdog.py b/youtube_dl/extractor/usawatchdog.py new file mode 100644 index 000000000..c46849b83 --- /dev/null +++ b/youtube_dl/extractor/usawatchdog.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +import re + +from .common import InfoExtractor + + +class UsaWatchdogIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/?$' + _TEST = { + 'url': 'https://usawatchdog.com/', + 'playlist_mincount': 15, + 'info_dict': { + 'id': 'USA Watchdog', + }} + + def _real_extract(self, url): + matches = [] + for mobj in re.finditer(r'front-view-title[^<]+https?:(?:www\.)?//usawatchdog.com/[^/]+\/?)[^>]+>(?P[^<]+)', + self._download_webpage(url, 'Site Root')): + matches.append(self.url_result(mobj.group('href'), + video_title=mobj.group('title').encode('utf8'))) + + return self.playlist_result(matches, 'USA Watchdog')