combine all the rumble extractors, add rumble to generic.py, remove UsaWatchdogStory

This commit is contained in:
Glenn Pavlovic 2023-01-29 18:20:46 -08:00
parent 7bb8d94184
commit a777aeeda0
4 changed files with 109 additions and 110 deletions

View File

@ -1049,11 +1049,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
from .rtvnh import RTVNHIE from .rtvnh import RTVNHIE
from .rtvs import RTVSIE from .rtvs import RTVSIE
from .ruhd import RUHDIE from .ruhd import RUHDIE
from .rumble import ( from .rumble import RumbleIE
RumbleEmbedIE,
RumblePageIE,
RumblePlaylistIE,
)
from .rutube import ( from .rutube import (
RutubeIE, RutubeIE,
RutubeChannelIE, RutubeChannelIE,
@ -1418,10 +1414,7 @@ from .urort import UrortIE
from .urplay import URPlayIE from .urplay import URPlayIE
from .usanetwork import USANetworkIE from .usanetwork import USANetworkIE
from .usatoday import USATodayIE from .usatoday import USATodayIE
from .usawatchdog import ( from .usawatchdog import UsaWatchdogIE
UsaWatchdogStoryIE,
UsaWatchdogIE,
)
from .ustream import UstreamIE, UstreamChannelIE from .ustream import UstreamIE, UstreamChannelIE
from .ustudio import ( from .ustudio import (
UstudioIE, UstudioIE,

View File

@ -132,6 +132,7 @@ from .kinja import KinjaEmbedIE
from .arcpublishing import ArcPublishingIE from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE from .medialaan import MedialaanIE
from .simplecast import SimplecastIE from .simplecast import SimplecastIE
from .rumble import RumbleIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -3499,6 +3500,10 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
rumble_urls = RumbleIE.rumble_embedded_id(webpage)
if rumble_urls is not None:
return self.playlist_result(rumble_urls) if len(rumble_urls) > 1 else rumble_urls[0]
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:

View File

@ -9,16 +9,99 @@ from ..utils import (
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
try_get, try_get,
ExtractorError,
) )
class rumbleBase(InfoExtractor): class RumbleIE(InfoExtractor):
# Regular expressions recognised by this extractor, keyed by tag.
# Each entry holds the pattern source under 're' and a slot under
# 'compiled' that starts as None and is filled in by get_re() the first
# time the tag is requested.
RE_DICT = {
    # Embed (iframe) URL; an optional "<prefix>." may precede the video id.
    'iframe_url': {
        're': r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)',
        'compiled': None},
    # Regular video page URL ending in ".html".
    'jscript_url': {
        're': r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html',
        'compiled': None},
    # Channel ("/c/<id>") or user ("/user/<id>") listing URL.
    # The dot in the host name is escaped so that only "rumble.com" is
    # accepted; an unescaped "." would also match e.g. "rumbleXcom".
    'list_url': {
        're': r'https?://rumble\.com/(?:c|user)/(?P<id>[^/]+)',
        'compiled': None},
    # Inline JS player call: Rumble("play", {... "video": "<id>" ...}).
    'jscript_id': {
        're': r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)',
        'compiled': None}
}
# Test cases for the URL shapes this extractor accepts: an embed URL, a
# video page URL, a channel listing, and an embed URL with a dotted prefix.
_TESTS = [
# Embed URL: the video id is the last path component.
{
'url': 'https://rumble.com/embed/v5pv5f',
'md5': '36a18a049856720189f30977ccbb2c34',
'info_dict': {
'id': 'v5pv5f',
'ext': 'mp4',
'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
'timestamp': 1571611968,
'upload_date': '20191020',
}
},
# Video page URL: expected to yield the same id, md5 and metadata as the
# embed URL above.
{
'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html',
'md5': '36a18a049856720189f30977ccbb2c34',
'info_dict': {
'id': 'v5pv5f',
'ext': 'mp4',
'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
'timestamp': 1571611968,
'upload_date': '20191020',
}
},
# Channel listing: expected to produce a playlist with at least 25 entries.
{
'url': 'https://rumble.com/c/PeakProsperity',
'playlist_mincount': 25,
'info_dict': {
'id': 'PeakProsperity',
}
},
# Embed URL with a "<prefix>." before the video id.
# NOTE(review): 'only_matching' presumably limits this case to URL matching
# without a download -- confirm against the test harness.
{
'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
'only_matching': True,
}
]
# Return the compiled regular expression registered under `tag` in RE_DICT.
# The pattern is compiled on the first request and stored back into
# RE_DICT[tag]['compiled'], so later calls reuse the same compiled object.
# An unknown `tag` raises KeyError from the dictionary lookup.
@classmethod
def get_re(cls, tag):
# Compile lazily: only when no compiled object has been stored yet.
if cls.RE_DICT[tag]['compiled'] is None:
cls.RE_DICT[tag]['compiled'] = re.compile(cls.RE_DICT[tag]['re'])
return cls.RE_DICT[tag]['compiled']
# Report whether `url` is one this extractor handles: True when the URL
# matches, from the start of the string, any of the 'jscript_url',
# 'list_url' or 'iframe_url' patterns held in RE_DICT; False otherwise.
# NOTE(review): presumably this replaces the base class's URL check, since
# no single _VALID_URL is visible on this class -- confirm against
# InfoExtractor.
@classmethod
def suitable(cls, url):
return (cls.get_re('jscript_url').match(url) is not None or
cls.get_re('list_url').match(url) is not None or
cls.get_re('iframe_url').match(url) is not None)
@staticmethod
def rumble_embedded_id(page_data):
'''For use by extractors of sites which use embedded Rumble videos. Given
a webpage as a string, returns a list holding one
InfoExtractor.url_result() value per embedded Rumble video found, each
pointing at https://rumble.com/embed/<id> with the 'Rumble' extractor key.
Embeds made through the Rumble("play", ...) JS call are listed first,
followed by iframe embed URLs. None is returned if no embeds were found.
Duplicates are not removed.'''
embeds = []
# The JS embeds
for mobj in RumbleIE.get_re('jscript_id').finditer(page_data):
embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id')))
# The iframes embeds
for mobj in RumbleIE.get_re('iframe_url').finditer(page_data):
embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id')))
# An empty result is reported as None rather than as an empty list.
return embeds if embeds else None
def rumble_video_info(self, video_id): def rumble_video_info(self, video_id):
video = self._download_json( video = self._download_json(
'https://rumble.com/embedJS/', video_id, 'https://rumble.com/embedJS/', video_id,
query={'request': 'video', 'v': video_id}) query={'request': 'video', 'v': video_id})
if not video: if not video:
return None raise ExtractorError('Unable to locate video information.', expected=True)
title = video['title'] title = video['title']
@ -53,82 +136,22 @@ class rumbleBase(InfoExtractor):
'duration': int_or_none(video.get('duration')), 'duration': int_or_none(video.get('duration')),
} }
class RumbleEmbedIE(rumbleBase):
_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
_TESTS = [{
'url': 'https://rumble.com/embed/v5pv5f',
'md5': '36a18a049856720189f30977ccbb2c34',
'info_dict': {
'id': 'v5pv5f',
'ext': 'mp4',
'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
'timestamp': 1571611968,
'upload_date': '20191020',
}
}, {
'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) if self.get_re('jscript_url').match(url) is not None:
return self.rumble_video_info(video_id) page = self._download_webpage(url, 'Rumble Page')
video_id = self._search_regex(self.get_re('jscript_id'), page, "id")
return self.rumble_video_info(video_id)
mobj = self.get_re('list_url').match(url)
if mobj is not None:
urls = []
id = mobj.group('id')
page = self._download_webpage(url, id)
for mobj in re.finditer(r'<a class=video-item--a href=\/(?P<href>[a-zA-Z0-9\-.]+)>', page):
urls.append('https://rumble.com/' + mobj.group('href'))
class RumblePageIE(rumbleBase): return self.playlist_from_matches(urls, id)
_VALID_URL = r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html'
_TEST = {
'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html',
'md5': '36a18a049856720189f30977ccbb2c34',
'info_dict': {
'id': 'v5pv5f',
'ext': 'mp4',
'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
'timestamp': 1571611968,
'upload_date': '20191020',
}}
_RUMBLE_JS_RE = r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)' mobj = self.get_re('iframe_url').match(url)
if mobj is not None:
def _real_extract(self, url): return self.rumble_video_info(mobj.group('id'))
page = self._download_webpage(url, 'Rumble Page')
video_id = self._search_regex(self._RUMBLE_JS_RE, page, "id")
return self.rumble_video_info(video_id)
class RumblePlaylistIE(rumbleBase):
_VALID_URL = r'https?://rumble.com/(?:c|user)/(?P<id>[^/]+)'
_TEST = {
'url': 'https://rumble.com/c/PeakProsperity',
'playlist_mincount': 25,
'info_dict': {
'id': 'PeakProsperity',
}}
def _real_extract(self, url):
urls = []
id = self._match_id(url)
page = self._download_webpage(url, id)
for mobj in re.finditer(r'<a class=video-item--a href=\/(?P<href>[a-zA-Z0-9\-.]+)>', page):
urls.append('https://rumble.com/' + mobj.group('href'))
return self.playlist_from_matches(urls, id)
def rumble_embedded_id(page_data):
'''For use by extractors of sites which use embedded Rumble videos. Given
a webpage as a string, returns a list holding one
InfoExtractor.url_result() value per embedded Rumble video found, each
pointing at https://rumble.com/embed/<id> with the 'RumbleEmbed' extractor
key. Embeds made through the Rumble("play", ...) JS call are listed first,
followed by iframe embed URLs. None is returned if no embeds were found.
Duplicates are not removed.'''
embeds = []
# The JS embeds
for mobj in re.finditer(RumblePageIE._RUMBLE_JS_RE, page_data):
embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id')))
# The iframes embeds
for mobj in re.finditer(RumbleEmbedIE._VALID_URL, page_data):
embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id')))
# An empty result is reported as None rather than as an empty list.
return embeds if embeds else None

View File

@ -4,30 +4,9 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from .rumble import rumble_embedded_id
class UsaWatchdogStoryIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/(?P<id>[^/]+)'
_TEST = {
'url': 'https://usawatchdog.com/cv-19-vaccine-warning-cv-19-cure-must-watch-videos/',
'md5': 'bf40e20aebca9016ca195534028cbb6f',
'info_dict': {
'id': 'vcl8gx',
'ext': 'mp4',
'timestamp': 1617141926,
'upload_date': '20210330',
'title': u'Vaccine Warning \u2013 CV-19 Cure Must Watch Videos',
}}
def _real_extract(self, url):
title = self._match_id(url)
embeds = rumble_embedded_id(self._download_webpage(url, title))
return embeds[0] if embeds is not None else None
class UsaWatchdogIE(InfoExtractor): class UsaWatchdogIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/$' _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/?$'
_TEST = { _TEST = {
'url': 'https://usawatchdog.com/', 'url': 'https://usawatchdog.com/',
'playlist_mincount': 15, 'playlist_mincount': 15,
@ -40,7 +19,6 @@ class UsaWatchdogIE(InfoExtractor):
for mobj in re.finditer(r'front-view-title[^<]+<a.+href=["\'](?P<href>https?:(?:www\.)?//usawatchdog.com/[^/]+\/?)[^>]+>(?P<title>[^<]+)', for mobj in re.finditer(r'front-view-title[^<]+<a.+href=["\'](?P<href>https?:(?:www\.)?//usawatchdog.com/[^/]+\/?)[^>]+>(?P<title>[^<]+)',
self._download_webpage(url, 'Site Root')): self._download_webpage(url, 'Site Root')):
matches.append(self.url_result(mobj.group('href'), matches.append(self.url_result(mobj.group('href'),
'UsaWatchdogStory', None, video_title=mobj.group('title').encode('utf8')))
mobj.group('title').encode('utf8')))
return self.playlist_result(matches, 'USA Watchdog') return self.playlist_result(matches, 'USA Watchdog')