diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index ed1a33cf2..a29e0d8df 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2448,12 +2448,12 @@ try:
 except ImportError:
     import BaseHTTPServer as compat_http_server
 
+# urllib.parse
 try:
     from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
     from urllib.parse import unquote as compat_urllib_parse_unquote
     from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
     from urllib.parse import urlencode as compat_urllib_parse_urlencode
-    from urllib.parse import parse_qs as compat_parse_qs
 except ImportError:  # Python 2
     _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
                 else re.compile(r'([\x00-\x7f]+)'))
@@ -2543,60 +2543,80 @@ except ImportError:  # Python 2
 
         return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq)
 
-    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
-    # Python 2's version is apparently totally broken
-    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
-                   encoding='utf-8', errors='replace'):
-        qs, _coerce_result = qs, compat_str
-        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
-        r = []
-        for name_value in pairs:
-            if not name_value and not strict_parsing:
-                continue
-            nv = name_value.split('=', 1)
-            if len(nv) != 2:
-                if strict_parsing:
-                    raise ValueError('bad query field: %r' % (name_value,))
-                # Handle case of a control-name with no equal sign
-                if keep_blank_values:
-                    nv.append('')
-                else:
-                    continue
-            if len(nv[1]) or keep_blank_values:
-                name = nv[0].replace('+', ' ')
-                name = compat_urllib_parse_unquote(
-                    name, encoding=encoding, errors=errors)
-                name = _coerce_result(name)
-                value = nv[1].replace('+', ' ')
-                value = compat_urllib_parse_unquote(
-                    value, encoding=encoding, errors=errors)
-                value = _coerce_result(value)
-                r.append((name, value))
-        return r
-
-    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
-                        encoding='utf-8', errors='replace'):
-        parsed_result = {}
-        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
-                           encoding=encoding, errors=errors)
-        for name, value in pairs:
-            if name in parsed_result:
-                parsed_result[name].append(value)
-            else:
-                parsed_result[name] = [value]
-        return parsed_result
-
     setattr(compat_urllib_parse, '_urlencode',
             getattr(compat_urllib_parse, 'urlencode'))
     for name, fix in (
             ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes),
             ('parse_unquote', compat_urllib_parse_unquote),
             ('unquote_plus', compat_urllib_parse_unquote_plus),
-            ('urlencode', compat_urllib_parse_urlencode),
-            ('parse_qs', compat_parse_qs)):
+            ('urlencode', compat_urllib_parse_urlencode)):
         setattr(compat_urllib_parse, name, fix)
 
-compat_urllib_parse_parse_qs = compat_parse_qs
+finally:
+    try:
+        # arguments changed in 3.8 and 3.10
+        from urllib.parse import parse_qs as _parse_qs
+        _parse_qs('a=b', separator='&')
+        compat_parse_qs = _parse_qs
+    except (ImportError, TypeError):  # Python 2, < 3.10
+        # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
+        # Python 2's version is apparently totally broken
+        # Also use this implementation for Py < 3.10:
+        # * support only default separator '&', not r'[&;]', like 3.10+
+        # * support max_num_fields, like 3.8+
+        def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
+                       encoding='utf-8', errors='replace',
+                       max_num_fields=None, separator='&'):
+            if not isinstance(separator, (compat_str, str)):
+                raise ValueError('Separator must be of type string or bytes')
+            # DoS protection, if anyone cares
+            if qs and max_num_fields is not None and qs.count(separator) >= max_num_fields:
+                raise ValueError('Too many fields')
+            _coerce_result = compat_str
+            r = []
+            for name_value in qs.split(separator):
+                if not name_value and not strict_parsing:
+                    continue
+                nv = name_value.split('=', 1)
+                if len(nv) != 2:
+                    if strict_parsing:
+                        raise ValueError('bad query field: %r' % (name_value,))
+                    # Handle case of a control-name with no equal sign
+                    if keep_blank_values:
+                        nv.append('')
+                    else:
+                        continue
+                if len(nv[1]) or keep_blank_values:
+                    name = nv[0].replace('+', ' ')
+                    name = compat_urllib_parse_unquote(
+                        name, encoding=encoding, errors=errors)
+                    name = _coerce_result(name)
+                    value = nv[1].replace('+', ' ')
+                    value = compat_urllib_parse_unquote(
+                        value, encoding=encoding, errors=errors)
+                    value = _coerce_result(value)
+                    r.append((name, value))
+            return r
+
+        def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
+                            encoding='utf-8', errors='replace',
+                            max_num_fields=None, separator='&'):
+            parsed_result = {}
+            pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
+                               encoding, errors, max_num_fields, separator)
+            for name, value in pairs:
+                if name in parsed_result:
+                    parsed_result[name].append(value)
+                else:
+                    parsed_result[name] = [value]
+            return parsed_result
+
+        for name, fix in (
+                ('parse_qs', compat_parse_qs),
+                ('parse_qsl', _parse_qsl)):
+            setattr(compat_urllib_parse, name, fix)
+
+    compat_urllib_parse_parse_qs = compat_parse_qs
 
 try:
     from urllib.request import DataHandler as compat_urllib_request_DataHandler
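For review context, a minimal sketch of the behaviour the backport above is meant to guarantee on every Python version (not part of the patch; assumes the module is importable as `youtube_dl.compat`):

```python
# Sketch: expected semantics of the backported compat_parse_qs.
from youtube_dl.compat import compat_parse_qs

# 3.10+ semantics: fields split only on the given separator, so ';' is literal.
assert compat_parse_qs('a=1;b=2') == {'a': ['1;b=2']}

# Blank values are dropped unless keep_blank_values is set.
assert compat_parse_qs('a=1&b=') == {'a': ['1']}
assert compat_parse_qs('a=1&b=', keep_blank_values=True) == {'a': ['1'], 'b': ['']}

# 3.8+ semantics: max_num_fields rejects oversized query strings (DoS guard).
try:
    compat_parse_qs('a=1&b=2&c=3', max_num_fields=2)
    raise AssertionError('expected ValueError')
except ValueError:
    pass  # raised on both the stdlib and the backported code path
```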
diff --git a/youtube_dl/extractor/bfi.py b/youtube_dl/extractor/bfi.py
index 60c8944b5..cf4512caa 100644
--- a/youtube_dl/extractor/bfi.py
+++ b/youtube_dl/extractor/bfi.py
@@ -4,7 +4,12 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import extract_attributes
+from ..utils import (
+    extract_attributes,
+    parse_qs,
+    remove_start,
+    smuggle_url,
+)
 
 
 class BFIPlayerIE(InfoExtractor):
@@ -12,26 +17,39 @@ class BFIPlayerIE(InfoExtractor):
     _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
     _TEST = {
         'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
-        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
+        'md5': '15598bdd6a413ce9363970754f054d76',
         'info_dict': {
             'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
             'ext': 'mp4',
             'title': 'Computer Doctor',
             'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
+            'timestamp': 1564424975,
+            'upload_date': '20190729',
+            'uploader_id': '6057949427001',
         },
-        'skip': 'BFI Player films cannot be played outside of the UK',
+        # 'skip': 'BFI Player films cannot be played outside of the UK',
     }
+    _BRIGHTCOVE_ACCOUNT_ID = '6057949427001'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        entries = []
-        for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage):
-            player_attr = extract_attributes(player_el)
-            ooyala_id = player_attr.get('data-video-id')
-            if not ooyala_id:
-                continue
-            entries.append(self.url_result(
-                'ooyala:' + ooyala_id, 'Ooyala',
-                ooyala_id, player_attr.get('data-label')))
-        return self.playlist_result(entries)
+
+        film_only = 'play-film' in parse_qs(url, keep_blank_values=True)
+
+        def entries():
+            for player_el in re.finditer(r'(?s)<video-js[^>]+>', webpage):
+                player_attr = extract_attributes(player_el.group(0))
+                bcv_id, account_id, player_id, embed = (
+                    player_attr.get(x) for x in ('data-ref-id', 'data-acid', 'data-pid', 'data-embed'))
+                if not bcv_id:
+                    continue
+                if film_only and player_attr.get('data-video-type') != 'film':
+                    continue
+                bc_url = 'brightcove:new:%s:%s:%s:video:ref:%s' % (
+                    account_id or self._BRIGHTCOVE_ACCOUNT_ID, player_id or 'default', embed or 'default', bcv_id)
+
+                yield self.url_result(smuggle_url(
+                    bc_url, {'referrer': url, 'force_videoid': remove_start(bcv_id, 'ref:')}),
+                    ie='BrightcoveNew', video_id=video_id)
+
+        return self.playlist_result(entries())
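To make the hand-off concrete: `entries()` emits smuggled `brightcove:new:` URLs, which the widened `_VALID_URL` in the next file accepts alongside the usual player URL; the smuggled dict carries the referrer and the forced video ID. A round-trip sketch with invented IDs (only `smuggle_url`/`unsmuggle_url` from `youtube_dl.utils` are real):

```python
from youtube_dl.utils import smuggle_url, unsmuggle_url

# Built the same way as bc_url above; account/player/embed/ref values are made up.
bc_url = 'brightcove:new:%s:%s:%s:video:ref:%s' % (
    '6057949427001', 'default', 'default', 'some-ref-id')

smuggled = smuggle_url(bc_url, {
    'referrer': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
    'force_videoid': 'some-ref-id',
})
url, data = unsmuggle_url(smuggled, default={})
assert url == bc_url
assert data['force_videoid'] == 'some-ref-id'
```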
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 6022076ac..ad1f86b89 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -340,7 +340,7 @@ class BrightcoveLegacyIE(InfoExtractor):
 
 class BrightcoveNewIE(AdobePassIE):
     IE_NAME = 'brightcove:new'
-    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
+    _VALID_URL = r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)(?P<video_id>\d+|ref:[^&]+)'
     _TESTS = [{
         'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
         'md5': 'c8100925723840d4b0d243f7025703be',
@@ -593,7 +593,7 @@ class BrightcoveNewIE(AdobePassIE):
                 'ip_blocks': smuggled_data.get('geo_ip_blocks'),
             })
 
-        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
+        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()[1:]
 
         policy_key_id = '%s_%s' % (account_id, player_id)
         policy_key = self._downloader.cache.load('brightcove', policy_key_id)
@@ -678,4 +678,4 @@ class BrightcoveNewIE(AdobePassIE):
             json_data.get('description'))
 
         return self._parse_brightcove_metadata(
-            json_data, video_id, headers=headers)
+            json_data, smuggled_data.get('force_videoid') or video_id, headers=headers)
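Sanity check on the conditional-group pattern: both the classic player URL and the new internal scheme should match, and `groups()[1:]` drops the `u` scheme marker either way. A standalone sketch using the pattern as reconstructed above (test IDs are invented):

```python
import re

VALID_URL = (
    r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)'
    r'(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)'
    r'(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)'
    r'(?P<video_id>\d+|ref:[^&]+)')

for url in (
        'http://players.brightcove.net/929656772001/default_default/index.html?videoId=4463358922001',
        'brightcove:new:929656772001:default:default:video:4463358922001'):
    m = re.match(VALID_URL, url)
    # groups()[1:] == (account_id, player_id, embed, content_type, video_id)
    assert m.groups()[1:] == ('929656772001', 'default', 'default', 'video', '4463358922001')
```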
""" @@ -2421,6 +2421,7 @@ class ExtractorError(YoutubeDLError): self.exc_info = sys.exc_info() # preserve original exception self.cause = cause self.video_id = video_id + self.ie = ie def format_traceback(self): if self.traceback is None: @@ -6561,3 +6562,24 @@ def join_nonempty(*values, **kwargs): if from_dict is not None: values = (traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(compat_str, filter(None, values))) + + +class classproperty(object): + """property access for class methods with optional caching""" + def __new__(cls, *args, **kwargs): + func = args[0] if len(args) > 0 else kwargs.get('func') + if not func: + return functools.partial(cls, *args, **kwargs) + return super(classproperty, cls).__new__(cls) + + def __init__(self, func, cache=False): + functools.update_wrapper(self, func) + self.func = func + self._cache = {} if cache else None + + def __get__(self, _, cls): + if self._cache is None: + return self.func(cls) + elif cls not in self._cache: + self._cache[cls] = self.func(cls) + return self._cache[cls]