Compare commits
5 commits: 0e08823ff3 ... bd9918040d

Commits:
- bd9918040d
- d7b502a727
- 74e39ca0fd
- dc990a61cc
- 9bbe366275
youtube_dl/compat.py

@@ -2448,12 +2448,12 @@ try:
except ImportError:
    import BaseHTTPServer as compat_http_server

# urllib.parse
try:
    from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
    from urllib.parse import unquote as compat_urllib_parse_unquote
    from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
    from urllib.parse import urlencode as compat_urllib_parse_urlencode
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
                else re.compile(r'([\x00-\x7f]+)'))
@@ -2543,60 +2543,80 @@ except ImportError: # Python 2

        return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq)

    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        qs, _coerce_result = qs, compat_str
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError('bad query field: %r' % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result

    setattr(compat_urllib_parse, '_urlencode',
            getattr(compat_urllib_parse, 'urlencode'))
    for name, fix in (
            ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes),
            ('parse_unquote', compat_urllib_parse_unquote),
            ('unquote_plus', compat_urllib_parse_unquote_plus),
            ('urlencode', compat_urllib_parse_urlencode),
            ('parse_qs', compat_parse_qs)):
            ('urlencode', compat_urllib_parse_urlencode)):
        setattr(compat_urllib_parse, name, fix)
finally:
    try:
        # arguments changed in 3.8 and 3.10
        from urllib.parse import parse_qs as _parse_qs
        _parse_qs('a=b', separator='&')
        compat_parse_qs = _parse_qs
    except (ImportError, TypeError):  # Python 2, < 3.10

compat_urllib_parse_parse_qs = compat_parse_qs
        # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
        # Python 2's version is apparently totally broken
        # Also use this implementation for Py < 3.10
        # * support only default separator '&', not r'[&;]', like 3.10+
        # * support max_num_fields, like 3.8+
        def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                       encoding='utf-8', errors='replace',
                       max_num_fields=None, separator='&'):
            if not isinstance(separator, (compat_str, str)):
                raise ValueError('Separator must be of type string or bytes')
            # DoS protection, if anyone cares
            if qs and max_num_fields is not None and qs.count(separator) >= max_num_fields:
                raise ValueError('Too many fields')
            _coerce_result = compat_str
            r = []
            for name_value in qs.split(separator):
                if not name_value and not strict_parsing:
                    continue
                nv = name_value.split('=', 1)
                if len(nv) != 2:
                    if strict_parsing:
                        raise ValueError('bad query field: %r' % (name_value,))
                    # Handle case of a control-name with no equal sign
                    if keep_blank_values:
                        nv.append('')
                    else:
                        continue
                if len(nv[1]) or keep_blank_values:
                    name = nv[0].replace('+', ' ')
                    name = compat_urllib_parse_unquote(
                        name, encoding=encoding, errors=errors)
                    name = _coerce_result(name)
                    value = nv[1].replace('+', ' ')
                    value = compat_urllib_parse_unquote(
                        value, encoding=encoding, errors=errors)
                    value = _coerce_result(value)
                    r.append((name, value))
            return r

        def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                            encoding='utf-8', errors='replace',
                            max_num_fields=None, separator='&'):
            parsed_result = {}
            pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                               encoding, errors, max_num_fields, separator)
            for name, value in pairs:
                if name in parsed_result:
                    parsed_result[name].append(value)
                else:
                    parsed_result[name] = [value]
            return parsed_result

        for name, fix in (
                ('parse_qs', compat_parse_qs),
                ('parse_qsl', _parse_qsl)):
            setattr(compat_urllib_parse, name, fix)

compat_urllib_parse_parse_qs = compat_parse_qs

try:
    from urllib.request import DataHandler as compat_urllib_request_DataHandler
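Not part of the diff: a rough sketch of how the reworked compat_parse_qs is expected to behave once this change is applied, assuming the usual youtube_dl.compat import path. It follows the newer stdlib semantics backported above: only '&' separates pairs, and max_num_fields is honoured.

# illustration only -- not from the patch
from youtube_dl.compat import compat_parse_qs

# ';' is no longer treated as a pair separator, matching the newer stdlib behaviour
print(compat_parse_qs('a=1;b=2&c=3'))
# -> {'a': ['1;b=2'], 'c': ['3']}

# keep_blank_values retains names whose value is empty
print(compat_parse_qs('play-film=&id=42', keep_blank_values=True))
# -> {'play-film': [''], 'id': ['42']}

# max_num_fields rejects oversized inputs, as in Python 3.8+
try:
    compat_parse_qs('a=1&b=2&c=3', max_num_fields=2)
except ValueError as exc:
    print('rejected:', exc)  # exact message differs between stdlib and the fallback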
youtube_dl/extractor/bfi.py

@@ -4,7 +4,12 @@ from __future__ import unicode_literals
import re

from .common import InfoExtractor
from ..utils import extract_attributes
from ..utils import (
    extract_attributes,
    parse_qs,
    remove_start,
    smuggle_url,
)


class BFIPlayerIE(InfoExtractor):

@@ -12,26 +17,39 @@ class BFIPlayerIE(InfoExtractor):
    _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
    _TEST = {
        'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
        'md5': '15598bdd6a413ce9363970754f054d76',
        'info_dict': {
            'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
            'ext': 'mp4',
            'title': 'Computer Doctor',
            'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
            'timestamp': 1564424975,
            'upload_date': '20190729',
            'uploader_id': '6057949427001',
        },
        'skip': 'BFI Player films cannot be played outside of the UK',
        # 'skip': 'BFI Player films cannot be played outside of the UK',
    }
    _BRIGHTCOVE_ACCOUNT_ID = '6057949427001'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        entries = []
        for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage):
            player_attr = extract_attributes(player_el)
            ooyala_id = player_attr.get('data-video-id')
            if not ooyala_id:
                continue
            entries.append(self.url_result(
                'ooyala:' + ooyala_id, 'Ooyala',
                ooyala_id, player_attr.get('data-label')))
        return self.playlist_result(entries)

        film_only = 'play-film' in parse_qs(url, keep_blank_values=True)

        def entries():
            for player_el in re.finditer(r'(?s)<video-js\b[^>]+>', webpage):
                player_attr = extract_attributes(player_el.group(0))
                bcv_id, account_id, player_id, embed = (
                    player_attr.get(x) for x in ('data-ref-id', 'data-acid', 'data-pid', 'data-embed'))
                if not bcv_id:
                    continue
                if film_only and player_attr.get('data-video-type') != 'film':
                    continue
                bc_url = 'brightcove:new:%s:%s:%s:video:ref:%s' % (
                    account_id or self._BRIGHTCOVE_ACCOUNT_ID, player_id or 'default', embed or 'default', bcv_id)

                yield self.url_result(smuggle_url(
                    bc_url, {'referrer': url, 'force_videoid': remove_start(bcv_id, 'ref:')}), ie='BrightcoveNew', video_id=video_id)

        return self.playlist_result(entries())
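For context (not part of the diff): the new extractor gates on a bare 'play-film' flag in the URL's query string. A sketch of why keep_blank_values=True is needed, assuming utils.parse_qs takes a full URL and forwards keyword arguments to compat_parse_qs (the helper itself is not shown in this compare):

# illustration only
from youtube_dl.utils import parse_qs

url = 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online?play-film'
print('play-film' in parse_qs(url, keep_blank_values=True))  # True
print('play-film' in parse_qs(url))                          # False: valueless keys are dropped by default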
youtube_dl/extractor/brightcove.py

@@ -340,7 +340,7 @@ class BrightcoveLegacyIE(InfoExtractor):

class BrightcoveNewIE(AdobePassIE):
    IE_NAME = 'brightcove:new'
    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
    _VALID_URL = r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)(?P<video_id>\d+|ref:[^&]+)'
    _TESTS = [{
        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
        'md5': 'c8100925723840d4b0d243f7025703be',

@@ -593,7 +593,7 @@ class BrightcoveNewIE(AdobePassIE):
            'ip_blocks': smuggled_data.get('geo_ip_blocks'),
        })

        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()[1:]

        policy_key_id = '%s_%s' % (account_id, player_id)
        policy_key = self._downloader.cache.load('brightcove', policy_key_id)

@@ -678,4 +678,4 @@ class BrightcoveNewIE(AdobePassIE):
                json_data.get('description'))

        return self._parse_brightcove_metadata(
            json_data, video_id, headers=headers)
            json_data, smuggled_data.get('force_videoid') or video_id, headers=headers)
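Not part of the diff: a quick standalone check of the reworked pattern, assuming only what the hunk above shows. The new _VALID_URL accepts both the public player URL and the internal 'brightcove:new:...' form that bfi.py now builds, and dropping the first group (the scheme marker 'u') with .groups()[1:] leaves the five fields that _real_extract unpacks.

# illustration only -- pattern copied from the new _VALID_URL above
import re

_VALID_URL = r'(?:brightcove:new|(?P<u>https?)):(?(u)//players\.brightcove\.net/)(?P<account_id>\d+)(?(u)/|:)(?P<player_id>[^/]+)(?(u)_|:)(?P<embed>[^/]+)(?(u)/index\.html\?.*|:)(?P<content_type>video|playlist)(?(u)Id=|:)(?P<video_id>\d+|ref:[^&]+)'

for url in (
        'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
        'brightcove:new:6057949427001:default:default:video:ref:htNnhlZjE60C9VySkQEIBtU-cNV1Xx63'):
    # groups() is (u, account_id, player_id, embed, content_type, video_id);
    # [1:] drops the scheme marker, as in the updated _real_extract
    print(re.match(_VALID_URL, url).groups()[1:])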
youtube_dl/utils.py

@@ -2402,7 +2402,7 @@ class YoutubeDLError(Exception):
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

@@ -2421,6 +2421,7 @@ class ExtractorError(YoutubeDLError):
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id
        self.ie = ie

    def format_traceback(self):
        if self.traceback is None:
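Not part of the diff: a minimal sketch of the new keyword, assuming only what the hunk shows (the extra argument is simply stored on the exception as .ie; the 'BFIPlayer' string is purely illustrative):

# illustration only
from youtube_dl.utils import ExtractorError

try:
    raise ExtractorError('film not playable here', expected=True,
                         video_id='watch-computer-doctor-1974-online', ie='BFIPlayer')
except ExtractorError as exc:
    print(exc.ie, exc.video_id)  # BFIPlayer watch-computer-doctor-1974-online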
@@ -6561,3 +6562,24 @@ def join_nonempty(*values, **kwargs):
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(map(compat_str, filter(None, values)))


class classproperty(object):
    """property access for class methods with optional caching"""
    def __new__(cls, *args, **kwargs):
        func = args[0] if len(args) > 0 else kwargs.get('func')
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super(classproperty, cls).__new__(cls)

    def __init__(self, func, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
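Not part of the diff: a small usage sketch for the new helper. The Example class and its attributes are hypothetical; only classproperty itself comes from the hunk above (importable from youtube_dl.utils once merged).

# illustration only
from youtube_dl.utils import classproperty

class Example(object):
    _hits = 0

    @classproperty
    def name(cls):
        # evaluated on every access
        return cls.__name__.lower()

    @classproperty(cache=True)
    def expensive(cls):
        # evaluated once per class, then served from the cache
        cls._hits += 1
        return cls._hits

print(Example.name)       # 'example'
print(Example.expensive)  # 1
print(Example.expensive)  # still 1 -- cached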