Merge branch 'ytdl-org:master' into master

2022-05-05 07:23:46 +05:00 · 2022-05-05 07:23:46 +05:00 · 54eec8c0b9
commit 54eec8c0b9
parent 67c58034ef e988fa4523
8 changed files with 162 additions and 11 deletions
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -0,0 +1 @@
+blank_issues_enabled: false
--- a/README.md
+++ b/README.md
@ -1069,9 +1069,11 @@ After you have ensured this site is distributing its content legally, you can fo
            }
    ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test (actually, test case) then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note:
+    * the test names use the extractor class name **without the trailing `IE`**
+    * tests with an `only_matching` key in the test's dict are not counted.
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):

        $ flake8 youtube_dl/extractor/yourextractor.py

--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -557,6 +557,7 @@ from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
 from .konserthusetplay import KonserthusetPlayIE
 from .krasview import KrasViewIE
+from .kth import KTHIE
 from .ku6 import Ku6IE
 from .kusi import KUSIIE
 from .kuwo import (
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor):
            'duration': info.get('duration'),
            'timestamp': info.get('createdAt'),
            'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
-            'view_count': info.get('plays'),
+            'view_count': int_or_none(info.get('plays')),
        }
--- a/youtube_dl/extractor/kth.py
+++ b/youtube_dl/extractor/kth.py
@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class KTHIE(InfoExtractor):
+    # play.kth.se pages are not parsed here: the trailing id of the page URL
+    # is wrapped in a 'kaltura:308:<id>' URL and passed on to the extractor
+    # registered under the 'Kaltura' key.
+    _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)'
+    _TEST = {
+        'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9',
+        'md5': 'd83ada6d00ca98b73243a88efe19e8a6',
+        'info_dict': {
+            'id': '0_uoop6oz9',
+            'ext': 'mp4',
+            'title': 'md5:bd1d6931facb6828762a33e6ce865f37',
+            'thumbnail': 're:https?://.+/thumbnail/.+',
+            'duration': 3516,
+            'timestamp': 1647345358,
+            'upload_date': '20220315',
+            'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f',
+        }
+    }
+
+    def _real_extract(self, url):
+        entry_id = self._match_id(url)
+        # The service URL travels with the delegated URL as smuggled data.
+        kaltura_url = smuggle_url(
+            'kaltura:308:%s' % entry_id,
+            {'service_url': 'https://api.kaltura.nordu.net'})
+        return self.url_result(kaltura_url, 'Kaltura')
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@ -5,15 +5,16 @@ import re

 from .common import InfoExtractor
 from ..compat import (
-    compat_urlparse,
    compat_str,
+    compat_urlparse,
 )
 from ..utils import (
-    ExtractorError,
    determine_ext,
+    ExtractorError,
    find_xpath_attr,
    fix_xml_ampersands,
    GeoRestrictedError,
+    HEADRequest,
    int_or_none,
    parse_duration,
    remove_start,
@ -96,12 +97,100 @@ class RaiBaseIE(InfoExtractor):
        if not formats and geoprotection is True:
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)

+        formats.extend(self._create_http_urls(relinker_url, formats))
+
        return dict((k, v) for k, v in {
            'is_live': is_live,
            'duration': duration,
            'formats': formats,
        }.items() if v is not None)

+    def _create_http_urls(self, relinker_url, fmts):
+        """Build direct HTTPS MP4 formats for a relinker URL.
+
+        relinker_url -- URL that is probed with HEAD requests and used as
+                        the base of every generated format URL
+        fmts         -- formats already extracted by the caller; a format
+                        with a close enough 'tbr' donates its size/codec
+                        details to the matching MP4 format
+
+        Returns a list of format dicts. The list is empty when the MP4
+        probe does not redirect to another URL or when the redirect target
+        of relinker_url does not match _RELINKER_REG.
+        """
+        _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
+        _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
+        _QUALITY = {
+            # tbr: w, h
+            '250': [352, 198],
+            '400': [512, 288],
+            '700': [512, 288],
+            '800': [700, 394],
+            '1200': [736, 414],
+            '1800': [1024, 576],
+            '2400': [1280, 720],
+            '3200': [1440, 810],
+            '3600': [1440, 810],
+            '5000': [1920, 1080],
+            '10000': [1920, 1080],
+        }
+
+        def test_url(url):
+            # Three-state result: the final URL when a 200 response ended up
+            # somewhere other than `url`; False when _request_webpage
+            # returned False or the URL did not change; None for any other
+            # status code.
+            resp = self._request_webpage(
+                HEADRequest(url), None, headers={'User-Agent': 'Rai'},
+                fatal=False, errnote=False, note=False)
+
+            if resp is False:
+                return False
+
+            if resp.code == 200:
+                return False if resp.url == url else resp.url
+            return None
+
+        def get_format_info(tbr):
+            br = int_or_none(tbr)
+            if len(fmts) == 1 and not br:
+                br = fmts[0].get('tbr')
+
+            # br stays None for the '*' wildcard when no single known format
+            # supplies a bitrate. Comparing or dividing None raises
+            # TypeError, so that case falls through to the '250' label and
+            # skips the bitrate matching below.
+            # Integer arithmetic (instead of math.floor) keeps the label
+            # free of a trailing '.0' on Python 2, where floor() returns a
+            # float.
+            br_limit = int(br // 100) if br is not None else None
+            if br is not None and br > 300:
+                tbr = compat_str(br_limit * 100)
+            else:
+                tbr = '250'
+
+            # try extracting info from available m3u8 formats
+            format_copy = None
+            if br_limit is not None:
+                for f in fmts:
+                    f_tbr = f.get('tbr')
+                    # accept bitrates within one 100-unit bucket of br;
+                    # no break, so the last matching format wins
+                    if f_tbr and abs(int(f_tbr // 100) - br_limit) <= 1:
+                        format_copy = f
+
+            if format_copy:
+                return {
+                    'width': format_copy.get('width'),
+                    'height': format_copy.get('height'),
+                    'tbr': format_copy.get('tbr'),
+                    'vcodec': format_copy.get('vcodec'),
+                    'acodec': format_copy.get('acodec'),
+                    'fps': format_copy.get('fps'),
+                    'format_id': 'https-%s' % tbr,
+                }
+
+            # Bitrates missing from the table get no dimensions instead of
+            # raising KeyError.
+            width, height = _QUALITY.get(tbr, (None, None))
+            return {
+                'width': width,
+                'height': height,
+                'format_id': 'https-%s' % tbr,
+                'tbr': int(tbr),
+            }
+
+        # Only the type of the result matters here: a string means the MP4
+        # override URL redirected somewhere.
+        loc = test_url(_MP4_TMPL % (relinker_url, '*'))
+        if not isinstance(loc, compat_str):
+            return []
+
+        mobj = re.match(
+            _RELINKER_REG,
+            test_url(relinker_url) or '')
+        if not mobj:
+            return []
+
+        # The redirect target may list several comma-separated qualities;
+        # without such a list a single wildcard format is generated.
+        quality_list = mobj.group('quality')
+        available_qualities = [
+            q for q in (quality_list.split(',') if quality_list else ['*'])
+            if q]
+
+        formats = []
+        for q in available_qualities:
+            fmt = {
+                'url': _MP4_TMPL % (relinker_url, q),
+                'protocol': 'https',
+                'ext': 'mp4',
+            }
+            fmt.update(get_format_info(q))
+            formats.append(fmt)
+        return formats
+
    @staticmethod
    def _extract_subtitles(url, video_data):
        STL_EXT = 'stl'
@ -151,6 +240,22 @@ class RaiPlayIE(RaiBaseIE):
        'params': {
            'skip_download': True,
        },
+    }, {
+        # 1080p direct mp4 url
+        'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html',
+        'md5': '2e501e8651d72f05ffe8f5d286ad560b',
+        'info_dict': {
+            'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642',
+            'ext': 'mp4',
+            'title': 'Leonardo - S1E1',
+            'alt_title': 'St 1 Ep 1 - Episodio 1',
+            'description': 'md5:f5360cd267d2de146e4e3879a5a47d31',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Rai 1',
+            'duration': 3229,
+            'series': 'Leonardo',
+            'season': 'Season 1',
+        },
    }, {
        'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
        'only_matching': True,
@ -158,6 +263,10 @@ class RaiPlayIE(RaiBaseIE):
        # subtitles at 'subtitlesArray' key (see #27698)
        'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
        'only_matching': True,
+    }, {
+        # DRM protected
+        'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
@ -166,6 +275,13 @@ class RaiPlayIE(RaiBaseIE):
        media = self._download_json(
            base + '.json', video_id, 'Downloading video JSON')

+        if try_get(
+                media,
+                (lambda x: x['rights_management']['rights']['drm'],
+                 lambda x: x['program_info']['rights_management']['rights']['drm']),
+                dict):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
        title = media['name']

        video = media['video']
@ -307,7 +423,7 @@ class RaiIE(RaiBaseIE):
    }, {
        # with ContentItem in og:url
        'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
-        'md5': '6865dd00cf0bbf5772fdd89d59bd768a',
+        'md5': '06345bd97c932f19ffb129973d07a020',
        'info_dict': {
            'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
            'ext': 'mp4',
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@ -62,7 +62,7 @@ class StreamCZIE(InfoExtractor):
                if not stream.get('url'):
                    continue
                yield merge_dicts({
-                    'format_id': '{}-{}'.format(format_id, ext),
+                    'format_id': '-'.join((format_id, ext)),
                    'ext': ext,
                    'source_preference': pref,
                    'url': urljoin(spl_url, stream['url']),
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1464,15 +1464,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
    # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
    def _extract_n_function_name(self, jscode):
+        # Locate the player JS function that is applied to the "n" value.
+        # `target` matches either a bare identifier or an indexed one
+        # (name[idx]); it is used twice: embedded in the search pattern and
+        # again to split the captured text into name and index.
-        target = r'(?P<nfunc>[a-zA-Z0-9$]{3})(?:\[(?P<idx>\d+)\])?'
+        target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?'
        nfunc_and_idx = self._search_regex(
-            r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ),
+            r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ),
            jscode, 'Initial JS player n function name')
        nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx')
        if not idx:
            return nfunc
+        # Indexed form: find the array literal assigned to `nfunc`, parse it
+        # as JSON (via js_to_json) and return element `idx`. The name is
+        # regex-escaped because it may contain '$'.
        return self._parse_json(self._search_regex(
-            r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode,
+            r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode,
            'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)]

    def _extract_n_function(self, video_id, player_url):