From 3b31478dfd18c03a1305089e79c30dc929f53eeb Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:30:22 +0200 Subject: [PATCH 01/36] Fix support for NPO downloads --- youtube_dl/extractor/npo.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e525ad928..eff9edb8b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,8 @@ from __future__ import unicode_literals import re +import urllib.parse +from http.cookies import SimpleCookie from .common import InfoExtractor from ..compat import ( @@ -184,22 +186,28 @@ class NPOIE(NPOBaseIE): return self._get_info(url, video_id) or self._get_old_info(video_id) def _get_info(self, url, video_id): - token = self._download_json( + _, xsrf_token_response = self._download_webpage_handle( 'https://www.npostart.nl/api/token', video_id, 'Downloading token', headers={ 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', - })['token'] + }) + cookies = SimpleCookie() + cookies.load(xsrf_token_response.headers['Set-Cookie']) + cookies = {k: v.value for k, v in cookies.items()} + xsrf_token = cookies['XSRF-TOKEN'] player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, - 'Downloading player JSON', data=urlencode_postdata({ + 'Downloading player JSON', + headers={"x-xsrf-token": urllib.parse.unquote(xsrf_token)}, + data=urlencode_postdata({ 'autoplay': 0, 'share': 1, 'pageUrl': url, + 'isFavourite': "false", 'hasAdConsent': 0, - '_token': token, - })) + },)) player_token = player['token'] @@ -215,7 +223,7 @@ class NPOIE(NPOBaseIE): 'quality': 'npo', 'tokenId': player_token, 'streamType': 'broadcast', - }) + }, data=b"") if not streams: continue stream = streams.get('stream') From b4776f2e36e6235c6a3142973355be7e03eee919 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:39:11 +0200 Subject: [PATCH 02/36] Import from compat --- youtube_dl/extractor/npo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eff9edb8b..dba422058 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals import re -import urllib.parse -from http.cookies import SimpleCookie from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, + compat_urllib_parse_unquote_plus, + compat_cookies_SimpleCookie, ) from ..utils import ( determine_ext, @@ -192,7 +192,7 @@ class NPOIE(NPOBaseIE): 'Referer': url, 'X-Requested-With': 'XMLHttpRequest', }) - cookies = SimpleCookie() + cookies = compat_cookies_SimpleCookie() cookies.load(xsrf_token_response.headers['Set-Cookie']) cookies = {k: v.value for k, v in cookies.items()} xsrf_token = cookies['XSRF-TOKEN'] @@ -200,7 +200,7 @@ class NPOIE(NPOBaseIE): player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', - headers={"x-xsrf-token": urllib.parse.unquote(xsrf_token)}, + headers={"x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token)}, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, From fb2b4e2894171825c6c85d813a8120b679eadf52 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:46:05 +0200 Subject: [PATCH 03/36] Add line comment --- youtube_dl/extractor/npo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index dba422058..646b0f433 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -207,7 +207,7 @@ class NPOIE(NPOBaseIE): 'pageUrl': url, 'isFavourite': "false", 'hasAdConsent': 0, - },)) + })) player_token = player['token'] @@ -223,7 +223,8 @@ class NPOIE(NPOBaseIE): 'quality': 'npo', 'tokenId': player_token, 'streamType': 'broadcast', - }, data=b"") + }, + data=b"") # empty byte string to force a POST request instead of GET, without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From 9e1acb2527a9141710657a35d358dba54b4c8ddd Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 31 Mar 2023 12:56:18 +0200 Subject: [PATCH 04/36] Fix flake8 --- youtube_dl/extractor/npo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 646b0f433..e8e596be1 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -200,7 +200,9 @@ class NPOIE(NPOBaseIE): player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', - headers={"x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token)}, + headers={ + "x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token) + }, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, @@ -224,7 +226,9 @@ class NPOIE(NPOBaseIE): 'tokenId': player_token, 'streamType': 'broadcast', }, - data=b"") # empty byte string to force a POST request instead of GET, without it HTTP 405 will happen + data=b"") + # Empty byte string in the call above to force a POST request + # Without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From 632897860b94c20bab65c9fd0ad81d6ae3ab30c1 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 3 Apr 2023 09:50:21 +0200 Subject: [PATCH 05/36] Accept suggestions on PR; comply with conventions Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e8e596be1..84bde9683 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -4,10 +4,10 @@ import re from .common import InfoExtractor from ..compat import ( + compat_cookies_SimpleCookie, compat_HTTPError, compat_str, compat_urllib_parse_unquote_plus, - compat_cookies_SimpleCookie, ) from ..utils import ( determine_ext, @@ -194,20 +194,20 @@ class NPOIE(NPOBaseIE): }) cookies = compat_cookies_SimpleCookie() cookies.load(xsrf_token_response.headers['Set-Cookie']) - cookies = {k: v.value for k, v in cookies.items()} + cookies = dict((k, v.value) for k, v in cookies.items()) xsrf_token = cookies['XSRF-TOKEN'] player = self._download_json( 'https://www.npostart.nl/player/%s' % video_id, video_id, 'Downloading player JSON', headers={ - "x-xsrf-token": compat_urllib_parse_unquote_plus(xsrf_token) + 'x-xsrf-token': compat_urllib_parse_unquote_plus(xsrf_token) }, data=urlencode_postdata({ 'autoplay': 0, 'share': 1, 'pageUrl': url, - 'isFavourite': "false", + 'isFavourite': 'false', 'hasAdConsent': 0, })) @@ -226,7 +226,7 @@ class NPOIE(NPOBaseIE): 'tokenId': player_token, 'streamType': 'broadcast', }, - data=b"") + data=b'') # Empty byte string in the call above to force a POST request # Without it HTTP 405 will happen if not streams: From 0c7261db901e79aed3dfd20f0b3c99ccbd32d20a Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 6 Apr 2023 01:51:02 +0100 Subject: [PATCH 06/36] Update npo.py * simplify comment * force CI --- youtube_dl/extractor/npo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 84bde9683..d6379f1d3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -226,9 +226,8 @@ class NPOIE(NPOBaseIE): 'tokenId': player_token, 'streamType': 'broadcast', }, + # empty data to force a POST request, avoiding HTTP 405 data=b'') - # Empty byte string in the call above to force a POST request - # Without it HTTP 405 will happen if not streams: continue stream = streams.get('stream') From da3d1f4321ec0b374b4201e092c085550003aec3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 10:36:03 +0100 Subject: [PATCH 07/36] Add notes on new npo.nl site --- youtube_dl/extractor/npo.py | 96 ++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 472da54ac..aef007e6a 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -36,6 +36,7 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): IE_NAME = 'npo' + # TODO find out if all hosts still work: IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' _VALID_URL = r'''(?x) (?: @@ -62,6 +63,10 @@ class NPOIE(NPOBaseIE): 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', 'upload_date': '20140622', }, + 'skip': 'Video gone', + }, { + 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', + # TODO other test attributes }, { 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', @@ -73,8 +78,9 @@ class NPOIE(NPOBaseIE): 'upload_date': '20090227', 'duration': 2400, }, + 'skip': 'Video gone', }, { - 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', 'info_dict': { 'id': 'VPWON_1169289', @@ -95,7 +101,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { # non asf in streams 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', @@ -106,7 +113,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', 'info_dict': { @@ -119,7 +127,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', 'info_dict': { @@ -132,7 +141,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video gone', }, { # audio 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', @@ -148,15 +158,19 @@ class NPOIE(NPOBaseIE): }, { 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', 'only_matching': True, + 'skip': 'Video gone', }, { 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', 'only_matching': True, + 'skip': 'Video gone', }, { # live stream 'url': 'npo:LI_NL1_4188102', @@ -704,7 +718,6 @@ class VPROIE(NPOPlaylistBaseIE): 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, - 'skip': 'Video gone', }, { 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', @@ -723,6 +736,7 @@ class VPROIE(NPOPlaylistBaseIE): 'title': 'education education', }, 'playlist_count': 2, + 'skip': 'Video gone', }, { 'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html', @@ -778,3 +792,73 @@ class AndereTijdenIE(NPOPlaylistBaseIE): }, 'playlist_count': 3, }] + +############################################################### +# Description of the new process of getting to the stream # +############################################################### + +# Valid URLs for new tests +# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/ +# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/afspelen + +# Step 1: Normalize the URL +# If the URL ends with /afspelen, strip that +# We need the slug in the next stepto find the productId + +# Step 2: Find the productId +# In the contents of the URL is a JSON blob: +# ')[0] + next_data = json.loads(next_data) + product_id, description, thumbnail, title = None, None, None, None + for query in next_data['props']['pageProps']['dehydratedState']['queries']: + if isinstance(query['state']['data'], list): + for entry in query['state']['data']: + print(entry) + try: + if entry['slug'] == slug: + product_id = entry['productId'] + title = entry['title'] + synopsis = entry['synopsis'] + description = synopsis.get('long', synopsis.get('short', synopsis.get('brief', ''))) + thumbnail = entry['images'][0]['url'] + break + except KeyError: + continue + except IndexError: + continue + if not product_id: + raise ExtractorError('No productId found for slug: %s' % slug) - def _get_info(self, url, video_id): - _, xsrf_token_response = self._download_webpage_handle( - 'https://www.npostart.nl/api/token', video_id, - 'Downloading token', headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - cookies = compat_cookies_SimpleCookie() - cookies.load(xsrf_token_response.headers['Set-Cookie']) - cookies = dict((k, v.value) for k, v in cookies.items()) - xsrf_token = cookies['XSRF-TOKEN'] + token = self._get_token(product_id) - player = self._download_json( - 'https://www.npostart.nl/player/%s' % video_id, video_id, - 'Downloading player JSON', + stream_link = self._download_json( + 'https://prod.npoplayer.nl/stream-link', video_id=slug, + data=json.dumps({ + "profileName": "dash", + "drmType": "widevine", + "referrerUrl": url, + }).encode('utf8'), headers={ - 'x-xsrf-token': compat_urllib_parse_unquote_plus(xsrf_token) - }, - data=urlencode_postdata({ - 'autoplay': 0, - 'share': 1, - 'pageUrl': url, - 'isFavourite': 'false', - 'hasAdConsent': 0, - })) - - player_token = player['token'] - - drm = False - format_urls = set() - formats = [] - for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): - streams = self._download_json( - 'https://start-player.npo.nl/video/%s/streams' % video_id, - video_id, 'Downloading %s profile JSON' % profile, fatal=False, - query={ - 'profile': profile, - 'quality': 'npo', - 'tokenId': player_token, - 'streamType': 'broadcast', - }, - # empty data to force a POST request, avoiding HTTP 405 - data=b'') - if not streams: - continue - stream = streams.get('stream') - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('src')) - if not stream_url or stream_url in format_urls: - continue - format_urls.add(stream_url) - if stream.get('protection') is not None or stream.get('keySystemOptions') is not None: - drm = True - continue - stream_type = stream.get('type') - stream_ext = determine_ext(stream_url) - if stream_type == 'application/dash+xml' or stream_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - stream_url, video_id, mpd_id='dash', fatal=False)) - elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - elif re.search(r'\.isml?/Manifest', stream_url): - formats.extend(self._extract_ism_formats( - stream_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': stream_url, - }) - - if not formats: - if drm: - raise ExtractorError('This video is DRM protected.', expected=True) - return - - self._sort_formats(formats) - - info = { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - embed_url = url_or_none(player.get('embedUrl')) - if embed_url: - webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed page', fatal=False) - if webpage: - video = self._parse_json( - self._search_regex( - r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', - default='{}'), video_id) - if video: - title = video.get('episodeTitle') - subtitles = {} - subtitles_list = video.get('subtitles') - if isinstance(subtitles_list, list): - for cc in subtitles_list: - cc_url = url_or_none(cc.get('src')) - if not cc_url: - continue - lang = str_or_none(cc.get('language')) or 'nl' - subtitles.setdefault(lang, []).append({ - 'url': cc_url, - }) - return merge_dicts({ - 'title': title, - 'description': video.get('description'), - 'thumbnail': url_or_none( - video.get('still_image_url') or video.get('orig_image_url')), - 'duration': int_or_none(video.get('duration')), - 'timestamp': unified_timestamp(video.get('broadcastDate')), - 'creator': video.get('channel'), - 'series': video.get('title'), - 'episode': title, - 'episode_number': int_or_none(video.get('episodeNumber')), - 'subtitles': subtitles, - }, info) - - return info - - def _get_old_info(self, video_id): - metadata = self._download_json( - 'http://e.omroep.nl/metadata/%s' % video_id, - video_id, - # We have to remove the javascript callback - transform_source=strip_jsonp, + "Authorization": token, + "Content-Type": "application/json", + } ) - error = metadata.get('error') - if error: - raise ExtractorError(error, expected=True) + stream_url = stream_link['stream']['streamURL'] - # For some videos actual video id (prid) is different (e.g. for - # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 - # video id is POMS_WNL_853698 but prid is POW_00996502) - video_id = metadata.get('prid') or video_id - - # titel is too generic in some cases so utilize aflevering_titel as well - # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) - title = metadata['titel'] - sub_title = metadata.get('aflevering_titel') - if sub_title and sub_title != title: - title += ': %s' % sub_title - - token = self._get_token(video_id) - - formats = [] - urls = set() - - def is_legal_url(format_url): - return format_url and format_url not in urls and re.match( - r'^(?:https?:)?//', format_url) - - QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') - QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') - - quality_from_label = qualities(QUALITY_LABELS) - quality_from_format_id = qualities(QUALITY_FORMATS) - items = self._download_json( - 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, - 'Downloading formats JSON', query={ - 'adaptive': 'yes', - 'token': token, - })['items'][0] - for num, item in enumerate(items): - item_url = item.get('url') - if not is_legal_url(item_url): - continue - urls.add(item_url) - format_id = self._search_regex( - r'video/ida/([^/]+)', item_url, 'format id', - default=None) - - item_label = item.get('label') - - def add_format_url(format_url): - width = int_or_none(self._search_regex( - r'(\d+)[xX]\d+', format_url, 'width', default=None)) - height = int_or_none(self._search_regex( - r'\d+[xX](\d+)', format_url, 'height', default=None)) - if item_label in QUALITY_LABELS: - quality = quality_from_label(item_label) - f_id = item_label - elif item_label in QUALITY_FORMATS: - quality = quality_from_format_id(format_id) - f_id = format_id - else: - quality, f_id = [None] * 2 - formats.append({ - 'url': format_url, - 'format_id': f_id, - 'width': width, - 'height': height, - 'quality': quality, - }) - - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - if item.get('contentType') in ('url', 'audio'): - add_format_url(item_url) - continue - - try: - stream_info = self._download_json( - item_url + '&type=json', video_id, - 'Downloading %s stream JSON' - % item_label or item.get('format') or format_id or num) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error = (self._parse_json( - ee.cause.read().decode(), video_id, - fatal=False) or {}).get('errorstring') - if error: - raise ExtractorError(error, expected=True) - raise - # Stream URL instead of JSON, example: npo:LI_NL1_4188102 - if isinstance(stream_info, compat_str): - if not stream_info.startswith('http'): - continue - video_url = stream_info - # JSON - else: - video_url = stream_info.get('url') - if not video_url or 'vodnotavailable.' in video_url or video_url in urls: - continue - urls.add(video_url) - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - else: - add_format_url(video_url) - - is_live = metadata.get('medium') == 'live' - - if not is_live: - for num, stream in enumerate(metadata.get('streams', [])): - stream_url = stream.get('url') - if not is_legal_url(stream_url): - continue - urls.add(stream_url) - # smooth streaming is not supported - stream_type = stream.get('type', '').lower() - if stream_type in ['ss', 'ms']: - continue - if stream_type == 'hds': - f4m_formats = self._extract_f4m_formats( - stream_url, video_id, fatal=False) - # f4m downloader downloads only piece of live stream - for f4m_format in f4m_formats: - f4m_format['preference'] = -1 - formats.extend(f4m_formats) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', fatal=False)) - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - elif '.asf' in stream_url: - asx = self._download_xml( - stream_url, video_id, - 'Downloading stream %d ASX playlist' % num, - transform_source=fix_xml_ampersands, fatal=False) - if not asx: - continue - ref = asx.find('./ENTRY/Ref') - if ref is None: - continue - video_url = ref.get('href') - if not video_url or video_url in urls: - continue - urls.add(video_url) - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - 'preference': -10, - }) - else: - formats.append({ - 'url': stream_url, - 'quality': stream.get('kwaliteit'), - }) - - self._sort_formats(formats) - - subtitles = {} - if metadata.get('tt888') == 'ja': - subtitles['nl'] = [{ - 'ext': 'vtt', - 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, - }] + # TODO other formats than dash / mpd + mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) return { - 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': metadata.get('info'), - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'upload_date': unified_strdate(metadata.get('gidsdatum')), - 'duration': parse_duration(metadata.get('tijdsduur')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, + 'id': slug, + 'formats': mpd, + 'title': title or slug, + 'description': description, + 'thumbnail': thumbnail, + # TODO fill in other metadata that's available } - -############################################################### -# Description of the new process of getting to the stream # -############################################################### - -# Valid URLs for new tests -# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/ -# https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/afspelen - -# Step 1: Normalize the URL -# If the URL ends with /afspelen, strip that -# We need the slug in the next stepto find the productId - -# Step 2: Find the productId -# In the contents of the URL is a JSON blob: -# ')[0] next_data = json.loads(next_data) - product_id, description, thumbnail, title = None, None, None, None + product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: if isinstance(query['state']['data'], list): for entry in query['state']['data']: - print(entry) - try: - if entry['slug'] == slug: - product_id = entry['productId'] - title = entry['title'] - synopsis = entry['synopsis'] - description = synopsis.get('long', synopsis.get('short', synopsis.get('brief', ''))) - thumbnail = entry['images'][0]['url'] - break - except KeyError: - continue - except IndexError: - continue + if entry['slug'] == slug: + product_id = entry.get('productId') + title = entry.get('title') + synopsis = entry.get('synopsis', {}) + description = ( + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') + ) + thumbnails = entry.get('images') + for thumbnail_entry in thumbnails: + if 'url' in thumbnail_entry: + thumbnail = thumbnail_entry.get('url') if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) @@ -97,19 +96,18 @@ class NPOIE(InfoExtractor): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, data=json.dumps({ - "profileName": "dash", - "drmType": "widevine", - "referrerUrl": url, + 'profileName': 'dash', + 'drmType': 'widevine', + 'referrerUrl': url, }).encode('utf8'), headers={ - "Authorization": token, - "Content-Type": "application/json", + 'Authorization': token, + 'Content-Type': 'application/json', } ) - stream_url = stream_link['stream']['streamURL'] - # TODO other formats than dash / mpd + stream_url = stream_link.get('stream', {}).get('streamURL') mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) return { From fb7b7179ff7ff08ad7e32539c0b0d440e0899903 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 15:08:10 +0100 Subject: [PATCH 12/36] Speculate about other ways of getting productId --- youtube_dl/extractor/npo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 502d276ff..7f90aa827 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -70,6 +70,8 @@ class NPOIE(InfoExtractor): page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug) # TODO find out what proper HTML parsing utilities are available in youtube-dl next_data = page.split('')[0] + # TODO The data in this script tag feels like GraphQL, so there might be an easier way + # to get the product id, maybe using a GraphQL endpoint next_data = json.loads(next_data) product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: From f9e59b0c49c8f0fc3951f8ca01705abb46ed51e4 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 15:28:14 +0100 Subject: [PATCH 13/36] Add the possibility to add 'hls' later --- youtube_dl/extractor/npo.py | 43 ++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 7f90aa827..3e543e350 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -82,9 +82,9 @@ class NPOIE(InfoExtractor): title = entry.get('title') synopsis = entry.get('synopsis', {}) description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') ) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: @@ -95,26 +95,29 @@ class NPOIE(InfoExtractor): token = self._get_token(product_id) - stream_link = self._download_json( - 'https://prod.npoplayer.nl/stream-link', video_id=slug, - data=json.dumps({ - 'profileName': 'dash', - 'drmType': 'widevine', - 'referrerUrl': url, - }).encode('utf8'), - headers={ - 'Authorization': token, - 'Content-Type': 'application/json', - } - ) - - # TODO other formats than dash / mpd - stream_url = stream_link.get('stream', {}).get('streamURL') - mpd = self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False) + formats = [] + for profile in ( + 'dash', + # 'hls', # TODO test what needs to change for 'hls' support + ): + stream_link = self._download_json( + 'https://prod.npoplayer.nl/stream-link', video_id=slug, + data=json.dumps({ + 'profileName': profile, + 'drmType': 'widevine', + 'referrerUrl': url, + }).encode('utf8'), + headers={ + 'Authorization': token, + 'Content-Type': 'application/json', + } + ) + stream_url = stream_link.get('stream', {}).get('streamURL') + formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) return { 'id': slug, - 'formats': mpd, + 'formats': formats, 'title': title or slug, 'description': description, 'thumbnail': thumbnail, From 8b1a7d9a7c09d7c88fa03f885ebdc5347c007f69 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 1 Mar 2024 16:23:19 +0100 Subject: [PATCH 14/36] Use provided util --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 3e543e350..e7275e1b3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -72,7 +72,7 @@ class NPOIE(InfoExtractor): next_data = page.split('')[0] # TODO The data in this script tag feels like GraphQL, so there might be an easier way # to get the product id, maybe using a GraphQL endpoint - next_data = json.loads(next_data) + next_data = self._parse_json(next_data, slug) product_id, title, description, thumbnail = None, None, None, None for query in next_data['props']['pageProps']['dehydratedState']['queries']: if isinstance(query['state']['data'], list): From 34b5b2010774fab2cb8984c720fcd7c62110669a Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 3 Mar 2024 17:47:15 +0100 Subject: [PATCH 15/36] Refactor into reusable method --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 60 ++++++++++++++++++++++++------ 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dabcd60cb..696fd8e1e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ from .nowness import ( NownessSeriesIE, ) from .noz import NozIE -from .npo import NPOIE +from .npo import BNNVaraIE, NPOIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e7275e1b3..389696861 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -13,7 +13,6 @@ class NPOIE(InfoExtractor): IE_DESC = 'npo.nl' _VALID_URL = r'''(?x) (?: - npo:| https?:// (?:www\.)? (?: @@ -82,9 +81,9 @@ class NPOIE(InfoExtractor): title = entry.get('title') synopsis = entry.get('synopsis', {}) description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') + synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief') ) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: @@ -93,8 +92,19 @@ class NPOIE(InfoExtractor): if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) - token = self._get_token(product_id) + formats = self._download_by_product_id(product_id, slug, url) + return { + 'id': slug, + 'formats': formats, + 'title': title or slug, + 'description': description, + 'thumbnail': thumbnail, + # TODO fill in other metadata that's available + } + + def _download_by_product_id(self, product_id, slug, url=None): + token = self._get_token(product_id) formats = [] for profile in ( 'dash', @@ -105,7 +115,7 @@ class NPOIE(InfoExtractor): data=json.dumps({ 'profileName': profile, 'drmType': 'widevine', - 'referrerUrl': url, + 'referrerUrl': url or '', }).encode('utf8'), headers={ 'Authorization': token, @@ -114,12 +124,40 @@ class NPOIE(InfoExtractor): ) stream_url = stream_link.get('stream', {}).get('streamURL') formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) + return formats + + +class BNNVaraIE(NPOIE): + IE_NAME = 'bnnvara' + IE_DESC = 'bnnvara.nl' + _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' + + def _real_extract(self, url): + url = url.rstrip('/') + video_id = url.split('/')[-1] + + media = self._download_json('https://api.bnnvara.nl/bff/graphql', + video_id, + data=json.dumps( + { + 'operationName': 'getMedia', + 'variables': { + 'id': video_id, + 'hasAdConsent': False, + 'atInternetId': 70 + }, + 'query': 'query getMedia($id: ID!, $mediaUrl: String, $hasAdConsent: Boolean!, $atInternetId: Int) {\n player(\n id: $id\n mediaUrl: $mediaUrl\n hasAdConsent: $hasAdConsent\n atInternetId: $atInternetId\n ) {\n ... on PlayerSucces {\n brand {\n name\n slug\n broadcastsEnabled\n __typename\n }\n title\n programTitle\n pomsProductId\n broadcasters {\n name\n __typename\n }\n duration\n classifications {\n title\n imageUrl\n type\n __typename\n }\n image {\n title\n url\n __typename\n }\n cta {\n title\n url\n __typename\n }\n genres {\n name\n __typename\n }\n subtitles {\n url\n language\n __typename\n }\n sources {\n name\n url\n ratio\n __typename\n }\n type\n token\n __typename\n }\n ... on PlayerError {\n error\n __typename\n }\n __typename\n }\n}' + }).encode('utf8'), + headers={ + 'Content-Type': 'application/json', + }) + product_id = media.get('data', {}).get('player', {}).get('pomsProductId') + + formats = self._download_by_product_id(product_id, video_id) return { - 'id': slug, + 'id': product_id, + 'title': media.get('data', {}).get('player', {}).get('title'), 'formats': formats, - 'title': title or slug, - 'description': description, - 'thumbnail': thumbnail, - # TODO fill in other metadata that's available + 'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'), } From 4fc423845e8b5f8855fb6e5a0a5087064401b12b Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 12:49:22 +0100 Subject: [PATCH 16/36] Fix lint --- youtube_dl/extractor/npo.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 389696861..53fd816f7 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -80,11 +80,9 @@ class NPOIE(InfoExtractor): product_id = entry.get('productId') title = entry.get('title') synopsis = entry.get('synopsis', {}) - description = ( - synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief') - ) + description = (synopsis.get('long') + or synopsis.get('short') + or synopsis.get('brief')) thumbnails = entry.get('images') for thumbnail_entry in thumbnails: if 'url' in thumbnail_entry: From 28ba01f1ccfc5560be7d027b1669822e44d4143f Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 13:43:56 +0100 Subject: [PATCH 17/36] Add Ongehoord Nederland and test URL for BNNVARA --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 696fd8e1e..802e498f9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ from .nowness import ( NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE +from .npo import BNNVaraIE, NPOIE, ONIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 53fd816f7..d8573d343 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -129,6 +130,9 @@ class BNNVaraIE(NPOIE): IE_NAME = 'bnnvara' IE_DESC = 'bnnvara.nl' _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' + _TESTS = [{ + 'url': 'https://www.bnnvara.nl/videos/27455', + }] def _real_extract(self, url): url = url.rstrip('/') @@ -159,3 +163,29 @@ class BNNVaraIE(NPOIE): 'formats': formats, 'thumbnail': media.get('data', {}).get('player', {}).get('image').get('url'), } + + +class ONIE(NPOIE): + IE_NAME = 'on' + IE_DESC = 'ongehoordnederland.tv' + _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' + _TESTS = [{ + 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall("page: '(.+)'", page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From eb6e396bfb66965487ef1e7c50edbf6e28130462 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 13:55:59 +0100 Subject: [PATCH 18/36] First version of a VPRO regex --- youtube_dl/extractor/npo.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index d8573d343..d48a4cda0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -4,9 +4,7 @@ import json import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from ..utils import ExtractorError class NPOIE(InfoExtractor): @@ -189,3 +187,29 @@ class ONIE(NPOIE): 'title': video_id, 'formats': formats, } + + +class VPROIE(NPOIE): + IE_NAME = 'vpro' + IE_DESC = 'vpro.nl' + _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' + _TESTS = [{ + 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall('data-media-id="(.+_.+)"\s', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From d36d50fe5cf166899adfc85e7ca9b0f8f5272d19 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 14:04:03 +0100 Subject: [PATCH 19/36] Re-add Zapp --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 802e498f9..b3a9fdfba 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ from .nowness import ( NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE +from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index d48a4cda0..84b41443b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -204,6 +204,7 @@ class VPROIE(NPOIE): formats = [] for result in results: formats.extend(self._download_by_product_id(result, video_id)) + break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL.') @@ -213,3 +214,24 @@ class VPROIE(NPOIE): 'title': video_id, 'formats': formats, } + + +class ZAPPIE(NPOIE): + IE_NAME = 'zapp' + IE_DESC = 'zapp.nl' + _VALID_URL = r'https?://(?:www\.)?zapp.nl/.*' + + _TESTS = [{ + 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + + formats = self._download_by_product_id(url, video_id) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } From d426a92a60ba9b6eb01256d3dcad4dcbfecd742c Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Tue, 5 Mar 2024 14:11:49 +0100 Subject: [PATCH 20/36] Encoding suggestion from PR --- youtube_dl/extractor/npo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 84b41443b..01eb54fc0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import json From 3b3d73cbe6f64d6485e03cb658cc491d4fa62333 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 11:52:08 +0100 Subject: [PATCH 21/36] Use program-detail endpoint and remove a test --- youtube_dl/extractor/npo.py | 61 +++++++++++++++---------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 01eb54fc0..239583b5b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -25,18 +25,6 @@ class NPOIE(InfoExtractor): _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', # TODO fill in other test attributes - }, { - 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', - 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', - 'info_dict': { - 'id': 'VARA_101191800', - 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show: The best of.', - 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', - 'upload_date': '20090227', - 'duration': 2400, - }, - 'skip': 'Video gone', }, { 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', @@ -66,27 +54,21 @@ class NPOIE(InfoExtractor): url = url[:-10] url = url.rstrip('/') slug = url.split('/')[-1] - page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug) - # TODO find out what proper HTML parsing utilities are available in youtube-dl - next_data = page.split('')[0] - # TODO The data in this script tag feels like GraphQL, so there might be an easier way - # to get the product id, maybe using a GraphQL endpoint - next_data = self._parse_json(next_data, slug) - product_id, title, description, thumbnail = None, None, None, None - for query in next_data['props']['pageProps']['dehydratedState']['queries']: - if isinstance(query['state']['data'], list): - for entry in query['state']['data']: - if entry['slug'] == slug: - product_id = entry.get('productId') - title = entry.get('title') - synopsis = entry.get('synopsis', {}) - description = (synopsis.get('long') - or synopsis.get('short') - or synopsis.get('brief')) - thumbnails = entry.get('images') - for thumbnail_entry in thumbnails: - if 'url' in thumbnail_entry: - thumbnail = thumbnail_entry.get('url') + + program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', + slug, + query={'slug': slug}) + product_id = program_metadata.get('productId') + images = program_metadata.get('images') + thumbnail = None + for image in images: + thumbnail = image.get('url') + break + title = program_metadata.get('title') + descriptions = program_metadata.get('description', {}) + description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief') + duration = program_metadata.get('durationInSeconds') + if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) @@ -96,17 +78,18 @@ class NPOIE(InfoExtractor): 'id': slug, 'formats': formats, 'title': title or slug, - 'description': description, + 'description': description or title or slug, 'thumbnail': thumbnail, - # TODO fill in other metadata that's available + 'duration': duration, } def _download_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) formats = [] for profile in ( - 'dash', - # 'hls', # TODO test what needs to change for 'hls' support + 'dash', + # 'hls' is available too, but implementing it doesn't add much + # As far as I know 'dash' is always available ): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, @@ -131,6 +114,7 @@ class BNNVaraIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' _TESTS = [{ 'url': 'https://www.bnnvara.nl/videos/27455', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -170,6 +154,7 @@ class ONIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' _TESTS = [{ 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -196,6 +181,7 @@ class VPROIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' _TESTS = [{ 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + # TODO fill in other test attributes }] def _real_extract(self, url): @@ -224,6 +210,7 @@ class ZAPPIE(NPOIE): _TESTS = [{ 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', + # TODO fill in other test attributes }] def _real_extract(self, url): From 4b24e5f00da0b11f3e2989d5a568e862285d34ea Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:22:27 +0100 Subject: [PATCH 22/36] Re-add SchoolTV --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/npo.py | 42 +++++++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b3a9fdfba..5f2ac7ced 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,7 @@ from .nowness import ( NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE +from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE, SchoolTVIE from .npr import NprIE from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 239583b5b..a28915bd0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -87,9 +87,9 @@ class NPOIE(InfoExtractor): token = self._get_token(product_id) formats = [] for profile in ( - 'dash', - # 'hls' is available too, but implementing it doesn't add much - # As far as I know 'dash' is always available + 'dash', + # 'hls' is available too, but implementing it doesn't add much + # As far as I know 'dash' is always available ): stream_link = self._download_json( 'https://prod.npoplayer.nl/stream-link', video_id=slug, @@ -223,3 +223,39 @@ class ZAPPIE(NPOIE): 'title': video_id, 'formats': formats, } + + +class SchoolTVIE(NPOIE): + IE_NAME = 'schooltv' + IE_DESC = 'schooltv.nl' + _VALID_URL = r'https?://(?:www\.)?schooltv.nl/item/.*' + + _TESTS = [{ + 'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015', + # TODO fill in other test attributes + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + + build_id = 'b7eHUzAVO7wHXCopYxQhV' + + metadata_url = 'https://schooltv.nl/_next/data/' \ + + build_id \ + + '/item/' \ + + video_id + '.json' + + metadata = self._download_json(metadata_url, + video_id).get('pageProps', {}).get('data', {}) + + formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL.') + + return { + 'id': video_id, + 'title': metadata.get('title', '') + ' - ' + metadata.get('subtitle', ''), + 'description': metadata.get('description') or metadata.get('short_description'), + 'formats': formats, + } From 681b39032ae34709a74c5a4ab8f0d2275aab6880 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:32:34 +0100 Subject: [PATCH 23/36] Fix flake8 and better error reporting --- youtube_dl/extractor/npo.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a28915bd0..c4e4097e3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -166,7 +166,8 @@ class ONIE(NPOIE): formats.extend(self._download_by_product_id(result, video_id)) if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, @@ -187,14 +188,15 @@ class VPROIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] page, _ = self._download_webpage_handle(url, video_id) - results = re.findall('data-media-id="(.+_.+)"\s', page) + results = re.findall(r'data-media-id="(.+_.+)"\s', page) formats = [] for result in results: formats.extend(self._download_by_product_id(result, video_id)) break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, @@ -238,6 +240,8 @@ class SchoolTVIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] + # TODO Find out how we could obtain this automatically + # Otherwise this extractor might break each time SchoolTV deploys a new release build_id = 'b7eHUzAVO7wHXCopYxQhV' metadata_url = 'https://schooltv.nl/_next/data/' \ @@ -251,7 +255,8 @@ class SchoolTVIE(NPOIE): formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL.') + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') return { 'id': video_id, From 159f825edd6326fda7f43fb27d13db6cd2bbc4ca Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:53:37 +0100 Subject: [PATCH 24/36] Add scaffolding for last few extractors and change order so the PR diff looks nice --- youtube_dl/extractor/npo.py | 91 +++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c4e4097e3..196ab9d1b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,16 +11,7 @@ from ..utils import ExtractorError class NPOIE(InfoExtractor): IE_NAME = 'npo' IE_DESC = 'npo.nl' - _VALID_URL = r'''(?x) - (?: - https?:// - (?:www\.)? - (?: - npo\.nl/(?:[^/]+/)* - ) - ) - (?P[^/?#]+) - ''' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*' _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', @@ -176,35 +167,6 @@ class ONIE(NPOIE): } -class VPROIE(NPOIE): - IE_NAME = 'vpro' - IE_DESC = 'vpro.nl' - _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' - _TESTS = [{ - 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', - # TODO fill in other test attributes - }] - - def _real_extract(self, url): - video_id = url.rstrip('/').split('/')[-1] - page, _ = self._download_webpage_handle(url, video_id) - results = re.findall(r'data-media-id="(.+_.+)"\s', page) - formats = [] - for result in results: - formats.extend(self._download_by_product_id(result, video_id)) - break # TODO find a better solution, VPRO pages can have multiple videos embedded - - if not formats: - raise ExtractorError('Could not find a POMS product id in the provided URL, ' - 'perhaps because all stream URLs are DRM protected.') - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - class ZAPPIE(NPOIE): IE_NAME = 'zapp' IE_DESC = 'zapp.nl' @@ -264,3 +226,54 @@ class SchoolTVIE(NPOIE): 'description': metadata.get('description') or metadata.get('short_description'), 'formats': formats, } + + +class HetKlokhuisIE(NPOIE): + ... + + def _real_extract(self, url): + ... + + +class VPROIE(NPOIE): + IE_NAME = 'vpro' + IE_DESC = 'vpro.nl' + _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' + _TESTS = [{ + 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', + # TODO fill in other test attributes + }] + + def _real_extract(self, url): + video_id = url.rstrip('/').split('/')[-1] + page, _ = self._download_webpage_handle(url, video_id) + results = re.findall(r'data-media-id="(.+_.+)"\s', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + break # TODO find a better solution, VPRO pages can have multiple videos embedded + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class WNLIE(NPOIE): + ... + + def _real_extract(self, url): + ... + + +class AndereTijdenIE(NPOIE): + ... + + def _real_extract(self, url): + ... + From 0cbcd1aec656998d44dbffe59cbb0adac4b84b45 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Wed, 6 Mar 2024 12:55:51 +0100 Subject: [PATCH 25/36] Make diff better --- youtube_dl/extractor/extractors.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f2ac7ced..b1093a1ac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -847,7 +847,16 @@ from .nowness import ( NownessSeriesIE, ) from .noz import NozIE -from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE, SchoolTVIE +from .npo import ( + AndereTijdenIE, + BNNVaraIE, + NPOIE, + ONIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) from .npr import NprIE from .nrk import ( NRKIE, From 0ab79c37ae2c465678276bef0e9032efb30f464b Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 7 Mar 2024 16:23:09 +0100 Subject: [PATCH 26/36] Reusable code for two NTR sites --- youtube_dl/extractor/npo.py | 53 +++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 196ab9d1b..77411da52 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -228,11 +228,35 @@ class SchoolTVIE(NPOIE): } -class HetKlokhuisIE(NPOIE): - ... - +class NTRSubsiteIE(NPOIE): def _real_extract(self, url): - ... + video_id = url.rstrip('/').split('/')[-1] + + page, _ = self._download_webpage_handle(url) + results = re.findall(r'data-mid="(.+_.+)"', page) + formats = [] + for result in results: + formats.extend(self._download_by_product_id(result, video_id)) + break + + if not formats: + raise ExtractorError('Could not find a POMS product id in the provided URL, ' + 'perhaps because all stream URLs are DRM protected.') + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class HetKlokhuisIE(NTRSubsiteIE): + IE_NAME = 'het-klokhuis' + IE_DESC = 'hetklokhuis.nl' + _VALID_URL = r'https?://(?:www\.)?het-klokhuis\.nl/.*' + _TESTS = [{ + 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens' + }] class VPROIE(NPOIE): @@ -264,16 +288,11 @@ class VPROIE(NPOIE): } -class WNLIE(NPOIE): - ... - - def _real_extract(self, url): - ... - - -class AndereTijdenIE(NPOIE): - ... - - def _real_extract(self, url): - ... - +class AndereTijdenIE(NTRSubsiteIE): + IE_NAME = 'anderetijden' + IE_DESC = 'anderetijden.nl' + _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/.*' + _TESTS = [{ + 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem' + # TODO fill in other test attributes + }] From c08f29f45b6b7f41127c8d9260617de7d69430f9 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 16:27:40 +0100 Subject: [PATCH 27/36] Update unit tests --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/npo.py | 37 ++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1093a1ac..e5c9af8ba 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -855,7 +855,6 @@ from .npo import ( SchoolTVIE, HetKlokhuisIE, VPROIE, - WNLIE, ) from .npr import NprIE from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 77411da52..f5f748573 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -15,17 +15,24 @@ class NPOIE(InfoExtractor): _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', - # TODO fill in other test attributes + 'md5': 'f9ce9c43cc8bc3b8138df1562b99c379', + 'info_dict': { + 'description': 'Wie is de mol? (2)', + 'ext': 'm4v', + 'duration': 2439, + 'id': 'wie-is-de-mol-2', + 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', + 'title': 'Wie is de mol? (2)' + } }, { 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'md5': 'c84d054219c4888ed53b4ee3d01b2d93', 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - 'duration': 3000, + 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', + 'title': 'Zwart geld: de toekomst komt uit Afrika', + 'description': 'Zwart geld: de toekomst komt uit Afrika', + 'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg', + 'duration': 3000 }, }] @@ -105,7 +112,12 @@ class BNNVaraIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' _TESTS = [{ 'url': 'https://www.bnnvara.nl/videos/27455', - # TODO fill in other test attributes + 'md5': '392dd367877739e49b9e0a9a550b178a', + 'info_dict': { + 'id': 'VARA_101369808', + 'thumbnail': 'https://media.vara.nl/files/thumbnails/321291_custom_zembla__wie_is_de_mol_680x383.jpg', + 'title': 'Zembla - Wie is de mol?' + } }] def _real_extract(self, url): @@ -265,7 +277,12 @@ class VPROIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*' _TESTS = [{ 'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html', - # TODO fill in other test attributes + 'md5': 'cf302e066b5313cfaf8d5adf50d64f13', + 'info_dict': { + 'id': 'offline-als-luxe.html', + 'title': 'offline-als-luxe.html', + 'ext': 'm4v', + } }] def _real_extract(self, url): From 28624cfe0930655b815f40d4b4820f76728de65e Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 16:57:31 +0100 Subject: [PATCH 28/36] Work work --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/npo.py | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e5c9af8ba..1a1905d13 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -855,6 +855,7 @@ from .npo import ( SchoolTVIE, HetKlokhuisIE, VPROIE, + ZAPPIE, ) from .npr import NprIE from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f5f748573..699eedf12 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -208,7 +208,12 @@ class SchoolTVIE(NPOIE): _TESTS = [{ 'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015', - # TODO fill in other test attributes + 'md5': 'e9ef151c4886994e2bea23593348cb14', + 'info_dict': { + 'id': 'zapp-music-challenge-2015-zapp-music-challenge-2015', + 'title': 'Zapp Music Challenge 2015 - Alain Clark & Yaell', + 'description': "Een nummer schrijven met de super bekende soulzanger en producer Alain Clark? Dat is de uitdaging voor de dertienjarige Yaell uit Delft. En als het dan echt goed is, mag hij het ook nog eens live gaan spelen op de speelplaats bij Giel Beelen! Muziek is heel erg belangrijk in het leven van Yaell. 'Als er geen muziek zou zijn, dan zou ik heel veel niet kunnen.' Hij is dan ook altijd aan het schrijven, vaak over zijn eigen leven. Maar soms is het best lastig om die teksten te verzinnen. Vindt hij de inspiratie om een hit te maken met Alain?" + }, }] def _real_extract(self, url): @@ -244,7 +249,7 @@ class NTRSubsiteIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - page, _ = self._download_webpage_handle(url) + page, _ = self._download_webpage_handle(url, video_id) results = re.findall(r'data-mid="(.+_.+)"', page) formats = [] for result in results: @@ -263,11 +268,16 @@ class NTRSubsiteIE(NPOIE): class HetKlokhuisIE(NTRSubsiteIE): - IE_NAME = 'het-klokhuis' + IE_NAME = 'hetklokhuis' IE_DESC = 'hetklokhuis.nl' - _VALID_URL = r'https?://(?:www\.)?het-klokhuis\.nl/.*' + _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/.*' _TESTS = [{ - 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens' + 'url': 'https://hetklokhuis.nl/dossier/142/zoek-het-uit/tv-uitzending/2987/aliens', + 'md5': '4664b54ed4e05183b1e4f2f4290d551e', + 'info_dict': { + 'id': 'aliens', + 'title': 'aliens' + } }] @@ -310,6 +320,10 @@ class AndereTijdenIE(NTRSubsiteIE): IE_DESC = 'anderetijden.nl' _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/.*' _TESTS = [{ - 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem' - # TODO fill in other test attributes + 'url': 'https://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'md5': '3d607b16e00b459156b4ab6e163dccd7', + 'info_dict': { + 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem' + } }] From 1ca4e686a3f9001cb52c8b682b57c1fba65700db Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Sun, 10 Mar 2024 17:04:00 +0100 Subject: [PATCH 29/36] Add an MD5 --- youtube_dl/extractor/npo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 699eedf12..f4cd137ff 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -157,7 +157,10 @@ class ONIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' _TESTS = [{ 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', - # TODO fill in other test attributes + 'md5': 'a85ebd50fa86fe5cbce654655f7dbb12', + 'info_dict': { + + } }] def _real_extract(self, url): From 4398f6832f76948ee79025f0e055117182d1dfb3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 11 Mar 2024 13:40:23 +0100 Subject: [PATCH 30/36] Fix zapp extractor --- youtube_dl/extractor/npo.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f4cd137ff..a5413a1d7 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -99,7 +99,8 @@ class NPOIE(InfoExtractor): headers={ 'Authorization': token, 'Content-Type': 'application/json', - } + }, + fatal=False, ) stream_url = stream_link.get('stream', {}).get('streamURL') formats.extend(self._extract_mpd_formats(stream_url, slug, mpd_id='dash', fatal=False)) @@ -188,14 +189,18 @@ class ZAPPIE(NPOIE): _VALID_URL = r'https?://(?:www\.)?zapp.nl/.*' _TESTS = [{ - 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', - # TODO fill in other test attributes + 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/POMS_AT_811523', + 'md5': '9eb2d8b6f88b72b6b986ea2c26a81588', + 'info_dict': { + 'id': 'POMS_AT_811523', + 'title': 'POMS_AT_811523', + }, }] def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - formats = self._download_by_product_id(url, video_id) + formats = self._download_by_product_id(video_id, video_id, url=url) return { 'id': video_id, @@ -279,8 +284,8 @@ class HetKlokhuisIE(NTRSubsiteIE): 'md5': '4664b54ed4e05183b1e4f2f4290d551e', 'info_dict': { 'id': 'aliens', - 'title': 'aliens' - } + 'title': 'aliens', + }, }] @@ -295,7 +300,7 @@ class VPROIE(NPOIE): 'id': 'offline-als-luxe.html', 'title': 'offline-als-luxe.html', 'ext': 'm4v', - } + }, }] def _real_extract(self, url): @@ -327,6 +332,6 @@ class AndereTijdenIE(NTRSubsiteIE): 'md5': '3d607b16e00b459156b4ab6e163dccd7', 'info_dict': { 'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', - 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem' - } + 'title': 'Duitse-soldaten-over-de-Slag-bij-Arnhem', + }, }] From 58d7a00e3f07744b65ad53d12fcee1ec0050de74 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Mon, 11 Mar 2024 14:14:38 +0100 Subject: [PATCH 31/36] Resolve some of the pull request feedback --- youtube_dl/extractor/npo.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a5413a1d7..ea1e0fd2b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -18,8 +18,8 @@ class NPOIE(InfoExtractor): 'md5': 'f9ce9c43cc8bc3b8138df1562b99c379', 'info_dict': { 'description': 'Wie is de mol? (2)', - 'ext': 'm4v', 'duration': 2439, + 'ext': 'm4v', 'id': 'wie-is-de-mol-2', 'thumbnail': 'https://assets-start.npo.nl/resources/2023/07/01/e723c3cf-3e42-418a-9ba5-f6dbb64b516a.jpg', 'title': 'Wie is de mol? (2)' @@ -30,6 +30,7 @@ class NPOIE(InfoExtractor): 'info_dict': { 'id': 'zwart-geld-de-toekomst-komt-uit-afrika', 'title': 'Zwart geld: de toekomst komt uit Afrika', + 'ext': 'mp4', 'description': 'Zwart geld: de toekomst komt uit Afrika', 'thumbnail': 'https://assets-start.npo.nl/resources/2023/06/30/d9879593-1944-4249-990c-1561dac14d8e.jpg', 'duration': 3000 @@ -70,7 +71,7 @@ class NPOIE(InfoExtractor): if not product_id: raise ExtractorError('No productId found for slug: %s' % slug) - formats = self._download_by_product_id(product_id, slug, url) + formats = self._extract_formats_by_product_id(product_id, slug, url) return { 'id': slug, @@ -81,7 +82,7 @@ class NPOIE(InfoExtractor): 'duration': duration, } - def _download_by_product_id(self, product_id, slug, url=None): + def _extract_formats_by_product_id(self, product_id, slug, url=None): token = self._get_token(product_id) formats = [] for profile in ( @@ -93,7 +94,6 @@ class NPOIE(InfoExtractor): 'https://prod.npoplayer.nl/stream-link', video_id=slug, data=json.dumps({ 'profileName': profile, - 'drmType': 'widevine', 'referrerUrl': url or '', }).encode('utf8'), headers={ @@ -117,7 +117,8 @@ class BNNVaraIE(NPOIE): 'info_dict': { 'id': 'VARA_101369808', 'thumbnail': 'https://media.vara.nl/files/thumbnails/321291_custom_zembla__wie_is_de_mol_680x383.jpg', - 'title': 'Zembla - Wie is de mol?' + 'title': 'Zembla - Wie is de mol?', + 'ext': 'mp4', } }] @@ -142,7 +143,7 @@ class BNNVaraIE(NPOIE): }) product_id = media.get('data', {}).get('player', {}).get('pomsProductId') - formats = self._download_by_product_id(product_id, video_id) + formats = self._extract_formats_by_product_id(product_id, video_id) return { 'id': product_id, @@ -170,7 +171,7 @@ class ONIE(NPOIE): results = re.findall("page: '(.+)'", page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL, ' @@ -200,7 +201,7 @@ class ZAPPIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] - formats = self._download_by_product_id(video_id, video_id, url=url) + formats = self._extract_formats_by_product_id(video_id, video_id, url=url) return { 'id': video_id, @@ -239,7 +240,7 @@ class SchoolTVIE(NPOIE): metadata = self._download_json(metadata_url, video_id).get('pageProps', {}).get('data', {}) - formats = self._download_by_product_id(metadata.get('poms_mid'), video_id) + formats = self._extract_formats_by_product_id(metadata.get('poms_mid'), video_id) if not formats: raise ExtractorError('Could not find a POMS product id in the provided URL, ' @@ -261,7 +262,7 @@ class NTRSubsiteIE(NPOIE): results = re.findall(r'data-mid="(.+_.+)"', page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) break if not formats: @@ -309,7 +310,7 @@ class VPROIE(NPOIE): results = re.findall(r'data-media-id="(.+_.+)"\s', page) formats = [] for result in results: - formats.extend(self._download_by_product_id(result, video_id)) + formats.extend(self._extract_formats_by_product_id(result, video_id)) break # TODO find a better solution, VPRO pages can have multiple videos embedded if not formats: From ad64f3751e74c5ee2bbe45a6d5110813dbdd77f3 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:34:33 +0100 Subject: [PATCH 32/36] Improve regex Co-authored-by: Roy --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index ea1e0fd2b..27582ae9f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -307,7 +307,7 @@ class VPROIE(NPOIE): def _real_extract(self, url): video_id = url.rstrip('/').split('/')[-1] page, _ = self._download_webpage_handle(url, video_id) - results = re.findall(r'data-media-id="(.+_.+)"\s', page) + results = re.findall(r'data-media-id="([a-zA-Z0-9_]+)"\s', page) formats = [] for result in results: formats.extend(self._extract_formats_by_product_id(result, video_id)) From bc86c5f73b189a3ab5caa0f63d62ed8e3b70d741 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:37:41 +0100 Subject: [PATCH 33/36] Make regex more specific and remove redundant .* --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 27582ae9f..4651e6868 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,7 +11,7 @@ from ..utils import ExtractorError class NPOIE(InfoExtractor): IE_NAME = 'npo' IE_DESC = 'npo.nl' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/start/serie/' _TESTS = [{ 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', From 4c90b2f5875593af17dff13f96b8b05791f64a21 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:39:59 +0100 Subject: [PATCH 34/36] Adhere to code style Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4651e6868..4a70e251b 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -55,8 +55,7 @@ class NPOIE(InfoExtractor): slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', - slug, - query={'slug': slug}) + slug, query={'slug': slug}) product_id = program_metadata.get('productId') images = program_metadata.get('images') thumbnail = None From 007bbeacd78e0d158f684b5a8833d6425a0312f9 Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Thu, 14 Mar 2024 13:41:01 +0100 Subject: [PATCH 35/36] Remove afspelen and trailing slashes with one regex Co-authored-by: dirkf --- youtube_dl/extractor/npo.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 4a70e251b..545e58509 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -44,14 +44,8 @@ class NPOIE(InfoExtractor): note='Downloading token')['token'] def _real_extract(self, url): - # You might want to use removesuffix here, - # but removesuffix is introduced in Python 3.9 - # and youtube-dl supports Python 3.2+ - if url.endswith('/afspelen'): - url = url[:-9] - elif url.endswith('/afspelen/'): - url = url[:-10] - url = url.rstrip('/') + # Remove /afspelen and/or any trailing `/`s + url = re.sub(r'/(?:afspelen)?/*$', '', url) slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail', From a60972e253dfe88c81601eaa2e2899afbc4c29fd Mon Sep 17 00:00:00 2001 From: Bart Broere Date: Fri, 15 Mar 2024 13:02:56 +0100 Subject: [PATCH 36/36] Fix indent from suggestion --- youtube_dl/extractor/npo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 545e58509..4dbab16ab 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -44,8 +44,8 @@ class NPOIE(InfoExtractor): note='Downloading token')['token'] def _real_extract(self, url): - # Remove /afspelen and/or any trailing `/`s - url = re.sub(r'/(?:afspelen)?/*$', '', url) + # Remove /afspelen and/or any trailing `/`s + url = re.sub(r'/(?:afspelen)?/*$', '', url) slug = url.split('/')[-1] program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail',