From 953fce852fa2cceb8ecf257212099da68860698a Mon Sep 17 00:00:00 2001 From: Marcin Biczan Date: Wed, 5 Apr 2023 21:54:58 +0200 Subject: [PATCH 01/15] TVPapp extractor added :: init --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tvp.py | 48 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3a87f9e33..8d7f32bdb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1371,6 +1371,7 @@ from .tvp import ( TVPEmbedIE, TVPIE, TVPWebsiteIE, + TVPappIE, ) from .tvplay import ( TVPlayIE, diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index accff75b5..27c004946 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import itertools import re +import json from .common import InfoExtractor from ..utils import ( @@ -250,3 +251,50 @@ class TVPWebsiteIE(InfoExtractor): display_id, playlist_id = mobj.group('display_id', 'id') return self.playlist_result( self._entries(display_id, playlist_id), playlist_id) + + +class TVPappIE(InfoExtractor): + IE_NAME = 'tvp:app' + _VALID_URL = r'https://vod.tvp.pl/[^/]+/[^,]+,(?P[0-9]+)/[^,]+,[^,]+,(?P[0-9]+)' + _TESTS = [{ + # series + 'url': 'https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532', + 'info_dict': { + 'id': '392532', + 'ext': 'mp4', + + 'series_id': 292227, + 'title': 'Korona królów. Jagiellonowie - Episode 37 - odc. 37 – Branka', + 'description': 'Wiosna 1407. Anna z Goraja znika z Wawelu. Jej matka - Beata z Bożego Daru rozpacza. Anna Cylejska zarządza poszukiwania, a Sofia tajemniczo się uśmiecha. Elżbieta i Katarzyna Gorajskie wyjawiają, że ich siostra pojechała do klasztoru klarysek w Sączu. Na Wawel dociera wieść o śmierci Wielkiego Mistrza Konrada von Jungingena. Rozpoczyna się walka o władzę w Malborku. Jan Falkenberg ma list do papieża, w którym wyjawia sekret Trąby. Bdzigost i Ciołek zdobywają to pismo. Król Jagiełło poznaje wreszcie sekret Mikołaja Trąby i znajduje sposób, by go ochronić.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 37, + }, + 'params': { + 'skip_download': True, + 'format': 'dash-f1-v1-x3', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'https://vod.tvp.pl/api/products/vods/%s?lang=pl&platform=BROWSER' + url_p = 'https://vod.tvp.pl/api/products/%s/videos/playlist?platform=BROWSER&videoType=MOVIE' + + _details = self._download_json(url % video_id, video_id, 'Downloading details') + _playlist = self._download_json(url_p % video_id, video_id, 'Downloading playlist') + + formats = [] + formats.extend(self._extract_mpd_formats(_playlist['sources']['DASH'][0]['src'], video_id, mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'duraton': _details['duration'], + + 'series_id': _details['season']['serial']['id'], + 'title': _details['statisticsParameters']['gemiusVideo']['fullTitle'], + 'description': _details['description'], + 'thumbnail': _details['images']['16x9'][0]['url'], + 'episode_number': _details['number'], + } + From b08dc56f465d0dd055dba73129ff29e3d51e2607 Mon Sep 17 00:00:00 2001 From: Marcin Biczan Date: Wed, 5 Apr 2023 22:03:19 +0200 Subject: [PATCH 02/15] json uneeded --- youtube_dl/extractor/tvp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 27c004946..22f71588d 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import itertools import re -import json from .common import InfoExtractor from ..utils import ( From 4b6bef45b5b0a396ac86e1081894fd87d35110f0 Mon Sep 17 00:00:00 2001 From: Marcin Biczan Date: Thu, 6 Apr 2023 21:25:00 +0200 Subject: [PATCH 03/15] added support for full offer --- youtube_dl/extractor/tvp.py | 104 +++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 22f71588d..40b284057 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -254,25 +254,87 @@ class TVPWebsiteIE(InfoExtractor): class TVPappIE(InfoExtractor): IE_NAME = 'tvp:app' - _VALID_URL = r'https://vod.tvp.pl/[^/]+/[^,]+,(?P[0-9]+)/[^,]+,[^,]+,(?P[0-9]+)' - _TESTS = [{ - # series - 'url': 'https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532', - 'info_dict': { - 'id': '392532', - 'ext': 'mp4', + # https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532 + # https://vod.tvp.pl/programy,88/korona-krolow-taka-historia-odcinki,283653/odcinek-1,S02E01,386064 + # https://vod.tvp.pl/filmy-dokumentalne,163/jurek,376503 + # https://vod.tvp.pl/dla-dzieci,24/zubr-pompik-odcinki,282014/odcinek-1,S01E01,319853 + # https://vod.tvp.pl/teatr-telewizji,202/barwy-uczuc,392420 + # https://vod.tvp.pl/filmy-fabularne,136/rozlam,390638 + _VALID_URL = r'https://vod.tvp.pl/(seriale|filmy-fabularne|programy|filmy-dokumentalne|dla-dzieci|teatr-telewizji|informacje-i-publicystyka),[0-9]+/([^,]+,[0-9]+/)?[^,]+,([^,]+,)?(?P[0-9]+)' + _TESTS = [ + { + # series + 'url': 'https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532', + 'info_dict': { + 'id': '392532', + 'ext': 'mp4', - 'series_id': 292227, - 'title': 'Korona królów. Jagiellonowie - Episode 37 - odc. 37 – Branka', - 'description': 'Wiosna 1407. Anna z Goraja znika z Wawelu. Jej matka - Beata z Bożego Daru rozpacza. Anna Cylejska zarządza poszukiwania, a Sofia tajemniczo się uśmiecha. Elżbieta i Katarzyna Gorajskie wyjawiają, że ich siostra pojechała do klasztoru klarysek w Sączu. Na Wawel dociera wieść o śmierci Wielkiego Mistrza Konrada von Jungingena. Rozpoczyna się walka o władzę w Malborku. Jan Falkenberg ma list do papieża, w którym wyjawia sekret Trąby. Bdzigost i Ciołek zdobywają to pismo. Król Jagiełło poznaje wreszcie sekret Mikołaja Trąby i znajduje sposób, by go ochronić.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': 37, - }, - 'params': { - 'skip_download': True, - 'format': 'dash-f1-v1-x3', - } - }] + 'series_id': 292227, + 'title': 'Korona królów. Jagiellonowie - Episode 37 - odc. 37 – Branka', + 'description': 'Wiosna 1407. Anna z Goraja znika z Wawelu. Jej matka - Beata z Bożego Daru rozpacza. Anna Cylejska zarządza poszukiwania, a Sofia tajemniczo się uśmiecha. Elżbieta i Katarzyna Gorajskie wyjawiają, że ich siostra pojechała do klasztoru klarysek w Sączu. Na Wawel dociera wieść o śmierci Wielkiego Mistrza Konrada von Jungingena. Rozpoczyna się walka o władzę w Malborku. Jan Falkenberg ma list do papieża, w którym wyjawia sekret Trąby. Bdzigost i Ciołek zdobywają to pismo. Król Jagiełło poznaje wreszcie sekret Mikołaja Trąby i znajduje sposób, by go ochronić.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 37, + }, + 'params': { + 'skip_download': True, + 'format': 'dash-f1-v1-x3', + } + }, + { + # programs + 'url': 'https://vod.tvp.pl/programy,88/korona-krolow-taka-historia-odcinki,283653/odcinek-1,S02E01,386064', + 'info_dict': { + 'id': '386064', + 'ext': 'mp4', + + 'series_id': 283653, + 'title': 'Korona królów. Taka historia... - Episode 1 - odc. 1 – Jagiełło: życie po Jadwidze', + 'description': 'Jogaiła był Wielkim Księciem Litewskim, synem Olgierda i wnukiem Giedymina. W 1386 roku został polskim królem, bowiem został mężem polskiej monarchini – Jadwigi Andegaweńskiej. Dwa trony i dwóch władców mających pełnię władzy. Kiedy w 1399 roku Jadwiga zmarła, najważniejsi polscy możni za potwierdzenie praw Jagiełły do korony zapragnęli więcej wpływów na władzę, w następstwie czego Władysław Jagiełło postanowił zrzec się tronu i wrócić na Litwę. Zaczęły się rozmowy i negocjacje. Obu stronom zależało, żeby król pozostał królem. Możni zaproponowali Jagielle kandydatkę na nową żonę – Annę Cylejską. Dziewczyna była wnuczką Kazimierza Wielkiego, więc uznano ją za prawowitą dziedziczkę polskiego tronu, czym miała wzmacniać rolę Jagiełły. Przeprowadzono powtórną elekcję i ustanowiono warunki współpracy z Litwą.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + 'format': 'dash-f1-v1-x3', + } + }, + { + # filmy-dokumentalne + 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/jurek,376503', + 'info_dict': { + 'id': '376503', + 'ext': 'mp4', + + 'series_id': None, + 'title': 'Jurek', + 'description': 'Obraz człowieka, który pnie się do góry w sensie dosłownym, ale też metaforycznym i symbolicznym. Od socjalistycznego pracownika, po gwiazdę międzynarodowych mediów, od człowieka, który wspina się bez pieniędzy i sprzętu, po pełnoprawnego konkurenta Reinholda Messnera w walce o zdobycie Korony Himalajów i Karakorum. Skromnego, wyrazistego, kochającego góry. Rozmowy z rodziną i przyjaciółmi, archiwalia, zdjęcia, nagrania, fragmenty programów telewizyjnych i wywiadów składają się na portret całego środowiska himalaistów lat 80. To również obraz czasów, w jakich żyli – ciężkich i barwnych jednocześnie, kiedy idealizm miał większą wartość niż sława.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': None, + }, + 'params': { + 'skip_download': True, + 'format': 'dash-f1-v1-x3', + } + }, + { + # dla-dzieci + 'url': 'https://vod.tvp.pl/dla-dzieci,24/zubr-pompik-odcinki,282014/odcinek-1,S01E01,319853', + 'info_dict': { + 'id': '319853', + 'ext': 'mp4', + + 'series_id': 282014, + 'title': 'Żubr Pompik - Episode 1 - Odc. 1 – Duże i małe', + 'description': 'W głębinach wielkiej, dzikiej, zielonej puszczy mieszka żubr Pompik. Jest niewielki, do tego nie tak silny, szybki i skoczny, jak jego rówieśnicy. Początkowo bardzo się tym martwi. A co, jeżeli nigdy nie urośnie? Odkrywa jednak, że ma wiele innych zalet! Jest najbardziej ciekawskim ze zwierząt w puszczy. Interesuje go wszystko, co dzieje się dookoła. Nie ma takiej leśnej tajemnicy, której Pompik by nie rozwiązał dzięki uważnej obserwacji, cierpliwości i zadawaniu mnóstwa pytań.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + 'format': 'dash-f1-v1-x3', + } + }, +] def _real_extract(self, url): video_id = self._match_id(url) @@ -282,6 +344,8 @@ class TVPappIE(InfoExtractor): _details = self._download_json(url % video_id, video_id, 'Downloading details') _playlist = self._download_json(url_p % video_id, video_id, 'Downloading playlist') + # import json + # print(json.dumps(_playlist, indent = 2)) formats = [] formats.extend(self._extract_mpd_formats(_playlist['sources']['DASH'][0]['src'], video_id, mpd_id='dash', fatal=False)) @@ -290,10 +354,10 @@ class TVPappIE(InfoExtractor): 'formats': formats, 'duraton': _details['duration'], - 'series_id': _details['season']['serial']['id'], + 'series_id': _details['season']['serial']['id'] if 'season' in _details else None, 'title': _details['statisticsParameters']['gemiusVideo']['fullTitle'], 'description': _details['description'], 'thumbnail': _details['images']['16x9'][0]['url'], - 'episode_number': _details['number'], + 'episode_number': _details['number'] if 'number' in _details else None, } From efedc80dafcdcc75cc8c9187fac8455b635e792e Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 9 May 2023 16:34:41 +0100 Subject: [PATCH 04/15] Update extractors.py [skip ci] --- youtube_dl/extractor/extractors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d7f32bdb..cced047b2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1370,8 +1370,9 @@ from .tvnow import ( from .tvp import ( TVPEmbedIE, TVPIE, - TVPWebsiteIE, - TVPappIE, + TVPStreamIE, + TVPVODSeriesIE, + TVPVODVideoIE, ) from .tvplay import ( TVPlayIE, From 6e827118cbb738ab207aa37151554a13290dc8e2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 9 May 2023 16:44:13 +0100 Subject: [PATCH 05/15] Update tvp.py from yt-dlp * pull changes from https://github.com/yt-dlp/yt-dlp/pull/6989, thanks selfisekai * use `traverse_obj()` for safer extraction * fix tests that are not blocked from UK Co-authored-by: selfisekai --- youtube_dl/extractor/tvp.py | 914 ++++++++++++++++++++++++++---------- 1 file changed, 676 insertions(+), 238 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 40b284057..1ad585e37 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -2,52 +2,259 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor +from ..compat import ( + compat_str, +) from ..utils import ( clean_html, determine_ext, ExtractorError, - get_element_by_attribute, - orderedSet, + int_or_none, + js_to_json, + traverse_obj, + txt_or_none, + url_or_none, ) +if not hasattr(InfoExtractor, '_match_valid_url'): + + import sys + from ..compat import ( + compat_os_name, + compat_re_Pattern as compiled_regex_type, + ) + from ..utils import ( + bug_reports_message, + error_to_compat_str, + NO_DEFAULT, + RegexNotFoundError, + ) + + BaseIE = InfoExtractor + + class InfoExtractor(BaseIE): + def _match_valid_url(self, url): + return re.match(self._VALID_URL, url) + + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + RegexNotFoundError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: + break + + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): + _name = '\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif isinstance(group, (list, tuple)): + return tuple(mobj.group(g) for g in group) + else: + return mobj.group(group) + elif default is not NO_DEFAULT: + return default + elif fatal: + raise RegexNotFoundError('Unable to extract %s' % _name) + else: + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) + return None + + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags, group) + if isinstance(res, tuple): + return tuple(map(clean_html, res)) + return clean_html(res or None) + + def _search_json(self, start_pattern, string, name, video_id, **kwargs): + """Searches string for the JSON object specified by start_pattern""" + + # self, start_pattern, string, name, video_id, *, end_pattern='', + # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT + end_pattern = kwargs.pop('end_pattern', '') + contains_pattern = kwargs.pop('contains_pattern', r'{(?:[\s\S]+)}') + fatal = kwargs.get('fatal', True) + default = kwargs.get('default', NO_DEFAULT) + + # NB: end_pattern is only used to reduce the size of the initial match + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + r'(?:{0})\s*(?P{1})\s*(?:{2})'.format( + start_pattern, contains_pattern, end_pattern), + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + try: + # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + return self._parse_json(json_string, video_id, **kwargs) + except ExtractorError as e: + msg = 'Unable to extract {0} - Failed to parse JSON'.format(name) + if fatal: + raise ExtractorError(msg, cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id) + return default + + class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P\d+)' - + _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com|tvpworld\.com|swipeto\.pl)/(?:(?:(?!\d+/)[^/]+/)*|(?:video|website)/[^/]+,)(?P\d+)' _TESTS = [{ + # TVPlayer 2 in js wrapper + 'url': 'https://swipeto.pl/64095316/uliczny-foxtrot-wypozyczalnia-kaset-kto-pamieta-dvdvideo', + 'info_dict': { + 'id': '64095316', + 'ext': 'mp4', + 'title': 'Uliczny Foxtrot — Wypożyczalnia kaset. Kto pamięta DVD-Video?', + 'age_limit': 0, + 'duration': 374, + 'thumbnail': r're:https://.+', + }, + 'expected_warnings': [ + 'Failed to download ISM manifest: HTTP Error 404: Not Found', + 'Failed to download m3u8 information: HTTP Error 404: Not Found', + ], + 'skip': 'Video gone: 404 Nie znaleziono obiektu', + }, { + # TVPlayer 2 in js wrapper (redirect to VodVideo) 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 13 – Władek', - 'description': 'md5:437f48b93558370b031740546b696e24', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, }, + 'add_ie': ['Generic', 'TVPEmbed'], }, { - 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'b0005b542e5b4de643a9690326ab1257', + # film (old format) + 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466', 'info_dict': { - 'id': '17916176', + 'id': '51374509', 'ext': 'mp4', - 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', - 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie', + 'description': 'md5:2e80823f00f5fc263555482f76f8fa42', + 'age_limit': 12, }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['TVPEmbed'], + 'skip': 'This video is not available from your location due to geo restriction', }, { + # TVPlayer legacy + 'url': 'https://www.tvp.pl/polska-press-video-uploader/wideo/62042351', + 'info_dict': { + 'id': '62042351', + 'ext': 'mp4', + 'title': 'Wideo', + 'description': 'Wideo Kamera', + 'duration': 24, + 'age_limit': 0, + 'thumbnail': r're:https://.+', + }, + 'add_ie': ['TVPEmbed'], + }, { + # TVPlayer 2 in iframe # page id is not the same as video id(#7799) - 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', - 'md5': '84cd3c8aec4840046e5ab712416b73d0', + 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow', + 'md5': 'd35fb45103802488fcb7470e411b9ed4', 'info_dict': { - 'id': '33908820', + 'id': '50725617', 'ext': 'mp4', - 'title': 'Wiadomości, 28.09.2017, 19:30', - 'description': 'Wydanie główne codziennego serwisu informacyjnego.' + 'title': 'Dzieci na sprzedaż dla homoseksualistów', + 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590', + 'age_limit': 12, + 'duration': 259, + 'thumbnail': r're:https://.+', }, - 'skip': 'HTTP Error 404: Not Found', + 'add_ie': ['TVPEmbed'], + }, { + # TVPlayer 2 in client-side rendered website (regional; window.__newsData) + 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo', + 'info_dict': { + 'id': '25804446', + 'ext': 'mp4', + 'title': 'Studio Yayo', + 'upload_date': '20160616', + 'timestamp': 1466075700, + 'age_limit': 0, + 'duration': 20, + 'thumbnail': r're:https://.+', + }, + 'add_ie': ['TVPEmbed'], + 'skip': 'Video is geo restricted', + }, { + # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData) + 'url': 'https://www.tvp.info/52880236/09042021-0800', + 'info_dict': { + 'id': '52880236', + 'ext': 'mp4', + 'title': '09.04.2021, 08:00', + 'age_limit': 0, + 'thumbnail': r're:https://.+', + }, + 'add_ie': ['TVPEmbed'], + 'skip': 'Video is geo restricted', + }, { + # client-side rendered (regional) program (playlist) page + 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia', + 'info_dict': { + 'id': '9660819', + 'description': 'Od poniedziałku do piątku o 18:55', + 'title': 'Rozmowa dnia', + }, + 'playlist_mincount': 1800, + 'params': { + 'skip_download': True, + } + }, { + # ABC-specific video embeding + # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450 + 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124', + 'info_dict': { + 'id': '48320456', + 'ext': 'mp4', + 'title': 'Teleranek, Żubr', + }, + 'skip': 'Video gone: Nie znaleziono obiektu', + }, { + # yet another vue page + 'url': 'https://jp2.tvp.pl/46925618/filmy', + 'info_dict': { + 'id': '46925618', + 'title': 'Filmy', + }, + 'playlist_mincount': 19, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -66,31 +273,209 @@ class TVPIE(InfoExtractor): }, { 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 'only_matching': True, + }, { + 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej', + 'only_matching': True, + }, { + 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277', + 'only_matching': True, + }, { + 'url': 'https://tvpworld.com/48583640/tescos-polish-business-bought-by-danish-chain-netto', + 'only_matching': True, + }, { + 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm', + 'only_matching': True, }] + def _parse_vue_website_data(self, webpage, page_id): + website_data = self._search_regex([ + # website - regiony, tvp.info + # directory - jp2.tvp.pl + r'window\s*\.\s*__(?:website|directory)Data\s*=\s*({[\s\S]+?});', + ], webpage, 'website data') + if not website_data: + return None + return self._parse_json(website_data, page_id, transform_source=js_to_json) + + def _extract_vue_video(self, video_data, page_id=None): + if isinstance(video_data, compat_str): + video_data = self._parse_json(video_data, page_id, transform_source=js_to_json) + video_id = txt_or_none(video_data.get('_id')) or page_id + if not video_id: + return + is_website = video_data.get('type') == 'website' + if is_website: + url = video_data['url'] + fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url) + if fucked_up_url_parts: + url = 'https://vod.tvp.pl/website/' + ','.join(fucked_up_url_parts.group(2, 1)) + else: + url = 'tvp:' + video_id + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': url, + 'ie_key': (TVPIE if is_website else TVPEmbedIE).ie_key(), + 'title': txt_or_none(video_data.get('title')), + 'description': txt_or_none(video_data.get('lead')), + 'timestamp': int_or_none(video_data.get('release_date_long')), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': traverse_obj(video_data, ('image', (None, Ellipsis), 'url'), expected_type=url_or_none) or None, + } + + def _handle_vuejs_page(self, url, webpage, page_id): + # vue client-side rendered sites (all regional pages + tvp.info) + video_data = self._search_regex([ + r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;', + ], webpage, 'video data', default=None) + if video_data: + video_data = self._extract_vue_video(video_data, page_id=page_id) + if video_data: + return self._extract_vue_video(video_data, page_id=page_id) + else: + # paged playlists + website_data = self._parse_vue_website_data(webpage, page_id) + if website_data: + entries = self._vuejs_entries(url, website_data, page_id) + + return { + '_type': 'playlist', + 'id': page_id, + 'title': txt_or_none(website_data.get('title')), + 'description': txt_or_none(website_data.get('lead')), + 'entries': entries, + } + raise ExtractorError('Could not extract video/website data') + + def _vuejs_entries(self, url, website_data, page_id): + + def extract_videos(wd): + for video in traverse_obj(wd, (None, ('latestVideo', (('videos', 'items'), Ellipsis)))): + video = self._extract_vue_video(video) + if video: + yield video + + for from_ in extract_videos(website_data): + yield from_ + + if website_data.get('items_total_count') > website_data.get('items_per_page'): + for page in itertools.count(2): + page_website_data = self._parse_vue_website_data( + self._download_webpage(url, page_id, note='Downloading page #%d' % page, + query={'page': page}), + page_id) + if not page_website_data.get('videos') and not page_website_data.get('items'): + break + for from_ in extract_videos(page_website_data): + yield from_ + def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex([ + webpage, urlh = self._download_webpage_handle(url, page_id, expected_status=404) + + # The URL may redirect to a VOD + # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii + for ie_cls in (TVPVODSeriesIE, TVPVODVideoIE): + if ie_cls.suitable(urlh.url): + return self.url_result(urlh.url, ie=ie_cls.ie_key(), video_id=page_id) + + if re.search( + r'window\s*\.\s*__(?:video|news|website|directory)Data\s*=', + webpage): + return self._handle_vuejs_page(url, webpage, page_id) + + # classic server-side rendered sites + video_id = self._search_regex(( + r']+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)', r']+src="[^"]*?object_id=(\d+)', r"object_id\s*:\s*'(\d+)'", - r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + r'data-video-id="(\d+)"', + # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video? + # the first one is referenced to as "copyid", and seems to be unused by the website + r'', + ), webpage, 'video id', default=page_id) return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, 'description': self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None), + webpage, default=None) or (self._html_search_meta( + 'description', webpage, default=None) + if '//s.tvp.pl/files/portal/v' in webpage else None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } +class TVPStreamIE(InfoExtractor): + IE_NAME = 'tvp:stream' + _VALID_URL = r'(?:tvpstream:|https?://(?:tvpstream\.vod|stream)\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P\d*)' + _TESTS = [{ + 'url': 'https://stream.tvp.pl/?channel_id=56969941', + 'only_matching': True, + }, { + 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455', + 'info_dict': { + 'id': r're:\d+', + 'title': r're:\S.*', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'tvpstream:39821455', + 'only_matching': True, + }, { + # the default stream when you provide no channel_id, most probably TVP Info + 'url': 'tvpstream:', + 'only_matching': True, + }, { + 'url': 'https://tvpstream.vod.tvp.pl/', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_url = self._proto_relative_url('//stream.tvp.pl/?channel_id=%s' % channel_id or 'default') + webpage = self._download_webpage(channel_url, channel_id or 'default', 'Downloading channel webpage') + channels = self._search_json( + r'window\s*\.\s*__channels\s*=', webpage, 'channel list', channel_id, + contains_pattern=r'\[\s*\{[\s\S]+}\s*]') + channel = traverse_obj(channels, (lambda _, v: channel_id == compat_str(v['id'])), get_all=False) if channel_id else channels[0] + audition = traverse_obj(channel, ('items', lambda _, v: v['is_live'] is True), get_all=False) + return { + '_type': 'url_transparent', + 'id': channel_id or channel['id'], + 'url': 'tvp:%s' % (audition['video_id'], ), + 'title': audition.get('title'), + 'alt_title': channel.get('title'), + 'is_live': True, + 'ie_key': 'TVPEmbed', + } + + class TVPEmbedIE(InfoExtractor): IE_NAME = 'tvp:embed' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P\d+)' - + # XFF is not effective + _GEO_BYPASS = False + _VALID_URL_PAT = ( + r''' + (?: + tvp: + |https?:// + (?:[^/]+\.)? + (?:tvp(?:parlament)?\.pl|tvp\.info|tvpworld\.com|swipeto\.pl)/ + (?:sess/ + (?:tvplayer\.php\?.*?object_id + |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd]) + |shared/details\.php\?.*?object_id) + =) + (?P\d+) + ''') + _VALID_URL = '(?x)' + _VALID_URL_PAT + _EMBED_REGEX = [r'(?x)]+?src=(["\'])(?P{0})'.format(_VALID_URL_PAT)] _TESTS = [{ 'url': 'tvp:194536', 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', @@ -98,9 +483,16 @@ class TVPEmbedIE(InfoExtractor): 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, + 'duration': 2652, + 'series': 'Czas honoru', + 'episode': 'Episode 13', + 'episode_number': 13, + 'season': 'sezon 1', + 'thumbnail': r're:https://.+', }, }, { - # not available 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'info_dict': { @@ -108,7 +500,28 @@ class TVPEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Panorama, 07.12.2015, 15:40', }, - 'skip': 'Transmisja została zakończona lub materiał niedostępny', + 'skip': 'Nie znaleziono obiektu', + }, { + 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&autoplay=false', + 'info_dict': { + 'id': '51247504', + 'ext': 'mp4', + 'title': 'Razmova 091220', + 'duration': 876, + 'age_limit': 0, + 'thumbnail': r're:https://.+', + }, + }, { + # TVPlayer2 embed URL + 'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757', + 'only_matching': True, + }, { + 'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452', + 'only_matching': True, + }, { + # pulsembed on dziennik.pl + 'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html', + 'only_matching': True, }, { 'url': 'tvp:22670268', 'only_matching': True, @@ -117,247 +530,272 @@ class TVPEmbedIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + # could be anything that is a valid JS function name + callback = random.choice(( + 'jebac_pis', + 'jebacpis', + 'ziobro', + 'sasin70', + 'sasin_przejebal_70_milionow_PLN', + 'tvp_is_a_state_propaganda_service', + )) webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s' + + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id) - error = self._html_search_regex( - r'(?s)]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)

', - webpage, 'error', default=None) or clean_html( - get_element_by_attribute('class', 'msg error', webpage)) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) + # stripping JSONP padding + null, datastr = self._search_regex( + r'\s%s\s*\(\s*(?Pnull\s*,\s*)?(?P(?(null)\[\s*)?\{(?:[\s\S]+)}(?(null)]\s*))\)\s*;' % (re.escape(callback), ), + webpage, 'JSON API result', group=('null', 'json')) + data = self._parse_json(datastr, video_id, fatal=False) + if null: + error_desc = traverse_obj(data, (0, 'desc'), expected_type=compat_str) + if error_desc == 'Obiekt wymaga płatności': + error_desc = 'Video requires payment and log-in, but log-in is not implemented' + raise ExtractorError(error_desc or 'unexpected JSON error', expected=error_desc) - title = self._search_regex( - r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) + content = data['content'] + info = traverse_obj(content, 'info', expected_type=dict) - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) + if traverse_obj(info, 'isGeoBlocked', expected_type=bool): + # actual country list is not provided, we just assume it's always available in PL + self.raise_geo_restricted(countries=['PL']) - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, - 'formats', group='url', default=None) - if not video_url or 'material_niedostepny.mp4' in video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] + is_live = traverse_obj(info, 'isLive', expected_type=bool) formats = [] - video_url_base = self._search_regex( - r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', - video_url, 'video base url', default=None) - if video_url_base: - # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. - # It's not mentioned in MPEG-DASH standard. Figure that out. - # formats.extend(self._extract_mpd_formats( - # video_url_base + '.ism/video.mpd', - # video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_base + '.ism/Manifest', - video_id, 'mss', fatal=False)) - formats.extend(self._extract_f4m_formats( - video_url_base + '.ism/video.f4m', - video_id, f4m_id='hds', fatal=False)) - m3u8_formats = self._extract_m3u8_formats( - video_url_base + '.ism/video.m3u8', video_id, - 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - formats.extend(m3u8_formats) - for i, m3u8_format in enumerate(m3u8_formats, 2): - http_url = '%s-%d.mp4' % (video_url_base, i) - if self._is_valid_url(http_url, video_id): - f = m3u8_format.copy() - f.update({ - 'url': http_url, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - }] + for file in traverse_obj(content, ('files', Ellipsis), expected_type=dict): + video_url = url_or_none(file.get('url')) + if not video_url: + continue + ext = determine_ext(video_url, None) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls', + fatal=False, live=is_live)) + elif ext == 'mpd': + if is_live: + # doesn't work with either ffmpeg or native downloader + continue + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif video_url.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) + elif ext == 'ism': + if '.ism/manifest' in video_url: + formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) + else: + # mp4, wmv or something + quality = traverse_obj(file, 'quality', expected_type=dict) or {} + formats.append({ + 'format_id': 'direct', + 'url': video_url, + 'ext': ext or file.get('type'), + 'fps': int_or_none(quality.get('fps')), + 'tbr': int_or_none(quality.get('bitrate'), scale=1000), + 'width': int_or_none(quality.get('width')), + 'height': int_or_none(quality.get('height')), + }) self._sort_formats(formats) - return { + title = traverse_obj(info, 'subtitle', 'title', 'seoTitle', expected_type=txt_or_none) + # `seoDescription` may be Falsen + description = traverse_obj(info, 'description', 'seoDescription', + expected_type=lambda x: txt_or_none(x or None)) + thumbnails = [] + for thumb in traverse_obj(content, ('posters', Ellipsis), expected_type=dict): + thumb_url = thumb.get('src') + if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url: + continue + thumbnails.append({ + 'url': thumb.get('src'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + }) + age_limit = traverse_obj(info, ('ageGroup', 'minAge'), expected_type=int) + if age_limit == 1: + age_limit = 0 + duration = traverse_obj(info, 'duration', expected_type=int) if not is_live else None + + subtitles = {} + for sub in traverse_obj(content, ('subtitles', Ellipsis), expected_type=dict): + if not (sub.get('url') and sub.get('lang')): + continue + subtitles.setdefault(sub['lang'], []).append({ + 'url': sub['url'], + 'ext': sub.get('type'), + }) + + info_dict = { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'is_live': is_live, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, + } + + # vod.tvp.pl + if traverse_obj(info, 'vortalName') == 'vod': + info_dict.update({ + 'title': '%s, %s' % (info.get('title'), info.get('subtitle')), + 'series': info.get('title'), + 'season': info.get('season'), + 'episode_number': info.get('episode') or None, + }) + + return info_dict + + +class TVPVODBaseIE(InfoExtractor): + _API_BASE_URL = 'https://vod.tvp.pl/api/products/' + + def _call_api(self, resource, video_id, **kwargs): + return self._download_json( + self._API_BASE_URL + resource, video_id, + query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs) + + def _parse_video(self, video): + video_id = traverse_obj(video, 'externalUid', expected_type=txt_or_none) + + if not video_id: + return None + + return { + '_type': 'url', + 'url': 'tvp:' + video_id, + 'ie_key': TVPEmbedIE.ie_key(), + 'title': video.get('title'), + 'description': traverse_obj(video, ('lead', 'description'), expected_type=txt_or_none), + 'age_limit': int_or_none(video.get('rating')), + 'duration': int_or_none(video.get('duration')), } -class TVPWebsiteIE(InfoExtractor): - IE_NAME = 'tvp:series' - _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' +class TVPVODVideoIE(TVPVODBaseIE): + IE_NAME = 'tvp:vod' + _VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$' _TESTS = [{ - # series - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', + 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', 'info_dict': { - 'id': '38678312', - }, - 'playlist_count': 115, - }, { - # film - 'url': 'https://vod.tvp.pl/website/gloria,35139666', - 'info_dict': { - 'id': '36637049', + 'id': '60468609', 'ext': 'mp4', - 'title': 'Gloria, Gloria', - }, - 'params': { - 'skip_download': True, + 'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24', + 'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c', + 'duration': 300, + 'episode_number': 24, + 'episode': 'Episode 24', + 'age_limit': 0, + 'series': 'Laboratorium alchemika', + 'thumbnail': 're:https://.+', }, 'add_ie': ['TVPEmbed'], }, { - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', + 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667', + 'info_dict': { + 'id': '51640077', + 'ext': 'mp4', + 'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu', + 'series': 'Ukraiński sługa narodu', + 'description': 'md5:b7940c0a8e439b0c81653a986f544ef3', + 'age_limit': 12, + 'duration': 3051, + 'thumbnail': 're:https://.+', + }, + 'add_ie': ['TVPEmbed'], + }, { + # new URL format + 'url': 'https://vod.tvp.pl/seriale,18/czas-honoru-odcinki,292065/odcinek-13,S01E13,313867', + 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', + 'info_dict': { + 'id': '194536', + 'ext': 'mp4', + 'title': 'Czas honoru, odc. 13 – Władek', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, + }, + 'add_ie': ['TVPEmbed'], + }, { + 'url': 'https://vod.tvp.pl/filmy-fabularne,136/rozlam,390638', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._parse_video( + self._call_api('vods/' + video_id, video_id)) + if not video: + raise ExtractorError('No video data for ' + video_id) + return video + + +class TVPVODSeriesIE(TVPVODBaseIE): + IE_NAME = 'tvp:vod:series' + _VALID_URL = r'''(?x) + https?://vod\.tvp\.pl/ + seriale,(?P<cat>\d+)/ + (?P<display_id>[^,]+?)(?(cat)-odcinki),(?P<id>\d+) + (?(cat)|(?P<video>/video)?)(?:[#?]|$) + ''' + _VALID_URL = r'https?://vod\.tvp\.pl/(?P<display_id>[a-z\d-]+,\d+)/[a-z\d-]+-odcinki,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$' + + _TESTS = [{ + # series + 'url': 'https://vod.tvp.pl/seriale,18/ranczo-odcinki,316445', + # series (old) - redirects to home page + # 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video', + 'info_dict': { + 'id': '316445', + 'title': 'Ranczo', + # 'description': 'md5:a7ccbe1296e6f32425cef17639f1b24b', + 'age_limit': 12, + 'categories': ['seriale'], + }, + 'playlist_mincount': 129, + }, { + 'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514', + 'only_matching': True, + }, { + 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338', 'only_matching': True, }] def _entries(self, display_id, playlist_id): - url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) - for page_num in itertools.count(1): - page = self._download_webpage( - url, display_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + season_path = 'vods/serials/%s/seasons' % (playlist_id, ) + seasons = self._call_api( + season_path, playlist_id, + note='Downloading season list') or [] - video_ids = orderedSet(re.findall( - r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, - page)) - - if not video_ids: - break - - for video_id in video_ids: - yield self.url_result( - 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), - video_id=video_id) + for ii, season in enumerate(seasons, 1): + season_id = traverse_obj(season, 'id', expected_type=txt_or_none) + if not season_id: + continue + episodes = self._call_api( + '%s/%s/episodes' % (season_path, season_id), playlist_id, + note='Downloading episode list (season %d)' % ii) + for episode in episodes or []: + video_id = traverse_obj(episode, 'externalUid', expected_type=txt_or_none) + if video_id: + yield self._parse_video(episode) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id, playlist_id = mobj.group('display_id', 'id') - return self.playlist_result( - self._entries(display_id, playlist_id), playlist_id) - - -class TVPappIE(InfoExtractor): - IE_NAME = 'tvp:app' - # https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532 - # https://vod.tvp.pl/programy,88/korona-krolow-taka-historia-odcinki,283653/odcinek-1,S02E01,386064 - # https://vod.tvp.pl/filmy-dokumentalne,163/jurek,376503 - # https://vod.tvp.pl/dla-dzieci,24/zubr-pompik-odcinki,282014/odcinek-1,S01E01,319853 - # https://vod.tvp.pl/teatr-telewizji,202/barwy-uczuc,392420 - # https://vod.tvp.pl/filmy-fabularne,136/rozlam,390638 - _VALID_URL = r'https://vod.tvp.pl/(seriale|filmy-fabularne|programy|filmy-dokumentalne|dla-dzieci|teatr-telewizji|informacje-i-publicystyka),[0-9]+/([^,]+,[0-9]+/)?[^,]+,([^,]+,)?(?P<id>[0-9]+)' - _TESTS = [ - { - # series - 'url': 'https://vod.tvp.pl/seriale,18/korona-krolow-jagiellonowie-odcinki,292227/odcinek-37,S01E37,392532', - 'info_dict': { - 'id': '392532', - 'ext': 'mp4', - - 'series_id': 292227, - 'title': 'Korona królów. Jagiellonowie - Episode 37 - odc. 37 – Branka', - 'description': 'Wiosna 1407. Anna z Goraja znika z Wawelu. Jej matka - Beata z Bożego Daru rozpacza. Anna Cylejska zarządza poszukiwania, a Sofia tajemniczo się uśmiecha. Elżbieta i Katarzyna Gorajskie wyjawiają, że ich siostra pojechała do klasztoru klarysek w Sączu. Na Wawel dociera wieść o śmierci Wielkiego Mistrza Konrada von Jungingena. Rozpoczyna się walka o władzę w Malborku. Jan Falkenberg ma list do papieża, w którym wyjawia sekret Trąby. Bdzigost i Ciołek zdobywają to pismo. Król Jagiełło poznaje wreszcie sekret Mikołaja Trąby i znajduje sposób, by go ochronić.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': 37, - }, - 'params': { - 'skip_download': True, - 'format': 'dash-f1-v1-x3', - } - }, - { - # programs - 'url': 'https://vod.tvp.pl/programy,88/korona-krolow-taka-historia-odcinki,283653/odcinek-1,S02E01,386064', - 'info_dict': { - 'id': '386064', - 'ext': 'mp4', - - 'series_id': 283653, - 'title': 'Korona królów. Taka historia... - Episode 1 - odc. 1 – Jagiełło: życie po Jadwidze', - 'description': 'Jogaiła był Wielkim Księciem Litewskim, synem Olgierda i wnukiem Giedymina. W 1386 roku został polskim królem, bowiem został mężem polskiej monarchini – Jadwigi Andegaweńskiej. Dwa trony i dwóch władców mających pełnię władzy. Kiedy w 1399 roku Jadwiga zmarła, najważniejsi polscy możni za potwierdzenie praw Jagiełły do korony zapragnęli więcej wpływów na władzę, w następstwie czego Władysław Jagiełło postanowił zrzec się tronu i wrócić na Litwę. Zaczęły się rozmowy i negocjacje. Obu stronom zależało, żeby król pozostał królem. Możni zaproponowali Jagielle kandydatkę na nową żonę – Annę Cylejską. Dziewczyna była wnuczką Kazimierza Wielkiego, więc uznano ją za prawowitą dziedziczkę polskiego tronu, czym miała wzmacniać rolę Jagiełły. Przeprowadzono powtórną elekcję i ustanowiono warunki współpracy z Litwą.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - 'format': 'dash-f1-v1-x3', - } - }, - { - # filmy-dokumentalne - 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/jurek,376503', - 'info_dict': { - 'id': '376503', - 'ext': 'mp4', - - 'series_id': None, - 'title': 'Jurek', - 'description': 'Obraz człowieka, który pnie się do góry w sensie dosłownym, ale też metaforycznym i symbolicznym. Od socjalistycznego pracownika, po gwiazdę międzynarodowych mediów, od człowieka, który wspina się bez pieniędzy i sprzętu, po pełnoprawnego konkurenta Reinholda Messnera w walce o zdobycie Korony Himalajów i Karakorum. Skromnego, wyrazistego, kochającego góry. Rozmowy z rodziną i przyjaciółmi, archiwalia, zdjęcia, nagrania, fragmenty programów telewizyjnych i wywiadów składają się na portret całego środowiska himalaistów lat 80. To również obraz czasów, w jakich żyli – ciężkich i barwnych jednocześnie, kiedy idealizm miał większą wartość niż sława.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': None, - }, - 'params': { - 'skip_download': True, - 'format': 'dash-f1-v1-x3', - } - }, - { - # dla-dzieci - 'url': 'https://vod.tvp.pl/dla-dzieci,24/zubr-pompik-odcinki,282014/odcinek-1,S01E01,319853', - 'info_dict': { - 'id': '319853', - 'ext': 'mp4', - - 'series_id': 282014, - 'title': 'Żubr Pompik - Episode 1 - Odc. 1 – Duże i małe', - 'description': 'W głębinach wielkiej, dzikiej, zielonej puszczy mieszka żubr Pompik. Jest niewielki, do tego nie tak silny, szybki i skoczny, jak jego rówieśnicy. Początkowo bardzo się tym martwi. A co, jeżeli nigdy nie urośnie? Odkrywa jednak, że ma wiele innych zalet! Jest najbardziej ciekawskim ze zwierząt w puszczy. Interesuje go wszystko, co dzieje się dookoła. Nie ma takiej leśnej tajemnicy, której Pompik by nie rozwiązał dzięki uważnej obserwacji, cierpliwości i zadawaniu mnóstwa pytań.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - 'format': 'dash-f1-v1-x3', - } - }, -] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'https://vod.tvp.pl/api/products/vods/%s?lang=pl&platform=BROWSER' - url_p = 'https://vod.tvp.pl/api/products/%s/videos/playlist?platform=BROWSER&videoType=MOVIE' - - _details = self._download_json(url % video_id, video_id, 'Downloading details') - _playlist = self._download_json(url_p % video_id, video_id, 'Downloading playlist') - - # import json - # print(json.dumps(_playlist, indent = 2)) - formats = [] - formats.extend(self._extract_mpd_formats(_playlist['sources']['DASH'][0]['src'], video_id, mpd_id='dash', fatal=False)) - - return { - 'id': video_id, - 'formats': formats, - 'duraton': _details['duration'], - - 'series_id': _details['season']['serial']['id'] if 'season' in _details else None, - 'title': _details['statisticsParameters']['gemiusVideo']['fullTitle'], - 'description': _details['description'], - 'thumbnail': _details['images']['16x9'][0]['url'], - 'episode_number': _details['number'] if 'number' in _details else None, - } + display_id, playlist_id = self._match_valid_url(url).group('display_id', 'id') + metadata = self._call_api( + 'vods/serials/' + playlist_id, playlist_id, + note='Downloading serial metadata') or {} + pl = self.playlist_result( + self._entries(display_id, playlist_id), playlist_id, txt_or_none(metadata.get('title'))) + pl.update({ + 'description': traverse_obj(metadata, ('description', 'lead'), expected_type=clean_html), + 'categories': traverse_obj(metadata, ('mainCategory', (None, Ellipsis), 'name'), expected_type=txt_or_none), + 'age_limit': traverse_obj(metadata, 'rating', expected_type=int), + }) + return pl From 4891480197f26d9bce96d5c6583d67f356bfdf48 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 9 May 2023 16:50:35 +0100 Subject: [PATCH 06/15] Update youtube_dl/extractor/tvp.py Add `txt_or_none()` shim --- youtube_dl/extractor/tvp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 1ad585e37..62b8b4a77 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -35,6 +35,9 @@ if not hasattr(InfoExtractor, '_match_valid_url'): RegexNotFoundError, ) + def txt_or_none(v, default=None): + return default if v is None else (compat_str(v).strip() or default) + BaseIE = InfoExtractor class InfoExtractor(BaseIE): From 237c59f7f5fec27e5af1b3dccb7a4d124ea660d6 Mon Sep 17 00:00:00 2001 From: dirkf <fieldhouse@gmx.net> Date: Tue, 9 May 2023 16:54:40 +0100 Subject: [PATCH 07/15] Update youtube_dl/extractor/tvp.py --- youtube_dl/extractor/tvp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 62b8b4a77..6d8b31be2 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -16,7 +16,6 @@ from ..utils import ( int_or_none, js_to_json, traverse_obj, - txt_or_none, url_or_none, ) From 56c07235ee27123e7d0632364585cec788080bbe Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:21:08 +0200 Subject: [PATCH 08/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 6d8b31be2..e90f09da3 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -76,7 +76,7 @@ if not hasattr(InfoExtractor, '_match_valid_url'): elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: - self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) + self.report_warning('unable to extract %s' % _name + bug_reports_message()) return None def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): From 283b6b31f55d34de5122e5e0a09fa47c5049b908 Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:21:27 +0200 Subject: [PATCH 09/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index e90f09da3..de28230d7 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -381,6 +381,9 @@ class TVPIE(InfoExtractor): if ie_cls.suitable(urlh.url): return self.url_result(urlh.url, ie=ie_cls.ie_key(), video_id=page_id) + if urlh.getcode() == 404: + raise compat_HTTPError(url, 404, 'HTTP Error 404: Not Found', urlh.headers, urlh) + if re.search( r'window\s*\.\s*__(?:video|news|website|directory)Data\s*=', webpage): From 578c53381b1a11d1a2d72feec5b0a55977010c2f Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:21:43 +0200 Subject: [PATCH 10/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index de28230d7..1491e1ec9 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -257,6 +257,15 @@ class TVPIE(InfoExtractor): 'title': 'Filmy', }, 'playlist_mincount': 19, + }, { + # redirect + 'url': 'https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii', + 'info_dict': { + 'id': '295157', + 'title': 'Wadowickie spotkania z Janem Pawłem II', + }, + 'playlist_mincount': 12, + 'add_ie': ['TVPEmbed', 'TVPVODSeries'], }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, From da19699ff832318a53387fc70fa4f0e36712e950 Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:22:01 +0200 Subject: [PATCH 11/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 1491e1ec9..e861ddbef 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -116,7 +116,12 @@ if not hasattr(InfoExtractor, '_match_valid_url'): # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) return self._parse_json(json_string, video_id, **kwargs) except ExtractorError as e: - msg = 'Unable to extract {0} - Failed to parse JSON'.format(name) + except ExtractorError as e: + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): + _name = '\033[0;34m%s\033[0m' % name + else: + _name = name + msg = 'Unable to extract {0} - Failed to parse JSON'.format(_name) if fatal: raise ExtractorError(msg, cause=e.cause, video_id=video_id) elif not has_default: From 79b0cde4dcf67cbaed088e692bc41ff8083f3fcd Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:22:18 +0200 Subject: [PATCH 12/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index e861ddbef..fee22aa65 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_str, ) from ..utils import ( From e7c42394a6c9a2b61fe2575bb78fa5bca8b63c2c Mon Sep 17 00:00:00 2001 From: bibiak <24386585+bibiak1@users.noreply.github.com> Date: Mon, 22 May 2023 21:22:27 +0200 Subject: [PATCH 13/15] Update youtube_dl/extractor/tvp.py Co-authored-by: dirkf <fieldhouse@gmx.net> --- youtube_dl/extractor/tvp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index fee22aa65..8742d7a8d 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -112,7 +112,6 @@ if not hasattr(InfoExtractor, '_match_valid_url'): if not json_string: return default - # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) try: # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) return self._parse_json(json_string, video_id, **kwargs) From 8a2249ecf14e1394ef552d1067f21acbbbc8fcc5 Mon Sep 17 00:00:00 2001 From: Marcin Biczan <marcin@biczan.pl> Date: Mon, 22 May 2023 21:25:05 +0200 Subject: [PATCH 14/15] too many except --- youtube_dl/extractor/tvp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 8742d7a8d..5efae5f22 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -115,7 +115,6 @@ if not hasattr(InfoExtractor, '_match_valid_url'): try: # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) return self._parse_json(json_string, video_id, **kwargs) - except ExtractorError as e: except ExtractorError as e: if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name From f5dd875a02f4d854ae66e40b180606cf264351c7 Mon Sep 17 00:00:00 2001 From: bibiak <marcin@biczan.pl> Date: Mon, 2 Oct 2023 14:13:50 +0000 Subject: [PATCH 15/15] moved txt_or_none outside if statement --- youtube_dl/extractor/tvp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 5efae5f22..8f4a41063 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -21,6 +21,9 @@ from ..utils import ( ) +def txt_or_none(v, default=None): + return default if v is None else (compat_str(v).strip() or default) + if not hasattr(InfoExtractor, '_match_valid_url'): import sys @@ -35,9 +38,6 @@ if not hasattr(InfoExtractor, '_match_valid_url'): RegexNotFoundError, ) - def txt_or_none(v, default=None): - return default if v is None else (compat_str(v).strip() or default) - BaseIE = InfoExtractor class InfoExtractor(BaseIE):