Compare commits

...

31 Commits

Author SHA1 Message Date
bibiak
1f1ec92857
Merge f5dd875a02 into 4d05f84325 2024-06-27 06:36:55 +08:00
dirkf
4d05f84325 [PalcoMP3] Conform to new linter rule
* no space after @ in decorator
2024-06-20 20:03:49 +01:00
dirkf
e0094e63c3 [jsinterp] Various tweaks
* treat Infinity like NaN
* cache operator list
2024-06-20 20:03:49 +01:00
dirkf
fd8242e3ef [jsinterp] Fix and improve expression parsing
* improve BODMAS (fixes https://github.com/ytdl-org/youtube-dl/issues/32815)
* support more weird expressions with multiple unary ops
2024-06-20 20:03:49 +01:00
dirkf
ad01fa6cca [jsinterp] Add Debugger from yt-dlp
* https://github.com/yt-dlp/yt-dlp/commit/8f53dc4
* thx pukkandan
2024-06-20 20:03:49 +01:00
dirkf
2eac0fa379 [utils] Save orig_msg in ExtractorError 2024-06-20 20:03:49 +01:00
bibiak
f5dd875a02 moved txt_or_none outside if statement 2023-10-02 14:13:50 +00:00
bibiak
efee229d66
Merge branch 'ytdl-org:master' into master 2023-10-02 15:51:03 +02:00
bibiak
8aac6a6702
Merge branch 'ytdl-org:master' into master 2023-08-31 20:40:06 +02:00
bibiak
0c53d4245d
Merge branch 'ytdl-org:master' into master 2023-08-31 15:14:56 +02:00
bibiak
a9223364b3
Merge branch 'ytdl-org:master' into master 2023-06-29 17:43:26 +02:00
bibiak
9f3bbddc5c
Merge branch 'ytdl-org:master' into master 2023-06-19 21:33:30 +02:00
bibiak
7caa31f0f0
Merge branch 'ytdl-org:master' into master 2023-06-13 07:10:17 +02:00
Marcin Biczan
8a2249ecf1 too many except 2023-05-22 21:25:05 +02:00
bibiak
e7c42394a6
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:22:27 +02:00
bibiak
79b0cde4dc
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:22:18 +02:00
bibiak
da19699ff8
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:22:01 +02:00
bibiak
578c53381b
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:21:43 +02:00
bibiak
283b6b31f5
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:21:27 +02:00
bibiak
56c07235ee
Update youtube_dl/extractor/tvp.py
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-05-22 21:21:08 +02:00
bibiak
1d148eb75f
Merge branch 'ytdl-org:master' into master 2023-05-22 20:53:23 +02:00
dirkf
237c59f7f5
Update youtube_dl/extractor/tvp.py 2023-05-09 16:54:40 +01:00
dirkf
4891480197
Update youtube_dl/extractor/tvp.py
Add `txt_or_none()` shim
2023-05-09 16:50:35 +01:00
dirkf
6e827118cb
Update tvp.py from yt-dlp
* pull changes from https://github.com/yt-dlp/yt-dlp/pull/6989, thanks selfisekai
* use `traverse_obj()` for safer extraction
* fix tests that are not blocked from UK

Co-authored-by: selfisekai
2023-05-09 16:44:13 +01:00
dirkf
efedc80daf
Update extractors.py
[skip ci]
2023-05-09 16:34:41 +01:00
bibiak
93e0d820ce
Merge branch 'ytdl-org:master' into master 2023-04-22 14:50:01 +02:00
bibiak
37ff4c9399
Merge branch 'ytdl-org:master' into master 2023-04-11 14:56:29 +02:00
bibiak
16cb050ae6
Merge branch 'ytdl-org:master' into master 2023-04-09 13:54:54 +02:00
Marcin Biczan
4b6bef45b5 added support for full offer 2023-04-06 21:25:00 +02:00
Marcin Biczan
b08dc56f46 json uneeded 2023-04-05 22:03:19 +02:00
Marcin Biczan
953fce852f TVPapp extractor added :: init 2023-04-05 21:54:58 +02:00
8 changed files with 781 additions and 155 deletions

View File

@ -577,9 +577,11 @@ class TestJSInterpreter(unittest.TestCase):
def test_unary_operators(self): def test_unary_operators(self):
jsi = JSInterpreter('function f(){return 2 - - - 2;}') jsi = JSInterpreter('function f(){return 2 - - - 2;}')
self.assertEqual(jsi.call_function('f'), 0) self.assertEqual(jsi.call_function('f'), 0)
# fails jsi = JSInterpreter('function f(){return 2 + - + - - 2;}')
# jsi = JSInterpreter('function f(){return 2 + - + - - 2;}') self.assertEqual(jsi.call_function('f'), 0)
# self.assertEqual(jsi.call_function('f'), 0) # https://github.com/ytdl-org/youtube-dl/issues/32815
jsi = JSInterpreter('function f(){return 0 - 7 * - 6;}')
self.assertEqual(jsi.call_function('f'), 42)
""" # fails so far """ # fails so far
def test_packed(self): def test_packed(self):

View File

@ -158,6 +158,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js',
'_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ',
), ),
(
'https://www.youtube.com/s/player/590f65a6/player_ias.vflset/en_US/base.js',
'1tm7-g_A9zsI8_Lay_', 'xI4Vem4Put_rOg',
),
] ]

View File

@ -3033,7 +3033,6 @@ class InfoExtractor(object):
transform_source=transform_source, default=None) transform_source=transform_source, default=None)
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
# allow passing `transform_source` through to _find_jwplayer_data() # allow passing `transform_source` through to _find_jwplayer_data()
transform_source = kwargs.pop('transform_source', None) transform_source = kwargs.pop('transform_source', None)
kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {} kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}

View File

@ -1384,7 +1384,9 @@ from .tvnow import (
from .tvp import ( from .tvp import (
TVPEmbedIE, TVPEmbedIE,
TVPIE, TVPIE,
TVPWebsiteIE, TVPStreamIE,
TVPVODSeriesIE,
TVPVODVideoIE,
) )
from .tvplay import ( from .tvplay import (
TVPlayIE, TVPlayIE,

View File

@ -8,7 +8,7 @@ from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
str_or_none, str_or_none,
try_get, traverse_obj,
) )
@ -109,7 +109,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
} }
name''' name'''
@ classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
@ -118,7 +118,8 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
def entries(): def entries():
for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): for music in traverse_obj(artist, (
'musics', 'nodes', lambda _, m: m['musicID'])):
yield self._parse_music(music) yield self._parse_music(music)
return self.playlist_result( return self.playlist_result(
@ -137,7 +138,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE):
'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
'description': 'md5:7043342c09a224598e93546e98e49282', 'description': 'md5:7043342c09a224598e93546e98e49282',
'upload_date': '20161107', 'upload_date': '20161107',
'uploader_id': 'maiaramaraisaoficial', 'uploader_id': '@maiaramaraisaoficial',
'uploader': 'Maiara e Maraisa', 'uploader': 'Maiara e Maraisa',
} }
}] }]

View File

@ -2,52 +2,274 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools import itertools
import random
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
clean_html, clean_html,
determine_ext, determine_ext,
ExtractorError, ExtractorError,
get_element_by_attribute, int_or_none,
orderedSet, js_to_json,
traverse_obj,
url_or_none,
) )
def txt_or_none(v, default=None):
    """Return `v` as stripped text, or `default` when `v` is None or blank.

    Any non-None value is converted with `compat_str` before stripping.
    """
    if v is None:
        return default
    text = compat_str(v).strip()
    return text if text else default
if not hasattr(InfoExtractor, '_match_valid_url'):
    # Compatibility shim: when the imported InfoExtractor has no
    # `_match_valid_url`, shadow it with a subclass that supplies the helper
    # methods the extractors in this module rely on.

    import sys
    from ..compat import (
        compat_os_name,
        compat_re_Pattern as compiled_regex_type,
    )
    from ..utils import (
        bug_reports_message,
        error_to_compat_str,
        NO_DEFAULT,
        RegexNotFoundError,
    )

    BaseIE = InfoExtractor

    class InfoExtractor(BaseIE):

        def _match_valid_url(self, url):
            """Match `url` against this extractor's `_VALID_URL`."""
            return re.match(self._VALID_URL, url)

        def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
            """
            Perform a regex search on the given string, using a single or a list of
            patterns returning the first matching group.
            In case of failure return a default value or raise a WARNING or a
            RegexNotFoundError, depending on fatal, specifying the field name.
            """
            if isinstance(pattern, (str, compat_str, compiled_regex_type)):
                mobj = re.search(pattern, string, flags)
            else:
                for p in pattern:
                    mobj = re.search(p, string, flags)
                    if mobj:
                        break

            # colourise the field name only on a colour-capable terminal
            if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
                _name = '\033[0;34m%s\033[0m' % name
            else:
                _name = name

            if mobj:
                if group is None:
                    # return the first matching group
                    return next(g for g in mobj.groups() if g is not None)
                elif isinstance(group, (list, tuple)):
                    return tuple(mobj.group(g) for g in group)
                else:
                    return mobj.group(group)
            elif default is not NO_DEFAULT:
                return default
            elif fatal:
                raise RegexNotFoundError('Unable to extract %s' % _name)
            else:
                self.report_warning('unable to extract %s' % _name + bug_reports_message())
                return None

        def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
            """
            Like _search_regex, but strips HTML tags and unescapes entities.
            """
            res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            if isinstance(res, tuple):
                return tuple(map(clean_html, res))
            return clean_html(res or None)

        def _search_json(self, start_pattern, string, name, video_id, **kwargs):
            """Searches string for the JSON object specified by start_pattern

            Keyword arguments consumed here: `end_pattern`, `contains_pattern`,
            `fatal` and `default`; anything else is forwarded to `_parse_json()`.
            Returns the parsed JSON, or `default` ({} when none was given) if
            the search or the parse fails non-fatally.
            """
            # self, start_pattern, string, name, video_id, *, end_pattern='',
            # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
            end_pattern = kwargs.pop('end_pattern', '')
            contains_pattern = kwargs.pop('contains_pattern', r'{(?:[\s\S]+)}')
            # pop (not get) these: they are handled here and must not be passed
            # on to _parse_json(), which is called with whatever is left in kwargs
            fatal = kwargs.pop('fatal', True)
            default = kwargs.pop('default', NO_DEFAULT)
            # NB: end_pattern is only used to reduce the size of the initial match

            if default is NO_DEFAULT:
                default, has_default = {}, False
            else:
                fatal, has_default = False, True

            json_string = self._search_regex(
                r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
                    start_pattern, contains_pattern, end_pattern),
                string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
            if not json_string:
                return default

            try:
                # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
                return self._parse_json(json_string, video_id, **kwargs)
            except ExtractorError as e:
                if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
                    _name = '\033[0;34m%s\033[0m' % name
                else:
                    _name = name
                msg = 'Unable to extract {0} - Failed to parse JSON'.format(_name)
                if fatal:
                    raise ExtractorError(msg, cause=e.cause, video_id=video_id)
                elif not has_default:
                    self.report_warning(
                        '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
            return default
class TVPIE(InfoExtractor): class TVPIE(InfoExtractor):
IE_NAME = 'tvp' IE_NAME = 'tvp'
IE_DESC = 'Telewizja Polska' IE_DESC = 'Telewizja Polska'
_VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com|tvpworld\.com|swipeto\.pl)/(?:(?:(?!\d+/)[^/]+/)*|(?:video|website)/[^/]+,)(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
# TVPlayer 2 in js wrapper
'url': 'https://swipeto.pl/64095316/uliczny-foxtrot-wypozyczalnia-kaset-kto-pamieta-dvdvideo',
'info_dict': {
'id': '64095316',
'ext': 'mp4',
'title': 'Uliczny Foxtrot — Wypożyczalnia kaset. Kto pamięta DVD-Video?',
'age_limit': 0,
'duration': 374,
'thumbnail': r're:https://.+',
},
'expected_warnings': [
'Failed to download ISM manifest: HTTP Error 404: Not Found',
'Failed to download m3u8 information: HTTP Error 404: Not Found',
],
'skip': 'Video gone: 404 Nie znaleziono obiektu',
}, {
# TVPlayer 2 in js wrapper (redirect to VodVideo)
'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': { 'info_dict': {
'id': '194536', 'id': '194536',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek', 'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:437f48b93558370b031740546b696e24', 'description': 'md5:76649d2014f65c99477be17f23a4dead',
'age_limit': 12,
}, },
'add_ie': ['Generic', 'TVPEmbed'],
}, { }, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', # film (old format)
'md5': 'b0005b542e5b4de643a9690326ab1257', 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466',
'info_dict': { 'info_dict': {
'id': '17916176', 'id': '51374509',
'ext': 'mp4', 'ext': 'mp4',
'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 'title': 'Krzysztof Krawczyk całe moje życie, Krzysztof Krawczyk całe moje życie',
'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 'description': 'md5:2e80823f00f5fc263555482f76f8fa42',
'age_limit': 12,
}, },
'params': {
'skip_download': True,
},
'add_ie': ['TVPEmbed'],
'skip': 'This video is not available from your location due to geo restriction',
}, { }, {
# TVPlayer legacy
'url': 'https://www.tvp.pl/polska-press-video-uploader/wideo/62042351',
'info_dict': {
'id': '62042351',
'ext': 'mp4',
'title': 'Wideo',
'description': 'Wideo Kamera',
'duration': 24,
'age_limit': 0,
'thumbnail': r're:https://.+',
},
'add_ie': ['TVPEmbed'],
}, {
# TVPlayer 2 in iframe
# page id is not the same as video id(#7799) # page id is not the same as video id(#7799)
'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow',
'md5': '84cd3c8aec4840046e5ab712416b73d0', 'md5': 'd35fb45103802488fcb7470e411b9ed4',
'info_dict': { 'info_dict': {
'id': '33908820', 'id': '50725617',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Wiadomości, 28.09.2017, 19:30', 'title': 'Dzieci na sprzedaż dla homoseksualistów',
'description': 'Wydanie główne codziennego serwisu informacyjnego.' 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590',
'age_limit': 12,
'duration': 259,
'thumbnail': r're:https://.+',
}, },
'skip': 'HTTP Error 404: Not Found', 'add_ie': ['TVPEmbed'],
}, {
# TVPlayer 2 in client-side rendered website (regional; window.__newsData)
'url': 'https://warszawa.tvp.pl/25804446/studio-yayo',
'info_dict': {
'id': '25804446',
'ext': 'mp4',
'title': 'Studio Yayo',
'upload_date': '20160616',
'timestamp': 1466075700,
'age_limit': 0,
'duration': 20,
'thumbnail': r're:https://.+',
},
'add_ie': ['TVPEmbed'],
'skip': 'Video is geo restricted',
}, {
# TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData)
'url': 'https://www.tvp.info/52880236/09042021-0800',
'info_dict': {
'id': '52880236',
'ext': 'mp4',
'title': '09.04.2021, 08:00',
'age_limit': 0,
'thumbnail': r're:https://.+',
},
'add_ie': ['TVPEmbed'],
'skip': 'Video is geo restricted',
}, {
# client-side rendered (regional) program (playlist) page
'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia',
'info_dict': {
'id': '9660819',
'description': 'Od poniedziałku do piątku o 18:55',
'title': 'Rozmowa dnia',
},
'playlist_mincount': 1800,
'params': {
'skip_download': True,
}
}, {
# ABC-specific video embedding
# moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450
'url': 'https://abc.tvp.pl/48636269/zubry-odc-124',
'info_dict': {
'id': '48320456',
'ext': 'mp4',
'title': 'Teleranek, Żubr',
},
'skip': 'Video gone: Nie znaleziono obiektu',
}, {
# yet another vue page
'url': 'https://jp2.tvp.pl/46925618/filmy',
'info_dict': {
'id': '46925618',
'title': 'Filmy',
},
'playlist_mincount': 19,
}, {
# redirect
'url': 'https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii',
'info_dict': {
'id': '295157',
'title': 'Wadowickie spotkania z Janem Pawłem II',
},
'playlist_mincount': 12,
'add_ie': ['TVPEmbed', 'TVPVODSeries'],
}, { }, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'only_matching': True, 'only_matching': True,
@ -66,31 +288,212 @@ class TVPIE(InfoExtractor):
}, { }, {
'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej',
'only_matching': True,
}, {
'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277',
'only_matching': True,
}, {
'url': 'https://tvpworld.com/48583640/tescos-polish-business-bought-by-danish-chain-netto',
'only_matching': True,
}, {
'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm',
'only_matching': True,
}] }]
def _parse_vue_website_data(self, webpage, page_id):
    """Find and parse the `window.__websiteData`/`window.__directoryData` object.

    The matched JS object literal is converted with `js_to_json` and parsed;
    `page_id` is passed to `_parse_json()` as the video id.
    """
    patterns = [
        # website - regiony, tvp.info
        # directory - jp2.tvp.pl
        r'window\s*\.\s*__(?:website|directory)Data\s*=\s*({[\s\S]+?});',
    ]
    raw_data = self._search_regex(patterns, webpage, 'website data')
    if raw_data:
        return self._parse_json(raw_data, page_id, transform_source=js_to_json)
    # NOTE(review): the search above is made without `default`/`fatal=False`,
    # so this fallback looks unreachable - confirm before relying on it
    return None
def _extract_vue_video(self, video_data, page_id=None):
    """Build a `url_transparent` result from a vue page's video data.

    `video_data` is either the already parsed dict or the raw JS object
    text, which is parsed here. Returns None if neither the data nor
    `page_id` supplies an id.
    """
    if isinstance(video_data, compat_str):
        video_data = self._parse_json(video_data, page_id, transform_source=js_to_json)
    video_id = txt_or_none(video_data.get('_id')) or page_id
    if not video_id:
        return

    is_website = video_data.get('type') == 'website'
    if is_website:
        url = video_data['url']
        # rewrite https://vod.tvp.pl/<id>/<slug> as .../website/<slug>,<id>
        legacy_vod_url = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url)
        if legacy_vod_url:
            url = 'https://vod.tvp.pl/website/' + ','.join(legacy_vod_url.group(2, 1))
    else:
        url = 'tvp:' + video_id

    # `image` may be a single object or a list of them; the info dict wants
    # a list of {'url': ...} dicts, not bare URL strings
    thumbnails = [
        {'url': thumb_url}
        for thumb_url in traverse_obj(
            video_data, ('image', (None, Ellipsis), 'url'),
            expected_type=url_or_none) or []]

    return {
        '_type': 'url_transparent',
        'id': video_id,
        'url': url,
        'ie_key': (TVPIE if is_website else TVPEmbedIE).ie_key(),
        'title': txt_or_none(video_data.get('title')),
        'description': txt_or_none(video_data.get('lead')),
        'timestamp': int_or_none(video_data.get('release_date_long')),
        'duration': int_or_none(video_data.get('duration')),
        'thumbnails': thumbnails or None,
    }
def _handle_vuejs_page(self, url, webpage, page_id):
    """Extract a single video or a paged playlist from a vue-rendered page.

    Returns a `url_transparent` result when the page embeds
    `window.__newsData`/`window.__videoData`, otherwise a playlist built
    from the website/directory data. Raises ExtractorError if neither
    yields a result.
    """
    # vue client-side rendered sites (all regional pages + tvp.info)
    video_data = self._search_regex([
        r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;',
    ], webpage, 'video data', default=None)
    if video_data:
        # convert the raw data exactly once: feeding the resulting info
        # dict back into _extract_vue_video() would look up raw-data keys
        # (`_id`, `lead`, `image`, ...) that it no longer has
        video_info = self._extract_vue_video(video_data, page_id=page_id)
        if video_info:
            return video_info
    else:
        # paged playlists
        website_data = self._parse_vue_website_data(webpage, page_id)
        if website_data:
            return {
                '_type': 'playlist',
                'id': page_id,
                'title': txt_or_none(website_data.get('title')),
                'description': txt_or_none(website_data.get('lead')),
                'entries': self._vuejs_entries(url, website_data, page_id),
            }
    raise ExtractorError('Could not extract video/website data')
def _vuejs_entries(self, url, website_data, page_id):
    """Yield the video entries of a vue playlist page, following pagination.

    Entries of the already downloaded first page (`website_data`) come
    first; further pages are fetched with `?page=N` (N >= 2) until a page
    has neither `videos` nor `items`.
    """

    def extract_videos(wd):
        for video in traverse_obj(wd, (None, ('latestVideo', (('videos', 'items'), Ellipsis)))):
            video = self._extract_vue_video(video)
            if video:
                yield video

    for entry in extract_videos(website_data):
        yield entry

    # either count may be absent: treat that as 0 rather than comparing None
    # values, which raises TypeError on Python 3
    total_count = int_or_none(website_data.get('items_total_count')) or 0
    per_page = int_or_none(website_data.get('items_per_page')) or 0
    if total_count <= per_page:
        return

    for page in itertools.count(2):
        page_website_data = self._parse_vue_website_data(
            self._download_webpage(url, page_id, note='Downloading page #%d' % page,
                                   query={'page': page}),
            page_id)
        if not page_website_data:
            break
        if not page_website_data.get('videos') and not page_website_data.get('items'):
            break
        for entry in extract_videos(page_website_data):
            yield entry
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) webpage, urlh = self._download_webpage_handle(url, page_id, expected_status=404)
video_id = self._search_regex([
# The URL may redirect to a VOD
# example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii
for ie_cls in (TVPVODSeriesIE, TVPVODVideoIE):
if ie_cls.suitable(urlh.url):
return self.url_result(urlh.url, ie=ie_cls.ie_key(), video_id=page_id)
if urlh.getcode() == 404:
raise compat_HTTPError(url, 404, 'HTTP Error 404: Not Found', urlh.headers, urlh)
if re.search(
r'window\s*\.\s*__(?:video|news|website|directory)Data\s*=',
webpage):
return self._handle_vuejs_page(url, webpage, page_id)
# classic server-side rendered sites
video_id = self._search_regex((
r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)',
r'<iframe[^>]+src="[^"]*?object_id=(\d+)', r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
r"object_id\s*:\s*'(\d+)'", r"object_id\s*:\s*'(\d+)'",
r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) r'data-video-id="(\d+)"',
# abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video?
# the first one is referenced to as "copyid", and seems to be unused by the website
r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>',
), webpage, 'video id', default=page_id)
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',
'url': 'tvp:' + video_id, 'url': 'tvp:' + video_id,
'description': self._og_search_description( 'description': self._og_search_description(
webpage, default=None) or self._html_search_meta( webpage, default=None) or (self._html_search_meta(
'description', webpage, default=None), 'description', webpage, default=None)
if '//s.tvp.pl/files/portal/v' in webpage else None),
'thumbnail': self._og_search_thumbnail(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'TVPEmbed', 'ie_key': 'TVPEmbed',
} }
class TVPStreamIE(InfoExtractor):
    # Live channels from stream.tvp.pl; playback is delegated to TVPEmbedIE
    IE_NAME = 'tvp:stream'
    _VALID_URL = r'(?:tvpstream:|https?://(?:tvpstream\.vod|stream)\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
    _TESTS = [{
        'url': 'https://stream.tvp.pl/?channel_id=56969941',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455',
        'info_dict': {
            'id': r're:\d+',
            'title': r're:\S.*',
            'ext': 'mp4',
        },
        'params': {
            'skip_download': 'm3u8',
        },
        'add_ie': ['TVPEmbed'],
    }, {
        'url': 'tvpstream:39821455',
        'only_matching': True,
    }, {
        # the default stream when you provide no channel_id, most probably TVP Info
        'url': 'tvpstream:',
        'only_matching': True,
    }, {
        'url': 'https://tvpstream.vod.tvp.pl/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the channel's currently live item to a `tvp:` embed URL."""
        channel_id = self._match_id(url)
        # the id group may be empty (no channel given)
        display_id = channel_id or 'default'
        # parenthesised: `%` binds tighter than `or`, so without the
        # parentheses the 'default' fallback would never be used
        channel_url = self._proto_relative_url(
            '//stream.tvp.pl/?channel_id=%s' % (channel_id or 'default'))
        webpage = self._download_webpage(channel_url, display_id, 'Downloading channel webpage')

        channels = self._search_json(
            r'window\s*\.\s*__channels\s*=', webpage, 'channel list', display_id,
            contains_pattern=r'\[\s*\{[\s\S]+}\s*]')
        # without an explicit id, take the first listed channel
        channel = traverse_obj(channels, (lambda _, v: channel_id == compat_str(v['id'])), get_all=False) if channel_id else channels[0]
        audition = traverse_obj(channel, ('items', lambda _, v: v['is_live'] is True), get_all=False)
        if not audition:
            # unknown channel id, or nothing flagged as live right now
            raise ExtractorError('Unable to find a live broadcast for channel %s' % display_id)

        return {
            '_type': 'url_transparent',
            'id': channel_id or channel['id'],
            'url': 'tvp:%s' % (audition['video_id'], ),
            'title': audition.get('title'),
            'alt_title': channel.get('title'),
            'is_live': True,
            'ie_key': 'TVPEmbed',
        }
class TVPEmbedIE(InfoExtractor): class TVPEmbedIE(InfoExtractor):
IE_NAME = 'tvp:embed' IE_NAME = 'tvp:embed'
IE_DESC = 'Telewizja Polska' IE_DESC = 'Telewizja Polska'
_VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' # XFF is not effective
_GEO_BYPASS = False
_VALID_URL_PAT = (
r'''
(?:
tvp:
|https?://
(?:[^/]+\.)?
(?:tvp(?:parlament)?\.pl|tvp\.info|tvpworld\.com|swipeto\.pl)/
(?:sess/
(?:tvplayer\.php\?.*?object_id
|TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd])
|shared/details\.php\?.*?object_id)
=)
(?P<id>\d+)
''')
_VALID_URL = '(?x)' + _VALID_URL_PAT
_EMBED_REGEX = [r'(?x)<iframe[^>]+?src=(["\'])(?P<url>{0})'.format(_VALID_URL_PAT)]
_TESTS = [{ _TESTS = [{
'url': 'tvp:194536', 'url': 'tvp:194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
@ -98,9 +501,16 @@ class TVPEmbedIE(InfoExtractor):
'id': '194536', 'id': '194536',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek', 'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:76649d2014f65c99477be17f23a4dead',
'age_limit': 12,
'duration': 2652,
'series': 'Czas honoru',
'episode': 'Episode 13',
'episode_number': 13,
'season': 'sezon 1',
'thumbnail': r're:https://.+',
}, },
}, { }, {
# not available
'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
'md5': '8c9cd59d16edabf39331f93bf8a766c7', 'md5': '8c9cd59d16edabf39331f93bf8a766c7',
'info_dict': { 'info_dict': {
@ -108,7 +518,28 @@ class TVPEmbedIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Panorama, 07.12.2015, 15:40', 'title': 'Panorama, 07.12.2015, 15:40',
}, },
'skip': 'Transmisja została zakończona lub materiał niedostępny', 'skip': 'Nie znaleziono obiektu',
}, {
'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&amp;autoplay=false',
'info_dict': {
'id': '51247504',
'ext': 'mp4',
'title': 'Razmova 091220',
'duration': 876,
'age_limit': 0,
'thumbnail': r're:https://.+',
},
}, {
# TVPlayer2 embed URL
'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757',
'only_matching': True,
}, {
'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452',
'only_matching': True,
}, {
# pulsembed on dziennik.pl
'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html',
'only_matching': True,
}, { }, {
'url': 'tvp:22670268', 'url': 'tvp:22670268',
'only_matching': True, 'only_matching': True,
@ -117,136 +548,272 @@ class TVPEmbedIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
# could be anything that is a valid JS function name
callback = random.choice((
'jebac_pis',
'jebacpis',
'ziobro',
'sasin70',
'sasin_przejebal_70_milionow_PLN',
'tvp_is_a_state_propaganda_service',
))
webpage = self._download_webpage( webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s'
+ '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id)
error = self._html_search_regex( # stripping JSONP padding
r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', null, datastr = self._search_regex(
webpage, 'error', default=None) or clean_html( r'\s%s\s*\(\s*(?P<null>null\s*,\s*)?(?P<json>(?(null)\[\s*)?\{(?:[\s\S]+)}(?(null)]\s*))\)\s*;' % (re.escape(callback), ),
get_element_by_attribute('class', 'msg error', webpage)) webpage, 'JSON API result', group=('null', 'json'))
if error: data = self._parse_json(datastr, video_id, fatal=False)
raise ExtractorError('%s said: %s' % ( if null:
self.IE_NAME, clean_html(error)), expected=True) error_desc = traverse_obj(data, (0, 'desc'), expected_type=compat_str)
if error_desc == 'Obiekt wymaga płatności':
error_desc = 'Video requires payment and log-in, but log-in is not implemented'
raise ExtractorError(error_desc or 'unexpected JSON error', expected=error_desc)
title = self._search_regex( content = data['content']
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', info = traverse_obj(content, 'info', expected_type=dict)
webpage, 'title', group='title')
series_title = self._search_regex(
r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
webpage, 'series', group='series', default=None)
if series_title:
title = '%s, %s' % (series_title, title)
thumbnail = self._search_regex( if traverse_obj(info, 'isGeoBlocked', expected_type=bool):
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) # actual country list is not provided, we just assume it's always available in PL
self.raise_geo_restricted(countries=['PL'])
video_url = self._search_regex( is_live = traverse_obj(info, 'isLive', expected_type=bool)
r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
'formats', group='url', default=None)
if not video_url or 'material_niedostepny.mp4' in video_url:
video_url = self._download_json(
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
video_id)['video_url']
formats = [] formats = []
video_url_base = self._search_regex( for file in traverse_obj(content, ('files', Ellipsis), expected_type=dict):
r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url = url_or_none(file.get('url'))
video_url, 'video base url', default=None) if not video_url:
if video_url_base: continue
# TODO: <Group> found instead of <AdaptationSet> in MPD manifest. ext = determine_ext(video_url, None)
# It's not mentioned in MPEG-DASH standard. Figure that out. if ext == 'm3u8':
# formats.extend(self._extract_mpd_formats( formats.extend(self._extract_m3u8_formats(
# video_url_base + '.ism/video.mpd', video_url, video_id, ext='mp4', m3u8_id='hls',
# video_id, mpd_id='dash', fatal=False)) fatal=False, live=is_live))
formats.extend(self._extract_ism_formats( elif ext == 'mpd':
video_url_base + '.ism/Manifest', if is_live:
video_id, 'mss', fatal=False)) # doesn't work with either ffmpeg or native downloader
formats.extend(self._extract_f4m_formats( continue
video_url_base + '.ism/video.f4m', formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False))
video_id, f4m_id='hds', fatal=False)) elif ext == 'f4m':
m3u8_formats = self._extract_m3u8_formats( formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False))
video_url_base + '.ism/video.m3u8', video_id, elif video_url.endswith('.ism/manifest'):
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
self._sort_formats(m3u8_formats) elif ext == 'ism':
m3u8_formats = list(filter( if '.ism/manifest' in video_url:
lambda f: f.get('vcodec') != 'none', m3u8_formats)) formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False))
formats.extend(m3u8_formats) else:
for i, m3u8_format in enumerate(m3u8_formats, 2): # mp4, wmv or something
http_url = '%s-%d.mp4' % (video_url_base, i) quality = traverse_obj(file, 'quality', expected_type=dict) or {}
if self._is_valid_url(http_url, video_id): formats.append({
f = m3u8_format.copy() 'format_id': 'direct',
f.update({ 'url': video_url,
'url': http_url, 'ext': ext or file.get('type'),
'format_id': f['format_id'].replace('hls', 'http'), 'fps': int_or_none(quality.get('fps')),
'protocol': 'http', 'tbr': int_or_none(quality.get('bitrate'), scale=1000),
}) 'width': int_or_none(quality.get('width')),
formats.append(f) 'height': int_or_none(quality.get('height')),
else: })
formats = [{
'format_id': 'direct',
'url': video_url,
'ext': determine_ext(video_url, 'mp4'),
}]
self._sort_formats(formats) self._sort_formats(formats)
return { title = traverse_obj(info, 'subtitle', 'title', 'seoTitle', expected_type=txt_or_none)
# `seoDescription` may be False
description = traverse_obj(info, 'description', 'seoDescription',
expected_type=lambda x: txt_or_none(x or None))
thumbnails = []
for thumb in traverse_obj(content, ('posters', Ellipsis), expected_type=dict):
thumb_url = thumb.get('src')
if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url:
continue
thumbnails.append({
'url': thumb.get('src'),
'width': thumb.get('width'),
'height': thumb.get('height'),
})
age_limit = traverse_obj(info, ('ageGroup', 'minAge'), expected_type=int)
if age_limit == 1:
age_limit = 0
duration = traverse_obj(info, 'duration', expected_type=int) if not is_live else None
subtitles = {}
for sub in traverse_obj(content, ('subtitles', Ellipsis), expected_type=dict):
if not (sub.get('url') and sub.get('lang')):
continue
subtitles.setdefault(sub['lang'], []).append({
'url': sub['url'],
'ext': sub.get('type'),
})
info_dict = {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'thumbnail': thumbnail, 'description': description,
'thumbnails': thumbnails,
'age_limit': age_limit,
'is_live': is_live,
'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': subtitles,
}
# vod.tvp.pl
if traverse_obj(info, 'vortalName') == 'vod':
info_dict.update({
'title': '%s, %s' % (info.get('title'), info.get('subtitle')),
'series': info.get('title'),
'season': info.get('season'),
'episode_number': info.get('episode') or None,
})
return info_dict
class TVPVODBaseIE(InfoExtractor):
_API_BASE_URL = 'https://vod.tvp.pl/api/products/'
def _call_api(self, resource, video_id, **kwargs):
    """Download and return the JSON document for `resource`.

    `resource` is appended to `_API_BASE_URL`. The request always carries
    the `lang=pl` and `platform=BROWSER` query parameters; any further
    keyword arguments are handed unchanged to `_download_json()`.
    """
    download = self._download_json
    api_url = self._API_BASE_URL + resource
    fixed_query = {'lang': 'pl', 'platform': 'BROWSER'}
    return download(api_url, video_id, query=fixed_query, **kwargs)
def _parse_video(self, video):
video_id = traverse_obj(video, 'externalUid', expected_type=txt_or_none)
if not video_id:
return None
return {
'_type': 'url',
'url': 'tvp:' + video_id,
'ie_key': TVPEmbedIE.ie_key(),
'title': video.get('title'),
'description': traverse_obj(video, ('lead', 'description'), expected_type=txt_or_none),
'age_limit': int_or_none(video.get('rating')),
'duration': int_or_none(video.get('duration')),
} }
class TVPWebsiteIE(InfoExtractor): class TVPVODVideoIE(TVPVODBaseIE):
IE_NAME = 'tvp:series' IE_NAME = 'tvp:vod'
_VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' _VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$'
_TESTS = [{ _TESTS = [{
# series 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
'info_dict': { 'info_dict': {
'id': '38678312', 'id': '60468609',
},
'playlist_count': 115,
}, {
# film
'url': 'https://vod.tvp.pl/website/gloria,35139666',
'info_dict': {
'id': '36637049',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Gloria, Gloria', 'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
}, 'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c',
'params': { 'duration': 300,
'skip_download': True, 'episode_number': 24,
'episode': 'Episode 24',
'age_limit': 0,
'series': 'Laboratorium alchemika',
'thumbnail': 're:https://.+',
}, },
'add_ie': ['TVPEmbed'], 'add_ie': ['TVPEmbed'],
}, { }, {
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667',
'info_dict': {
'id': '51640077',
'ext': 'mp4',
'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu',
'series': 'Ukraiński sługa narodu',
'description': 'md5:b7940c0a8e439b0c81653a986f544ef3',
'age_limit': 12,
'duration': 3051,
'thumbnail': 're:https://.+',
},
'add_ie': ['TVPEmbed'],
}, {
# new URL format
'url': 'https://vod.tvp.pl/seriale,18/czas-honoru-odcinki,292065/odcinek-13,S01E13,313867',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:76649d2014f65c99477be17f23a4dead',
'age_limit': 12,
},
'add_ie': ['TVPEmbed'],
}, {
'url': 'https://vod.tvp.pl/filmy-fabularne,136/rozlam,390638',
'only_matching': True,
}]
def _real_extract(self, url):
    """Return the result describing the video that `url` points to.

    The id matched from `url` is requested as `vods/<id>` through
    `_call_api()`, and the response is converted by `_parse_video()`.

    Raises ExtractorError if `_parse_video()` produces nothing.
    """
    video_id = self._match_id(url)
    parse_video = self._parse_video
    api_data = self._call_api('vods/' + video_id, video_id)
    video = parse_video(api_data)
    if video:
        return video
    raise ExtractorError('No video data for ' + video_id)
class TVPVODSeriesIE(TVPVODBaseIE):
IE_NAME = 'tvp:vod:series'
# Matches a series listing page, e.g.
#   https://vod.tvp.pl/seriale,18/ranczo-odcinki,316445
# `display_id` captures the leading "<category>,<number>" path component and
# `id` the number following "-odcinki,". An optional query string and/or
# fragment may follow; any further path component (as in a single-episode
# URL) prevents a match.
_VALID_URL = r'https?://vod\.tvp\.pl/(?P<display_id>[a-z\d-]+,\d+)/[a-z\d-]+-odcinki,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$'
_TESTS = [{
# series
'url': 'https://vod.tvp.pl/seriale,18/ranczo-odcinki,316445',
# series (old) - redirects to home page
# 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video',
'info_dict': {
'id': '316445',
'title': 'Ranczo',
# 'description': 'md5:a7ccbe1296e6f32425cef17639f1b24b',
'age_limit': 12,
'categories': ['seriale'],
},
'playlist_mincount': 129,
}, {
'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514',
'only_matching': True,
}, {
'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338',
'only_matching': True, 'only_matching': True,
}] }]
def _entries(self, display_id, playlist_id): def _entries(self, display_id, playlist_id):
url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) season_path = 'vods/serials/%s/seasons' % (playlist_id, )
for page_num in itertools.count(1): seasons = self._call_api(
page = self._download_webpage( season_path, playlist_id,
url, display_id, 'Downloading page %d' % page_num, note='Downloading season list') or []
query={'page': page_num})
video_ids = orderedSet(re.findall( for ii, season in enumerate(seasons, 1):
r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, season_id = traverse_obj(season, 'id', expected_type=txt_or_none)
page)) if not season_id:
continue
if not video_ids: episodes = self._call_api(
break '%s/%s/episodes' % (season_path, season_id), playlist_id,
note='Downloading episode list (season %d)' % ii)
for video_id in video_ids: for episode in episodes or []:
yield self.url_result( video_id = traverse_obj(episode, 'externalUid', expected_type=txt_or_none)
'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), if video_id:
video_id=video_id) yield self._parse_video(episode)
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) display_id, playlist_id = self._match_valid_url(url).group('display_id', 'id')
display_id, playlist_id = mobj.group('display_id', 'id') metadata = self._call_api(
return self.playlist_result( 'vods/serials/' + playlist_id, playlist_id,
self._entries(display_id, playlist_id), playlist_id) note='Downloading serial metadata') or {}
pl = self.playlist_result(
self._entries(display_id, playlist_id), playlist_id, txt_or_none(metadata.get('title')))
pl.update({
'description': traverse_obj(metadata, ('description', 'lead'), expected_type=clean_html),
'categories': traverse_obj(metadata, ('mainCategory', (None, Ellipsis), 'name'), expected_type=txt_or_none),
'age_limit': traverse_obj(metadata, 'rating', expected_type=int),
})
return pl

View File

@ -14,6 +14,7 @@ from .utils import (
remove_quotes, remove_quotes,
unified_timestamp, unified_timestamp,
variadic, variadic,
write_string,
) )
from .compat import ( from .compat import (
compat_basestring, compat_basestring,
@ -53,15 +54,16 @@ def wraps_op(op):
# NB In principle NaN cannot be checked by membership. # NB In principle NaN cannot be checked by membership.
# Here all NaN values are actually this one, so _NaN is _NaN, # Here all NaN values are actually this one, so _NaN is _NaN,
# although _NaN != _NaN. # although _NaN != _NaN. Ditto Infinity.
_NaN = float('nan') _NaN = float('nan')
_Infinity = float('inf')
def _js_bit_op(op): def _js_bit_op(op):
def zeroise(x): def zeroise(x):
return 0 if x in (None, JS_Undefined, _NaN) else x return 0 if x in (None, JS_Undefined, _NaN, _Infinity) else x
@wraps_op(op) @wraps_op(op)
def wrapped(a, b): def wrapped(a, b):
@ -84,7 +86,7 @@ def _js_arith_op(op):
def _js_div(a, b): def _js_div(a, b):
if JS_Undefined in (a, b) or not (a or b): if JS_Undefined in (a, b) or not (a or b):
return _NaN return _NaN
return operator.truediv(a or 0, b) if b else float('inf') return operator.truediv(a or 0, b) if b else _Infinity
def _js_mod(a, b): def _js_mod(a, b):
@ -220,6 +222,42 @@ class LocalNameSpace(ChainMap):
return 'LocalNameSpace%s' % (self.maps, ) return 'LocalNameSpace%s' % (self.maps, )
class Debugger(object):
    """Optional tracer for the JS interpreter.

    While `ENABLED` is true, every non-blank statement that passes through a
    function wrapped by `wrap_interpreter()` is echoed with `write_string()`,
    followed by the value it produced or the error it raised.
    """

    # Master switch: no trace output is produced unless this is true
    ENABLED = False

    @staticmethod
    def write(*args, **kwargs):
        """Emit one `[debug] JS:` line built from `args`.

        Each positional argument is converted with `compat_str()`. Texts
        longer than 100 characters are shortened to their first 47 and last
        50 characters joined by '...'. The pieces are joined with spaces.

        The only recognised keyword is `level` (default 100): the message is
        prefixed by `100 - level` copies of the indent string, so a lower
        level gives a deeper indent.
        """
        level = kwargs.get('level', 100)

        def truncate_string(s, left, right=0):
            # keep `left` chars in total at the front (incl. the '...')
            # and `right` chars at the back
            if s is None or len(s) <= left + right:
                return s
            return '...'.join((s[:left - 3], s[-right:] if right else ''))

        write_string('[debug] JS: {0}{1}\n'.format(
            ' ' * (100 - level),
            ' '.join(truncate_string(compat_str(x), 50, 50) for x in args)))

    @classmethod
    def wrap_interpreter(cls, f):
        """Return `f` wrapped so that its calls are traced while `ENABLED`.

        `f` must accept `(self, stmt, local_vars, allow_recursion, ...)` and
        return a `(value, should_return)` pair. The wrapper returns that pair
        unchanged and re-raises any exception from `f` after logging it.
        `allow_recursion` doubles as the trace `level`.
        """
        import functools

        # The default for `allow_recursion` mirrors the decorated
        # `interpret_statement(self, stmt, local_vars, allow_recursion=100)`,
        # so that calls omitting the argument keep working once wrapped.
        @functools.wraps(f)
        def interpret_statement(self, stmt, local_vars, allow_recursion=100, *args, **kwargs):
            if cls.ENABLED and stmt.strip():
                cls.write(stmt, level=allow_recursion)
            try:
                ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs)
            except Exception as e:
                if cls.ENABLED:
                    # for ExtractorError, log the message as originally passed
                    detail = e.orig_msg if isinstance(e, ExtractorError) else e
                    cls.write('=> Raises:', detail, '<-|', stmt, level=allow_recursion)
                raise
            if cls.ENABLED and stmt.strip():
                # skip the echo when the result is just the statement itself
                if should_ret or not repr(ret) == stmt:
                    cls.write('=>' if should_ret else '->', repr(ret), '<-|', stmt, level=allow_recursion)
            return ret, should_ret

        return interpret_statement
class JSInterpreter(object): class JSInterpreter(object):
__named_object_counter = 0 __named_object_counter = 0
@ -307,8 +345,7 @@ class JSInterpreter(object):
def __op_chars(cls): def __op_chars(cls):
op_chars = set(';,[') op_chars = set(';,[')
for op in cls._all_operators(): for op in cls._all_operators():
for c in op[0]: op_chars.update(op[0])
op_chars.add(c)
return op_chars return op_chars
def _named_object(self, namespace, obj): def _named_object(self, namespace, obj):
@ -326,9 +363,8 @@ class JSInterpreter(object):
# collections.Counter() is ~10% slower in both 2.7 and 3.9 # collections.Counter() is ~10% slower in both 2.7 and 3.9
counters = dict((k, 0) for k in _MATCHING_PARENS.values()) counters = dict((k, 0) for k in _MATCHING_PARENS.values())
start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
in_quote, escaping, skipping = None, False, 0 in_quote, escaping, after_op, in_regex_char_group = None, False, True, False
after_op, in_regex_char_group = True, False skipping = 0
for idx, char in enumerate(expr): for idx, char in enumerate(expr):
paren_delta = 0 paren_delta = 0
if not in_quote: if not in_quote:
@ -382,10 +418,12 @@ class JSInterpreter(object):
return separated[0][1:].strip(), separated[1].strip() return separated[0][1:].strip(), separated[1].strip()
@staticmethod @staticmethod
def _all_operators(): def _all_operators(_cached=[]):
return itertools.chain( if not _cached:
# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence _cached.extend(itertools.chain(
_SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS) # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence
_SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS))
return _cached
def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion):
if op in ('||', '&&'): if op in ('||', '&&'):
@ -416,7 +454,7 @@ class JSInterpreter(object):
except Exception as e: except Exception as e:
if allow_undefined: if allow_undefined:
return JS_Undefined return JS_Undefined
raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e) raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e)
def _dump(self, obj, namespace): def _dump(self, obj, namespace):
try: try:
@ -438,6 +476,7 @@ class JSInterpreter(object):
_FINALLY_RE = re.compile(r'finally\s*\{') _FINALLY_RE = re.compile(r'finally\s*\{')
_SWITCH_RE = re.compile(r'switch\s*\(') _SWITCH_RE = re.compile(r'switch\s*\(')
@Debugger.wrap_interpreter
def interpret_statement(self, stmt, local_vars, allow_recursion=100): def interpret_statement(self, stmt, local_vars, allow_recursion=100):
if allow_recursion < 0: if allow_recursion < 0:
raise self.Exception('Recursion limit reached') raise self.Exception('Recursion limit reached')
@ -511,7 +550,6 @@ class JSInterpreter(object):
expr = self._dump(inner, local_vars) + outer expr = self._dump(inner, local_vars) + outer
if expr.startswith('('): if expr.startswith('('):
m = re.match(r'\((?P<d>[a-z])%(?P<e>[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr) m = re.match(r'\((?P<d>[a-z])%(?P<e>[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr)
if m: if m:
# short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig` # short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig`
@ -693,7 +731,7 @@ class JSInterpreter(object):
(?P<op>{_OPERATOR_RE})? (?P<op>{_OPERATOR_RE})?
=(?!=)(?P<expr>.*)$ =(?!=)(?P<expr>.*)$
)|(?P<return> )|(?P<return>
(?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ (?!if|return|true|false|null|undefined|NaN|Infinity)(?P<name>{_NAME_RE})$
)|(?P<indexing> )|(?P<indexing>
(?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$
)|(?P<attribute> )|(?P<attribute>
@ -727,11 +765,12 @@ class JSInterpreter(object):
raise JS_Break() raise JS_Break()
elif expr == 'continue': elif expr == 'continue':
raise JS_Continue() raise JS_Continue()
elif expr == 'undefined': elif expr == 'undefined':
return JS_Undefined, should_return return JS_Undefined, should_return
elif expr == 'NaN': elif expr == 'NaN':
return _NaN, should_return return _NaN, should_return
elif expr == 'Infinity':
return _Infinity, should_return
elif md.get('return'): elif md.get('return'):
return local_vars[m.group('name')], should_return return local_vars[m.group('name')], should_return
@ -760,18 +799,28 @@ class JSInterpreter(object):
right_expr = separated.pop() right_expr = separated.pop()
# handle operators that are both unary and binary, minimal BODMAS # handle operators that are both unary and binary, minimal BODMAS
if op in ('+', '-'): if op in ('+', '-'):
# simplify/adjust consecutive instances of these operators
undone = 0 undone = 0
while len(separated) > 1 and not separated[-1].strip(): while len(separated) > 1 and not separated[-1].strip():
undone += 1 undone += 1
separated.pop() separated.pop()
if op == '-' and undone % 2 != 0: if op == '-' and undone % 2 != 0:
right_expr = op + right_expr right_expr = op + right_expr
elif op == '+':
while len(separated) > 1 and separated[-1].strip() in self.OP_CHARS:
right_expr = separated.pop() + right_expr
# hanging op at end of left => unary + (strip) or - (push right)
left_val = separated[-1] left_val = separated[-1]
for dm_op in ('*', '%', '/', '**'): for dm_op in ('*', '%', '/', '**'):
bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim))
if len(bodmas) > 1 and not bodmas[-1].strip(): if len(bodmas) > 1 and not bodmas[-1].strip():
expr = op.join(separated) + op + right_expr expr = op.join(separated) + op + right_expr
right_expr = None if len(separated) > 1:
separated.pop()
right_expr = op.join((left_val, right_expr))
else:
separated = [op.join((left_val, right_expr))]
right_expr = None
break break
if right_expr is None: if right_expr is None:
continue continue
@ -797,6 +846,8 @@ class JSInterpreter(object):
def eval_method(): def eval_method():
if (variable, member) == ('console', 'debug'): if (variable, member) == ('console', 'debug'):
if Debugger.ENABLED:
Debugger.write(self.interpret_expression('[{}]'.format(arg_str), local_vars, allow_recursion))
return return
types = { types = {
'String': compat_str, 'String': compat_str,

View File

@ -2406,7 +2406,7 @@ class ExtractorError(YoutubeDLError):
""" tb, if given, is the original traceback (so that it can be printed out). """ tb, if given, is the original traceback (so that it can be printed out).
If expected is set, this is a normal error message and most likely not a bug in youtube-dl. If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
""" """
self.orig_msg = msg
if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
expected = True expected = True
if video_id is not None: if video_id is not None: