Compare commits

...

11 Commits

Author SHA1 Message Date
Olivier Trichet
a05e5a4ae1
Merge 72db217289 into 4d05f84325 2024-06-27 06:36:32 +08:00
dirkf
4d05f84325 [PalcoMP3] Conform to new linter rule
* no space after @ in decorator
2024-06-20 20:03:49 +01:00
dirkf
e0094e63c3 [jsinterp] Various tweaks
* treat Infinity like NaN
* cache operator list
2024-06-20 20:03:49 +01:00
dirkf
fd8242e3ef [jsinterp] Fix and improve expression parsing
* improve BODMAS (fixes https://github.com/ytdl-org/youtube-dl/issues/32815)
* support more weird expressions with multiple unary ops
2024-06-20 20:03:49 +01:00
dirkf
ad01fa6cca [jsinterp] Add Debugger from yt-dlp
* https://github.com/yt-dlp/yt-dlp/commit/8f53dc4
* thx pukkandan
2024-06-20 20:03:49 +01:00
dirkf
2eac0fa379 [utils] Save orig_msg in ExtractorError 2024-06-20 20:03:49 +01:00
Olivier Trichet
72db217289 [RadioFrance] Extractor for thematic webradios 2022-12-22 14:22:19 -05:00
Olivier Trichet
fc933e686b [RadioFrance] Refactoring 2022-12-22 13:01:10 -05:00
Olivier Trichet
ea02c40539 [RadioFrance] Extractor for podcast playlists 2022-12-22 13:00:54 -05:00
Olivier Trichet
7270ecf3d6 [RadioFrance] Extractor for podcasts of Radio France stations 2022-12-22 13:00:17 -05:00
Olivier Trichet
dade9111f1 [RadioFrance] Remove old Radio France stations extractors
These no longer work after their respective websites were merged into www.radiofrance.fr.
2022-12-22 13:00:08 -05:00
10 changed files with 360 additions and 205 deletions

View File

@ -577,9 +577,11 @@ class TestJSInterpreter(unittest.TestCase):
def test_unary_operators(self):
jsi = JSInterpreter('function f(){return 2 - - - 2;}')
self.assertEqual(jsi.call_function('f'), 0)
# fails
# jsi = JSInterpreter('function f(){return 2 + - + - - 2;}')
# self.assertEqual(jsi.call_function('f'), 0)
jsi = JSInterpreter('function f(){return 2 + - + - - 2;}')
self.assertEqual(jsi.call_function('f'), 0)
# https://github.com/ytdl-org/youtube-dl/issues/32815
jsi = JSInterpreter('function f(){return 0 - 7 * - 6;}')
self.assertEqual(jsi.call_function('f'), 42)
""" # fails so far
def test_packed(self):
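For context, the two newly enabled expressions can also be exercised outside the test harness (a minimal sketch, assuming this branch's youtube_dl package is importable):

from youtube_dl.jsinterp import JSInterpreter

# both expressions were mis-parsed before the BODMAS/unary-operator fixes above
print(JSInterpreter('function f(){return 2 + - + - - 2;}').call_function('f'))  # -> 0
print(JSInterpreter('function f(){return 0 - 7 * - 6;}').call_function('f'))    # -> 42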

View File

@ -158,6 +158,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js',
'_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ',
),
(
'https://www.youtube.com/s/player/590f65a6/player_ias.vflset/en_US/base.js',
'1tm7-g_A9zsI8_Lay_', 'xI4Vem4Put_rOg',
),
]

View File

@ -3033,7 +3033,6 @@ class InfoExtractor(object):
transform_source=transform_source, default=None)
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
# allow passing `transform_source` through to _find_jwplayer_data()
transform_source = kwargs.pop('transform_source', None)
kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}

View File

@ -413,8 +413,6 @@ from .foxnews import (
FoxNewsArticleIE,
)
from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
FranceTVIE,
FranceTVSiteIE,
@ -1011,7 +1009,11 @@ from .radiocanada import (
from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .radiofrance import (
RadioFrancePodcastEpisodeIE,
RadioFrancePodcastPlaylistIE,
RadioFranceWebradioIE,
)
from .rai import (
RaiPlayIE,
RaiPlayLiveIE,

View File

@ -1,73 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
)
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': {
'id': 'rendez-vous-au-pays-des-geeks',
'display_id': 'rendez-vous-au-pays-des-geeks',
'ext': 'mp3',
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
'timestamp': 1393700400,
'vcodec': 'none',
}
}, {
# no thumbnail
'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = extract_attributes(self._search_regex(
r'''(?sx)
(?:
</h1>|
<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
).*?
(<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
''',
webpage, 'video data'))
video_url = video_data.get('data-url') or video_data['data-asset-source']
title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
description = self._html_search_regex(
r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
webpage, 'description', default=None)
thumbnail = self._search_regex(
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
webpage, 'thumbnail', default=None)
uploader = self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
ext = determine_ext(video_url.lower())
return {
'id': display_id,
'display_id': display_id,
'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
}

View File

@ -1,59 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import month_by_name
class FranceInterIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
_TEST = {
'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
'md5': '9e54d7bdb6fdc02a841007f8a975c094',
'info_dict': {
'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
'ext': 'mp3',
'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
'description': 'md5:401969c5d318c061f86bda1fa359292b',
'thumbnail': r're:^https?://.*\.jpg',
'upload_date': '20160907',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'video url', group='url')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
upload_date_str = self._search_regex(
r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
webpage, 'upload date', fatal=False)
if upload_date_str:
upload_date_list = upload_date_str.split()
upload_date_list.reverse()
upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
upload_date_list[2] = '%02d' % int(upload_date_list[2])
upload_date = ''.join(upload_date_list)
else:
upload_date = None
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'formats': [{
'url': video_url,
'vcodec': 'none',
}],
}

View File

@ -8,7 +8,7 @@ from ..compat import compat_str
from ..utils import (
int_or_none,
str_or_none,
try_get,
traverse_obj,
)
@ -109,7 +109,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
}
name'''
@ classmethod
@classmethod
def suitable(cls, url):
return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
@ -118,7 +118,8 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
def entries():
for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []):
for music in traverse_obj(artist, (
'musics', 'nodes', lambda _, m: m['musicID'])):
yield self._parse_music(music)
return self.playlist_result(
@ -137,7 +138,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE):
'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
'description': 'md5:7043342c09a224598e93546e98e49282',
'upload_date': '20161107',
'uploader_id': 'maiaramaraisaoficial',
'uploader_id': '@maiaramaraisaoficial',
'uploader': 'Maiara e Maraisa',
}
}]
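As an aside on the try_get → traverse_obj change above, a minimal illustration of the branching path it relies on (assuming the backported traverse_obj mirrors yt-dlp's behaviour; the sample data is made up):

from youtube_dl.utils import traverse_obj

artist = {'musics': {'nodes': [
    {'musicID': '42', 'title': 'Você Faz Falta Aqui'},
    {'title': 'node without a musicID'},
]}}
# the callable receives (index, node) pairs and acts as a filter: nodes without
# a truthy musicID are skipped, whereas the old try_get path yielded every node
for music in traverse_obj(artist, ('musics', 'nodes', lambda _, m: m['musicID'])):
    print(music['title'])  # -> only 'Você Faz Falta Aqui'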

View File

@ -4,56 +4,284 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_attribute,
int_or_none,
parse_iso8601,
strip_or_none,
url_or_none
)
class RadioFranceIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
IE_NAME = 'radiofrance'
class RadioFranceBaseIE(InfoExtractor):
_BASE_URL = r'https://www.radiofrance.fr/'
_TEST = {
'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
'info_dict': {
'id': 'one-one',
'ext': 'ogg',
'title': 'One to one',
'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
'uploader': 'Thomas Hercouët',
},
def extract_api_data(self, api_path, id, html):
pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*)</script>' % api_path
json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
if not json:
raise ExtractorError('%s: JSON data not found' % id)
try:
json = self._parse_json(json, id)
json = self._parse_json(json['body'], id)
if api_path == 'path':
return json['content']
elif api_path == 'stations':
return json
else:
raise ExtractorError('Coding error')
except KeyError:
raise ExtractorError('%s: Invalid JSON' % id)
def get_title(self, api_data, webpage=None):
title = strip_or_none(api_data.get('title'))
if not title and webpage:
title = strip_or_none(get_element_by_attribute('h1', None, webpage, False)) or strip_or_none(self._og_search_title(webpage))
return title
def get_description(self, api_data, webpage=None):
description = strip_or_none(api_data.get('standFirst'))
if not description and webpage:
description = strip_or_none(self._og_search_description(webpage))
return description
def get_thumbnail(self, api_data, webpage=None):
thumbnail = None
visual = api_data.get('visual')
if visual:
thumbnail = url_or_none(visual.get('src'))
if not thumbnail and webpage:
thumbnail = self._og_search_thumbnail(webpage)
return thumbnail
def get_timestamp(self, api_data, webpage=None):
timestamp = api_data.get('publishedDate')
if not timestamp and webpage:
timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', ))
return timestamp
def get_brand(self, api_data, webpage=None):
brand = strip_or_none(api_data.get('brand'))
if not brand and webpage:
brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False)
return brand
def extract_episode(self, episode_id, api_data):
manifestations = api_data.get('manifestations')
if manifestations is None or len(manifestations) == 0:
return None, None
url = url_or_none(manifestations[0]['url'])
duration = int_or_none(manifestations[0].get('duration'))
return url, duration
def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
playlist_data = api_data['expressions']
entries = []
items = playlist_data.get('items')
for item in items:
episode_path = item.get('path')
if episode_path is None:
self.report_warning('No path found for episode "%s"' % item.get('title'))
continue
episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path)
if episode_id is None:
self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
continue
episode_url, duration = self.extract_episode(episode_id, item)
if episode_url is None:
self.to_screen('Episode "%s" is not available' % episode_path)
continue
entry = {
'id': episode_id,
'url': episode_url,
'title': self.get_title(item),
'description': self.get_description(item),
'timestamp': self.get_timestamp(item),
'thumbnail': self.get_thumbnail(item),
'duration': duration,
}
entries.append(entry)
page_number = int_or_none(playlist_data.get('pageNumber'))
if page_number:
if direction in ['both', 'prev'] and playlist_data.get('prev') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
if direction in ['both', 'next'] and playlist_data.get('next') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
return entries
def get_data(self, url, api_path, id, page=None):
query = {}
note = None
if page:
query['p'] = page
note = "Downloading page %i" % page
webpage = self._download_webpage(url, id, query=query, note=note)
api_data = self.extract_api_data(api_path, id, webpage)
return webpage, api_data
class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P<id>\d+)$'
_TESTS = [{
'note': 'Podcast episode with audio from France Info',
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713',
'info_dict': {
'id': '8310713',
'ext': 'mp3',
'url': r're:^https?://.*\.mp3$',
'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar',
'description': str,
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': int,
'duration': int,
'upload_date': str
}
}, {
'note': 'Podcast episode from France Musique',
'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228',
'only_matching': True
}, {
'note': 'Podcast episode from FranceInter',
'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281',
'only_matching': True
}, {
'note': 'Podcast episode from France Culture',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610',
'only_matching': True
}, {
'note': 'Podcast episode from Le Mouv',
'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950',
'only_matching': True
}, {
'note': 'Podcast episode from FIP',
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742',
'only_matching': True
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
'preference': i,
}
for i, fm in
enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
self._sort_formats(formats)
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'path', id)
url, duration = self.extract_episode(id, api_data)
if url is None:
msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
raise ExtractorError(msg, expected=True, video_id=id)
return {
'id': video_id,
'title': title,
'formats': formats,
'description': description,
'uploader': uploader,
'id': id,
'url': url,
'title': self.get_title(api_data, webpage),
'description': self.get_description(api_data, webpage),
'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'duration': duration,
}
class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P<id>[^/]+?)(?:[?#].*)?$'
_TESTS = [{
'note': 'Podcast show with multiple pages of episodes and some of them are missing',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2',
'info_dict': {
'id': 'une-semaine-dans-le-monde-10-11',
'title': 'Une semaine dans le monde | 10-11',
'description': str,
'timestamp': int
},
'playlist_count': 23,
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'path', id)
entries = self.get_playlist_entries(url, id, api_data, direction='both')
entries.reverse()
return {
'id': id,
'_type': 'playlist',
'entries': entries,
'title': self.get_title(api_data, webpage),
'description': self.get_description(api_data, webpage),
'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
}
class RadioFranceWebradioIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
_TESTS = [{
'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
'url': 'https://www.radiofrance.fr/fip/radio-metal',
'info_dict': {
'id': 'radio-metal',
'ext': 'aac',
'title': str,
},
'params': {
'format': 'aac',
'skip_download': True,
}
}]
def get_livestream_formats(self, id, api_data):
sources = api_data['media']['sources']
formats = []
for source in sources:
url = source.get('url')
if not url:
continue
format_id = source.get('format')
format = {
'url': url,
'format_id': format_id,
'asr': 48000,
'vcodec': 'none'
}
if format_id == 'mp3':
format['preference'] = 1
format['acodec'] = 'mp3'
format['abr'] = source.get('bitrate')
elif format_id == 'aac':
format['preference'] = 2
format['acodec'] = 'aac'
format['abr'] = source.get('bitrate')
elif format_id == 'hls':
format['preference'] = 0
format['manifest_url'] = url
formats.append(format)
if len(formats) == 0:
raise ExtractorError('No live streaming URL found')
return formats
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'stations', id)
return {
'id': id,
'title': self.get_title(api_data, webpage),
'formats': self.get_livestream_formats(id, api_data),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'is_live': True
}
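A usage sketch for the new extractors, based on one of the test URLs above (assumes this branch is installed as youtube_dl; not part of the diff):

import youtube_dl

# RadioFrancePodcastEpisodeIE should match this URL and yield a single mp3 entry
with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
    info = ydl.extract_info(
        'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/'
        'le-brief-eco-du-lundi-05-septembre-2022-8310713',
        download=False)
    print(info['id'], info['ext'], info['title'])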

View File

@ -14,6 +14,7 @@ from .utils import (
remove_quotes,
unified_timestamp,
variadic,
write_string,
)
from .compat import (
compat_basestring,
@ -53,15 +54,16 @@ def wraps_op(op):
# NB In principle NaN cannot be checked by membership.
# Here all NaN values are actually this one, so _NaN is _NaN,
# although _NaN != _NaN.
# although _NaN != _NaN. Ditto Infinity.
_NaN = float('nan')
_Infinity = float('inf')
def _js_bit_op(op):
def zeroise(x):
return 0 if x in (None, JS_Undefined, _NaN) else x
return 0 if x in (None, JS_Undefined, _NaN, _Infinity) else x
@wraps_op(op)
def wrapped(a, b):
@ -84,7 +86,7 @@ def _js_arith_op(op):
def _js_div(a, b):
if JS_Undefined in (a, b) or not (a or b):
return _NaN
return operator.truediv(a or 0, b) if b else float('inf')
return operator.truediv(a or 0, b) if b else _Infinity
def _js_mod(a, b):
@ -220,6 +222,42 @@ class LocalNameSpace(ChainMap):
return 'LocalNameSpace%s' % (self.maps, )
class Debugger(object):
ENABLED = False
@staticmethod
def write(*args, **kwargs):
level = kwargs.get('level', 100)
def truncate_string(s, left, right=0):
if s is None or len(s) <= left + right:
return s
return '...'.join((s[:left - 3], s[-right:] if right else ''))
write_string('[debug] JS: {0}{1}\n'.format(
' ' * (100 - level),
' '.join(truncate_string(compat_str(x), 50, 50) for x in args)))
@classmethod
def wrap_interpreter(cls, f):
def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs):
if cls.ENABLED and stmt.strip():
cls.write(stmt, level=allow_recursion)
try:
ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs)
except Exception as e:
if cls.ENABLED:
if isinstance(e, ExtractorError):
e = e.orig_msg
cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion)
raise
if cls.ENABLED and stmt.strip():
if should_ret or not repr(ret) == stmt:
cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion)
return ret, should_ret
return interpret_statement
class JSInterpreter(object):
__named_object_counter = 0
@ -307,8 +345,7 @@ class JSInterpreter(object):
def __op_chars(cls):
op_chars = set(';,[')
for op in cls._all_operators():
for c in op[0]:
op_chars.add(c)
op_chars.update(op[0])
return op_chars
def _named_object(self, namespace, obj):
@ -326,9 +363,8 @@ class JSInterpreter(object):
# collections.Counter() is ~10% slower in both 2.7 and 3.9
counters = dict((k, 0) for k in _MATCHING_PARENS.values())
start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
in_quote, escaping, skipping = None, False, 0
after_op, in_regex_char_group = True, False
in_quote, escaping, after_op, in_regex_char_group = None, False, True, False
skipping = 0
for idx, char in enumerate(expr):
paren_delta = 0
if not in_quote:
@ -382,10 +418,12 @@ class JSInterpreter(object):
return separated[0][1:].strip(), separated[1].strip()
@staticmethod
def _all_operators():
return itertools.chain(
def _all_operators(_cached=[]):
if not _cached:
_cached.extend(itertools.chain(
# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence
_SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS)
_SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS))
return _cached
def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion):
if op in ('||', '&&'):
@ -416,7 +454,7 @@ class JSInterpreter(object):
except Exception as e:
if allow_undefined:
return JS_Undefined
raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e)
raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e)
def _dump(self, obj, namespace):
try:
@ -438,6 +476,7 @@ class JSInterpreter(object):
_FINALLY_RE = re.compile(r'finally\s*\{')
_SWITCH_RE = re.compile(r'switch\s*\(')
@Debugger.wrap_interpreter
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
if allow_recursion < 0:
raise self.Exception('Recursion limit reached')
@ -511,7 +550,6 @@ class JSInterpreter(object):
expr = self._dump(inner, local_vars) + outer
if expr.startswith('('):
m = re.match(r'\((?P<d>[a-z])%(?P<e>[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr)
if m:
# short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig`
@ -693,7 +731,7 @@ class JSInterpreter(object):
(?P<op>{_OPERATOR_RE})?
=(?!=)(?P<expr>.*)$
)|(?P<return>
(?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$
(?!if|return|true|false|null|undefined|NaN|Infinity)(?P<name>{_NAME_RE})$
)|(?P<indexing>
(?P<in>{_NAME_RE})\[(?P<idx>.+)\]$
)|(?P<attribute>
@ -727,11 +765,12 @@ class JSInterpreter(object):
raise JS_Break()
elif expr == 'continue':
raise JS_Continue()
elif expr == 'undefined':
return JS_Undefined, should_return
elif expr == 'NaN':
return _NaN, should_return
elif expr == 'Infinity':
return _Infinity, should_return
elif md.get('return'):
return local_vars[m.group('name')], should_return
@ -760,17 +799,27 @@ class JSInterpreter(object):
right_expr = separated.pop()
# handle operators that are both unary and binary, minimal BODMAS
if op in ('+', '-'):
# simplify/adjust consecutive instances of these operators
undone = 0
while len(separated) > 1 and not separated[-1].strip():
undone += 1
separated.pop()
if op == '-' and undone % 2 != 0:
right_expr = op + right_expr
elif op == '+':
while len(separated) > 1 and separated[-1].strip() in self.OP_CHARS:
right_expr = separated.pop() + right_expr
# hanging op at end of left => unary + (strip) or - (push right)
left_val = separated[-1]
for dm_op in ('*', '%', '/', '**'):
bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim))
if len(bodmas) > 1 and not bodmas[-1].strip():
expr = op.join(separated) + op + right_expr
if len(separated) > 1:
separated.pop()
right_expr = op.join((left_val, right_expr))
else:
separated = [op.join((left_val, right_expr))]
right_expr = None
break
if right_expr is None:
@ -797,6 +846,8 @@ class JSInterpreter(object):
def eval_method():
if (variable, member) == ('console', 'debug'):
if Debugger.ENABLED:
Debugger.write(self.interpret_expression('[{}]'.format(arg_str), local_vars, allow_recursion))
return
types = {
'String': compat_str,
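The Debugger added above is opt-in; a minimal sketch of enabling it to trace statement evaluation (assuming this branch; trace lines are emitted through write_string with a '[debug] JS:' prefix):

from youtube_dl.jsinterp import Debugger, JSInterpreter

Debugger.ENABLED = True  # wrap_interpreter then echoes every interpreted statement
jsi = JSInterpreter('function f(){return 0 - 7 * - 6;}')
print(jsi.call_function('f'))  # statement trace, then 42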

View File

@ -2406,7 +2406,7 @@ class ExtractorError(YoutubeDLError):
""" tb, if given, is the original traceback (so that it can be printed out).
If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
"""
self.orig_msg = msg
if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
expected = True
if video_id is not None:
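For illustration (not part of the diff): orig_msg keeps the message exactly as passed in, before the video-id prefix, cause, or bug-report boilerplate is appended, which is what the Debugger above re-prints on failure:

from youtube_dl.utils import ExtractorError

try:
    raise ExtractorError('Unsupported URL: https://example.invalid', expected=True)
except ExtractorError as e:
    print(e.orig_msg)  # -> 'Unsupported URL: https://example.invalid'
    print(str(e))      # full message, with any prefixes/suffixes applied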