diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d838b3981..874ef6de7 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -995,7 +995,8 @@ from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import (
RadioFrancePodcastEpisodeIE,
- RadioFrancePodcastPlaylistIE
+ RadioFrancePodcastPlaylistIE,
+ RadioFranceWebradioIE,
)
from .rai import (
RaiPlayIE,
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
index 7a8eeb327..c495f464d 100644
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@@ -17,67 +17,69 @@ from ..utils import (
class RadioFranceBaseIE(InfoExtractor):
_BASE_URL = r'https://www.radiofrance.fr/'
- def extract_api_data(self, id, html):
-        pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/path[^>]*>(?P<json>.*?)</script>'
+ def extract_api_data(self, api_path, id, html):
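+        # api_path selects the embedded API payload to read: 'path' for podcast and playlist pages, 'stations' for webradio pages.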
+        pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*?)</script>' % api_path
json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
+
if not json:
raise ExtractorError('%s: JSON data not found' % id)
try:
json = self._parse_json(json, id)
json = self._parse_json(json['body'], id)
- return json['content']
+
+ if api_path == 'path':
+ return json['content']
+ elif api_path == 'stations':
+ return json
+ else:
+            raise ExtractorError('Unsupported api_path: %s' % api_path)
except KeyError:
raise ExtractorError('%s: Invalid JSON' % id)
- def parse_api_data_info(self, api_data):
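+    # The helpers below read a field from the API payload and, when a webpage is
+    # given, fall back to scraping the rendered page for the same information.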
+ def get_title(self, api_data, webpage=None):
title = strip_or_none(api_data.get('title'))
+ if not title and webpage:
+ title = strip_or_none(get_element_by_attribute('h1', None, webpage, False)) or strip_or_none(self._og_search_title(webpage))
+ return title
+
+ def get_description(self, api_data, webpage=None):
description = strip_or_none(api_data.get('standFirst'))
- channel_id = strip_or_none(api_data.get('brand'))
- visual = api_data.get('visual')
- publication_time = api_data.get('publishedDate')
+ if not description and webpage:
+ description = strip_or_none(self._og_search_description(webpage))
+ return description
+
+ def get_thumbnail(self, api_data, webpage=None):
thumbnail = None
+ visual = api_data.get('visual')
if visual:
thumbnail = url_or_none(visual.get('src'))
+ if not thumbnail and webpage:
+ thumbnail = self._og_search_thumbnail(webpage)
+ return thumbnail
- return {
- 'title': title,
- 'description': description,
- 'channel_id': channel_id,
- 'thumbnail': thumbnail,
- 'timestamp': publication_time,
- }
+ def get_timestamp(self, api_data, webpage=None):
+ timestamp = api_data.get('publishedDate')
+ if not timestamp and webpage:
+            timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time'))
+ return timestamp
- def parse_html_info(self, webpage):
- title = strip_or_none(self._og_search_title(webpage)) or strip_or_none(get_element_by_attribute('h1', None, webpage, False))
- description = strip_or_none(self._og_search_description(webpage))
- thumbnail = self._og_search_thumbnail(webpage)
- channel_id = self._og_search_property('site_name', webpage, 'Station name', fatal=False)
- publication_time = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', ))
-
- return {
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'channel_id': channel_id,
- 'timestamp': publication_time
- }
+ def get_brand(self, api_data, webpage=None):
+ brand = strip_or_none(api_data.get('brand'))
+ if not brand and webpage:
+ brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False)
+ return brand
def extract_episode(self, episode_id, api_data):
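+        # Return (url, duration) of the first manifestation, or (None, None) when no audio file has been published yet.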
manifestations = api_data.get('manifestations')
if manifestations is None or len(manifestations) == 0:
- return None
+ return None, None
url = url_or_none(manifestations[0]['url'])
duration = int_or_none(manifestations[0].get('duration'))
- episode_info = {
- 'id': episode_id,
- 'url': url,
- 'duration': duration
- }
- return self.parse_api_data_info(api_data) | episode_info
+ return url, duration
- def extract_playlist_entries(self, url, playlist_id, api_data, direction):
+ def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
playlist_data = api_data['expressions']
entries = []
@@ -87,47 +89,44 @@ class RadioFranceBaseIE(InfoExtractor):
if episode_path is None:
self.report_warning('No path found for episode "%s"', item.get('title'))
continue
- episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + item.get('path'))
+ episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path)
if episode_id is None:
- self.report_warning('Could not parse id of episode from path: "%s"' % item.get('path'))
+ self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
continue
- entry = self.extract_episode(episode_id, item)
- if entry is None:
- msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
+ episode_url, duration = self.extract_episode(episode_id, item)
+ if episode_url is None:
self.to_screen('Episode "%s" is not available' % episode_path)
continue
+ entry = {
+ 'id': episode_id,
+ 'url': episode_url,
+ 'title': self.get_title(item),
+ 'description': self.get_description(item),
+ 'timestamp': self.get_timestamp(item),
+ 'thumbnail': self.get_thumbnail(item),
+ 'duration': duration,
+ }
entries.append(entry)
page_number = int_or_none(playlist_data.get('pageNumber'))
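+        # Follow the prev/next pagination links recursively so entries from every page of the playlist are collected.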
if page_number:
if direction in ['both', 'prev'] and playlist_data.get('prev') is not None:
- webpage, other_api_data = self.get_data(url, playlist_id, page=page_number - 1)
- entries = self.extract_playlist_entries(url, playlist_id, other_api_data, direction='prev') + entries
+ webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
+ entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
if direction in ['both', 'next'] and playlist_data.get('next') is not None:
- webpage, other_api_data = self.get_data(url, playlist_id, page=page_number + 1)
- entries = entries + self.extract_playlist_entries(url, playlist_id, other_api_data, direction='next')
+ webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
+ entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
return entries
- def extract_playlist(self, playlist_id, url, api_data):
- entries = self.extract_playlist_entries(url, playlist_id, api_data, direction='both')
- entries = list(filter(lambda e: e is not None, entries))
- entries.reverse()
- playlist_info = {
- '_type': 'playlist',
- 'id': playlist_id,
- 'entries': entries
- }
- return self.parse_api_data_info(api_data) | playlist_info
-
- def get_data(self, url, id, page=None):
+ def get_data(self, url, api_path, id, page=None):
query = {}
note = None
if page:
query['p'] = page
note = "Downloading page %i" % page
webpage = self._download_webpage(url, id, query=query, note=note)
- api_data = self.extract_api_data(id, webpage)
+ api_data = self.extract_api_data(api_path, id, webpage)
return webpage, api_data
@@ -172,14 +171,22 @@ class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
def _real_extract(self, url):
id = self._match_id(url)
- webpage, api_data = self.get_data(url, id)
- api_data_info = self.extract_episode(id, api_data)
- if api_data_info is None:
+ webpage, api_data = self.get_data(url, 'path', id)
+        media_url, duration = self.extract_episode(id, api_data)
+        if media_url is None:
msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
raise ExtractorError(msg, expected=True, video_id=id)
- html_info = self.parse_html_info(webpage)
- return html_info | api_data_info
+ return {
+ 'id': id,
+            'url': media_url,
+ 'title': self.get_title(api_data, webpage),
+ 'description': self.get_description(api_data, webpage),
+ 'timestamp': self.get_timestamp(api_data, webpage),
+ 'thumbnail': self.get_thumbnail(api_data, webpage),
+ 'channel_id': self.get_brand(api_data, webpage),
+ 'duration': duration,
+ }
class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
@@ -199,7 +206,82 @@ class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
def _real_extract(self, url):
id = self._match_id(url)
- webpage, api_data = self.get_data(url, id)
+ webpage, api_data = self.get_data(url, 'path', id)
- html_info = self.parse_html_info(webpage)
- return html_info | self.extract_playlist(id, url, api_data)
+ entries = self.get_playlist_entries(url, id, api_data, direction='both')
+ entries.reverse()
+
+ return {
+ 'id': id,
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': self.get_title(api_data, webpage),
+ 'description': self.get_description(api_data, webpage),
+ 'timestamp': self.get_timestamp(api_data, webpage),
+ 'thumbnail': self.get_thumbnail(api_data, webpage),
+ 'channel_id': self.get_brand(api_data, webpage),
+ }
+
+
+class RadioFranceWebradioIE(RadioFranceBaseIE):
+    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
+
+ _TESTS = [{
+ 'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
+ 'url': 'https://www.radiofrance.fr/fip/radio-metal',
+ 'info_dict': {
+ 'id': 'radio-metal',
+ 'ext': 'aac',
+ 'title': str,
+ },
+ 'params': {
+ 'format': 'aac',
+ 'skip_download': True,
+ }
+ }]
+
+ def get_livestream_formats(self, id, api_data):
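+        # Build format dicts from the station's live sources: direct mp3/aac streams and an HLS manifest.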
+ sources = api_data['media']['sources']
+
+ formats = []
+ for source in sources:
+ url = source.get('url')
+ if not url:
+ continue
+
+ format_id = source.get('format')
+ format = {
+ 'url': url,
+ 'format_id': format_id,
+ 'asr': 48000,
+ 'vcodec': 'none'
+ }
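+            # Prefer the direct AAC stream, then MP3; HLS is kept as a fallback.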
+ if format_id == 'mp3':
+ format['preference'] = 1
+ format['acodec'] = 'mp3'
+ format['abr'] = source.get('bitrate')
+ elif format_id == 'aac':
+ format['preference'] = 2
+ format['acodec'] = 'aac'
+ format['abr'] = source.get('bitrate')
+ elif format_id == 'hls':
+ format['preference'] = 0
+ format['manifest_url'] = url
+ formats.append(format)
+
+ if len(formats) == 0:
+ raise ExtractorError('No live streaming URL found')
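+        # Let youtube-dl order the formats according to the preferences set above.
+        self._sort_formats(formats)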
+ return formats
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage, api_data = self.get_data(url, 'stations', id)
+
+ return {
+ 'id': id,
+ 'title': self.get_title(api_data, webpage),
+ 'formats': self.get_livestream_formats(id, api_data),
+ 'thumbnail': self.get_thumbnail(api_data, webpage),
+ 'channel_id': self.get_brand(api_data, webpage),
+ 'is_live': True
+ }