[ardaudiothek] Add an extractor for search results
parent cd2c7ab40e
commit d7c2b5dac8
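Add ARDAudiothekSearchIE, which turns an ARD Audiothek search-results page
(https://www.ardaudiothek.de/suche?q=...) into a playlist of episodes. To
support it, ARDAudiothekPlaylistIE._extract_episodes is generalized:
subclasses now supply the API URL template, the page-string format and the
default page size, and the pagination loop is made robust against flaky API
responses by skipping empty or faulty pages, halving the page size after
failures and de-duplicating entries instead of trusting the episode count
reported by the API.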
ardaudiothek.py
@@ -3,6 +3,14 @@ from __future__ import unicode_literals
 
 import re
 
+try:
+    from urllib.parse import unquote as _unquote_compat
+except ImportError:
+    from urllib import unquote
+
+    def _unquote_compat(str):
+        return unquote(str.encode('utf-8')).decode('utf-8')
+
 from .common import InfoExtractor
 from ..utils import (
     compat_str,
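The compat shim above gives Python 2 and 3 a uniform percent-decoding helper.
As an illustration only (not part of the commit), both branches behave like
this:

    # Python 3: urllib.parse.unquote is used directly.
    # Python 2 fallback: urllib.unquote expects byte strings, hence the
    # encode/decode round-trip in the wrapper.
    _unquote_compat('Angela%20Merkel')  # -> 'Angela Merkel'
    _unquote_compat('S%C3%BCden')       # -> 'Süden'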
@@ -76,7 +84,7 @@ class ARDAudiothekBaseIE(InfoExtractor):
             lambda x: x['guid'],
         ], compat_str)
         if not res['url']:
-            raise ExtractorError(msg='Could not find any downloads',
+            raise ExtractorError(msg='Could not find a URL to download',
                                  expected=True)
 
         res['format_note'] = try_get(
@@ -228,55 +236,132 @@ class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
         'playlist_mincount': 5,
     }]
 
-    def _extract_episodes(self, podcast_id, n_entries):
-        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
-        # The website calls the API with items_per_page set to 24. Setting it
-        # to 500 or 1000 would download the data of all episodes in one or two
-        # pages. Increasing this value might however trigger server errors in
-        # the future. So to avoid any problems we will keep using the default
-        # value and just download a few more pages.
-        items_per_page = 24
+    def _get_page_str(self, page):
+        # The API sometimes returns 404s for page=1. So only add that
+        # parameter if we actually are past the first page
+        return '&page=' + compat_str(page) if page > 1 else ''
+
+    def _get_episode_from_array_entry(self, array_entry):
+        # The array entry already is an 'episode' dict.
+        return array_entry
+
+    def _extract_episodes(
+            self, display_id, api_url_template, default_items_per_page):
+        """
+        Extract episodes by calling a web API end point.
+
+        Sometimes the server does not respond properly when requesting a page.
+        This also happens on the website. It sometimes hangs when trying to
+        load more search results, for instance. Thus the number of entries
+        reported by the API is often wrong and we do not solely rely on that
+        number to stop reading episodes.
+
+        This function handles paginated content in a robust way by skipping
+        over faulty server responses. In this case it reduces the page size to
+        get as many episodes as possible. It also removes duplicate entries
+        from the result.
+
+        Args:
+            display_id: Only used for user feedback.
+            api_url_template: This is the URL of the API to download JSON data
+                from. It is a format string expected to have the following
+                fields:
+                - {items_per_page}
+                - {page_str}
+            default_items_per_page: The number of items to fetch per page.
+                It is best to set this to the same value that is used by the
+                website when accessing the API. This function automatically
+                reduces the number of items per page when the server responds
+                with errors or missing data.
+
+        Returns:
+            A list of extracted episode dicts to be used as playlist entries.
+
+        Raises:
+            ExtractorError: Might be raised when extracting episode data.
+
+        """
+        items_per_page = default_items_per_page
         page = 1
 
-        api_url_template = 'https://www.ardaudiothek.de/api/podcasts/{}/episodes?items_per_page={}{}'
         entries = []
-        while True:
-            # The API sometimes returns 404s for page=1. So only add that
-            # parameter if we actually have paginated content.
-            page_str = '&page=' + compat_str(page) if page > 1 else ''
-            api_url = api_url_template.format(podcast_id,
-                                              items_per_page,
-                                              page_str)
-            result_data = self._download_json(api_url, podcast_id, fatal=False)
+
+        # The number of entries as reported by the API
+        n_entries = None
+
+        # The API sometimes returns an empty page without any episodes. In this
+        # case the next page often has episodes. This, however, throws off
+        # the total number of entries and it no longer becomes a reliable
+        # stopping condition when comparing it with the number of entries
+        # reported by the API. So we deal with this by not stopping at the
+        # first occurrence of an empty page. We skip over a certain number of
+        # empty pages before giving up.
+        max_n_skipped_pages = default_items_per_page + 3
+        n_skipped_pages = 0
+
+        while True:
+            # We need this to check if we actually added any entries
+            n_entries_before_this_page = len(entries)
+
+            # Fetch data
+            api_url = api_url_template.format(
+                page_str=self._get_page_str(page),
+                items_per_page=items_per_page)
+            result_data = self._download_json(api_url, display_id, fatal=False)
             episodes = try_get(result_data,
                                lambda x: x['result']['episodes'],
                                list)
-            if episodes is None:
+
+            # Add entries
+            for episode in episodes or []:
+                entry = self._extract_episode(
+                    self._get_episode_from_array_entry(episode))
+                if entry not in entries:
+                    entries.append(entry)
+
+            # Fetch how many episodes the API says it has (it's enough to
+            # read it once)
+            n_entries = n_entries if n_entries is not None else try_get(
+                result_data,
+                lambda x: x['result']['meta']['episodes']['total'],
+                int)
+
+            # Check if we have read the reported number of episodes
+            if n_entries is not None and len(entries) >= n_entries:
                 break
 
-            for episode in episodes:
-                entries.append(self._extract_episode(episode))
+            # Check if we actually added any entries
+            if n_entries_before_this_page == len(entries):
+                # This was an empty page so we have to skip it
+                n_skipped_pages += 1
+                if n_skipped_pages >= max_n_skipped_pages:
+                    # Enough skipping, give up
+                    break
 
-            # Check if we're done
-            if len(entries) >= n_entries:
-                break
+                # Throttle by reading only half as many entries as before
+                if items_per_page > 1:
+                    new_items_per_page = int(max(1, items_per_page / 2))
+                    page = int((page - 1) * items_per_page /
+                               new_items_per_page)
+                    items_per_page = new_items_per_page
+            else:
+                # This page had episodes, so we're no longer skipping
+                n_skipped_pages = 0
 
-            # Sanity check, just in case
-            meta_total = try_get(result_data,
-                                 lambda x:
-                                 x['result']['meta']['episodes']['total'],
-                                 (int, float))
-            meta_pages = try_get(result_data,
-                                 lambda x:
-                                 x['result']['meta']['episodes']['pages'],
-                                 (int, float))
-            if not meta_total or not meta_pages:
-                break
+                # Try to go back to full speed by going back to the default
+                # items_per_page value if possible.
+                if items_per_page * page % default_items_per_page == 0:
+                    page = int(page * items_per_page /
+                               default_items_per_page)
+                    items_per_page = default_items_per_page
 
             page += 1
 
+        # Tell the user if we received fewer entries than the API reported
+        if n_entries is not None and len(entries) < n_entries:
+            self.to_screen('Received {} of {} reported episodes'.format(
+                len(entries), n_entries))
+
         return entries
 
     def _real_extract(self, url):
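To make the throttling step above concrete, here is a small worked example
(an illustration with assumed numbers, not part of the commit): suppose the
podcast API's default page size of 24 is in use and the request for page 2
fails.

    items_per_page = 24
    page = 2  # page 1 succeeded, so 24 items have been consumed

    # Halve the page size and recompute the cursor: (2 - 1) * 24 / 12 = 2
    # smaller pages cover the items already read ...
    new_items_per_page = int(max(1, items_per_page / 2))          # 12
    page = int((page - 1) * items_per_page / new_items_per_page)  # 2
    items_per_page = new_items_per_page

    # ... and the loop's closing "page += 1" retries with page 3 of
    # size 12, i.e. items 25-36 instead of the failed 24-item request.
    page += 1

Once pages of the smaller size succeed and items_per_page * page is again a
multiple of the default, the loop restores the default page size by the same
arithmetic in reverse.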
@@ -290,16 +375,65 @@ class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
             raise ExtractorError(msg="Could not find any playlist data",
                                  expected=True)
 
-        n_entries = try_get(pc_data,
-                            lambda x: x['number_of_elements'],
-                            (int, float))
-
         res = self._extract_id_title_desc(pc_data)
         res['_type'] = 'playlist'
-        res['entries'] = self._extract_episodes(podcast_id, n_entries)
 
-        if n_entries > len(res['entries']):
-            self.to_screen('Only received {} of {} reported episode IDs'
-                           .format(len(res['entries']), n_entries))
+        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
+        # The website calls the API with items_per_page set to 24. Setting it
+        # to 500 or 1000 would download the data of all episodes in one or two
+        # pages. Increasing this value might however trigger server errors in
+        # the future. So to avoid any problems we will keep using the default
+        # value and just download a few more pages.
+        res['entries'] = self._extract_episodes(
+            podcast_id,
+            'https://www.ardaudiothek.de/api/podcasts/%s/episodes?items_per_page={items_per_page}{page_str}' % podcast_id,
+            24)
 
         return res
+
+
+class ARDAudiothekSearchIE(ARDAudiothekPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/suche\?(?:(?!q=).*&)?q=(?P<id>[^&]+)(?:&.*)?'
+    _TESTS = [{
+        'url': 'https://www.ardaudiothek.de/suche?q=Sommer',
+        'info_dict': {
+            'id': 'Sommer',
+            'title': 'Sommer',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.ardaudiothek.de/suche?q=Angela%20Merkel',
+        'info_dict': {
+            'id': 'Angela%20Merkel',
+            'title': 'Angela Merkel',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }]
+
+    def _get_page_str(self, page):
+        # The search API always works with a page number
+        return '&page=' + compat_str(page)
+
+    def _get_episode_from_array_entry(self, array_entry):
+        # The array entry is a dict with an 'episode' and a 'search_meta' entry
+        return try_get(array_entry, lambda x: x['episode'], dict)
+
+    def _real_extract(self, url):
+        search_str = self._match_id(url)
+        display_str = _unquote_compat(search_str)
+
+        return {
+            '_type': 'playlist',
+            'id': search_str,
+            'display_id': display_str,
+            'title': display_str,
+            'description': 'ARD Audiothek-Suche nach "' + display_str + '"',
+            # Searching on the website calls the API with items_per_page set
+            # to 8. Other values sometimes cause server errors.
+            'entries': self._extract_episodes(
+                display_str,
+                'https://www.ardaudiothek.de/api/search/%s?focus=episodes{page_str}&items_per_page={items_per_page}' % search_str,
+                8),
+        }
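As a quick usage sketch (assuming a youtube-dl build that includes this
commit; the URL is one of the test URLs above), the new search extractor can
be exercised through the embedding API:

    import youtube_dl

    # Resolve the search-results playlist without downloading any audio.
    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        info = ydl.extract_info(
            'https://www.ardaudiothek.de/suche?q=Sommer', download=False)
        print(info['title'], len(info['entries']))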
extractors.py
@@ -60,6 +60,7 @@ from .ard import (
 from .ardaudiothek import (
     ARDAudiothekIE,
     ARDAudiothekPlaylistIE,
+    ARDAudiothekSearchIE,
 )
 from .arte import (
     ArteTVPlus7IE,