[Mixcloud] Harmonize ID generation from lists with full ID generation (#27787)

Mixcloud IDs are generated as `username_slug` once the full ID dict has been downloaded. When downloading a list (e.g. uploads, favorites, ...), however, the temporary ID is just the `slug`. As a result, features such as the download archive file had to download the stream metadata before a download could be rejected as already downloaded. This commit attempts to get the uploader username during the GraphQL query, so that the temporary IDs are generated in the same `username_slug` form.
[cspan] improve info extraction (closes #27791)
2021-01-13 09:22:48 +00:00 · 2021-01-13 09:17:43 +01:00
2 changed files with 35 additions and 3 deletions
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@ -8,11 +8,14 @@ from ..utils import (
    ExtractorError,
    extract_attributes,
    find_xpath_attr,
+    get_element_by_attribute,
    get_element_by_class,
    int_or_none,
    js_to_json,
    merge_dicts,
+    parse_iso8601,
    smuggle_url,
+    str_to_int,
    unescapeHTML,
 )
 from .senateisvp import SenateISVPIE
@ -116,8 +119,30 @@ class CSpanIE(InfoExtractor):
                jwsetup, video_id, require_title=False, m3u8_id='hls',
                base_url=url)
            add_referer(info['formats'])
+            for subtitles in info['subtitles'].values():
+                for subtitle in subtitles:
+                    ext = determine_ext(subtitle['url'])
+                    if ext == 'php':
+                        ext = 'vtt'
+                    subtitle['ext'] = ext
            ld_info = self._search_json_ld(webpage, video_id, default={})
-            return merge_dicts(info, ld_info)
+            title = get_element_by_class('video-page-title', webpage) or \
+                self._og_search_title(webpage)
+            description = get_element_by_attribute('itemprop', 'description', webpage) or \
+                self._html_search_meta(['og:description', 'description'], webpage)
+            return merge_dicts(info, ld_info, {
+                'title': title,
+                'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),
+                'description': description,
+                'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),
+                'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),
+                'duration': int_or_none(self._search_regex(
+                    r'jwsetup\.seclength\s*=\s*(\d+);',
+                    webpage, 'duration', fatal=False)),
+                'view_count': str_to_int(self._search_regex(
+                    r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",
+                    webpage, 'views', fatal=False)),
+            })

        # Obsolete
        # We first look for clipid, because clipprog always appears before
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@ -251,8 +251,13 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE):
                cloudcast_url = cloudcast.get('url')
                if not cloudcast_url:
                    continue
+                video_id = cloudcast.get('slug')
+                if video_id:
+                    owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
+                    if owner_username:
+                        video_id = '%s_%s' % (owner_username, video_id)
                entries.append(self.url_result(
-                    cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug')))
+                    cloudcast_url, MixcloudIE.ie_key(), video_id))

            page_info = items['pageInfo']
            has_next_page = page_info['hasNextPage']
@ -321,7 +326,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
    _DESCRIPTION_KEY = 'biog'
    _ROOT_TYPE = 'user'
    _NODE_TEMPLATE = '''slug
-          url'''
+          url
+          owner { username }'''

    def _get_playlist_title(self, title, slug):
        return '%s (%s)' % (title, slug)
@ -345,6 +351,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
    _NODE_TEMPLATE = '''cloudcast {
            slug
            url
+            owner { username }
          }'''

    def _get_cloudcast(self, node):