Compare commits

...

2 Commits

Author SHA1 Message Date
Aarni Koskela
7c2d18a13f
[Mixcloud] Harmonize ID generation from lists with full ID generation (#27787)
Mixcloud IDs are generated as `username_slug` when the full ID dict has been
downloaded.  When downloading a list (e.g. uploads, favorites, ...), the temporary
ID is just the `slug`.  Because the two forms differ, archive file usage, for
example, required downloading the stream metadata before a download could be
rejected as already downloaded.

This commit attempts to get the uploader username during the GraphQL query, so that
the temporary IDs are generated in the same `username_slug` form as the full IDs.
2021-01-13 09:22:48 +00:00
Remita Amine
2408e6d26a [cspan] improve info extraction (closes #27791) 2021-01-13 09:17:43 +01:00
2 changed files with 35 additions and 3 deletions

View File

@ -8,11 +8,14 @@ from ..utils import (
ExtractorError, ExtractorError,
extract_attributes, extract_attributes,
find_xpath_attr, find_xpath_attr,
get_element_by_attribute,
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
js_to_json, js_to_json,
merge_dicts, merge_dicts,
parse_iso8601,
smuggle_url, smuggle_url,
str_to_int,
unescapeHTML, unescapeHTML,
) )
from .senateisvp import SenateISVPIE from .senateisvp import SenateISVPIE
@ -116,8 +119,30 @@ class CSpanIE(InfoExtractor):
jwsetup, video_id, require_title=False, m3u8_id='hls', jwsetup, video_id, require_title=False, m3u8_id='hls',
base_url=url) base_url=url)
add_referer(info['formats']) add_referer(info['formats'])
for subtitles in info['subtitles'].values():
for subtitle in subtitles:
ext = determine_ext(subtitle['url'])
if ext == 'php':
ext = 'vtt'
subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={}) ld_info = self._search_json_ld(webpage, video_id, default={})
return merge_dicts(info, ld_info) title = get_element_by_class('video-page-title', webpage) or \
self._og_search_title(webpage)
description = get_element_by_attribute('itemprop', 'description', webpage) or \
self._html_search_meta(['og:description', 'description'], webpage)
return merge_dicts(info, ld_info, {
'title': title,
'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),
'description': description,
'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),
'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),
'duration': int_or_none(self._search_regex(
r'jwsetup\.seclength\s*=\s*(\d+);',
webpage, 'duration', fatal=False)),
'view_count': str_to_int(self._search_regex(
r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",
webpage, 'views', fatal=False)),
})
# Obsolete # Obsolete
# We first look for clipid, because clipprog always appears before # We first look for clipid, because clipprog always appears before

View File

@ -251,8 +251,13 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE):
cloudcast_url = cloudcast.get('url') cloudcast_url = cloudcast.get('url')
if not cloudcast_url: if not cloudcast_url:
continue continue
video_id = cloudcast.get('slug')
if video_id:
owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
if owner_username:
video_id = '%s_%s' % (owner_username, video_id)
entries.append(self.url_result( entries.append(self.url_result(
cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) cloudcast_url, MixcloudIE.ie_key(), video_id))
page_info = items['pageInfo'] page_info = items['pageInfo']
has_next_page = page_info['hasNextPage'] has_next_page = page_info['hasNextPage']
@ -321,7 +326,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
_DESCRIPTION_KEY = 'biog' _DESCRIPTION_KEY = 'biog'
_ROOT_TYPE = 'user' _ROOT_TYPE = 'user'
_NODE_TEMPLATE = '''slug _NODE_TEMPLATE = '''slug
url''' url
owner { username }'''
def _get_playlist_title(self, title, slug): def _get_playlist_title(self, title, slug):
return '%s (%s)' % (title, slug) return '%s (%s)' % (title, slug)
@ -345,6 +351,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
_NODE_TEMPLATE = '''cloudcast { _NODE_TEMPLATE = '''cloudcast {
slug slug
url url
owner { username }
}''' }'''
def _get_cloudcast(self, node): def _get_cloudcast(self, node):