From cf4a829c138c100000086cb55c2b772fe2db47ac Mon Sep 17 00:00:00 2001
From: Daenges
Date: Mon, 23 May 2022 19:21:56 +0200
Subject: [PATCH] Implement _search_json_ld()

---
 youtube_dl/extractor/megacartoons.py | 85 ++++++++++++++++++++++------
 1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index 80c17c100..ea6161b7b 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -2,7 +2,16 @@
 from __future__ import unicode_literals
 
 import json
-from ..utils import url_or_none
+import re
+
+from ..utils import (
+    bug_reports_message,
+    JSON_LD_RE,
+    merge_dicts,
+    NO_DEFAULT,
+    RegexNotFoundError,
+    url_or_none,
+)
 
 from .common import InfoExtractor
 
@@ -14,39 +23,81 @@ class MegaCartoonsIE(InfoExtractor):
         'md5': '4ba9be574f9a17abe0c074e2f955fded',
         'info_dict': {
             'id': 'help-wanted',
-            'title': 'Help Wanted',
             'ext': 'mp4',
+            'title': 'Help Wanted - SpongeBob SquarePants',
+            'upload_date': '20200223',
+            'timestamp': 1582416000,
             'thumbnail': r're:^https?://.*\.jpg$',
             'description': 'md5:2c909daa6c6cb16b2d4d791dd1a31632'
         }
     }
 
+    # adapted from common.py pending yt-dlp back-port
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
+        json_ld_list = list(re.finditer(JSON_LD_RE, html))
+        default = kwargs.get('default', NO_DEFAULT)
+        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
+        json_ld = []
+        for mobj in json_ld_list:
+            json_ld_item = self._parse_json(
+                mobj.group('json_ld'), video_id, fatal=fatal)
+            if not json_ld_item:
+                continue
+            if isinstance(json_ld_item, dict):
+                json_ld.append(json_ld_item)
+            elif isinstance(json_ld_item, (list, tuple)):
+                json_ld.extend(json_ld_item)
+        if json_ld:
+            # handle initial '@graph' with one level of children
+            if len(json_ld) > 0 and '@graph' in json_ld[0] and '@context' in json_ld[0]:
+                # should always be hit here
+                context = json_ld[0]['@context']
+                json_ld_g = json_ld[0]['@graph'] or []
+                for item in json_ld_g:
+                    item.setdefault('@context', context)
+                json_ld = json_ld_g + json_ld[1:]
+            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+            if json_ld:
+                return json_ld
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            raise RegexNotFoundError('Unable to extract JSON-LD')
+        else:
+            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+            return {}
+
     def _real_extract(self, url):
         # ID is equal to the episode name
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        # Try to find a good title or fallback to the ID
-        title = self._og_search_title(webpage) or video_id
+        info = self._search_json_ld(webpage, video_id, fatal=False) or {}
 
-        # Video data is stored in a json -> extract it from the raw html
-        url_json = json.loads(self._html_search_regex(r'<div.*data-item=["/\'](?P<videourls>{.*})["/\'].*>', webpage, 'videourls'))
+        info.update({
+            'id': video_id,
+            # Try to find a good title or fallback to the ID
+            'title': info.get('title') or self._og_search_title(webpage) or video_id.replace('-', ' ').capitalize(),
+        })
 
-        video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage))    # Get the video url
-        video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage))    # Get the thumbnail
+        if 'url' not in info or 'thumbnail' not in info:
+            # Video data is stored in a json -> extract it from the raw html
+            url_json = json.loads(self._html_search_regex(r'<div.*data-item=["/\'](?P<videourls>{.*})["/\'].*>', webpage, 'videourls'))
+
+            video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage))    # Get the video url
+            video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage))    # Get the thumbnail
+            info = merge_dicts(info, {
+                'url': video_url,
+                'thumbnail': video_thumbnail,
+            })
         # Find the <article> class in the html
         article = self._search_regex(
            r'(?s)<article\b[^>]*?\bclass\s*=\s*[^>]*?\bpost\b[^>]*>(.+?)</article\b',
            webpage, 'post', default='')
 
         # The description is one of the text in the article -> save it
-        video_description = (self._html_search_regex(r'(?s)<p>\s*([^<]+)\s*</p>', article, 'videodescription', fatal=False)
-            or self._og_search_description(webpage))
+        info['description'] = (
+            self._html_search_regex(r'(?s)<p>\s*([^<]+)\s*</p>', article, 'videodescription', fatal=False)
+            or self._og_search_description(webpage))
 
-        return {
-            'id': video_id,
-            'title': title,
-            'url': video_url,
-            'thumbnail': video_thumbnail,
-            'description': video_description,
-        }
+        return info
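
Note (editor's illustration, not part of the patch): the heart of the back-ported
_search_json_ld() is the '@graph' flattening step. The standalone sketch below
reproduces that step on a hand-written JSON-LD sample (the payload and field
values are invented for illustration) so the behaviour can be checked outside
youtube-dl.

    import json

    sample = json.loads('''{
        "@context": "https://schema.org",
        "@graph": [
            {"@type": "WebPage", "name": "Help Wanted - SpongeBob SquarePants"},
            {"@type": "VideoObject", "uploadDate": "2020-02-23T00:00:00+00:00"}
        ]
    }''')

    json_ld = [sample]
    # Same rule as the patch: lift the '@graph' children to the top level and
    # give each one the parent '@context' so they parse as standalone objects.
    if json_ld and '@graph' in json_ld[0] and '@context' in json_ld[0]:
        context = json_ld[0]['@context']
        children = json_ld[0]['@graph'] or []
        for item in children:
            item.setdefault('@context', context)
        json_ld = children + json_ld[1:]

    for item in json_ld:
        print(item['@type'], item['@context'])
    # WebPage https://schema.org
    # VideoObject https://schema.org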
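
Note (editor's illustration, not part of the patch): the fallback block only runs
when JSON-LD left 'url' or 'thumbnail' unset, and it combines the results with
merge_dicts() from youtube_dl.utils, which keeps the value of the earlier dict
when a key is already filled. The sketch below shows that precedence with
invented URLs; it assumes a youtube-dl checkout on the import path.

    from youtube_dl.utils import merge_dicts

    info = {'id': 'help-wanted',
            'thumbnail': 'https://example.invalid/from-json-ld.jpg'}
    scraped = {'url': 'https://example.invalid/video.mp4',
               'thumbnail': 'https://example.invalid/from-html.jpg'}

    merged = merge_dicts(info, scraped)
    # The JSON-LD thumbnail wins; only the missing 'url' is filled in.
    print(merged['thumbnail'])  # https://example.invalid/from-json-ld.jpg
    print(merged['url'])        # https://example.invalid/video.mp4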