From 34c3b064028aec3e3d70801a6fe069ab4205f8ae Mon Sep 17 00:00:00 2001
From: Zenon Mousmoulas
Date: Wed, 10 Nov 2021 07:42:10 +0200
Subject: [PATCH] Add MegaTVCom IEs

* Add new IEs
  * MegaTVComBaseIE: Base IE class
  * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all sorts
    of pages showing videos on megatv.com
  * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos
* When video_id is not matched in the URL, namely for news articles, extract
  it (article_id) from a particular element on the web page
* Derive metadata and sources directly from the web page, from data
  attributes of the player placeholder element and other commonly used
  elements
* Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the metadata
  on the embeddable page are sometimes slightly different, for the same
  video
---
 youtube_dl/extractor/extractors.py |   4 +
 youtube_dl/extractor/generic.py    |   7 ++
 youtube_dl/extractor/megatvcom.py  | 226 +++++++++++++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 youtube_dl/extractor/megatvcom.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6e8fc3961..c50c09160 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1038,6 +1038,10 @@ from .rutube import (
     RutubePersonIE,
     RutubePlaylistIE,
 )
+from .megatvcom import (
+    MegaTVComIE,
+    MegaTVComEmbedIE,
+)
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index a9c064105..aa35d6928 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -100,6 +100,7 @@ from .ustream import UstreamIE
 from .arte import ArteTVEmbedIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
+from .megatvcom import MegaTVComEmbedIE
 from .limelight import LimelightBaseIE
 from .anvato import AnvatoIE
 from .washingtonpost import WashingtonPostIE
@@ -3199,6 +3200,12 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
 
+        # Look for megatv.com embeds
+        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url))
+        if megatvcom_urls:
+            return self.playlist_from_matches(
+                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
+
         # Look for WashingtonPost embeds
         wapo_urls = WashingtonPostIE._extract_urls(webpage)
         if wapo_urls:
diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py
new file mode 100644
index 000000000..119ae5d57
--- /dev/null
+++ b/youtube_dl/extractor/megatvcom.py
@@ -0,0 +1,226 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+    determine_ext,
+    get_element_by_class,
+    unified_timestamp,
+    extract_attributes,
+    clean_html,
+    unescapeHTML,
+)
+
+
+class MegaTVComBaseIE(InfoExtractor):
+    """Shared helpers for the megatv.com extractors."""
+
+    # Value of the id attribute that marks the player placeholder <div>
+    _PLAYER_DIV_ID = 'player_div_id'
+
+    def _extract_player_attrs(self, webpage):
+        """Return the attributes of the player placeholder <div> as a dict.
+
+        The ``id`` attribute is dropped and a leading ``data-`` or
+        ``data-kwik_`` is stripped from the remaining attribute names.
+
+        Raises ExtractorError if the placeholder element is not found.
+        """
+        # Matches any <div ...> start tag; the named group is only set when
+        # the tag carries id="<_PLAYER_DIV_ID>"
+        PLAYER_DIV_RE = r'''(?x)
+        <div(?:
+            id=(?P<_q1>[\"\'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)|
+            [^>]*?
+        )+>
+        ''' % {'pdi': self._PLAYER_DIV_ID}
+        for mobj in re.finditer(PLAYER_DIV_RE, webpage):
+            if mobj.group(self._PLAYER_DIV_ID):
+                player_el = mobj.group(0)
+                break
+        else:
+            raise ExtractorError('no <div id="%s"> element found in webpage' %
+                                 self._PLAYER_DIV_ID)
+        return {
+            re.sub(r'^data-(?:kwik_)?', '', k): v
+            for k, v in extract_attributes(player_el).items()
+            if k not in ('id',)
+        }
+
+
+class MegaTVComIE(MegaTVComBaseIE):
+    """Extractor for megatv.com TV VOD pages and news articles."""
+
+    IE_NAME = 'megatvcom'
+    IE_DESC = 'megatv.com videos'
+    # The id group only participates for /<section>/<id>/<slug> URLs; dated
+    # article URLs (/YYYY/MM/DD/<slug>) carry no id
+    _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P<id>\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)'
+
+    _TESTS = [{
+        'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
+        'md5': '2ebe96661cb81854889053cebb661068',
+        'info_dict': {
+            'id': '520979',
+            'ext': 'mp4',
+            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+            'timestamp': 1634975747,
+            'upload_date': '20211023',
+        },
+    }, {
+        'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
+        'md5': '8ab0c9d664cea11678670202b87bb2b1',
+        'info_dict': {
+            'id': '527800',
+            'ext': 'mp4',
+            'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
+            'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
+            'timestamp': 1636048859,
+            'upload_date': '20211104',
+        },
+    }]
+
+    def _match_article_id(self, webpage):
+        """Return the id from the first <article id="Article_<digits>"> tag.
+
+        Raises ExtractorError if no such element is found.
+        """
+        ART_RE = r'''(?x)
+        <article(?:
+            id=(?P<_q2>[\"\'])Article_(?P<article>\d+)(?P=_q2)|
+            [^>]*?
+        )+>
+        '''
+        for mobj in re.finditer(ART_RE, webpage):
+            # <article> tags without a matching id leave the group unset
+            article_id = mobj.group('article')
+            if article_id:
+                return compat_str(article_id)
+        raise ExtractorError(
+            'no <article id="Article_..."> element found in webpage')
+
+    def _real_extract(self, url):
+        """Extract metadata and formats from a VOD page or news article."""
+        # Test the id group for None directly rather than relying on how
+        # _match_id() would stringify an unmatched group
+        video_id = re.match(self._VALID_URL, url).group('id')
+        is_article = video_id is None
+        webpage = self._download_webpage(
+            url, 'N/A' if is_article else video_id)
+        if is_article:
+            video_id = self._match_article_id(webpage)
+        player_attrs = self._extract_player_attrs(webpage)
+        title = player_attrs.get('label') or self._og_search_title(webpage)
+        description = clean_html(get_element_by_class(
+            'article-wrapper' if is_article else 'story_content',
+            webpage))
+        if not description:
+            description = self._og_search_description(webpage)
+        thumbnail = player_attrs.get('image') or \
+            self._og_search_thumbnail(webpage)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage))
+        source = player_attrs.get('source')
+        if not source:
+            raise ExtractorError('no source found for %s' % video_id)
+        if determine_ext(source) == 'm3u8':
+            formats = self._extract_m3u8_formats(source, video_id, 'mp4')
+        else:
+            # format entries must be dicts, not bare URL strings
+            formats = [{'url': source}]
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class MegaTVComEmbedIE(MegaTVComBaseIE):
+    """Extractor for iframe-embeddable megatv.com videos."""
+
+    IE_NAME = 'megatvcom:embed'
+    IE_DESC = 'megatv.com embedded videos'
+    _VALID_URL = r'https?://(?:www\.)?megatv\.com/embed/?\?p=\d+'
+
+    _TESTS = [{
+        'url': 'https://www.megatv.com/embed/?p=2020520979',
+        'md5': '2ebe96661cb81854889053cebb661068',
+        'info_dict': {
+            'id': '520979',
+            'ext': 'mp4',
+            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
+            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
+            'timestamp': 1634975747,
+            'upload_date': '20211023',
+        },
+    }, {
+        'url': 'https://www.megatv.com/embed/?p=2020534081',
+        'md5': 'f9a15e315acbf01b128e8efa3f75aab3',
+        'info_dict': {
+            'id': '534081',
+            'ext': 'mp4',
+            'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
+            'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
+            'timestamp': 1636376351,
+            'upload_date': '20211108',
+        },
+    }]
+
+    @classmethod
+    def _extract_urls(cls, webpage, origin_url=None):
+        """Yield the URLs of megatv.com embed iframes found in webpage.
+
+        Scheme-relative URLs (//www.megatv.com/...) are completed with the
+        scheme of origin_url, falling back to https.
+        """
+        # make the scheme in _VALID_URL optional
+        _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
+        EMBED_RE = r'''(?x)
+            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)
+                (?P<url>%(url_re)s)(?P=_q1)
+        ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE}
+        for mobj in re.finditer(EMBED_RE, webpage):
+            url = unescapeHTML(mobj.group('url'))
+            if url.startswith('//'):
+                scheme = origin_url and \
+                    compat_urllib_parse_urlparse(origin_url).scheme
+                url = '%s:%s' % (scheme or 'https', url)
+            yield url
+
+    def _real_extract(self, url):
+        """Resolve the embed page's canonical URL and defer to MegaTVComIE."""
+        webpage = self._download_webpage(url, 'N/A')
+        player_attrs = self._extract_player_attrs(webpage)
+        canonical_url = player_attrs.get('share_url')
+        if not canonical_url:
+            raise ExtractorError('no share_url found in player attributes')
+        video_id = compat_parse_qs(compat_urllib_parse_urlparse(
+            canonical_url).query)['p'][0]
+
+        # Resolve the canonical URL, following redirects, and defer to
+        # megatvcom, as the metadata extracted from the embeddable page
+        # sometimes are slightly different, for the same video
+        canonical_url = self._request_webpage(
+            HEADRequest(canonical_url), video_id,
+            note='Resolve canonical URL',
+            errnote='Could not resolve canonical URL').geturl()
+        return self.url_result(
+            canonical_url,
+            MegaTVComIE.ie_key(),
+            video_id
+        )