From 34c3b064028aec3e3d70801a6fe069ab4205f8ae Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Wed, 10 Nov 2021 07:42:10 +0200 Subject: [PATCH 1/4] Add MegaTVCom IEs * Add new IEs * MegaTVComBaseIE: Base IE class * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all sorts of pages showing videos on megatv.com * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos * When video_id is not matched in the URL, namely for news articles, extract it (article_id) from a particular element on the web page * Derive metadata and sources directly from the web page, from data attributes of the player placeholder element and other commonly used elements * Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the metadata on the embeddable page are some times slightly different, for the same video --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/generic.py | 7 ++ youtube_dl/extractor/megatvcom.py | 186 +++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 youtube_dl/extractor/megatvcom.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6e8fc3961..c50c09160 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1038,6 +1038,10 @@ from .rutube import ( RutubePersonIE, RutubePlaylistIE, ) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) from .rutv import RUTVIE from .ruutu import RuutuIE from .ruv import RuvIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a9c064105..aa35d6928 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -100,6 +100,7 @@ from .ustream import UstreamIE from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE +from .megatvcom import MegaTVComEmbedIE from .limelight import LimelightBaseIE from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE @@ -3199,6 +3200,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) + # Look for megatv.com embeds + megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url)) + if megatvcom_urls: + return self.playlist_from_matches( + megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) + # Look for WashingtonPost embeds wapo_urls = WashingtonPostIE._extract_urls(webpage) if wapo_urls: diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py new file mode 100644 index 000000000..119ae5d57 --- /dev/null +++ b/youtube_dl/extractor/megatvcom.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + HEADRequest, + ExtractorError, + determine_ext, + get_element_by_class, + unified_timestamp, + extract_attributes, + clean_html, + unescapeHTML, +) + + +class MegaTVComBaseIE(InfoExtractor): + _PLAYER_DIV_ID = 'player_div_id' + + def _extract_player_attrs(self, webpage): + PLAYER_DIV_RE = r'''(?x) + [\"\'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)| + [^>]*? + )+> + ''' % {'pdi': self._PLAYER_DIV_ID} + for mobj in re.finditer(PLAYER_DIV_RE, webpage): + if mobj.group(self._PLAYER_DIV_ID): + player_el = mobj.group(0) + break + else: + raise ExtractorError('no
element found in webpage' % + self._PLAYER_DIV_ID) + return { + re.sub(r'^data-(?:kwik_)?', '', k): v + for k, v in extract_attributes(player_el).items() + if k not in ('id',) + } + + +class MegaTVComIE(MegaTVComBaseIE): + IE_NAME = 'megatvcom' + IE_DESC = 'megatv.com videos' + _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)' + + _TESTS = [{ + 'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/', + 'md5': '2ebe96661cb81854889053cebb661068', + 'info_dict': { + 'id': '520979', + 'ext': 'mp4', + 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2', + 'description': 'md5:0209fa8d318128569c0d256a5c404db1', + 'timestamp': 1634975747, + 'upload_date': '20211023', + }, + }, { + 'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/', + 'md5': '8ab0c9d664cea11678670202b87bb2b1', + 'info_dict': { + 'id': '527800', + 'ext': 'mp4', + 'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157', + 'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df', + 'timestamp': 1636048859, + 'upload_date': '20211104', + }, + }] + + def _match_article_id(self, webpage): + ART_RE = r'''(?x) + [\"\'])Article_(?P
\d+)(?P=_q2)| + [^>]*? + )+> + ''' + return compat_str(self._search_regex(ART_RE, webpage, 'article_id', + group='article')) + + def _real_extract(self, url): + video_id = self._match_id(url) + _is_article = video_id == 'None' + webpage = self._download_webpage(url, + 'N/A' if _is_article else + video_id) + if _is_article: + video_id = self._match_article_id(webpage) + player_attrs = self._extract_player_attrs(webpage) + title = player_attrs.get('label') or self._og_search_title(webpage) + description = clean_html(get_element_by_class( + 'article-wrapper' if _is_article else 'story_content', + webpage)) + if not description: + description = self._og_search_description(webpage) + thumbnail = player_attrs.get('image') or \ + self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage)) + try: + source = player_attrs['source'] + except KeyError: + raise ExtractorError('no source found for %s' % video_id) + formats = self._extract_m3u8_formats(source, video_id, 'mp4') \ + if determine_ext(source) == 'm3u8' else [source] + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } + + +class MegaTVComEmbedIE(MegaTVComBaseIE): + IE_NAME = 'megatvcom:embed' + IE_DESC = 'megatv.com embedded videos' + _VALID_URL = r'https?://(?:www\.)?megatv.com/embed/?\?p=\d+' + + _TESTS = [{ + 'url': 'https://www.megatv.com/embed/?p=2020520979', + 'md5': '2ebe96661cb81854889053cebb661068', + 'info_dict': { + 'id': '520979', + 'ext': 'mp4', + 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2', + 'description': 'md5:0209fa8d318128569c0d256a5c404db1', + 'timestamp': 1634975747, + 'upload_date': '20211023', + }, + }, { + 'url': 'https://www.megatv.com/embed/?p=2020534081', + 'md5': 'f9a15e315acbf01b128e8efa3f75aab3', + 'info_dict': { + 'id': '534081', + 'ext': 'mp4', + 'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0', + 'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52', + 'timestamp': 1636376351, + 'upload_date': '20211108', + }, + }] + + @classmethod + def _extract_urls(cls, webpage, origin_url=None): + # make the scheme in _VALID_URL optional + _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1] + EMBED_RE = r'''(?x) + ]+?src=(?P<_q1>%(quot_re)s) + (?P%(url_re)s)(?P=_q1) + ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE} + for mobj in re.finditer(EMBED_RE, webpage): + if url.startswith('//'): + scheme = compat_urllib_parse_urlparse(origin_url).scheme \ + if origin_url else 'https' + url = '%s:%s' % (scheme, url) + yield unescapeHTML(mobj.group('url')) + + def _real_extract(self, url): + webpage = self._download_webpage(url, 'N/A') + player_attrs = self._extract_player_attrs(webpage) + canonical_url = player_attrs['share_url'] + video_id = compat_parse_qs(compat_urllib_parse_urlparse( + canonical_url).query)['p'][0] + + # Resolve the canonical URL, following redirects, and defer to + # megatvcom, as the metadata extracted from the embeddable page some + # times are slightly different, for the same video + canonical_url = self._request_webpage( + HEADRequest(canonical_url), video_id, + note='Resolve canonical URL', + errnote='Could not resolve canonical URL').geturl() + return self.url_result( + canonical_url, + MegaTVComIE.ie_key(), + video_id + ) From a5ec30e106eda4a4a39e671a3f0c1e82656d260e Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Sat, 13 Nov 2021 08:41:23 +0200 Subject: [PATCH 2/4] Address PR comments about escapes --- youtube_dl/extractor/megatvcom.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py index 119ae5d57..f3faeb78b 100644 --- a/youtube_dl/extractor/megatvcom.py +++ b/youtube_dl/extractor/megatvcom.py @@ -28,7 +28,7 @@ class MegaTVComBaseIE(InfoExtractor): def _extract_player_attrs(self, webpage): PLAYER_DIV_RE = r'''(?x) [\"\'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)| + id=(?P<_q1>["'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)| [^>]*? )+> ''' % {'pdi': self._PLAYER_DIV_ID} @@ -78,7 +78,7 @@ class MegaTVComIE(MegaTVComBaseIE): def _match_article_id(self, webpage): ART_RE = r'''(?x) [\"\'])Article_(?P
\d+)(?P=_q2)| + id=(?P<_q2>["'])Article_(?P
\d+)(?P=_q2)| [^>]*? )+> ''' @@ -124,7 +124,7 @@ class MegaTVComIE(MegaTVComBaseIE): class MegaTVComEmbedIE(MegaTVComBaseIE): IE_NAME = 'megatvcom:embed' IE_DESC = 'megatv.com embedded videos' - _VALID_URL = r'https?://(?:www\.)?megatv.com/embed/?\?p=\d+' + _VALID_URL = r'https?://(?:www\.)?megatv\.com/embed/?\?p=\d+' _TESTS = [{ 'url': 'https://www.megatv.com/embed/?p=2020520979', @@ -157,7 +157,7 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): EMBED_RE = r'''(?x) ]+?src=(?P<_q1>%(quot_re)s) (?P%(url_re)s)(?P=_q1) - ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE} + ''' % {'quot_re': r'["\']', 'url_re': _URL_RE} for mobj in re.finditer(EMBED_RE, webpage): if url.startswith('//'): scheme = compat_urllib_parse_urlparse(origin_url).scheme \ From 28fddc175880c6a21c9965a166f13fc239049d1e Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Sat, 13 Nov 2021 11:32:06 +0200 Subject: [PATCH 3/4] Fix copy/paste typo --- youtube_dl/extractor/megatvcom.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py index f3faeb78b..26fbcff4b 100644 --- a/youtube_dl/extractor/megatvcom.py +++ b/youtube_dl/extractor/megatvcom.py @@ -159,11 +159,12 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): (?P%(url_re)s)(?P=_q1) ''' % {'quot_re': r'["\']', 'url_re': _URL_RE} for mobj in re.finditer(EMBED_RE, webpage): + url = unescapeHTML(mobj.group('url')) if url.startswith('//'): scheme = compat_urllib_parse_urlparse(origin_url).scheme \ if origin_url else 'https' url = '%s:%s' % (scheme, url) - yield unescapeHTML(mobj.group('url')) + yield url def _real_extract(self, url): webpage = self._download_webpage(url, 'N/A') From 96a0ad4778da7f30ed5be627f2c10df6d0af3ca8 Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Sat, 13 Nov 2021 11:50:05 +0200 Subject: [PATCH 4/4] MegaTVComEmbedIE: Make canonical URL extraction more robust --- youtube_dl/extractor/megatvcom.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/megatvcom.py b/youtube_dl/extractor/megatvcom.py index 26fbcff4b..46db816d8 100644 --- a/youtube_dl/extractor/megatvcom.py +++ b/youtube_dl/extractor/megatvcom.py @@ -166,10 +166,26 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): url = '%s:%s' % (scheme, url) yield url + def _match_canonical_url(self, webpage): + LINK_RE = r'''(?x) + %(quot_re)s)(?Pcanonical)(?P=_q1)| + href=(?P<_q2>%(quot_re)s)(?P(?:(?!(?P=_q2)).)+)(?P=_q2)| + [^>]*? + )+> + ''' % {'quot_re': r'["\']'} + for mobj in re.finditer(LINK_RE, webpage): + canonical, href = mobj.group('canonical', 'href') + if canonical and href: + return unescapeHTML(href) + def _real_extract(self, url): webpage = self._download_webpage(url, 'N/A') player_attrs = self._extract_player_attrs(webpage) - canonical_url = player_attrs['share_url'] + canonical_url = player_attrs.get('share_url') or \ + self._match_canonical_url(webpage) + if not canonical_url: + raise ExtractorError('canonical URL not found') video_id = compat_parse_qs(compat_urllib_parse_urlparse( canonical_url).query)['p'][0]