Add MegaTVCom IEs

* Add new IEs * MegaTVComBaseIE: Base IE class * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all sorts of pages showing videos on megatv.com * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos * When video_id is not matched in the URL, namely for news articles, extract it (article_id) from a particular element on the web page * Derive metadata and sources directly from the web page, from data attributes of the player placeholder element and other commonly used elements * Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the metadata on the embeddable page are sometimes slightly different, for the same video
2021-11-10 07:42:10 +02:00 · 2021-11-10 07:42:10 +02:00 · 34c3b06402
commit 34c3b06402
parent a803582717
3 changed files with 197 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1038,6 +1038,10 @@ from .rutube import (
    RutubePersonIE,
    RutubePlaylistIE,
 )
 from .megatvcom import (
    MegaTVComIE,
    MegaTVComEmbedIE,
 )
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -100,6 +100,7 @@ from .ustream import UstreamIE
 from .arte import ArteTVEmbedIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .megatvcom import MegaTVComEmbedIE
 from .limelight import LimelightBaseIE
 from .anvato import AnvatoIE
 from .washingtonpost import WashingtonPostIE
@ -3199,6 +3200,12 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
        # Look for megatv.com embeds
        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url))
        if megatvcom_urls:
            return self.playlist_from_matches(
                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
        # Look for WashingtonPost embeds
        wapo_urls = WashingtonPostIE._extract_urls(webpage)
        if wapo_urls:
--- a/youtube_dl/extractor/megatvcom.py
+++ b/youtube_dl/extractor/megatvcom.py
@ -0,0 +1,186 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import hashlib
 import re
 from .common import InfoExtractor
 from ..compat import (
    compat_str,
    compat_parse_qs,
    compat_urllib_parse_urlparse,
 )
 from ..utils import (
    HEADRequest,
    ExtractorError,
    determine_ext,
    get_element_by_class,
    unified_timestamp,
    extract_attributes,
    clean_html,
    unescapeHTML,
 )
 class MegaTVComBaseIE(InfoExtractor):
    """Shared helper code for the megatv.com extractors."""

    # Value of the ``id`` attribute of the player placeholder <div>; it is
    # also reused as the name of the regex group that detects that attribute.
    _PLAYER_DIV_ID = 'player_div_id'

    def _extract_player_attrs(self, webpage):
        """Return the attributes of the player placeholder <div> as a dict.

        The first <div> tag in ``webpage`` whose ``id`` attribute equals
        ``_PLAYER_DIV_ID`` is selected. Its ``id`` attribute is dropped and
        a leading ``data-`` or ``data-kwik_`` is stripped from the remaining
        attribute names.

        Raises ExtractorError when no such <div> is present.
        """
        player_div_re = r'''(?x)
        <div(?:
            id=(?P<_q1>[\"\'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)|
            [^>]*?
        )+>
        ''' % {'pdi': self._PLAYER_DIV_ID}

        # Every <div ...> tag matches the pattern; only the ones where the
        # named group took part in the match carry the wanted id.
        player_el = next(
            (candidate.group(0)
             for candidate in re.finditer(player_div_re, webpage)
             if candidate.group(self._PLAYER_DIV_ID)),
            None)
        if player_el is None:
            raise ExtractorError('no <div id="%s"> element found in webpage' %
                                 self._PLAYER_DIV_ID)

        attrs = {}
        for name, value in extract_attributes(player_el).items():
            if name in ('id',):
                continue
            attrs[re.sub(r'^data-(?:kwik_)?', '', name)] = value
        return attrs
 class MegaTVComIE(MegaTVComBaseIE):
    """Extractor for megatv.com pages (TV show/VOD pages and news articles)."""

    IE_NAME = 'megatvcom'
    IE_DESC = 'megatv.com videos'
    # Two URL shapes are accepted:
    #   /<section>/<numeric id>/<slug>   -> the ``id`` group is captured
    #   /YYYY/MM/DD/<slug>               -> article URL, no ``id`` group
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P<id>\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)'
    _TESTS = [{
        'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
        'md5': '2ebe96661cb81854889053cebb661068',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
        },
    }, {
        'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
        'md5': '8ab0c9d664cea11678670202b87bb2b1',
        'info_dict': {
            'id': '527800',
            'ext': 'mp4',
            'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
            'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
            'timestamp': 1636048859,
            'upload_date': '20211104',
        },
    }]

    def _match_article_id(self, webpage):
        """Return the digits of the ``Article_<digits>`` id found on an
        <article> tag in ``webpage``, as a string."""
        ART_RE = r'''(?x)
        <article(?:
            id=(?P<_q2>[\"\'])Article_(?P<article>\d+)(?P=_q2)|
            [^>]*?
        )+>
        '''
        return compat_str(self._search_regex(ART_RE, webpage, 'article_id',
                                             group='article'))

    def _real_extract(self, url):
        """Build the info dict for a megatv.com video or article page.

        Metadata is taken from the attributes of the player placeholder
        element, falling back to OpenGraph/meta tags of the page.

        Raises ExtractorError when the player element has no ``source``
        attribute.
        """
        video_id = self._match_id(url)
        # Article URLs do not capture the ``id`` group. The original check
        # compared against the string 'None', i.e. _match_id presumably
        # stringifies the missing group; a real None is tolerated as well.
        _is_article = video_id in (None, 'None')
        webpage = self._download_webpage(url,
                                         'N/A' if _is_article else
                                         video_id)
        if _is_article:
            # The ID is not part of the URL; read it from the page instead
            video_id = self._match_article_id(webpage)
        player_attrs = self._extract_player_attrs(webpage)
        title = player_attrs.get('label') or self._og_search_title(webpage)
        description = clean_html(get_element_by_class(
            'article-wrapper' if _is_article else 'story_content',
            webpage))
        if not description:
            description = self._og_search_description(webpage)
        thumbnail = player_attrs.get('image') or \
            self._og_search_thumbnail(webpage)
        timestamp = unified_timestamp(self._html_search_meta(
            'article:published_time', webpage))
        try:
            source = player_attrs['source']
        except KeyError:
            raise ExtractorError('no source found for %s' % video_id)
        if determine_ext(source) == 'm3u8':
            formats = self._extract_m3u8_formats(source, video_id, 'mp4')
        else:
            # Each format must be a dict carrying at least 'url'; a bare
            # URL string is not a valid format entry.
            formats = [{'url': source}]
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
        }
 class MegaTVComEmbedIE(MegaTVComBaseIE):
    """Extractor for iframe-embeddable megatv.com player pages.

    The actual extraction is delegated to MegaTVComIE through the canonical
    URL advertised by the embed page.
    """

    IE_NAME = 'megatvcom:embed'
    IE_DESC = 'megatv.com embedded videos'
    # The dot in the host name is escaped so that it only matches a literal
    # '.', in line with the pattern used by MegaTVComIE.
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/embed/?\?p=\d+'
    _TESTS = [{
        'url': 'https://www.megatv.com/embed/?p=2020520979',
        'md5': '2ebe96661cb81854889053cebb661068',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
        },
    }, {
        'url': 'https://www.megatv.com/embed/?p=2020534081',
        'md5': 'f9a15e315acbf01b128e8efa3f75aab3',
        'info_dict': {
            'id': '534081',
            'ext': 'mp4',
            'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
            'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
            'timestamp': 1636376351,
            'upload_date': '20211108',
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage, origin_url=None):
        """Yield the megatv.com embed URLs found in <iframe src=...> tags.

        Scheme-relative URLs (``//host/...``) are completed with the scheme
        of ``origin_url``, or with ``https`` when no usable scheme is
        available.
        """
        # make the scheme in _VALID_URL optional
        _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
        EMBED_RE = r'''(?x)
            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)
                (?P<url>%(url_re)s)(?P=_q1)
        ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE}
        for mobj in re.finditer(EMBED_RE, webpage):
            # Bind the matched URL first: it is inspected below and the
            # (possibly scheme-completed) value is what gets yielded.
            url = unescapeHTML(mobj.group('url'))
            if url.startswith('//'):
                scheme = (compat_urllib_parse_urlparse(origin_url).scheme
                          if origin_url else None) or 'https'
                url = '%s:%s' % (scheme, url)
            yield url

    def _real_extract(self, url):
        """Resolve the embed page to its canonical URL and hand over to
        MegaTVComIE.

        Raises ExtractorError when the player element carries no
        ``share_url`` attribute.
        """
        webpage = self._download_webpage(url, 'N/A')
        player_attrs = self._extract_player_attrs(webpage)
        try:
            canonical_url = player_attrs['share_url']
        except KeyError:
            raise ExtractorError('no share_url found in embed page')
        video_id = compat_parse_qs(compat_urllib_parse_urlparse(
            canonical_url).query)['p'][0]
        # Resolve the canonical URL, following redirects, and defer to
        # megatvcom, as the metadata extracted from the embeddable page
        # sometimes are slightly different, for the same video
        canonical_url = self._request_webpage(
            HEADRequest(canonical_url), video_id,
            note='Resolve canonical URL',
            errnote='Could not resolve canonical URL').geturl()
        return self.url_result(
            canonical_url,
            MegaTVComIE.ie_key(),
            video_id
        )