Add MegaTVCom IEs

* Add new IEs
  * MegaTVComBaseIE: Base IE class
  * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all
    sorts of pages showing videos on megatv.com
  * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos
* When video_id is not matched in the URL, namely for news articles,
  extract it (article_id) from a particular element on the web page
* Derive metadata and sources directly from the web page, from data
  attributes of the player placeholder element and other commonly used
  elements
* Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the
  metadata on the embeddable page are some times slightly different, for
  the same video
This commit is contained in:
Zenon Mousmoulas 2021-11-10 07:42:10 +02:00
parent a803582717
commit 34c3b06402
3 changed files with 197 additions and 0 deletions

View File

@ -1038,6 +1038,10 @@ from .rutube import (
RutubePersonIE, RutubePersonIE,
RutubePlaylistIE, RutubePlaylistIE,
) )
from .megatvcom import (
MegaTVComIE,
MegaTVComEmbedIE,
)
from .rutv import RUTVIE from .rutv import RUTVIE
from .ruutu import RuutuIE from .ruutu import RuutuIE
from .ruv import RuvIE from .ruv import RuvIE

View File

@ -100,6 +100,7 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE from .videopress import VideoPressIE
from .rutube import RutubeIE from .rutube import RutubeIE
from .megatvcom import MegaTVComEmbedIE
from .limelight import LimelightBaseIE from .limelight import LimelightBaseIE
from .anvato import AnvatoIE from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE from .washingtonpost import WashingtonPostIE
@ -3199,6 +3200,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for megatv.com embeds
megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url))
if megatvcom_urls:
return self.playlist_from_matches(
megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
# Look for WashingtonPost embeds # Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage) wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls: if wapo_urls:

View File

@ -0,0 +1,186 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
get_element_by_class,
unified_timestamp,
extract_attributes,
clean_html,
unescapeHTML,
)
class MegaTVComBaseIE(InfoExtractor):
_PLAYER_DIV_ID = 'player_div_id'
def _extract_player_attrs(self, webpage):
PLAYER_DIV_RE = r'''(?x)
<div(?:
id=(?P<_q1>[\"\'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)|
[^>]*?
)+>
''' % {'pdi': self._PLAYER_DIV_ID}
for mobj in re.finditer(PLAYER_DIV_RE, webpage):
if mobj.group(self._PLAYER_DIV_ID):
player_el = mobj.group(0)
break
else:
raise ExtractorError('no <div id="%s"> element found in webpage' %
self._PLAYER_DIV_ID)
return {
re.sub(r'^data-(?:kwik_)?', '', k): v
for k, v in extract_attributes(player_el).items()
if k not in ('id',)
}
class MegaTVComIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom'
IE_DESC = 'megatv.com videos'
_VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P<id>\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)'
_TESTS = [{
'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
'md5': '2ebe96661cb81854889053cebb661068',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
},
}, {
'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
'md5': '8ab0c9d664cea11678670202b87bb2b1',
'info_dict': {
'id': '527800',
'ext': 'mp4',
'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
'timestamp': 1636048859,
'upload_date': '20211104',
},
}]
def _match_article_id(self, webpage):
ART_RE = r'''(?x)
<article(?:
id=(?P<_q2>[\"\'])Article_(?P<article>\d+)(?P=_q2)|
[^>]*?
)+>
'''
return compat_str(self._search_regex(ART_RE, webpage, 'article_id',
group='article'))
def _real_extract(self, url):
video_id = self._match_id(url)
_is_article = video_id == 'None'
webpage = self._download_webpage(url,
'N/A' if _is_article else
video_id)
if _is_article:
video_id = self._match_article_id(webpage)
player_attrs = self._extract_player_attrs(webpage)
title = player_attrs.get('label') or self._og_search_title(webpage)
description = clean_html(get_element_by_class(
'article-wrapper' if _is_article else 'story_content',
webpage))
if not description:
description = self._og_search_description(webpage)
thumbnail = player_attrs.get('image') or \
self._og_search_thumbnail(webpage)
timestamp = unified_timestamp(self._html_search_meta(
'article:published_time', webpage))
try:
source = player_attrs['source']
except KeyError:
raise ExtractorError('no source found for %s' % video_id)
formats = self._extract_m3u8_formats(source, video_id, 'mp4') \
if determine_ext(source) == 'm3u8' else [source]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'formats': formats,
}
class MegaTVComEmbedIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom:embed'
IE_DESC = 'megatv.com embedded videos'
_VALID_URL = r'https?://(?:www\.)?megatv.com/embed/?\?p=\d+'
_TESTS = [{
'url': 'https://www.megatv.com/embed/?p=2020520979',
'md5': '2ebe96661cb81854889053cebb661068',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
},
}, {
'url': 'https://www.megatv.com/embed/?p=2020534081',
'md5': 'f9a15e315acbf01b128e8efa3f75aab3',
'info_dict': {
'id': '534081',
'ext': 'mp4',
'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
'timestamp': 1636376351,
'upload_date': '20211108',
},
}]
@classmethod
def _extract_urls(cls, webpage, origin_url=None):
# make the scheme in _VALID_URL optional
_URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
EMBED_RE = r'''(?x)
<iframe[^>]+?src=(?P<_q1>%(quot_re)s)
(?P<url>%(url_re)s)(?P=_q1)
''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE}
for mobj in re.finditer(EMBED_RE, webpage):
if url.startswith('//'):
scheme = compat_urllib_parse_urlparse(origin_url).scheme \
if origin_url else 'https'
url = '%s:%s' % (scheme, url)
yield unescapeHTML(mobj.group('url'))
def _real_extract(self, url):
webpage = self._download_webpage(url, 'N/A')
player_attrs = self._extract_player_attrs(webpage)
canonical_url = player_attrs['share_url']
video_id = compat_parse_qs(compat_urllib_parse_urlparse(
canonical_url).query)['p'][0]
# Resolve the canonical URL, following redirects, and defer to
# megatvcom, as the metadata extracted from the embeddable page some
# times are slightly different, for the same video
canonical_url = self._request_webpage(
HEADRequest(canonical_url), video_id,
note='Resolve canonical URL',
errnote='Could not resolve canonical URL').geturl()
return self.url_result(
canonical_url,
MegaTVComIE.ie_key(),
video_id
)