GlomexEmbedIE: Reuse _VALID_URL in _extract_urls

* Let _extract_urls build its iframe regex from _VALID_URL, after making the
  scheme optional and simplifying the query string part
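* Upon an iframe match:
  * Add the scheme to the matched URL if it is protocol-relative (starts
    with `//`), taking it from the origin URL and falling back to `https`
  * Match the resulting URL against the full _VALID_URL and skip it if it
    does not match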
This commit is contained in:
Zenon Mousmoulas 2021-11-11 11:16:29 +02:00
parent 4225c46d3b
commit d303e1e05f

View File

@ -162,7 +162,8 @@ class GlomexEmbedIE(GlomexBaseIE):
IE_NAME = 'glomex:embed' IE_NAME = 'glomex:embed'
IE_DESC = 'Glomex embedded videos' IE_DESC = 'Glomex embedded videos'
_BASE_PLAYER_URL = 'https://player.glomex.com/integration/1/iframe-player.html' _BASE_PLAYER_URL = 'https://player.glomex.com/integration/1/iframe-player.html'
_VALID_URL = r'(?:https?:)?//player\.glomex\.com/integration/[^/]+/iframe-player\.html\?(?:(?:integrationId=(?P<integration>[^&#]+)|playlistId=(?P<id>[^&#]+)|[^&=#]+=[^&#]+)&?)+' _VALID_URL = r'''(?x)https?://player\.glomex\.com/integration/[^/]+/iframe-player\.html
\?(?:(?:integrationId=(?P<integration>[^&#]+)|playlistId=(?P<id>[^&#]+)|[^&=#]+=[^&#]+)&?)+'''
_TESTS = [{ _TESTS = [{
'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf', 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
@ -219,12 +220,16 @@ class GlomexEmbedIE(GlomexBaseIE):
@classmethod @classmethod
def _extract_urls(cls, webpage, origin_url): def _extract_urls(cls, webpage, origin_url):
# make the scheme in _VALID_URL optional
_URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
# simplify the query string part of _VALID_URL; after extracting iframe
# src, the URL will be matched again
_URL_RE = _URL_RE.split(r'\?', 1)[0] + r'\?(?:(?!(?P=_q1)).)+'
# https://docs.glomex.com/publisher/video-player-integration/javascript-api/ # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
EMBED_RE = r'''(?x) EMBED_RE = r'''(?x)
(?: (?:
<iframe[^>]+?src=(?P<_q1>%(quot_re)s) <iframe[^>]+?src=(?P<_q1>%(quot_re)s)
(?P<url>(?:https?:)?//player\.glomex\.com/integration/[^/]+/iframe-player\.html\? (?P<url>%(url_re)s)(?P=_q1)|
(?:(?!(?P=_q1)).)+)(?P=_q1)|
<(?P<html_tag>glomex-player|div)(?: <(?P<html_tag>glomex-player|div)(?:
data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)| data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)| data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
@ -240,7 +245,7 @@ class GlomexEmbedIE(GlomexBaseIE):
(?:\s|.)*? (?:\s|.)*?
)+</script> )+</script>
) )
''' % {'quot_re': r'[\"\']'} ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE}
for mobj in re.finditer(EMBED_RE, webpage): for mobj in re.finditer(EMBED_RE, webpage):
url, html_tag, video_id_html, integration_html, glomex_player, \ url, html_tag, video_id_html, integration_html, glomex_player, \
script_tag, video_id_js, integration_js = \ script_tag, video_id_js, integration_js = \
@ -248,7 +253,14 @@ class GlomexEmbedIE(GlomexBaseIE):
'integration_html', 'glomex_player', 'script_tag', 'integration_html', 'glomex_player', 'script_tag',
'id_js', 'integration_js') 'id_js', 'integration_js')
if url: if url:
yield cls._smuggle_origin_url(unescapeHTML(url), origin_url) url = unescapeHTML(url)
if url.startswith('//'):
scheme = compat_urllib_parse_urlparse(origin_url).scheme \
if origin_url else 'https'
url = '%s:%s' % (scheme, url)
if not cls.suitable(url):
continue
yield cls._smuggle_origin_url(url, origin_url)
elif html_tag: elif html_tag:
if html_tag == "div" and not glomex_player: if html_tag == "div" and not glomex_player:
continue continue