Improve course/category extraction

This commit is contained in:
dirkf 2022-10-29 15:57:14 +00:00 committed by GitHub
parent 0a99e9f59d
commit eff6cd4c24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,6 +1,8 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_kwargs, compat_kwargs,
@ -27,6 +29,42 @@ class PlatziBaseIE(InfoExtractor):
_LOGIN_URL = 'https://platzi.com/login/' _LOGIN_URL = 'https://platzi.com/login/'
_NETRC_MACHINE = 'platzi' _NETRC_MACHINE = 'platzi'
def _raise_extractor_error(self, video_id, reason, expected=True):
    """Raise an ExtractorError tagged with the IE name and the video ID."""
    message = '[%s] %s: %s' % (self.IE_NAME, video_id, reason)
    raise ExtractorError(message, expected=expected)
def _download_webpage(self, url_or_request, video_id, *args, **kwargs):
    """Download a page, working around the site's Cloudflare fence.

    CF likes Connection: keep-alive and so disfavours Py2; a request may
    come back 403, in which case one retry is attempted.  Afterwards the
    final URL is inspected to detect expired content (redirect to the
    home page), a login redirect, and on-page error text.

    Returns the webpage content, or False if the download was skipped.
    Raises ExtractorError (via _raise_extractor_error) on detected errors.
    """
    # Accept a 403 response so we can inspect it and retry instead of
    # failing immediately.
    kwargs['expected_status'] = 403
    # A browser-like User-Agent header is required for Py3 to breach the
    # site's CF fence without a 403.
    headers = kwargs.get('headers') or {}
    new_hdrs = {}
    if 'User-Agent' not in headers:
        headers['User-Agent'] = 'Mozilla/5.0'
        kwargs['headers'] = new_hdrs = headers
    if new_hdrs:
        kwargs = compat_kwargs(kwargs)
    for _ in range(2):
        x = super(PlatziBaseIE, self)._download_webpage_handle(
            url_or_request, video_id, *args, **kwargs)
        if x is False:
            return x
        if x[1].getcode() != 403:
            break
        # Got a 403: retry once, no longer treating 403 as expected.
        kwargs.pop('expected_status', None)
        note = kwargs.pop('note', '')
        kwargs['note'] = (note or 'Downloading webpage') + ' - retrying'
        kwargs = compat_kwargs(kwargs)
    # Fix: compare the URL *path*, not the whole ParseResult -- a
    # ParseResult never equals a string, which made both redirect
    # checks below dead code.
    path = compat_urllib_parse_urlparse(x[1].geturl()).path
    if path == '/':
        self._raise_extractor_error(video_id, 'Redirected to home page: content expired?')
    elif path == '/login':
        self.raise_login_required()
    else:
        errs = clean_html(get_element_by_class('Errorpage-text', x[0]))
        if errs:
            self._raise_extractor_error(video_id, errs)
    return x[0]
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -122,29 +160,10 @@ class PlatziIE(PlatziBaseIE):
'skip': 'Content expired', 'skip': 'Content expired',
}] }]
def _download_webpage_handle(self, url_or_request, video_id, *args, **kwargs):
    """Download with a single retry on HTTP 403.

    CF likes Connection: keep-alive and so disfavours Py2; retrying
    a 403 response may get through.
    """
    kwargs['expected_status'] = 403
    result = super(PlatziIE, self)._download_webpage_handle(
        url_or_request, video_id, *args, **kwargs)
    got_403 = result is not False and result[1].getcode() == 403
    if got_403:
        # Second attempt: 403 is no longer expected, and the note
        # indicates the retry.
        kwargs.pop('expected_status', None)
        base_note = kwargs.pop('note', '') or 'Downloading webpage'
        kwargs['note'] = base_note + ' - retrying'
        result = super(PlatziIE, self)._download_webpage_handle(
            url_or_request, video_id, *args, **compat_kwargs(kwargs))
    return result
def _real_extract(self, url): def _real_extract(self, url):
lecture_id = self._match_id(url) lecture_id = self._match_id(url)
# header parameters required fpor Py3 to breach site's CF fence w/o 403 webpage = self._download_webpage(url, lecture_id)
headers = {
'User-Agent': 'Mozilla/5.0',
}
webpage, urlh = self._download_webpage_handle(url, lecture_id, headers=headers)
if compat_urllib_parse_urlparse(urlh.geturl()).path == '/':
raise ExtractorError(
'Redirected to home page: content expired?', expected=True)
data_preloaded_state = self._parse_json( data_preloaded_state = self._parse_json(
self._search_regex( self._search_regex(
@ -158,11 +177,10 @@ class PlatziIE(PlatziBaseIE):
why = video_player['blockedInfo'].get('type') or 'unspecified' why = video_player['blockedInfo'].get('type') or 'unspecified'
if why == 'unlogged': if why == 'unlogged':
self.raise_login_required() self.raise_login_required()
raise ExtractorError( self._raise_extractor_error(video_id, 'All video formats blocked because ' + why)
'All video formats blocked because ' + why, expected=True)
formats = [] formats = []
headers['Referer'] = url headers = {'Referer': url}
extractions = { extractions = {
'hls': lambda x: formats.extend(self._extract_m3u8_formats( 'hls': lambda x: formats.extend(self._extract_m3u8_formats(
server_json[x], lecture_id, 'mp4', server_json[x], lecture_id, 'mp4',
@ -209,17 +227,35 @@ class PlatziCourseIE(PlatziBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?P<clas>
platzi\.com/clases| # es version platzi\.com/clases| # es version
courses\.platzi\.com/classes # en version courses\.platzi\.com/classes # en version
)|
platzi\.com(?:/(?P<curs>cursos))?
)/(?P<id>[^/?\#&]+) )/(?P<id>[^/?\#&]+)
''' '''
_TESTS = [{ _TESTS = [{
'url': 'https://platzi.com/web-angular/',
'info_dict': {
'id': 'web-angular',
'title': 'Frontend con Angular',
},
'playlist_count': 9,
}, {
'url': 'https://platzi.com/cursos/angular/',
'info_dict': {
'id': '2478',
'title': 'Curso de Fundamentos de Angular',
},
'playlist_count': 21,
}, {
'url': 'https://platzi.com/clases/next-js/', 'url': 'https://platzi.com/clases/next-js/',
'info_dict': { 'info_dict': {
'id': '1311', 'id': '1311',
'title': 'Curso de Next.js', 'title': 'Curso de Next.js',
}, },
'playlist_count': 22, 'playlist_count': 22,
'skip': 'Oops (updating page)',
}, { }, {
'url': 'https://courses.platzi.com/classes/communication-codestream/', 'url': 'https://courses.platzi.com/classes/communication-codestream/',
'info_dict': { 'info_dict': {
@ -227,16 +263,54 @@ class PlatziCourseIE(PlatziBaseIE):
'title': 'Codestream Course', 'title': 'Codestream Course',
}, },
'playlist_count': 14, 'playlist_count': 14,
'skip': 'Content expired',
}] }]
@classmethod
def _match_valid_url(cls, url):
    """Match *url* against this IE's _VALID_URL; None when it does not match."""
    pattern = cls._VALID_URL
    return re.match(pattern, url)
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url)
def __extract_things(self, webpage, thing_id, thing_pattern):
    """Build a playlist from every link in *webpage* matching *thing_pattern*.

    The pattern must expose a named group 'path' holding a site-relative URL.
    """
    matches = re.finditer(thing_pattern, webpage)
    title = self._og_search_title(webpage, default=None)

    def to_abs_url(match):
        return urljoin('https://platzi.com', match.group('path'))

    return self.playlist_from_matches(
        matches,
        playlist_id=thing_id,
        playlist_title=title,
        getter=to_abs_url)
def _extract_classes(self, webpage, course_id):
    """Playlist of the individual classes linked from a course page."""
    slug = course_id
    # Prefer the numeric courseId embedded in the page; fall back to the slug.
    numeric_id = self._search_regex(
        r'''(["'])courseId\1\s*:\s*(?P<id>\d+)''',
        webpage, 'course id', group='id', fatal=False)
    class_link_re = (
        r'''<a\b[^>]+\bhref\s*=\s*['"]?(?P<path>/clases/\d+-%s/[^/]+)'''
        % (slug, ))
    return self.__extract_things(webpage, numeric_id or slug, class_link_re)
def _extract_categories(self, webpage, cat_id):
    """Playlist of the courses listed on a category page."""
    course_link_re = r'''<a\b[^>]+\bhref\s*=\s*['"]?(?P<path>/cursos/[^/]+)'''
    return self.__extract_things(webpage, cat_id, course_link_re)
def _real_extract(self, url):
m = self._match_valid_url(url)
classes, courses, this_id = m.group('clas', 'curs', 'id')
webpage = self._download_webpage(url, this_id)
if courses:
return self._extract_classes(webpage, this_id)
if not classes:
return self._extract_categories(webpage, this_id)
# this branch now seems always to give "Oops" pages
course_name = this_id
initialData = self._search_regex( initialData = self._search_regex(
(r'window.initialData\s*=\s*({.+?})\s*;\s*\n', r'window.initialData\s*=\s*({.+?})\s*;'), (r'window.initialData\s*=\s*({.+?})\s*;\s*\n', r'window.initialData\s*=\s*({.+?})\s*;'),