[comedycentral] fix extraction(closes #27905 )

[wat] remove unused variable
[wat] fix format extraction(closes #27901 )
2021-01-21 23:53:09 +01:00 · 2021-01-21 17:22:30 +01:00 · 2021-01-21 17:20:32 +01:00
5 changed files with 50 additions and 196 deletions
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,142 +1,51 @@
 from __future__ import unicode_literals

 from .mtv import MTVServicesInfoExtractor
-from .common import InfoExtractor


 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
-        (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
-        /(?P<title>.*)'''
+    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'

    _TESTS = [{
-        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
-        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
+        'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike',
+        'md5': 'b8acb347177c680ff18a292aa2166f80',
        'info_dict': {
-            'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025',
            'ext': 'mp4',
-            'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
-            'description': 'After a certain point, breastfeeding becomes c**kblocking.',
-            'timestamp': 1376798400,
-            'upload_date': '20130818',
+            'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike',
+            'description': 'md5:5334307c433892b85f4f5e5ac9ef7498',
+            'timestamp': 1598670000,
+            'upload_date': '20200829',
        },
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+        'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314',
        'only_matching': True,
-    }]
-
-
-class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
-        (?:full-episodes|shows(?=/[^/]+/full-episodes))
-        /(?P<id>[^?]+)'''
-    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
-
-    _TESTS = [{
-        'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028',
-        'info_dict': {
-            'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."',
-            'title': 'November 28, 2016 - Ryan Speedo Green',
-        },
-        'playlist_count': 4,
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
-        mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1')
-        videos_info = self._get_videos_info(mgid)
-        return videos_info
-
-
-class ToshIE(MTVServicesInfoExtractor):
-    IE_DESC = 'Tosh.0'
-    _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
-    _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
-
-    _TESTS = [{
-        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
-        'info_dict': {
-            'description': 'Tosh asked fans to share their summer plans.',
-            'title': 'Twitter Users Share Summer Plans',
-        },
-        'playlist': [{
-            'md5': 'f269e88114c1805bb6d7653fecea9e06',
-            'info_dict': {
-                'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
-                'ext': 'mp4',
-                'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
-                'description': 'Tosh asked fans to share their summer plans.',
-                'thumbnail': r're:^https?://.*\.jpg',
-                # It's really reported to be published on year 2077
-                'upload_date': '20770610',
-                'timestamp': 3390510600,
-                'subtitles': {
-                    'en': 'mincount:3',
-                },
-            },
-        }]
-    }, {
-        'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
+        'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
        'only_matching': True,
    }]


 class ComedyCentralTVIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})'
    _TESTS = [{
-        'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+        'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1',
        'info_dict': {
-            'id': 'local_playlist-f99b626bdfe13568579a',
-            'ext': 'flv',
-            'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
+            'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285',
+            'ext': 'mp4',
+            'title': 'Josh Investigates',
+            'description': 'Steht uns das Ende der Welt bevor?',
        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        },
-    }, {
-        'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
-        'only_matching': True,
    }]
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+    _GEO_COUNTRIES = ['DE']

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        mrss_url = self._search_regex(
-            r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
-            webpage, 'mrss url', group='url')
-
-        return self._get_videos_info_from_url(mrss_url, video_id)
-
-
-class ComedyCentralShortnameIE(InfoExtractor):
-    _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$'
-    _TESTS = [{
-        'url': ':tds',
-        'only_matching': True,
-    }, {
-        'url': ':thedailyshow',
-        'only_matching': True,
-    }, {
-        'url': ':theopposition',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        shortcut_map = {
-            'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-            'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-            'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes',
+    def _get_feed_query(self, uri):
+        return {
+            'accountOverride': 'intl.mtvi.com',
+            'arcEp': 'web.cc.tv',
+            'ep': 'b9032c3a',
+            'imageEp': 'web.cc.tv',
+            'mgid': uri,
        }
-        return self.url_result(shortcut_map[video_id])
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -235,11 +235,8 @@ from .cnn import (
 )
 from .coub import CoubIE
 from .comedycentral import (
-    ComedyCentralFullEpisodesIE,
    ComedyCentralIE,
-    ComedyCentralShortnameIE,
    ComedyCentralTVIE,
-    ToshIE,
 )
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .commonprotocols import (
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor):

        return try_get(feed, lambda x: x['result']['data']['id'], compat_str)

+    @staticmethod
+    def _extract_child_with_type(parent, t):
+        return next(c for c in parent['children'] if c.get('type') == t)
+
    def _extract_mgid(self, webpage):
        try:
            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
@@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
        if not mgid:
            mgid = self._extract_triforce_mgid(webpage)

+        if not mgid:
+            data = self._parse_json(self._search_regex(
+                r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
+            main_container = self._extract_child_with_type(data, 'MainContainer')
+            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
+            mgid = video_player['props']['media']['video']['config']['uri']
+
        return mgid

    def _real_extract(self, url):
@@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor):
        'only_matching': True,
    }]

-    @staticmethod
-    def extract_child_with_type(parent, t):
-        children = parent['children']
-        return next(c for c in children if c.get('type') == t)
-
-    def _extract_mgid(self, webpage):
-        data = self._parse_json(self._search_regex(
-            r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
-        main_container = self.extract_child_with_type(data, 'MainContainer')
-        video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
-        return video_player['props']['media']['video']['config']['uri']
-

 class MTVJapanIE(MTVServicesInfoExtractor):
    IE_NAME = 'mtvjapan'
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor):
    _FEED_URL = 'http://www.bellator.com/feeds/mrss/'
    _GEO_COUNTRIES = ['US']

-    def _extract_mgid(self, webpage):
-        return self._extract_triforce_mgid(webpage)
-

 class ParamountNetworkIE(MTVServicesInfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
@@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
    def _get_feed_query(self, uri):
        return {
            'arcEp': 'paramountnetwork.com',
+            'imageEp': 'paramountnetwork.com',
            'mgid': uri,
        }
-
-    def _extract_mgid(self, webpage):
-        root_data = self._parse_json(self._search_regex(
-            r'window\.__DATA__\s*=\s*({.+})',
-            webpage, 'data'), None)
-
-        def find_sub_data(data, data_type):
-            return next(c for c in data['children'] if c.get('type') == data_type)
-
-        c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
-        return c['props']['media']['video']['config']['uri']
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -1,12 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
-    ExtractorError,
    unified_strdate,
    HEADRequest,
    int_or_none,
@@ -46,15 +43,6 @@ class WatIE(InfoExtractor):
        },
    ]

-    _FORMATS = (
-        (200, 416, 234),
-        (400, 480, 270),
-        (600, 640, 360),
-        (1200, 640, 360),
-        (1800, 960, 540),
-        (2500, 1280, 720),
-    )
-
    def _real_extract(self, url):
        video_id = self._match_id(url)
        video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
@@ -97,46 +85,20 @@ class WatIE(InfoExtractor):
                    return red_url
            return None

-        def remove_bitrate_limit(manifest_url):
-            return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url)
-
        formats = []
-        try:
-            alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')]
        manifest_urls = self._download_json(
            'http://www.wat.tv/get/webhtml/' + video_id, video_id)
        m3u8_url = manifest_urls.get('hls')
        if m3u8_url:
-                m3u8_url = remove_bitrate_limit(m3u8_url)
-                for m3u8_alt_url in alt_urls(m3u8_url):
            formats.extend(self._extract_m3u8_formats(
-                        m3u8_alt_url, video_id, 'mp4',
+                m3u8_url, video_id, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False))
-                    formats.extend(self._extract_f4m_formats(
-                        m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'),
-                        video_id, f4m_id='hds', fatal=False))
        mpd_url = manifest_urls.get('mpd')
        if mpd_url:
-                mpd_url = remove_bitrate_limit(mpd_url)
-                for mpd_alt_url in alt_urls(mpd_url):
            formats.extend(self._extract_mpd_formats(
-                        mpd_alt_url, video_id, mpd_id='dash', fatal=False))
+                mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
+                video_id, mpd_id='dash', fatal=False))
        self._sort_formats(formats)
-        except ExtractorError:
-            abr = 64
-            for vbr, width, height in self._FORMATS:
-                tbr = vbr + abr
-                format_id = 'http-%s' % tbr
-                fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
-                if self._is_valid_url(fmt_url, video_id, format_id):
-                    formats.append({
-                        'format_id': format_id,
-                        'url': fmt_url,
-                        'vbr': vbr,
-                        'abr': abr,
-                        'width': width,
-                        'height': height,
-                    })

        date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
Author	SHA1	Message	Date
Remita Amine	fa8f6d8580	[comedycentral] fix extraction(closes #27905 )	2021-01-21 23:53:09 +01:00
Remita Amine	3bb7769c40	[wat] remove unused variable	2021-01-21 17:22:30 +01:00
Remita Amine	8d286bd5b6	[wat] fix format extraction(closes #27901 )	2021-01-21 17:20:32 +01:00