Merge remote-tracking branch 'upstream/master'

2014-08-19 07:15:33 -07:00 · 2014-08-19 07:15:33 -07:00 · ca7a9c1bf7
commit ca7a9c1bf7
parent 247a5da704 696d49815e
7 changed files with 237 additions and 14 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -147,6 +147,7 @@ from .ivi import (
 from .izlesene import IzleseneIE
 from .jadorecettepub import JadoreCettePubIE
 from .jeuxvideo import JeuxVideoIE
 from .jove import JoveIE
 from .jukebox import JukeboxIE
 from .justintv import JustinTVIE
 from .jpopsukitv import JpopsukiIE
@ -178,6 +179,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mixcloud import MixcloudIE
 from .mlb import MLBIE
 from .mpora import MporaIE
@ -252,6 +254,7 @@ from .ro220 import Ro220IE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rtlnl import RtlXlIE
 from .rtlnow import RTLnowIE
 from .rts import RTSIE
 from .rtve import RTVEALaCartaIE
--- a/youtube_dl/extractor/dfb.py
+++ b/youtube_dl/extractor/dfb.py
@ -30,7 +30,7 @@ class DFBIE(InfoExtractor):
            video_id)
        video_info = player_info.find('video')
-        f4m_info = self._download_xml(video_info.find('url').text, video_id)
+        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
        token_el = f4m_info.find('token')
        manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
--- a/youtube_dl/extractor/jove.py
+++ b/youtube_dl/extractor/jove.py
@ -0,0 +1,80 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    unified_strdate
 )
 class JoveIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
    _TESTS = [
        {
            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
            'info_dict': {
                'id': '2744',
                'ext': 'mp4',
                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
                'thumbnail': 're:^https?://.*\.png$',
                'upload_date': '20110523',
            }
        },
        {
            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
            'md5': '914aeb356f416811d911996434811beb',
            'info_dict': {
                'id': '51796',
                'ext': 'mp4',
                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
                'description': 'md5:35ff029261900583970c4023b70f1dc9',
                'thumbnail': 're:^https?://.*\.png$',
                'upload_date': '20140802',
            }
        },
    ]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        chapters_id = self._html_search_regex(
            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
        chapters_xml = self._download_xml(
            self._CHAPTERS_URL.format(video_id=chapters_id),
            video_id, note='Downloading chapters XML',
            errnote='Failed to download chapters XML')
        video_url = chapters_xml.attrib.get('video')
        if not video_url:
            raise ExtractorError('Failed to get the video URL')
        title = self._html_search_meta('citation_title', webpage, 'title')
        thumbnail = self._og_search_thumbnail(webpage)
        description = self._html_search_regex(
            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
            webpage, 'description', fatal=False)
        publish_date = unified_strdate(self._html_search_meta(
            'citation_publication_date', webpage, 'publish date', fatal=False))
        comment_count = self._html_search_regex(
            r'<meta name="num_comments" content="(\d+) Comments?"',
            webpage, 'comment count', fatal=False)
        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'thumbnail': thumbnail,
            'description': description,
            'upload_date': publish_date,
            'comment_count': comment_count,
        }
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@ -0,0 +1,60 @@
 from __future__ import unicode_literals
 import re
 import json
 from .common import InfoExtractor
 from ..utils import (
    compat_urllib_parse,
    get_element_by_attribute,
    parse_duration,
    strip_jsonp,
 )
 class MiTeleIE(InfoExtractor):
    IE_NAME = 'mitele.es'
    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
    _TEST = {
        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
        'info_dict': {
            'id': '0fce117d',
            'ext': 'mp4',
            'title': 'Programa 144 - Tor, la web invisible',
            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
            'display_id': 'programa-144',
            'duration': 2913,
        },
    }
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        episode = mobj.group('episode')
        webpage = self._download_webpage(url, episode)
        embed_data_json = self._search_regex(
            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
            flags=re.DOTALL
        ).replace('\'', '"')
        embed_data = json.loads(embed_data_json)
        info_url = embed_data['flashvars']['host']
        info_el = self._download_xml(info_url, episode).find('./video/info')
        video_link = info_el.find('videoUrl/link').text
        token_query = compat_urllib_parse.urlencode({'id': video_link})
        token_info = self._download_json(
            'http://token.mitele.es/?' + token_query, episode,
            transform_source=strip_jsonp
        )
        return {
            'id': embed_data['videoId'],
            'display_id': episode,
            'title': info_el.find('title').text,
            'url': token_info['tokenizedUrl'],
            'description': get_element_by_attribute('class', 'text', webpage),
            'thumbnail': info_el.find('thumb').text,
            'duration': parse_duration(info_el.find('duration').text),
        }
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@ -20,7 +20,8 @@ class PBSIE(InfoExtractor):
        )
    '''
-    _TEST = {
+    _TESTS = [
        {
            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
            'md5': 'ce1888486f0908d555a8093cac9a7362',
            'info_dict': {
@ -30,7 +31,30 @@ class PBSIE(InfoExtractor):
                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
                'duration': 3190,
            },
-    }
+        },
        {
            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
            'md5': '143c98aa54a346738a3d78f54c925321',
            'info_dict': {
                'id': '2365297690',
                'ext': 'mp4',
                'title': 'Losing Iraq',
                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
                'duration': 5050,
            },
        },
        {
            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
            'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
            'info_dict': {
                'id': '2201174722',
                'ext': 'mp4',
                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
                'duration': 801,
            },
        },
    ]
    def _extract_ids(self, url):
        mobj = re.match(self._VALID_URL, url)
@ -40,10 +64,13 @@ class PBSIE(InfoExtractor):
        if presumptive_id:
            webpage = self._download_webpage(url, display_id)
-            # frontline video embed
+            MEDIA_ID_REGEXES = [
                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
                r'class="coveplayerid">([^<]+)<',                       # coveplayer
            ]
            media_id = self._search_regex(
-                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",
+                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
                webpage, 'frontline video ID', fatal=False, default=None)
            if media_id:
                return media_id, presumptive_id
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@ -0,0 +1,52 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class RtlXlIE(InfoExtractor):
    IE_NAME = 'rtlxl.nl'
    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
    _TEST = {
        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
        'info_dict': {
            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
            'ext': 'flv',
            'title': 'RTL Nieuws - Laat',
            'description': 'Dagelijks het laatste nieuws uit binnen- en '
                'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van '
                'onze mobiele apps.',
            'timestamp': 1408051800,
            'upload_date': '20140814',
        },
        'params': {
            # We download the first bytes of the first fragment, it can't be
            # processed by the f4m downloader beacuse it isn't complete
            'skip_download': True,
        },
    }
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        uuid = mobj.group('uuid')
        info = self._download_json(
            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
            uuid)
        meta = info['meta']
        material = info['material'][0]
        episode_info = info['episodes'][0]
        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
        progname = info['abstracts'][0]['name']
        subtitle = material['title'] or info['episodes'][0]['name']
        return {
            'id': uuid,
            'title': '%s - %s' % (progname, subtitle), 
            'formats': self._extract_f4m_formats(f4m_url, uuid),
            'timestamp': material['original_date'],
            'description': episode_info['synopsis'],
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -827,6 +827,7 @@ def unified_strdate(date_str):
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',