2014-01-16 20:52:17 -06:00
from __future__ import unicode_literals
2013-06-23 14:55:53 -05:00
import json
import re
2013-11-02 13:48:39 -05:00
from . subtitles import SubtitlesInfoExtractor
2013-06-23 14:55:53 -05:00
2013-11-05 05:00:13 -06:00
from . . utils import (
2014-03-05 06:22:10 -06:00
compat_str ,
2013-11-05 05:00:13 -06:00
RegexNotFoundError ,
)
2014-01-16 20:52:17 -06:00
2013-11-02 13:48:39 -05:00
class TEDIE(SubtitlesInfoExtractor):
    """Extractor for single talks and playlists hosted on www.ted.com.

    A URL matching ``_VALID_URL`` is dispatched either to ``_talk_info``
    (``/talks/...``) or to ``_playlist_videos_info`` (``/playlists/...``).
    Both read the JSON object that the page passes to ``q("<name>.init", ...)``.
    """

    _VALID_URL = r'''(?x)http://www\.ted\.com/
            (
                (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
                |
                ((?P<type_talk>talks)) # We have a simple talk
            )
            (/lang/(.*?))? # The url may contain the language
            /(?P<name>\w+) # Here goes the name and then ".html"
            '''
    _TEST = {
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'file': '102.mp4',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'title': 'The illusion of consciousness',
            'description': 'Philosopher Dan Dennett makes a compelling argument that not only don\'t we understand our own consciousness, but that half the time our brains are actively fooling us.',
            'uploader': 'Dan Dennett',
        }
    }

    # Maps the keys of a talk's ``nativeDownloads`` dict to the ``preference``
    # value stored on each format; ids not listed here get -1 (see _talk_info).
    _FORMATS_PREFERENCE = {
        'low': 1,
        'medium': 2,
        'high': 3,
    }

    def _extract_info(self, webpage):
        """Return the decoded JSON object embedded in *webpage*.

        The object is the second argument of the ``q("<name>.init", {...})``
        call that closes a ``<script>`` element.  A missing match is handled
        by ``self._search_regex`` (defined in a base class, not shown here).
        """
        # The dot before "init" is escaped so it only matches a literal '.'.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        """Dispatch *url* to the talk or the playlist extraction routine."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return self._talk_info(url)
        return self._playlist_videos_info(url, m.group('name'))

    def _playlist_videos_info(self, url, name):
        """Return the videos of the playlist at *url*.

        *name* is only passed to ``_download_webpage`` as the identifier for
        the download.  Each talk becomes a ``url_result`` entry pointing at
        its ``/talks/<slug>`` page, tagged with this extractor's key.
        """
        webpage = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        info = self._extract_info(webpage)
        playlist_info = info['playlist']

        playlist_entries = [
            self.url_result(
                'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
            for talk in info['talks']
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=compat_str(playlist_info['id']),
            playlist_title=playlist_info['title'])

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url.

        *video_id* is only used as the identifier handed to
        ``_download_webpage``; the id in the result comes from the page data.

        Returns the info dict of the talk, or ``None`` when the
        ``listsubtitles`` option is set (the available subtitles are listed
        instead).
        """
        m = re.match(self._VALID_URL, url)
        video_name = m.group('name')
        webpage = self._download_webpage(
            url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        formats = [{
            'ext': 'mp4',
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
            'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
        } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
        self._sort_formats(formats)

        # Stringify the id the same way _playlist_videos_info does for the
        # playlist id, so both kinds of result expose a text id.
        video_id = compat_str(talk_info['id'])

        # subtitles
        # Check the listing mode first: on that path the extracted subtitles
        # were never used, so extracting them beforehand was wasted work.
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return
        video_subtitles = self.extract_subtitles(video_id, talk_info)

        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': talk_info['thumb'],
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Return a dict mapping language code to SRT subtitle URL.

        The codes are the ``languageCode`` of every entry in
        ``talk_info['languages']``.  When that list is missing or empty a
        warning is reported and an empty dict is returned.
        """
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        sub_lang_list = {}
        for lang_code in languages:
            sub_lang_list[lang_code] = (
                'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
                % (video_id, lang_code))
        return sub_lang_list