[extractor/common] Expose fragments interface for dashsegments formats

This commit is contained in:
Sergey M․ 2016-09-06 01:21:57 +07:00
parent a0d5077c8d
commit b4c1d6e800
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -1551,21 +1551,12 @@ class InfoExtractor(object):
def extract_multisegment_info(element, ms_parent_info): def extract_multisegment_info(element, ms_parent_info):
ms_info = ms_parent_info.copy() ms_info = ms_parent_info.copy()
segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None: # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) # common attributes and elements. We will only extract relevant
if segment_urls_e: # for us.
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] def extract_common(source):
initialization = segment_list.find(_add_ns('Initialization')) segment_timeline = source.find(_add_ns('SegmentTimeline'))
if initialization is not None:
ms_info['initialization_url'] = initialization.attrib['sourceURL']
else:
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
start_number = segment_template.get('startNumber')
if start_number:
ms_info['start_number'] = int(start_number)
segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
if segment_timeline is not None: if segment_timeline is not None:
s_e = segment_timeline.findall(_add_ns('S')) s_e = segment_timeline.findall(_add_ns('S'))
if s_e: if s_e:
@ -1580,13 +1571,32 @@ class InfoExtractor(object):
'd': int(s.attrib['d']), 'd': int(s.attrib['d']),
'r': r, 'r': r,
}) })
else: start_number = source.get('startNumber')
timescale = segment_template.get('timescale') if start_number:
ms_info['start_number'] = int(start_number)
timescale = source.get('timescale')
if timescale: if timescale:
ms_info['timescale'] = int(timescale) ms_info['timescale'] = int(timescale)
segment_duration = segment_template.get('duration') segment_duration = source.get('duration')
if segment_duration: if segment_duration:
ms_info['segment_duration'] = int(segment_duration) ms_info['segment_duration'] = int(segment_duration)
def extract_Initialization(source):
    """Record the initialization segment URL declared under ``source``.

    Looks up the namespaced ``Initialization`` child of ``source`` and,
    when it exists, stores its ``sourceURL`` attribute in the enclosing
    ``ms_info`` dict under ``'initialization_url'``. Nothing is written
    when the child element is absent.
    """
    init_element = source.find(_add_ns('Initialization'))
    if init_element is None:
        return
    # Direct indexing is intentional: a missing ``sourceURL`` attribute
    # surfaces as an error instead of being silently ignored.
    ms_info['initialization_url'] = init_element.attrib['sourceURL']
segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None:
extract_common(segment_list)
extract_Initialization(segment_list)
segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
if segment_urls_e:
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
else:
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
extract_common(segment_template)
media_template = segment_template.get('media') media_template = segment_template.get('media')
if media_template: if media_template:
ms_info['media_template'] = media_template ms_info['media_template'] = media_template
@ -1594,11 +1604,14 @@ class InfoExtractor(object):
if initialization: if initialization:
ms_info['initialization_url'] = initialization ms_info['initialization_url'] = initialization
else: else:
initialization = segment_template.find(_add_ns('Initialization')) extract_Initialization(segment_template)
if initialization is not None:
ms_info['initialization_url'] = initialization.attrib['sourceURL']
return ms_info return ms_info
def combine_url(base_url, target_url):
    """Resolve ``target_url`` against ``base_url``.

    ``target_url`` is returned unchanged when it already starts with an
    ``http://`` or ``https://`` scheme. Otherwise it is appended to
    ``base_url``, with a ``/`` separator inserted when ``base_url`` does
    not already end with one.
    """
    # URI schemes are case-insensitive (RFC 3986, section 3.1), so an
    # absolute URL written as ``HTTP://...`` must not be glued onto base_url.
    if re.match(r'^https?://', target_url, re.IGNORECASE):
        return target_url
    separator = '' if base_url.endswith('/') else '/'
    return '%s%s%s' % (base_url, separator, target_url)
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats = [] formats = []
for period in mpd_doc.findall(_add_ns('Period')): for period in mpd_doc.findall(_add_ns('Period')):
@ -1655,9 +1668,7 @@ class InfoExtractor(object):
} }
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
if 'total_number' not in representation_ms_info and 'segment_duration':
segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
media_template = representation_ms_info['media_template'] media_template = representation_ms_info['media_template']
media_template = media_template.replace('$RepresentationID$', representation_id) media_template = media_template.replace('$RepresentationID$', representation_id)
media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
@ -1666,7 +1677,11 @@ class InfoExtractor(object):
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time # can't be used at the same time
if '%(Number' in media_template: if '%(Number' in media_template and 's' not in representation_ms_info:
segment_duration = None
if 'total_number' not in representation_ms_info and 'segment_duration':
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['segment_urls'] = [ representation_ms_info['segment_urls'] = [
media_template % { media_template % {
'Number': segment_number, 'Number': segment_number,
@ -1675,28 +1690,65 @@ class InfoExtractor(object):
for segment_number in range( for segment_number in range(
representation_ms_info['start_number'], representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])] representation_ms_info['total_number'] + representation_ms_info['start_number'])]
representation_ms_info['fragments'] = [{
'url': media_template % {
'Number': segment_number,
'Bandwidth': representation_attrib.get('bandwidth'),
},
'duration': segment_duration,
} for segment_number in range(
representation_ms_info['start_number'],
representation_ms_info['total_number'] + representation_ms_info['start_number'])]
else: else:
# $Number*$ or $Time$ in media template with S list available
# Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
# Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
representation_ms_info['segment_urls'] = [] representation_ms_info['segment_urls'] = []
representation_ms_info['fragments'] = []
segment_time = 0 segment_time = 0
segment_d = None
segment_number = representation_ms_info['start_number']
def add_segment_url(): def add_segment_url():
representation_ms_info['segment_urls'].append( segment_url = media_template % {
media_template % {
'Time': segment_time, 'Time': segment_time,
'Bandwidth': representation_attrib.get('bandwidth'), 'Bandwidth': representation_attrib.get('bandwidth'),
'Number': segment_number,
} }
) representation_ms_info['segment_urls'].append(segment_url)
representation_ms_info['fragments'].append({
'url': segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
for num, s in enumerate(representation_ms_info['s']): for num, s in enumerate(representation_ms_info['s']):
segment_time = s.get('t') or segment_time segment_time = s.get('t') or segment_time
segment_d = s['d']
add_segment_url() add_segment_url()
segment_number += 1
for r in range(s.get('r', 0)): for r in range(s.get('r', 0)):
segment_time += s['d'] segment_time += segment_d
add_segment_url() add_segment_url()
segment_time += s['d'] segment_number += 1
segment_time += segment_d
elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
# No media template
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video
fragments = []
s_num = 0
for segment_url in representation_ms_info['segment_urls']:
s = representation_ms_info['s'][s_num]
for r in range(s.get('r', 0) + 1):
fragments.append({
'url': segment_url,
'duration': float_or_none(s['d'], representation_ms_info['timescale']),
})
representation_ms_info['fragments'] = fragments
if 'segment_urls' in representation_ms_info: if 'segment_urls' in representation_ms_info:
f.update({ f.update({
'segment_urls': representation_ms_info['segment_urls'], 'segment_urls': representation_ms_info['segment_urls'],
'fragments': [],
'protocol': 'http_dash_segments', 'protocol': 'http_dash_segments',
}) })
if 'initialization_url' in representation_ms_info: if 'initialization_url' in representation_ms_info:
@ -1706,6 +1758,10 @@ class InfoExtractor(object):
}) })
if not f.get('url'): if not f.get('url'):
f['url'] = initialization_url f['url'] = initialization_url
f['fragments'].append({'url': initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
for fragment in f['fragments']:
fragment['url'] = combine_url(base_url, fragment['url'])
try: try:
existing_format = next( existing_format = next(
fo for fo in formats fo for fo in formats