Skip to content

Commit 3fb14cd

Browse files
committed
[zdf] Rework extractors (closes ytdl-org#11606, closes ytdl-org#13473, closes ytdl-org#17354, closes ytdl-org#21185, closes ytdl-org#26711, closes ytdl-org#27068, closes ytdl-org#27930, closes ytdl-org#28198, closes ytdl-org#28199, closes ytdl-org#28274)
* Generalize unique video ids for ZDF-based extractors
* Improve extraction
* Fix 3sat and phoenix
1 parent bee6182 commit 3fb14cd

File tree

3 files changed

+276
-285
lines changed

3 files changed

+276
-285
lines changed

youtube_dl/extractor/dreisat.py

Lines changed: 35 additions & 185 deletions
Original file line number | Diff line number | Diff line change
@@ -1,193 +1,43 @@
11
from __future__ import unicode_literals
22

3-
import re
3+
from .zdf import ZDFIE
44

5-
from .common import InfoExtractor
6-
from ..utils import (
7-
int_or_none,
8-
unified_strdate,
9-
xpath_text,
10-
determine_ext,
11-
float_or_none,
12-
ExtractorError,
13-
)
145

15-
16-
class DreiSatIE(InfoExtractor):
6+
class DreiSatIE(ZDFIE):
177
IE_NAME = '3sat'
18-
_GEO_COUNTRIES = ['DE']
19-
_VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
20-
_TESTS = [
21-
{
22-
'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
23-
'md5': 'be37228896d30a88f315b638900a026e',
24-
'info_dict': {
25-
'id': '45918',
26-
'ext': 'mp4',
27-
'title': 'Waidmannsheil',
28-
'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
29-
'uploader': 'SCHWEIZWEIT',
30-
'uploader_id': '100000210',
31-
'upload_date': '20140913'
32-
},
33-
'params': {
34-
'skip_download': True, # m3u8 downloads
35-
}
8+
_VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
9+
_TESTS = [{
10+
# Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html
11+
'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html',
12+
'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
13+
'info_dict': {
14+
'id': '141007_ab18_10wochensommer_film',
15+
'ext': 'mp4',
16+
'title': 'Ab 18! - 10 Wochen Sommer',
17+
'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
18+
'duration': 2660,
19+
'timestamp': 1608604200,
20+
'upload_date': '20201222',
3621
},
37-
{
38-
'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
39-
'only_matching': True,
22+
}, {
23+
'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html',
24+
'info_dict': {
25+
'id': '140913_sendung_schweizweit',
26+
'ext': 'mp4',
27+
'title': 'Waidmannsheil',
28+
'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
29+
'timestamp': 1410623100,
30+
'upload_date': '20140913'
4031
},
41-
]
42-
43-
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
44-
param_groups = {}
45-
for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
46-
group_id = param_group.get(self._xpath_ns(
47-
'id', 'http://www.w3.org/XML/1998/namespace'))
48-
params = {}
49-
for param in param_group:
50-
params[param.get('name')] = param.get('value')
51-
param_groups[group_id] = params
52-
53-
formats = []
54-
for video in smil.findall(self._xpath_ns('.//video', namespace)):
55-
src = video.get('src')
56-
if not src:
57-
continue
58-
bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
59-
group_id = video.get('paramGroup')
60-
param_group = param_groups[group_id]
61-
for proto in param_group['protocols'].split(','):
62-
formats.append({
63-
'url': '%s://%s' % (proto, param_group['host']),
64-
'app': param_group['app'],
65-
'play_path': src,
66-
'ext': 'flv',
67-
'format_id': '%s-%d' % (proto, bitrate),
68-
'tbr': bitrate,
69-
})
70-
self._sort_formats(formats)
71-
return formats
72-
73-
def extract_from_xml_url(self, video_id, xml_url):
74-
doc = self._download_xml(
75-
xml_url, video_id,
76-
note='Downloading video info',
77-
errnote='Failed to download video info')
78-
79-
status_code = xpath_text(doc, './status/statuscode')
80-
if status_code and status_code != 'ok':
81-
if status_code == 'notVisibleAnymore':
82-
message = 'Video %s is not available' % video_id
83-
else:
84-
message = '%s returned error: %s' % (self.IE_NAME, status_code)
85-
raise ExtractorError(message, expected=True)
86-
87-
title = xpath_text(doc, './/information/title', 'title', True)
88-
89-
urls = []
90-
formats = []
91-
for fnode in doc.findall('.//formitaeten/formitaet'):
92-
video_url = xpath_text(fnode, 'url')
93-
if not video_url or video_url in urls:
94-
continue
95-
urls.append(video_url)
96-
97-
is_available = 'http://www.metafilegenerator' not in video_url
98-
geoloced = 'static_geoloced_online' in video_url
99-
if not is_available or geoloced:
100-
continue
101-
102-
format_id = fnode.attrib['basetype']
103-
format_m = re.match(r'''(?x)
104-
(?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
105-
(?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
106-
''', format_id)
107-
108-
ext = determine_ext(video_url, None) or format_m.group('container')
109-
110-
if ext == 'meta':
111-
continue
112-
elif ext == 'smil':
113-
formats.extend(self._extract_smil_formats(
114-
video_url, video_id, fatal=False))
115-
elif ext == 'm3u8':
116-
# the certificates are misconfigured (see
117-
# https://github.com/ytdl-org/youtube-dl/issues/8665)
118-
if video_url.startswith('https://'):
119-
continue
120-
formats.extend(self._extract_m3u8_formats(
121-
video_url, video_id, 'mp4', 'm3u8_native',
122-
m3u8_id=format_id, fatal=False))
123-
elif ext == 'f4m':
124-
formats.extend(self._extract_f4m_formats(
125-
video_url, video_id, f4m_id=format_id, fatal=False))
126-
else:
127-
quality = xpath_text(fnode, './quality')
128-
if quality:
129-
format_id += '-' + quality
130-
131-
abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
132-
vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)
133-
134-
tbr = int_or_none(self._search_regex(
135-
r'_(\d+)k', video_url, 'bitrate', None))
136-
if tbr and vbr and not abr:
137-
abr = tbr - vbr
138-
139-
formats.append({
140-
'format_id': format_id,
141-
'url': video_url,
142-
'ext': ext,
143-
'acodec': format_m.group('acodec'),
144-
'vcodec': format_m.group('vcodec'),
145-
'abr': abr,
146-
'vbr': vbr,
147-
'tbr': tbr,
148-
'width': int_or_none(xpath_text(fnode, './width')),
149-
'height': int_or_none(xpath_text(fnode, './height')),
150-
'filesize': int_or_none(xpath_text(fnode, './filesize')),
151-
'protocol': format_m.group('proto').lower(),
152-
})
153-
154-
geolocation = xpath_text(doc, './/details/geolocation')
155-
if not formats and geolocation and geolocation != 'none':
156-
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
157-
158-
self._sort_formats(formats)
159-
160-
thumbnails = []
161-
for node in doc.findall('.//teaserimages/teaserimage'):
162-
thumbnail_url = node.text
163-
if not thumbnail_url:
164-
continue
165-
thumbnail = {
166-
'url': thumbnail_url,
167-
}
168-
thumbnail_key = node.get('key')
169-
if thumbnail_key:
170-
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
171-
if m:
172-
thumbnail['width'] = int(m.group(1))
173-
thumbnail['height'] = int(m.group(2))
174-
thumbnails.append(thumbnail)
175-
176-
upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))
177-
178-
return {
179-
'id': video_id,
180-
'title': title,
181-
'description': xpath_text(doc, './/information/detail'),
182-
'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
183-
'thumbnails': thumbnails,
184-
'uploader': xpath_text(doc, './/details/originChannelTitle'),
185-
'uploader_id': xpath_text(doc, './/details/originChannelId'),
186-
'upload_date': upload_date,
187-
'formats': formats,
32+
'params': {
33+
'skip_download': True,
18834
}
189-
190-
def _real_extract(self, url):
191-
video_id = self._match_id(url)
192-
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
193-
return self.extract_from_xml_url(video_id, details_url)
35+
}, {
36+
# Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html
37+
'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html',
38+
'only_matching': True,
39+
}, {
40+
# Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
41+
'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html',
42+
'only_matching': True,
43+
}]

youtube_dl/extractor/phoenix.py

Lines changed: 116 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1,45 +1,128 @@
1+
# coding: utf-8
12
from __future__ import unicode_literals
23

3-
from .dreisat import DreiSatIE
4+
import re
45

6+
from .youtube import YoutubeIE
7+
from .zdf import ZDFBaseIE
8+
from ..compat import compat_str
9+
from ..utils import (
10+
int_or_none,
11+
merge_dicts,
12+
unified_timestamp,
13+
xpath_text,
14+
)
515

6-
class PhoenixIE(DreiSatIE):
16+
17+
class PhoenixIE(ZDFBaseIE):
718
IE_NAME = 'phoenix.de'
8-
_VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
9-
(?:
10-
phoenix/die_sendungen/(?:[^/]+/)?
11-
)?
12-
(?P<id>[0-9]+)'''
13-
_TESTS = [
14-
{
15-
'url': 'http://www.phoenix.de/content/884301',
16-
'md5': 'ed249f045256150c92e72dbb70eadec6',
17-
'info_dict': {
18-
'id': '884301',
19-
'ext': 'mp4',
20-
'title': 'Michael Krons mit Hans-Werner Sinn',
21-
'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
22-
'upload_date': '20141025',
23-
'uploader': 'Im Dialog',
24-
}
19+
_VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html'
20+
_TESTS = [{
21+
# Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html
22+
'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html',
23+
'md5': '34ec321e7eb34231fd88616c65c92db0',
24+
'info_dict': {
25+
'id': '210222_phx_nachgehakt_corona_protest',
26+
'ext': 'mp4',
27+
'title': 'Wohin führt der Protest in der Pandemie?',
28+
'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
29+
'duration': 1691,
30+
'timestamp': 1613906100,
31+
'upload_date': '20210221',
32+
'uploader': 'Phoenix',
33+
'channel': 'corona nachgehakt',
2534
},
26-
{
27-
'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
28-
'only_matching': True,
35+
}, {
36+
# Youtube embed
37+
'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html',
38+
'info_dict': {
39+
'id': 'hMQtqFYjomk',
40+
'ext': 'mp4',
41+
'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?',
42+
'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd',
43+
'duration': 3509,
44+
'upload_date': '20201219',
45+
'uploader': 'phoenix',
46+
'uploader_id': 'phoenix',
2947
},
30-
{
31-
'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
32-
'only_matching': True,
48+
'params': {
49+
'skip_download': True,
3350
},
34-
]
51+
}, {
52+
'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html',
53+
'only_matching': True,
54+
}, {
55+
# no media
56+
'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html',
57+
'only_matching': True,
58+
}, {
59+
# Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html
60+
'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche',
61+
'only_matching': True,
62+
}]
3563

3664
def _real_extract(self, url):
37-
video_id = self._match_id(url)
38-
webpage = self._download_webpage(url, video_id)
65+
article_id = self._match_id(url)
66+
67+
article = self._download_json(
68+
'https://www.phoenix.de/response/id/%s' % article_id, article_id,
69+
'Downloading article JSON')
70+
71+
video = article['absaetze'][0]
72+
title = video.get('titel') or article.get('subtitel')
73+
74+
if video.get('typ') == 'video-youtube':
75+
video_id = video['id']
76+
return self.url_result(
77+
video_id, ie=YoutubeIE.ie_key(), video_id=video_id,
78+
video_title=title)
79+
80+
video_id = compat_str(video.get('basename') or video.get('content'))
3981

40-
internal_id = self._search_regex(
41-
r'<div class="phx_vod" id="phx_vod_([0-9]+)"',
42-
webpage, 'internal video ID')
82+
details = self._download_xml(
83+
'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
84+
video_id, 'Downloading details XML', query={
85+
'ak': 'web',
86+
'ptmd': 'true',
87+
'id': video_id,
88+
'profile': 'player2',
89+
})
90+
91+
title = title or xpath_text(
92+
details, './/information/title', 'title', fatal=True)
93+
content_id = xpath_text(
94+
details, './/video/details/basename', 'content id', fatal=True)
95+
96+
info = self._extract_ptmd(
97+
'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id,
98+
content_id, None, url)
99+
100+
timestamp = unified_timestamp(xpath_text(details, './/details/airtime'))
101+
102+
thumbnails = []
103+
for node in details.findall('.//teaserimages/teaserimage'):
104+
thumbnail_url = node.text
105+
if not thumbnail_url:
106+
continue
107+
thumbnail = {
108+
'url': thumbnail_url,
109+
}
110+
thumbnail_key = node.get('key')
111+
if thumbnail_key:
112+
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
113+
if m:
114+
thumbnail['width'] = int(m.group(1))
115+
thumbnail['height'] = int(m.group(2))
116+
thumbnails.append(thumbnail)
43117

44-
api_url = 'http://www.phoenix.de/php/mediaplayer/data/beitrags_details.php?ak=web&id=%s' % internal_id
45-
return self.extract_from_xml_url(video_id, api_url)
118+
return merge_dicts(info, {
119+
'id': content_id,
120+
'title': title,
121+
'description': xpath_text(details, './/information/detail'),
122+
'duration': int_or_none(xpath_text(details, './/details/lengthSec')),
123+
'thumbnails': thumbnails,
124+
'timestamp': timestamp,
125+
'uploader': xpath_text(details, './/details/channel'),
126+
'uploader_id': xpath_text(details, './/details/originChannelId'),
127+
'channel': xpath_text(details, './/details/originChannelTitle'),
128+
})

0 commit comments

Comments
 (0)