Compare commits


12 Commits

Author SHA1 Message Date
dirkf
1fa8b86f0b [utils] Remove stray undocumented Host header in redirect (fix 46fde7c) 2023-07-20 05:29:59 +01:00
dirkf
b2ba24bb02 [InfoExtractor] Add _match_valid_url() class method and refactor
* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
2023-07-19 22:14:50 +01:00
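
For illustration, a minimal sketch of the new API from the extractor side (the class name and URLs here are invented, not part of the commit):

    from youtube_dl.extractor.common import InfoExtractor

    class ExampleIE(InfoExtractor):  # hypothetical extractor
        # _VALID_URL may now be a Sequence of alternative patterns
        _VALID_URL = (
            r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
            r'https?://example\.tv/v/(?P<id>\d+)',
        )

        def _real_extract(self, url):
            # _match_valid_url() compiles each pattern once, caches the
            # compiled tuple on the class, and returns the first re.Match
            mobj = self._match_valid_url(url)
            video_id = mobj.group('id')
            return {'id': video_id, 'title': video_id, 'url': url}

A plain string _VALID_URL keeps working: variadic() wraps it into a one-element tuple.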
dirkf
a190b55964 [utils] Fix broken Py 3.11+ compat in traverse_obj()
* inspect.getargspec is missing despite doc claiming backward compat
* replace with emulation of `Signature.bind()`
2023-07-19 22:14:50 +01:00
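
A sketch of what the replacement does on Py 3.3+, where inspect.signature exists (the emulation path for older Pythons is in the utils.py diff below):

    import inspect

    def _try_bind_args(fn, *args, **kwargs):
        # Signature.bind() raises TypeError if the arguments can't bind
        inspect.signature(fn).bind(*args, **kwargs)

    _try_bind_args(lambda k, v: None, 'key', 'value')  # binds: no error
    # _try_bind_args(lambda k: None, 'key', 'value')   # would raise TypeError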
dirkf
b2741f2654 [InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386
  thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
  thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from
  222a230871,
  thanks Lesmiscore
* update extractors in PR using above, fix tests.
2023-07-19 22:14:50 +01:00
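
A sketch of how an extractor might call the new helpers (extractor class and JSON paths invented for illustration):

    from youtube_dl.extractor.common import InfoExtractor

    class ExampleNextIE(InfoExtractor):  # hypothetical
        _VALID_URL = r'https?://(?:www\.)?example\.com/v/(?P<id>\w+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            # Next.js: parses the JSON in <script id="__NEXT_DATA__">...</script>
            data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']
            # a Nuxt.js site would instead use self._search_nuxt_data(webpage, video_id),
            # which evaluates the window.__NUXT__=(function(...){...})(...) IIFE
            return {
                'id': video_id,
                'title': data['video']['title'],      # invented path
                'url': data['video']['streamUrl'],    # invented path
            }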
dirkf
8465222041 [Clipchamp] Add new extractor back-ported from yt-dlp 2023-07-19 22:14:50 +01:00
dirkf
4339910df3 [DLF] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6697, thanks nick-cd
2023-07-19 22:14:50 +01:00
dirkf
eaaf4c6736 [Whyp] Add extractor back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6803, thanks CoryTibbettsDev
2023-07-19 22:14:50 +01:00
dirkf
4566e6e53e [GlobalPlayer] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6903, thanks garret1317
2023-07-19 22:14:50 +01:00
dirkf
1e8ccdd2eb [InfoExtractor] Support groups in _search_regex(), etc 2023-07-19 22:14:50 +01:00
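
A runnable sketch of the new calling convention (the sample HTML is invented): passing a list/tuple as `group` now returns a tuple of the matched groups:

    from youtube_dl.extractor.common import InfoExtractor

    ie = InfoExtractor()
    html = '<a href="/user/u42" class="author">Alice</a>'  # invented sample
    name, uid = ie._search_regex(
        r'<a href="/user/(?P<uid>[^"]+)"[^>]*>(?P<name>[^<]+)</a>',
        html, 'uploader', group=('name', 'uid'))
    assert (name, uid) == ('Alice', 'u42')

_html_search_regex() follows suit, applying clean_html() to each member of the tuple.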
dirkf
cb9366eda5 [utils] Minor updates (merge_dicts, T)
A couple of mods to ease yt-dlp back-ports:
* add kwargs to merge_dicts:
  `unblank=True` (disallow empty string), `rev=False` (reverse the merge list)
* add `T(x)` shortcut for `{x}`, unsupported in Py2.6
2023-07-19 22:14:50 +01:00
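
A runnable sketch of the new keywords (values invented):

    from youtube_dl.utils import merge_dicts, T, traverse_obj

    # rev=True reverses the merge order, so later dicts win, as in {**d1, **d2};
    # with the default unblank=True an empty string never beats real text
    info = merge_dicts({'title': ''}, {'title': 'Real title'}, rev=True)
    assert info['title'] == 'Real title'

    # T(x) is shorthand for the one-element set {x} that traverse_obj() uses
    # for transformations, which Py2.6 cannot spell as a set literal
    assert traverse_obj({'n': '123'}, ('n', T(int))) == 123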
dirkf
d9d07a9581 [utils] Improve js_to_json, align with yt-dlp
* support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/#521 etc,
  thanks ChillingPepper, Grub4k, pukkandan
* improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/#521
  thanks Grub4k
* support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623
  thanks Grub4k
* add limited `!` evaluation (eg, !!0 -> false, see tests)
2023-07-19 22:14:50 +01:00
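
A few runnable examples of the new behaviour (inputs invented; see also the test_utils.py changes below):

    import json
    from youtube_dl.utils import js_to_json

    # variable substitution: vars maps identifiers to JSON texts
    assert json.loads(js_to_json('{user: name}', {'name': '"jo"'})) == {'user': 'jo'}
    # template strings
    assert json.loads(js_to_json('`id-${n}`', {'n': '7'})) == 'id-7'
    # limited `!` evaluation
    assert json.loads(js_to_json('{a: !!0, b: !0}')) == {'a': False, 'b': True}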
dirkf
825a40744b [utils] Align traverse_obj() with yt-dlp
Thanks Grub4k for these:
* traverse `Iterable`s, from https://github.com/yt-dlp/yt-dlp/pull/6902, etc
* traverse `set` key for transformations/filters, `re.Match` group names, from
  776995bc10, etc
* traverse `re.Match`es, from https://github.com/yt-dlp/yt-dlp/pull/5174
* always return list when branching, from https://github.com/yt-dlp/yt-dlp/pull/5170
2023-07-19 22:14:50 +01:00
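
A runnable sketch of the added behaviours (data invented):

    import re
    from youtube_dl.utils import T, traverse_obj

    data = {'items': [{'id': 1}, {'id': 2}], 'name': ' x '}
    # branching with `...` (Ellipsis) always returns a list
    assert traverse_obj(data, ('items', Ellipsis, 'id')) == [1, 2]
    # a `set` key, built with T(), applies a transformation
    assert traverse_obj(data, ('name', T(lambda s: s.strip()))) == 'x'
    # re.Match objects are traversable by group name or number
    m = re.match(r'(?P<k>\w+)=(?P<v>\w+)', 'a=1')
    assert traverse_obj(m, 'k') == 'a'
    assert traverse_obj(m, 2) == '1'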
10 changed files with 1098 additions and 95 deletions

devscripts/make_lazy_extractors.py

@@ -4,6 +4,7 @@ from inspect import getsource
 import io
 import os
 from os.path import dirname as dirn
+import re
 import sys

 print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
 with open('devscripts/lazy_load_template.py', 'rt') as f:
     module_template = f.read()

+
+def get_source(m):
+    return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
+
+
 module_contents = [
-    module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
+    module_template,
+    get_source(InfoExtractor.suitable),
+    get_source(InfoExtractor._match_valid_url) + '\n',
     'class LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n',
     # needed for suitable() methods of Youtube extractor (see #28780)
-    'from youtube_dl.utils import parse_qs\n',
+    'from youtube_dl.utils import parse_qs, variadic\n',
 ]

 ie_template = '''
@@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
         valid_url=valid_url,
         module=ie.__module__)
     if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
-        s += '\n' + getsource(ie.suitable)
+        s += '\n' + get_source(ie.suitable)
     if hasattr(ie, '_make_valid_url'):
         # search extractors
         s += make_valid_template.format(valid_url=ie._make_valid_url())

test/test_InfoExtractor.py

@@ -7,15 +7,33 @@ import io
 import os
 import sys
 import unittest

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
-from youtube_dl.compat import compat_etree_fromstring, compat_http_server
-from youtube_dl.extractor.common import InfoExtractor
-from youtube_dl.extractor import YoutubeIE, get_info_extractor
-from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
 import threading

+from test.helper import (
+    expect_dict,
+    expect_value,
+    FakeYDL,
+    http_server_port,
+)
+from youtube_dl.compat import (
+    compat_etree_fromstring,
+    compat_http_server,
+)
+from youtube_dl.extractor.common import InfoExtractor
+from youtube_dl.extractor import (
+    get_info_extractor,
+    YoutubeIE,
+)
+from youtube_dl.utils import (
+    encode_data_uri,
+    ExtractorError,
+    RegexNotFoundError,
+    strip_jsonp,
+)

 TEAPOT_RESPONSE_STATUS = 418
 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
@@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)

+    def test_search_nextjs_data(self):
+        html = '''
+        <!DOCTYPE html>
+        <html>
+        <head>
+        <meta http-equiv="content-type" content=
+        "text/html; charset=utf-8">
+        <meta name="viewport" content="width=device-width">
+        <title>Test _search_nextjs_data()</title>
+        </head>
+        <body>
+        <div id="__next">
+        <div style="background-color:#17171E" class="FU" dir="ltr">
+        <div class="sc-93de261d-0 dyzzYE">
+        <div>
+        <header class="HD"></header>
+        <main class="MN">
+        <div style="height:0" class="HT0">
+        <div style="width:NaN%" data-testid=
+        "stream-container" class="WDN"></div>
+        </div>
+        </main>
+        </div>
+        <footer class="sc-6e5faf91-0 dEGaHS"></footer>
+        </div>
+        </div>
+        </div>
+        <script id="__NEXT_DATA__" type="application/json">
+        {"props":{"pageProps":{"video":{"id":"testid"}}}}
+        </script>
+        </body>
+        </html>
+        '''
+        search = self.ie._search_nextjs_data(html, 'testID')
+        self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
+
+    def test_search_nuxt_data(self):
+        html = '''
+        <!DOCTYPE html>
+        <html>
+        <head>
+        <meta http-equiv="content-type" content=
+        "text/html; charset=utf-8">
+        <title>Nuxt.js Test Page</title>
+        <meta name="viewport" content=
+        "width=device-width, initial-scale=1">
+        <meta data-hid="robots" name="robots" content="all">
+        </head>
+        <body class="BD">
+        <div id="__layout">
+        <h1 class="H1">Example heading</h1>
+        <div class="IN">
+        <p>Decoy text</p>
+        </div>
+        </div>
+        <script>
+        window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null));
+        </script>
+        <script src="/_nuxt/a12345b.js" defer="defer"></script>
+        </body>
+        </html>
+        '''
+        search = self.ie._search_nuxt_data(html, 'testID')
+        self.assertEqual(search['track']['id'], 'testid')
+
     def test_search_json_ld_realworld(self):
         # https://github.com/ytdl-org/youtube-dl/issues/23306
         expect_dict(
@@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):
             }],
         })

+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(

test/test_utils.py

@@ -20,7 +20,7 @@ import xml.etree.ElementTree
 from youtube_dl.utils import (
     age_restricted,
     args_to_str,
-    encode_base_n,
+    base_url,
     caesar,
     clean_html,
     clean_podcast_url,
@@ -29,10 +29,12 @@ from youtube_dl.utils import (
     detect_exe_version,
     determine_ext,
     dict_get,
+    encode_base_n,
     encode_compat_str,
     encodeFilename,
     escape_rfc3986,
     escape_url,
+    expand_path,
     extract_attributes,
     ExtractorError,
     find_xpath_attr,
@@ -51,6 +53,7 @@ from youtube_dl.utils import (
     js_to_json,
     LazyList,
     limit_length,
+    lowercase_escape,
     merge_dicts,
     mimetype2ext,
     month_by_name,
@@ -66,17 +69,16 @@ from youtube_dl.utils import (
     parse_resolution,
     parse_bitrate,
     pkcs1pad,
-    read_batch_urls,
-    sanitize_filename,
-    sanitize_path,
-    sanitize_url,
-    expand_path,
     prepend_extension,
-    replace_extension,
+    read_batch_urls,
     remove_start,
     remove_end,
     remove_quotes,
+    replace_extension,
     rot47,
+    sanitize_filename,
+    sanitize_path,
+    sanitize_url,
     shell_quote,
     smuggle_url,
     str_or_none,
@@ -93,10 +95,8 @@ from youtube_dl.utils import (
     unified_timestamp,
     unsmuggle_url,
     uppercase_escape,
-    lowercase_escape,
     url_basename,
     url_or_none,
-    base_url,
     urljoin,
     urlencode_postdata,
     urshift,
@@ -905,6 +905,85 @@ class TestUtil(unittest.TestCase):
         )
         self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')

+    def test_js_to_json_vars_strings(self):
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'null': a,
+                    'nullStr': b,
+                    'true': c,
+                    'trueStr': d,
+                    'false': e,
+                    'falseStr': f,
+                    'unresolvedVar': g,
+                }''',
+                {
+                    'a': 'null',
+                    'b': '"null"',
+                    'c': 'true',
+                    'd': '"true"',
+                    'e': 'false',
+                    'f': '"false"',
+                    'g': 'var',
+                }
+            )),
+            {
+                'null': None,
+                'nullStr': 'null',
+                'true': True,
+                'trueStr': 'true',
+                'false': False,
+                'falseStr': 'false',
+                'unresolvedVar': 'var'
+            }
+        )
+
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'int': a,
+                    'intStr': b,
+                    'float': c,
+                    'floatStr': d,
+                }''',
+                {
+                    'a': '123',
+                    'b': '"123"',
+                    'c': '1.23',
+                    'd': '"1.23"',
+                }
+            )),
+            {
+                'int': 123,
+                'intStr': '123',
+                'float': 1.23,
+                'floatStr': '1.23',
+            }
+        )
+
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'object': a,
+                    'objectStr': b,
+                    'array': c,
+                    'arrayStr': d,
+                }''',
+                {
+                    'a': '{}',
+                    'b': '"{}"',
+                    'c': '[]',
+                    'd': '"[]"',
+                }
+            )),
+            {
+                'object': {},
+                'objectStr': '{}',
+                'array': [],
+                'arrayStr': '[]',
+            }
+        )
+
     def test_js_to_json_realworld(self):
         inp = '''{
             'clip':{'provider':'pseudo'}
@@ -975,10 +1054,10 @@ class TestUtil(unittest.TestCase):
             !42: 42
         }''')
         self.assertEqual(json.loads(on), {
-            'a': 0,
-            'b': 1,
-            'c': 0,
-            'd': 42.42,
+            'a': True,
+            'b': False,
+            'c': False,
+            'd': True,
             'e': [],
             'f': "abc",
             'g': "",
@@ -1048,10 +1127,26 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{ "040": "040" }')
         self.assertEqual(json.loads(on), {'040': '040'})

+        on = js_to_json('[1,//{},\n2]')
+        self.assertEqual(json.loads(on), [1, 2])
+
+        on = js_to_json(r'"\^\$\#"')
+        self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
+
+        on = js_to_json('\'"\\""\'')
+        self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')

+    def test_js_to_json_template_literal(self):
+        self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
+        self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
+        self.assertEqual(js_to_json('`${name}`', {}), '"name"')
+
     def test_extract_attributes(self):
         self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
         self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@@ -1586,6 +1681,11 @@ Line 1
             'dict': {},
         }

+        # define a pukka Iterable
+        def iter_range(stop):
+            for from_ in range(stop):
+                yield from_
+
         # Test base functionality
         self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str',
                          msg='allow tuple path')
@@ -1602,13 +1702,13 @@ Line 1
         # Test Ellipsis behavior
         self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis),
                               (item for item in _TEST_DATA.values() if item not in (None, {})),
-                              msg='`...` should give all non discarded values')
+                              msg='`...` should give all non-discarded values')
         self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(),
                               msg='`...` selection for dicts should select all values')
         self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')),
                          ['https://www.example.com/0', 'https://www.example.com/1'],
                          msg='nested `...` queries should work')
-        self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4),
+        self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), iter_range(4),
                               msg='`...` query result should be flattened')
         self.assertEqual(traverse_obj(iter(range(4)), Ellipsis), list(range(4)),
                          msg='`...` should accept iterables')
@@ -1618,7 +1718,7 @@ Line 1
                          [_TEST_DATA['urls']],
                          msg='function as query key should perform a filter based on (key, value)')
         self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), set(('str',)),
-                              msg='exceptions in the query function should be catched')
+                              msg='exceptions in the query function should be caught')
         self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
                          msg='function key should accept iterables')
         if __debug__:
@@ -1706,7 +1806,7 @@ Line 1
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {},
                          msg='remove empty values when dict key')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: Ellipsis},
-                         msg='use `default` when dict key and `default`')
+                         msg='use `default` when dict key and a default')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {},
                          msg='remove empty values when nested dict key fails')
         self.assertEqual(traverse_obj(None, {0: 'fail'}), {},
@@ -1768,7 +1868,7 @@ Line 1
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str),
                          'str', msg='accept matching `expected_type` type')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int),
-                         None, msg='reject non matching `expected_type` type')
+                         None, msg='reject non-matching `expected_type` type')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)),
                          '0', msg='transform type using type function')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0),
@@ -1780,7 +1880,7 @@ Line 1
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none),
                          {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, set((int_or_none,))), expected_type=int),
-                         1, msg='expected_type should not filter non final dict values')
+                         1, msg='expected_type should not filter non-final dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int),
                          {0: {0: 100}}, msg='expected_type should transform deep dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(Ellipsis)),
@@ -1838,7 +1938,7 @@ Line 1
         self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)),
                                       _traverse_string=True), 'sr',
                          msg='`slice` should result in string if `traverse_string`')
-        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"),
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == 's'),
                                       _traverse_string=True), 'str',
                          msg='function should result in string if `traverse_string`')
         self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),

youtube_dl/extractor/clipchamp.py (new file, 69 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    merge_dicts,
    T,
    traverse_obj,
    unified_timestamp,
    url_or_none,
)


class ClipchampIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
        'info_dict': {
            'id': 'gRXZ4ZhdDaU',
            'ext': 'mp4',
            'title': 'Untitled video',
            'uploader': 'Alexander Schwartz',
            'timestamp': 1680805580,
            'upload_date': '20230406',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {
            'skip_download': 'm3u8',
            'format': 'bestvideo',
        },
    }]

    _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
    _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']

        storage_location = data.get('storage_location')
        if storage_location != 'cf_stream':
            raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))

        path = data['download_url']
        iframe = self._download_webpage(
            'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe')
        subdomain = self._search_regex(
            r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''', iframe,
            'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'

        formats = self._extract_mpd_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
            query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
        formats.extend(self._extract_m3u8_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
            query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))

        return merge_dicts({
            'id': video_id,
            'formats': formats,
            'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None,
        }, traverse_obj(data, {
            'title': ('project', 'project_name', T(compat_str)),
            'timestamp': ('created_at', T(unified_timestamp)),
            'thumbnail': ('thumbnail_url', T(url_or_none)),
        }), rev=True)

youtube_dl/extractor/common.py

@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 import base64
 import datetime
+import functools
 import hashlib
 import json
 import netrc
@@ -23,6 +24,7 @@ from ..compat import (
     compat_getpass,
     compat_integer_types,
     compat_http_client,
+    compat_map as map,
     compat_os_name,
     compat_str,
     compat_urllib_error,
@@ -31,6 +33,7 @@ from ..compat import (
     compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
+    compat_zip as zip,
 )
 from ..downloader.f4m import (
     get_base_url,
@@ -70,6 +73,7 @@ from ..utils import (
     str_or_none,
     str_to_int,
     strip_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     unified_strdate,
@@ -79,6 +83,7 @@ from ..utils import (
     urljoin,
     url_basename,
     url_or_none,
+    variadic,
     xpath_element,
     xpath_text,
     xpath_with_ns,
@@ -367,9 +372,22 @@ class InfoExtractor(object):
     title, description etc.

-    Subclasses of this one should re-define the _real_initialize() and
-    _real_extract() methods and define a _VALID_URL regexp.
-    Probably, they should also be added to the list of extractors.
+    A subclass of InfoExtractor must be defined to handle each specific site (or
+    several sites). Such a concrete subclass should be added to the list of
+    extractors. It should also:
+    * define its _VALID_URL attribute as a regexp, or a Sequence of alternative
+      regexps (but see below)
+    * re-define the _real_extract() method
+    * optionally re-define the _real_initialize() method.
+
+    An extractor subclass may also override suitable() if necessary, but the
+    function signature must be preserved and the function must import everything
+    it needs (except other extractors), so that lazy_extractors works correctly.
+    If the subclass's suitable() and _real_extract() functions avoid using
+    _VALID_URL, the subclass need not set that class attribute.
+
+    An abstract subclass of InfoExtractor may be used to simplify implementation
+    within an extractor module; it should not be added to the list of extractors.

     _GEO_BYPASS attribute may be set to False in order to disable
     geo restriction bypass mechanisms for a particular extractor.
@@ -404,22 +422,33 @@ class InfoExtractor(object):
         self._x_forwarded_for_ip = None
         self.set_downloader(downloader)

+    @classmethod
+    def __match_valid_url(cls, url):
+        # This does not use has/getattr intentionally - we want to know whether
+        # we have cached the regexp for cls, whereas getattr would also
+        # match its superclass
+        if '_VALID_URL_RE' not in cls.__dict__:
+            # _VALID_URL can now be a list/tuple of patterns
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
+        for p in cls._VALID_URL_RE:
+            p = p.match(url)
+            if p:
+                return p
+
+    # The public alias can safely be overridden, as in some back-ports
+    _match_valid_url = __match_valid_url
+
     @classmethod
     def suitable(cls, url):
         """Receives a URL and returns True if suitable for this IE."""
-
-        # This does not use has/getattr intentionally - we want to know whether
-        # we have cached the regexp for *this* class, whereas getattr would also
-        # match the superclass
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url) is not None
+        # This function must import everything it needs (except other extractors),
+        # so that lazy_extractors works correctly
+        return cls.__match_valid_url(url) is not None

     @classmethod
     def _match_id(cls, url):
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        m = cls._VALID_URL_RE.match(url)
+        m = cls.__match_valid_url(url)
         assert m
         return compat_str(m.group('id'))
@@ -1005,6 +1034,8 @@ class InfoExtractor(object):
             if group is None:
                 # return the first matching group
                 return next(g for g in mobj.groups() if g is not None)
+            elif isinstance(group, (list, tuple)):
+                return tuple(mobj.group(g) for g in group)
             else:
                 return mobj.group(group)
         elif default is not NO_DEFAULT:
@@ -1020,10 +1051,9 @@ class InfoExtractor(object):
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
-        if res:
-            return clean_html(res).strip()
-        else:
-            return res
+        if isinstance(res, tuple):
+            return tuple(map(clean_html, res))
+        return clean_html(res)

     def _get_netrc_login_info(self, netrc_machine=None):
         username = None
@@ -1348,6 +1378,44 @@ class InfoExtractor(object):
                 break
         return dict((k, v) for k, v in info.items() if v is not None)

+    def _search_nextjs_data(self, webpage, video_id, **kw):
+        nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
+        kw.pop('transform_source', None)
+        next_data = self._search_regex(
+            r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
+            webpage, 'next.js data', group='nd', **kw)
+        if not next_data:
+            return {}
+        return self._parse_json(next_data, video_id, **nkw)
+
+    def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
+        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
+
+        # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
+        context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
+        fatal = kwargs.get('fatal', True)
+        traverse = kwargs.get('traverse', ('data', 0))
+
+        re_ctx = re.escape(context_name)
+
+        FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
+                       r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
+
+        js, arg_keys, arg_vals = self._search_regex(
+            (p.format(re_ctx, FUNCTION_RE) for p in
+             (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
+              r'{0}\s*\([\s\S]*?{1}')),
+            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
+            default=NO_DEFAULT if fatal else (None, None, None))
+        if js is None:
+            return {}
+
+        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+            '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
+
+        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+        return traverse_obj(ret, traverse) or {}
+
     @staticmethod
     def _hidden_inputs(html):
         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -2495,7 +2563,8 @@ class InfoExtractor(object):
                     return f
             return {}

-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
             full_url = absolute_url(src)
             ext = type_info.get('ext') or determine_ext(full_url)
             if ext == 'm3u8':
@@ -2513,6 +2582,7 @@ class InfoExtractor(object):
                 formats = [{
                     'url': full_url,
                     'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                 }]
             return is_plain_url, formats

@@ -2521,7 +2591,7 @@ class InfoExtractor(object):
         # so we wll include them right here (see
         # https://www.ampproject.org/docs/reference/components/amp-video)
         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
-        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
         media_tags = [(media_tag, media_tag_name, media_type, '')
                       for media_tag, media_tag_name, media_type
                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@@ -2539,7 +2609,8 @@ class InfoExtractor(object):
             media_attributes = extract_attributes(media_tag)
             src = strip_or_none(media_attributes.get('src'))
             if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
                 media_info['formats'].extend(formats)
             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
         if media_content:

youtube_dl/extractor/dlf.py (new file, 204 lines)

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
)
from ..utils import (
    determine_ext,
    extract_attributes,
    int_or_none,
    merge_dicts,
    traverse_obj,
    url_or_none,
    variadic,
)


class DLFBaseIE(InfoExtractor):
    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'

    def _parse_button_attrs(self, button, audio_id=None):
        attrs = extract_attributes(button)
        audio_id = audio_id or attrs['data-audio-diraid']

        url = traverse_obj(
            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
            'data-audio-src', expected_type=url_or_none)
        ext = determine_ext(url)
        formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
                   if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
        self._sort_formats(formats)

        def traverse_attrs(path):
            path = list(variadic(path))
            t = path.pop() if callable(path[-1]) else None
            return traverse_obj(attrs, path, expected_type=t, get_all=False)

        def txt_or_none(v, default=None):
            return default if v is None else (compat_str(v).strip() or default)

        return merge_dicts(*reversed([{
            'id': audio_id,
            # 'extractor_key': DLFIE.ie_key(),
            # 'extractor': DLFIE.IE_NAME,
            'formats': formats,
        }, dict((k, traverse_attrs(v)) for k, v in {
            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
            'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
            'thumbnail': ('data-audioimage', url_or_none),
            'uploader': 'data-audio-producer',
            'series': 'data-audio-series',
            'channel': 'data-audio-origin-site-name',
            'webpage_url': ('data-audio-download-tracking-path', url_or_none),
        }.items())]))


class DLFIE(DLFBaseIE):
    IE_NAME = 'dlf'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
    _TESTS = [
        # Audio as an HLS stream
        {
            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
            'info_dict': {
                'id': '03a3eb19',
                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
                'ext': 'm4a',
                'duration': 3298,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'On Stage',
                'channel': 'deutschlandfunk'
            },
            'params': {
                'skip_download': 'm3u8'
            },
            'skip': 'This webpage no longer exists'
        }, {
            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
            'info_dict': {
                'id': 'd9cc1856',
                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
                'ext': 'mp3',
                'duration': 291,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'Kommentare und Themen der Woche',
                'channel': 'deutschlandfunk'
            }
        },
    ]

    def _real_extract(self, url):
        audio_id = self._match_id(url)
        webpage = self._download_webpage(url, audio_id)

        return self._parse_button_attrs(
            self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)


class DLFCorpusIE(DLFBaseIE):
    IE_NAME = 'dlf:corpus'
    IE_DESC = 'DLF Multi-feed Archives'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
    _TESTS = [
        # Recorded news broadcast with referrals to related broadcasts
        {
            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
            'info_dict': {
                'id': 'fechten-russland-belarus-ukraine-protest-100',
                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
            },
            'playlist_mincount': 5,
            'playlist': [{
                'info_dict': {
                    'id': '1fc5d64a',
                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                    'ext': 'mp3',
                    'duration': 252,
                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '2ada145f',
                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
                    'ext': 'mp3',
                    'duration': 336,
                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Deutschlandfunk Nova',
                    'channel': 'deutschlandfunk-nova'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '47e1a096',
                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
                    'ext': 'mp3',
                    'duration': 602,
                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }]
        },
        # Podcast feed with tag buttons, playlist count fluctuates
        {
            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
            'info_dict': {
                'id': 'kommentare-und-themen-der-woche-100',
                'title': 'Meinung - Kommentare und Themen der Woche',
                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
            },
            'playlist_mincount': 10,
        },
        # Podcast feed with no description
        {
            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
            'info_dict': {
                'id': 'podcast-tolle-idee-100',
                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
            },
            'playlist_mincount': 11,
        },
    ]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        return self.playlist_result(
            map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
            playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
            self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))

youtube_dl/extractor/extractors.py

@@ -226,6 +226,7 @@ from .ciscolive import (
     CiscoLiveSearchIE,
 )
 from .cjsw import CJSWIE
+from .clipchamp import ClipchampIE
 from .cliphunter import CliphunterIE
 from .clippit import ClippitIE
 from .cliprs import ClipRsIE
@@ -295,6 +296,10 @@ from .dbtv import DBTVIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .democracynow import DemocracynowIE
+from .dlf import (
+    DLFCorpusIE,
+    DLFIE,
+)
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .digg import DiggIE
@@ -444,6 +449,13 @@ from .gfycat import GfycatIE
 from .giantbomb import GiantBombIE
 from .giga import GigaIE
 from .glide import GlideIE
+from .globalplayer import (
+    GlobalPlayerLiveIE,
+    GlobalPlayerLivePlaylistIE,
+    GlobalPlayerAudioIE,
+    GlobalPlayerAudioEpisodeIE,
+    GlobalPlayerVideoIE
+)
 from .globo import (
     GloboIE,
     GloboArticleIE,
@@ -975,6 +987,10 @@ from .pornhub import (
 from .pornotube import PornotubeIE
 from .pornovoisines import PornoVoisinesIE
 from .pornoxo import PornoXOIE
+from .pr0gramm import (
+    Pr0grammIE,
+    Pr0grammStaticIE,
+)
 from .puhutv import (
     PuhuTVIE,
     PuhuTVSerieIE,
@@ -1565,6 +1581,7 @@ from .weibo import (
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
+from .whyp import WhypIE
 from .wistia import (
     WistiaIE,
     WistiaPlaylistIE,
@@ -1678,7 +1695,3 @@ from .zingmp3 import (
 )
 from .zoom import ZoomIE
 from .zype import ZypeIE
-from .pr0gramm import (
-    Pr0grammIE,
-    Pr0grammStaticIE,
-)

youtube_dl/extractor/globalplayer.py (new file, 273 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    join_nonempty,
    merge_dicts,
    parse_duration,
    str_or_none,
    T,
    traverse_obj,
    unified_strdate,
    unified_timestamp,
    urlhandle_detect_ext,
)


class GlobalPlayerBaseIE(InfoExtractor):

    def _get_page_props(self, url, video_id):
        webpage = self._download_webpage(url, video_id)
        return self._search_nextjs_data(webpage, video_id)['props']['pageProps']

    def _request_ext(self, url, video_id):
        return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests
            url, video_id, note='Determining source extension'))

    @staticmethod
    def _clean_desc(x):
        x = clean_html(x)
        if x:
            x = x.replace('\xa0', ' ')
        return x

    def _extract_audio(self, episode, series):
        return merge_dicts({
            'vcodec': 'none',
        }, traverse_obj(series, {
            'series': 'title',
            'series_id': 'id',
            'thumbnail': 'imageUrl',
            'uploader': 'itunesAuthor',  # podcasts only
        }), traverse_obj(episode, {
            'id': 'id',
            'description': ('description', T(self._clean_desc)),
            'duration': ('duration', T(parse_duration)),
            'thumbnail': 'imageUrl',
            'url': 'streamUrl',
            'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
            'title': 'title',
        }, get_all=False), rev=True)


class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
        'info_dict': {
            'id': '2mx1E',
            'ext': 'aac',
            'display_id': 'smoothchill-uk',
            'title': 're:^Smooth Chill.+$',
            'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
            'description': 'Music To Chill To',
            # 'live_status': 'is_live',
            'is_live': True,
        },
    }, {
        # national station
        'url': 'https://www.globalplayer.com/live/heart/uk/',
        'info_dict': {
            'id': '2mwx4',
            'ext': 'aac',
            'description': 'turn up the feel good!',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            # 'live_status': 'is_live',
            'is_live': True,
            'title': 're:^Heart UK.+$',
            'display_id': 'heart-uk',
        },
    }, {
        # regional variation
        'url': 'https://www.globalplayer.com/live/heart/london/',
        'info_dict': {
            'id': 'AMqg',
            'ext': 'aac',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'title': 're:^Heart London.+$',
            # 'live_status': 'is_live',
            'is_live': True,
            'display_id': 'heart-london',
            'description': 'turn up the feel good!',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['station']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': station['id'],
            'display_id': (
                join_nonempty('brandSlug', 'slug', from_dict=station)
                or station.get('legacyStationPrefix')),
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, {
            'title': self._live_title(traverse_obj(
                station, (('name', 'brandName'), T(str_or_none)),
                get_all=False)),
        }, traverse_obj(station, {
            'description': 'tagline',
            'thumbnail': 'brandLogo',
        }), rev=True)


class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
    _TESTS = [{
        # "live playlist"
        'url': 'https://www.globalplayer.com/playlists/8bLk/',
        'info_dict': {
            'id': '8bLk',
            'ext': 'aac',
            # 'live_status': 'is_live',
            'is_live': True,
            'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
            'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
            'title': 're:Classic FM Hall of Fame.+$'
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['playlistData']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': video_id,
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, traverse_obj(station, {
            'title': 'title',
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'image',
        }), rev=True)


class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '42KuaM',
            'title': 'Filthy Ritual',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'categories': ['Society & Culture', 'True Crime'],
            'uploader': 'Global',
            'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
        'playlist_mincount': 2,
        'info_dict': {
            'id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
        },
    }]

    def _real_extract(self, url):
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        series = props['podcastInfo'] if podcast else props['catchupInfo']

        return merge_dicts({
            '_type': 'playlist',
            'id': video_id,
            'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
                series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
            'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
        }, traverse_obj(series, {
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'imageUrl',
            'title': 'title',
            'uploader': 'itunesAuthor',  # podcasts only
        }), rev=True)


class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
        'info_dict': {
            'id': '7DrfNnE',
            'ext': 'mp3',
            'title': 'Filthy Ritual - Trailer',
            'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'duration': 225.0,
            'timestamp': 1681254900,
            'series': 'Filthy Ritual',
            'series_id': '42KuaM',
            'upload_date': '20230411',
            'uploader': 'Global',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
        'only_matching': True,
        # expired: refresh the details with a current show for a full test
        'info_dict': {
            'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
            'ext': 'm4a',
            'timestamp': 1682056800,
            'series': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
            'upload_date': '20230421',
            'series_id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'duration': 10800.0,
        },
    }]

    def _real_extract(self, url):
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        episode = props['podcastEpisode'] if podcast else props['catchupEpisode']

        return self._extract_audio(
            episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})


class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
        'info_dict': {
            'id': '2JsSZ7Gm2uP',
            'ext': 'mp4',
            'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
            'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
            'upload_date': '20230420',
            'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        meta = self._get_page_props(url, video_id)['videoData']

        return merge_dicts({
            'id': video_id,
        }, traverse_obj(meta, {
            'url': 'url',
            'thumbnail': ('image', 'url'),
            'title': 'title',
            'upload_date': ('publish_date', T(unified_strdate)),
            'description': 'description',
        }), rev=True)

youtube_dl/extractor/whyp.py (new file, 55 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    merge_dicts,
    str_or_none,
    T,
    traverse_obj,
    url_or_none,
)


class WhypIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
        'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
        'info_dict': {
            'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
            'id': '18337',
            'title': 'Home Page Example Track',
            'description': r're:(?s).+\bexample track\b',
            'ext': 'mp3',
            'duration': 52.82,
            'uploader': 'Brad',
            'uploader_id': '1',
            'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
        },
    }, {
        'url': 'https://www.whyp.it/tracks/18337',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        unique_id = self._match_id(url)
        webpage = self._download_webpage(url, unique_id)
        data = self._search_nuxt_data(webpage, unique_id)['rawTrack']

        return merge_dicts({
            'url': data['audio_url'],
            'id': unique_id,
        }, traverse_obj(data, {
            'title': 'title',
            'description': 'description',
            'duration': ('duration', T(float_or_none)),
            'uploader': ('user', 'username'),
            'uploader_id': ('user', 'id', T(str_or_none)),
            'thumbnail': ('artwork_url', T(url_or_none)),
        }), {
            'ext': 'mp3',
            'vcodec': 'none',
            'http_headers': {'Referer': 'https://whyp.it/'},
        }, rev=True)

youtube_dl/utils.py

@@ -2996,7 +2996,8 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
         # Technically the Cookie header should be in unredirected_hdrs;
         # however in practice some may set it in normal headers anyway.
         # We will remove it here to prevent any leaks.
-        remove_headers = ['Cookie']
+        # Also remove unwanted and undocumented Host header for old URL
+        remove_headers = ['Cookie', 'Host']

         # A 303 must either use GET or HEAD for subsequent request
         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
@@ -4268,13 +4269,9 @@ def variadic(x, allowed_types=NO_DEFAULT):

 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
-    if isinstance(key_or_keys, (list, tuple)):
-        for key in key_or_keys:
-            if key not in d or d[key] is None or skip_false_values and not d[key]:
-                continue
-            return d[key]
-        return default
-    return d.get(key_or_keys, default)
+    exp = (lambda x: x or None) if skip_false_values else IDENTITY
+    return traverse_obj(d, *variadic(key_or_keys), expected_type=exp,
+                        default=default, get_all=False)


 def try_call(*funcs, **kwargs):
@@ -4307,16 +4304,38 @@ def try_get(src, getter, expected_type=None):
     return v


-def merge_dicts(*dicts):
+def merge_dicts(*dicts, **kwargs):
+    """
+    Merge the `dict`s in `dicts` using the first valid value for each key.
+
+    Normally valid: not None and not an empty string
+
+    Keyword-only args:
+    unblank: allow empty string if False (default True)
+    rev: merge dicts in reverse order (default False)
+
+    merge_dicts(dct1, dct2, ..., unblank=False, rev=True)
+    matches {**dct1, **dct2, ...}
+
+    However, merge_dicts(dct1, dct2, ..., rev=True) may often be better.
+    """
+
+    unblank = kwargs.get('unblank', True)
+    rev = kwargs.get('rev', False)
+
+    if unblank:
+        def can_merge_str(k, v, to_dict):
+            return (isinstance(v, compat_str) and v
+                    and isinstance(to_dict[k], compat_str)
+                    and not to_dict[k])
+    else:
+        can_merge_str = lambda k, v, to_dict: False
+
     merged = {}
-    for a_dict in dicts:
+    for a_dict in reversed(dicts) if rev else dicts:
         for k, v in a_dict.items():
             if v is None:
                 continue
-            if (k not in merged
-                    or (isinstance(v, compat_str) and v
-                        and isinstance(merged[k], compat_str)
-                        and not merged[k])):
+            if (k not in merged) or can_merge_str(k, v, merged):
                 merged[k] = v
     return merged
@@ -4370,46 +4389,108 @@ def strip_jsonp(code):
         r'\g<callback_data>', code)


-def js_to_json(code):
-    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
+def js_to_json(code, *args, **kwargs):
+
+    # vars is a dict of (var, val) pairs to substitute
+    vars = args[0] if len(args) > 0 else kwargs.get('vars', {})
+    strict = kwargs.get('strict', False)
+
+    STRING_QUOTES = '\'"`'
+    STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES)
+
+    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
     INTEGER_TABLE = (
         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+        (r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10),
     )

+    # compat candidate
+    JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError
+
+    def process_escape(match):
+        JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu'
+        escape = match.group(1) or match.group(2)
+        return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES
+                else '\\u00' if escape == 'x'
+                else '' if escape == '\n'
+                else escape)
+
+    def template_substitute(match):
+        evaluated = js_to_json(match.group(1), vars, strict=strict)
+        if evaluated[0] == '"':
+            return json.loads(evaluated)
+        return evaluated
+
     def fix_kv(m):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
             return v
-        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
-            return ""
+        elif v in ('undefined', 'void 0'):
+            return 'null'
+        elif v.startswith('/*') or v.startswith('//') or v == ',':
+            return ''

-        if v[0] in ("'", '"'):
-            v = re.sub(r'(?s)\\.|"', lambda m: {
-                '"': '\\"',
-                "\\'": "'",
-                '\\\n': '',
-                '\\x': '\\u00',
-            }.get(m.group(0), m.group(0)), v[1:-1])
-        else:
-            for regex, base in INTEGER_TABLE:
-                im = re.match(regex, v)
-                if im:
-                    i = int(im.group(1), base)
-                    return '"%d":' % i if v.endswith(':') else '%d' % i
+        if v[0] in STRING_QUOTES:
+            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
+            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
+            return '"{0}"'.format(escaped)

-        return '"%s"' % v
+        inv = IDENTITY
+        im = re.split(r'^!+', v)
+        if len(im) > 1 and not im[-1].endswith(':'):
+            if (len(v) - len(im[1])) % 2 == 1:
+                inv = lambda x: 'true' if x == 0 else 'false'
+            else:
+                inv = lambda x: 'false' if x == 0 else 'true'
+            if not any(x for x in im):
+                return
+            v = im[-1]
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(1), base)
+                return ('"%s":' if v.endswith(':') else '%s') % inv(i)
+
+        if v in vars:
+            try:
+                if not strict:
+                    json.loads(vars[v])
+            except JSONDecodeError:
+                return inv(json.dumps(vars[v]))
+            else:
+                return inv(vars[v])
+
+        if not strict:
+            v = try_call(inv, args=(v,), default=v)
+            if v in ('true', 'false'):
+                return v
+            return '"{0}"'.format(v)
+
+        raise ValueError('Unknown value: ' + v)
+
+    def create_map(mobj):
+        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
+
+    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
+    if not strict:
+        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
+        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
+        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

     return re.sub(r'''(?sx)
-        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
-        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)|
+        {str_}|
+        {comment}|
+        ,(?={skip}[\]}}])|
+        void\s0|
+        !*(?:(?<!\d)[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
+        (?:\b|!+)0(?:[xX][\da-fA-F]+|[0-7]+)(?:{skip}:)?|
+        !+\d+(?:\.\d*)?(?:{skip}:)?|
+        [0-9]+(?:{skip}:)|
         !+
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        '''.format(comment=COMMENT_RE, skip=SKIP_RE, str_=STRING_RE), fix_kv, code)


 def qualities(quality_ids):
@@ -6029,6 +6110,37 @@ def clean_podcast_url(url):
         )/''', '', url)


+if __debug__:
+    # Raise TypeError if args can't be bound
+    # needs compat owing to unstable inspect API, thanks PSF :-(
+    try:
+        inspect.signature
+
+        def _try_bind_args(fn, *args, **kwargs):
+            inspect.signature(fn).bind(*args, **kwargs)
+    except AttributeError:
+        # Py < 3.3
+        def _try_bind_args(fn, *args, **kwargs):
+            fn_args = inspect.getargspec(fn)
+            # Py2: ArgInfo(args, varargs, keywords, defaults)
+            # Py3: ArgSpec(args, varargs, keywords, defaults)
+            if not fn_args.keywords:
+                for k in kwargs:
+                    if k not in (fn_args.args or []):
+                        raise TypeError("got an unexpected keyword argument: '{0}'".format(k))
+            if not fn_args.varargs:
+                args_to_bind = len(args)
+                bindable = len(fn_args.args or [])
+                if args_to_bind > bindable:
+                    raise TypeError('too many positional arguments')
+                bindable -= len(fn_args.defaults or [])
+                if args_to_bind < bindable:
+                    if kwargs:
+                        bindable -= len(set(fn_args.args or []) & set(kwargs))
+                    if bindable > args_to_bind:
+                        raise TypeError("missing a required argument: '{0}'".format(fn_args.args[args_to_bind]))
+
+
 def traverse_obj(obj, *paths, **kwargs):
     """
     Safely traverse nested `dict`s and `Iterable`s
@@ -6247,10 +6359,7 @@ def traverse_obj(obj, *paths, **kwargs):

             if __debug__ and callable(key):
                 # Verify function signature
-                args = inspect.getargspec(key)
-                if len(args.args) != 2:
-                    # crash differently in 2.6 !
-                    inspect.getcallargs(key, None, None)
+                _try_bind_args(key, None, None)

         new_objs = []
         for obj in objs: