Compare commits

...

12 Commits

Author SHA1 Message Date
dirkf
1fa8b86f0b [utils] Remove stray undocumented Host header in redirect (fix 46fde7c) 2023-07-20 05:29:59 +01:00
dirkf
b2ba24bb02 [InfoExtractor] Add _match_valid_url() class method and refactor
* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
2023-07-19 22:14:50 +01:00
dirkf
a190b55964 [utils] Fix broken Py 3.11+ compat in traverse_obj()
* inspect.getargspec is missing despite doc claiming backward compat
* replace with emulation of `Signature.bind()`
2023-07-19 22:14:50 +01:00
dirkf
b2741f2654 [InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386
  thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
  thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from
  222a230871,
  thanks Lesmiscore
* update extractors in PR using above, fix tests.
2023-07-19 22:14:50 +01:00
dirkf
8465222041 [Clipchamp] Add new extractor back-ported from yt-dlp 2023-07-19 22:14:50 +01:00
dirkf
4339910df3 [DLF] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6697, thanks nick-cd
2023-07-19 22:14:50 +01:00
dirkf
eaaf4c6736 [Whyp] Add extractor back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6803, thanks CoryTibbettsDev
2023-07-19 22:14:50 +01:00
dirkf
4566e6e53e [GlobalPlayer] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6903, thanks garret1317
2023-07-19 22:14:50 +01:00
dirkf
1e8ccdd2eb [InfoExtractor] Support groups in _search_regex(), etc 2023-07-19 22:14:50 +01:00
dirkf
cb9366eda5 [utils] Minor updates (merge_dicts, T)
A couple of mods to ease yt-dlp back-ports:
* add kwargs to merge_dicts:
  `unblank=True` (disallow empty string), `rev=False` (reverse the merge list)
* add `T(x)` shortcut for `{x}`, unsupported in Py2.6
2023-07-19 22:14:50 +01:00
dirkf
d9d07a9581 [utils] Improve js_to_json, align with yt-dlp
* support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/#521 etc,
  thanks ChillingPepper, Grub4k, pukkandan
* improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/#521
  thanks Grub4k
* support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623
  thanks Grub4k
* add limited `!` evaluation (eg, !!0 -> false, see tests)
2023-07-19 22:14:50 +01:00
dirkf
825a40744b [utils] Align traverse_obj() with yt-dlp
Thanks Grub4k for these:
* traverse `Iterable`s, from https://github.com/yt-dlp/yt-dlp/pull/6902, etc
* traverse `set` key for transformations/filters, `re.Match` group names, from
  776995bc10, etc
* traverse `re.Match`es, from https://github.com/yt-dlp/yt-dlp/pull/5174
* always return list when branching, from https://github.com/yt-dlp/yt-dlp/pull/5170
2023-07-19 22:14:50 +01:00
10 changed files with 1098 additions and 95 deletions

View File

@ -4,6 +4,7 @@ from inspect import getsource
import io
import os
from os.path import dirname as dirn
import re
import sys
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
with open('devscripts/lazy_load_template.py', 'rt') as f:
module_template = f.read()
def get_source(m):
return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
module_contents = [
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
module_template,
get_source(InfoExtractor.suitable),
get_source(InfoExtractor._match_valid_url) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780)
'from youtube_dl.utils import parse_qs\n',
'from youtube_dl.utils import parse_qs, variadic\n',
]
ie_template = '''
@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
valid_url=valid_url,
module=ie.__module__)
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
s += '\n' + getsource(ie.suitable)
s += '\n' + get_source(ie.suitable)
if hasattr(ie, '_make_valid_url'):
# search extractors
s += make_valid_template.format(valid_url=ie._make_valid_url())

View File

@ -7,15 +7,33 @@ import io
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
from youtube_dl.compat import compat_etree_fromstring, compat_http_server
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import YoutubeIE, get_info_extractor
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
import threading
from test.helper import (
expect_dict,
expect_value,
FakeYDL,
http_server_port,
)
from youtube_dl.compat import (
compat_etree_fromstring,
compat_http_server,
)
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import (
get_info_extractor,
YoutubeIE,
)
from youtube_dl.utils import (
encode_data_uri,
ExtractorError,
RegexNotFoundError,
strip_jsonp,
)
TEAPOT_RESPONSE_STATUS = 418
TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
def test_search_nextjs_data(self):
html = '''
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content=
"text/html; charset=utf-8">
<meta name="viewport" content="width=device-width">
<title>Test _search_nextjs_data()</title>
</head>
<body>
<div id="__next">
<div style="background-color:#17171E" class="FU" dir="ltr">
<div class="sc-93de261d-0 dyzzYE">
<div>
<header class="HD"></header>
<main class="MN">
<div style="height:0" class="HT0">
<div style="width:NaN%" data-testid=
"stream-container" class="WDN"></div>
</div>
</main>
</div>
<footer class="sc-6e5faf91-0 dEGaHS"></footer>
</div>
</div>
</div>
<script id="__NEXT_DATA__" type="application/json">
{"props":{"pageProps":{"video":{"id":"testid"}}}}
</script>
</body>
</html>
'''
search = self.ie._search_nextjs_data(html, 'testID')
self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
def test_search_nuxt_data(self):
html = '''
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content=
"text/html; charset=utf-8">
<title>Nuxt.js Test Page</title>
<meta name="viewport" content=
"width=device-width, initial-scale=1">
<meta data-hid="robots" name="robots" content="all">
</head>
<body class="BD">
<div id="__layout">
<h1 class="H1">Example heading</h1>
<div class="IN">
<p>Decoy text</p>
</div>
</div>
<script>
window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null));
</script>
<script src="/_nuxt/a12345b.js" defer="defer"></script>
</body>
</html>
'''
search = self.ie._search_nuxt_data(html, 'testID')
self.assertEqual(search['track']['id'], 'testid')
def test_search_json_ld_realworld(self):
# https://github.com/ytdl-org/youtube-dl/issues/23306
expect_dict(
@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):
}],
})
# from https://0000.studio/
# with type attribute but without extension in URL
expect_dict(
self,
self.ie._parse_html5_media_entries(
'https://0000.studio',
r'''
<video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
</video>
''', None)[0],
{
'formats': [{
'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
'ext': 'mp4',
}],
})
def test_extract_jwplayer_data_realworld(self):
# from http://www.suffolk.edu/sjc/
expect_dict(

View File

@ -20,7 +20,7 @@ import xml.etree.ElementTree
from youtube_dl.utils import (
age_restricted,
args_to_str,
encode_base_n,
base_url,
caesar,
clean_html,
clean_podcast_url,
@ -29,10 +29,12 @@ from youtube_dl.utils import (
detect_exe_version,
determine_ext,
dict_get,
encode_base_n,
encode_compat_str,
encodeFilename,
escape_rfc3986,
escape_url,
expand_path,
extract_attributes,
ExtractorError,
find_xpath_attr,
@ -51,6 +53,7 @@ from youtube_dl.utils import (
js_to_json,
LazyList,
limit_length,
lowercase_escape,
merge_dicts,
mimetype2ext,
month_by_name,
@ -66,17 +69,16 @@ from youtube_dl.utils import (
parse_resolution,
parse_bitrate,
pkcs1pad,
read_batch_urls,
sanitize_filename,
sanitize_path,
sanitize_url,
expand_path,
prepend_extension,
replace_extension,
read_batch_urls,
remove_start,
remove_end,
remove_quotes,
replace_extension,
rot47,
sanitize_filename,
sanitize_path,
sanitize_url,
shell_quote,
smuggle_url,
str_or_none,
@ -93,10 +95,8 @@ from youtube_dl.utils import (
unified_timestamp,
unsmuggle_url,
uppercase_escape,
lowercase_escape,
url_basename,
url_or_none,
base_url,
urljoin,
urlencode_postdata,
urshift,
@ -905,6 +905,85 @@ class TestUtil(unittest.TestCase):
)
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
def test_js_to_json_vars_strings(self):
self.assertDictEqual(
json.loads(js_to_json(
'''{
'null': a,
'nullStr': b,
'true': c,
'trueStr': d,
'false': e,
'falseStr': f,
'unresolvedVar': g,
}''',
{
'a': 'null',
'b': '"null"',
'c': 'true',
'd': '"true"',
'e': 'false',
'f': '"false"',
'g': 'var',
}
)),
{
'null': None,
'nullStr': 'null',
'true': True,
'trueStr': 'true',
'false': False,
'falseStr': 'false',
'unresolvedVar': 'var'
}
)
self.assertDictEqual(
json.loads(js_to_json(
'''{
'int': a,
'intStr': b,
'float': c,
'floatStr': d,
}''',
{
'a': '123',
'b': '"123"',
'c': '1.23',
'd': '"1.23"',
}
)),
{
'int': 123,
'intStr': '123',
'float': 1.23,
'floatStr': '1.23',
}
)
self.assertDictEqual(
json.loads(js_to_json(
'''{
'object': a,
'objectStr': b,
'array': c,
'arrayStr': d,
}''',
{
'a': '{}',
'b': '"{}"',
'c': '[]',
'd': '"[]"',
}
)),
{
'object': {},
'objectStr': '{}',
'array': [],
'arrayStr': '[]',
}
)
def test_js_to_json_realworld(self):
inp = '''{
'clip':{'provider':'pseudo'}
@ -975,10 +1054,10 @@ class TestUtil(unittest.TestCase):
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'a': True,
'b': False,
'c': False,
'd': True,
'e': [],
'f': "abc",
'g': "",
@ -1048,10 +1127,26 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
on = js_to_json('[1,//{},\n2]')
self.assertEqual(json.loads(on), [1, 2])
on = js_to_json(r'"\^\$\#"')
self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
on = js_to_json('\'"\\""\'')
self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
def test_js_to_json_template_literal(self):
self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
self.assertEqual(js_to_json('`${name}`', {}), '"name"')
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@ -1586,6 +1681,11 @@ Line 1
'dict': {},
}
# define a pukka Iterable
def iter_range(stop):
for from_ in range(stop):
yield from_
# Test base functionality
self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str',
msg='allow tuple path')
@ -1602,13 +1702,13 @@ Line 1
# Test Ellipsis behavior
self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis),
(item for item in _TEST_DATA.values() if item not in (None, {})),
msg='`...` should give all non discarded values')
msg='`...` should give all non-discarded values')
self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(),
msg='`...` selection for dicts should select all values')
self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')),
['https://www.example.com/0', 'https://www.example.com/1'],
msg='nested `...` queries should work')
self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4),
self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), iter_range(4),
msg='`...` query result should be flattened')
self.assertEqual(traverse_obj(iter(range(4)), Ellipsis), list(range(4)),
msg='`...` should accept iterables')
@ -1618,7 +1718,7 @@ Line 1
[_TEST_DATA['urls']],
msg='function as query key should perform a filter based on (key, value)')
self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), set(('str',)),
msg='exceptions in the query function should be catched')
msg='exceptions in the query function should be caught')
self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
msg='function key should accept iterables')
if __debug__:
@ -1706,7 +1806,7 @@ Line 1
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {},
msg='remove empty values when dict key')
self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: Ellipsis},
msg='use `default` when dict key and `default`')
msg='use `default` when dict key and a default')
self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {},
msg='remove empty values when nested dict key fails')
self.assertEqual(traverse_obj(None, {0: 'fail'}), {},
@ -1768,7 +1868,7 @@ Line 1
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str),
'str', msg='accept matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int),
None, msg='reject non matching `expected_type` type')
None, msg='reject non-matching `expected_type` type')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)),
'0', msg='transform type using type function')
self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0),
@ -1780,7 +1880,7 @@ Line 1
self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none),
{0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values')
self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, set((int_or_none,))), expected_type=int),
1, msg='expected_type should not filter non final dict values')
1, msg='expected_type should not filter non-final dict values')
self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int),
{0: {0: 100}}, msg='expected_type should transform deep dict values')
self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(Ellipsis)),
@ -1838,7 +1938,7 @@ Line 1
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)),
_traverse_string=True), 'sr',
msg='`slice` should result in string if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"),
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == 's'),
_traverse_string=True), 'str',
msg='function should result in string if `traverse_string`')
self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),

View File

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
merge_dicts,
T,
traverse_obj,
unified_timestamp,
url_or_none,
)
class ClipchampIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
'info_dict': {
'id': 'gRXZ4ZhdDaU',
'ext': 'mp4',
'title': 'Untitled video',
'uploader': 'Alexander Schwartz',
'timestamp': 1680805580,
'upload_date': '20230406',
'thumbnail': r're:^https?://.+\.jpg',
},
'params': {
'skip_download': 'm3u8',
'format': 'bestvideo',
},
}]
_STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
_STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']
storage_location = data.get('storage_location')
if storage_location != 'cf_stream':
raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))
path = data['download_url']
iframe = self._download_webpage(
'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe')
subdomain = self._search_regex(
r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''', iframe,
'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'
formats = self._extract_mpd_formats(
self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
formats.extend(self._extract_m3u8_formats(
self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))
return merge_dicts({
'id': video_id,
'formats': formats,
'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None,
}, traverse_obj(data, {
'title': ('project', 'project_name', T(compat_str)),
'timestamp': ('created_at', T(unified_timestamp)),
'thumbnail': ('thumbnail_url', T(url_or_none)),
}), rev=True)

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import base64
import datetime
import functools
import hashlib
import json
import netrc
@ -23,6 +24,7 @@ from ..compat import (
compat_getpass,
compat_integer_types,
compat_http_client,
compat_map as map,
compat_os_name,
compat_str,
compat_urllib_error,
@ -31,6 +33,7 @@ from ..compat import (
compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
compat_zip as zip,
)
from ..downloader.f4m import (
get_base_url,
@ -70,6 +73,7 @@ from ..utils import (
str_or_none,
str_to_int,
strip_or_none,
traverse_obj,
try_get,
unescapeHTML,
unified_strdate,
@ -79,6 +83,7 @@ from ..utils import (
urljoin,
url_basename,
url_or_none,
variadic,
xpath_element,
xpath_text,
xpath_with_ns,
@ -367,9 +372,22 @@ class InfoExtractor(object):
title, description etc.
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
A subclass of InfoExtractor must be defined to handle each specific site (or
several sites). Such a concrete subclass should be added to the list of
extractors. It should also:
* define its _VALID_URL attribute as a regexp, or a Sequence of alternative
regexps (but see below)
* re-define the _real_extract() method
* optionally re-define the _real_initialize() method.
An extractor subclass may also override suitable() if necessary, but the
function signature must be preserved and the function must import everything
it needs (except other extractors), so that lazy_extractors works correctly.
If the subclass's suitable() and _real_extract() functions avoid using
_VALID_URL, the subclass need not set that class attribute.
An abstract subclass of InfoExtractor may be used to simplify implementation
within an extractor module; it should not be added to the list of extractors.
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
@ -404,22 +422,33 @@ class InfoExtractor(object):
self._x_forwarded_for_ip = None
self.set_downloader(downloader)
@classmethod
def __match_valid_url(cls, url):
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for cls, whereas getattr would also
# match its superclass
if '_VALID_URL_RE' not in cls.__dict__:
# _VALID_URL can now be a list/tuple of patterns
cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
# 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
for p in cls._VALID_URL_RE:
p = p.match(url)
if p:
return p
# The public alias can safely be overridden, as in some back-ports
_match_valid_url = __match_valid_url
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url) is not None
# This function must import everything it needs (except other extractors),
# so that lazy_extractors works correctly
return cls.__match_valid_url(url) is not None
@classmethod
def _match_id(cls, url):
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
m = cls._VALID_URL_RE.match(url)
m = cls.__match_valid_url(url)
assert m
return compat_str(m.group('id'))
@ -1005,6 +1034,8 @@ class InfoExtractor(object):
if group is None:
# return the first matching group
return next(g for g in mobj.groups() if g is not None)
elif isinstance(group, (list, tuple)):
return tuple(mobj.group(g) for g in group)
else:
return mobj.group(group)
elif default is not NO_DEFAULT:
@ -1020,10 +1051,9 @@ class InfoExtractor(object):
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
return res
if isinstance(res, tuple):
return tuple(map(clean_html, res))
return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
username = None
@ -1348,6 +1378,44 @@ class InfoExtractor(object):
break
return dict((k, v) for k, v in info.items() if v is not None)
def _search_nextjs_data(self, webpage, video_id, **kw):
nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
kw.pop('transform_source', None)
next_data = self._search_regex(
r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
webpage, 'next.js data', group='nd', **kw)
if not next_data:
return {}
return self._parse_json(next_data, video_id, **nkw)
def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
# self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
fatal = kwargs.get('fatal', True)
traverse = kwargs.get('traverse', ('data', 0))
re_ctx = re.escape(context_name)
FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
js, arg_keys, arg_vals = self._search_regex(
(p.format(re_ctx, FUNCTION_RE) for p in
(r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
r'{0}\s*\([\s\S]*?{1}')),
webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
default=NO_DEFAULT if fatal else (None, None, None))
if js is None:
return {}
args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
'[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
return traverse_obj(ret, traverse) or {}
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@ -2495,7 +2563,8 @@ class InfoExtractor(object):
return f
return {}
def _media_formats(src, cur_media_type, type_info={}):
def _media_formats(src, cur_media_type, type_info=None):
type_info = type_info or {}
full_url = absolute_url(src)
ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
@ -2513,6 +2582,7 @@ class InfoExtractor(object):
formats = [{
'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None,
'ext': ext,
}]
return is_plain_url, formats
@ -2521,7 +2591,7 @@ class InfoExtractor(object):
# so we wll include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
media_tags = [(media_tag, media_tag_name, media_type, '')
for media_tag, media_tag_name, media_type
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@ -2539,7 +2609,8 @@ class InfoExtractor(object):
media_attributes = extract_attributes(media_tag)
src = strip_or_none(media_attributes.get('src'))
if src:
_, formats = _media_formats(src, media_type)
f = parse_content_type(media_attributes.get('type'))
_, formats = _media_formats(src, media_type, f)
media_info['formats'].extend(formats)
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:

204
youtube_dl/extractor/dlf.py Normal file
View File

@ -0,0 +1,204 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
merge_dicts,
traverse_obj,
url_or_none,
variadic,
)
class DLFBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
_BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'
def _parse_button_attrs(self, button, audio_id=None):
attrs = extract_attributes(button)
audio_id = audio_id or attrs['data-audio-diraid']
url = traverse_obj(
attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
'data-audio-src', expected_type=url_or_none)
ext = determine_ext(url)
formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
self._sort_formats(formats)
def traverse_attrs(path):
path = list(variadic(path))
t = path.pop() if callable(path[-1]) else None
return traverse_obj(attrs, path, expected_type=t, get_all=False)
def txt_or_none(v, default=None):
return default if v is None else (compat_str(v).strip() or default)
return merge_dicts(*reversed([{
'id': audio_id,
# 'extractor_key': DLFIE.ie_key(),
# 'extractor': DLFIE.IE_NAME,
'formats': formats,
}, dict((k, traverse_attrs(v)) for k, v in {
'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
'thumbnail': ('data-audioimage', url_or_none),
'uploader': 'data-audio-producer',
'series': 'data-audio-series',
'channel': 'data-audio-origin-site-name',
'webpage_url': ('data-audio-download-tracking-path', url_or_none),
}.items())]))
class DLFIE(DLFBaseIE):
IE_NAME = 'dlf'
_VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
_TESTS = [
# Audio as an HLS stream
{
'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
'info_dict': {
'id': '03a3eb19',
'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
'ext': 'm4a',
'duration': 3298,
'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
'uploader': 'Deutschlandfunk',
'series': 'On Stage',
'channel': 'deutschlandfunk'
},
'params': {
'skip_download': 'm3u8'
},
'skip': 'This webpage no longer exists'
}, {
'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
'info_dict': {
'id': 'd9cc1856',
'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
'ext': 'mp3',
'duration': 291,
'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
'uploader': 'Deutschlandfunk',
'series': 'Kommentare und Themen der Woche',
'channel': 'deutschlandfunk'
}
},
]
def _real_extract(self, url):
audio_id = self._match_id(url)
webpage = self._download_webpage(url, audio_id)
return self._parse_button_attrs(
self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)
class DLFCorpusIE(DLFBaseIE):
IE_NAME = 'dlf:corpus'
IE_DESC = 'DLF Multi-feed Archives'
_VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
_TESTS = [
# Recorded news broadcast with referrals to related broadcasts
{
'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
'info_dict': {
'id': 'fechten-russland-belarus-ukraine-protest-100',
'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
},
'playlist_mincount': 5,
'playlist': [{
'info_dict': {
'id': '1fc5d64a',
'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
'ext': 'mp3',
'duration': 252,
'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
'uploader': 'Deutschlandfunk',
'series': 'Sport',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '2ada145f',
'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
'ext': 'mp3',
'duration': 336,
'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
'uploader': 'Deutschlandfunk',
'series': 'Deutschlandfunk Nova',
'channel': 'deutschlandfunk-nova'
}
}, {
'info_dict': {
'id': '5e55e8c9',
'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
'ext': 'mp3',
'duration': 187,
'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '47e1a096',
'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
'ext': 'mp3',
'duration': 602,
'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '5e55e8c9',
'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
'ext': 'mp3',
'duration': 187,
'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}]
},
# Podcast feed with tag buttons, playlist count fluctuates
{
'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
'info_dict': {
'id': 'kommentare-und-themen-der-woche-100',
'title': 'Meinung - Kommentare und Themen der Woche',
'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
},
'playlist_mincount': 10,
},
# Podcast feed with no description
{
'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
'info_dict': {
'id': 'podcast-tolle-idee-100',
'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
},
'playlist_mincount': 11,
},
]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
return self.playlist_result(
map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))

View File

@ -226,6 +226,7 @@ from .ciscolive import (
CiscoLiveSearchIE,
)
from .cjsw import CJSWIE
from .clipchamp import ClipchampIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .cliprs import ClipRsIE
@ -295,6 +296,10 @@ from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .democracynow import DemocracynowIE
from .dlf import (
DLFCorpusIE,
DLFIE,
)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
@ -444,6 +449,13 @@ from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
from .globalplayer import (
GlobalPlayerLiveIE,
GlobalPlayerLivePlaylistIE,
GlobalPlayerAudioIE,
GlobalPlayerAudioEpisodeIE,
GlobalPlayerVideoIE
)
from .globo import (
GloboIE,
GloboArticleIE,
@ -975,6 +987,10 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
from .pr0gramm import (
Pr0grammIE,
Pr0grammStaticIE,
)
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
@ -1565,6 +1581,7 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
from .whyp import WhypIE
from .wistia import (
WistiaIE,
WistiaPlaylistIE,
@ -1678,7 +1695,3 @@ from .zingmp3 import (
)
from .zoom import ZoomIE
from .zype import ZypeIE
from .pr0gramm import (
Pr0grammIE,
Pr0grammStaticIE,
)

View File

@ -0,0 +1,273 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
join_nonempty,
merge_dicts,
parse_duration,
str_or_none,
T,
traverse_obj,
unified_strdate,
unified_timestamp,
urlhandle_detect_ext,
)
class GlobalPlayerBaseIE(InfoExtractor):
def _get_page_props(self, url, video_id):
webpage = self._download_webpage(url, video_id)
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
def _request_ext(self, url, video_id):
return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests
url, video_id, note='Determining source extension'))
@staticmethod
def _clean_desc(x):
x = clean_html(x)
if x:
x = x.replace('\xa0', ' ')
return x
def _extract_audio(self, episode, series):
return merge_dicts({
'vcodec': 'none',
}, traverse_obj(series, {
'series': 'title',
'series_id': 'id',
'thumbnail': 'imageUrl',
'uploader': 'itunesAuthor', # podcasts only
}), traverse_obj(episode, {
'id': 'id',
'description': ('description', T(self._clean_desc)),
'duration': ('duration', T(parse_duration)),
'thumbnail': 'imageUrl',
'url': 'streamUrl',
'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
'title': 'title',
}, get_all=False), rev=True)
class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
_TESTS = [{
'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
'info_dict': {
'id': '2mx1E',
'ext': 'aac',
'display_id': 'smoothchill-uk',
'title': 're:^Smooth Chill.+$',
'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
'description': 'Music To Chill To',
# 'live_status': 'is_live',
'is_live': True,
},
}, {
# national station
'url': 'https://www.globalplayer.com/live/heart/uk/',
'info_dict': {
'id': '2mwx4',
'ext': 'aac',
'description': 'turn up the feel good!',
'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
# 'live_status': 'is_live',
'is_live': True,
'title': 're:^Heart UK.+$',
'display_id': 'heart-uk',
},
}, {
# regional variation
'url': 'https://www.globalplayer.com/live/heart/london/',
'info_dict': {
'id': 'AMqg',
'ext': 'aac',
'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
'title': 're:^Heart London.+$',
# 'live_status': 'is_live',
'is_live': True,
'display_id': 'heart-london',
'description': 'turn up the feel good!',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
station = self._get_page_props(url, video_id)['station']
stream_url = station['streamUrl']
return merge_dicts({
'id': station['id'],
'display_id': (
join_nonempty('brandSlug', 'slug', from_dict=station)
or station.get('legacyStationPrefix')),
'url': stream_url,
'ext': self._request_ext(stream_url, video_id),
'vcodec': 'none',
'is_live': True,
}, {
'title': self._live_title(traverse_obj(
station, (('name', 'brandName'), T(str_or_none)),
get_all=False)),
}, traverse_obj(station, {
'description': 'tagline',
'thumbnail': 'brandLogo',
}), rev=True)
class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
_TESTS = [{
# "live playlist"
'url': 'https://www.globalplayer.com/playlists/8bLk/',
'info_dict': {
'id': '8bLk',
'ext': 'aac',
# 'live_status': 'is_live',
'is_live': True,
'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
'title': 're:Classic FM Hall of Fame.+$'
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
station = self._get_page_props(url, video_id)['playlistData']
stream_url = station['streamUrl']
return merge_dicts({
'id': video_id,
'url': stream_url,
'ext': self._request_ext(stream_url, video_id),
'vcodec': 'none',
'is_live': True,
}, traverse_obj(station, {
'title': 'title',
'description': ('description', T(self._clean_desc)),
'thumbnail': 'image',
}), rev=True)
class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
# podcast
'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
'playlist_mincount': 5,
'info_dict': {
'id': '42KuaM',
'title': 'Filthy Ritual',
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
'categories': ['Society & Culture', 'True Crime'],
'uploader': 'Global',
'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
},
}, {
# radio catchup
'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
'playlist_mincount': 2,
'info_dict': {
'id': '46vyD7z',
'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
'title': 'Nick Ferrari',
'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
},
}]
def _real_extract(self, url):
video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
props = self._get_page_props(url, video_id)
series = props['podcastInfo'] if podcast else props['catchupInfo']
return merge_dicts({
'_type': 'playlist',
'id': video_id,
'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
}, traverse_obj(series, {
'description': ('description', T(self._clean_desc)),
'thumbnail': 'imageUrl',
'title': 'title',
'uploader': 'itunesAuthor', # podcasts only
}), rev=True)
class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
# podcast
'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
'info_dict': {
'id': '7DrfNnE',
'ext': 'mp3',
'title': 'Filthy Ritual - Trailer',
'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
'duration': 225.0,
'timestamp': 1681254900,
'series': 'Filthy Ritual',
'series_id': '42KuaM',
'upload_date': '20230411',
'uploader': 'Global',
},
}, {
# radio catchup
'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
'only_matching': True,
# expired: refresh the details with a current show for a full test
'info_dict': {
'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
'ext': 'm4a',
'timestamp': 1682056800,
'series': 'Nick Ferrari',
'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
'upload_date': '20230421',
'series_id': '46vyD7z',
'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
'title': 'Nick Ferrari',
'duration': 10800.0,
},
}]
def _real_extract(self, url):
video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
props = self._get_page_props(url, video_id)
episode = props['podcastEpisode'] if podcast else props['catchupEpisode']
return self._extract_audio(
episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})
class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
'info_dict': {
'id': '2JsSZ7Gm2uP',
'ext': 'mp4',
'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
'upload_date': '20230420',
'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
meta = self._get_page_props(url, video_id)['videoData']
return merge_dicts({
'id': video_id,
}, traverse_obj(meta, {
'url': 'url',
'thumbnail': ('image', 'url'),
'title': 'title',
'upload_date': ('publish_date', T(unified_strdate)),
'description': 'description',
}), rev=True)

View File

@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
merge_dicts,
str_or_none,
T,
traverse_obj,
url_or_none,
)
class WhypIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
'info_dict': {
'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
'id': '18337',
'title': 'Home Page Example Track',
'description': r're:(?s).+\bexample track\b',
'ext': 'mp3',
'duration': 52.82,
'uploader': 'Brad',
'uploader_id': '1',
'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
},
}, {
'url': 'https://www.whyp.it/tracks/18337',
'only_matching': True,
}]
def _real_extract(self, url):
unique_id = self._match_id(url)
webpage = self._download_webpage(url, unique_id)
data = self._search_nuxt_data(webpage, unique_id)['rawTrack']
return merge_dicts({
'url': data['audio_url'],
'id': unique_id,
}, traverse_obj(data, {
'title': 'title',
'description': 'description',
'duration': ('duration', T(float_or_none)),
'uploader': ('user', 'username'),
'uploader_id': ('user', 'id', T(str_or_none)),
'thumbnail': ('artwork_url', T(url_or_none)),
}), {
'ext': 'mp3',
'vcodec': 'none',
'http_headers': {'Referer': 'https://whyp.it/'},
}, rev=True)

View File

@ -2996,7 +2996,8 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
# Technically the Cookie header should be in unredirected_hdrs;
# however in practice some may set it in normal headers anyway.
# We will remove it here to prevent any leaks.
remove_headers = ['Cookie']
# Also remove unwanted and undocumented Host header for old URL
remove_headers = ['Cookie', 'Host']
# A 303 must either use GET or HEAD for subsequent request
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
@ -4268,13 +4269,9 @@ def variadic(x, allowed_types=NO_DEFAULT):
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
if key not in d or d[key] is None or skip_false_values and not d[key]:
continue
return d[key]
return default
return d.get(key_or_keys, default)
exp = (lambda x: x or None) if skip_false_values else IDENTITY
return traverse_obj(d, *variadic(key_or_keys), expected_type=exp,
default=default, get_all=False)
def try_call(*funcs, **kwargs):
@ -4307,16 +4304,38 @@ def try_get(src, getter, expected_type=None):
return v
def merge_dicts(*dicts):
def merge_dicts(*dicts, **kwargs):
"""
Merge the `dict`s in `dicts` using the first valid value for each key.
Normally valid: not None and not an empty string
Keyword-only args:
unblank: allow empty string if False (default True)
rev: merge dicts in reverse order (default False)
merge_dicts(dct1, dct2, ..., unblank=False, rev=True)
matches {**dct1, **dct2, ...}
However, merge_dicts(dct1, dct2, ..., rev=True) may often be better.
"""
unblank = kwargs.get('unblank', True)
rev = kwargs.get('rev', False)
if unblank:
def can_merge_str(k, v, to_dict):
return (isinstance(v, compat_str) and v
and isinstance(to_dict[k], compat_str)
and not to_dict[k])
else:
can_merge_str = lambda k, v, to_dict: False
merged = {}
for a_dict in dicts:
for a_dict in reversed(dicts) if rev else dicts:
for k, v in a_dict.items():
if v is None:
continue
if (k not in merged
or (isinstance(v, compat_str) and v
and isinstance(merged[k], compat_str)
and not merged[k])):
if (k not in merged) or can_merge_str(k, v, merged):
merged[k] = v
return merged
@ -4370,46 +4389,108 @@ def strip_jsonp(code):
r'\g<callback_data>', code)
def js_to_json(code):
COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
def js_to_json(code, *args, **kwargs):
# vars is a dict of (var, val) pairs to substitute
vars = args[0] if len(args) > 0 else kwargs.get('vars', {})
strict = kwargs.get('strict', False)
STRING_QUOTES = '\'"`'
STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES)
COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
INTEGER_TABLE = (
(r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
(r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
(r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10),
)
# compat candidate
JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError
def process_escape(match):
JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu'
escape = match.group(1) or match.group(2)
return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES
else '\\u00' if escape == 'x'
else '' if escape == '\n'
else escape)
def template_substitute(match):
evaluated = js_to_json(match.group(1), vars, strict=strict)
if evaluated[0] == '"':
return json.loads(evaluated)
return evaluated
def fix_kv(m):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
return ""
elif v in ('undefined', 'void 0'):
return 'null'
elif v.startswith('/*') or v.startswith('//') or v == ',':
return ''
if v[0] in ("'", '"'):
v = re.sub(r'(?s)\\.|"', lambda m: {
'"': '\\"',
"\\'": "'",
'\\\n': '',
'\\x': '\\u00',
}.get(m.group(0), m.group(0)), v[1:-1])
if v[0] in STRING_QUOTES:
v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
return '"{0}"'.format(escaped)
inv = IDENTITY
im = re.split(r'^!+', v)
if len(im) > 1 and not im[-1].endswith(':'):
if (len(v) - len(im[1])) % 2 == 1:
inv = lambda x: 'true' if x == 0 else 'false'
else:
inv = lambda x: 'false' if x == 0 else 'true'
if not any(x for x in im):
return
v = im[-1]
for regex, base in INTEGER_TABLE:
im = re.match(regex, v)
if im:
i = int(im.group(1), base)
return '"%d":' % i if v.endswith(':') else '%d' % i
return ('"%s":' if v.endswith(':') else '%s') % inv(i)
return '"%s"' % v
if v in vars:
try:
if not strict:
json.loads(vars[v])
except JSONDecodeError:
return inv(json.dumps(vars[v]))
else:
return inv(vars[v])
if not strict:
v = try_call(inv, args=(v,), default=v)
if v in ('true', 'false'):
return v
return '"{0}"'.format(v)
raise ValueError('Unknown value: ' + v)
def create_map(mobj):
return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
if not strict:
code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
return re.sub(r'''(?sx)
"(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
'(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
{comment}|,(?={skip}[\]}}])|
(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
\b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
[0-9]+(?={skip}:)|
{str_}|
{comment}|
,(?={skip}[\]}}])|
void\s0|
!*(?:(?<!\d)[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
(?:\b|!+)0(?:[xX][\da-fA-F]+|[0-7]+)(?:{skip}:)?|
!+\d+(?:\.\d*)?(?:{skip}:)?|
[0-9]+(?:{skip}:)|
!+
'''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
'''.format(comment=COMMENT_RE, skip=SKIP_RE, str_=STRING_RE), fix_kv, code)
def qualities(quality_ids):
@ -6029,6 +6110,37 @@ def clean_podcast_url(url):
)/''', '', url)
if __debug__:
# Raise TypeError if args can't be bound
# needs compat owing to unstable inspect API, thanks PSF :-(
try:
inspect.signature
def _try_bind_args(fn, *args, **kwargs):
inspect.signature(fn).bind(*args, **kwargs)
except AttributeError:
# Py < 3.3
def _try_bind_args(fn, *args, **kwargs):
fn_args = inspect.getargspec(fn)
# Py2: ArgInfo(args, varargs, keywords, defaults)
# Py3: ArgSpec(args, varargs, keywords, defaults)
if not fn_args.keywords:
for k in kwargs:
if k not in (fn_args.args or []):
raise TypeError("got an unexpected keyword argument: '{0}'".format(k))
if not fn_args.varargs:
args_to_bind = len(args)
bindable = len(fn_args.args or [])
if args_to_bind > bindable:
raise TypeError('too many positional arguments')
bindable -= len(fn_args.defaults or [])
if args_to_bind < bindable:
if kwargs:
bindable -= len(set(fn_args.args or []) & set(kwargs))
if bindable > args_to_bind:
raise TypeError("missing a required argument: '{0}'".format(fn_args.args[args_to_bind]))
def traverse_obj(obj, *paths, **kwargs):
"""
Safely traverse nested `dict`s and `Iterable`s
@ -6247,10 +6359,7 @@ def traverse_obj(obj, *paths, **kwargs):
if __debug__ and callable(key):
# Verify function signature
args = inspect.getargspec(key)
if len(args.args) != 2:
# crash differently in 2.6 !
inspect.getcallargs(key, None, None)
_try_bind_args(key, None, None)
new_objs = []
for obj in objs: