Compare commits


12 Commits

Author SHA1 Message Date
dirkf
1fa8b86f0b [utils] Remove stray undocumented Host header in redirect (fix 46fde7c) 2023-07-20 05:29:59 +01:00
dirkf
b2ba24bb02 [InfoExtractor] Add _match_valid_url() class method and refactor
* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
2023-07-19 22:14:50 +01:00
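
For illustration, a minimal sketch of the new API from the extractor side (the class name and URLs here are invented, not part of the commit):

    from youtube_dl.extractor.common import InfoExtractor

    class ExampleIE(InfoExtractor):  # hypothetical extractor
        # _VALID_URL may now be a Sequence of alternative patterns
        _VALID_URL = (
            r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
            r'https?://example\.tv/v/(?P<id>\d+)',
        )

        def _real_extract(self, url):
            # _match_valid_url() compiles each pattern once, caches the
            # compiled tuple on the class, and returns the first re.Match
            mobj = self._match_valid_url(url)
            video_id = mobj.group('id')
            return {'id': video_id, 'title': video_id, 'url': url}

A plain string _VALID_URL keeps working: variadic() wraps it into a one-element tuple.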
dirkf
a190b55964 [utils] Fix broken Py 3.11+ compat in traverse_obj()
* inspect.getargspec is missing despite doc claiming backward compat
* replace with emulation of `Signature.bind()`
2023-07-19 22:14:50 +01:00
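
A sketch of what the replacement does on Py 3.3+, where inspect.signature exists (the emulation path for older Pythons is in the utils.py diff below):

    import inspect

    def _try_bind_args(fn, *args, **kwargs):
        # Signature.bind() raises TypeError if the arguments can't bind
        inspect.signature(fn).bind(*args, **kwargs)

    _try_bind_args(lambda k, v: None, 'key', 'value')  # binds: no error
    # _try_bind_args(lambda k: None, 'key', 'value')   # would raise TypeError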
dirkf
b2741f2654 [InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386
  thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
  thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from
  222a230871,
  thanks Lesmiscore
* update extractors in PR using above, fix tests.
2023-07-19 22:14:50 +01:00
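
A sketch of how an extractor might call the new helpers (extractor class and JSON paths invented for illustration):

    from youtube_dl.extractor.common import InfoExtractor

    class ExampleNextIE(InfoExtractor):  # hypothetical
        _VALID_URL = r'https?://(?:www\.)?example\.com/v/(?P<id>\w+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            # Next.js: parses the JSON in <script id="__NEXT_DATA__">...</script>
            data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']
            # a Nuxt.js site would instead use self._search_nuxt_data(webpage, video_id),
            # which evaluates the window.__NUXT__=(function(...){...})(...) IIFE
            return {
                'id': video_id,
                'title': data['video']['title'],      # invented path
                'url': data['video']['streamUrl'],    # invented path
            }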
dirkf
8465222041 [Clipchamp] Add new extractor back-ported from yt-dlp 2023-07-19 22:14:50 +01:00
dirkf
4339910df3 [DLF] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6697, thanks nick-cd
2023-07-19 22:14:50 +01:00
dirkf
eaaf4c6736 [Whyp] Add extractor back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6803, thanks CoryTibbettsDev
2023-07-19 22:14:50 +01:00
dirkf
4566e6e53e [GlobalPlayer] Add site extractors back-ported from yt-dlp
* from https://github.com/yt-dlp/yt-dlp/pull/6903, thanks garret1317
2023-07-19 22:14:50 +01:00
dirkf
1e8ccdd2eb [InfoExtractor] Support groups in _search_regex(), etc 2023-07-19 22:14:50 +01:00
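
A runnable sketch of the new calling convention (the sample HTML is invented): passing a list/tuple as `group` now returns a tuple of the matched groups:

    from youtube_dl.extractor.common import InfoExtractor

    ie = InfoExtractor()
    html = '<a href="/user/u42" class="author">Alice</a>'  # invented sample
    name, uid = ie._search_regex(
        r'<a href="/user/(?P<uid>[^"]+)"[^>]*>(?P<name>[^<]+)</a>',
        html, 'uploader', group=('name', 'uid'))
    assert (name, uid) == ('Alice', 'u42')

_html_search_regex() follows suit, applying clean_html() to each member of the tuple.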
dirkf
cb9366eda5 [utils] Minor updates (merge_dicts, T)
A couple of mods to ease yt-dlp back-ports:
* add kwargs to merge_dicts:
  `unblank=True` (disallow empty string), `rev=False` (reverse the merge list)
* add `T(x)` shortcut for `{x}`, unsupported in Py2.6
2023-07-19 22:14:50 +01:00
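
A runnable sketch of the new keywords (values invented):

    from youtube_dl.utils import merge_dicts, T, traverse_obj

    # rev=True reverses the merge order, so later dicts win, as in {**d1, **d2};
    # with the default unblank=True an empty string never beats real text
    info = merge_dicts({'title': ''}, {'title': 'Real title'}, rev=True)
    assert info['title'] == 'Real title'

    # T(x) is shorthand for the one-element set {x} that traverse_obj() uses
    # for transformations, which Py2.6 cannot spell as a set literal
    assert traverse_obj({'n': '123'}, ('n', T(int))) == 123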
dirkf
d9d07a9581 [utils] Improve js_to_json, align with yt-dlp
* support variable substitution, from https://github.com/yt-dlp/yt-dlp/pull/#521 etc,
  thanks ChillingPepper, Grub4k, pukkandan
* improve escape handling, from https://github.com/yt-dlp/yt-dlp/pull/#521
  thanks Grub4k
* support template strings from https://github.com/yt-dlp/yt-dlp/pull/6623
  thanks Grub4k
* add limited `!` evaluation (eg, !!0 -> false, see tests)
2023-07-19 22:14:50 +01:00
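
A few runnable examples of the new behaviour (inputs invented; see also the test_utils.py changes below):

    import json
    from youtube_dl.utils import js_to_json

    # variable substitution: vars maps identifiers to JSON texts
    assert json.loads(js_to_json('{user: name}', {'name': '"jo"'})) == {'user': 'jo'}
    # template strings
    assert json.loads(js_to_json('`id-${n}`', {'n': '7'})) == 'id-7'
    # limited `!` evaluation
    assert json.loads(js_to_json('{a: !!0, b: !0}')) == {'a': False, 'b': True}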
dirkf
825a40744b [utils] Align traverse_obj() with yt-dlp
Thanks Grub4k for these:
* traverse `Iterable`s, from https://github.com/yt-dlp/yt-dlp/pull/6902, etc
* traverse `set` key for transformations/filters, `re.Match` group names, from
  776995bc10, etc
* traverse `re.Match`es, from https://github.com/yt-dlp/yt-dlp/pull/5174
* always return list when branching, from https://github.com/yt-dlp/yt-dlp/pull/5170
2023-07-19 22:14:50 +01:00
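
A runnable sketch of the added behaviours (data invented):

    import re
    from youtube_dl.utils import T, traverse_obj

    data = {'items': [{'id': 1}, {'id': 2}], 'name': ' x '}
    # branching with `...` (Ellipsis) always returns a list
    assert traverse_obj(data, ('items', Ellipsis, 'id')) == [1, 2]
    # a `set` key, built with T(), applies a transformation
    assert traverse_obj(data, ('name', T(lambda s: s.strip()))) == 'x'
    # re.Match objects are traversable by group name or number
    m = re.match(r'(?P<k>\w+)=(?P<v>\w+)', 'a=1')
    assert traverse_obj(m, 'k') == 'a'
    assert traverse_obj(m, 2) == '1'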
10 changed files with 1098 additions and 95 deletions

devscripts/make_lazy_extractors.py

@@ -4,6 +4,7 @@ from inspect import getsource
 import io
 import os
 from os.path import dirname as dirn
+import re
 import sys

 print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
 with open('devscripts/lazy_load_template.py', 'rt') as f:
     module_template = f.read()

+
+def get_source(m):
+    return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
+
+
 module_contents = [
-    module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
+    module_template,
+    get_source(InfoExtractor.suitable),
+    get_source(InfoExtractor._match_valid_url) + '\n',
     'class LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n',
     # needed for suitable() methods of Youtube extractor (see #28780)
-    'from youtube_dl.utils import parse_qs\n',
+    'from youtube_dl.utils import parse_qs, variadic\n',
 ]

 ie_template = '''
@@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
         valid_url=valid_url,
         module=ie.__module__)
     if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
-        s += '\n' + getsource(ie.suitable)
+        s += '\n' + get_source(ie.suitable)
     if hasattr(ie, '_make_valid_url'):
         # search extractors
         s += make_valid_template.format(valid_url=ie._make_valid_url())

test/test_InfoExtractor.py

@@ -7,15 +7,33 @@ import io
 import os
 import sys
 import unittest

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
-from youtube_dl.compat import compat_etree_fromstring, compat_http_server
-from youtube_dl.extractor.common import InfoExtractor
-from youtube_dl.extractor import YoutubeIE, get_info_extractor
-from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
 import threading

+from test.helper import (
+    expect_dict,
+    expect_value,
+    FakeYDL,
+    http_server_port,
+)
+from youtube_dl.compat import (
+    compat_etree_fromstring,
+    compat_http_server,
+)
+from youtube_dl.extractor.common import InfoExtractor
+from youtube_dl.extractor import (
+    get_info_extractor,
+    YoutubeIE,
+)
+from youtube_dl.utils import (
+    encode_data_uri,
+    ExtractorError,
+    RegexNotFoundError,
+    strip_jsonp,
+)

 TEAPOT_RESPONSE_STATUS = 418
 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
@@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)

+    def test_search_nextjs_data(self):
+        html = '''
+        <!DOCTYPE html>
+        <html>
+        <head>
+        <meta http-equiv="content-type" content=
+        "text/html; charset=utf-8">
+        <meta name="viewport" content="width=device-width">
+        <title>Test _search_nextjs_data()</title>
+        </head>
+        <body>
+        <div id="__next">
+        <div style="background-color:#17171E" class="FU" dir="ltr">
+        <div class="sc-93de261d-0 dyzzYE">
+        <div>
+        <header class="HD"></header>
+        <main class="MN">
+        <div style="height:0" class="HT0">
+        <div style="width:NaN%" data-testid=
+        "stream-container" class="WDN"></div>
+        </div>
+        </main>
+        </div>
+        <footer class="sc-6e5faf91-0 dEGaHS"></footer>
+        </div>
+        </div>
+        </div>
+        <script id="__NEXT_DATA__" type="application/json">
+        {"props":{"pageProps":{"video":{"id":"testid"}}}}
+        </script>
+        </body>
+        </html>
+        '''
+        search = self.ie._search_nextjs_data(html, 'testID')
+        self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
+
+    def test_search_nuxt_data(self):
+        html = '''
+        <!DOCTYPE html>
+        <html>
+        <head>
+        <meta http-equiv="content-type" content=
+        "text/html; charset=utf-8">
+        <title>Nuxt.js Test Page</title>
+        <meta name="viewport" content=
+        "width=device-width, initial-scale=1">
+        <meta data-hid="robots" name="robots" content="all">
+        </head>
+        <body class="BD">
+        <div id="__layout">
+        <h1 class="H1">Example heading</h1>
+        <div class="IN">
+        <p>Decoy text</p>
+        </div>
+        </div>
+        <script>
+        window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null));
+        </script>
+        <script src="/_nuxt/a12345b.js" defer="defer"></script>
+        </body>
+        </html>
+        '''
+        search = self.ie._search_nuxt_data(html, 'testID')
+        self.assertEqual(search['track']['id'], 'testid')
+
     def test_search_json_ld_realworld(self):
         # https://github.com/ytdl-org/youtube-dl/issues/23306
         expect_dict(
@@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):
             }],
         })

+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(

test/test_utils.py

@@ -20,7 +20,7 @@ import xml.etree.ElementTree
 from youtube_dl.utils import (
     age_restricted,
     args_to_str,
-    encode_base_n,
+    base_url,
     caesar,
     clean_html,
     clean_podcast_url,
@@ -29,10 +29,12 @@ from youtube_dl.utils import (
     detect_exe_version,
     determine_ext,
     dict_get,
+    encode_base_n,
     encode_compat_str,
     encodeFilename,
     escape_rfc3986,
     escape_url,
+    expand_path,
     extract_attributes,
     ExtractorError,
     find_xpath_attr,
@@ -51,6 +53,7 @@ from youtube_dl.utils import (
     js_to_json,
     LazyList,
     limit_length,
+    lowercase_escape,
     merge_dicts,
     mimetype2ext,
     month_by_name,
@@ -66,17 +69,16 @@ from youtube_dl.utils import (
     parse_resolution,
     parse_bitrate,
     pkcs1pad,
-    read_batch_urls,
-    sanitize_filename,
-    sanitize_path,
-    sanitize_url,
-    expand_path,
     prepend_extension,
-    replace_extension,
+    read_batch_urls,
     remove_start,
     remove_end,
     remove_quotes,
+    replace_extension,
     rot47,
+    sanitize_filename,
+    sanitize_path,
+    sanitize_url,
     shell_quote,
     smuggle_url,
     str_or_none,
@@ -93,10 +95,8 @@ from youtube_dl.utils import (
     unified_timestamp,
     unsmuggle_url,
     uppercase_escape,
-    lowercase_escape,
     url_basename,
     url_or_none,
-    base_url,
     urljoin,
     urlencode_postdata,
     urshift,
@@ -905,6 +905,85 @@ class TestUtil(unittest.TestCase):
         )
         self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')

+    def test_js_to_json_vars_strings(self):
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'null': a,
+                    'nullStr': b,
+                    'true': c,
+                    'trueStr': d,
+                    'false': e,
+                    'falseStr': f,
+                    'unresolvedVar': g,
+                }''',
+                {
+                    'a': 'null',
+                    'b': '"null"',
+                    'c': 'true',
+                    'd': '"true"',
+                    'e': 'false',
+                    'f': '"false"',
+                    'g': 'var',
+                }
+            )),
+            {
+                'null': None,
+                'nullStr': 'null',
+                'true': True,
+                'trueStr': 'true',
+                'false': False,
+                'falseStr': 'false',
+                'unresolvedVar': 'var'
+            }
+        )
+
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'int': a,
+                    'intStr': b,
+                    'float': c,
+                    'floatStr': d,
+                }''',
+                {
+                    'a': '123',
+                    'b': '"123"',
+                    'c': '1.23',
+                    'd': '"1.23"',
+                }
+            )),
+            {
+                'int': 123,
+                'intStr': '123',
+                'float': 1.23,
+                'floatStr': '1.23',
+            }
+        )
+
+        self.assertDictEqual(
+            json.loads(js_to_json(
+                '''{
+                    'object': a,
+                    'objectStr': b,
+                    'array': c,
+                    'arrayStr': d,
+                }''',
+                {
+                    'a': '{}',
+                    'b': '"{}"',
+                    'c': '[]',
+                    'd': '"[]"',
+                }
+            )),
+            {
+                'object': {},
+                'objectStr': '{}',
+                'array': [],
+                'arrayStr': '[]',
+            }
+        )
+
     def test_js_to_json_realworld(self):
         inp = '''{
             'clip':{'provider':'pseudo'}
@@ -975,10 +1054,10 @@ class TestUtil(unittest.TestCase):
             !42: 42
         }''')
         self.assertEqual(json.loads(on), {
-            'a': 0,
-            'b': 1,
-            'c': 0,
-            'd': 42.42,
+            'a': True,
+            'b': False,
+            'c': False,
+            'd': True,
             'e': [],
             'f': "abc",
             'g': "",
@@ -1048,10 +1127,26 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{ "040": "040" }')
         self.assertEqual(json.loads(on), {'040': '040'})

+        on = js_to_json('[1,//{},\n2]')
+        self.assertEqual(json.loads(on), [1, 2])
+
+        on = js_to_json(r'"\^\$\#"')
+        self.assertEqual(json.loads(on), R'^$#', msg='Unnecessary escapes should be stripped')
+
+        on = js_to_json('\'"\\""\'')
+        self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')

+    def test_js_to_json_template_literal(self):
+        self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '"X"'}), '"XX"')
+        self.assertEqual(js_to_json('`${name}${name}`', {'name': '5'}), '"55"')
+        self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
+        self.assertEqual(js_to_json('`${name}`', {}), '"name"')
+
     def test_extract_attributes(self):
         self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
         self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
@@ -1586,6 +1681,11 @@ Line 1
             'dict': {},
         }

+        # define a pukka Iterable
+        def iter_range(stop):
+            for from_ in range(stop):
+                yield from_
+
         # Test base functionality
         self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str',
                          msg='allow tuple path')
@@ -1602,13 +1702,13 @@ Line 1
         # Test Ellipsis behavior
         self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis),
                               (item for item in _TEST_DATA.values() if item not in (None, {})),
-                              msg='`...` should give all non discarded values')
+                              msg='`...` should give all non-discarded values')
         self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(),
                               msg='`...` selection for dicts should select all values')
         self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')),
                          ['https://www.example.com/0', 'https://www.example.com/1'],
                          msg='nested `...` queries should work')
-        self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4),
+        self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), iter_range(4),
                               msg='`...` query result should be flattened')
         self.assertEqual(traverse_obj(iter(range(4)), Ellipsis), list(range(4)),
                          msg='`...` should accept iterables')
@@ -1618,7 +1718,7 @@ Line 1
                          [_TEST_DATA['urls']],
                          msg='function as query key should perform a filter based on (key, value)')
         self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), set(('str',)),
-                              msg='exceptions in the query function should be catched')
+                              msg='exceptions in the query function should be caught')
         self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2],
                          msg='function key should accept iterables')
         if __debug__:
@@ -1706,7 +1806,7 @@ Line 1
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {},
                          msg='remove empty values when dict key')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: Ellipsis},
-                         msg='use `default` when dict key and `default`')
+                         msg='use `default` when dict key and a default')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {},
                          msg='remove empty values when nested dict key fails')
         self.assertEqual(traverse_obj(None, {0: 'fail'}), {},
@@ -1768,7 +1868,7 @@ Line 1
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str),
                          'str', msg='accept matching `expected_type` type')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int),
-                         None, msg='reject non matching `expected_type` type')
+                         None, msg='reject non-matching `expected_type` type')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)),
                          '0', msg='transform type using type function')
         self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0),
@@ -1780,7 +1880,7 @@ Line 1
         self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none),
                          {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, set((int_or_none,))), expected_type=int),
-                         1, msg='expected_type should not filter non final dict values')
+                         1, msg='expected_type should not filter non-final dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int),
                          {0: {0: 100}}, msg='expected_type should transform deep dict values')
         self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(Ellipsis)),
@@ -1838,7 +1938,7 @@ Line 1
         self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)),
                                       _traverse_string=True), 'sr',
                          msg='`slice` should result in string if `traverse_string`')
-        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"),
+        self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == 's'),
                                       _traverse_string=True), 'str',
                          msg='function should result in string if `traverse_string`')
         self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)),

youtube_dl/extractor/clipchamp.py (new file, 69 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    ExtractorError,
    merge_dicts,
    T,
    traverse_obj,
    unified_timestamp,
    url_or_none,
)


class ClipchampIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
        'info_dict': {
            'id': 'gRXZ4ZhdDaU',
            'ext': 'mp4',
            'title': 'Untitled video',
            'uploader': 'Alexander Schwartz',
            'timestamp': 1680805580,
            'upload_date': '20230406',
            'thumbnail': r're:^https?://.+\.jpg',
        },
        'params': {
            'skip_download': 'm3u8',
            'format': 'bestvideo',
        },
    }]

    _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
    _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']

        storage_location = data.get('storage_location')
        if storage_location != 'cf_stream':
            raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))

        path = data['download_url']
        iframe = self._download_webpage(
            'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe')
        subdomain = self._search_regex(
            r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''', iframe,
            'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'

        formats = self._extract_mpd_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
            query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
        formats.extend(self._extract_m3u8_formats(
            self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
            query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))

        return merge_dicts({
            'id': video_id,
            'formats': formats,
            'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None,
        }, traverse_obj(data, {
            'title': ('project', 'project_name', T(compat_str)),
            'timestamp': ('created_at', T(unified_timestamp)),
            'thumbnail': ('thumbnail_url', T(url_or_none)),
        }), rev=True)

youtube_dl/extractor/common.py

@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 import base64
 import datetime
+import functools
 import hashlib
 import json
 import netrc
@@ -23,6 +24,7 @@ from ..compat import (
     compat_getpass,
     compat_integer_types,
     compat_http_client,
+    compat_map as map,
     compat_os_name,
     compat_str,
     compat_urllib_error,
@@ -31,6 +33,7 @@ from ..compat import (
     compat_urllib_request,
     compat_urlparse,
     compat_xml_parse_error,
+    compat_zip as zip,
 )
 from ..downloader.f4m import (
     get_base_url,
@@ -70,6 +73,7 @@ from ..utils import (
     str_or_none,
     str_to_int,
     strip_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     unified_strdate,
@@ -79,6 +83,7 @@ from ..utils import (
     urljoin,
     url_basename,
     url_or_none,
+    variadic,
     xpath_element,
     xpath_text,
     xpath_with_ns,
@@ -367,9 +372,22 @@ class InfoExtractor(object):
     title, description etc.

-    Subclasses of this one should re-define the _real_initialize() and
-    _real_extract() methods and define a _VALID_URL regexp.
-    Probably, they should also be added to the list of extractors.
+    A subclass of InfoExtractor must be defined to handle each specific site (or
+    several sites). Such a concrete subclass should be added to the list of
+    extractors. It should also:
+    * define its _VALID_URL attribute as a regexp, or a Sequence of alternative
+      regexps (but see below)
+    * re-define the _real_extract() method
+    * optionally re-define the _real_initialize() method.
+
+    An extractor subclass may also override suitable() if necessary, but the
+    function signature must be preserved and the function must import everything
+    it needs (except other extractors), so that lazy_extractors works correctly.
+    If the subclass's suitable() and _real_extract() functions avoid using
+    _VALID_URL, the subclass need not set that class attribute.
+
+    An abstract subclass of InfoExtractor may be used to simplify implementation
+    within an extractor module; it should not be added to the list of extractors.

     _GEO_BYPASS attribute may be set to False in order to disable
     geo restriction bypass mechanisms for a particular extractor.
@@ -404,22 +422,33 @@ class InfoExtractor(object):
         self._x_forwarded_for_ip = None
         self.set_downloader(downloader)

+    @classmethod
+    def __match_valid_url(cls, url):
+        # This does not use has/getattr intentionally - we want to know whether
+        # we have cached the regexp for cls, whereas getattr would also
+        # match its superclass
+        if '_VALID_URL_RE' not in cls.__dict__:
+            # _VALID_URL can now be a list/tuple of patterns
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
+        for p in cls._VALID_URL_RE:
+            p = p.match(url)
+            if p:
+                return p
+
+    # The public alias can safely be overridden, as in some back-ports
+    _match_valid_url = __match_valid_url
+
     @classmethod
     def suitable(cls, url):
         """Receives a URL and returns True if suitable for this IE."""
-
-        # This does not use has/getattr intentionally - we want to know whether
-        # we have cached the regexp for *this* class, whereas getattr would also
-        # match the superclass
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url) is not None
+        # This function must import everything it needs (except other extractors),
+        # so that lazy_extractors works correctly
+        return cls.__match_valid_url(url) is not None

     @classmethod
     def _match_id(cls, url):
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        m = cls._VALID_URL_RE.match(url)
+        m = cls.__match_valid_url(url)
         assert m
         return compat_str(m.group('id'))
@@ -1005,6 +1034,8 @@ class InfoExtractor(object):
             if group is None:
                 # return the first matching group
                 return next(g for g in mobj.groups() if g is not None)
+            elif isinstance(group, (list, tuple)):
+                return tuple(mobj.group(g) for g in group)
             else:
                 return mobj.group(group)
         elif default is not NO_DEFAULT:
@@ -1020,10 +1051,9 @@ class InfoExtractor(object):
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
-        if res:
-            return clean_html(res).strip()
-        else:
-            return res
+        if isinstance(res, tuple):
+            return tuple(map(clean_html, res))
+        return clean_html(res)

     def _get_netrc_login_info(self, netrc_machine=None):
         username = None
@@ -1348,6 +1378,44 @@ class InfoExtractor(object):
                 break
         return dict((k, v) for k, v in info.items() if v is not None)

+    def _search_nextjs_data(self, webpage, video_id, **kw):
+        nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
+        kw.pop('transform_source', None)
+        next_data = self._search_regex(
+            r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
+            webpage, 'next.js data', group='nd', **kw)
+        if not next_data:
+            return {}
+        return self._parse_json(next_data, video_id, **nkw)
+
+    def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
+        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
+
+        # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
+        context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
+        fatal = kwargs.get('fatal', True)
+        traverse = kwargs.get('traverse', ('data', 0))
+
+        re_ctx = re.escape(context_name)
+
+        FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
+                       r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
+
+        js, arg_keys, arg_vals = self._search_regex(
+            (p.format(re_ctx, FUNCTION_RE) for p in
+             (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
+              r'{0}\s*\([\s\S]*?{1}')),
+            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
+            default=NO_DEFAULT if fatal else (None, None, None))
+        if js is None:
+            return {}
+
+        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+            '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
+
+        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+        return traverse_obj(ret, traverse) or {}
+
     @staticmethod
     def _hidden_inputs(html):
         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -2495,7 +2563,8 @@ class InfoExtractor(object):
                     return f
             return {}

-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
             full_url = absolute_url(src)
             ext = type_info.get('ext') or determine_ext(full_url)
             if ext == 'm3u8':
@@ -2513,6 +2582,7 @@ class InfoExtractor(object):
                 formats = [{
                     'url': full_url,
                     'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                 }]
             return is_plain_url, formats

@@ -2521,7 +2591,7 @@ class InfoExtractor(object):
         # so we wll include them right here (see
         # https://www.ampproject.org/docs/reference/components/amp-video)
         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
-        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
         media_tags = [(media_tag, media_tag_name, media_type, '')
                       for media_tag, media_tag_name, media_type
                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@@ -2539,7 +2609,8 @@ class InfoExtractor(object):
             media_attributes = extract_attributes(media_tag)
             src = strip_or_none(media_attributes.get('src'))
             if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
                 media_info['formats'].extend(formats)
             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
         if media_content:

youtube_dl/extractor/dlf.py (new file, 204 lines)

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_str,
)
from ..utils import (
    determine_ext,
    extract_attributes,
    int_or_none,
    merge_dicts,
    traverse_obj,
    url_or_none,
    variadic,
)


class DLFBaseIE(InfoExtractor):
    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'

    def _parse_button_attrs(self, button, audio_id=None):
        attrs = extract_attributes(button)
        audio_id = audio_id or attrs['data-audio-diraid']

        url = traverse_obj(
            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
            'data-audio-src', expected_type=url_or_none)
        ext = determine_ext(url)
        formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
                   if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
        self._sort_formats(formats)

        def traverse_attrs(path):
            path = list(variadic(path))
            t = path.pop() if callable(path[-1]) else None
            return traverse_obj(attrs, path, expected_type=t, get_all=False)

        def txt_or_none(v, default=None):
            return default if v is None else (compat_str(v).strip() or default)

        return merge_dicts(*reversed([{
            'id': audio_id,
            # 'extractor_key': DLFIE.ie_key(),
            # 'extractor': DLFIE.IE_NAME,
            'formats': formats,
        }, dict((k, traverse_attrs(v)) for k, v in {
            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
            'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
            'thumbnail': ('data-audioimage', url_or_none),
            'uploader': 'data-audio-producer',
            'series': 'data-audio-series',
            'channel': 'data-audio-origin-site-name',
            'webpage_url': ('data-audio-download-tracking-path', url_or_none),
        }.items())]))


class DLFIE(DLFBaseIE):
    IE_NAME = 'dlf'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
    _TESTS = [
        # Audio as an HLS stream
        {
            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
            'info_dict': {
                'id': '03a3eb19',
                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
                'ext': 'm4a',
                'duration': 3298,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'On Stage',
                'channel': 'deutschlandfunk'
            },
            'params': {
                'skip_download': 'm3u8'
            },
            'skip': 'This webpage no longer exists'
        }, {
            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
            'info_dict': {
                'id': 'd9cc1856',
                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
                'ext': 'mp3',
                'duration': 291,
                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
                'uploader': 'Deutschlandfunk',
                'series': 'Kommentare und Themen der Woche',
                'channel': 'deutschlandfunk'
            }
        },
    ]

    def _real_extract(self, url):
        audio_id = self._match_id(url)
        webpage = self._download_webpage(url, audio_id)

        return self._parse_button_attrs(
            self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)


class DLFCorpusIE(DLFBaseIE):
    IE_NAME = 'dlf:corpus'
    IE_DESC = 'DLF Multi-feed Archives'
    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
    _TESTS = [
        # Recorded news broadcast with referrals to related broadcasts
        {
            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
            'info_dict': {
                'id': 'fechten-russland-belarus-ukraine-protest-100',
                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
            },
            'playlist_mincount': 5,
            'playlist': [{
                'info_dict': {
                    'id': '1fc5d64a',
                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
                    'ext': 'mp3',
                    'duration': 252,
                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '2ada145f',
                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
                    'ext': 'mp3',
                    'duration': 336,
                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Deutschlandfunk Nova',
                    'channel': 'deutschlandfunk-nova'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '47e1a096',
                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
                    'ext': 'mp3',
                    'duration': 602,
                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }, {
                'info_dict': {
                    'id': '5e55e8c9',
                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
                    'ext': 'mp3',
                    'duration': 187,
                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
                    'uploader': 'Deutschlandfunk',
                    'series': 'Sport am Samstag',
                    'channel': 'deutschlandfunk'
                }
            }]
        },
        # Podcast feed with tag buttons, playlist count fluctuates
        {
            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
            'info_dict': {
                'id': 'kommentare-und-themen-der-woche-100',
                'title': 'Meinung - Kommentare und Themen der Woche',
                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
            },
            'playlist_mincount': 10,
        },
        # Podcast feed with no description
        {
            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
            'info_dict': {
                'id': 'podcast-tolle-idee-100',
                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
            },
            'playlist_mincount': 11,
        },
    ]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        return self.playlist_result(
            map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
            playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
            self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))

youtube_dl/extractor/extractors.py

@@ -226,6 +226,7 @@ from .ciscolive import (
     CiscoLiveSearchIE,
 )
 from .cjsw import CJSWIE
+from .clipchamp import ClipchampIE
 from .cliphunter import CliphunterIE
 from .clippit import ClippitIE
 from .cliprs import ClipRsIE
@@ -295,6 +296,10 @@ from .dbtv import DBTVIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .democracynow import DemocracynowIE
+from .dlf import (
+    DLFCorpusIE,
+    DLFIE,
+)
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .digg import DiggIE
@@ -444,6 +449,13 @@ from .gfycat import GfycatIE
 from .giantbomb import GiantBombIE
 from .giga import GigaIE
 from .glide import GlideIE
+from .globalplayer import (
+    GlobalPlayerLiveIE,
+    GlobalPlayerLivePlaylistIE,
+    GlobalPlayerAudioIE,
+    GlobalPlayerAudioEpisodeIE,
+    GlobalPlayerVideoIE
+)
 from .globo import (
     GloboIE,
     GloboArticleIE,
@@ -975,6 +987,10 @@ from .pornhub import (
 from .pornotube import PornotubeIE
 from .pornovoisines import PornoVoisinesIE
 from .pornoxo import PornoXOIE
+from .pr0gramm import (
+    Pr0grammIE,
+    Pr0grammStaticIE,
+)
 from .puhutv import (
     PuhuTVIE,
     PuhuTVSerieIE,
@@ -1565,6 +1581,7 @@ from .weibo import (
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
+from .whyp import WhypIE
 from .wistia import (
     WistiaIE,
     WistiaPlaylistIE,
@@ -1678,7 +1695,3 @@ from .zingmp3 import (
 )
 from .zoom import ZoomIE
 from .zype import ZypeIE
-from .pr0gramm import (
-    Pr0grammIE,
-    Pr0grammStaticIE,
-)

youtube_dl/extractor/globalplayer.py (new file, 273 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    clean_html,
    join_nonempty,
    merge_dicts,
    parse_duration,
    str_or_none,
    T,
    traverse_obj,
    unified_strdate,
    unified_timestamp,
    urlhandle_detect_ext,
)


class GlobalPlayerBaseIE(InfoExtractor):

    def _get_page_props(self, url, video_id):
        webpage = self._download_webpage(url, video_id)
        return self._search_nextjs_data(webpage, video_id)['props']['pageProps']

    def _request_ext(self, url, video_id):
        return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests
            url, video_id, note='Determining source extension'))

    @staticmethod
    def _clean_desc(x):
        x = clean_html(x)
        if x:
            x = x.replace('\xa0', ' ')
        return x

    def _extract_audio(self, episode, series):
        return merge_dicts({
            'vcodec': 'none',
        }, traverse_obj(series, {
            'series': 'title',
            'series_id': 'id',
            'thumbnail': 'imageUrl',
            'uploader': 'itunesAuthor',  # podcasts only
        }), traverse_obj(episode, {
            'id': 'id',
            'description': ('description', T(self._clean_desc)),
            'duration': ('duration', T(parse_duration)),
            'thumbnail': 'imageUrl',
            'url': 'streamUrl',
            'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
            'title': 'title',
        }, get_all=False), rev=True)


class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
        'info_dict': {
            'id': '2mx1E',
            'ext': 'aac',
            'display_id': 'smoothchill-uk',
            'title': 're:^Smooth Chill.+$',
            'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
            'description': 'Music To Chill To',
            # 'live_status': 'is_live',
            'is_live': True,
        },
    }, {
        # national station
        'url': 'https://www.globalplayer.com/live/heart/uk/',
        'info_dict': {
            'id': '2mwx4',
            'ext': 'aac',
            'description': 'turn up the feel good!',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            # 'live_status': 'is_live',
            'is_live': True,
            'title': 're:^Heart UK.+$',
            'display_id': 'heart-uk',
        },
    }, {
        # regional variation
        'url': 'https://www.globalplayer.com/live/heart/london/',
        'info_dict': {
            'id': 'AMqg',
            'ext': 'aac',
            'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
            'title': 're:^Heart London.+$',
            # 'live_status': 'is_live',
            'is_live': True,
            'display_id': 'heart-london',
            'description': 'turn up the feel good!',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['station']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': station['id'],
            'display_id': (
                join_nonempty('brandSlug', 'slug', from_dict=station)
                or station.get('legacyStationPrefix')),
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, {
            'title': self._live_title(traverse_obj(
                station, (('name', 'brandName'), T(str_or_none)),
                get_all=False)),
        }, traverse_obj(station, {
            'description': 'tagline',
            'thumbnail': 'brandLogo',
        }), rev=True)


class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
    _TESTS = [{
        # "live playlist"
        'url': 'https://www.globalplayer.com/playlists/8bLk/',
        'info_dict': {
            'id': '8bLk',
            'ext': 'aac',
            # 'live_status': 'is_live',
            'is_live': True,
            'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
            'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
            'title': 're:Classic FM Hall of Fame.+$'
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        station = self._get_page_props(url, video_id)['playlistData']
        stream_url = station['streamUrl']

        return merge_dicts({
            'id': video_id,
            'url': stream_url,
            'ext': self._request_ext(stream_url, video_id),
            'vcodec': 'none',
            'is_live': True,
        }, traverse_obj(station, {
            'title': 'title',
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'image',
        }), rev=True)


class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '42KuaM',
            'title': 'Filthy Ritual',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'categories': ['Society & Culture', 'True Crime'],
            'uploader': 'Global',
            'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
        'playlist_mincount': 2,
        'info_dict': {
            'id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
        },
    }]

    def _real_extract(self, url):
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        series = props['podcastInfo'] if podcast else props['catchupInfo']

        return merge_dicts({
            '_type': 'playlist',
            'id': video_id,
            'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
                series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
            'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
        }, traverse_obj(series, {
            'description': ('description', T(self._clean_desc)),
            'thumbnail': 'imageUrl',
            'title': 'title',
            'uploader': 'itunesAuthor',  # podcasts only
        }), rev=True)


class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
    _TESTS = [{
        # podcast
        'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
        'info_dict': {
            'id': '7DrfNnE',
            'ext': 'mp3',
            'title': 'Filthy Ritual - Trailer',
            'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
            'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
            'duration': 225.0,
            'timestamp': 1681254900,
            'series': 'Filthy Ritual',
            'series_id': '42KuaM',
            'upload_date': '20230411',
            'uploader': 'Global',
        },
    }, {
        # radio catchup
        'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
        'only_matching': True,
        # expired: refresh the details with a current show for a full test
        'info_dict': {
            'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
            'ext': 'm4a',
            'timestamp': 1682056800,
            'series': 'Nick Ferrari',
            'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
            'upload_date': '20230421',
            'series_id': '46vyD7z',
            'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
            'title': 'Nick Ferrari',
            'duration': 10800.0,
        },
    }]

    def _real_extract(self, url):
        video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
        props = self._get_page_props(url, video_id)
        episode = props['podcastEpisode'] if podcast else props['catchupEpisode']

        return self._extract_audio(
            episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})


class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
    _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
        'info_dict': {
            'id': '2JsSZ7Gm2uP',
            'ext': 'mp4',
            'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
            'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
            'upload_date': '20230420',
            'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        meta = self._get_page_props(url, video_id)['videoData']

        return merge_dicts({
            'id': video_id,
        }, traverse_obj(meta, {
            'url': 'url',
            'thumbnail': ('image', 'url'),
            'title': 'title',
            'upload_date': ('publish_date', T(unified_strdate)),
            'description': 'description',
        }), rev=True)

youtube_dl/extractor/whyp.py (new file, 55 lines)

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    merge_dicts,
    str_or_none,
    T,
    traverse_obj,
    url_or_none,
)


class WhypIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
        'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
        'info_dict': {
            'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
            'id': '18337',
            'title': 'Home Page Example Track',
            'description': r're:(?s).+\bexample track\b',
            'ext': 'mp3',
            'duration': 52.82,
            'uploader': 'Brad',
            'uploader_id': '1',
            'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
        },
    }, {
        'url': 'https://www.whyp.it/tracks/18337',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        unique_id = self._match_id(url)
        webpage = self._download_webpage(url, unique_id)
        data = self._search_nuxt_data(webpage, unique_id)['rawTrack']

        return merge_dicts({
            'url': data['audio_url'],
            'id': unique_id,
        }, traverse_obj(data, {
            'title': 'title',
            'description': 'description',
            'duration': ('duration', T(float_or_none)),
            'uploader': ('user', 'username'),
            'uploader_id': ('user', 'id', T(str_or_none)),
            'thumbnail': ('artwork_url', T(url_or_none)),
        }), {
            'ext': 'mp3',
            'vcodec': 'none',
            'http_headers': {'Referer': 'https://whyp.it/'},
        }, rev=True)

youtube_dl/utils.py

@@ -2996,7 +2996,8 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
         # Technically the Cookie header should be in unredirected_hdrs;
         # however in practice some may set it in normal headers anyway.
         # We will remove it here to prevent any leaks.
-        remove_headers = ['Cookie']
+        # Also remove unwanted and undocumented Host header for old URL
+        remove_headers = ['Cookie', 'Host']

         # A 303 must either use GET or HEAD for subsequent request
         # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
@@ -4268,13 +4269,9 @@ def variadic(x, allowed_types=NO_DEFAULT):

 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
-    if isinstance(key_or_keys, (list, tuple)):
-        for key in key_or_keys:
-            if key not in d or d[key] is None or skip_false_values and not d[key]:
-                continue
-            return d[key]
-        return default
-    return d.get(key_or_keys, default)
+    exp = (lambda x: x or None) if skip_false_values else IDENTITY
+    return traverse_obj(d, *variadic(key_or_keys), expected_type=exp,
+                        default=default, get_all=False)


 def try_call(*funcs, **kwargs):
@@ -4307,16 +4304,38 @@ def try_get(src, getter, expected_type=None):
     return v


-def merge_dicts(*dicts):
+def merge_dicts(*dicts, **kwargs):
+    """
+    Merge the `dict`s in `dicts` using the first valid value for each key.
+
+    Normally valid: not None and not an empty string
+
+    Keyword-only args:
+    unblank: allow empty string if False (default True)
+    rev: merge dicts in reverse order (default False)
+
+    merge_dicts(dct1, dct2, ..., unblank=False, rev=True)
+    matches {**dct1, **dct2, ...}
+
+    However, merge_dicts(dct1, dct2, ..., rev=True) may often be better.
+    """
+
+    unblank = kwargs.get('unblank', True)
+    rev = kwargs.get('rev', False)
+
+    if unblank:
+        def can_merge_str(k, v, to_dict):
+            return (isinstance(v, compat_str) and v
+                    and isinstance(to_dict[k], compat_str)
+                    and not to_dict[k])
+    else:
+        can_merge_str = lambda k, v, to_dict: False
+
     merged = {}
-    for a_dict in dicts:
+    for a_dict in reversed(dicts) if rev else dicts:
         for k, v in a_dict.items():
             if v is None:
                 continue
-            if (k not in merged
-                    or (isinstance(v, compat_str) and v
-                        and isinstance(merged[k], compat_str)
-                        and not merged[k])):
+            if (k not in merged) or can_merge_str(k, v, merged):
                 merged[k] = v
     return merged
@@ -4370,46 +4389,108 @@ def strip_jsonp(code):
         r'\g<callback_data>', code)


-def js_to_json(code):
-    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
+def js_to_json(code, *args, **kwargs):
+
+    # vars is a dict of (var, val) pairs to substitute
+    vars = args[0] if len(args) > 0 else kwargs.get('vars', {})
+    strict = kwargs.get('strict', False)
+
+    STRING_QUOTES = '\'"`'
+    STRING_RE = '|'.join(r'{0}(?:\\.|[^\\{0}])*{0}'.format(q) for q in STRING_QUOTES)
+
+    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
     SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
     INTEGER_TABLE = (
         (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
         (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+        (r'(?s)^(\d+){skip}:?$'.format(skip=SKIP_RE), 10),
     )

+    # compat candidate
+    JSONDecodeError = json.JSONDecodeError if 'JSONDecodeError' in dir(json) else ValueError
+
+    def process_escape(match):
+        JSON_PASSTHROUGH_ESCAPES = r'"\bfnrtu'
+        escape = match.group(1) or match.group(2)
+        return ('\\' + escape if escape in JSON_PASSTHROUGH_ESCAPES
+                else '\\u00' if escape == 'x'
+                else '' if escape == '\n'
+                else escape)
+
+    def template_substitute(match):
+        evaluated = js_to_json(match.group(1), vars, strict=strict)
+        if evaluated[0] == '"':
+            return json.loads(evaluated)
+        return evaluated
+
     def fix_kv(m):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
             return v
-        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
-            return ""
+        elif v in ('undefined', 'void 0'):
+            return 'null'
+        elif v.startswith('/*') or v.startswith('//') or v == ',':
+            return ''

-        if v[0] in ("'", '"'):
-            v = re.sub(r'(?s)\\.|"', lambda m: {
-                '"': '\\"',
-                "\\'": "'",
-                '\\\n': '',
-                '\\x': '\\u00',
-            }.get(m.group(0), m.group(0)), v[1:-1])
-        else:
-            for regex, base in INTEGER_TABLE:
-                im = re.match(regex, v)
-                if im:
-                    i = int(im.group(1), base)
-                    return '"%d":' % i if v.endswith(':') else '%d' % i
+        if v[0] in STRING_QUOTES:
+            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
+            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
+            return '"{0}"'.format(escaped)

-        return '"%s"' % v
+        inv = IDENTITY
+        im = re.split(r'^!+', v)
+        if len(im) > 1 and not im[-1].endswith(':'):
+            if (len(v) - len(im[1])) % 2 == 1:
+                inv = lambda x: 'true' if x == 0 else 'false'
+            else:
+                inv = lambda x: 'false' if x == 0 else 'true'
+            if not any(x for x in im):
+                return
+            v = im[-1]
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(1), base)
+                return ('"%s":' if v.endswith(':') else '%s') % inv(i)
+
+        if v in vars:
+            try:
+                if not strict:
+                    json.loads(vars[v])
+            except JSONDecodeError:
+                return inv(json.dumps(vars[v]))
+            else:
+                return inv(vars[v])
+
+        if not strict:
+            v = try_call(inv, args=(v,), default=v)
+            if v in ('true', 'false'):
+                return v
+            return '"{0}"'.format(v)
+
+        raise ValueError('Unknown value: ' + v)
+
+    def create_map(mobj):
+        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
+
+    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
+    if not strict:
+        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
+        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
+        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

     return re.sub(r'''(?sx)
-        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
-        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)|
+        {str_}|
+        {comment}|
+        ,(?={skip}[\]}}])|
+        void\s0|
+        !*(?:(?<!\d)[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
+        (?:\b|!+)0(?:[xX][\da-fA-F]+|[0-7]+)(?:{skip}:)?|
+        !+\d+(?:\.\d*)?(?:{skip}:)?|
+        [0-9]+(?:{skip}:)|
         !+
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        '''.format(comment=COMMENT_RE, skip=SKIP_RE, str_=STRING_RE), fix_kv, code)


 def qualities(quality_ids):
@@ -6029,6 +6110,37 @@ def clean_podcast_url(url):
         )/''', '', url)


+if __debug__:
+    # Raise TypeError if args can't be bound
+    # needs compat owing to unstable inspect API, thanks PSF :-(
+    try:
+        inspect.signature
+
+        def _try_bind_args(fn, *args, **kwargs):
+            inspect.signature(fn).bind(*args, **kwargs)
+    except AttributeError:
+        # Py < 3.3
+        def _try_bind_args(fn, *args, **kwargs):
+            fn_args = inspect.getargspec(fn)
+            # Py2: ArgInfo(args, varargs, keywords, defaults)
+            # Py3: ArgSpec(args, varargs, keywords, defaults)
+            if not fn_args.keywords:
+                for k in kwargs:
+                    if k not in (fn_args.args or []):
+                        raise TypeError("got an unexpected keyword argument: '{0}'".format(k))
+            if not fn_args.varargs:
+                args_to_bind = len(args)
+                bindable = len(fn_args.args or [])
+                if args_to_bind > bindable:
+                    raise TypeError('too many positional arguments')
+                bindable -= len(fn_args.defaults or [])
+                if args_to_bind < bindable:
+                    if kwargs:
+                        bindable -= len(set(fn_args.args or []) & set(kwargs))
+                    if bindable > args_to_bind:
+                        raise TypeError("missing a required argument: '{0}'".format(fn_args.args[args_to_bind]))
+
+
 def traverse_obj(obj, *paths, **kwargs):
     """
     Safely traverse nested `dict`s and `Iterable`s
@@ -6247,10 +6359,7 @@ def traverse_obj(obj, *paths, **kwargs):

             if __debug__ and callable(key):
                 # Verify function signature
-                args = inspect.getargspec(key)
-                if len(args.args) != 2:
-                    # crash differently in 2.6 !
-                    inspect.getcallargs(key, None, None)
+                _try_bind_args(key, None, None)

         new_objs = []
         for obj in objs: