Compare commits

..

No commits in common. "abef53466da1f7d2e79f5644718a2cf7524abc49" and "0861812d7208310a03909502b1610f5e89d04401" have entirely different histories.

6 changed files with 85 additions and 203 deletions

View File

@ -278,7 +278,7 @@ jobs:
#-------- Jython ------ #-------- Jython ------
- name: Set up Java 8 - name: Set up Java 8
if: ${{ matrix.python-impl == 'jython' }} if: ${{ matrix.python-impl == 'jython' }}
uses: actions/setup-java@v3 uses: actions/setup-java@v2
with: with:
java-version: 8 java-version: 8
distribution: 'zulu' distribution: 'zulu'

View File

@ -180,12 +180,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
respond() respond()
elif self.path == '/%c7%9f': elif self.path == '/%c7%9f':
respond() respond()
elif self.path == '/redirect_dotsegments':
self.send_response(301)
# redirect to /headers but with dot segments before
self.send_header('Location', '/a/b/./../../headers')
self.send_header('Content-Length', '0')
self.end_headers()
elif self.path.startswith('/redirect_'): elif self.path.startswith('/redirect_'):
self._redirect() self._redirect()
elif self.path.startswith('/method'): elif self.path.startswith('/method'):
@ -467,23 +461,33 @@ class TestHTTP(unittest.TestCase):
sanitized_Request( sanitized_Request(
self._test_url('content-encoding'), self._test_url('content-encoding'),
headers={'ytdl-encoding': encoding})) headers={'ytdl-encoding': encoding}))
# decoded encodings are removed: only check for valid decompressed data self.assertEqual(res.headers.get('Content-Encoding'), encoding)
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
@unittest.skipUnless(brotli, 'brotli support is not installed') @unittest.skipUnless(brotli, 'brotli support is not installed')
@unittest.expectedFailure
def test_brotli(self): def test_brotli(self):
self.__test_compression('br') self.__test_compression('br')
@unittest.expectedFailure
def test_deflate(self): def test_deflate(self):
self.__test_compression('deflate') self.__test_compression('deflate')
@unittest.expectedFailure
def test_gzip(self): def test_gzip(self):
self.__test_compression('gzip') self.__test_compression('gzip')
@unittest.expectedFailure # not yet implemented
def test_multiple_encodings(self): def test_multiple_encodings(self):
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4 # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
with FakeYDL() as ydl:
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
self.__test_compression(pair) res = ydl.urlopen(
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': pair}))
self.assertEqual(res.headers.get('Content-Encoding'), pair)
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
def test_unsupported_encoding(self): def test_unsupported_encoding(self):
# it should return the raw content # it should return the raw content
@ -495,14 +499,6 @@ class TestHTTP(unittest.TestCase):
self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported') self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported')
self.assertEqual(res.read(), b'raw') self.assertEqual(res.read(), b'raw')
def test_remove_dot_segments(self):
with FakeYDL() as ydl:
res = ydl.urlopen(sanitized_Request(self._test_url('a/b/./../../headers')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')
res = ydl.urlopen(sanitized_Request(self._test_url('redirect_dotsegments')))
self.assertEqual(compat_urllib_parse.urlparse(res.geturl()).path, '/headers')
def _build_proxy_handler(name): def _build_proxy_handler(name):
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):

View File

@ -64,7 +64,6 @@ from youtube_dl.utils import (
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
parse_filesize, parse_filesize,
parse_codecs,
parse_count, parse_count,
parse_iso8601, parse_iso8601,
parse_resolution, parse_resolution,
@ -115,7 +114,7 @@ from youtube_dl.utils import (
cli_option, cli_option,
cli_valueless_option, cli_valueless_option,
cli_bool_option, cli_bool_option,
YoutubeDLHandler, parse_codecs,
) )
from youtube_dl.compat import ( from youtube_dl.compat import (
compat_chr, compat_chr,
@ -906,32 +905,6 @@ class TestUtil(unittest.TestCase):
) )
self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
def test_remove_dot_segments(self):
def remove_dot_segments(p):
q = '' if p.startswith('/') else '/'
p = 'http://example.com' + q + p
p = compat_urlparse.urlsplit(YoutubeDLHandler._fix_path(p)).path
return p[1:] if q else p
self.assertEqual(remove_dot_segments('/a/b/c/./../../g'), '/a/g')
self.assertEqual(remove_dot_segments('mid/content=5/../6'), 'mid/6')
self.assertEqual(remove_dot_segments('/ad/../cd'), '/cd')
self.assertEqual(remove_dot_segments('/ad/../cd/'), '/cd/')
self.assertEqual(remove_dot_segments('/..'), '/')
self.assertEqual(remove_dot_segments('/./'), '/')
self.assertEqual(remove_dot_segments('/./a'), '/a')
self.assertEqual(remove_dot_segments('/abc/./.././d/././e/.././f/./../../ghi'), '/ghi')
self.assertEqual(remove_dot_segments('/'), '/')
self.assertEqual(remove_dot_segments('/t'), '/t')
self.assertEqual(remove_dot_segments('t'), 't')
self.assertEqual(remove_dot_segments(''), '')
self.assertEqual(remove_dot_segments('/../a/b/c'), '/a/b/c')
self.assertEqual(remove_dot_segments('../a'), 'a')
self.assertEqual(remove_dot_segments('./a'), 'a')
self.assertEqual(remove_dot_segments('.'), '')
self.assertEqual(remove_dot_segments('////'), '////')
def test_js_to_json_vars_strings(self): def test_js_to_json_vars_strings(self):
self.assertDictEqual( self.assertDictEqual(
json.loads(js_to_json( json.loads(js_to_json(

View File

@ -71,6 +71,7 @@ from .utils import (
format_bytes, format_bytes,
formatSeconds, formatSeconds,
GeoRestrictedError, GeoRestrictedError,
HEADRequest,
int_or_none, int_or_none,
ISO3166Utils, ISO3166Utils,
join_nonempty, join_nonempty,
@ -87,6 +88,7 @@ from .utils import (
preferredencoding, preferredencoding,
prepend_extension, prepend_extension,
process_communicate_or_kill, process_communicate_or_kill,
PUTRequest,
register_socks_protocols, register_socks_protocols,
render_table, render_table,
replace_extension, replace_extension,
@ -2458,6 +2460,27 @@ class YoutubeDL(object):
""" Start an HTTP download """ """ Start an HTTP download """
if isinstance(req, compat_basestring): if isinstance(req, compat_basestring):
req = sanitized_Request(req) req = sanitized_Request(req)
# an embedded /../ sequence is not automatically handled by urllib2
# see https://github.com/yt-dlp/yt-dlp/issues/3355
url = req.get_full_url()
parts = url.partition('/../')
if parts[1]:
url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2])
if url:
# worse, URL path may have initial /../ against RFCs: work-around
# by stripping such prefixes, like eg Firefox
parts = compat_urllib_parse.urlsplit(url)
path = parts.path
while path.startswith('/../'):
path = path[3:]
url = parts._replace(path=path).geturl()
# get a new Request with the munged URL
if url != req.get_full_url():
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
req.get_method(), compat_urllib_request.Request)
req = req_type(
url, data=req.data, headers=dict(req.header_items()),
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
return self._opener.open(req, timeout=self._socket_timeout) return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self): def print_debug_header(self):

View File

@ -3200,18 +3200,6 @@ except AttributeError:
def compat_datetime_timedelta_total_seconds(td): def compat_datetime_timedelta_total_seconds(td):
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
# optional decompression packages
# PyPi brotli package implements 'br' Content-Encoding
try:
import brotli as compat_brotli
except ImportError:
compat_brotli = None
# PyPi ncompress package implements 'compress' Content-Encoding
try:
import ncompress as compat_ncompress
except ImportError:
compat_ncompress = None
legacy = [ legacy = [
'compat_HTMLParseError', 'compat_HTMLParseError',
@ -3246,7 +3234,6 @@ __all__ = [
'compat_Struct', 'compat_Struct',
'compat_base64_b64decode', 'compat_base64_b64decode',
'compat_basestring', 'compat_basestring',
'compat_brotli',
'compat_casefold', 'compat_casefold',
'compat_chr', 'compat_chr',
'compat_collections_abc', 'compat_collections_abc',
@ -3272,7 +3259,6 @@ __all__ = [
'compat_itertools_zip_longest', 'compat_itertools_zip_longest',
'compat_kwargs', 'compat_kwargs',
'compat_map', 'compat_map',
'compat_ncompress',
'compat_numeric_types', 'compat_numeric_types',
'compat_open', 'compat_open',
'compat_ord', 'compat_ord',

View File

@ -15,6 +15,7 @@ import email.utils
import email.header import email.header
import errno import errno
import functools import functools
import gzip
import inspect import inspect
import io import io
import itertools import itertools
@ -41,7 +42,6 @@ from .compat import (
compat_HTMLParseError, compat_HTMLParseError,
compat_HTMLParser, compat_HTMLParser,
compat_basestring, compat_basestring,
compat_brotli as brotli,
compat_casefold, compat_casefold,
compat_chr, compat_chr,
compat_collections_abc, compat_collections_abc,
@ -55,7 +55,6 @@ from .compat import (
compat_http_client, compat_http_client,
compat_integer_types, compat_integer_types,
compat_kwargs, compat_kwargs,
compat_ncompress as ncompress,
compat_os_name, compat_os_name,
compat_re_Match, compat_re_Match,
compat_re_Pattern, compat_re_Pattern,
@ -2639,91 +2638,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
req) req)
@staticmethod @staticmethod
def deflate_gz(data): def deflate(data):
try: try:
# format:zlib,gzip + windowsize:32768
return data and zlib.decompress(data, 32 + zlib.MAX_WBITS)
except zlib.error:
# raw zlib * windowsize:32768 (RFC 9110: "non-conformant")
return zlib.decompress(data, -zlib.MAX_WBITS) return zlib.decompress(data, -zlib.MAX_WBITS)
except zlib.error:
@staticmethod return zlib.decompress(data)
def gzip(data):
from gzip import GzipFile
def _gzip(data):
with io.BytesIO(data) as data_buf:
gz = GzipFile(fileobj=data_buf, mode='rb')
return gz.read()
try:
return _gzip(data)
except IOError as original_ioerror:
# There may be junk at the end of the file
# See http://stackoverflow.com/q/4928560/35070 for details
for i in range(1, 1024):
try:
return _gzip(data[:-i])
except IOError:
continue
else:
raise original_ioerror
@staticmethod
def brotli(data):
return data and brotli.decompress(data)
@staticmethod
def compress(data):
return data and ncompress.decompress(data)
@staticmethod
def _fix_path(url):
# an embedded /../ or /./ sequence is not automatically handled by urllib2
# see https://github.com/yt-dlp/yt-dlp/issues/3355
parsed_url = compat_urllib_parse.urlsplit(url)
path = parsed_url.path
if not path.endswith('/'):
path += '/'
parts = path.partition('/./')
if not parts[1]:
parts = path.partition('/../')
if parts[1]:
path = compat_urllib_parse.urljoin(
parts[0] + parts[1][:1],
parts[1][1:] + (parts[2] if parsed_url.path.endswith('/') else parts[2][:-1]))
url = parsed_url._replace(path=path).geturl()
if '/.' in url:
# worse, URL path may have initial /../ against RFCs: work-around
# by stripping such prefixes, like eg Firefox
path = parsed_url.path + '/'
while path.startswith('/.'):
if path.startswith('/../'):
path = path[3:]
elif path.startswith('/./'):
path = path[2:]
else:
break
path = path[:-1]
if not path.startswith('/') and parsed_url.path.startswith('/'):
path = '/' + path
url = parsed_url._replace(path=path).geturl()
return url
def http_request(self, req): def http_request(self, req):
url = req.get_full_url() # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
# resolve embedded . and .. # always respected by websites, some tend to give out URLs with non percent-encoded
url_fixed = self._fix_path(url)
# According to RFC 3986, URLs can not contain non-ASCII characters; however this is not
# always respected by websites: some tend to give out URLs with non percent-encoded
# non-ASCII characters (see telemb.py, ard.py [#3412]) # non-ASCII characters (see telemb.py, ard.py [#3412])
# urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
# To work around aforementioned issue we will replace request's original URL with # To work around aforementioned issue we will replace request's original URL with
# percent-encoded one # percent-encoded one
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
# the code of this workaround has been moved here from YoutubeDL.urlopen() # the code of this workaround has been moved here from YoutubeDL.urlopen()
url_escaped = escape_url(url_fixed) url = req.get_full_url()
url_escaped = escape_url(url)
# Substitute URL if any change after escaping # Substitute URL if any change after escaping
if url != url_escaped: if url != url_escaped:
@ -2737,10 +2668,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
req.headers = handle_youtubedl_headers(req.headers) req.headers = handle_youtubedl_headers(req.headers)
if sys.version_info < (2, 7): if sys.version_info < (2, 7) and '#' in req.get_full_url():
# avoid possible race where __r_type may be unset
req.get_type()
if '#' in req.get_full_url():
# Python 2.6 is brain-dead when it comes to fragments # Python 2.6 is brain-dead when it comes to fragments
req._Request__original = req._Request__original.partition('#')[0] req._Request__original = req._Request__original.partition('#')[0]
req._Request__r_type = req._Request__r_type.partition('#')[0] req._Request__r_type = req._Request__r_type.partition('#')[0]
@ -2751,59 +2679,33 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
def http_response(self, req, resp): def http_response(self, req, resp):
old_resp = resp old_resp = resp
# gzip
# Content-Encoding header lists the encodings in order that they were applied [1]. if resp.headers.get('Content-encoding', '') == 'gzip':
# To decompress, we simply do the reverse. content = resp.read()
# [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
decoded_response = None
decoders = {
'gzip': self.deflate_gz,
'deflate': self.deflate_gz,
}
if brotli:
decoders['br'] = self.brotli
if ncompress:
decoders['compress'] = self.compress
if sys.platform.startswith('java'):
# Jython zlib implementation misses gzip
decoders['gzip'] = self.gzip
def encodings(hdrs):
# A header field that allows multiple values can have multiple instances [2].
# [2]: https://datatracker.ietf.org/doc/html/rfc9110#name-fields
for e in reversed(','.join(hdrs).split(',')):
if e:
yield e.strip()
encodings_left = []
try: try:
resp.headers.get_all uncompressed = io.BytesIO(gz.read())
hdrs = resp.headers except IOError as original_ioerror:
except AttributeError: # There may be junk at the end of the file
# Py2 has no get_all() method: headers are rfc822.Message # See http://stackoverflow.com/q/4928560/35070 for details
from email.message import Message for i in range(1, 1024):
hdrs = Message() try:
for k, v in resp.headers.items(): gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
hdrs[k] = v uncompressed = io.BytesIO(gz.read())
except IOError:
decoder, decoded_response = True, None
for encoding in encodings(hdrs.get_all('Content-Encoding', [])):
# "SHOULD consider" x-compress, x-gzip as compress, gzip
decoder = decoder and decoders.get(remove_start(encoding, 'x-'))
if not decoder:
encodings_left.insert(0, encoding)
continue continue
decoded_response = decoder(decoded_response or resp.read()) break
if decoded_response is not None: else:
resp = compat_urllib_request.addinfourl( raise original_ioerror
io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg resp.msg = old_resp.msg
del resp.headers['Content-Length'] del resp.headers['Content-encoding']
resp.headers['Content-Length'] = '%d' % len(decoded_response) # deflate
del resp.headers['Content-Encoding'] if resp.headers.get('Content-encoding', '') == 'deflate':
if encodings_left: gz = io.BytesIO(self.deflate(resp.read()))
resp.headers['Content-Encoding'] = ', '.join(encodings_left) resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
del resp.headers['Content-encoding']
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
# https://github.com/ytdl-org/youtube-dl/issues/6457). # https://github.com/ytdl-org/youtube-dl/issues/6457).
if 300 <= resp.code < 400: if 300 <= resp.code < 400:
@ -2813,13 +2715,10 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
location = location.encode('iso-8859-1') location = location.encode('iso-8859-1')
location = location.decode('utf-8') location = location.decode('utf-8')
# resolve embedded . and .. location_escaped = escape_url(location)
location_fixed = self._fix_path(location)
location_escaped = escape_url(location_fixed)
if location != location_escaped: if location != location_escaped:
del resp.headers['Location'] del resp.headers['Location']
# if sys.version_info < (3, 0): if sys.version_info < (3, 0):
if not isinstance(location_escaped, str):
location_escaped = location_escaped.encode('utf-8') location_escaped = location_escaped.encode('utf-8')
resp.headers['Location'] = location_escaped resp.headers['Location'] = location_escaped
return resp return resp
@ -4289,8 +4188,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
req_headers.update(headers) req_headers.update(headers)
req_data = data if data is not None else req.data req_data = data if data is not None else req.data
req_url = update_url_query(url or req.get_full_url(), query) req_url = update_url_query(url or req.get_full_url(), query)
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get( req_get_method = req.get_method()
req.get_method(), compat_urllib_request.Request) if req_get_method == 'HEAD':
req_type = HEADRequest
elif req_get_method == 'PUT':
req_type = PUTRequest
else:
req_type = compat_urllib_request.Request
new_req = req_type( new_req = req_type(
req_url, data=req_data, headers=req_headers, req_url, data=req_data, headers=req_headers,
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)