Compare commits

...

17 Commits

Author SHA1 Message Date
fonkap
6f8c2635a5 [StreamsbIE] Add extractor for streamsb.com (viewsb.com) (#31517)
* Add extractor for streamsb.com (viewsb.com)

* make data url using app.js version

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-02-13 03:54:51 +00:00
fonkap
de48105dd8 [KommunetvIE] Add extractor for kommunetv.no (#31516)
* Add extractor for kommunetv.no
* Using utils.update_url instead of regex

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-02-13 03:54:51 +00:00
fonkap
822f19f05d [FileMoonIE] Add extractor for filemoon.sx (#31515)
---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-02-13 03:54:51 +00:00
teddy171
33db85c571 [feat]: Add support to external downloader aria2p (#31500)
* feat: add class Aria2pFD

* feat: create call_downloader function

* feat: a colorful download interface to aria2pFD

* feat: change value name

* Apply suggestions from code review

Co-authored-by: dirkf <fieldhouse@gmx.net>

* Typo in suggestion

* fix: remove unused value

* fix: negate the return value (0 is normal); use total_seconds() for download.eta (a timedelta object); add waiting status when hooking progress

* fix: remove unused method ..utils.format_bytes

* fix: be up to flake8

* fix: be up to flake8

* Apply suggestions from code review

* [feat] test external downloader aria2p

* [feat] test external downloader aria2p

* [fix] test_external_downloader.py

* Apply suggestions from code review

Co-authored-by: dirkf <fieldhouse@gmx.net>

* Apply suggestions from code review

Co-authored-by: dirkf <fieldhouse@gmx.net>

* Update test/test_external_downloader.py

Co-authored-by: dirkf <fieldhouse@gmx.net>

* Update test/test_external_downloader.py

Co-authored-by: dirkf <fieldhouse@gmx.net>

* Update youtube_dl/downloader/external.py

Co-authored-by: dirkf <fieldhouse@gmx.net>

* refactoring code and fix bugs

* Apply suggestions from code review

* Rename test_external_downloader.py to test_downloader_external.py

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-02-13 03:54:51 +00:00
Valentin Metz
f33923cba7 [rbgtum] Add new extractor (#31305)
* [rbgtum] Add new extractor

* Small update, force CI

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-02-13 03:54:51 +00:00
dirkf
e8198c517b [YouTube] Fix tests 2023-02-13 03:54:51 +00:00
dirkf
bafb6dec72 [YouTube] Refresh compat/utils usage
* import parse_qs()
* import parse_qs in lazy_extractors (clears old TODO)
* clean up old compiled lazy_extractors for Py2
* use update_url()
2023-02-13 03:54:51 +00:00
dirkf
4e04f10499 [compat] Update test_compat
[skip ci]
2023-02-13 03:54:51 +00:00
dirkf
90c9f789d9 [utils] Add parse_qs, update_url
[skip ci]
2023-02-13 03:54:51 +00:00
dirkf
249f2b6316 [compat] Systematise compat_ naming
[skip ci]
2023-02-13 03:54:51 +00:00
dirkf
d6b14ba316 [test] Fix TestAgeRestriction
* age restriction may cause DownloadError
* update obsolete test URLs
[skip ci]
2023-02-13 03:54:51 +00:00
dirkf
30e986b834 [YouTube] Add signatureTimestamp for age-gate bypass 2023-02-13 03:54:51 +00:00
dirkf
58988c1421 [YouTube] Bypass age-gating for certain restricted videos
* Use TVHTML5_SIMPLY_EMBEDDED_PLAYER client

* Also add and fix tests

* Introduce and use new utility function `update_url()`
2023-02-13 03:54:51 +00:00
dirkf
e19ec52322 [Vimeo] Support /user{video_id}/{slug} URL format 2023-02-12 22:16:00 +00:00
dirkf
f2f90887ca [Vimeo] Fix Unable to extract info section redux
* as reported in yt-dlp/yt-dlp#6149
* also allow newline in target JSON object
2023-02-12 22:16:00 +00:00
dirkf
cd987e6fca [jsinterp] Nits 2023-02-12 22:16:00 +00:00
dirkf
d947ffe8e3 [IGN] Overhaul extractor to avoid URL redirection loop
Consequently/also:
* centralise video data extraction
* detect 404 and 503 expected errors
* handle the test video in IGNVideo
* handle two additional page formats for the tests in IGNArticle
2023-02-12 22:16:00 +00:00
20 changed files with 1181 additions and 354 deletions

View File

@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
lazy_extractors_filename = sys.argv[1] lazy_extractors_filename = sys.argv[1]
if os.path.exists(lazy_extractors_filename): if os.path.exists(lazy_extractors_filename):
os.remove(lazy_extractors_filename) os.remove(lazy_extractors_filename)
# Py2: may be confused by leftover lazy_extractors.pyc
try:
os.remove(lazy_extractors_filename + 'c')
except OSError:
pass
from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor import _ALL_CLASSES
from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f:
module_contents = [ module_contents = [
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780)
'from youtube_dl.utils import parse_qs\n',
]
ie_template = ''' ie_template = '''
class {name}({bases}): class {name}({bases}):

View File

@ -89,6 +89,17 @@ class FakeYDL(YoutubeDL):
self.report_warning = types.MethodType(report_warning, self) self.report_warning = types.MethodType(report_warning, self)
class FakeLogger(object):
    """Logger stand-in for tests: accepts messages and discards them."""

    def debug(self, msg):
        """Accept a debug message and do nothing with it."""

    def warning(self, msg):
        """Accept a warning message and do nothing with it."""

    def error(self, msg):
        """Accept an error message and do nothing with it."""
def gettestcases(include_onlymatching=False): def gettestcases(include_onlymatching=False):
for ie in youtube_dl.extractor.gen_extractors(): for ie in youtube_dl.extractor.gen_extractors():
for tc in ie.get_testcases(include_onlymatching): for tc in ie.get_testcases(include_onlymatching):

View File

@ -11,6 +11,7 @@ from test.helper import try_rm
from youtube_dl import YoutubeDL from youtube_dl import YoutubeDL
from youtube_dl.utils import DownloadError
def _download_restricted(url, filename, age): def _download_restricted(url, filename, age):
@ -26,7 +27,10 @@ def _download_restricted(url, filename, age):
ydl.add_default_info_extractors() ydl.add_default_info_extractors()
json_filename = os.path.splitext(filename)[0] + '.info.json' json_filename = os.path.splitext(filename)[0] + '.info.json'
try_rm(json_filename) try_rm(json_filename)
try:
ydl.download([url]) ydl.download([url])
except DownloadError:
try_rm(json_filename)
res = os.path.exists(json_filename) res = os.path.exists(json_filename)
try_rm(json_filename) try_rm(json_filename)
return res return res
@ -38,12 +42,12 @@ class TestAgeRestriction(unittest.TestCase):
self.assertFalse(_download_restricted(url, filename, age)) self.assertFalse(_download_restricted(url, filename, age))
def test_youtube(self): def test_youtube(self):
self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10)
def test_youporn(self): def test_youporn(self):
self._assert_restricted( self._assert_restricted(
'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/',
'505835.mp4', 2, old_age=25) '16715086.mp4', 2, old_age=25)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -48,10 +48,11 @@ class TestCompat(unittest.TestCase):
def test_all_present(self): def test_all_present(self):
import youtube_dl.compat import youtube_dl.compat
all_names = youtube_dl.compat.__all__ all_names = sorted(
present_names = set(filter( youtube_dl.compat.__all__ + youtube_dl.compat.legacy)
present_names = set(map(compat_str, filter(
lambda c: '_' in c and not c.startswith('_'), lambda c: '_' in c and not c.startswith('_'),
dir(youtube_dl.compat))) - set(['unicode_literals']) dir(youtube_dl.compat)))) - set(['unicode_literals'])
self.assertEqual(all_names, sorted(present_names)) self.assertEqual(all_names, sorted(present_names))
def test_compat_urllib_parse_unquote(self): def test_compat_urllib_parse_unquote(self):

View File

@ -0,0 +1,115 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
# Allow direct execution
import os
import re
import sys
import subprocess
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
FakeLogger,
http_server_port,
try_rm,
)
from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_http_server
from youtube_dl.utils import encodeFilename
from youtube_dl.downloader.external import Aria2pFD
import threading
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
TEST_SIZE = 10 * 1024
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
    """Request handler that answers GET with a run of b'#' bytes.

    Each recognised path selects whether a Content-Range and/or a
    Content-Length header is sent; see _ROUTES.
    """

    # request path -> keyword arguments handed to serve()
    _ROUTES = {
        '/regular': {},
        '/no-content-length': {'content_length': False},
        '/no-range': {'range': False},
        '/no-range-no-content-length': {'range': False, 'content_length': False},
    }

    def log_message(self, format, *args):
        """Discard log messages."""
        return None

    def send_content_range(self, total=None):
        """Send a Content-Range header echoing the request's Range, if any.

        Returns the number of bytes in the requested range when the request
        carries a `bytes=<start>-<end>` Range header, otherwise `total`.
        """
        requested = self.headers.get('Range')
        span = None
        if requested:
            found = re.match(r'bytes=(\d+)-(\d+)', requested)
            if found:
                span = (int(found.group(1)), int(found.group(2)))
        if span is None:
            # no usable Range header: nothing is sent
            return total
        first, last = span
        header_value = 'bytes %d-%d' % (first, last)
        if total:
            header_value += '/%d' % total
        self.send_header('Content-Range', header_value)
        return last - first + 1

    def serve(self, range=True, content_length=True):
        """Send a 200 response whose body is `size` bytes of b'#'.

        range: if true, honour a Range request header via send_content_range()
        content_length: if true, send a Content-Length header
        """
        self.send_response(200)
        self.send_header('Content-Type', 'video/mp4')
        size = self.send_content_range(TEST_SIZE) if range else TEST_SIZE
        if content_length:
            self.send_header('Content-Length', size)
        self.end_headers()
        self.wfile.write(b'#' * size)

    def do_GET(self):
        """Dispatch on the request path to the matching serve() variant."""
        if self.path in self._ROUTES:
            self.serve(**self._ROUTES[self.path])
        else:
            assert False, 'unrecognised server path'
@unittest.skipUnless(Aria2pFD.available(), 'aria2p module not found')
class TestAria2pFD(unittest.TestCase):
    """Download from a local HTTP server using the aria2p external downloader.

    Each download starts its own `aria2c --enable-rpc` daemon and stops it
    again afterwards.
    """

    def setUp(self):
        # serve on an ephemeral port in a background thread
        self.httpd = compat_http_server.HTTPServer(
            ('127.0.0.1', 0), HTTPTestRequestHandler)
        self.port = http_server_port(self.httpd)
        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()

    def tearDown(self):
        # stop serve_forever() and release the listening socket so that
        # servers do not pile up across tests
        self.httpd.shutdown()
        self.httpd.server_close()
        self.server_thread.join()

    def download(self, params, ep):
        """Download endpoint `ep` with the given YoutubeDL params and check
        that it succeeds and yields TEST_SIZE bytes."""
        filename = 'testfile.mp4'
        # work on a copy so that the caller's dict is left untouched
        params = dict(params, logger=FakeLogger(), outtmpl=filename)
        # NB subprocess.DEVNULL and Popen as a context manager need Python 3.3+
        with subprocess.Popen(
            ['aria2c', '--enable-rpc'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        ) as process:
            try:
                # poll() is None only while the daemon is still running; an
                # exit status of 0 must not be mistaken for "alive"
                if process.poll() is None:
                    ydl = YoutubeDL(params)
                    try_rm(encodeFilename(filename))
                    self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0)
                    self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE)
            finally:
                try_rm(encodeFilename(filename))
                # always stop the daemon: leaving the `with` block waits for
                # the process, which would hang if an assertion had failed
                if process.poll() is None:
                    process.kill()

    def download_all(self, params):
        """Run download() against every path the test server offers."""
        for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'):
            self.download(params, ep)

    def test_regular(self):
        self.download_all({'external_downloader': 'aria2p'})

    def test_chunked(self):
        self.download_all({
            'external_downloader': 'aria2p',
            'http_chunk_size': 1000,
        })
if __name__ == '__main__':
unittest.main()

View File

@ -9,7 +9,11 @@ import sys
import unittest import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import http_server_port, try_rm from test.helper import (
FakeLogger,
http_server_port,
try_rm,
)
from youtube_dl import YoutubeDL from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_http_server from youtube_dl.compat import compat_http_server
from youtube_dl.downloader.http import HttpFD from youtube_dl.downloader.http import HttpFD
@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
assert False assert False
class FakeLogger(object):
def debug(self, msg):
pass
def warning(self, msg):
pass
def error(self, msg):
pass
class TestHttpFD(unittest.TestCase): class TestHttpFD(unittest.TestCase):
def setUp(self): def setUp(self):
self.httpd = compat_http_server.HTTPServer( self.httpd = compat_http_server.HTTPServer(

View File

@ -40,12 +40,14 @@ class TestExecution(unittest.TestCase):
self.assertFalse(stderr) self.assertFalse(stderr)
def test_lazy_extractors(self): def test_lazy_extractors(self):
lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py'
try: try:
subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL)
finally: finally:
for x in ['', 'c'] if sys.version_info[0] < 3 else ['']:
try: try:
os.remove('youtube_dl/extractor/lazy_extractors.py') os.remove(lazy_extractors + x)
except (IOError, OSError): except (IOError, OSError):
pass pass

View File

@ -8,7 +8,10 @@ import sys
import unittest import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import http_server_port from test.helper import (
FakeLogger,
http_server_port,
)
from youtube_dl import YoutubeDL from youtube_dl import YoutubeDL
from youtube_dl.compat import compat_http_server, compat_urllib_request from youtube_dl.compat import compat_http_server, compat_urllib_request
import ssl import ssl
@ -52,17 +55,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
assert False assert False
class FakeLogger(object):
def debug(self, msg):
pass
def warning(self, msg):
pass
def error(self, msg):
pass
class TestHTTP(unittest.TestCase): class TestHTTP(unittest.TestCase):
def setUp(self): def setUp(self):
self.httpd = compat_http_server.HTTPServer( self.httpd = compat_http_server.HTTPServer(

View File

@ -21,6 +21,10 @@ import subprocess
import sys import sys
import xml.etree.ElementTree import xml.etree.ElementTree
# naming convention
# 'compat_' + Python3_name.replace('.', '_')
# other aliases exist for convenience and/or legacy
# deal with critical unicode/str things first # deal with critical unicode/str things first
try: try:
# Python 2 # Python 2
@ -28,6 +32,7 @@ try:
unicode, basestring, unichr unicode, basestring, unichr
) )
from .casefold import casefold as compat_casefold from .casefold import casefold as compat_casefold
except NameError: except NameError:
compat_str, compat_basestring, compat_chr = ( compat_str, compat_basestring, compat_chr = (
str, str, chr str, str, chr
@ -53,16 +58,15 @@ try:
import urllib.parse as compat_urllib_parse import urllib.parse as compat_urllib_parse
except ImportError: # Python 2 except ImportError: # Python 2
import urllib as compat_urllib_parse import urllib as compat_urllib_parse
import urlparse as _urlparse
for a in dir(_urlparse):
if not hasattr(compat_urllib_parse, a):
setattr(compat_urllib_parse, a, getattr(_urlparse, a))
del _urlparse
try: # unfavoured aliases
from urllib.parse import urlparse as compat_urllib_parse_urlparse compat_urlparse = compat_urllib_parse
except ImportError: # Python 2 compat_urllib_parse_urlparse = compat_urllib_parse.urlparse
from urlparse import urlparse as compat_urllib_parse_urlparse
try:
import urllib.parse as compat_urlparse
except ImportError: # Python 2
import urlparse as compat_urlparse
try: try:
import urllib.response as compat_urllib_response import urllib.response as compat_urllib_response
@ -73,6 +77,7 @@ try:
import http.cookiejar as compat_cookiejar import http.cookiejar as compat_cookiejar
except ImportError: # Python 2 except ImportError: # Python 2
import cookielib as compat_cookiejar import cookielib as compat_cookiejar
compat_http_cookiejar = compat_cookiejar
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
class compat_cookiejar_Cookie(compat_cookiejar.Cookie): class compat_cookiejar_Cookie(compat_cookiejar.Cookie):
@ -84,11 +89,13 @@ if sys.version_info[0] == 2:
compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs)
else: else:
compat_cookiejar_Cookie = compat_cookiejar.Cookie compat_cookiejar_Cookie = compat_cookiejar.Cookie
compat_http_cookiejar_Cookie = compat_cookiejar_Cookie
try: try:
import http.cookies as compat_cookies import http.cookies as compat_cookies
except ImportError: # Python 2 except ImportError: # Python 2
import Cookie as compat_cookies import Cookie as compat_cookies
compat_http_cookies = compat_cookies
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):
@ -98,6 +105,7 @@ if sys.version_info[0] == 2:
return super(compat_cookies_SimpleCookie, self).load(rawdata) return super(compat_cookies_SimpleCookie, self).load(rawdata)
else: else:
compat_cookies_SimpleCookie = compat_cookies.SimpleCookie compat_cookies_SimpleCookie = compat_cookies.SimpleCookie
compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie
try: try:
import html.entities as compat_html_entities import html.entities as compat_html_entities
@ -2351,16 +2359,19 @@ try:
from urllib.error import HTTPError as compat_HTTPError from urllib.error import HTTPError as compat_HTTPError
except ImportError: # Python 2 except ImportError: # Python 2
from urllib2 import HTTPError as compat_HTTPError from urllib2 import HTTPError as compat_HTTPError
compat_urllib_HTTPError = compat_HTTPError
try: try:
from urllib.request import urlretrieve as compat_urlretrieve from urllib.request import urlretrieve as compat_urlretrieve
except ImportError: # Python 2 except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve from urllib import urlretrieve as compat_urlretrieve
compat_urllib_request_urlretrieve = compat_urlretrieve
try: try:
from html.parser import HTMLParser as compat_HTMLParser from html.parser import HTMLParser as compat_HTMLParser
except ImportError: # Python 2 except ImportError: # Python 2
from HTMLParser import HTMLParser as compat_HTMLParser from HTMLParser import HTMLParser as compat_HTMLParser
compat_html_parser_HTMLParser = compat_HTMLParser
try: # Python 2 try: # Python 2
from HTMLParser import HTMLParseError as compat_HTMLParseError from HTMLParser import HTMLParseError as compat_HTMLParseError
@ -2374,6 +2385,7 @@ except ImportError: # Python <3.4
# and uniform cross-version exception handling # and uniform cross-version exception handling
class compat_HTMLParseError(Exception): class compat_HTMLParseError(Exception):
pass pass
compat_html_parser_HTMLParseError = compat_HTMLParseError
try: try:
from subprocess import DEVNULL from subprocess import DEVNULL
@ -2390,6 +2402,8 @@ try:
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote as compat_urllib_parse_unquote
from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
from urllib.parse import urlencode as compat_urllib_parse_urlencode
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2 except ImportError: # Python 2
_asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
else re.compile(r'([\x00-\x7f]+)')) else re.compile(r'([\x00-\x7f]+)'))
@ -2456,9 +2470,6 @@ except ImportError: # Python 2
string = string.replace('+', ' ') string = string.replace('+', ' ')
return compat_urllib_parse_unquote(string, encoding, errors) return compat_urllib_parse_unquote(string, encoding, errors)
try:
from urllib.parse import urlencode as compat_urllib_parse_urlencode
except ImportError: # Python 2
# Python 2 will choke in urlencode on mixture of byte and unicode strings. # Python 2 will choke in urlencode on mixture of byte and unicode strings.
# Possible solutions are to either port it from python 3 with all # Possible solutions are to either port it from python 3 with all
# the friends or manually ensure input query contains only byte strings. # the friends or manually ensure input query contains only byte strings.
@ -2480,7 +2491,62 @@ except ImportError: # Python 2
def encode_list(l): def encode_list(l):
return [encode_elem(e) for e in l] return [encode_elem(e) for e in l]
return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq)
# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
# Python 2's version is apparently totally broken
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
               encoding='utf-8', errors='replace'):
    """Split query string `qs` into a list of (name, value) pairs.

    Fields are separated by '&' or ';'.  '+' is turned into a space and the
    result is percent-decoded with `encoding`/`errors`.

    keep_blank_values: if true, keep fields whose value is empty (including
        fields with no '=')
    strict_parsing: if true, raise ValueError for a field without '='
    """
    def decode(component):
        return compat_str(compat_urllib_parse_unquote(
            component.replace('+', ' '), encoding=encoding, errors=errors))

    result = []
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue
            key, sep, val = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError('bad query field: %r' % (field,))
                # a control name with no equal sign counts as a blank value
                if not keep_blank_values:
                    continue
            if val or keep_blank_values:
                name = decode(key)
                value = decode(val)
                result.append((name, value))
    return result
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                    encoding='utf-8', errors='replace'):
    """Parse query string `qs` into a dict mapping each name to the list of
    its values, in order of appearance.

    The arguments are passed through to _parse_qsl().
    """
    grouped = {}
    for key, val in _parse_qsl(qs, keep_blank_values, strict_parsing,
                               encoding=encoding, errors=errors):
        grouped.setdefault(key, []).append(val)
    return grouped
setattr(compat_urllib_parse, '_urlencode',
getattr(compat_urllib_parse, 'urlencode'))
for name, fix in (
('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes),
('parse_unquote', compat_urllib_parse_unquote),
('unquote_plus', compat_urllib_parse_unquote_plus),
('urlencode', compat_urllib_parse_urlencode),
('parse_qs', compat_parse_qs)):
setattr(compat_urllib_parse, name, fix)
compat_urllib_parse_parse_qs = compat_parse_qs
try: try:
from urllib.request import DataHandler as compat_urllib_request_DataHandler from urllib.request import DataHandler as compat_urllib_request_DataHandler
@ -2520,6 +2586,7 @@ try:
from xml.etree.ElementTree import ParseError as compat_xml_parse_error from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6 except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error from xml.parsers.expat import ExpatError as compat_xml_parse_error
compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error
etree = xml.etree.ElementTree etree = xml.etree.ElementTree
@ -2533,10 +2600,11 @@ try:
# xml.etree.ElementTree.Element is a method in Python <=2.6 and # xml.etree.ElementTree.Element is a method in Python <=2.6 and
# the following will crash with: # the following will crash with:
# TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types
isinstance(None, xml.etree.ElementTree.Element) isinstance(None, etree.Element)
from xml.etree.ElementTree import Element as compat_etree_Element from xml.etree.ElementTree import Element as compat_etree_Element
except TypeError: # Python <=2.6 except TypeError: # Python <=2.6
from xml.etree.ElementTree import _ElementInterface as compat_etree_Element from xml.etree.ElementTree import _ElementInterface as compat_etree_Element
compat_xml_etree_ElementTree_Element = compat_etree_Element
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
def compat_etree_fromstring(text): def compat_etree_fromstring(text):
@ -2592,6 +2660,7 @@ else:
if k == uri or v == prefix: if k == uri or v == prefix:
del etree._namespace_map[k] del etree._namespace_map[k]
etree._namespace_map[uri] = prefix etree._namespace_map[uri] = prefix
compat_xml_etree_register_namespace = compat_etree_register_namespace
if sys.version_info < (2, 7): if sys.version_info < (2, 7):
# Here comes the crazy part: In 2.6, if the xpath is a unicode, # Here comes the crazy part: In 2.6, if the xpath is a unicode,
@ -2603,53 +2672,6 @@ if sys.version_info < (2, 7):
else: else:
compat_xpath = lambda xpath: xpath compat_xpath = lambda xpath: xpath
try:
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
# Python 2's version is apparently totally broken
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
qs, _coerce_result = qs, compat_str
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
r = []
for name_value in pairs:
if not name_value and not strict_parsing:
continue
nv = name_value.split('=', 1)
if len(nv) != 2:
if strict_parsing:
raise ValueError('bad query field: %r' % (name_value,))
# Handle case of a control-name with no equal sign
if keep_blank_values:
nv.append('')
else:
continue
if len(nv[1]) or keep_blank_values:
name = nv[0].replace('+', ' ')
name = compat_urllib_parse_unquote(
name, encoding=encoding, errors=errors)
name = _coerce_result(name)
value = nv[1].replace('+', ' ')
value = compat_urllib_parse_unquote(
value, encoding=encoding, errors=errors)
value = _coerce_result(value)
r.append((name, value))
return r
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
parsed_result = {}
pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors)
for name, value in pairs:
if name in parsed_result:
parsed_result[name].append(value)
else:
parsed_result[name] = [value]
return parsed_result
compat_os_name = os._name if os.name == 'java' else os.name compat_os_name = os._name if os.name == 'java' else os.name
@ -2774,6 +2796,8 @@ else:
else: else:
compat_expanduser = os.path.expanduser compat_expanduser = os.path.expanduser
compat_os_path_expanduser = compat_expanduser
if compat_os_name == 'nt' and sys.version_info < (3, 8): if compat_os_name == 'nt' and sys.version_info < (3, 8):
# os.path.realpath on Windows does not follow symbolic links # os.path.realpath on Windows does not follow symbolic links
@ -2785,6 +2809,8 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8):
else: else:
compat_realpath = os.path.realpath compat_realpath = os.path.realpath
compat_os_path_realpath = compat_realpath
if sys.version_info < (3, 0): if sys.version_info < (3, 0):
def compat_print(s): def compat_print(s):
@ -2805,11 +2831,15 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':
else: else:
compat_getpass = getpass.getpass compat_getpass = getpass.getpass
compat_getpass_getpass = compat_getpass
try: try:
compat_input = raw_input compat_input = raw_input
except NameError: # Python 3 except NameError: # Python 3
compat_input = input compat_input = input
# Python < 2.6.5 require kwargs to be bytes # Python < 2.6.5 require kwargs to be bytes
try: try:
def _testfunc(x): def _testfunc(x):
@ -2915,15 +2945,16 @@ else:
lines = _lines lines = _lines
return _terminal_size(columns, lines) return _terminal_size(columns, lines)
try: try:
itertools.count(start=0, step=1) itertools.count(start=0, step=1)
compat_itertools_count = itertools.count compat_itertools_count = itertools.count
except TypeError: # Python 2.6 except TypeError: # Python 2.6
def compat_itertools_count(start=0, step=1): def compat_itertools_count(start=0, step=1):
n = start
while True: while True:
yield n yield start
n += step start += step
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
from tokenize import tokenize as compat_tokenize_tokenize from tokenize import tokenize as compat_tokenize_tokenize
@ -3075,6 +3106,8 @@ if sys.version_info < (3, 3):
else: else:
compat_b64decode = base64.b64decode compat_b64decode = base64.b64decode
compat_base64_b64decode = compat_b64decode
if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0):
# PyPy2 prior to version 5.4.0 expects byte strings as Windows function # PyPy2 prior to version 5.4.0 expects byte strings as Windows function
@ -3094,30 +3127,53 @@ else:
return ctypes.WINFUNCTYPE(*args, **kwargs) return ctypes.WINFUNCTYPE(*args, **kwargs)
__all__ = [ legacy = [
'compat_HTMLParseError', 'compat_HTMLParseError',
'compat_HTMLParser', 'compat_HTMLParser',
'compat_HTTPError', 'compat_HTTPError',
'compat_Struct',
'compat_b64decode', 'compat_b64decode',
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
'compat_cookies_SimpleCookie',
'compat_etree_Element',
'compat_etree_register_namespace',
'compat_expanduser',
'compat_getpass',
'compat_parse_qs',
'compat_realpath',
'compat_urllib_parse_parse_qs',
'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus',
'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlencode',
'compat_urllib_parse_urlparse',
'compat_urlparse',
'compat_urlretrieve',
'compat_xml_parse_error',
]
__all__ = [
'compat_html_parser_HTMLParseError',
'compat_html_parser_HTMLParser',
'compat_Struct',
'compat_base64_b64decode',
'compat_basestring', 'compat_basestring',
'compat_casefold', 'compat_casefold',
'compat_chr', 'compat_chr',
'compat_collections_abc', 'compat_collections_abc',
'compat_collections_chain_map', 'compat_collections_chain_map',
'compat_cookiejar', 'compat_http_cookiejar',
'compat_cookiejar_Cookie', 'compat_http_cookiejar_Cookie',
'compat_cookies', 'compat_http_cookies',
'compat_cookies_SimpleCookie', 'compat_http_cookies_SimpleCookie',
'compat_ctypes_WINFUNCTYPE', 'compat_ctypes_WINFUNCTYPE',
'compat_etree_Element',
'compat_etree_fromstring', 'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',
'compat_filter', 'compat_filter',
'compat_get_terminal_size', 'compat_get_terminal_size',
'compat_getenv', 'compat_getenv',
'compat_getpass', 'compat_getpass_getpass',
'compat_html_entities', 'compat_html_entities',
'compat_html_entities_html5', 'compat_html_entities_html5',
'compat_http_client', 'compat_http_client',
@ -3131,11 +3187,11 @@ __all__ = [
'compat_numeric_types', 'compat_numeric_types',
'compat_ord', 'compat_ord',
'compat_os_name', 'compat_os_name',
'compat_parse_qs', 'compat_os_path_expanduser',
'compat_os_path_realpath',
'compat_print', 'compat_print',
'compat_re_Match', 'compat_re_Match',
'compat_re_Pattern', 'compat_re_Pattern',
'compat_realpath',
'compat_setenv', 'compat_setenv',
'compat_shlex_quote', 'compat_shlex_quote',
'compat_shlex_split', 'compat_shlex_split',
@ -3147,17 +3203,14 @@ __all__ = [
'compat_tokenize_tokenize', 'compat_tokenize_tokenize',
'compat_urllib_error', 'compat_urllib_error',
'compat_urllib_parse', 'compat_urllib_parse',
'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus',
'compat_urllib_parse_unquote_to_bytes',
'compat_urllib_parse_urlencode',
'compat_urllib_parse_urlparse',
'compat_urllib_request', 'compat_urllib_request',
'compat_urllib_request_DataHandler', 'compat_urllib_request_DataHandler',
'compat_urllib_response', 'compat_urllib_response',
'compat_urlparse', 'compat_urllib_request_urlretrieve',
'compat_urlretrieve', 'compat_urllib_HTTPError',
'compat_xml_parse_error', 'compat_xml_etree_ElementTree_Element',
'compat_xml_etree_ElementTree_ParseError',
'compat_xml_etree_register_namespace',
'compat_xpath', 'compat_xpath',
'compat_zip', 'compat_zip',
'workaround_optparse_bug9161', 'workaround_optparse_bug9161',

View File

@ -200,6 +200,64 @@ class Aria2cFD(ExternalFD):
return cmd return cmd
class Aria2pFD(ExternalFD):
    """External downloader that drives an aria2c daemon through aria2p.

    aria2p is a command-line tool and Python library that interacts with an
    aria2c daemon process through JSON-RPC, which makes it easy to query
    download progress.

    To use this downloader, install aria2c and aria2p (aria2p can be
    installed with pip), then run aria2c in the background with the
    --enable-rpc option.
    """
    # The import is done in the class body so that the module becomes a class
    # attribute (reached below as self.aria2p) and a missing package only
    # makes this downloader unavailable instead of breaking the whole module.
    try:
        import aria2p
        __avail = True
    except ImportError:
        __avail = False

    @classmethod
    def available(cls):
        """Return True if the aria2p package could be imported."""
        return cls.__avail

    def _call_downloader(self, tmpfilename, info_dict):
        """Download info_dict['url'] to tmpfilename via the aria2c daemon.

        Progress is polled every half second and reported through
        self._hook_progress().

        Returns False if the download ended with status 'complete' and
        True otherwise (so a falsy value means success).
        """
        # NOTE(review): host, port and secret are hard-coded here; a daemon
        # configured differently will not be reachable.
        aria2 = self.aria2p.API(
            self.aria2p.Client(
                host='http://localhost',
                port=6800,
                secret=''
            )
        )

        # http_headers may be absent (or None) in info_dict: send no extra
        # headers in that case rather than failing with a KeyError
        http_headers = info_dict.get('http_headers') or {}
        options = {
            'min-split-size': '1M',
            'max-connection-per-server': 4,
            'auto-file-renaming': 'false',
            'dir': os.path.dirname(tmpfilename) or os.path.abspath('.'),
            'out': os.path.basename(tmpfilename),
            'header': [
                '{0}: {1}'.format(key, val)
                for key, val in http_headers.items()],
        }

        download = aria2.add_uris([info_dict['url']], options)
        status = {
            'status': 'downloading',
            'tmpfilename': tmpfilename,
        }
        started = time.time()
        while download.status in ['active', 'waiting']:
            # re-fetch the download object to get fresh progress figures
            download = aria2.get_download(download.gid)
            status.update({
                'downloaded_bytes': download.completed_length,
                'total_bytes': download.total_length,
                'elapsed': time.time() - started,
                'eta': download.eta.total_seconds(),
                'speed': download.download_speed,
            })
            self._hook_progress(status)
            time.sleep(.5)
        return download.status != 'complete'
class HttpieFD(ExternalFD): class HttpieFD(ExternalFD):
@classmethod @classmethod
def available(cls): def available(cls):

View File

@ -376,6 +376,7 @@ from .fc2 import (
FC2EmbedIE, FC2EmbedIE,
) )
from .fczenit import FczenitIE from .fczenit import FczenitIE
from .filemoon import FileMoonIE
from .fifa import FifaIE from .fifa import FifaIE
from .filmon import ( from .filmon import (
FilmOnIE, FilmOnIE,
@ -556,6 +557,7 @@ from .khanacademy import (
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE
from .kommunetv import KommunetvIE
from .konserthusetplay import KonserthusetPlayIE from .konserthusetplay import KonserthusetPlayIE
from .krasview import KrasViewIE from .krasview import KrasViewIE
from .kth import KTHIE from .kth import KTHIE
@ -1010,6 +1012,10 @@ from .raywenderlich import (
RayWenderlichIE, RayWenderlichIE,
RayWenderlichCourseIE, RayWenderlichCourseIE,
) )
from .rbgtum import (
RbgTumIE,
RbgTumCourseIE,
)
from .rbmaradio import RBMARadioIE from .rbmaradio import RBMARadioIE
from .rds import RDSIE from .rds import RDSIE
from .redbulltv import ( from .redbulltv import (
@ -1200,6 +1206,7 @@ from .storyfire import (
from .streamable import StreamableIE from .streamable import StreamableIE
from .streamcloud import StreamcloudIE from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE from .streamcz import StreamCZIE
from .streamsb import StreamsbIE
from .streetvoice import StreetVoiceIE from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE from .stretchinternet import StretchInternetIE
from .stv import STVPlayerIE from .stv import STVPlayerIE

View File

@ -0,0 +1,43 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
decode_packed_codes,
js_to_json,
)
class FileMoonIE(InfoExtractor):
    """Extractor for filemoon.sx pages, whose player setup is P.A.C.K.E.R.-packed."""
    _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P<id>\w+)'

    _TEST = {
        'url': 'https://filemoon.sx/e/dw40rxrzruqz',
        'md5': '5a713742f57ac4aef29b74733e8dda01',
        'info_dict': {
            'id': 'dw40rxrzruqz',
            'title': 'dw40rxrzruqz',
            'ext': 'mp4'
        }
    }

    def _real_extract(self, url):
        """Unpack the last packed script of the page and read its jwplayer sources."""
        # imported here so that the check below does not depend on the
        # module-level import list
        from ..utils import ExtractorError

        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)
        packed_scripts = re.findall(r'(?s)(eval.*?)</script>', webpage)
        if not packed_scripts:
            # report a proper extraction error rather than an IndexError
            raise ExtractorError('Unable to find packed player script')
        # the player setup is taken from the last packed script on the page
        unpacked = decode_packed_codes(packed_scripts[-1])
        jwplayer_sources = self._parse_json(
            self._search_regex(
                r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'),
            video_id, transform_source=js_to_json)

        formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)

        return {
            'id': video_id,
            'title': self._generic_title(url) or video_id,
            'formats': formats
        }

View File

@ -1,19 +1,29 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_filter as filter,
compat_HTTPError,
compat_parse_qs, compat_parse_qs,
compat_urllib_parse_urlparse, compat_urlparse,
) )
from ..utils import ( from ..utils import (
HEADRequest,
determine_ext, determine_ext,
error_to_compat_str,
extract_attributes,
ExtractorError,
int_or_none, int_or_none,
merge_dicts,
orderedSet,
parse_iso8601, parse_iso8601,
strip_or_none, strip_or_none,
try_get, traverse_obj,
url_or_none,
urljoin,
) )
@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor):
return self._download_json( return self._download_json(
'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
def _checked_call_api(self, slug):
    """Call the API for `slug`, reporting a 404 as expected 'not found'.

    Any other failure is re-raised unchanged.
    """
    try:
        return self._call_api(slug)
    except ExtractorError as err:
        http_error = err.cause
        if not (isinstance(http_error, compat_HTTPError) and http_error.code == 404):
            raise
        # make sure the HTTP error carries some args before wrapping it
        http_error.args = http_error.args or [
            http_error.geturl(), http_error.getcode(), http_error.reason]
        raise ExtractorError(
            'Content not found: expired?', cause=http_error,
            expected=True)
def _extract_video_info(self, video, fatal=True):
    """Build an info dict from a video metadata mapping.

    `video` must have a 'videoId' key; everything else is looked up
    defensively. Formats are gathered from the 'refs' manifests, the
    'assets' list and the 'system' mezzanine URL.

    Returns the info dict, or None if no formats were collected and
    `fatal` is false.
    """
    video_id = video['videoId']

    formats = []
    refs = traverse_obj(video, 'refs', expected_type=dict) or {}

    # HLS manifest, if one is referenced
    m3u8_url = url_or_none(refs.get('m3uUrl'))
    if m3u8_url:
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))

    # HDS manifest, if one is referenced
    f4m_url = url_or_none(refs.get('f4mUrl'))
    if f4m_url:
        formats.extend(self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False))

    # directly listed assets; entries without a usable URL are ignored
    for asset in (video.get('assets') or []):
        asset_url = url_or_none(asset.get('url'))
        if not asset_url:
            continue
        formats.append({
            'url': asset_url,
            # NOTE(review): 1000 is presumably a scale factor (bits/s to
            # kbits/s) -- confirm against int_or_none()
            'tbr': int_or_none(asset.get('bitrate'), 1000),
            'fps': int_or_none(asset.get('frame_rate')),
            'height': int_or_none(asset.get('height')),
            'width': int_or_none(asset.get('width')),
        })

    # the mezzanine file is given a raised preference
    mezzanine_url = traverse_obj(
        video, ('system', 'mezzanineUrl'), expected_type=url_or_none)
    if mezzanine_url:
        formats.append({
            'ext': determine_ext(mezzanine_url, 'mp4'),
            'format_id': 'mezzanine',
            'preference': 1,
            'url': mezzanine_url,
        })

    if formats or fatal:
        # when there are no formats and fatal is set, the sort helper is
        # still called -- presumably it raises in that case; TODO confirm
        self._sort_formats(formats)
    else:
        return

    thumbnails = traverse_obj(
        video, ('thumbnails', Ellipsis, {'url': 'url'}), expected_type=url_or_none)
    tags = traverse_obj(
        video, ('tags', Ellipsis, 'displayName'),
        expected_type=lambda x: x.strip() or None)

    metadata = traverse_obj(video, 'metadata', expected_type=dict) or {}
    # title candidates are tried in this order: longTitle, title, name
    title = traverse_obj(
        metadata, 'longTitle', 'title', 'name',
        expected_type=lambda x: x.strip() or None)

    return {
        'id': video_id,
        'title': title,
        'description': strip_or_none(metadata.get('description')),
        'timestamp': parse_iso8601(metadata.get('publishDate')),
        'duration': int_or_none(metadata.get('duration')),
        'thumbnails': thumbnails,
        'formats': formats,
        'tags': tags,
    }
# yt-dlp shim
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for each distinct embed URL found in `webpage`."""
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for found_url in orderedSet(embed_urls, lazy=True):
        # only name this extractor in the result if it has a usable _VALID_URL
        if cls._VALID_URL is False:
            result_ie = None
        else:
            result_ie = cls
        yield cls.url_result(found_url, result_ie)
class IGNIE(IGNBaseIE): class IGNIE(IGNBaseIE):
""" """
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported Some videos of it.ign.com are also supported
""" """
_VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
_VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
_VALID_URL = (
r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
% '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
IE_NAME = 'ign.com' IE_NAME = 'ign.com'
_PAGE_TYPE = 'video' _PAGE_TYPE = 'video'
@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE):
'timestamp': 1370440800, 'timestamp': 1370440800,
'upload_date': '20130605', 'upload_date': '20130605',
'tags': 'count:9', 'tags': 'count:9',
} },
'params': {
'nocheckcertificate': True,
},
}, { }, {
'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
'md5': 'f1581a6fe8c5121be5b807684aeac3f6', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE):
'timestamp': 1420571160, 'timestamp': 1420571160,
'upload_date': '20150106', 'upload_date': '20150106',
'tags': 'count:4', 'tags': 'count:4',
} },
'skip': '404 Not Found',
}, { }, {
'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
'only_matching': True, 'only_matching': True,
}] }]
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """Return the absolute video URLs linked from the first content feed grid.

    Only the first <section> with class content-feed-grid is searched;
    links that cannot be joined to `url` are dropped.
    """
    first_grid = re.search(
        r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
        webpage)
    grid_html = first_grid.group(1) if first_grid else ''
    link_re = (
        r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
        % cls._VIDEO_PATH_RE)
    return filter(None, (
        urljoin(url, link.group('path'))
        for link in re.finditer(link_re, grid_html)))
def _real_extract(self, url): def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
display_id = m.group('id')
if display_id:
return self._extract_video(url, display_id)
display_id = m.group('filt') or 'all'
return self._extract_playlist(url, display_id)
def _extract_playlist(self, url, display_id):
    """Return a playlist of the videos linked from the page at `url`."""
    webpage = self._download_webpage(url, display_id)
    entries = (
        self.url_result(video_url, ie=self.ie_key())
        for video_url in self._extract_embed_urls(url, webpage))
    return self.playlist_result(entries, playlist_id=display_id)
def _extract_video(self, url, display_id):
display_id = self._match_id(url) display_id = self._match_id(url)
video = self._call_api(display_id) video = self._checked_call_api(display_id)
video_id = video['videoId']
metadata = video['metadata']
title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
formats = [] info = self._extract_video_info(video)
refs = video.get('refs') or {}
m3u8_url = refs.get('m3uUrl') return merge_dicts({
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = refs.get('f4mUrl')
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
asset_url = asset.get('url')
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'preference': 1,
'url': mezzanine_url,
})
self._sort_formats(formats)
thumbnails = []
for thumbnail in (video.get('thumbnails') or []):
thumbnail_url = thumbnail.get('url')
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
})
tags = []
for tag in (video.get('tags') or []):
display_name = tag.get('displayName')
if not display_name:
continue
tags.append(display_name)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
'display_id': display_id, 'display_id': display_id,
'thumbnails': thumbnails, }, info)
'formats': formats,
'tags': tags,
}
class IGNVideoIE(InfoExtractor): class IGNVideoIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{ _TESTS = [{
'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor):
'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'description': 'Taking out assassination targets in Hitman has never been more stylish.',
'timestamp': 1444665600, 'timestamp': 1444665600,
'upload_date': '20151012', 'upload_date': '20151012',
} },
'expected_warnings': ['HTTP Error 400: Bad Request'],
}, { }, {
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True, 'only_matching': True,
@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') parsed_url = compat_urlparse.urlparse(url)
url = self._request_webpage(req, video_id).geturl() embed_url = compat_urlparse.urlunparse(
parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
webpage, urlh = self._download_webpage_handle(embed_url, video_id)
new_url = urlh.geturl()
ign_url = compat_parse_qs( ign_url = compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('url', [None])[0] compat_urlparse.urlparse(new_url).query).get('url', [None])[-1]
if ign_url: if ign_url:
return self.url_result(ign_url, IGNIE.ie_key()) return self.url_result(ign_url, IGNIE.ie_key())
return self.url_result(url) video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False)
if not video:
if new_url == url:
raise ExtractorError('Redirect loop: ' + url)
return self.url_result(new_url)
video = extract_attributes(video)
video_data = video.get('data-settings') or '{}'
video_data = self._parse_json(video_data, video_id)['video']
info = self._extract_video_info(video_data)
return merge_dicts({
'display_id': video_id,
}, info)
class IGNArticleIE(IGNBaseIE): class IGNArticleIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)'
_PAGE_TYPE = 'article' _PAGE_TYPE = 'article'
_TESTS = [{ _TESTS = [{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': { 'info_dict': {
'id': '524497489e4e8ff5848ece34', 'id': '72113',
'title': '100 Little Things in GTA 5 That Will Blow Your Mind', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
}, },
'playlist': [ 'playlist': [
@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': { 'info_dict': {
'id': '5ebbd138523268b93c9141af17bec937', 'id': '5ebbd138523268b93c9141af17bec937',
'ext': 'mp4', 'ext': 'mp4',
'title': 'GTA 5 Video Review', 'title': 'Grand Theft Auto V Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
'timestamp': 1379339880, 'timestamp': 1379339880,
'upload_date': '20130916', 'upload_date': '20130916',
@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': { 'info_dict': {
'id': '638672ee848ae4ff108df2a296418ee2', 'id': '638672ee848ae4ff108df2a296418ee2',
'ext': 'mp4', 'ext': 'mp4',
'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'title': 'GTA 5 In Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
'timestamp': 1386878820, 'timestamp': 1386878820,
'upload_date': '20131212', 'upload_date': '20131212',
@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE):
}, },
], ],
'params': { 'params': {
'playlist_items': '2-3',
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Backend fetch failed'],
}, { }, {
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': { 'info_dict': {
'id': '53ee806780a81ec46e0790f8', 'id': '53ee806780a81ec46e0790f8',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
}, },
'playlist_count': 2, 'playlist_count': 1,
'expected_warnings': ['Backend fetch failed'],
}, { }, {
# videoId pattern # videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
def _checked_call_api(self, slug):
    """Call the API for `slug` with special handling of HTTP 404 and 503.

    A 404 is reported as an expected 'not found' error; a 503 only
    produces a warning and None is returned. Any other failure is
    re-raised unchanged.
    """
    try:
        return self._call_api(slug)
    except ExtractorError as err:
        http_error = err.cause
        if not isinstance(http_error, compat_HTTPError):
            raise
        # make sure the HTTP error carries some args before reporting it
        http_error.args = http_error.args or [
            http_error.geturl(), http_error.getcode(), http_error.reason]
        if http_error.code == 404:
            raise ExtractorError(
                'Content not found: expired?', cause=http_error,
                expected=True)
        if http_error.code == 503:
            self.report_warning(error_to_compat_str(http_error))
            return
        raise
def _search_nextjs_data(self, webpage, video_id, **kw):
    """Find and parse the JSON content of the page's __NEXT_DATA__ script.

    Keyword arguments are split between the two helpers instead of being
    passed wholesale to both:
    transform_source -- passed only to the JSON parser
    fatal            -- passed to both the regex search and the JSON parser
    anything else    -- passed only to the regex search (eg, default)

    Returns the parsed data, or None if the search yielded None (script
    not found in a non-fatal search).
    """
    parse_kw = {}
    if 'transform_source' in kw:
        parse_kw['transform_source'] = kw.pop('transform_source')
    if 'fatal' in kw:
        parse_kw['fatal'] = kw['fatal']
    next_data = self._search_regex(
        r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
        webpage, 'next.js data', **kw)
    if next_data is None:
        # nothing to parse: passing None on would raise a TypeError
        return None
    return self._parse_json(next_data, video_id, **parse_kw)
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
article = self._call_api(display_id) article = self._checked_call_api(display_id)
if article:
# obsolete ?
def entries(): def entries():
media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) media_url = traverse_obj(
article, ('mediaRelations', 0, 'media', 'metadata', 'url'),
expected_type=url_or_none)
if media_url: if media_url:
yield self.url_result(media_url, IGNIE.ie_key()) yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []): for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
if url_or_none(video_url):
yield self.url_result(video_url) yield self.url_result(video_url)
return self.playlist_result( return self.playlist_result(
entries(), article.get('articleId'), entries(), article.get('articleId'),
strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) traverse_obj(
article, ('metadata', 'headline'),
expected_type=lambda x: x.strip() or None))
webpage = self._download_webpage(url, display_id)
playlist_id = self._html_search_meta('dable:item_id', webpage, default=None)
if playlist_id:
def entries():
for m in re.finditer(
r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''',
webpage):
flashvars = self._search_regex(
r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''',
m.group('params'), 'flashvars', default='')
flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '')
v_url = url_or_none((flashvars.get('url') or [None])[-1])
if v_url:
yield self.url_result(v_url)
else:
playlist_id = self._search_regex(
r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''',
webpage, 'id', group='id', default=None)
nextjs_data = self._search_nextjs_data(webpage, display_id)
def entries():
for player in traverse_obj(
nextjs_data,
('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')):
# skip promo links (which may not always be served, eg GH CI servers)
if traverse_obj(nextjs_data,
('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')),
expected_type=dict):
continue
video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {}
info = self._extract_video_info(video, fatal=False)
if info:
yield merge_dicts({
'display_id': display_id,
}, info)
return self.playlist_result(
entries(), playlist_id or display_id,
re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)

View File

@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import update_url
class KommunetvIE(InfoExtractor):
    """Extractor for archived streams on <municipality>.kommunetv.no."""
    # the dots are escaped so that only real kommunetv.no hosts match
    _VALID_URL = r'https://(\w+)\.kommunetv\.no/archive/(?P<id>\w+)'
    _TEST = {
        'url': 'https://oslo.kommunetv.no/archive/921',
        'md5': '5f102be308ee759be1e12b63d5da4bbc',
        'info_dict': {
            'id': '921',
            'title': 'Bystyremøte',
            'ext': 'mp4'
        }
    }

    def _real_extract(self, url):
        """Fetch the stream description from the API and extract its HLS formats."""
        video_id = self._match_id(url)
        headers = {
            'Accept': 'application/json'
        }
        # NOTE(review): the API host is always oslo.kommunetv.no, whatever
        # sub-domain the page URL has -- confirm that this is intended
        data = self._download_json(
            'https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id,
            video_id, headers=headers)
        title = data['stream']['title']
        playlist_file = data['playlist'][0]['playlist'][0]['file']
        # drop any query and fragment from the playlist URL
        m3u8_url = update_url(playlist_file, query=None, fragment=None)
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
            m3u8_id='hls', fatal=False)
        self._sort_formats(formats)
        return {
            'id': video_id,
            'formats': formats,
            'title': title
        }

View File

@ -0,0 +1,97 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class RbgTumIE(InfoExtractor):
    """Extractor for single lecture recordings on live.rbg.tum.de."""
    _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
    _TESTS = [{
        # Combined view
        'url': 'https://live.rbg.tum.de/w/cpp/22128',
        'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
        'info_dict': {
            'id': 'cpp/22128',
            'ext': 'mp4',
            'title': 'Lecture: October 18. 2022',
            'series': 'Concepts of C++ programming (IN2377)',
        }
    }, {
        # Presentation only
        'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
        'md5': '36c584272179f3e56b0db5d880639cba',
        'info_dict': {
            'id': 'I2DL/12349/PRES',
            'ext': 'mp4',
            'title': 'Lecture 3: Introduction to Neural Networks',
            'series': 'Introduction to Deep Learning (IN2346)',
        }
    }, {
        # Camera only
        'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
        'md5': 'e04189d92ff2f56aedf5cede65d37aad',
        'info_dict': {
            'id': 'fvv-info/16130/CAM',
            'ext': 'mp4',
            'title': 'Fachschaftsvollversammlung',
            'series': 'Fachschaftsvollversammlung Informatik',
        }
    }]

    def _real_extract(self, url):
        """Read the playlist URL, lecture title and series name from the lecture page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # the first https URL ending in .m3u8 found in the page is the playlist
        playlist_url = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
        info = {
            'id': video_id,
            'title': self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title'),
            'series': self._html_search_regex(
                r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series'),
        }

        formats = self._extract_m3u8_formats(
            playlist_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(formats)
        info['formats'] = formats

        return info
class RbgTumCourseIE(InfoExtractor):
    """Extractor for course pages on live.rbg.tum.de, giving a playlist of lectures."""
    _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
    _TESTS = [{
        'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
        'info_dict': {
            'title': 'Funktionale Programmierung und Verifikation (IN0003)',
            'id': '2022/S/fpv',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 13,
    }, {
        'url': 'https://live.rbg.tum.de/course/2022/W/set',
        'info_dict': {
            'title': 'SET FSMPIC',
            'id': '2022/W/set',
        },
        'params': {
            'noplaylist': False,
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Collect the lecture links of the course page into a playlist."""
        course_id = self._match_id(url)
        webpage = self._download_webpage(url, course_id)

        lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')

        # The path is matched with [^"]+ so that a match can never run past
        # the closing quote of its own href attribute, even when several
        # links share one line. Links to the /cam, /pres and /chat variants
        # of a lecture are skipped by the look-behinds.
        lecture_paths = re.findall(
            r'(?i)href="/w/([^"]+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage)
        lecture_urls = [
            self.url_result('https://live.rbg.tum.de/w/' + lecture_path, ie=RbgTumIE.ie_key())
            for lecture_path in lecture_paths]

        return self.playlist_result(lecture_urls, course_id, lecture_series_title)

View File

@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import binascii
import random
import re
import string
from .common import InfoExtractor
from ..utils import urljoin, url_basename
def to_ascii_hex(str1):
    """Return the hex digits of the UTF-8 encoding of str1, as text."""
    utf8_bytes = str1.encode('utf-8')
    hex_bytes = binascii.b2a_hex(utf8_bytes)
    return hex_bytes.decode('ascii')
def generate_random_string(length):
    """Return `length` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)
class StreamsbIE(InfoExtractor):
    """Extractor for streamsb.com mirrors (currently viewsb.com)."""
    _DOMAINS = ('viewsb.com', )
    # each domain is escaped so that its dots only match literal dots
    _VALID_URL = r'https://(?P<domain>%s)/(?P<id>.+)' % '|'.join(map(re.escape, _DOMAINS))

    _TEST = {
        'url': 'https://viewsb.com/dxfvlu4qanjx',
        'md5': '488d111a63415369bf90ea83adc8a325',
        'info_dict': {
            'id': 'dxfvlu4qanjx',
            'ext': 'mp4',
            'title': 'Sintel'
        }
    }

    def _real_extract(self, url):
        """Follow the page's player iframe and query the sources endpoint for the stream."""
        domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id')
        webpage = self._download_webpage(url, video_id)

        iframe_rel_url = self._search_regex(r'''(?i)<iframe\b[^>]+\bsrc\s*=\s*('|")(?P<path>/.*\.html)\1''', webpage, 'iframe', group='path')
        iframe_url = urljoin('https://' + domain, iframe_rel_url)

        iframe_data = self._download_webpage(iframe_url, video_id)
        # the number in the app script's file name selects the sources
        # endpoint; '50' is the fallback when it cannot be found
        app_version = self._search_regex(r'''<script\b[^>]+\bsrc\s*=\s*["'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50'

        # the video code is the part of the iframe file name before its first dot
        video_code = url_basename(iframe_url).rsplit('.')[0]

        length = 12
        # request token: the video code and a fixed tag padded with random
        # strings, joined by '||' and hex-encoded into the request path
        req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb'))
        ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req))

        video_data = self._download_webpage(ereq, video_id, headers={
            'Referer': iframe_url,
            'watchsb': 'sbstream',
        })
        player_data = self._parse_json(video_data, video_id)
        title = player_data['stream_data']['title']
        formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
        # sort the formats as the other extractors do after collecting them
        self._sort_formats(formats)
        return {
            'id': video_id,
            'formats': formats,
            'title': title,
        }

View File

@ -270,17 +270,23 @@ class VimeoIE(VimeoBaseInfoExtractor):
\. \.
)? )?
vimeo(?:pro)?\.com/ vimeo(?:pro)?\.com/
(?:
(?P<u>user)|
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)?? (?:.*?/)??
(?: (?P<q>
(?: (?:
play_redirect_hls| play_redirect_hls|
moogaloop\.swf)\?clip_id= moogaloop\.swf)\?clip_id=
)? )?
(?:videos?/)? (?:videos?/)?
)
(?P<id>[0-9]+) (?P<id>[0-9]+)
(?:/(?P<unlisted_hash>[\da-f]{10}))? (?(u)
/?(?:[?&].*)?(?:[#].*)?$ /(?!videos|likes)[^/?#]+/?|
(?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
)
(?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$
''' '''
IE_NAME = 'vimeo' IE_NAME = 'vimeo'
_TESTS = [ _TESTS = [
@ -539,7 +545,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
} },
{
# user playlist alias -> https://vimeo.com/258705797
'url': 'https://vimeo.com/user26785108/newspiritualguide',
'only_matching': True,
},
# https://gettingthingsdone.com/workflowmap/ # https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header # vimeo embed with check-password page protected by Referer header
] ]
@ -663,7 +674,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
if '//player.vimeo.com/video/' in url: if '//player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex( config = self._parse_json(self._search_regex(
r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id)
if config.get('view') == 4: if config.get('view') == 4:
config = self._verify_player_video_password( config = self._verify_player_video_password(
redirect_url, video_id, headers) redirect_url, video_id, headers)

View File

@ -14,12 +14,11 @@ from ..compat import (
compat_chr, compat_chr,
compat_HTTPError, compat_HTTPError,
compat_map as map, compat_map as map,
compat_parse_qs,
compat_str, compat_str,
compat_urllib_parse,
compat_urllib_parse_parse_qs as compat_parse_qs,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urlparse,
) )
from ..jsinterp import JSInterpreter from ..jsinterp import JSInterpreter
from ..utils import ( from ..utils import (
@ -28,20 +27,25 @@ from ..utils import (
dict_get, dict_get,
error_to_compat_str, error_to_compat_str,
float_or_none, float_or_none,
extract_attributes,
get_element_by_attribute,
int_or_none, int_or_none,
js_to_json, js_to_json,
mimetype2ext, mimetype2ext,
parse_codecs, parse_codecs,
parse_duration, parse_duration,
parse_qs,
qualities, qualities,
remove_start, remove_start,
smuggle_url, smuggle_url,
str_or_none, str_or_none,
str_to_int, str_to_int,
traverse_obj,
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
unsmuggle_url, unsmuggle_url,
update_url,
update_url_query, update_url_query,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
@ -49,10 +53,6 @@ from ..utils import (
) )
def parse_qs(url):
return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors""" """Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
@ -286,15 +286,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
def _call_api(self, ep, query, video_id, fatal=True): def _call_api(self, ep, query, video_id, fatal=True, headers=None):
data = self._DEFAULT_API_DATA.copy() data = self._DEFAULT_API_DATA.copy()
data.update(query) data.update(query)
real_headers = {'content-type': 'application/json'}
if headers:
real_headers.update(headers)
return self._download_json( return self._download_json(
'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
note='Downloading API JSON', errnote='Unable to download API page', note='Downloading API JSON', errnote='Unable to download API page',
data=json.dumps(data).encode('utf8'), fatal=fatal, data=json.dumps(data).encode('utf8'), fatal=fatal,
headers={'content-type': 'application/json'}, headers=real_headers,
query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
def _extract_yt_initial_data(self, video_id, webpage): def _extract_yt_initial_data(self, video_id, webpage):
@ -515,6 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Philipp Hagemeister', 'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag', 'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'channel': 'Philipp Hagemeister',
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002', 'upload_date': '20121002',
@ -524,10 +528,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 10, 'duration': 10,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int, 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
'start_time': 1, 'start_time': 1,
'end_time': 9, 'end_time': 9,
} },
}, },
{ {
'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
@ -562,7 +566,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': 10, 'duration': 10,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -621,8 +624,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
} }
}, },
# Normal age-gate video (No vevo, embed allowed), available via embed page # Age-gated videos
{ {
'note': 'Age-gated video (No vevo, embed allowed)',
'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
'info_dict': { 'info_dict': {
'id': 'HtVdAasjOgU', 'id': 'HtVdAasjOgU',
@ -634,14 +638,98 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'WitcherGame', 'uploader_id': 'WitcherGame',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605', 'upload_date': '20140605',
'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg',
'age_limit': 18, 'age_limit': 18,
'categories': ['Gaming'],
'tags': 'count:17',
'channel': 'The Witcher',
'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg',
'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
'view_count': int,
'like_count': int,
}, },
}, },
{ {
# Age-gated video only available with authentication (unavailable 'note': 'Age-gated video with embed allowed in public site',
# via embed page workaround) 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U',
'info_dict': {
'id': 'HsUATh_Nc2U',
'ext': 'mp4',
'title': 'Godzilla 2 (Official Video)',
'description': 'md5:bf77e03fcae5529475e500129b05668a',
'duration': 177,
'uploader': 'FlyingKitty',
'uploader_id': 'FlyingKitty900',
'upload_date': '20200408',
'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',
'age_limit': 18,
'categories': ['Entertainment'],
'tags': ['Flyingkitty', 'godzilla 2'],
'channel': 'FlyingKitty',
'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
'view_count': int,
'like_count': int,
},
},
{
'note': 'Age-gated video embeddable only with clientScreen=EMBED',
'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
'info_dict': {
'id': 'Tq92D6wQ1mg',
'ext': 'mp4',
'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
'description': 'md5:17eccca93a786d51bc67646756894066',
'duration': 106,
'uploader': 'Projekt Melody',
'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
'upload_date': '20191227',
'age_limit': 18,
'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg',
'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
'categories': ['Entertainment'],
'channel': 'Projekt Melody',
'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
'view_count': int,
'like_count': int,
},
},
{
'note': 'Non-Age-gated non-embeddable video',
'url': 'https://youtube.com/watch?v=MeJVWBSsPAY',
'info_dict': {
'id': 'MeJVWBSsPAY',
'ext': 'mp4',
'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
'description': 'Fan Video. Music & Lyrics by OOMPH!.',
'duration': 210,
'uploader': 'Herr Lurik',
'uploader_id': 'st3in234',
'upload_date': '20130730',
'uploader_url': 'http://www.youtube.com/user/st3in234',
'age_limit': 0,
'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg',
'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'],
'categories': ['Music'],
'channel': 'Herr Lurik',
'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
'artist': 'OOMPH!',
'view_count': int,
'like_count': int,
},
},
{
'note': 'Non-bypassable age-gated video',
'url': 'https://youtube.com/watch?v=Cr381pDsSsA',
'only_matching': True,
},
{
'note': 'Age-gated video only available with authentication (not via embed workaround)',
'url': 'XgnwCQzjau8', 'url': 'XgnwCQzjau8',
'only_matching': True, 'only_matching': True,
'skip': '''This video has been removed for violating YouTube's Community Guidelines''',
}, },
# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
# YouTube Red ad is not captured for creator # YouTube Red ad is not captured for creator
@ -670,17 +758,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': { 'info_dict': {
'id': 'lqQg6PlCWgI', 'id': 'lqQg6PlCWgI',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
'description': r're:(?s)(?:.+\s)?HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*',
'duration': 6085, 'duration': 6085,
'upload_date': '20150827', 'upload_date': '20150827',
'uploader_id': 'olympic', 'uploader_id': 'olympic',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': r're:Olympics?',
'uploader': 'Olympic', 'age_limit': 0,
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg',
'categories': ['Sports'],
'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'],
'channel': 'Olympics',
'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q',
'view_count': int,
'like_count': int,
}, },
'params': {
'skip_download': 'requires avconv',
}
}, },
# Non-square pixels # Non-square pixels
{ {
@ -840,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'lsguqyKfVQg', 'id': 'lsguqyKfVQg',
'ext': 'mp4', 'ext': 'mp4',
'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
'alt_title': 'Dark Walk - Position Music', 'alt_title': 'Dark Walk',
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'duration': 133, 'duration': 133,
'upload_date': '20151119', 'upload_date': '20151119',
'uploader_id': 'IronSoulElf', 'uploader_id': 'IronSoulElf',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf', 'uploader': 'IronSoulElf',
'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan',
'track': 'Dark Walk - Position Music', 'track': 'Dark Walk',
'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', 'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan',
'album': 'Position Music - Production Music Vol. 143 - Dark Walk', 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
}, },
'params': { 'params': {
@ -1301,11 +1395,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@classmethod
def suitable(cls, url):
    """Return False for URLs with a non-empty `list` query parameter.

    Any other URL is judged by the parent class's suitable().
    """
    if parse_qs(url).get('list', [None])[0]:
        return False
    return super(YoutubeIE, cls).suitable(url)
@ -1455,7 +1545,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_url.startswith('//'): if player_url.startswith('//'):
player_url = 'https:' + player_url player_url = 'https:' + player_url
elif not re.match(r'https?://', player_url): elif not re.match(r'https?://', player_url):
player_url = compat_urlparse.urljoin( player_url = compat_urllib_parse.urljoin(
'https://www.youtube.com', player_url) 'https://www.youtube.com', player_url)
return player_url return player_url
@ -1537,9 +1627,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _unthrottle_format_urls(self, video_id, player_url, formats): def _unthrottle_format_urls(self, video_id, player_url, formats):
for fmt in formats: for fmt in formats:
parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url'])
qs = compat_urlparse.parse_qs(parsed_fmt_url.query) n_param = compat_parse_qs(parsed_fmt_url.query).get('n')
n_param = qs.get('n')
if not n_param: if not n_param:
continue continue
n_param = n_param[-1] n_param = n_param[-1]
@ -1547,9 +1636,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if n_response is None: if n_response is None:
# give up if descrambling failed # give up if descrambling failed
break break
qs['n'] = [n_response] fmt['url'] = update_url(
fmt['url'] = compat_urlparse.urlunparse( parsed_fmt_url, query_update={'n': [n_response]})
parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
# from yt-dlp, with tweaks
def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
    """
    Extract signatureTimestamp (sts)
    Required to tell API what sig/player version is in use.

    The value is taken from ytcfg['STS'] when ytcfg is a dict holding a
    usable value; otherwise it is searched for in the player code fetched
    for player_url. Without a player_url an ExtractorError is raised if
    `fatal`, else a warning is reported and None is returned.
    """
    if isinstance(ytcfg, dict):
        timestamp = int_or_none(ytcfg.get('STS'))
        if timestamp:
            return timestamp

    # Nothing usable in ytcfg: attempt to extract from player
    if player_url is None:
        error_msg = 'Cannot extract signature timestamp without player_url.'
        if fatal:
            raise ExtractorError(error_msg)
        self._downloader.report_warning(error_msg)
        return None

    player_code = self._get_player_code(video_id, player_url)
    found = self._search_regex(
        r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', player_code or '',
        'JS player signature timestamp', group='sts', fatal=fatal)
    return int_or_none(found)
def _mark_watched(self, video_id, player_response): def _mark_watched(self, video_id, player_response):
playback_url = url_or_none(try_get( playback_url = url_or_none(try_get(
@ -1557,20 +1666,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
if not playback_url: if not playback_url:
return return
parsed_playback_url = compat_urlparse.urlparse(playback_url)
qs = compat_urlparse.parse_qs(parsed_playback_url.query)
# cpn generation algorithm is reverse engineered from base.js. # cpn generation algorithm is reverse engineered from base.js.
# In fact it works even with dummy cpn. # In fact it works even with dummy cpn.
CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
qs.update({ playback_url = update_url(
playback_url, query_update={
'ver': ['2'], 'ver': ['2'],
'cpn': [cpn], 'cpn': [cpn],
}) })
playback_url = compat_urlparse.urlunparse(
parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
self._download_webpage( self._download_webpage(
playback_url, video_id, 'Marking watched', playback_url, video_id, 'Marking watched',
@ -1675,6 +1781,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
player_response = None player_response = None
player_url = None
if webpage: if webpage:
player_response = self._extract_yt_initial_variable( player_response = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
@ -1683,27 +1790,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response = self._call_api( player_response = self._call_api(
'player', {'videoId': video_id}, video_id) 'player', {'videoId': video_id}, video_id)
def is_agegated(playability):
    """Tell whether a playabilityStatus dict describes an age-gated video.

    Returns None if `playability` is not a dict, True if it has a truthy
    'desktopLegacyAgeGateReason' or if its 'status' or 'reason' value
    contains one of the known age-gate markers, otherwise False.
    """
    if not isinstance(playability, dict):
        return

    if playability.get('desktopLegacyAgeGateReason'):
        return True

    # Must be a real list: a lazy filter() object would be consumed while
    # testing the first marker, so the remaining markers would be compared
    # against nothing
    reasons = [
        value for value in (playability.get(key) for key in ('status', 'reason'))
        if value]
    AGE_GATE_REASONS = (
        'confirm your age', 'age-restricted', 'inappropriate',  # reason
        'age_verification_required', 'age_check_required',  # status
    )
    return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons)
if video_info:
pr = self._parse_json( def get_playability_status(response):
try_get( return try_get(response, lambda x: x['playabilityStatus'], dict) or {}
compat_parse_qs(video_info),
lambda x: x['player_response'][0], compat_str) or '{}', playability_status = get_playability_status(player_response)
video_id, fatal=False) if (is_agegated(playability_status)
if pr and isinstance(pr, dict): and int_or_none(self._downloader.params.get('age_limit'), default=18) >= 18):
player_response = pr
self.report_age_confirmation()
# Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233
pb_context = {'html5Preference': 'HTML5_PREF_WANTS'}
# Use signatureTimestamp if available
# Thanks https://github.com/ytdl-org/youtube-dl/issues/31034#issuecomment-1160718026
player_url = self._extract_player_url(webpage)
ytcfg = self._extract_ytcfg(video_id, webpage)
sts = self._extract_signature_timestamp(video_id, player_url, ytcfg)
if sts:
pb_context['signatureTimestamp'] = sts
query = {
'playbackContext': {'contentPlaybackContext': pb_context},
'contentCheckOk': True,
'racyCheckOk': True,
'context': {
'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'},
'thirdParty': {'embedUrl': 'https://google.com'},
},
'videoId': video_id,
}
headers = {
'X-YouTube-Client-Name': '85',
'X-YouTube-Client-Version': '2.0',
'Origin': 'https://www.youtube.com'
}
video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
age_gate_status = get_playability_status(video_info)
if age_gate_status.get('status') == 'OK':
player_response = video_info
playability_status = age_gate_status
trailer_video_id = try_get( trailer_video_id = try_get(
playability_status, playability_status,
@ -1785,7 +1926,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats = [] formats = []
itags = [] itags = []
itag_qualities = {} itag_qualities = {}
player_url = None
q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
streaming_data = player_response.get('streamingData') or {} streaming_data = player_response.get('streamingData') or {}
streaming_formats = streaming_data.get('formats') or [] streaming_formats = streaming_data.get('formats') or []
@ -1929,15 +2069,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
thumbnails = [] thumbnails = []
for container in (video_details, microformat): for container in (video_details, microformat):
for thumbnail in (try_get( for thumbnail in try_get(
container, container,
lambda x: x['thumbnail']['thumbnails'], list) or []): lambda x: x['thumbnail']['thumbnails'], list) or []:
thumbnail_url = thumbnail.get('url') thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url: if not thumbnail_url:
continue continue
thumbnails.append({ thumbnails.append({
'height': int_or_none(thumbnail.get('height')), 'height': int_or_none(thumbnail.get('height')),
'url': thumbnail_url, 'url': update_url(thumbnail_url, query=None, fragment=None),
'width': int_or_none(thumbnail.get('width')), 'width': int_or_none(thumbnail.get('width')),
}) })
if thumbnails: if thumbnails:
@ -1956,7 +2096,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or microformat.get('lengthSeconds')) \ or microformat.get('lengthSeconds')) \
or parse_duration(search_meta('duration')) or parse_duration(search_meta('duration'))
is_live = video_details.get('isLive') is_live = video_details.get('isLive')
owner_profile_url = microformat.get('ownerProfileUrl')
def gen_owner_profile_url():
    # Yield candidate owner profile URLs, preferred source first: the
    # microformat value, then the href of the <link itemprop="url"> tag
    # found inside the itemprop="author" element of the page
    yield microformat.get('ownerProfileUrl')
    # The page is fetched non-fatally, so it may be missing, and the
    # author element may not be found (presumably reported as None --
    # confirm against get_element_by_attribute). Search an empty string
    # in either case so the regex default applies instead of a TypeError.
    author_html = get_element_by_attribute('itemprop', 'author', webpage or '') or ''
    yield extract_attributes(self._search_regex(
        r'''(?s)(<link\b[^>]+\bitemprop\s*=\s*("|')url\2[^>]*>)''',
        author_html,
        'owner_profile_url', default='')).get('href')
owner_profile_url = next(
(x for x in map(url_or_none, gen_owner_profile_url()) if x),
None)
if not player_url: if not player_url:
player_url = self._extract_player_url(webpage) player_url = self._extract_player_url(webpage)
@ -2041,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
info[d_k] = parse_duration(query[k][0]) info[d_k] = parse_duration(query[k][0])
if video_description: if video_description:
# Youtube Music Auto-generated description
mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
if mobj: if mobj:
release_year = mobj.group('release_year') release_year = mobj.group('release_year')
@ -2115,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
info['location'] = stl info['location'] = stl
else: else:
mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) # •? doesn't match, but [•]? does; \xa0 = non-breaking space
mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl)
if mobj: if mobj:
info.update({ info.update({
'series': mobj.group(1), 'series': mobj.group(1),
@ -2126,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
vpir, vpir,
lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
list) or []): list) or []):
tbr = tlb.get('toggleButtonRenderer') or {} tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {}
for getter, regex in [( for getter, regex in [(
lambda x: x['defaultText']['accessibility']['accessibilityData'], lambda x: x['defaultText']['accessibility']['accessibilityData'],
r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
@ -2142,6 +2294,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
sbr_tooltip = try_get( sbr_tooltip = try_get(
vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
if sbr_tooltip: if sbr_tooltip:
# however dislike_count was hidden by YT, as if there could ever be dislikable content on YT
like_count, dislike_count = sbr_tooltip.split(' / ') like_count, dislike_count = sbr_tooltip.split(' / ')
info.update({ info.update({
'like_count': str_to_int(like_count), 'like_count': str_to_int(like_count),
@ -2179,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif mrr_title == 'Song': elif mrr_title == 'Song':
info['track'] = mrr_contents_text info['track'] = mrr_contents_text
# this is not extraction but spelunking!
carousel_lockups = traverse_obj(
initial_data,
('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer',
'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis,
'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis),
expected_type=dict) or []
# try to reproduce logic from metadataRowContainerRenderer above (if it still is)
fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license'))
# multiple_songs ?
if len(carousel_lockups) > 1:
fields = fields[-1:]
for info_row in traverse_obj(
carousel_lockups,
(0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'),
expected_type=dict):
row_title = traverse_obj(info_row, ('title', 'simpleText'))
row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text)
if not row_text:
continue
for name, field in fields:
if name == row_title and not info.get(field):
info[field] = row_text
for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
v = info.get(s_k) v = info.get(s_k)
if v: if v:
@ -2411,7 +2588,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'tags': list, 'tags': list,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'dislike_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -2438,7 +2614,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'categories': ['News & Politics'], 'categories': ['News & Politics'],
'tags': list, 'tags': list,
'like_count': int, 'like_count': int,
'dislike_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -2458,7 +2633,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'categories': ['News & Politics'], 'categories': ['News & Politics'],
'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
'like_count': int, 'like_count': int,
'dislike_count': int,
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -3043,8 +3217,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
item_id = self._match_id(url) item_id = self._match_id(url)
url = compat_urlparse.urlunparse( url = update_url(url, netloc='www.youtube.com')
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
# Handle both video/playlist URLs # Handle both video/playlist URLs
qs = parse_qs(url) qs = parse_qs(url)
video_id = qs.get('v', [None])[0] video_id = qs.get('v', [None])[0]
@ -3144,11 +3317,7 @@ class YoutubePlaylistIE(InfoExtractor):
def suitable(cls, url):
    """Return False for URLs that YoutubeTabIE accepts or that carry a
    non-empty `v` query parameter.

    Any other URL is judged by the parent class's suitable().
    NOTE(review): the decorator line lies outside this hunk -- presumably
    @classmethod, given the `cls` parameter; confirm.
    """
    if YoutubeTabIE.suitable(url):
        return False
    if parse_qs(url).get('v', [None])[0]:
        return False
    return super(YoutubePlaylistIE, cls).suitable(url)
@ -3178,7 +3347,6 @@ class YoutubeYtBeIE(InfoExtractor):
'categories': ['Nonprofits & Activism'], 'categories': ['Nonprofits & Activism'],
'tags': list, 'tags': list,
'like_count': int, 'like_count': int,
'dislike_count': int,
}, },
'params': { 'params': {
'noplaylist': True, 'noplaylist': True,
@ -3288,9 +3456,9 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
}] }]
def _real_extract(self, url):
    """Build a playlist result from a search-results URL.

    The search terms come from the `search_query` parameter or, failing
    that, `q`; `sp` (default '') is forwarded as the search params. When a
    parameter is repeated, its last value is used.
    """
    qs = parse_qs(url)
    # NOTE(review): raises TypeError if neither parameter is present;
    # presumably _VALID_URL rules that out -- confirm
    query = (qs.get('search_query') or qs.get('q'))[-1]
    params = qs.get('sp', ('',))[-1]
    return self.playlist_result(self._search_results(query, params), query, query)

View File

@ -201,7 +201,7 @@ class JSInterpreter(object):
def __init__(self, msg, *args, **kwargs):
    """Create the exception, optionally naming the offending expression.

    If an `expr` keyword argument is given (and is not None), the message
    is right-stripped and suffixed with ' in: ' plus the repr of `expr`,
    cut to at most 100 characters. `expr` is not passed on to the parent
    constructor.
    """
    expr = kwargs.pop('expr', None)
    if expr is not None:
        # `.100` truncates the repr itself, so the shown text never
        # exceeds 100 characters whatever escaping repr() adds
        msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr)
    super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
class JS_RegExp(object): class JS_RegExp(object):
@ -699,7 +699,7 @@ class JSInterpreter(object):
""" assert, but without risk of getting optimized out """ """ assert, but without risk of getting optimized out """
if not cndn: if not cndn:
memb = member memb = member
raise self.Exception('{member} {msg}'.format(**locals()), expr=expr) raise self.Exception('{memb} {msg}'.format(**locals()), expr=expr)
def eval_method(): def eval_method():
if (variable, member) == ('console', 'debug'): if (variable, member) == ('console', 'debug'):

View File

@ -42,6 +42,7 @@ from .compat import (
compat_HTMLParser, compat_HTMLParser,
compat_HTTPError, compat_HTTPError,
compat_basestring, compat_basestring,
compat_casefold,
compat_chr, compat_chr,
compat_collections_abc, compat_collections_abc,
compat_cookiejar, compat_cookiejar,
@ -54,18 +55,18 @@ from .compat import (
compat_integer_types, compat_integer_types,
compat_kwargs, compat_kwargs,
compat_os_name, compat_os_name,
compat_parse_qs, compat_re_Match,
compat_shlex_quote, compat_shlex_quote,
compat_str, compat_str,
compat_struct_pack, compat_struct_pack,
compat_struct_unpack, compat_struct_unpack,
compat_urllib_error, compat_urllib_error,
compat_urllib_parse, compat_urllib_parse,
compat_urllib_parse_parse_qs as compat_parse_qs,
compat_urllib_parse_urlencode, compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
compat_urllib_request, compat_urllib_request,
compat_urlparse,
compat_xpath, compat_xpath,
) )
@ -80,12 +81,12 @@ def register_socks_protocols():
# In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
# URLs with protocols not in urlparse.uses_netloc are not handled correctly # URLs with protocols not in urlparse.uses_netloc are not handled correctly
for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
if scheme not in compat_urlparse.uses_netloc: if scheme not in compat_urllib_parse.uses_netloc:
compat_urlparse.uses_netloc.append(scheme) compat_urllib_parse.uses_netloc.append(scheme)
# Unfavoured alias for the type of a compiled regex *pattern*, i.e. what
# re.compile() returns. It must not be bound to compat_re_Match: that is the
# type of a match object, so isinstance(re.compile(...), compiled_regex_type)
# checks would stop recognising compiled patterns.
compiled_regex_type = type(re.compile(''))
def random_user_agent(): def random_user_agent():
@ -2725,7 +2726,7 @@ def make_socks_conn_class(base_class, socks_proxy):
assert issubclass(base_class, ( assert issubclass(base_class, (
compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
url_components = compat_urlparse.urlparse(socks_proxy) url_components = compat_urllib_parse.urlparse(socks_proxy)
if url_components.scheme.lower() == 'socks5': if url_components.scheme.lower() == 'socks5':
socks_type = ProxyType.SOCKS5 socks_type = ProxyType.SOCKS5
elif url_components.scheme.lower() in ('socks', 'socks4'): elif url_components.scheme.lower() in ('socks', 'socks4'):
@ -3673,7 +3674,7 @@ def remove_quotes(s):
def url_basename(url):
    """Return the last '/'-separated segment of the path of `url`.

    Leading and trailing slashes of the path are ignored; the result is ''
    when the path is empty or consists only of slashes.
    """
    path = compat_urllib_parse.urlparse(url).path
    return path.strip('/').split('/')[-1]
@ -3693,7 +3694,7 @@ def urljoin(base, path):
if not isinstance(base, compat_str) or not re.match( if not isinstance(base, compat_str) or not re.match(
r'^(?:https?:)?//', base): r'^(?:https?:)?//', base):
return None return None
return compat_urlparse.urljoin(base, path) return compat_urllib_parse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request): class HEADRequest(compat_urllib_request.Request):
@ -4091,6 +4092,10 @@ def escape_url(url):
).geturl() ).geturl()
def parse_qs(url):
    """Parse the query string of `url` and return the decoded parameters."""
    query_string = compat_urllib_parse.urlparse(url).query
    return compat_parse_qs(query_string)
def read_batch_urls(batch_fd): def read_batch_urls(batch_fd):
def fixup(url): def fixup(url):
if not isinstance(url, compat_str): if not isinstance(url, compat_str):
@ -4111,14 +4116,28 @@ def urlencode_postdata(*args, **kargs):
return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
def update_url(url, **kwargs):
    """Replace URL components specified by kwargs
       url: compat_str or parsed URL tuple
       kwargs: components to replace, named as the fields of the parsed
       URL (e.g. netloc, query, fragment)
       if query_update is in kwargs, update query with
       its value instead of replacing (overrides any `query`)
       returns: compat_str
    """
    if not kwargs:
        # nothing to replace: just make sure a string comes back
        return compat_urllib_parse.urlunparse(url) if isinstance(url, tuple) else url
    if not isinstance(url, tuple):
        url = compat_urllib_parse.urlparse(url)
    # query_update is not a URL component, so take it out before _replace()
    query = kwargs.pop('query_update', None)
    if query:
        qs = compat_parse_qs(url.query)
        qs.update(query)
        # second argument True is urlencode's doseq: qs values are sequences
        kwargs['query'] = compat_urllib_parse_urlencode(qs, True)
    kwargs = compat_kwargs(kwargs)
    return compat_urllib_parse.urlunparse(url._replace(**kwargs))
def update_url_query(url, query):
    """Return `url` with the items of `query` merged into its query string.

    Parameters named in `query` replace existing ones of the same name;
    all others are kept. With an empty or missing `query` there is nothing
    to merge, so a string URL is handed back exactly as given rather than
    being rebuilt by a parse/unparse round trip.
    """
    if not query:
        # a parsed tuple still has to be turned into a string
        return update_url(url) if isinstance(url, tuple) else url
    return update_url(url, query_update=query)
def update_Request(req, url=None, data=None, headers={}, query={}): def update_Request(req, url=None, data=None, headers={}, query={}):
@ -5586,7 +5605,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
if proxy == '__noproxy__': if proxy == '__noproxy__':
return None # No Proxy return None # No Proxy
if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): if compat_urllib_parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
req.add_header('Ytdl-socks-proxy', proxy) req.add_header('Ytdl-socks-proxy', proxy)
# youtube-dl's http/https handlers do wrapping the socket with socks # youtube-dl's http/https handlers do wrapping the socket with socks
return None return None
@ -6024,14 +6043,6 @@ def traverse_obj(obj, *paths, **kwargs):
str = compat_str str = compat_str
is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes))
# stand-in until compat_re_Match is added
compat_re_Match = type(re.match('a', 'a'))
# stand-in until casefold.py is added
try:
''.casefold()
compat_casefold = lambda s: s.casefold()
except AttributeError:
compat_casefold = lambda s: s.lower()
casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k
if isinstance(expected_type, type): if isinstance(expected_type, type):