Compare commits

...

12 Commits

Author SHA1 Message Date
Jasmine Hou
96274a354b
Merge 5aee29b7ca into c5098961b0 2024-08-21 22:32:43 -04:00
dirkf
c5098961b0 [Youtube] Rework n function extraction pattern
Now also succeeds with player b12cc44b
2024-08-06 20:59:09 +01:00
dirkf
dbc08fba83 [jsinterp] Improve slice implementation for player b12cc44b
Partly taken from yt-dlp/yt-dlp#10664, thx seproDev
Fixes #32896
2024-08-06 20:51:38 +01:00
Aiur Adept
71223bff39
[Youtube] Fix nsig extraction for player 20dfca59 (#32891)
* dirkf's patch for nsig extraction
* add generic search per yt-dlp/yt-dlp/pull/10611 - thx bashonly

---------

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-08-01 19:18:34 +01:00
jsmnhou
5aee29b7ca Fixed styling to comply with flake8. 2023-04-10 21:11:29 -04:00
jsmnhou
df617513cc Fixed __dictify to support new array format. 2023-04-10 21:05:31 -04:00
jsmnhou
e1b7640587 Changes to possible manifest urls. 2023-04-10 20:31:22 -04:00
jsmnhou
6d1af4e2cf Changed _COMM_MAP to original format and added __dictify to be able to process dict-ified format. 2023-04-10 20:24:02 -04:00
jsmnhou
f5e01562b1 Condensed code when extracting formats. Edited comments. 2023-04-09 16:23:01 -04:00
jsmnhou
2fcb8b8c20 Added new tests and added expected warnings to existing tests. 2023-04-09 16:17:55 -04:00
jsmnhou
7def329f19 Updated _COMM_MAP to be a dictionary to speed up execution. Updated values from latest senate.gov page. 2023-04-09 16:13:13 -04:00
jsmnhou
d59b83f04b Updated _real_extract() to iterate through possible manifest URLs to find the m3u8. Added start_time and stop_time metadata. 2023-04-09 16:10:06 -04:00
5 changed files with 203 additions and 107 deletions

View File

@ -425,6 +425,34 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, [''], args=['', '-'])
self._test(jsi, [], args=['', ''])
def test_slice(self):
    """Check JS .slice() on arrays and strings for 0, 1 and 2 arguments."""
    # Each row: (argument list as written in the JS call,
    #            expected result on the array, expected result on the string).
    # Covers omitted, positive, negative and out-of-range bounds.
    cases = (
        ('', [0, 1, 2, 3, 4, 5, 6, 7, 8], '012345678'),
        ('0', [0, 1, 2, 3, 4, 5, 6, 7, 8], '012345678'),
        ('5', [5, 6, 7, 8], '5678'),
        ('99', [], ''),
        ('-2', [7, 8], '78'),
        ('-99', [0, 1, 2, 3, 4, 5, 6, 7, 8], '012345678'),
        ('0, 0', [], ''),
        ('1, 0', [], ''),
        ('0, 1', [0], '0'),
        ('3, 6', [3, 4, 5], '345'),
        ('1, -1', [1, 2, 3, 4, 5, 6, 7], '1234567'),
        ('-1, 1', [], ''),
        ('-3, -1', [6, 7], '67'),
    )
    # all array cases run first, then all string cases
    for js_args, expected_list, _ in cases:
        self._test(
            'function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(%s)}' % js_args,
            expected_list)
    for js_args, _, expected_str in cases:
        self._test(
            'function f(){return "012345678".slice(%s)}' % js_args,
            expected_str)
if __name__ == '__main__':
unittest.main()

View File

@ -174,6 +174,14 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/5604538d/player_ias.vflset/en_US/base.js',
'7X-he4jjvMx7BCX', 'sViSydX8IHtdWA',
),
(
'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js',
'-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw',
),
(
'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js',
'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw',
),
]

View File

@ -5,48 +5,52 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_duration,
unsmuggle_url,
url_or_none,
)
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
from ..compat import compat_parse_qs
class SenateISVPIE(InfoExtractor):
# [committee, stream_number, stream_domain, stream_id, msl3]
_COMM_MAP = [
['ag', '76440', 'http://ag-f.akamaihd.net'],
['aging', '76442', 'http://aging-f.akamaihd.net'],
['approps', '76441', 'http://approps-f.akamaihd.net'],
['armed', '76445', 'http://armed-f.akamaihd.net'],
['banking', '76446', 'http://banking-f.akamaihd.net'],
['budget', '76447', 'http://budget-f.akamaihd.net'],
['cecc', '76486', 'http://srs-f.akamaihd.net'],
['commerce', '80177', 'http://commerce1-f.akamaihd.net'],
['csce', '75229', 'http://srs-f.akamaihd.net'],
['dpc', '76590', 'http://dpc-f.akamaihd.net'],
['energy', '76448', 'http://energy-f.akamaihd.net'],
['epw', '76478', 'http://epw-f.akamaihd.net'],
['ethics', '76449', 'http://ethics-f.akamaihd.net'],
['finance', '76450', 'http://finance-f.akamaihd.net'],
['foreign', '76451', 'http://foreign-f.akamaihd.net'],
['govtaff', '76453', 'http://govtaff-f.akamaihd.net'],
['help', '76452', 'http://help-f.akamaihd.net'],
['indian', '76455', 'http://indian-f.akamaihd.net'],
['intel', '76456', 'http://intel-f.akamaihd.net'],
['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'],
['jccic', '85180', 'http://jccic-f.akamaihd.net'],
['jec', '76458', 'http://jec-f.akamaihd.net'],
['judiciary', '76459', 'http://judiciary-f.akamaihd.net'],
['rpc', '76591', 'http://rpc-f.akamaihd.net'],
['rules', '76460', 'http://rules-f.akamaihd.net'],
['saa', '76489', 'http://srs-f.akamaihd.net'],
['smbiz', '76461', 'http://smbiz-f.akamaihd.net'],
['srs', '75229', 'http://srs-f.akamaihd.net'],
['uscc', '76487', 'http://srs-f.akamaihd.net'],
['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
['arch', '', 'http://ussenate-f.akamaihd.net/']
['ag', '76440', 'http://ag-f.akamaihd.net', '2036803', 'agriculture'],
['aging', '76442', 'http://aging-f.akamaihd.net', '2036801', 'aging'],
['approps', '76441', 'http://approps-f.akamaihd.net', '2036802', 'appropriations'],
['armed', '76445', 'http://armed-f.akamaihd.net', '2036800', 'armedservices'],
['banking', '76446', 'http://banking-f.akamaihd.net', '2036799', 'banking'],
['budget', '76447', 'http://budget-f.akamaihd.net', '2036798', 'budget'],
['cecc', '76486', 'http://srs-f.akamaihd.net', '2036782', 'srs_cecc'],
['commerce', '80177', 'http://commerce1-f.akamaihd.net', '2036779', 'commerce'],
['csce', '75229', 'http://srs-f.akamaihd.net', '2036777', 'srs_srs'],
['dpc', '76590', 'http://dpc-f.akamaihd.net', None, 'dpc'],
['energy', '76448', 'http://energy-f.akamaihd.net', '2036797', 'energy'],
['epw', '76478', 'http://epw-f.akamaihd.net', '2036783', 'environment'],
['ethics', '76449', 'http://ethics-f.akamaihd.net', '2036796', 'ethics'],
['finance', '76450', 'http://finance-f.akamaihd.net', '2036795', 'finance_finance'],
['foreign', '76451', 'http://foreign-f.akamaihd.net', '2036794', 'foreignrelations'],
['govtaff', '76453', 'http://govtaff-f.akamaihd.net', '2036792', 'hsgac'],
['help', '76452', 'http://help-f.akamaihd.net', '2036793', 'help'],
['indian', '76455', 'http://indian-f.akamaihd.net', '2036791', 'indianaffairs'],
['intel', '76456', 'http://intel-f.akamaihd.net', '2036790', 'intelligence'],
['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net', None, 'internationalnarcoticscaucus'],
['jccic', '85180', 'http://jccic-f.akamaihd.net', '2036778', 'jccic'],
['jec', '76458', 'http://jec-f.akamaihd.net', '2036789', 'jointeconomic'],
['judiciary', '76459', 'http://judiciary-f.akamaihd.net', '2036788', 'judiciary'],
['rpc', '76591', 'http://rpc-f.akamaihd.net', None, 'rpc'],
['rules', '76460', 'http://rules-f.akamaihd.net', '2036787', 'rules'],
['saa', '76489', 'http://srs-f.akamaihd.net', '2036780', 'srs_saa'],
['smbiz', '76461', 'http://smbiz-f.akamaihd.net', '2036786', 'smallbusiness'],
['srs', '75229', 'http://srs-f.akamaihd.net', '2031966', 'srs_srs'],
['uscc', '76487', 'http://srs-f.akamaihd.net', '2036781', 'srs_uscc'],
['vetaff', '76462', 'http://vetaff-f.akamaihd.net', '2036785', 'veteransaffairs'],
['arch', '', 'http://ussenate-f.akamaihd.net/', None, None],
['uscp', None, '', '2043685', 'uscp'],
['cio', None, '', '2043686', 'cio']
]
_IE_NAME = 'senate.gov'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
@ -54,100 +58,120 @@ class SenateISVPIE(InfoExtractor):
'info_dict': {
'id': 'judiciary031715',
'ext': 'mp4',
'title': 'Integrated Senate Video Player',
'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
'info_dict': {
'id': 'commerce011514',
'ext': 'mp4',
'title': 'Integrated Senate Video Player'
},
'params': {
# m3u8 download
'skip_download': True,
'title': 'judiciary031715',
'thumbnail': 'http://www.judiciary.senate.gov/themes/judiciary/images/video-poster-flash-fit.png',
},
'expected_warnings': ['Failed to download m3u8 information: HTTP Error 404: Not Found'],
}, {
'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
# checksum differs each time
'info_dict': {
'id': 'intel090613',
'ext': 'mp4',
'title': 'Integrated Senate Video Player'
}
'title': 'intel090613',
},
'expected_warnings': ['Failed to download m3u8 information: HTTP Error 404: Not Found'],
}, {
# From http://www.c-span.org/video/?96791-1
'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
'only_matching': True,
'url': 'https://www.senate.gov/isvp/?comm=govtaff&type=archv&stt=975&filename=govtaff111722&auto_play=false&poster=https%3A%2F%2Fwww%2Ehsgac%2Esenate%2Egov%2Fimages%2Fvideo%2Dposter%2Dflash%2Dfit%2Epng',
'info_dict': {
'id': 'govtaff111722',
'ext': 'mp4',
'title': 'govtaff111722',
'thumbnail': 'https://www.hsgac.senate.gov/images/video-poster-flash-fit.png',
},
}, {
'url': 'https://www.senate.gov/isvp/?type=arch&comm=energy&filename=energy111722&stt=00:22:30&auto_play=false&wmode=transparent&poster=https%3A%2F%2Fwww%2Eenergy%2Esenate%2Egov%2Fthemes%2Fenergy%2Fimages%2Fvideo%2Dposter%2Dflash%2Dfit%2Epng',
'info_dict': {
'id': 'energy111722',
'ext': 'mp4',
'title': 'energy111722',
'thumbnail': 'https://www.energy.senate.gov/themes/energy/images/video-poster-flash-fit.png',
},
}, {
'url': 'https://www.senate.gov/isvp/?comm=foreign&type=archv&stt=0&filename=foreign080322&auto_play=false&wmode=transparent&poster=https%3A%2F%2Fwww%2Eforeign%2Esenate%2Egov%2Fthemes%2Fforeign%2Fimages%2Fvideo%2Dposter%2Dflash%2Dfit%2Epng',
'info_dict': {
'id': 'foreign080322',
'ext': 'mp4',
'title': 'foreign080322',
'thumbnail': 'https://www.foreign.senate.gov/themes/foreign/images/video-poster-flash-fit.png',
},
}]
@staticmethod
# returns url from an iframe
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
r'''<iframe\b[^>]+\bsrc\s*=\s*(['"])(?P<url>https?://www\.senate\.gov/isvp/?\?(?:(?!\1)\S)+)''',
webpage)
if mobj:
return mobj.group('url')
@staticmethod
def __dictify(cm):
return dict((row[0], (row[1:] + ['', ''])[:4]) for row in cm)
def _get_info_for_comm(self, committee):
    """Return [stream_number, stream_domain, stream_id, msl3] for committee.

    The row is looked up in _COMM_MAP through __dictify(), so the result
    always has exactly four items (missing columns are '').

    Raises ExtractorError (expected) when committee has no _COMM_MAP row.
    """
    comm_map = self.__dictify(self._COMM_MAP)
    if committee not in comm_map:
        # committee comes from the URL's query string, so report an unknown
        # value as a user-facing error rather than a KeyError traceback
        raise ExtractorError('Unknown committee: %s' % committee, expected=True)
    return comm_map[committee]
# Extract formats and metadata for a senate.gov ISVP player URL.
#
# Reads the query string captured by _VALID_URL (comm, filename, type,
# poster, stt, dur), looks up the committee's stream data, tries candidate
# manifest URLs and returns an info dict with id, title, formats, thumbnail,
# start_time and stop_time.  Raises ExtractorError (expected=True) when a
# required query parameter is missing.
#
# NOTE(review): this listing appears to show superseded and replacement
# statements side by side with indentation stripped (video_id, title,
# committee and thumbnail are each assigned twice, and the result of
# _get_info_for_comm() is unpacked into both 2 and 4 names) - confirm which
# statements are current before relying on the flow below.
def _real_extract(self, url):
# smuggled data may contain a forced title that should be used
url, smuggled_data = unsmuggle_url(url, {})
qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
# combined guard: rejects the URL if any of filename, type or comm is absent
if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
raise ExtractorError('Invalid URL', expected=True)
# NOTE(review): the '.' in r'.mp4$' is unescaped and matches any character
video_id = re.sub(r'.mp4$', '', qs['filename'][0])
# separate guards that report which required query parameter is missing
if not qs.get('filename'):
raise ExtractorError('Invalid URL. Missing filename in query parameters', expected=True)
if not qs.get('comm'):
raise ExtractorError('Invalid URL. Missing committee in query parameters', expected=True)
webpage = self._download_webpage(url, video_id)
# query-string values are lists; [0] takes the first occurrence
committee = qs.get('comm')[0]
filename = qs.get('filename')[0]
# the video id is the filename without a trailing '.mp4'
video_id = re.sub(r'\.mp4$', '', filename)
# a smuggled force_title wins; otherwise the page <title> is searched
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
poster = qs.get('poster')
thumbnail = poster[0] if poster else None
# there is no point in pulling the title from the webpage since it always defaults to 'Integrated Senate Video Player'
title = smuggled_data.get('force_title') or filename
video_type = qs['type'][0]
# the 'arch' type is looked up under its own _COMM_MAP key instead of the committee's
committee = video_type if video_type == 'arch' else qs['comm'][0]
# NOTE(review): two unpackings of the same lookup, into 2 and into 4 names -
# only one of them can match the length of the returned row
stream_num, domain = self._get_info_for_comm(committee)
stream_number, stream_domain, stream_id, msl3 = self._get_info_for_comm(committee)
# NOTE(review): _COMM_MAP holds None or '' in these columns for some rows
# (e.g. 'dpc', 'arch', 'uscp'); int() raises TypeError/ValueError on those -
# confirm such committees cannot reach this point
stream_number = int(stream_number)
stream_id = int(stream_id)
# the possible locations for the video: only the first has been seen in use
possible_manifest_urls = [
'https://www-senate-gov-media-srs.akamaized.net/hls/live/%d/%s/%s/master.m3u8' % (stream_id, committee, filename),
'https://www-senate-gov-msl3archive.akamaized.net/%s/%s_1/master.m3u8' % (msl3, filename),
'%s/i/%s_1@%d/master.m3u8' % (stream_domain, filename, stream_number),
'https://ussenate-f.akamaihd.net/i/%s.mp4/master.m3u8' % video_id,
]
# we iterate through the possible locations until we find formats
formats = []
# 'arch' gets a single direct file URL; other types use f4m and m3u8 manifests
if video_type == 'arch':
filename = video_id if '.' in video_id else video_id + '.mp4'
formats = [{
# All parameters in the query string are necessary to prevent a 403 error
'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
}]
else:
hdcore_sign = 'hdcore=3.1.0'
url_params = (domain, video_id, stream_num)
f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
# URLs without the extra param induce a 404 error
entry.update({'extra_param_to_segment_url': hdcore_sign})
formats.append(entry)
for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
# NOTE(review): the loop variable rebinds the `url` parameter
for url in possible_manifest_urls:
# fatal=False: presumably a failing candidate yields no formats instead
# of raising - confirm against _extract_m3u8_formats
entries = self._extract_m3u8_formats(
url, video_id, ext='mp4', m3u8_id='hls',
entry_protocol='m3u8_native', fatal=False)
for entry in entries:
# a '-p' or '-b' found just before '.m3u8' in the URL is appended to the format id
mobj = re.search(r'(?P<tag>-[pb]).m3u8', entry['url'])
if mobj:
entry['format_id'] += mobj.group('tag')
formats.append(entry)
self._sort_formats(formats)
# stop at the first candidate that produced formats
if formats:
break
self._sort_formats(formats)
# [-1] picks the last occurrence of a repeated query parameter; the
# [None] default stands in for a missing one
thumbnail = url_or_none(qs.get('poster', [None])[-1])
start_time = parse_duration(qs.get('stt', [None])[-1])
stop_time = parse_duration(qs.get('dur', [None])[-1])
# dur is added to the start offset (taken as 0 when start_time is falsy)
if stop_time is not None:
stop_time += start_time or 0
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'start_time': start_time,
'stop_time': stop_time,
}

View File

@ -1659,17 +1659,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_name(self, jscode):
func_name, idx = self._search_regex(
# new: (b=String.fromCharCode(110),c=a.get(b))&&c=nfunc[idx](c)
# or: (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c)s
# old: .get("n"))&&(b=nfunc[idx](b)
# older: .get("n"))&&(b=nfunc(b)
# or: (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c)
# or: (PL(a),b=a.j.n||null)&&(b=nfunc[idx](b)
# or: (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("")
# old: (b=a.get("n"))&&(b=nfunc[idx](b)
# older: (b=a.get("n"))&&(b=nfunc(b)
r'''(?x)
(?:\(\s*(?P<b>[a-z])\s*=\s*(?:
String\s*\.\s*fromCharCode\s*\(\s*110\s*\)|
"n+"\[\s*\+?s*[\w$.]+\s*]
)\s*,(?P<c>[a-z])\s*=\s*[a-z]\s*)?
\.\s*get\s*\(\s*(?(b)(?P=b)|"n{1,2}")(?:\s*\)){2}\s*&&\s*\(\s*(?(c)(?P=c)|b)\s*=\s*
(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\s*\[(?P<idx>\d+)\])?\s*\(\s*[\w$]+\s*\)
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
\((?:[\w$()\s]+,)*?\s* # (
(?P<b>[a-z])\s*=\s* # b=
(?:
(?: # expect ,c=a.get(b) (etc)
String\s*\.\s*fromCharCode\s*\(\s*110\s*\)|
"n+"\[\s*\+?s*[\w$.]+\s*]
)\s*(?:,[\w$()\s]+(?=,))*|
(?P<old>[\w$]+) # a (old[er])
)\s*
(?(old)
# b.get("n")
(?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*?
(?:\.\s*n|\[\s*"n"\s*]|\.\s*get\s*\(\s*"n"\s*\))
| # ,c=a.get(b)
,\s*(?P<c>[a-z])\s*=\s*[a-z]\s*
(?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*?
(?:\[\s*(?P=b)\s*]|\.\s*get\s*\(\s*(?P=b)\s*\))
)
# interstitial junk
\s*(?:\|\|\s*null\s*)?(?:\)\s*)?&&\s*(?:\(\s*)?
(?(c)(?P=c)|(?P=b))\s*=\s* # [c|b]=
# nfunc|nfunc[idx]
(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\s*\[(?P<idx>\d+)\])?\s*\(\s*[\w$]+\s*\)
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
default=(None, None))
# thx bashonly: yt-dlp/yt-dlp/pull/10611
if not func_name:
self.report_warning('Falling back to generic n function search')
return self._search_regex(
r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
(?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
\s*\{(?:(?!};).)+?["']enhanced_except_
''', jscode, 'Initial JS player n function name', group='name')
if not idx:
return func_name

View File

@ -925,9 +925,16 @@ class JSInterpreter(object):
obj.reverse()
return obj
elif member == 'slice':
assertion(isinstance(obj, list), 'must be applied on a list')
assertion(len(argvals) == 1, 'takes exactly one argument')
return obj[argvals[0]:]
assertion(isinstance(obj, (list, compat_str)), 'must be applied on a list or string')
# From [1]:
# .slice() - like [:]
# .slice(n) - like [n:] (not [slice(n)])
# .slice(m, n) - like [m:n] or [slice(m, n)]
# [1] https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/slice
assertion(len(argvals) <= 2, 'takes between 0 and 2 arguments')
if len(argvals) < 2:
argvals += (None,)
return obj[slice(*argvals)]
elif member == 'splice':
assertion(isinstance(obj, list), 'must be applied on a list')
assertion(argvals, 'takes one or more arguments')