2024-12-29 20:12:04 +00:00
8 changed files with 122 additions and 92 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -773,20 +773,11 @@ class YoutubeDL(object):
    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True, force_generic_extractor=False):
-        """
+        '''
-        Return a list with a dictionary for each video extracted.
+        Returns a list with a dictionary for each video we find.
-
+        If 'download', also downloads the videos.
-        Arguments:
+        extra_info is a dict containing the extra values to add to each result
-        url -- URL to extract
+        '''
        Keyword arguments:
        download -- whether to download videos during extraction
        ie_key -- extractor key hint
        extra_info -- dictionary containing the extra values to add to each result
        process -- whether to resolve all unresolved references (URLs, playlist items),
            must be True for download to work.
        force_generic_extractor -- force using the generic extractor
        """
        if not ie_key and force_generic_extractor:
            ie_key = 'Generic'
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@ -0,0 +1,86 @@
 from __future__ import unicode_literals
 import json
 from .common import InfoExtractor
 from ..utils import (
    remove_start,
    int_or_none,
 )
 class BlinkxIE(InfoExtractor):
    _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
    IE_NAME = 'blinkx'
    _TEST = {
        'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
        'md5': '337cf7a344663ec79bf93a526a2e06c7',
        'info_dict': {
            'id': 'Da0Gw3xc',
            'ext': 'mp4',
            'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
            'uploader': 'IGN News',
            'upload_date': '20150217',
            'timestamp': 1424215740,
            'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
            'duration': 47.743333,
        },
    }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        display_id = video_id[:8]
        api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
                   + 'video=%s' % video_id)
        data_json = self._download_webpage(api_url, display_id)
        data = json.loads(data_json)['api']['results'][0]
        duration = None
        thumbnails = []
        formats = []
        for m in data['media']:
            if m['type'] == 'jpg':
                thumbnails.append({
                    'url': m['link'],
                    'width': int(m['w']),
                    'height': int(m['h']),
                })
            elif m['type'] == 'original':
                duration = float(m['d'])
            elif m['type'] == 'youtube':
                yt_id = m['link']
                self.to_screen('Youtube video detected: %s' % yt_id)
                return self.url_result(yt_id, 'Youtube', video_id=yt_id)
            elif m['type'] in ('flv', 'mp4'):
                vcodec = remove_start(m['vcodec'], 'ff')
                acodec = remove_start(m['acodec'], 'ff')
                vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
                abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
                tbr = vbr + abr if vbr and abr else None
                format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
                formats.append({
                    'format_id': format_id,
                    'url': m['link'],
                    'vcodec': vcodec,
                    'acodec': acodec,
                    'abr': abr,
                    'vbr': vbr,
                    'tbr': tbr,
                    'width': int_or_none(m.get('w')),
                    'height': int_or_none(m.get('h')),
                })
        self._sort_formats(formats)
        return {
            'id': display_id,
            'fullid': video_id,
            'title': data['title'],
            'formats': formats,
            'uploader': data['channel_name'],
            'timestamp': data['pubdate_epoch'],
            'description': data.get('description'),
            'thumbnails': thumbnails,
            'duration': duration,
        }
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@ -32,18 +32,6 @@ class DigitallySpeakingIE(InfoExtractor):
        # From http://www.gdcvault.com/play/1013700/Advanced-Material
        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
        'only_matching': True,
    }, {
        # From https://gdcvault.com/play/1016624, empty speakerVideo
        'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml',
        'info_dict': {
            'id': '201210-822101_1349794556671DDDD',
            'ext': 'flv',
            'title': 'Pre-launch - Preparing to Take the Plunge',
        },
    }, {
        # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo
        'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml',
        'only_matching': True,
    }]
    def _parse_mp4(self, metadata):
@ -96,20 +84,26 @@ class DigitallySpeakingIE(InfoExtractor):
                'vcodec': 'none',
                'format_id': audio.get('code'),
            })
-        for video_key, format_id, preference in (
+        slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
-                ('slide', 'slides', -2), ('speaker', 'speaker', -1)):
+        formats.append({
-            video_path = xpath_text(metadata, './%sVideo' % video_key)
+            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-            if not video_path:
+            'play_path': remove_end(slide_video_path, '.flv'),
-                continue
+            'ext': 'flv',
-            formats.append({
+            'format_note': 'slide deck video',
-                'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+            'quality': -2,
-                'play_path': remove_end(video_path, '.flv'),
+            'preference': -2,
-                'ext': 'flv',
+            'format_id': 'slides',
-                'format_note': '%s video' % video_key,
+        })
-                'quality': preference,
+        speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
-                'preference': preference,
+        formats.append({
-                'format_id': format_id,
+            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-            })
+            'play_path': remove_end(speaker_video_path, '.flv'),
            'ext': 'flv',
            'format_note': 'speaker video',
            'quality': -1,
            'preference': -1,
            'format_id': 'speaker',
        })
        return formats
    def _real_extract(self, url):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -132,6 +132,7 @@ from .bleacherreport import (
    BleacherReportIE,
    BleacherReportCMSIE,
 )
 from .blinkx import BlinkxIE
 from .bloomberg import BloombergIE
 from .bokecc import BokeCCIE
 from .bongacams import BongaCamsIE
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@ -16,7 +16,7 @@ from ..utils import (
 class FunimationIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
    _NETRC_MACHINE = 'funimation'
    _TOKEN = None
@ -51,10 +51,6 @@ class FunimationIE(InfoExtractor):
    }, {
        'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
        'only_matching': True,
    }, {
        # with lang code
        'url': 'https://www.funimation.com/en/shows/hacksign/role-play/',
        'only_matching': True,
    }]
    def _login(self):
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@ -6,7 +6,6 @@ from .common import InfoExtractor
 from .kaltura import KalturaIE
 from ..utils import (
    HEADRequest,
    remove_start,
    sanitized_Request,
    smuggle_url,
    urlencode_postdata,
@ -103,26 +102,6 @@ class GDCVaultIE(InfoExtractor):
                'format': 'mp4-408',
            },
        },
        {
            # Kaltura embed, whitespace between quote and embedded URL in iframe's src
            'url': 'https://www.gdcvault.com/play/1025699',
            'info_dict': {
                'id': '0_zagynv0a',
                'ext': 'mp4',
                'title': 'Tech Toolbox',
                'upload_date': '20190408',
                'uploader_id': 'joe@blazestreaming.com',
                'timestamp': 1554764629,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            # HTML5 video
            'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru',
            'only_matching': True,
        },
    ]
    def _login(self, webpage_url, display_id):
@ -196,18 +175,7 @@ class GDCVaultIE(InfoExtractor):
            xml_name = self._html_search_regex(
                r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>',
-                start_page, 'xml filename', default=None)
+                start_page, 'xml filename')
            if not xml_name:
                info = self._parse_html5_media_entries(url, start_page, video_id)[0]
                info.update({
                    'title': remove_start(self._search_regex(
                        r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page,
                        'title', default=None) or self._og_search_title(
                        start_page, default=None), 'GDC Vault - '),
                    'id': video_id,
                    'display_id': display_id,
                })
                return info
            embed_url = '%s/xml/%s' % (xml_root, xml_name)
            ie_key = 'DigitallySpeaking'
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor):
    def _extract_urls(webpage):
        # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
        finditer = (
-            list(re.finditer(
+            re.finditer(
                r"""(?xs)
                    kWidget\.(?:thumb)?[Ee]mbed\(
                    \{.*?
@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor):
                        (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
                        (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
                        (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
-                """, webpage))
+                """, webpage)
-            or list(re.finditer(
+            or re.finditer(
                r'''(?xs)
                    (?P<q1>["'])
                        (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
@ -142,16 +142,16 @@ class KalturaIE(InfoExtractor):
                        \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s*
                    )
                    (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
-                ''', webpage))
+                ''', webpage)
-            or list(re.finditer(
+            or re.finditer(
                r'''(?xs)
-                    <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])\s*
+                    <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])
                      (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
                      (?:(?!(?P=q1)).)*
                      [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
                      (?:(?!(?P=q1)).)*
                    (?P=q1)
-                ''', webpage))
+                ''', webpage)
        )
        urls = []
        for mobj in finditer:
--- a/youtube_dl/extractor/medaltv.py
+++ b/youtube_dl/extractor/medaltv.py
@ -15,7 +15,7 @@ from ..utils import (
 class MedalTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'https://medal.tv/clips/2mA60jWAGQCBH',
        'md5': '7b07b064331b1cf9e8e5c52a06ae68fa',
@ -42,12 +42,6 @@ class MedalTVIE(InfoExtractor):
            'upload_date': '20201117',
            'uploader_id': '5156321',
        }
    }, {
        'url': 'https://medal.tv/clips/37rMeFpryCC-9',
        'only_matching': True,
    }, {
        'url': 'https://medal.tv/clips/2WRj40tpY_EU9',
        'only_matching': True,
    }]
    def _real_extract(self, url):