[cda] Improve extraction (closes #28709, closes #28937)

[twitter] Improve formats extraction from vmap URL (closes #28909)
[xtube] Fix formats extraction (closes #28870)
2024-12-18 22:52:05 +00:00 · 2021-05-01 22:53:30 +07:00 · 2021-05-01 19:00:39 +07:00 · 2021-05-01 18:33:05 +07:00 · 2021-05-01 18:09:32 +07:00 · 2021-05-01 17:53:27 +07:00
5 changed files with 68 additions and 17 deletions
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -133,6 +133,8 @@ class CDAIE(InfoExtractor):
            'age_limit': 18 if need_confirm_age else 0,
        }

+        info = self._search_json_ld(webpage, video_id, default={})
+
        # Source: https://www.cda.pl/js/player.js?t=1606154898
        def decrypt_file(a):
            for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
@@ -197,7 +199,7 @@ class CDAIE(InfoExtractor):
                handler = self._download_webpage

            webpage = handler(
-                self._BASE_URL + href, video_id,
+                urljoin(self._BASE_URL, href), video_id,
                'Downloading %s version information' % resolution, fatal=False)
            if not webpage:
                # Manually report warning because empty page is returned when
@@ -209,6 +211,4 @@ class CDAIE(InfoExtractor):

        self._sort_formats(formats)

-        info = self._search_json_ld(webpage, video_id, default={})
-
        return merge_dicts(info_dict, info)
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE):
                        )
                        (?P<svt_id>[^/?#&]+)|
                        https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
-                        (?:.*?modalId=(?P<modal_id>[\da-zA-Z-]+))?
+                        (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))?
                    )
                    '''
    _TESTS = [{
@@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE):
    }, {
        'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
        'only_matching': True,
+    }, {
+        'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa',
+        'only_matching': True,
    }, {
        # geo restricted to Sweden
        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
@@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE):
        if not svt_id:
            svt_id = self._search_regex(
                (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
-                 r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id),
+                 r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id),
                 r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
                 r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)',
                 r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"',
--- a/youtube_dl/extractor/tv2dk.py
+++ b/youtube_dl/extractor/tv2dk.py
@@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor):
        webpage = self._download_webpage(url, video_id)

        entries = []
+
+        def add_entry(partner_id, kaltura_id):
+            entries.append(self.url_result(
+                'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
+                video_id=kaltura_id))
+
        for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage):
            video = extract_attributes(video_el)
            kaltura_id = video.get('data-entryid')
@@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor):
            partner_id = video.get('data-partnerid')
            if not partner_id:
                continue
-            entries.append(self.url_result(
-                'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
-                video_id=kaltura_id))
+            add_entry(partner_id, kaltura_id)
+        if not entries:
+            kaltura_id = self._search_regex(
+                r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id')
+            partner_id = self._search_regex(
+                (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage,
+                'partner id')
+            add_entry(partner_id, kaltura_id)
        return self.playlist_result(entries)


--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -19,6 +19,7 @@ from ..utils import (
    strip_or_none,
    unified_timestamp,
    update_url_query,
+    url_or_none,
    xpath_text,
 )

@@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor):
            return [f]

    def _extract_formats_from_vmap_url(self, vmap_url, video_id):
+        vmap_url = url_or_none(vmap_url)
+        if not vmap_url:
+            return []
        vmap_data = self._download_xml(vmap_url, video_id)
        formats = []
        urls = []
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -11,6 +11,7 @@ from ..utils import (
    parse_duration,
    sanitized_Request,
    str_to_int,
+    url_or_none,
 )


@@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor):
                'Cookie': 'age_verified=1; cookiesAccepted=1',
            })

-        title, thumbnail, duration = [None] * 3
+        title, thumbnail, duration, sources, media_definition = [None] * 5

        config = self._parse_json(self._search_regex(
-            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
+            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config',
            default='{}'), video_id, transform_source=js_to_json, fatal=False)
        if config:
            config = config.get('mainRoll')
@@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor):
                thumbnail = config.get('poster')
                duration = int_or_none(config.get('duration'))
                sources = config.get('sources') or config.get('format')
+                media_definition = config.get('mediaDefinition')

-        if not isinstance(sources, dict):
+        if not isinstance(sources, dict) and not media_definition:
            sources = self._parse_json(self._search_regex(
                r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
                webpage, 'sources', group='sources'), video_id,
                transform_source=js_to_json)

        formats = []
-        for format_id, format_url in sources.items():
-            formats.append({
-                'url': format_url,
-                'format_id': format_id,
-                'height': int_or_none(format_id),
-            })
+        format_urls = set()
+
+        if isinstance(sources, dict):
+            for format_id, format_url in sources.items():
+                format_url = url_or_none(format_url)
+                if not format_url:
+                    continue
+                if format_url in format_urls:
+                    continue
+                format_urls.add(format_url)
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': int_or_none(format_id),
+                })
+
+        if isinstance(media_definition, list):
+            for media in media_definition:
+                video_url = url_or_none(media.get('videoUrl'))
+                if not video_url:
+                    continue
+                if video_url in format_urls:
+                    continue
+                format_urls.add(video_url)
+                format_id = media.get('format')
+                if format_id == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif format_id == 'mp4':
+                    height = int_or_none(media.get('quality'))
+                    formats.append({
+                        'url': video_url,
+                        'format_id': '%s-%d' % (format_id, height) if height else format_id,
+                        'height': height,
+                    })
+
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)
Author	SHA1	Message	Date
Sergey M․	a0df8a0617	[cda] Improve extraction (closes #28709 , closes #28937 )	2021-05-01 22:53:30 +07:00
Sergey M․	d1b9a5e2ef	[twitter] Improve formats extraction from vmap URL (closes #28909 )	2021-05-01 19:00:39 +07:00
Sergey M․	ff04d43c46	[xtube] Fix formats extraction (closes #28870 )	2021-05-01 18:33:05 +07:00
Sergey M․	d2f72c40db	[svtplay] Improve extraction (closes #28507 , closes #28876 )	2021-05-01 18:09:32 +07:00
Sergey M․	e33dfb445c	[tv2dk] Fix extraction (closes #28888 )	2021-05-01 17:53:27 +07:00