[jamendo] fix track extraction(closes #28686 )

[curiositystream] fix format extraction(closes #26845 , closes #28668 )
2024-12-18 14:41:59 +00:00 · 2021-04-07 10:41:06 +01:00 · 2021-04-07 09:27:05 +01:00
2 changed files with 99 additions and 78 deletions
--- a/youtube_dl/extractor/curiositystream.py
+++ b/youtube_dl/extractor/curiositystream.py
@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor):
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error), expected=True)

-    def _call_api(self, path, video_id):
+    def _call_api(self, path, video_id, query=None):
        headers = {}
        if self._auth_token:
            headers['X-Auth-Token'] = self._auth_token
        result = self._download_json(
-            self._API_BASE_URL + path, video_id, headers=headers)
+            self._API_BASE_URL + path, video_id, headers=headers, query=query)
        self._handle_errors(result)
        return result['data']

@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
    _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
    _TEST = {
        'url': 'https://app.curiositystream.com/video/2',
-        'md5': '262bb2f257ff301115f1973540de8983',
        'info_dict': {
            'id': '2',
            'ext': 'mp4',
            'title': 'How Did You Develop The Internet?',
            'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
-        }
+        },
+        'params': {
+            'format': 'bestvideo',
+            # m3u8 download
+            'skip_download': True,
+        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
-        media = self._call_api('media/' + video_id, video_id)
-        title = media['title']

        formats = []
-        for encoding in media.get('encodings', []):
-            m3u8_url = encoding.get('master_playlist_url')
-            if m3u8_url:
-                formats.extend(self._extract_m3u8_formats(
-                    m3u8_url, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            encoding_url = encoding.get('url')
-            file_url = encoding.get('file_url')
-            if not encoding_url and not file_url:
-                continue
-            f = {
-                'width': int_or_none(encoding.get('width')),
-                'height': int_or_none(encoding.get('height')),
-                'vbr': int_or_none(encoding.get('video_bitrate')),
-                'abr': int_or_none(encoding.get('audio_bitrate')),
-                'filesize': int_or_none(encoding.get('size_in_bytes')),
-                'vcodec': encoding.get('video_codec'),
-                'acodec': encoding.get('audio_codec'),
-                'container': encoding.get('container_type'),
-            }
-            for f_url in (encoding_url, file_url):
-                if not f_url:
+        for encoding_format in ('m3u8', 'mpd'):
+            media = self._call_api('media/' + video_id, video_id, query={
+                'encodingsNew': 'true',
+                'encodingsFormat': encoding_format,
+            })
+            for encoding in media.get('encodings', []):
+                playlist_url = encoding.get('master_playlist_url')
+                if encoding_format == 'm3u8':
+                    # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
+                    formats.extend(self._extract_m3u8_formats(
+                        playlist_url, video_id, 'mp4',
+                        m3u8_id='hls', fatal=False))
+                elif encoding_format == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        playlist_url, video_id, mpd_id='dash', fatal=False))
+                encoding_url = encoding.get('url')
+                file_url = encoding.get('file_url')
+                if not encoding_url and not file_url:
                    continue
-                fmt = f.copy()
-                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
-                if rtmp:
-                    fmt.update({
-                        'url': rtmp.group('url'),
-                        'play_path': rtmp.group('playpath'),
-                        'app': rtmp.group('app'),
-                        'ext': 'flv',
-                        'format_id': 'rtmp',
-                    })
-                else:
-                    fmt.update({
-                        'url': f_url,
-                        'format_id': 'http',
-                    })
-                formats.append(fmt)
+                f = {
+                    'width': int_or_none(encoding.get('width')),
+                    'height': int_or_none(encoding.get('height')),
+                    'vbr': int_or_none(encoding.get('video_bitrate')),
+                    'abr': int_or_none(encoding.get('audio_bitrate')),
+                    'filesize': int_or_none(encoding.get('size_in_bytes')),
+                    'vcodec': encoding.get('video_codec'),
+                    'acodec': encoding.get('audio_codec'),
+                    'container': encoding.get('container_type'),
+                }
+                for f_url in (encoding_url, file_url):
+                    if not f_url:
+                        continue
+                    fmt = f.copy()
+                    rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+                    if rtmp:
+                        fmt.update({
+                            'url': rtmp.group('url'),
+                            'play_path': rtmp.group('playpath'),
+                            'app': rtmp.group('app'),
+                            'ext': 'flv',
+                            'format_id': 'rtmp',
+                        })
+                    else:
+                        fmt.update({
+                            'url': f_url,
+                            'format_id': 'http',
+                        })
+                    formats.append(fmt)
        self._sort_formats(formats)

+        title = media['title']
+
        subtitles = {}
        for closed_caption in media.get('closed_captions', []):
            sub_url = closed_caption.get('file')
@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
            'title': 'Curious Minds: The Internet',
            'description': 'How is the internet shaping our lives in the 21st Century?',
        },
-        'playlist_mincount': 17,
+        'playlist_mincount': 16,
    }, {
        'url': 'https://curiositystream.com/series/2',
        'only_matching': True,
--- a/youtube_dl/extractor/jamendo.py
+++ b/youtube_dl/extractor/jamendo.py
@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor):
            'id': '196219',
            'display_id': 'stories-from-emona-i',
            'ext': 'flac',
-            'title': 'Maya Filipič - Stories from Emona I',
-            'artist': 'Maya Filipič',
+            # 'title': 'Maya Filipič - Stories from Emona I',
+            'title': 'Stories from Emona I',
+            # 'artist': 'Maya Filipič',
            'track': 'Stories from Emona I',
            'duration': 210,
            'thumbnail': r're:^https?://.*\.jpg',
            'timestamp': 1217438117,
            'upload_date': '20080730',
+            'license': 'by-nc-nd',
+            'view_count': int,
+            'like_count': int,
+            'average_rating': int,
+            'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'],
        }
    }, {
        'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
        'only_matching': True,
    }]

+    def _call_api(self, resource, resource_id):
+        path = '/api/%ss' % resource
+        rand = compat_str(random.random())
+        return self._download_json(
+            'https://www.jamendo.com' + path, resource_id, query={
+                'id[]': resource_id,
+            }, headers={
+                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+            })[0]
+
    def _real_extract(self, url):
        track_id, display_id = self._VALID_URL_RE.match(url).groups()
-        webpage = self._download_webpage(
-            'https://www.jamendo.com/track/' + track_id, track_id)
-        models = self._parse_json(self._html_search_regex(
-            r"data-bundled-models='([^']+)",
-            webpage, 'bundled models'), track_id)
-        track = models['track']['models'][0]
+        # webpage = self._download_webpage(
+        #     'https://www.jamendo.com/track/' + track_id, track_id)
+        # models = self._parse_json(self._html_search_regex(
+        #     r"data-bundled-models='([^']+)",
+        #     webpage, 'bundled models'), track_id)
+        # track = models['track']['models'][0]
+        track = self._call_api('track', track_id)
        title = track_name = track['name']
-        get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
-        artist = get_model('artist')
-        artist_name = artist.get('name')
-        if artist_name:
-            title = '%s - %s' % (artist_name, title)
-        album = get_model('album')
+        # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+        # artist = get_model('artist')
+        # artist_name = artist.get('name')
+        # if artist_name:
+        #     title = '%s - %s' % (artist_name, title)
+        # album = get_model('album')

        formats = [{
            'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor):

        urls = []
        thumbnails = []
-        for _, covers in track.get('cover', {}).items():
+        for covers in (track.get('cover') or {}).values():
            for cover_id, cover_url in covers.items():
                if not cover_url or cover_url in urls:
                    continue
@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor):
                })

        tags = []
-        for tag in track.get('tags', []):
+        for tag in (track.get('tags') or []):
            tag_name = tag.get('name')
            if not tag_name:
                continue
            tags.append(tag_name)

        stats = track.get('stats') or {}
+        license = track.get('licenseCC') or []

        return {
            'id': track_id,
@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor):
            'title': title,
            'description': track.get('description'),
            'duration': int_or_none(track.get('duration')),
-            'artist': artist_name,
+            # 'artist': artist_name,
            'track': track_name,
-            'album': album.get('name'),
+            # 'album': album.get('name'),
            'formats': formats,
-            'license': '-'.join(track.get('licenseCC', [])) or None,
+            'license': '-'.join(license) if license else None,
            'timestamp': int_or_none(track.get('dateCreated')),
            'view_count': int_or_none(stats.get('listenedAll')),
            'like_count': int_or_none(stats.get('favorited')),
@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor):
        }


-class JamendoAlbumIE(InfoExtractor):
+class JamendoAlbumIE(JamendoIE):
    _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
-    _TEST = {
+    _TESTS = [{
        'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
        'info_dict': {
            'id': '121486',
@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor):
        'params': {
            'playlistend': 2
        }
-    }
-
-    def _call_api(self, resource, resource_id):
-        path = '/api/%ss' % resource
-        rand = compat_str(random.random())
-        return self._download_json(
-            'https://www.jamendo.com' + path, resource_id, query={
-                'id[]': resource_id,
-            }, headers={
-                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
-            })[0]
+    }]

    def _real_extract(self, url):
        album_id = self._match_id(url)
@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor):
        album_name = album.get('name')

        entries = []
-        for track in album.get('tracks', []):
+        for track in (album.get('tracks') or []):
            track_id = track.get('id')
            if not track_id:
                continue
Author	SHA1	Message	Date
Remita Amine	281b8e3443	[jamendo] fix track extraction(closes #28686 )	2021-04-07 10:41:06 +01:00
Remita Amine	c0c5134c57	[curiositystream] fix format extraction(closes #26845 , closes #28668 )	2021-04-07 09:27:05 +01:00