[screencastomatic] fix extraction(closes #11976 , closes #24489 )

[palcomp3] Add new extractor(closes #13120 )
2024-12-18 14:41:59 +00:00 · 2021-04-01 19:05:00 +01:00 · 2021-04-01 17:10:38 +01:00
3 changed files with 184 additions and 17 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -879,6 +879,11 @@ from .packtpub import (
    PacktPubIE,
    PacktPubCourseIE,
 )
+from .palcomp3 import (
+    PalcoMP3IE,
+    PalcoMP3ArtistIE,
+    PalcoMP3VideoIE,
+)
 from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
--- a/youtube_dl/extractor/palcomp3.py
+++ b/youtube_dl/extractor/palcomp3.py
@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    try_get,
+)
+
+
+class PalcoMP3BaseIE(InfoExtractor):
+    _GQL_QUERY_TMPL = '''{
+  artist(slug: "%s") {
+    %s
+  }
+}'''
+    _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") {
+      %s
+    }'''
+    _MUSIC_FIELDS = '''duration
+      hls
+      mp3File
+      musicID
+      plays
+      title'''
+
+    def _call_api(self, artist_slug, artist_fields):
+        return self._download_json(
+            'https://www.palcomp3.com.br/graphql/', artist_slug, query={
+                'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields),
+            })['data']
+
+    def _parse_music(self, music):
+        music_id = compat_str(music['musicID'])
+        title = music['title']
+
+        formats = []
+        hls_url = music.get('hls')
+        if hls_url:
+            formats.append({
+                'url': hls_url,
+                'protocol': 'm3u8_native',
+                'ext': 'mp4',
+            })
+        mp3_file = music.get('mp3File')
+        if mp3_file:
+            formats.append({
+                'url': mp3_file,
+            })
+
+        return {
+            'id': music_id,
+            'title': title,
+            'formats': formats,
+            'duration': int_or_none(music.get('duration')),
+            'view_count': int_or_none(music.get('plays')),
+        }
+
+    def _real_initialize(self):
+        self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS
+
+    def _real_extract(self, url):
+        artist_slug, music_slug = re.match(self._VALID_URL, url).groups()
+        artist_fields = self._ARTIST_FIELDS_TMPL % music_slug
+        music = self._call_api(artist_slug, artist_fields)['artist']['music']
+        return self._parse_music(music)
+
+
+class PalcoMP3IE(PalcoMP3BaseIE):
+    IE_NAME = 'PalcoMP3:song'
+    _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/',
+        'md5': '99fd6405b2d8fd589670f6db1ba3b358',
+        'info_dict': {
+            'id': '3162927',
+            'ext': 'mp3',
+            'title': 'Nossas Composições - CUIDA BEM DELA',
+            'duration': 210,
+            'view_count': int,
+        }
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url)
+
+
+class PalcoMP3ArtistIE(PalcoMP3BaseIE):
+    IE_NAME = 'PalcoMP3:artist'
+    _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://www.palcomp3.com.br/condedoforro/',
+        'info_dict': {
+            'id': '358396',
+            'title': 'Conde do Forró',
+        },
+        'playlist_mincount': 188,
+    }]
+    _ARTIST_FIELDS_TMPL = '''artistID
+    musics {
+      nodes {
+        %s
+      }
+    }
+    name'''
+
+    @ classmethod
+    def suitable(cls, url):
+        return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        artist_slug = self._match_id(url)
+        artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
+
+        def entries():
+            for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []):
+                yield self._parse_music(music)
+
+        return self.playlist_result(
+            entries(), str_or_none(artist.get('artistID')), artist.get('name'))
+
+
+class PalcoMP3VideoIE(PalcoMP3BaseIE):
+    IE_NAME = 'PalcoMP3:video'
+    _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)/?#clipe'
+    _TESTS = [{
+        'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': '_pD1nR2qqPg',
+            'ext': 'mp4',
+            'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
+            'description': 'md5:7043342c09a224598e93546e98e49282',
+            'upload_date': '20161107',
+            'uploader_id': 'maiaramaraisaoficial',
+            'uploader': 'Maiara e Maraisa',
+        }
+    }]
+    _MUSIC_FIELDS = 'youtubeID'
+
+    def _parse_music(self, music):
+        youtube_id = music['youtubeID']
+        return self.url_result(youtube_id, 'Youtube', youtube_id)
--- a/youtube_dl/extractor/screencastomatic.py
+++ b/youtube_dl/extractor/screencastomatic.py
@ -2,12 +2,18 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    get_element_by_class,
+    int_or_none,
+    remove_start,
+    strip_or_none,
+    unified_strdate,
+)


 class ScreencastOMaticIE(InfoExtractor):
-    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
-    _TEST = {
+    _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
        'md5': '483583cb80d92588f15ccbedd90f0c18',
        'info_dict': {
@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor):
            'title': 'Welcome to 3-4 Philosophy @ DECV!',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
-            'duration': 369.163,
+            'duration': 369,
+            'upload_date': '20141216',
        }
-    }
+    }, {
+        'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl',
+        'only_matching': True,
+    }, {
+        'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1',
+        'only_matching': True,
+    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        jwplayer_data = self._parse_json(
-            self._search_regex(
-                r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'),
-            video_id, transform_source=js_to_json)
-
-        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
-        info_dict.update({
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
+        webpage = self._download_webpage(
+            'https://screencast-o-matic.com/player/' + video_id, video_id)
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        info.update({
+            'id': video_id,
+            'title': get_element_by_class('overlayTitle', webpage),
+            'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None,
+            'duration': int_or_none(self._search_regex(
+                r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};',
+                webpage, 'duration', default=None)),
+            'upload_date': unified_strdate(remove_start(
+                get_element_by_class('overlayPublished', webpage), 'Published: ')),
        })
-        return info_dict
+        return info
Author	SHA1	Message	Date
Remita Amine	04d4a3b136	[screencastomatic] fix extraction(closes #11976 , closes #24489 )	2021-04-01 19:05:00 +01:00
Allan Daemon	392c467f95	[palcomp3] Add new extractor(closes #13120 )	2021-04-01 17:10:38 +01:00