Compare commits

..

3 Commits

Author SHA1 Message Date
dirkf
195f22f679
[generic] Improve KVS (etc) extraction 2022-11-13 15:09:29 +00:00
dirkf
fc2beab0e7
[generic] Improve KVS (etc) extraction
* detect kt_player('kt_player', 'https://.../kt_player.swf?v=5...
* detect age limit if 18 USC 2257 is mentioned
* test with shooshtime.com

Partially resolves #31332.
2022-11-13 14:59:30 +00:00
FraFraFra-LongD
1a4fbe8462
Added ThisVid.com support (#29187)
* add ThisVidIE, ThisVidMemberIE, ThisVidPlaylistIE
* redirect embed to main page for more metadata
* use KVS extraction newly added to GenericIE and remove duplicate tests
* also add MrDeepFake etc compat to GenericIE
(closes #22390)

Co-authored-by: dirkf <fieldhouse@gmx.net>
2022-11-13 13:22:04 +00:00
3 changed files with 271 additions and 35 deletions

View File

@ -1265,6 +1265,11 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
ThisVidIE,
ThisVidMemberIE,
ThisVidPlaylistIE,
)
from .threeqsdn import ThreeQSDNIE from .threeqsdn import ThreeQSDNIE
from .tiktok import ( from .tiktok import (
TikTokIE, TikTokIE,

View File

@ -36,6 +36,7 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
url_or_none, url_or_none,
urljoin,
xpath_attr, xpath_attr,
xpath_text, xpath_text,
xpath_with_ns, xpath_with_ns,
@ -2252,31 +2253,7 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
# KVS Player # KVS Player (tested also in thisvid.py)
'url': 'https://thisvid.com/videos/fruit-is-healthy/',
'md5': 'f83e52f409b9139a7efee58ef926a72e',
'info_dict': {
'id': '7079579',
'display_id': 'fruit-is-healthy',
'ext': 'mp4',
'title': 'Fruit is healthy - ThisVid.com',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg',
}
}, {
# KVS Player
'url': 'https://thisvid.com/embed/7079579/',
'info_dict': {
'id': '7079579',
'display_id': 'fruit-is-healthy',
'ext': 'mp4',
'title': 'Fruit is healthy - ThisVid.com',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/7079000/7079579/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
# KVS Player
'url': 'https://youix.com/video/leningrad-zoj/', 'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': { 'info_dict': {
@ -2306,6 +2283,7 @@ class GenericIE(InfoExtractor):
'display_id': '40-nochey-2016', 'display_id': '40-nochey-2016',
'ext': 'mp4', 'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org', 'title': '40 ночей (2016) - BogMedia.org',
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
}, },
}, { }, {
@ -2319,6 +2297,29 @@ class GenericIE(InfoExtractor):
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
}, },
}, {
'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
'info_dict': {
'id': '5',
'display_id': 'selena-gomez-pov-deep-fakes',
'ext': 'mp4',
'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes',
'description': 'md5:17d1f84b578c9c26875ac5ef9a932354',
'height': 720,
'age_limit': 18,
},
}, {
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
'id': '284002',
'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime',
'height': 720,
'age_limit': 18,
},
}, },
] ]
@ -2488,9 +2489,10 @@ class GenericIE(InfoExtractor):
format_id = flashvars.get(key + '_text', key) format_id = flashvars.get(key + '_text', key)
formats.append(merge_dicts( formats.append(merge_dicts(
parse_resolution(format_id) or parse_resolution(flashvars[key]), { parse_resolution(format_id) or parse_resolution(flashvars[key]), {
'url': getrealurl(flashvars[key], flashvars['license_code']), 'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])),
'format_id': format_id, 'format_id': format_id,
'ext': 'mp4', 'ext': 'mp4',
'http_headers': {'Referer': url},
})) }))
if not formats[-1].get('height'): if not formats[-1].get('height'):
formats[-1]['quality'] = 1 formats[-1]['quality'] = 1
@ -2713,9 +2715,16 @@ class GenericIE(InfoExtractor):
# but actually don't. # but actually don't.
AGE_LIMIT_MARKERS = [ AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
r'>[^<]*you acknowledge you are at least (\d+) years old',
r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
] ]
if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): for marker in AGE_LIMIT_MARKERS:
age_limit = 18 m = re.search(marker, webpage)
if not m:
continue
age_limit = max(
age_limit or 0,
int_or_none(m.groups() and m.group(1), default=18))
# video uploader is domain name # video uploader is domain name
video_uploader = self._search_regex( video_uploader = self._search_regex(
@ -3563,14 +3572,18 @@ class GenericIE(InfoExtractor):
return info_dict return info_dict
# Look for generic KVS player (before ld+json for tests) # Look for generic KVS player (before ld+json for tests)
found = re.search( found = self._search_regex(
r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)\1[^>]*>', (r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
webpage) # kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ...
r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
), webpage, 'KVS player', group='ver', default=False)
if found: if found:
self.report_extraction('KVS Player') self.report_extraction('%s: KVS Player' % (video_id, ))
if found.group('maj_ver') not in ('4', '5', '6'): if found.split('.')[0] not in ('4', '5', '6'):
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found.group('ver'), )) self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
return self._extract_kvs(url, webpage, video_id) return merge_dicts(
self._extract_kvs(url, webpage, video_id),
info_dict)
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld( json_ld = self._search_json_ld(

View File

@ -0,0 +1,218 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
from ..utils import (
clean_html,
get_element_by_class,
int_or_none,
merge_dicts,
url_or_none,
urljoin,
)
class ThisVidIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'description': 'md5:372353bb995883d1b65fddf507489acd',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}, {
'url': 'https://thisvid.com/embed/3533241/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}]
def _real_extract(self, url):
main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
webpage = self._download_webpage(url, main_id)
title = self._html_search_regex(
r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
webpage, 'title')
if type_ == 'embed':
# look for more metadata
video_alt_url = url_or_none(self._search_regex(
r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ),
webpage, 'video_alt_url', default=None))
if video_alt_url and video_alt_url != url:
webpage = self._download_webpage(
video_alt_url, main_id,
note='Redirecting embed to main page', fatal=False) or webpage
video_holder = get_element_by_class('video-holder', webpage) or ''
if '>This video is a private video' in video_holder:
self.raise_login_required(
(clean_html(video_holder) or 'Private video').split('\n', 1)[0])
uploader = self._html_search_regex(
r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
webpage, 'uploader', default='')
uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
if len(uploader) == 2:
# id must be non-empty, uploader could be ''
uploader_id, uploader = uploader
uploader = uploader or None
else:
uploader_id = uploader = None
return merge_dicts({
'_type': 'url_transparent',
'title': title,
'age_limit': 18,
'uploader': uploader,
'uploader_id': uploader_id,
}, self.url_result(url, ie='Generic'))
class ThisVidMemberIE(InfoExtractor):
_VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
_TESTS = [{
'url': 'https://thisvid.com/members/2140501/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Profile',
},
'playlist_mincount': 16,
}, {
'url': 'https://thisvid.com/members/2140501/favourite_videos/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Favourite Videos',
},
'playlist_mincount': 15,
}, {
'url': 'https://thisvid.com/members/636468/public_videos/',
'info_dict': {
'id': '636468',
'title': 'Happymouth\'s Public Videos',
},
'playlist_mincount': 196,
},
]
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html):
yield m.group('url')
def _real_extract(self, url):
pl_id = self._match_id(url)
webpage = self._download_webpage(url, pl_id)
title = re.split(
r'(?i)\s*\|\s*ThisVid\.com\s*$',
self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None
def entries(page_url, html=None):
for page in itertools.count(1):
if not html:
html = self._download_webpage(
page_url, pl_id, note='Downloading page %d' % (page, ),
fatal=False) or ''
for u in self._urls(html):
yield u
next_page = get_element_by_class('pagination-next', html) or ''
if next_page:
# member list page
next_page = urljoin(url, self._search_regex(
r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
next_page, 'next page link', group='url', default=None))
# in case a member page should have pagination-next with empty link, not just `else:`
if next_page is None:
# playlist page
parsed_url = compat_urlparse.urlparse(page_url)
base_path, num = parsed_url.path.rsplit('/', 1)
num = int_or_none(num)
if num is None:
base_path, num = parsed_url.path.rstrip('/'), 1
parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, )))
next_page = compat_urlparse.urlunparse(parsed_url)
if page_url == next_page:
next_page = None
if not next_page:
break
page_url, html = next_page, None
return self.playlist_from_matches(
entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid')
class ThisVidPlaylistIE(ThisVidMemberIE):
_VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '6615',
'title': 'Underwear Stuff',
},
'playlist_mincount': 200,
}, {
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '1072387',
'ext': 'mp4',
'title': 'Big Italian Booty 28',
'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
'uploader_id': '367912',
'uploader': 'Jcmusclefun',
'age_limit': 18,
},
'params': {
'noplaylist': True,
},
}]
def _get_video_url(self, pl_url):
video_id = re.match(self._VALID_URL, pl_url).group('video_id')
return urljoin(pl_url, '/videos/%s/' % (video_id, ))
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (self._VALID_URL, ), html):
yield self._get_video_url(m.group('url'))
def _real_extract(self, url):
pl_id = self._match_id(url)
if self._downloader.params.get('noplaylist'):
self.to_screen('Downloading just the featured video because of --no-playlist')
return self.url_result(self._get_video_url(url), 'ThisVid')
self.to_screen(
'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, ))
result = super(ThisVidPlaylistIE, self)._real_extract(url)
# rework title returned as `the title - the title`
title = result['title']
t_len = len(title)
if t_len > 5 and t_len % 2 != 0:
t_len = t_len // 2
if title[t_len] == '-':
title = [t.strip() for t in (title[:t_len], title[t_len + 1:])]
if title[0] and title[0] == title[1]:
result['title'] = title[0]
return result