Merge branch 'yt-dlp:master' into master

observeroftime01 · Aug 2, 2023 · 956e5d6 · 956e5d6
2 parents 83b2500 + db97438
commit 956e5d6
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 16 deletions.
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
@@ -1420,7 +1420,7 @@
     PatreonIE,
     PatreonCampaignIE
 )
-from .pbs import PBSIE
+from .pbs import PBSIE, PBSKidsIE
 from .pearvideo import PearVideoIE
 from .peekvids import PeekVidsIE, PlayVidsIE
 from .peertube import (
@@ -1709,6 +1709,7 @@
     RuvIE,
     RuvSpilaIE
 )
+from .s4c import S4CIE
 from .safari import (
     SafariIE,
     SafariApiIE,

diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
@@ -481,7 +481,8 @@ def extract_dash_manifest(video, formats):
             dash_manifest = video.get('dash_manifest')
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest))))
+                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
+                    mpd_url=video.get('dash_manifest_url')))
 
         def process_formats(info):
             # Downloads with browser's User-Agent are rate limited. Working around

diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py
@@ -20,7 +20,7 @@
 
 
 class FOXIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+    _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)'
     _TESTS = [{
         # clip
         'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
@@ -50,6 +50,10 @@ class FOXIE(InfoExtractor):
         # sports event, geo-restricted
         'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/',
         'only_matching': True,
+    }, {
+        # fox sports replay, geo-restricted
+        'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89',
+        'only_matching': True,
     }]
     _GEO_BYPASS = False
     _HOME_PAGE_URL = 'https://www.fox.com/'

diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py
@@ -11,6 +11,7 @@
     orderedSet,
     strip_jsonp,
     strip_or_none,
+    traverse_obj,
     unified_strdate,
     url_or_none,
     US_RATINGS,
@@ -696,3 +697,61 @@ def extract_redirect_urls(info):
             'subtitles': subtitles,
             'chapters': chapters,
         }
+
+
+class PBSKidsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'https://pbskids.org/video/molly-of-denali/3030407927',
+            'md5': '1ded20a017cc6b53446238f1804ce4c7',
+            'info_dict': {
+                'id': '3030407927',
+                'title': 'Bird in the Hand/Bye-Bye Birdie',
+                'channel': 'molly-of-denali',
+                'duration': 1540,
+                'ext': 'mp4',
+                'series': 'Molly of Denali',
+                'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
+                'categories': ['Episode'],
+                'upload_date': '20190718',
+            }
+        },
+        {
+            'url': 'https://pbskids.org/video/plum-landing/2365205059',
+            'md5': '92e5d189851a64ae1d0237a965be71f5',
+            'info_dict': {
+                'id': '2365205059',
+                'title': 'Cooper\'s Favorite Place in Nature',
+                'channel': 'plum-landing',
+                'duration': 67,
+                'ext': 'mp4',
+                'series': 'Plum Landing',
+                'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
+                'categories': ['Episode'],
+                'upload_date': '20140302',
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id)
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(meta, {
+                'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}),
+                'channel': ('show_slug', {str}),
+                'description': ('video_obj', 'description', {str}),
+                'duration': ('video_obj', 'duration', {int_or_none}),
+                'series': ('video_obj', 'program_title', {str}),
+                'title': ('video_obj', 'title', {str}),
+                'upload_date': ('video_obj', 'air_date', {unified_strdate}),
+            })
+        }
diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py
@@ -1,7 +1,10 @@
+import urllib.parse
+
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    js_to_json,
+    str_or_none,
+    traverse_obj,
 )
 
 
@@ -84,7 +87,7 @@ def _real_extract(self, url):
 
 
 class PicartoVodIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv',
         'md5': '3ab45ba4352c52ee841a28fb73f2d9ca',
@@ -94,6 +97,18 @@ class PicartoVodIE(InfoExtractor):
             'title': 'ArtofZod_2017.12.12.00.13.23.flv',
             'thumbnail': r're:^https?://.*\.jpg'
         },
+        'skip': 'The VOD does not exist',
+    }, {
+        'url': 'https://picarto.tv/ArtofZod/videos/772650',
+        'md5': '00067a0889f1f6869cc512e3e79c521b',
+        'info_dict': {
+            'id': '772650',
+            'ext': 'mp4',
+            'title': 'Art of Zod - Drawing and Painting',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'channel': 'ArtofZod',
+            'age_limit': 18,
+        }
     }, {
         'url': 'https://picarto.tv/videopopout/Plague',
         'only_matching': True,
@@ -102,21 +117,36 @@ class PicartoVodIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-
-        vod_info = self._parse_json(
-            self._search_regex(
-                r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage,
-                'vod player'),
-            video_id, transform_source=js_to_json)
+        data = self._download_json(
+            'https://ptvintern.picarto.tv/ptvapi', video_id, query={
+                'query': f'''{{
+  video(id: "{video_id}") {{
+    id
+    title
+    adult
+    file_name
+    video_recording_image_url
+    channel {{
+      name
+    }}
+  }}
+}}'''
+            })['data']['video']
+
+        file_name = data['file_name']
+        netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc
 
         formats = self._extract_m3u8_formats(
-            vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native',
-            m3u8_id='hls')
+            f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls')
 
         return {
             'id': video_id,
-            'title': video_id,
-            'thumbnail': vod_info.get('vodThumb'),
+            **traverse_obj(data, {
+                'id': ('id', {str_or_none}),
+                'title': ('title', {str}),
+                'thumbnail': 'video_recording_image_url',
+                'channel': ('channel', 'name', {str}),
+                'age_limit': ('adult', {lambda x: 18 if x else 0}),
+            }),
             'formats': formats,
         }
diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py
@@ -0,0 +1,62 @@
+from .common import InfoExtractor
+from ..utils import traverse_obj
+
+
+class S4CIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.s4c.cymru/clic/programme/861362209',
+        'info_dict': {
+            'id': '861362209',
+            'ext': 'mp4',
+            'title': 'Y Swn',
+            'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
+            'duration': 5340
+        },
+    }, {
+        'url': 'https://www.s4c.cymru/clic/programme/856636948',
+        'info_dict': {
+            'id': '856636948',
+            'ext': 'mp4',
+            'title': 'Am Dro',
+            'duration': 2880,
+            'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        details = self._download_json(
+            f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}',
+            video_id, fatal=False)
+
+        filename = self._download_json(
+            'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
+                'programme_id': video_id,
+                'signed': '0',
+                'lang': 'en',
+                'mode': 'od',
+                'appId': 'clic',
+                'streamName': '',
+            }, note='Downloading player config JSON')['filename']
+        m3u8_url = self._download_json(
+            'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
+                'mode': 'od',
+                'application': 'clic',
+                'region': 'WW',
+                'extra': 'false',
+                'thirdParty': 'false',
+                'filename': filename,
+            }, note='Downloading streaming urls JSON')['hls']
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(details, ('full_prog_details', 0, {
+                'title': (('programme_title', 'series_title'), {str}),
+                'description': ('full_billing', {str.strip}),
+                'duration': ('duration', {lambda x: int(x) * 60}),
+            }), get_all=False),
+        }