diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e5d6958fca93..08ea9666ed20 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -216,5 +216,23 @@ "action": "add", "when": "d784464399b600ba9516bbcec6286f11d68974dd", "short": "[priority] **The minimum *required* Python version has been raised to 3.9**\nPython 3.8 reached its end-of-life on 2024.10.07, and yt-dlp has now removed support for it. As an unfortunate side effect, the official `yt-dlp.exe` and `yt-dlp_x86.exe` binaries are no longer supported on Windows 7. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "914af9a0cf51c9a3f74aa88d952bee8334c67511", + "short": "Expand paths in `--plugin-dirs` (#11334)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "c29f5a7fae93a08f3cfbb6127b2faa75145b06a0", + "short": "[ie/generic] Do not impersonate by default (#11336)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "57212a5f97ce367590aaa5c3e9a135eead8f81f7", + "short": "[ie/vimeo] Fix API retries (#11351)", + "authors": ["bashonly"] } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 00634fb9116d..7c876101b49d 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -71,14 +71,13 @@ def group_lookup(cls): def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: group, _, subgroup = (group.strip().lower() for group in value.partition('/')) - result = cls.group_lookup().get(group) - if not result: - if subgroup: - return None, value - subgroup = group - result = cls.subgroup_lookup().get(subgroup) + if result := cls.group_lookup().get(group): + return result, subgroup or None - return result, subgroup or None + if subgroup: + return None, value + + return cls.subgroup_lookup().get(group), group or None @dataclass @@ -136,8 +135,7 @@ def _format_groups(self, groups): first = False yield '\n

Changelog

\n' - group = groups[item] - if group: + if group := groups[item]: yield self.format_module(item.value, group) if self._collapsible: @@ -253,7 +251,7 @@ class CommitRange: ''', re.VERBOSE | re.DOTALL) EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') - FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert|Improve)\s+([\da-f]{40})') + FIXES_RE = re.compile(r'(?i:(?:bug\s*)?fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Improve)\s+([\da-f]{40})') UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') def __init__(self, start, end, default_author=None): @@ -287,11 +285,16 @@ def _get_commits_and_fixes(self, default_author): short = next(lines) skip = short.startswith('Release ') or short == '[version] update' + fix_commitish = None + if match := self.FIXES_RE.search(short): + fix_commitish = match.group(1) + authors = [default_author] if default_author else [] for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR): - match = self.AUTHOR_INDICATOR_RE.match(line) - if match: + if match := self.AUTHOR_INDICATOR_RE.match(line): authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold) + if not fix_commitish and (match := self.FIXES_RE.fullmatch(line)): + fix_commitish = match.group(1) commit = Commit(commit_hash, short, authors) if skip and (self._start or not i): @@ -301,21 +304,17 @@ def _get_commits_and_fixes(self, default_author): logger.debug(f'Reached Release commit, breaking: {commit}') break - revert_match = self.REVERT_RE.fullmatch(commit.short) - if revert_match: - reverts[revert_match.group(1)] = commit + if match := self.REVERT_RE.fullmatch(commit.short): + reverts[match.group(1)] = commit continue - fix_match = self.FIXES_RE.search(commit.short) - if fix_match: - commitish = fix_match.group(1) - fixes[commitish].append(commit) + if fix_commitish: + fixes[fix_commitish].append(commit) commits[commit.hash] = commit for commitish, revert_commit in reverts.items(): - reverted = commits.pop(commitish, None) - if reverted: + if reverted := commits.pop(commitish, None): logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -461,8 +460,7 @@ def create_changelog(args): logger.info(f'Loaded {len(commits)} commits') - new_contributors = get_new_contributors(args.contributors_path, commits) - if new_contributors: + if new_contributors := get_new_contributors(args.contributors_path, commits): if args.contributors: write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 31e8f82448d4..54f35ef55221 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -53,6 +53,18 @@ def setUp(self): def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) + def test_get_netrc_login_info(self): + for params in [ + {'usenetrc': True, 'netrc_location': './test/testdata/netrc/netrc'}, + {'netrc_cmd': f'{sys.executable} ./test/testdata/netrc/print_netrc.py'}, + ]: + ie = DummyIE(FakeYDL(params)) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='normal_use'), ('user', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_user'), ('', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_pass'), ('user', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='both_empty'), ('', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='nonexistent'), (None, None)) + def test_html_search_regex(self): html = '

Watch this video

' search = lambda re, *args: self.ie._html_search_regex(re, html, *args) diff --git a/test/test_traversal.py b/test/test_traversal.py index 9179dadda47c..f1d123bd6e5f 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -12,9 +12,10 @@ str_or_none, ) from yt_dlp.utils.traversal import ( - traverse_obj, require, subs_list_to_dict, + traverse_obj, + trim_str, ) _TEST_DATA = { @@ -495,6 +496,20 @@ def test_subs_list_to_dict(self): {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, ]}, '`quality` key should sort subtitle list accordingly' + def test_trim_str(self): + with pytest.raises(TypeError): + trim_str('positional') + + assert callable(trim_str(start='a')) + assert trim_str(start='ab')('abc') == 'c' + assert trim_str(end='bc')('abc') == 'a' + assert trim_str(start='a', end='c')('abc') == 'b' + assert trim_str(start='ab', end='c')('abc') == '' + assert trim_str(start='a', end='bc')('abc') == '' + assert trim_str(start='ab', end='bc')('abc') == '' + assert trim_str(start='abc', end='abc')('abc') == '' + assert trim_str(start='', end='')('abc') == 'abc' + class TestDictGet: def test_dict_get(self): diff --git a/test/test_utils.py b/test/test_utils.py index d4b846f56fba..04f91547a4c2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import os import sys import unittest +import unittest.mock import warnings import datetime as dt @@ -71,6 +72,7 @@ intlist_to_bytes, iri_to_uri, is_html, + join_nonempty, js_to_json, limit_length, locked_file, @@ -343,11 +345,13 @@ def test_remove_start(self): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') self.assertEqual(remove_start('B - A', 'A - '), 'B - A') + self.assertEqual(remove_start('non-empty', ''), 'non-empty') def test_remove_end(self): self.assertEqual(remove_end(None, ' - B'), None) self.assertEqual(remove_end('A - B', ' - B'), 'A') self.assertEqual(remove_end('B - A', ' - B'), 'B - A') + self.assertEqual(remove_end('non-empty', ''), 'non-empty') def test_remove_quotes(self): self.assertEqual(remove_quotes(None), None) @@ -2148,6 +2152,16 @@ def run_shell(args): assert run_shell(args) == expected assert run_shell(shell_quote(args, shell=True)) == expected + def test_partial_application(self): + assert callable(int_or_none(scale=10)), 'missing positional parameter should apply partially' + assert int_or_none(10, scale=0.1) == 100, 'positionally passed argument should call function' + assert int_or_none(v=10) == 10, 'keyword passed positional should call function' + assert int_or_none(scale=0.1)(10) == 100, 'call after partial applicatino should call the function' + + assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially' + assert callable(join_nonempty()), 'varargs positional should apply partially' + assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function' + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/netrc/netrc b/test/testdata/netrc/netrc new file mode 100644 index 000000000000..bafe92fe6a1e --- /dev/null +++ b/test/testdata/netrc/netrc @@ -0,0 +1,4 @@ +machine normal_use login user password pass +machine empty_user login "" password pass +machine empty_pass login user password "" +machine both_empty login "" password "" diff --git a/test/testdata/netrc/print_netrc.py b/test/testdata/netrc/print_netrc.py new file mode 100644 index 000000000000..5c25814f8496 --- /dev/null +++ b/test/testdata/netrc/print_netrc.py @@ -0,0 +1,2 @@ +with open('./test/testdata/netrc/netrc', encoding='utf-8') as fp: + print(fp.read()) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 2c622d6f4410..d89be57ca30d 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2849,13 +2849,10 @@ def is_wellformed(f): sanitize_string_field(fmt, 'format_id') sanitize_numeric_fields(fmt) fmt['url'] = sanitize_url(fmt['url']) - if fmt.get('ext') is None: - fmt['ext'] = determine_ext(fmt['url']).lower() + FormatSorter._fill_sorting_fields(fmt) if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): if fmt.get('acodec') is None: fmt['acodec'] = fmt['ext'] - if fmt.get('protocol') is None: - fmt['protocol'] = determine_protocol(fmt) if fmt.get('resolution') is None: fmt['resolution'] = self.format_resolution(fmt, default=None) if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none': diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8d59360949f9..13b5633d4627 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -401,8 +401,6 @@ from .cnbc import CNBCVideoIE from .cnn import ( CNNIE, - CNNArticleIE, - CNNBlogsIE, CNNIndonesiaIE, ) from .comedycentral import ( diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py index ffe4b49c15d9..7014c208d43a 100644 --- a/yt_dlp/extractor/ccma.py +++ b/yt_dlp/extractor/ccma.py @@ -12,53 +12,86 @@ class CCMAIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?Pvideo|audio)/(?P\d+)' + IE_DESC = '3Cat, TV3 and Catalunya Ràdio' + _VALID_URL = r'https?://(?:www\.)?3cat\.cat/(?:3cat|tv3/sx3)/[^/?#]+/(?Pvideo|audio)/(?P\d+)' _TESTS = [{ - 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + # ccma.cat/tv3/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/lespot-de-la-marato-de-tv3/video/5630208/', 'md5': '7296ca43977c8ea4469e719c609b0871', 'info_dict': { 'id': '5630208', 'ext': 'mp4', - 'title': 'L\'espot de La Marató de TV3', + 'title': 'L\'espot de La Marató 2016: Ictus i les lesions medul·lars i cerebrals traumàtiques', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', 'timestamp': 1478608140, 'upload_date': '20161108', 'age_limit': 0, + 'alt_title': 'EsportMarató2016WEB_PerPublicar', + 'duration': 79, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/4/6/1478536106664.jpg', + 'series': 'Dedicada a l\'ictus i les lesions medul·lars i cerebrals traumàtiques', + 'categories': ['Divulgació'], }, }, { - 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + # ccma.cat/catradio/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/el-consell-de-savis-analitza-el-derbi/audio/943685/', 'md5': 'fa3e38f269329a278271276330261425', 'info_dict': { 'id': '943685', 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20170512', - 'timestamp': 1494622500, + 'upload_date': '20161217', + 'timestamp': 1482011700, 'vcodec': 'none', 'categories': ['Esports'], + 'series': 'Tot gira', + 'duration': 821, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/8/9/1482002602598.jpg', }, }, { - 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', - 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'url': 'https://www.3cat.cat/3cat/crims-josep-tallada-lespereu-me-part-1/video/6031387/', + 'md5': '27493513d08a3e5605814aee9bb778d2', 'info_dict': { 'id': '6031387', 'ext': 'mp4', - 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'title': 'T1xC5 - Josep Talleda, l\'"Espereu-me" (part 1)', 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', - 'timestamp': 1582577700, + 'timestamp': 1582577919, 'upload_date': '20200224', - 'subtitles': 'mincount:4', - 'age_limit': 16, + 'subtitles': 'mincount:1', + 'age_limit': 13, 'series': 'Crims', + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/1/9/1582564376991.jpg', + 'duration': 3203, + 'categories': ['Divulgació'], + 'alt_title': 'Crims - 5 - Josep Talleda, l\'"Espereu-me" (1a part) - Josep Talleda, l\'"Espereu-me" (part 1)', + 'episode_number': 5, + 'episode': 'Episode 5', + }, + }, { + 'url': 'https://www.3cat.cat/tv3/sx3/una-mosca-volava-per-la-llum/video/5759227/', + 'info_dict': { + 'id': '5759227', + 'ext': 'mp4', + 'title': 'Una mosca volava per la llum', + 'alt_title': '17Z004Ç UNA MOSCA VOLAVA PER LA LLUM', + 'description': 'md5:9ab64276944b0825336f4147f13f7854', + 'series': 'Mic', + 'upload_date': '20180411', + 'timestamp': 1523440105, + 'duration': 160, + 'age_limit': 0, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/6/1/1524071667216.jpg', + 'categories': ['Música'], }, }] def _real_extract(self, url): - media_type, media_id = self._match_valid_url(url).groups() + media_type, media_id = self._match_valid_url(url).group('type', 'id') media = self._download_json( - 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'http://api-media.3cat.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, 'format': 'dm', diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py index fe7615a89148..cfcec9d1fd9e 100644 --- a/yt_dlp/extractor/cnn.py +++ b/yt_dlp/extractor/cnn.py @@ -1,146 +1,226 @@ +import functools +import json +import re + from .common import InfoExtractor -from .turner import TurnerBaseIE -from ..utils import merge_dicts, try_call, url_basename +from ..utils import ( + clean_html, + extract_attributes, + int_or_none, + merge_dicts, + parse_duration, + parse_iso8601, + parse_resolution, + try_call, + update_url, + url_or_none, +) +from ..utils.traversal import find_elements, traverse_obj -class CNNIE(TurnerBaseIE): - _VALID_URL = r'''(?x)https?://(?:(?Pedition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ - (?P.+?/(?P[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' +class CNNIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:edition|www|money|cnnespanol)\.)?cnn\.com/(?!audio/)(?P<display_id>[^?#]+?)(?:[?#]|$|/index\.html)' _TESTS = [{ - 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', - 'md5': '3e6121ea48df7e2259fe73a0628605c4', + 'url': 'https://www.cnn.com/2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl', 'info_dict': { - 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'id': 'med0e97ad0d154f56e29aa96e57192a14226734b6b', + 'display_id': '2024/05/31/sport/video/jadon-sancho-borussia-dortmund-champions-league-exclusive-spt-intl', 'ext': 'mp4', - 'title': 'Nadal wins 8th French Open title', - 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', - 'duration': 135, - 'upload_date': '20130609', + 'upload_date': '20240531', + 'description': 'md5:844bcdb0629e1877a7a466c913f4c19c', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/gettyimages-2151936122.jpg?c=original', + 'duration': 373.0, + 'timestamp': 1717148586, + 'title': 'Borussia Dortmund star Jadon Sancho seeks Wembley redemption after 2020 Euros hurt', + 'modified_date': '20240531', + 'modified_timestamp': 1717150140, }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', - 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', + 'url': 'https://edition.cnn.com/2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid', 'info_dict': { - 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'id': 'me522945c4709b299e5cb8657900a7a21ad3b559f9', + 'display_id': '2024/06/11/politics/video/inmates-vote-jail-nevada-murray-dnt-ac360-digvid', 'ext': 'mp4', - 'title': "Student's epic speech stuns new freshmen", - 'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."', - 'upload_date': '20130821', + 'description': 'md5:e0120fe5da9ad8259fd707c1cbb64a60', + 'title': 'Here’s how some inmates in closely divided state are now able to vote from jail', + 'timestamp': 1718158269, + 'upload_date': '20240612', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701554-13565-571-still.jpg?c=original', + 'duration': 202.0, + 'modified_date': '20240612', + 'modified_timestamp': 1718158509, }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', - 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'url': 'https://edition.cnn.com/2024/06/11/style/king-charles-portrait-vandalized/index.html', 'info_dict': { - 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'id': 'mef5f52b9e1fe28b1ad192afcbc9206ae984894b68', + 'display_id': '2024/06/11/style/king-charles-portrait-vandalized', 'ext': 'mp4', - 'title': 'Nashville Ep. 1: Hand crafted skateboards', - 'description': 'md5:e7223a503315c9f150acac52e76de086', - 'upload_date': '20141222', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/still-20701257-8846-816-still.jpg?c=original', + 'description': 'md5:19f78338ccec533db0fa8a4511012dae', + 'title': 'Video shows King Charles\' portrait being vandalized by activists', + 'timestamp': 1718113852, + 'upload_date': '20240611', + 'duration': 51.0, + 'modified_timestamp': 1718116193, + 'modified_date': '20240611', }, - 'expected_warnings': ['Failed to download m3u8 information'], }, { - 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', - 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'url': 'https://edition.cnn.com/videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln', 'info_dict': { - 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'id': 'mefba13799201b084ea3b1d0f7ca820ae94d4bb5b2', + 'display_id': 'videos/media/2022/12/05/robin-meade-final-sign-off-broadcast-hln-mxp-contd-vpx.hln', 'ext': 'mp4', - 'title': '5 stunning stats about Netflix', - 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', - 'upload_date': '20160819', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/221205163510-robin-meade-sign-off.jpg?c=original', + 'duration': 158.0, + 'title': 'Robin Meade signs off after HLN\'s last broadcast', + 'description': 'md5:cff3c62d18d2fbc6c5c75cb029b7353b', + 'upload_date': '20221205', + 'timestamp': 1670284296, + 'modified_timestamp': 1670332404, + 'modified_date': '20221206', }, + 'params': {'format': 'direct'}, }, { - 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', - 'only_matching': True, - }, { - 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', - 'only_matching': True, - }, { - 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', - 'only_matching': True, - }] - - _CONFIG = { - # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml - 'edition': { - 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', - 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + 'url': 'https://cnnespanol.cnn.com/video/ataque-misil-israel-beirut-libano-octubre-trax', + 'info_dict': { + 'id': 'me484a43722642aa00627b812fe928f2e99c6e2997', + 'ext': 'mp4', + 'display_id': 'video/ataque-misil-israel-beirut-libano-octubre-trax', + 'timestamp': 1729501452, + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/ataqeubeirut-1.jpg?c=original', + 'description': 'md5:256ee7137d161f776cda429654135e52', + 'upload_date': '20241021', + 'duration': 31.0, + 'title': 'VIDEO | Israel lanza un nuevo ataque sobre Beirut', + 'modified_date': '20241021', + 'modified_timestamp': 1729501530, }, - # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml - 'money': { - 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', - 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, { + 'url': 'https://edition.cnn.com/2024/10/16/politics/kamala-harris-fox-news-interview/index.html', + 'info_dict': { + 'id': '2024/10/16/politics/kamala-harris-fox-news-interview', }, - } - - def _extract_timestamp(self, video_data): - # TODO: fix timestamp extraction - return None + 'playlist_count': 2, + 'playlist': [{ + 'md5': '073ffab87b8bef97c9913e71cc18ef9e', + 'info_dict': { + 'id': 'me19d548fdd54df0924087039283128ef473ab397d', + 'ext': 'mp4', + 'title': '\'I\'m not finished\': Harris interview with Fox News gets heated', + 'display_id': 'kamala-harris-fox-news-interview-ebof-digvid', + 'description': 'md5:e7dd3d1a04df916062230b60ca419a0a', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/harris-20241016234916617.jpg?c=original', + 'duration': 173.0, + 'timestamp': 1729122182, + 'upload_date': '20241016', + 'modified_timestamp': 1729194706, + 'modified_date': '20241017', + }, + 'params': {'format': 'direct'}, + }, { + 'md5': '11604ab4af83b650826753f1ccb8ecff', + 'info_dict': { + 'id': 'med04507d8ca3da827001f63d22af321ec29c7d97b', + 'ext': 'mp4', + 'title': '\'Wise\': Buttigieg on Harris\' handling of interview question about gender transition surgery', + 'display_id': 'pete-buttigieg-harris-fox-newssrc-digvid', + 'description': 'md5:602a8a7e853ed5e574acd3159428c98e', + 'thumbnail': 'https://media.cnn.com/api/v1/images/stellar/prod/buttigieg-20241017040412074.jpg?c=original', + 'duration': 145.0, + 'timestamp': 1729137765, + 'upload_date': '20241017', + 'modified_timestamp': 1729138184, + 'modified_date': '20241017', + }, + 'params': {'format': 'direct'}, + }], + }] def _real_extract(self, url): - sub_domain, path, page_title = self._match_valid_url(url).groups() - if sub_domain not in ('money', 'edition'): - sub_domain = 'edition' - config = self._CONFIG[sub_domain] - return self._extract_cvp_info( - config['data_src'] % path, page_title, { - 'default': { - 'media_src': config['media_src'], - }, - 'f4m': { - 'host': 'cnn-vh.akamaihd.net', - }, - }) + display_id = self._match_valid_url(url).group('display_id') + webpage = self._download_webpage(url, display_id) + app_id = traverse_obj( + self._search_json(r'window\.env\s*=', webpage, 'window env', display_id, default={}), + ('TOP_AUTH_SERVICE_APP_ID', {str})) + entries = [] + for player_data in traverse_obj(webpage, ( + {find_elements(tag='div', attr='data-component-name', value='video-player', html=True)}, + ..., {extract_attributes}, all, lambda _, v: v['data-media-id'])): + media_id = player_data['data-media-id'] + parent_uri = player_data.get('data-video-resource-parent-uri') + formats, subtitles = [], {} -class CNNBlogsIE(InfoExtractor): - _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+' - _TEST = { - 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', - 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', - 'info_dict': { - 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', - 'ext': 'mp4', - 'title': 'Criminalizing journalism?', - 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', - 'upload_date': '20140209', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } + video_data = {} + if parent_uri: + video_data = self._download_json( + 'https://fave.api.cnn.io/v1/video', media_id, fatal=False, + query={ + 'id': media_id, + 'stellarUri': parent_uri, + }) + for direct_url in traverse_obj(video_data, ('files', ..., 'fileUri', {url_or_none})): + resolution, bitrate = None, None + if mobj := re.search(r'-(?P<res>\d+x\d+)_(?P<tbr>\d+)k\.mp4', direct_url): + resolution, bitrate = mobj.group('res', 'tbr') + formats.append({ + 'url': direct_url, + 'format_id': 'direct', + 'quality': 1, + 'tbr': int_or_none(bitrate), + **parse_resolution(resolution), + }) + for sub_data in traverse_obj(video_data, ( + 'closedCaptions', 'types', lambda _, v: url_or_none(v['track']['url']), 'track')): + subtitles.setdefault(sub_data.get('lang') or 'en', []).append({ + 'url': sub_data['url'], + 'name': sub_data.get('label'), + }) - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return self.url_result(cnn_url, CNNIE.ie_key()) + if app_id: + media_data = self._download_json( + f'https://medium.ngtv.io/v2/media/{media_id}/desktop', media_id, fatal=False, + query={'appId': app_id}) + m3u8_url = traverse_obj(media_data, ( + 'media', 'desktop', 'unprotected', 'unencrypted', 'url', {url_or_none})) + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + entries.append({ + **traverse_obj(player_data, { + 'title': ('data-headline', {clean_html}), + 'description': ('data-description', {clean_html}), + 'duration': ('data-duration', {parse_duration}), + 'timestamp': ('data-publish-date', {parse_iso8601}), + 'thumbnail': ( + 'data-poster-image-override', {json.loads}, 'big', 'uri', {url_or_none}, + {functools.partial(update_url, query='c=original')}), + 'display_id': 'data-video-slug', + }), + **traverse_obj(video_data, { + 'timestamp': ('dateCreated', 'uts', {int_or_none(scale=1000)}), + 'description': ('description', {clean_html}), + 'title': ('headline', {str}), + 'modified_timestamp': ('lastModified', 'uts', {int_or_none(scale=1000)}), + 'duration': ('trt', {int_or_none}), + }), + 'id': media_id, + 'formats': formats, + 'subtitles': subtitles, + }) -class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' - _TEST = { - 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', - 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', - 'info_dict': { - 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', - 'ext': 'mp4', - 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', - 'upload_date': '20141221', - }, - 'expected_warnings': ['Failed to download m3u8 information'], - 'add_ie': ['CNN'], - } + if len(entries) == 1: + return { + **entries[0], + 'display_id': display_id, + } - def _real_extract(self, url): - webpage = self._download_webpage(url, url_basename(url)) - cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) + return self.playlist_result(entries, display_id) class CNNIndonesiaIE(InfoExtractor): diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 795105b7d878..7e6e6227d339 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -47,6 +47,7 @@ FormatSorter, GeoRestrictedError, GeoUtils, + ISO639Utils, LenientJSONDecoder, Popen, RegexNotFoundError, @@ -1408,6 +1409,13 @@ def _get_netrc_login_info(self, netrc_machine=None): return None, None self.write_debug(f'Using netrc for {netrc_machine} authentication') + + # compat: <=py3.10: netrc cannot parse tokens as empty strings, will return `""` instead + # Ref: https://github.com/yt-dlp/yt-dlp/issues/11413 + # https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378 + if sys.version_info < (3, 11): + return tuple(x if x != '""' else '' for x in info[::2]) + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): @@ -3071,7 +3079,11 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): url_pattern = stream.attrib['Url'] stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') - stream_language = stream.get('Language', 'und') + # IsmFD expects ISO 639 Set 2 language codes (3-character length) + # See: https://github.com/yt-dlp/yt-dlp/issues/11356 + stream_language = stream.get('Language') or 'und' + if len(stream_language) != 3: + stream_language = ISO639Utils.short2long(stream_language) or 'und' for track in stream.findall('QualityLevel'): KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index e06740d62edf..961dd0c5e966 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -869,7 +869,7 @@ def _real_extract(self, url): class NiconicoUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)(?:/video)?/?(?:$|[#?])' _TEST = { 'url': 'https://www.nicovideo.jp/user/419948', 'info_dict': { @@ -877,7 +877,7 @@ class NiconicoUserIE(InfoExtractor): }, 'playlist_mincount': 101, } - _API_URL = 'https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' + _API_URL = 'https://nvapi.nicovideo.jp/v2/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' _PAGE_SIZE = 100 _API_HEADERS = { @@ -897,12 +897,13 @@ def _entries(self, list_id): total_count = int_or_none(json_parsed['data'].get('totalCount')) for entry in json_parsed['data']['items']: count += 1 - yield self.url_result('https://www.nicovideo.jp/watch/{}'.format(entry['id'])) + yield self.url_result( + f'https://www.nicovideo.jp/watch/{entry["essential"]["id"]}', ie=NiconicoIE) page_num += 1 def _real_extract(self, url): list_id = self._match_id(url) - return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key()) + return self.playlist_result(self._entries(list_id), list_id) class NiconicoLiveIE(InfoExtractor): diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 4f8d96407d0a..f4beab75b7df 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -208,7 +208,6 @@ def sign(self, user, pw, clid): def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False): track_id = str(info['id']) - title = info['title'] format_urls = set() formats = [] @@ -367,7 +366,7 @@ def extract_count(key): 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), 'uploader_url': user.get('permalink_url'), 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, + 'title': info.get('title'), 'description': info.get('description'), 'thumbnails': thumbnails, 'duration': float_or_none(info.get('duration'), 1000), @@ -377,7 +376,8 @@ def extract_count(key): 'like_count': extract_count('favoritings') or extract_count('likes'), 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), - 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)), + 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), + 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), 'formats': formats if not extract_flat else None, } @@ -429,7 +429,6 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', 'uploader_url': 'https://soundcloud.com/ethmusic', - 'genres': [], }, }, # geo-restricted @@ -453,6 +452,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/the-concept-band', 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', 'genres': ['Alternative'], + 'artists': ['The Royal Concept'], }, }, # private link @@ -525,6 +525,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'view_count': int, 'genres': ['Dance & EDM'], + 'artists': ['80M'], }, }, # private link, downloadable format @@ -549,6 +550,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg', 'uploader_url': 'https://soundcloud.com/oriuplift', 'genres': ['Trance'], + 'artists': ['Ori Uplift'], }, }, # no album art, use avatar pic for thumbnail @@ -572,7 +574,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'comment_count': int, 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', - 'genres': [], + 'artists': ['MadReal'], }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 5adaf16393c5..8196ce6c328b 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -150,14 +150,6 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - # XXX: Temporary workaround until twitter.com => x.com migration is completed - def _real_initialize(self): - if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'): - return - # User has not yet been migrated to x.com and has passed twitter.com cookies - TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/' - TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - @functools.cached_property def _selected_api(self): return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 367d5e5835d6..0ed7b9ec1fe4 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -869,11 +869,12 @@ def _extract_from_api(self, video_id, unlisted_hash=None): for retry in (False, True): try: video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + break except ExtractorError as e: if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 and 'password' in traverse_obj( - e.cause.response.read(), - ({bytes.decode}, {json.loads}, 'invalid_parameters', ..., 'field'), + self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), + ({json.loads}, 'invalid_parameters', ..., 'field'), )): self._verify_video_password( video_id, self._get_video_password(), viewer['xsrft']) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5148e8261900..99b8bfecc92f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -644,13 +644,14 @@ def _initialize_oauth(self, user, refresh_token): YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE] = {} if refresh_token: - refresh_token = refresh_token.strip('\'') or None - - # Allow refresh token passed to initialize cache - if refresh_token: + msg = f'{self._OAUTH_DISPLAY_ID}: Using password input as refresh token' + if self.get_param('cachedir') is not False: + msg += ' and caching token to disk; you should supply an empty password next time' + self.to_screen(msg) self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) + else: + refresh_token = self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) - refresh_token = refresh_token or self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) if refresh_token: YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token try: diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 7aff67ddfc25..e30008e931a7 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -212,6 +212,23 @@ def write_json_file(obj, fn): raise +def partial_application(func): + sig = inspect.signature(func) + required_args = [ + param.name for param in sig.parameters.values() + if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL) + if param.default is inspect.Parameter.empty + ] + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if set(required_args[len(args):]).difference(kwargs): + return functools.partial(func, *args, **kwargs) + return func(*args, **kwargs) + + return wrapped + + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) @@ -1192,6 +1209,7 @@ def extract_timezone(date_str, default=None): return timezone, date_str +@partial_application def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -1269,6 +1287,7 @@ def unified_timestamp(date_str, day_first=True): return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() +@partial_application def determine_ext(url, default_ext='unknown_video'): if url is None or '.' not in url: return default_ext @@ -1944,7 +1963,7 @@ def remove_start(s, start): def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s + return s[:-len(end)] if s is not None and end and s.endswith(end) else s def remove_quotes(s): @@ -1973,6 +1992,7 @@ def base_url(url): return re.match(r'https?://[^?#]+/', url).group() +@partial_application def urljoin(base, path): if isinstance(path, bytes): path = path.decode() @@ -1988,21 +2008,6 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -def partial_application(func): - sig = inspect.signature(func) - - @functools.wraps(func) - def wrapped(*args, **kwargs): - try: - sig.bind(*args, **kwargs) - except TypeError: - return functools.partial(func, *args, **kwargs) - else: - return func(*args, **kwargs) - - return wrapped - - @partial_application def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr and v is not None: @@ -2583,6 +2588,7 @@ def urlencode_postdata(*args, **kargs): return urllib.parse.urlencode(*args, **kargs).encode('ascii') +@partial_application def update_url(url, *, query_update=None, **kwargs): """Replace URL components specified by kwargs @param url str or parse url tuple @@ -2603,6 +2609,7 @@ def update_url(url, *, query_update=None, **kwargs): return urllib.parse.urlunparse(url._replace(**kwargs)) +@partial_application def update_url_query(url, query): return update_url(url, query_update=query) @@ -2924,6 +2931,7 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' +@partial_application def mimetype2ext(mt, default=NO_DEFAULT): if not isinstance(mt, str): if default is not NO_DEFAULT: @@ -4664,6 +4672,7 @@ def to_high_limit_path(path): return path +@partial_application def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traversal.traverse_obj(obj, *variadic(field)) if not val if ignore is NO_DEFAULT else val in variadic(ignore): @@ -4828,6 +4837,7 @@ def number_of_digits(number): return len('%d' % number) +@partial_application def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) @@ -5165,6 +5175,7 @@ class _UnsafeExtensionError(Exception): 'ico', 'image', 'jng', + 'jpe', 'jpeg', 'jxl', 'svg', @@ -5277,6 +5288,7 @@ def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffi time.sleep(delay) +@partial_application def make_archive_id(ie, video_id): ie_key = ie if isinstance(ie, str) else ie.ie_key() return f'{ie_key.lower()} {video_id}' @@ -5578,14 +5590,15 @@ def _calculate_field_preference(self, format_, field): value = get_value(field) return self._calculate_field_preference_from_value(format_, field, type_, value) - def calculate_preference(self, format): + @staticmethod + def _fill_sorting_fields(format): # Determine missing protocol if not format.get('protocol'): format['protocol'] = determine_protocol(format) # Determine missing ext if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() if format.get('vcodec') == 'none': format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' format['video_ext'] = 'none' @@ -5613,6 +5626,8 @@ def calculate_preference(self, format): if not format.get('tbr'): format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None + def calculate_preference(self, format): + self._fill_sorting_fields(format) return tuple(self._calculate_field_preference(format, field) for field in self._order) diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index df3ff406f57d..dd9b4690beb2 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -391,14 +391,13 @@ def find_element(*, tag: str, html=False): ... def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False): # deliberately using `id=` and `cls=` for ease of readability assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required' - if not tag: - tag = r'[\w:.-]+' + ANY_TAG = r'[\w:.-]+' if attr and value: assert not cls, 'Cannot match both attr and cls' assert not id, 'Cannot match both attr and id' func = get_element_html_by_attribute if html else get_element_by_attribute - return functools.partial(func, attr, value, tag=tag) + return functools.partial(func, attr, value, tag=tag or ANY_TAG) elif cls: assert not id, 'Cannot match both cls and id' @@ -408,7 +407,7 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=Fal elif id: func = get_element_html_by_id if html else get_element_by_id - return functools.partial(func, id, tag=tag) + return functools.partial(func, id, tag=tag or ANY_TAG) index = int(bool(html)) return lambda html: get_element_text_and_html_by_tag(tag, html)[index] @@ -436,6 +435,20 @@ def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False): return functools.partial(func, cls) +def trim_str(*, start=None, end=None): + def trim(s): + if s is None: + return None + start_idx = 0 + if start and s.startswith(start): + start_idx = len(start) + if end and s.endswith(end): + return s[start_idx:-len(end)] + return s[start_idx:] + + return trim + + def get_first(obj, *paths, **kwargs): return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False)