From df4668754dd6901aeee6fbe1afca58e078ada552 Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Sun, 9 Jun 2024 22:53:56 -0700 Subject: [PATCH] Fix "Argument list too long" error in the "spot" check when checking 100k+ files (#866). --- NEWS | 2 + borgmatic/actions/check.py | 76 ++++++++++++++++++++++---------- tests/unit/actions/test_check.py | 40 +++++++++++++++++ 3 files changed, 94 insertions(+), 24 deletions(-) diff --git a/NEWS b/NEWS index 24a50bb..cffdbab 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,8 @@ * #860: Fix interaction between environment variable interpolation in constants and shell escaping. * #863: When color output is disabled (explicitly or implicitly), don't prefix each log line with the log level. + * #866: Fix "Argument list too long" error in the "spot" check when checking hundreds of thousands + of files at once. * #874: Add the configured repository label as "repository_label" to the interpolated variables passed to before/after command hooks. * In the "spot" check, don't try to hash symlinked directories. diff --git a/borgmatic/actions/check.py b/borgmatic/actions/check.py index d27dced..bded12a 100644 --- a/borgmatic/actions/check.py +++ b/borgmatic/actions/check.py @@ -387,6 +387,9 @@ def collect_spot_check_archive_paths( ) +SAMPLE_PATHS_SUBSET_COUNT = 10000 + + def compare_spot_check_hashes( repository, archive, @@ -419,32 +422,57 @@ def compare_spot_check_hashes( f'{log_label}: Sampling {sample_count} source paths (~{spot_check_config["data_sample_percentage"]}%) for spot check' ) - # Hash each file in the sample paths (if it exists). - hash_output = borgmatic.execute.execute_command_and_capture_output( - (spot_check_config.get('xxh64sum_command', 'xxh64sum'),) - + tuple(path for path in source_sample_paths if path in existing_source_sample_paths) - ) + source_sample_paths_iterator = iter(source_sample_paths) + source_hashes = {} + archive_hashes = {} - source_hashes = dict( - (reversed(line.split(' ', 1)) for line in hash_output.splitlines()), - **{path: '' for path in source_sample_paths if path not in existing_source_sample_paths}, - ) - - archive_hashes = dict( - reversed(line.split(' ', 1)) - for line in borgmatic.borg.list.capture_archive_listing( - repository['path'], - archive, - config, - local_borg_version, - global_arguments, - list_paths=source_sample_paths, - path_format='{xxh64} /{path}{NL}', # noqa: FS003 - local_path=local_path, - remote_path=remote_path, + # Only hash a few thousand files at a time (a subset of the total paths) to avoid an "Argument + # list too long" shell error. + while True: + # Hash each file in the sample paths (if it exists). + source_sample_paths_subset = tuple( + itertools.islice(source_sample_paths_iterator, SAMPLE_PATHS_SUBSET_COUNT) + ) + if not source_sample_paths_subset: + break + + hash_output = borgmatic.execute.execute_command_and_capture_output( + (spot_check_config.get('xxh64sum_command', 'xxh64sum'),) + + tuple( + path for path in source_sample_paths_subset if path in existing_source_sample_paths + ) + ) + + source_hashes.update( + **dict( + (reversed(line.split(' ', 1)) for line in hash_output.splitlines()), + # Represent non-existent files as having empty hashes so the comparison below still works. + **{ + path: '' + for path in source_sample_paths_subset + if path not in existing_source_sample_paths + }, + ) + ) + + # Get the hash for each file in the archive. + archive_hashes.update( + **dict( + reversed(line.split(' ', 1)) + for line in borgmatic.borg.list.capture_archive_listing( + repository['path'], + archive, + config, + local_borg_version, + global_arguments, + list_paths=source_sample_paths_subset, + path_format='{xxh64} /{path}{NL}', # noqa: FS003 + local_path=local_path, + remote_path=remote_path, + ) + if line + ) ) - if line - ) # Compare the source hashes with the archive hashes to see how many match. failing_paths = [] diff --git a/tests/unit/actions/test_check.py b/tests/unit/actions/test_check.py index e00b6ef..3a79360 100644 --- a/tests/unit/actions/test_check.py +++ b/tests/unit/actions/test_check.py @@ -770,6 +770,46 @@ def test_compare_spot_check_hashes_considers_non_existent_path_as_not_matching() ) == ('/bar',) +def test_compare_spot_check_hashes_with_too_many_paths_feeds_them_to_commands_in_chunks(): + flexmock(module).SAMPLE_PATHS_SUBSET_COUNT = 2 + flexmock(module.random).should_receive('sample').replace_with( + lambda population, count: population[:count] + ) + flexmock(module.os.path).should_receive('exists').and_return(True) + flexmock(module.borgmatic.execute).should_receive( + 'execute_command_and_capture_output' + ).with_args(('xxh64sum', '/foo', '/bar')).and_return('hash1 /foo\nhash2 /bar') + flexmock(module.borgmatic.execute).should_receive( + 'execute_command_and_capture_output' + ).with_args(('xxh64sum', '/baz', '/quux')).and_return('hash3 /baz\nhash4 /quux') + flexmock(module.borgmatic.borg.list).should_receive('capture_archive_listing').and_return( + ['hash1 /foo', 'hash2 /bar'] + ).and_return(['hash3 /baz', 'nothash4 /quux']) + + assert module.compare_spot_check_hashes( + repository={'path': 'repo'}, + archive='archive', + config={ + 'checks': [ + { + 'name': 'archives', + 'frequency': '2 weeks', + }, + { + 'name': 'spot', + 'data_sample_percentage': 100, + }, + ] + }, + local_borg_version=flexmock(), + global_arguments=flexmock(), + local_path=flexmock(), + remote_path=flexmock(), + log_label='repo', + source_paths=('/foo', '/bar', '/baz', '/quux'), + ) == ('/quux',) + + def test_spot_check_without_spot_configuration_errors(): with pytest.raises(ValueError): module.spot_check(