Fix "Argument list too long" error in the "spot" check when checking 100k+ files (#866).
This commit is contained in:
parent
08d6f83b2e
commit
df4668754d
3 changed files with 94 additions and 24 deletions
2
NEWS
2
NEWS
|
@ -2,6 +2,8 @@
|
||||||
* #860: Fix interaction between environment variable interpolation in constants and shell escaping.
|
* #860: Fix interaction between environment variable interpolation in constants and shell escaping.
|
||||||
* #863: When color output is disabled (explicitly or implicitly), don't prefix each log line with
|
* #863: When color output is disabled (explicitly or implicitly), don't prefix each log line with
|
||||||
the log level.
|
the log level.
|
||||||
|
* #866: Fix "Argument list too long" error in the "spot" check when checking hundreds of thousands
|
||||||
|
of files at once.
|
||||||
* #874: Add the configured repository label as "repository_label" to the interpolated variables
|
* #874: Add the configured repository label as "repository_label" to the interpolated variables
|
||||||
passed to before/after command hooks.
|
passed to before/after command hooks.
|
||||||
* In the "spot" check, don't try to hash symlinked directories.
|
* In the "spot" check, don't try to hash symlinked directories.
|
||||||
|
|
|
@ -387,6 +387,9 @@ def collect_spot_check_archive_paths(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_PATHS_SUBSET_COUNT = 10000
|
||||||
|
|
||||||
|
|
||||||
def compare_spot_check_hashes(
|
def compare_spot_check_hashes(
|
||||||
repository,
|
repository,
|
||||||
archive,
|
archive,
|
||||||
|
@ -419,32 +422,57 @@ def compare_spot_check_hashes(
|
||||||
f'{log_label}: Sampling {sample_count} source paths (~{spot_check_config["data_sample_percentage"]}%) for spot check'
|
f'{log_label}: Sampling {sample_count} source paths (~{spot_check_config["data_sample_percentage"]}%) for spot check'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Hash each file in the sample paths (if it exists).
|
source_sample_paths_iterator = iter(source_sample_paths)
|
||||||
hash_output = borgmatic.execute.execute_command_and_capture_output(
|
source_hashes = {}
|
||||||
(spot_check_config.get('xxh64sum_command', 'xxh64sum'),)
|
archive_hashes = {}
|
||||||
+ tuple(path for path in source_sample_paths if path in existing_source_sample_paths)
|
|
||||||
)
|
|
||||||
|
|
||||||
source_hashes = dict(
|
# Only hash a few thousand files at a time (a subset of the total paths) to avoid an "Argument
|
||||||
(reversed(line.split(' ', 1)) for line in hash_output.splitlines()),
|
# list too long" shell error.
|
||||||
**{path: '' for path in source_sample_paths if path not in existing_source_sample_paths},
|
while True:
|
||||||
)
|
# Hash each file in the sample paths (if it exists).
|
||||||
|
source_sample_paths_subset = tuple(
|
||||||
archive_hashes = dict(
|
itertools.islice(source_sample_paths_iterator, SAMPLE_PATHS_SUBSET_COUNT)
|
||||||
reversed(line.split(' ', 1))
|
)
|
||||||
for line in borgmatic.borg.list.capture_archive_listing(
|
if not source_sample_paths_subset:
|
||||||
repository['path'],
|
break
|
||||||
archive,
|
|
||||||
config,
|
hash_output = borgmatic.execute.execute_command_and_capture_output(
|
||||||
local_borg_version,
|
(spot_check_config.get('xxh64sum_command', 'xxh64sum'),)
|
||||||
global_arguments,
|
+ tuple(
|
||||||
list_paths=source_sample_paths,
|
path for path in source_sample_paths_subset if path in existing_source_sample_paths
|
||||||
path_format='{xxh64} /{path}{NL}', # noqa: FS003
|
)
|
||||||
local_path=local_path,
|
)
|
||||||
remote_path=remote_path,
|
|
||||||
|
source_hashes.update(
|
||||||
|
**dict(
|
||||||
|
(reversed(line.split(' ', 1)) for line in hash_output.splitlines()),
|
||||||
|
# Represent non-existent files as having empty hashes so the comparison below still works.
|
||||||
|
**{
|
||||||
|
path: ''
|
||||||
|
for path in source_sample_paths_subset
|
||||||
|
if path not in existing_source_sample_paths
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the hash for each file in the archive.
|
||||||
|
archive_hashes.update(
|
||||||
|
**dict(
|
||||||
|
reversed(line.split(' ', 1))
|
||||||
|
for line in borgmatic.borg.list.capture_archive_listing(
|
||||||
|
repository['path'],
|
||||||
|
archive,
|
||||||
|
config,
|
||||||
|
local_borg_version,
|
||||||
|
global_arguments,
|
||||||
|
list_paths=source_sample_paths_subset,
|
||||||
|
path_format='{xxh64} /{path}{NL}', # noqa: FS003
|
||||||
|
local_path=local_path,
|
||||||
|
remote_path=remote_path,
|
||||||
|
)
|
||||||
|
if line
|
||||||
|
)
|
||||||
)
|
)
|
||||||
if line
|
|
||||||
)
|
|
||||||
|
|
||||||
# Compare the source hashes with the archive hashes to see how many match.
|
# Compare the source hashes with the archive hashes to see how many match.
|
||||||
failing_paths = []
|
failing_paths = []
|
||||||
|
|
|
@ -770,6 +770,46 @@ def test_compare_spot_check_hashes_considers_non_existent_path_as_not_matching()
|
||||||
) == ('/bar',)
|
) == ('/bar',)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compare_spot_check_hashes_with_too_many_paths_feeds_them_to_commands_in_chunks():
|
||||||
|
flexmock(module).SAMPLE_PATHS_SUBSET_COUNT = 2
|
||||||
|
flexmock(module.random).should_receive('sample').replace_with(
|
||||||
|
lambda population, count: population[:count]
|
||||||
|
)
|
||||||
|
flexmock(module.os.path).should_receive('exists').and_return(True)
|
||||||
|
flexmock(module.borgmatic.execute).should_receive(
|
||||||
|
'execute_command_and_capture_output'
|
||||||
|
).with_args(('xxh64sum', '/foo', '/bar')).and_return('hash1 /foo\nhash2 /bar')
|
||||||
|
flexmock(module.borgmatic.execute).should_receive(
|
||||||
|
'execute_command_and_capture_output'
|
||||||
|
).with_args(('xxh64sum', '/baz', '/quux')).and_return('hash3 /baz\nhash4 /quux')
|
||||||
|
flexmock(module.borgmatic.borg.list).should_receive('capture_archive_listing').and_return(
|
||||||
|
['hash1 /foo', 'hash2 /bar']
|
||||||
|
).and_return(['hash3 /baz', 'nothash4 /quux'])
|
||||||
|
|
||||||
|
assert module.compare_spot_check_hashes(
|
||||||
|
repository={'path': 'repo'},
|
||||||
|
archive='archive',
|
||||||
|
config={
|
||||||
|
'checks': [
|
||||||
|
{
|
||||||
|
'name': 'archives',
|
||||||
|
'frequency': '2 weeks',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'name': 'spot',
|
||||||
|
'data_sample_percentage': 100,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
local_borg_version=flexmock(),
|
||||||
|
global_arguments=flexmock(),
|
||||||
|
local_path=flexmock(),
|
||||||
|
remote_path=flexmock(),
|
||||||
|
log_label='repo',
|
||||||
|
source_paths=('/foo', '/bar', '/baz', '/quux'),
|
||||||
|
) == ('/quux',)
|
||||||
|
|
||||||
|
|
||||||
def test_spot_check_without_spot_configuration_errors():
|
def test_spot_check_without_spot_configuration_errors():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
module.spot_check(
|
module.spot_check(
|
||||||
|
|
Loading…
Reference in a new issue