Skip to content

Commit f1b7b7e

Browse files
committed
fix replication repair files alignment in deep trees
Signed-off-by: Stephen L. <lrq3000@gmail.com>
1 parent a33cf30 commit f1b7b7e

File tree

4 files changed

+52
-19
lines changed

4 files changed

+52
-19
lines changed

pyFileFixity/lib/aux_funcs.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,17 @@ def sizeof_fmt(num, suffix='B', mod=1024.0):
6969
num /= mod
7070
return "%.1f%s%s" % (num, 'Y', suffix)
7171

72-
def path2unix(path, fromwinpath=False):
72+
def path2unix(path, nojoin=False, fromwinpath=False):
7373
'''From a path given in any format, converts to posix path format
7474
nojoin=True returns the path as a list of its parts instead of a single joined posix string; fromwinpath=True forces the input path to be recognized as a Windows path (useful on Unix machines to unit test Windows paths)'''
7575
if fromwinpath:
76-
return posixpath.join(*list(PureWindowsPath(path).parts))
76+
pathparts = list(PureWindowsPath(path).parts)
7777
else:
78-
return posixpath.join(*list(PurePath(path).parts))
78+
pathparts = list(PurePath(path).parts)
79+
if nojoin:
80+
return pathparts
81+
else:
82+
return posixpath.join(*pathparts)
7983

8084
def get_next_entry(file, entrymarker="\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF\xFE\xFF", only_coord=True, blocksize=65535):
8185
'''Find or read the next ecc entry in a given ecc file.

pyFileFixity/replication_repair.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,27 @@
6161

6262
def relpath_posix(recwalk_result, pardir, fromwinpath=False):
6363
''' Helper function to convert all paths to relative, posix-like paths split into a list of path parts (to ease comparison) '''
64-
return recwalk_result[0], os.path.split(path2unix(os.path.join(os.path.relpath(recwalk_result[0], pardir),recwalk_result[1]), fromwinpath=fromwinpath))
64+
return recwalk_result[0], path2unix(os.path.join(os.path.relpath(recwalk_result[0], pardir),recwalk_result[1]), nojoin=True, fromwinpath=fromwinpath)
6565

6666
#def checkAllEqual(lst):
6767
# return not lst or [lst[0]]*len(lst) == lst
6868

69+
def sort_dict_of_paths(d):
    """ Sort a dict of paths parts (ie, paths divided in parts and stored as a list). Top paths will be given precedence over deeper paths.

    d maps any key to a list of path parts (eg, ['sub', 'file.ext']), or to None/an empty list when there is no path for this key.
    Note: d is modified in-place, every non-empty path is left-padded with empty parts up to the depth of the deepest path.

    Returns a list of (key, padded_parts) couples sorted by path, entries without a path (None) coming first. An empty dict returns an empty list. """
    # Nothing to sort (and max() would raise a ValueError on an empty sequence)
    if not d:
        return []
    # Find the path that is the deepest, and count the number of parts
    max_rec = max(len(parts) if parts else 0 for parts in d.values())
    # Pad other paths with empty parts to fill in, so that all paths will have the same number of parts
    # (necessary to compare correctly, else deeper paths may get precedence over top ones, since the folder name will be compared to filenames!)
    for key, parts in list(d.items()):
        if parts:
            d[key] = [''] * (max_rec - len(parts)) + list(parts)
    # Sort relatively to the paths alphabetical order. None cannot be compared to a list on Python 3,
    # so the key explicitly places entries without a path first (which is what Python 2 did implicitly).
    return sorted(d.items(), key=lambda item: (item[1] is not None, item[1] if item[1] is not None else []))
80+
6981
def sort_group(d, return_only_first=False):
7082
''' Sort a dictionary of relative paths and cluster equal paths together at the same time '''
7183
# First, sort the paths in order (each path must be a list of path parts rather than a single string, so that there is no ambiguity: sort_dict_of_paths pads the shallower paths so that a file at root is ordered before the content of a folder, instead of the ordering being purely alphabetical without any notion of tree structure).
72-
d_sort = sorted(d.items(), key=lambda x: x[1])
84+
d_sort = sort_dict_of_paths(d)
7385
# Pop the first item in the ordered list
7486
base_elt = (-1, None)
7587
while (base_elt[1] is None and d_sort):
@@ -219,7 +231,9 @@ def majority_vote_byte_scan(relfilepath, fileslist, outpath, blocksize=65535, de
219231

220232
def synchronize_files(inputpaths, outpath, database=None, tqdm_bar=None, report_file=None, ptee=None, verbose=False):
221233
''' Main function to synchronize files contents by majority vote
222-
The main job of this function is to walk through the input folders and align the files, so that we can compare every files across every folders, one by one.'''
234+
The main job of this function is to walk through the input folders and align the files, so that we can compare each file across all the folders, one by one.
235+
The whole trick here is to align files, so that we don't need to hold all the files in memory and we compare all equivalent files together: to do that, we ensure that we walk through the input directories in alphabetical order, and we pick the relative filepath at the top of the alphabetical order; this ensures the alignment of files between different folders, without memorizing the whole tree structures.
236+
'''
223237
# (Generator) Files Synchronization Algorithm:
224238
# Needs a function stable_dir_walking, which will walk through directories recursively but in always the same order on all platforms (same order for files but also for folders), whatever order it is, as long as it is stable.
225239
# Until there's no file in any of the input folders to be processed:

pyFileFixity/tests/test_aux_funcs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def test_sizeof_fmt():
5252

5353
def test_path2unix():
5454
""" aux: test path2unix """
55-
assert auxf.path2unix(r'test\some\folder\file.ext', True) == r'test/some/folder/file.ext'
55+
assert auxf.path2unix(r'test\some\folder\file.ext', fromwinpath=True) == r'test/some/folder/file.ext'
56+
assert auxf.path2unix(r'test\some\folder\file.ext', nojoin=True, fromwinpath=True) == ['test', 'some', 'folder', 'file.ext']
5657
assert auxf.path2unix(r'test/some/folder/file.ext') == r'test/some/folder/file.ext'
5758

5859
def test_is_file():

pyFileFixity/tests/test_replication_repair.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,30 +27,44 @@ def test_relpath_posix():
2727
""" repli: test internal: relpath_posix()"""
2828
recwalk_result = [r'C:\test\some\path', r'relative\path\file.ext']
2929
pardir = r'C:\test\some'
30-
assert rep.relpath_posix(recwalk_result, pardir, True) == ('C:\\test\\some\\path', ('path/relative/path', 'file.ext'))
30+
print(rep.relpath_posix(recwalk_result, pardir, True))
31+
assert rep.relpath_posix(recwalk_result, pardir, True) == ('C:\\test\\some\\path', ['path', 'relative', 'path', 'file.ext'])
3132
recwalk_result = [r'/test/some/path', r'relative/path/file.ext']
3233
pardir = r'/test/some'
33-
assert rep.relpath_posix(recwalk_result, pardir, False) == ('/test/some/path', ('path/relative/path', 'file.ext'))
34+
assert rep.relpath_posix(recwalk_result, pardir, False) == ('/test/some/path', ['path', 'relative', 'path', 'file.ext'])
3435
recwalk_result = [r'/test/some/path', r'relative\path\file.ext']
3536
pardir = r'/test/some'
36-
assert rep.relpath_posix(recwalk_result, pardir, True) == ('/test/some/path', ('path/relative/path', 'file.ext'))
37+
assert rep.relpath_posix(recwalk_result, pardir, True) == ('/test/some/path', ['path', 'relative', 'path', 'file.ext'])
38+
39+
def test_sort_dict_of_paths():
40+
d = {0: ['testoo.TXT'], 1: ['testoo.TXT'], 2: ['testbb-more.TXT'], 3: ['sub', 'testsub.TXT']}
41+
d_sort = rep.sort_dict_of_paths(d)
42+
assert d_sort == [(2, ['', 'testbb-more.TXT']),
43+
(0, ['', 'testoo.TXT']),
44+
(1, ['', 'testoo.TXT']),
45+
(3, ['sub', 'testsub.TXT'])]
3746

3847
def test_sort_group():
3948
""" repli: test internal: sort_group()"""
49+
# Generate an artificial tree, with some relative paths being the same across multiple folders,
50+
# and some others at different tree depths (very important to check that it works OK!)
4051
curfiles = {
41-
0: ('relative/path', 'file.ext'),
42-
1: ('relative/path', 'file.ext'),
43-
2: ('relative/path', 'zzzz.ext'),
44-
3: ('relative/path', 'zzzz.ext'),
45-
4: ('relative/path', 'bbbb.ext'),
52+
0: ['relative', 'path', 'file.ext'],
53+
1: ['relative', 'path', 'file.ext'],
54+
2: ['relative', 'aaa', 'zzzz.ext'],
55+
3: ['zzzz.ext'],
56+
4: ['relative', 'zzzz.ext'],
57+
5: ['relative', 'path', 'bbbb.ext'],
4658
}
4759
assert rep.sort_group(curfiles, return_only_first=False) == \
4860
[
49-
[(4, ('relative/path', 'bbbb.ext'))],
50-
[(0, ('relative/path', 'file.ext')), (1, ('relative/path', 'file.ext'))],
51-
[(2, ('relative/path', 'zzzz.ext')), (3, ('relative/path', 'zzzz.ext'))]
61+
[(3, ['', '', 'zzzz.ext'])],
62+
[(4, ['', 'relative', 'zzzz.ext'])],
63+
[(2, ['relative', 'aaa', 'zzzz.ext'])],
64+
[(5, ['relative', 'path', 'bbbb.ext'])],
65+
[(0, ['relative', 'path', 'file.ext']), (1, ['relative', 'path', 'file.ext'])]
5266
]
53-
assert rep.sort_group(curfiles, return_only_first=True) == [[(4, ('relative/path', 'bbbb.ext'))]]
67+
assert rep.sort_group(curfiles, return_only_first=True) == [[(3, ['', '', 'zzzz.ext'])]]
5468

5569
def test_majority_vote_byte_scan():
5670
""" repli: test internal: majority_vote_byte_scan()"""

0 commit comments

Comments
 (0)