Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Felix Seibert
xtreemfs_client
Commits
c27f7d4a
Commit
c27f7d4a
authored
Apr 18, 2018
by
Felix Seibert
Browse files
implement rebalance_one_folder, refactor, test and bugfix
parent
902eb9ad
Changes
2
Hide whitespace changes
Inline
Side-by-side
tests/test_dataDistribution.py
View file @
c27f7d4a
import
random
import
unittest
from
xtreemfs_client
import
dataDistribution
...
...
@@ -140,24 +141,40 @@ class TestDataDistribution(unittest.TestCase):
num_osds
=
4
osd_capacities
=
[
0
]
# test for equally-sized OSDs
distribution
=
dataDistribution
.
DataDistribution
()
distribution
.
add_osd_list
(
create_test_osd_list
(
num_osds
,
osd_capacities
))
distribution
.
add_folders
(
create_test_folder_list
(
num_folders
,
folder_sizes
))
osds
=
distribution
.
get_osd_list
()
total_folder_sizes
=
list
(
map
(
lambda
x
:
distribution
.
OSDs
[
x
].
total_folder_size
,
osds
))
self
.
assertTrue
(
min
(
total_folder_sizes
)
==
max
(
total_folder_sizes
))
# test 1 for differently-sized OSDs
osd_capacities
=
[
10
,
20
]
folder_sizes
=
[
4
,
4
,
4
]
distribution
=
dataDistribution
.
DataDistribution
()
distribution
.
add_osd_list
(
create_test_osd_list
(
num_osds
,
osd_capacities
))
distribution
.
add_folders
(
create_test_folder_list
(
num_folders
,
folder_sizes
))
osds
=
distribution
.
get_osd_list
()
total_folder_sizes
=
list
(
map
(
lambda
x
:
distribution
.
OSDs
[
x
].
total_folder_size
,
osds
))
self
.
assertTrue
(
2
*
min
(
total_folder_sizes
)
==
max
(
total_folder_sizes
))
# test 2 for differently-sized OSDs. the expected result is that the 4 large OSD receive 2 files each,
# while the 4 small OSDs receive no files.
osd_capacities
=
[
10
,
30
]
folder_sizes
=
[
1
]
num_folders
=
8
distribution
=
dataDistribution
.
DataDistribution
()
distribution
.
add_osd_list
(
create_test_osd_list
(
num_osds
,
osd_capacities
))
distribution
.
add_folders
(
create_test_folder_list
(
num_folders
,
folder_sizes
),
osd_information
=
create_osd_information
(
num_osds
,
osd_capacities
),
ratio_parameter
=
osd_capacity_key
)
osds
=
distribution
.
get_osd_list
()
total_folder_sizes
=
list
(
map
(
lambda
x
:
distribution
.
OSDs
[
x
].
total_folder_size
,
osds
))
self
.
assertEqual
(
0
,
min
(
total_folder_sizes
))
self
.
assertEqual
(
2
,
max
(
total_folder_sizes
))
def
test_average_osd_load
(
self
):
folder_sizes
=
[
49
,
123
,
1
,
7
]
num_folders
=
2
...
...
@@ -204,6 +221,35 @@ class TestDataDistribution(unittest.TestCase):
# we should obtain a perfectly balanced distribution
self
.
assertEqual
(
min
(
total_folder_sizes
),
max
(
total_folder_sizes
))
def test_rebalance_one_folder(self):
    """
    check rebalance_one_folder on randomly assigned folders, once without and once with OSD capacity information.
    """
    def folder_size_per_osd(a_distribution):
        # total_folder_size of every OSD, in the order given by get_osd_list()
        return [a_distribution.OSDs[osd_uuid].total_folder_size for osd_uuid in a_distribution.get_osd_list()]

    num_osds = 4
    num_folders = 8
    folder_sizes = [1]

    # scenario 1: a single capacity value and no osd_information passed to rebalance_one_folder
    equal_capacities = [10]
    equal_distribution = dataDistribution.DataDistribution()
    equal_distribution.add_osd_list(create_test_osd_list(num_osds, equal_capacities))
    equal_distribution.add_folders(create_test_folder_list(num_folders, folder_sizes),
                                   random_osd_assignment=True)
    equal_distribution.rebalance_one_folder()

    sizes = folder_size_per_osd(equal_distribution)
    # we should obtain a perfectly balanced distribution
    self.assertEqual(min(sizes), max(sizes))

    # scenario 2: two capacity values (10 and 30), capacities are passed to rebalance_one_folder
    # NOTE(review): presumably create_test_osd_list creates num_osds OSDs per capacity value - confirm
    mixed_capacities = [10, 30]
    mixed_distribution = dataDistribution.DataDistribution()
    mixed_distribution.add_osd_list(create_test_osd_list(num_osds, mixed_capacities))
    mixed_distribution.add_folders(create_test_folder_list(num_folders, folder_sizes),
                                   random_osd_assignment=True)
    mixed_distribution.rebalance_one_folder(osd_information=create_osd_information(num_osds, mixed_capacities),
                                            capacity=osd_capacity_key)

    sizes = folder_size_per_osd(mixed_distribution)
    # all folders should now be on the 'large' OSDs
    self.assertEqual(0, min(sizes))
    self.assertEqual(2, max(sizes))
def
create_test_osd_list
(
num_osds
,
osd_capacities
):
test_osds
=
[]
...
...
@@ -219,6 +265,7 @@ def create_test_folder_list(num_folders, folder_sizes):
for
folder_size
in
folder_sizes
:
new_folder
=
folder
.
Folder
(
folder_id_prefix
+
"_"
+
str
(
folder_size
)
+
"_"
+
str
(
i
),
folder_size
,
None
)
test_folders
.
append
(
new_folder
)
random
.
shuffle
(
test_folders
)
return
test_folders
...
...
xtreemfs_client/dataDistribution.py
View file @
c27f7d4a
...
...
@@ -11,8 +11,12 @@ class DataDistribution(object):
this class can also calculate several data distributions, e.g., mappings from folders to OSDs (each folder
gets mapped to one OSD).
the load of an OSD is defined as its total_folder_size divided by its capacity.
"""
# TODO introduce consistent handling of (missing) OSD capacities / osd_information
def
__init__
(
self
):
self
.
OSDs
=
{}
...
...
@@ -98,6 +102,21 @@ class DataDistribution(object):
total_osd_capacity
+=
osd_information
[
osd_uuid
][
capacity
]
return
total_folder_size
/
total_osd_capacity
def get_maximum_osd_load(self, osd_information, capacity):
    """
    calculate the maximum OSD load.

    the load of an OSD is its total_folder_size divided by osd_information[osd.uuid][capacity].

    osd_information: dict that maps each OSD uuid to a dict of OSD properties. must not be None.
    capacity: key under which the capacity of an OSD is stored in its osd_information entry. must not be ''.

    returns the tuple (OSD with the highest load, its load). if several OSDs share the highest load, the one that
    comes first when iterating over self.OSDs is returned. if there are no OSDs, (None, 0) is returned.

    raises ValueError if osd_information is None or capacity is ''.
    """
    # explicit checks instead of assert statements, so the validation is not stripped when python runs with -O
    if osd_information is None:
        raise ValueError("osd_information must not be None")
    if capacity == '':
        raise ValueError("capacity must not be the empty string")

    if not self.OSDs:
        return None, 0

    osds_with_load = [(osd, osd.total_folder_size / osd_information[osd.uuid][capacity])
                      for osd in self.OSDs.values()]
    # max() returns the first maximal element, which keeps the first OSD on ties
    return max(osds_with_load, key=lambda osd_and_load: osd_and_load[1])
def
get_average_total_folder_size
(
self
):
"""
calculate the average total_folder_size of the OSDs.
...
...
@@ -204,39 +223,34 @@ class DataDistribution(object):
# (following largest processing time first, also called post-greedy approach)
list
.
sort
(
new_folders
,
key
=
lambda
x
:
x
.
size
,
reverse
=
True
)
osd_ratios
=
{}
# ratios are given - use them to assign proportionally
if
osd_information
is
not
None
and
ratio_parameter
!=
''
:
total_osd_size
=
0
for
osd_size
in
osd_information
.
values
():
total_osd_size
+=
osd_size
[
ratio_parameter
]
for
osd_uuid
,
osd_size
in
osd_information
.
items
():
osd_ratios
[
osd_uuid
]
=
float
(
osd_size
[
ratio_parameter
])
/
float
(
total_osd_size
)
# no ratios are given - assume that all OSDs have same capacity
else
:
for
osd_uuid
in
self
.
OSDs
.
keys
():
osd_ratios
[
osd_uuid
]
=
float
(
1
)
# if osd_information is None, use the fake osd_information, which assumes that all OSDs have the same capacity
# otherwise use the given osd_information
if
osd_information
is
None
:
ratio_parameter
=
'dummy_value'
osd_information
=
self
.
get_equal_sized_fake_osd_information
(
ratio_parameter
)
# for each folder calculate the best OSD and add it to it
for
a_folder
in
new_folders
:
least_used_osd
=
None
for
one_osd
in
self
.
OSDs
.
values
():
if
(
least_used_osd
is
None
)
or
\
one_osd
.
total_folder_size
/
osd_ratios
[
one_osd
.
uuid
]
\
<=
least_used_osd
.
total_folder_size
/
osd_ratios
[
least_used_osd
.
uuid
]:
least_used_osd
=
one_osd
least_used_osd
,
_
=
self
.
get_lpt_osd
(
osd_information
,
ratio_parameter
,
a_folder
.
size
)
least_used_osd
.
add_folder
(
a_folder
.
id
,
a_folder
.
size
)
osds_for_new_folders
.
append
((
a_folder
.
id
,
least_used_osd
.
uuid
))
return
osds_for_new_folders
def
rebalance_lpt
(
self
,
rebalance_factor
=
1
,
osd_information
=
None
,
capacity
=
''
):
"""
rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
1. 'unroll' the assignment. this means that, for each OSD, folders are removed until the OSD has less
total_folder_size than the average total folder size of this distribution multiplied by rebalance_factor.
2. reassign the removed folders using the LPT strategy.
"""
# TODO maybe osd_information should be taken into consideration during the unrolling?
movements
=
{}
folders_to_be_reassigned
=
[]
reassignment_limit
=
self
.
get_average_total_folder_size
()
*
rebalance_factor
# for each OSD, remove the smallest folder until its total_folder_size does not exceed the reassignment_limit
# unrolling
for
osd
in
self
.
OSDs
.
values
():
while
osd
.
total_folder_size
>
reassignment_limit
:
folder_id
,
folder_size
=
osd
.
get_smallest_folder
()
...
...
@@ -244,6 +258,7 @@ class DataDistribution(object):
movements
[
folder_id
]
=
osd
.
uuid
osd
.
remove_folder
(
folder_id
)
# reassignment
new_assignments
=
self
.
add_folders
(
folders_to_be_reassigned
,
osd_information
=
osd_information
,
ratio_parameter
=
capacity
)
...
...
@@ -252,6 +267,75 @@ class DataDistribution(object):
return
movements
def rebalance_one_folder(self, osd_information=None, capacity=''):
    """
    rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
    1. find OSD with the highest load
    2. get folder with smallest size on this OSD
    3. find new OSD for this folder using get_lpt_osd
    4. if the load on the new OSD is lower than on the original OSD, move the folder to the new OSD.
    otherwise, return.
    one open question is whether getting the folder with smallest size in step 2 is a clever choice
    (in principle, all folders of the OSD with the highest load are eligible).

    this optimization scheme classifies as local search. two distributions are neighbors if one can be transformed
    into the other by moving one folder from one OSD to another. note, however, that we do not search the whole
    neighborhood of a distribution.
    but it might be possible to show that if there is no improvement step of the type that we check for,
    there is no improvement step at all.

    osd_information: dict that maps each OSD uuid to a dict of OSD properties. if None, all OSDs are treated as
    having the same capacity (and the given capacity argument is ignored).
    capacity: key under which the capacity of an OSD is stored in its osd_information entry.

    returns a dict that maps each moved folder id to the tuple (uuid of the OSD the folder was on when this method
    was called, uuid of the OSD the folder is on now). a folder that was moved several times appears once, with its
    initial origin; a folder that ended up on its initial OSD again does not appear at all.
    """
    if osd_information is None:
        capacity = 'capacity'
        osd_information = self.get_equal_sized_fake_osd_information(capacity)

    movements = {}

    while True:
        # find OSD with the highest load (origin)
        origin_osd, maximum_load = self.get_maximum_osd_load(osd_information, capacity)

        # no OSDs at all, or no OSD carries any load: there is nothing that a move could improve.
        # this also avoids asking an OSD without folders for its smallest folder
        # (get_smallest_folder is not visible here; its behavior on an empty OSD is unknown).
        if origin_osd is None or maximum_load <= 0:
            break

        # pick a folder of this OSD
        # there are several ways to pick a folder (like largest, smallest, constrained by the resulting load of the
        # origin OSD, random...), it is not clear which way is a good way
        # for now pick the smallest folder on origin OSD
        smallest_folder_id, smallest_folder_size = self.OSDs[origin_osd.uuid].get_smallest_folder()

        # find other OSD best suited for the picked folder (target)
        # check whether moving folder from origin to target decreases the maximum load of all OSDs (makespan).
        best_osd, best_osd_load = self.get_lpt_osd(osd_information, capacity, smallest_folder_size)

        if best_osd_load < maximum_load:
            self.assign_new_osd(smallest_folder_id, best_osd.uuid)

            # a folder can be picked again in a later iteration. keep the OSD it started on as the origin, so
            # that the reported movement still describes where the folder has to be fetched from.
            if smallest_folder_id in movements:
                initial_osd_uuid = movements[smallest_folder_id][0]
            else:
                initial_osd_uuid = origin_osd.uuid

            if initial_osd_uuid == best_osd.uuid:
                # the folder is back where it started, so effectively it did not move
                del movements[smallest_folder_id]
            else:
                movements[smallest_folder_id] = (initial_osd_uuid, best_osd.uuid)
        else:
            break

    return movements
def get_lpt_osd(self, osd_information, ratio_parameter, folder_size):
    """
    calculate the load that each OSD would have if a folder of size folder_size was added to it, i.e.,
    (total_folder_size + folder_size) / osd_information[osd.uuid][ratio_parameter].
    return (OSD with the smallest such value, the smallest value)

    if several OSDs share the smallest value, the one that comes first when iterating over self.OSDs is returned.
    if there are no OSDs, (None, float('inf')) is returned, so that comparing the returned load against any
    existing load never suggests that a (non-existing) better OSD was found.

    osd_information: dict that maps each OSD uuid to a dict of OSD properties.
    ratio_parameter: key under which the capacity of an OSD is stored in its osd_information entry.
    folder_size: size of the folder for which an OSD is searched.
    """
    if not self.OSDs:
        return None, float('inf')

    osds_with_load = [(one_osd,
                       (one_osd.total_folder_size + folder_size) / osd_information[one_osd.uuid][ratio_parameter])
                      for one_osd in self.OSDs.values()]
    # min() returns the first minimal element, which keeps the first OSD on ties
    return min(osds_with_load, key=lambda osd_and_load: osd_and_load[1])
def create_osd_ratios(self, osd_information, ratio_parameter):
    """
    calculate, for each OSD in osd_information, its share of the total of all ratio_parameter values.

    osd_information: dict that maps each OSD uuid to a dict of OSD properties.
    ratio_parameter: key of the property that is used to calculate the ratios.

    returns a dict that maps each OSD uuid to float(value of the OSD) / float(sum of all values).
    for an empty osd_information an empty dict is returned.
    raises ZeroDivisionError if osd_information is not empty and the values sum up to 0.
    """
    total_osd_size = float(sum(osd_size[ratio_parameter] for osd_size in osd_information.values()))
    return {osd_uuid: float(osd_size[ratio_parameter]) / total_osd_size
            for osd_uuid, osd_size in osd_information.items()}
def
update_folder
(
self
,
folder
,
size
):
"""
updates the size of a given folder
...
...
@@ -259,6 +343,14 @@ class DataDistribution(object):
for
one_osd
in
self
.
OSDs
.
values
():
if
folder
in
one_osd
.
folders
.
keys
():
one_osd
.
update_folder
(
folder
,
size
)
break
def get_equal_sized_fake_osd_information(self, capacity):
    """
    create an osd_information dict in which every OSD returned by get_osd_list() has the same capacity.

    capacity: key under which the (fake) capacity is stored for each OSD.

    returns a dict that maps each OSD uuid to the dict {capacity: 1}. each OSD gets its own inner dict.
    """
    return {osd_uuid: {capacity: 1} for osd_uuid in self.get_osd_list()}
def
description
(
self
):
"""
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment