Commit c27f7d4a authored by Felix Seibert's avatar Felix Seibert
Browse files

implement rebalance_one_folder, refactor, test and bugfix

parent 902eb9ad
import random
import unittest
from xtreemfs_client import dataDistribution
......@@ -140,24 +141,40 @@ class TestDataDistribution(unittest.TestCase):
num_osds = 4
osd_capacities = [0]
# test for equally-sized OSDs
distribution = dataDistribution.DataDistribution()
distribution.add_osd_list(create_test_osd_list(num_osds, osd_capacities))
distribution.add_folders(create_test_folder_list(num_folders, folder_sizes))
osds = distribution.get_osd_list()
total_folder_sizes = list(map(lambda x: distribution.OSDs[x].total_folder_size, osds))
self.assertTrue(min(total_folder_sizes) == max(total_folder_sizes))
# test 1 for differently-sized OSDs
osd_capacities = [10, 20]
folder_sizes = [4, 4, 4]
distribution = dataDistribution.DataDistribution()
distribution.add_osd_list(create_test_osd_list(num_osds, osd_capacities))
distribution.add_folders(create_test_folder_list(num_folders, folder_sizes))
osds = distribution.get_osd_list()
total_folder_sizes = list(map(lambda x: distribution.OSDs[x].total_folder_size, osds))
self.assertTrue(2 * min(total_folder_sizes) == max(total_folder_sizes))
# test 2 for differently-sized OSDs. the expected result is that the 4 large OSDs receive 2 folders each,
# while the 4 small OSDs receive no files.
osd_capacities = [10, 30]
folder_sizes = [1]
num_folders = 8
distribution = dataDistribution.DataDistribution()
distribution.add_osd_list(create_test_osd_list(num_osds, osd_capacities))
distribution.add_folders(create_test_folder_list(num_folders, folder_sizes),
osd_information=create_osd_information(num_osds, osd_capacities),
ratio_parameter=osd_capacity_key)
osds = distribution.get_osd_list()
total_folder_sizes = list(map(lambda x: distribution.OSDs[x].total_folder_size, osds))
self.assertEqual(0, min(total_folder_sizes))
self.assertEqual(2, max(total_folder_sizes))
def test_average_osd_load(self):
folder_sizes = [49, 123, 1, 7]
num_folders = 2
......@@ -204,6 +221,35 @@ class TestDataDistribution(unittest.TestCase):
# we should obtain a perfectly balanced distribution
self.assertEqual(min(total_folder_sizes), max(total_folder_sizes))
def test_rebalance_one_folder(self):
    """
    rebalance_one_folder is applied to a random initial assignment, once without OSD capacities and
    once with two different OSD capacities.
    """
    num_osds = 4
    num_folders = 8
    folder_sizes = [1]

    # scenario 1: a single capacity value, no osd_information is handed to rebalance_one_folder
    osd_capacities = [10]
    distribution = dataDistribution.DataDistribution()
    distribution.add_osd_list(create_test_osd_list(num_osds, osd_capacities))
    distribution.add_folders(create_test_folder_list(num_folders, folder_sizes), random_osd_assignment=True)
    distribution.rebalance_one_folder()

    sizes_per_osd = [distribution.OSDs[osd_uuid].total_folder_size
                     for osd_uuid in distribution.get_osd_list()]
    # we should obtain a perfectly balanced distribution
    self.assertEqual(min(sizes_per_osd), max(sizes_per_osd))

    # scenario 2: two capacity values, which are passed on to rebalance_one_folder
    osd_capacities = [10, 30]
    distribution = dataDistribution.DataDistribution()
    distribution.add_osd_list(create_test_osd_list(num_osds, osd_capacities))
    distribution.add_folders(create_test_folder_list(num_folders, folder_sizes), random_osd_assignment=True)
    distribution.rebalance_one_folder(osd_information=create_osd_information(num_osds, osd_capacities),
                                      capacity=osd_capacity_key)

    sizes_per_osd = [distribution.OSDs[osd_uuid].total_folder_size
                     for osd_uuid in distribution.get_osd_list()]
    # all folders should now be on the 'large' OSDs
    self.assertEqual(0, min(sizes_per_osd))
    self.assertEqual(2, max(sizes_per_osd))
def create_test_osd_list(num_osds, osd_capacities):
test_osds = []
......@@ -219,6 +265,7 @@ def create_test_folder_list(num_folders, folder_sizes):
for folder_size in folder_sizes:
new_folder = folder.Folder(folder_id_prefix + "_" + str(folder_size) + "_" + str(i), folder_size, None)
test_folders.append(new_folder)
random.shuffle(test_folders)
return test_folders
......
......@@ -11,8 +11,12 @@ class DataDistribution(object):
this class also allows to calculate several data distributions, e.g., mappings from folders to OSDs (each folder
gets mapped to one OSD).
the load of an OSD is defined as its total_folder_size divided by its capacity.
"""
# TODO introduce consistent handling of (missing) OSD capacities / osd_information
def __init__(self):
self.OSDs = {}
......@@ -98,6 +102,21 @@ class DataDistribution(object):
total_osd_capacity += osd_information[osd_uuid][capacity]
return total_folder_size / total_osd_capacity
def get_maximum_osd_load(self, osd_information, capacity):
    """
    find the OSD with the highest load, where the load of an OSD is its total_folder_size divided by
    osd_information[uuid][capacity].

    returns the tuple (OSD with the highest load, its load). if several OSDs share the highest load, the
    first one in iteration order of self.OSDs is returned. without any OSDs the result is (None, 0).
    """
    assert osd_information is not None
    assert capacity != ''

    heaviest_osd = None
    heaviest_load = 0
    for candidate in self.OSDs.values():
        candidate_load = candidate.total_folder_size / osd_information[candidate.uuid][capacity]
        # the first OSD is always taken; afterwards only a strictly higher load replaces the current pick
        if heaviest_osd is not None and not candidate_load > heaviest_load:
            continue
        heaviest_osd, heaviest_load = candidate, candidate_load

    return heaviest_osd, heaviest_load
def get_average_total_folder_size(self):
"""
calculate the average total_folder_size of the OSDs.
......@@ -204,39 +223,34 @@ class DataDistribution(object):
# (following largest processing time first, also called post-greedy approach)
list.sort(new_folders, key=lambda x: x.size, reverse=True)
osd_ratios = {}
# ratios are given - use them to assign proportionally
if osd_information is not None and ratio_parameter != '':
total_osd_size = 0
for osd_size in osd_information.values():
total_osd_size += osd_size[ratio_parameter]
for osd_uuid, osd_size in osd_information.items():
osd_ratios[osd_uuid] = float(osd_size[ratio_parameter]) / float(total_osd_size)
# no ratios are given - assume that all OSDs have same capacity
else:
for osd_uuid in self.OSDs.keys():
osd_ratios[osd_uuid] = float(1)
# if osd_information is None, use the fake osd_information, which assumes that all OSDs have the same capacity
# otherwise use the given osd_information
if osd_information is None:
ratio_parameter = 'dummy_value'
osd_information = self.get_equal_sized_fake_osd_information(ratio_parameter)
# for each folder calculate the best OSD and add it to it
for a_folder in new_folders:
least_used_osd = None
for one_osd in self.OSDs.values():
if (least_used_osd is None) or \
one_osd.total_folder_size / osd_ratios[one_osd.uuid] \
<= least_used_osd.total_folder_size / osd_ratios[least_used_osd.uuid]:
least_used_osd = one_osd
least_used_osd, _ = self.get_lpt_osd(osd_information, ratio_parameter, a_folder.size)
least_used_osd.add_folder(a_folder.id, a_folder.size)
osds_for_new_folders.append((a_folder.id,
least_used_osd.uuid))
return osds_for_new_folders
def rebalance_lpt(self, rebalance_factor=1, osd_information=None, capacity=''):
"""
rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
1. 'unroll' the assignment. this means that, for each OSD, folders are removed until the OSD has less
total_folder_size than the average total folder size of this distribution multiplied by rebalance_factor.
2. reassign the removed folders using the LPT strategy.
"""
# TODO maybe osd_information should be taken into consideration during the unrolling?
movements = {}
folders_to_be_reassigned = []
reassignment_limit = self.get_average_total_folder_size() * rebalance_factor
# for each OSD, remove the smallest folder until its total_folder_size does not exceed the reassignment_limit
# unrolling
for osd in self.OSDs.values():
while osd.total_folder_size > reassignment_limit:
folder_id, folder_size = osd.get_smallest_folder()
......@@ -244,6 +258,7 @@ class DataDistribution(object):
movements[folder_id] = osd.uuid
osd.remove_folder(folder_id)
# reassignment
new_assignments = self.add_folders(folders_to_be_reassigned,
osd_information=osd_information, ratio_parameter=capacity)
......@@ -252,6 +267,75 @@ class DataDistribution(object):
return movements
def rebalance_one_folder(self, osd_information=None, capacity=''):
    """
    rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
        1. find OSD with the highest load
        2. get folder with smallest size on this OSD
        3. find new OSD for this folder using get_lpt_osd
        4. if the load on the new OSD is lower than on the original OSD, move the folder to the new OSD.
           otherwise, return.
    one open question is whether getting the folder with smallest size in step 2 is a clever choice
    (in principle, all folders of the OSD with the highest load are eligible).

    this optimization scheme classifies as local search. two distributions are neighbors if one can be transformed
    into the other by moving one folder from one OSD to another. note, however, that we do not search the whole
    neighborhood of a distribution.
    but it might be possible to show that if there is no improvement step of the type that we check for,
    there is no improvement step at all.

    osd_information maps OSD uuids to dictionaries, and capacity is the key under which the capacity of an
    OSD is stored in its dictionary. if osd_information is None, all OSDs are treated as equally sized.

    returns a dictionary that maps the id of each moved folder to the tuple (origin OSD uuid, target OSD uuid).
    a folder that is moved several times is reported once, with the OSD it was on when this method was called
    as origin and the OSD it finally ends up on as target. a folder that ends up on its initial OSD again is
    not reported.
    """
    if osd_information is None:
        # no capacities known: pretend that all OSDs have the same capacity
        capacity = 'capacity'
        osd_information = self.get_equal_sized_fake_osd_information(capacity)

    movements = {}

    while True:
        # find OSD with the highest load (origin)
        origin_osd, maximum_load = self.get_maximum_osd_load(osd_information, capacity)

        # without OSDs, or if the origin OSD holds no folder, there is nothing that could be moved
        if origin_osd is None or not origin_osd.folders:
            break

        # pick a folder of this OSD
        # there are several ways to pick a folder (like largest, smallest, constrained by the resulting load of the
        # origin OSD, random...), it is not clear which way is a good way
        # for now pick the smallest folder on origin OSD
        smallest_folder_id, smallest_folder_size = origin_osd.get_smallest_folder()

        # find other OSD best suited for the picked folder (target)
        best_osd, best_osd_load = self.get_lpt_osd(osd_information, capacity, smallest_folder_size)

        # only move if the target, including the folder, stays below the current maximum load (makespan)
        if not best_osd_load < maximum_load:
            break

        self.assign_new_osd(smallest_folder_id, best_osd.uuid)

        # keep the very first origin of a folder, such that the entry describes the net movement
        first_origin_uuid = movements.get(smallest_folder_id, (origin_osd.uuid,))[0]
        if first_origin_uuid == best_osd.uuid:
            # the folder is back where it started, so there is no net movement to report
            movements.pop(smallest_folder_id, None)
        else:
            movements[smallest_folder_id] = (first_origin_uuid, best_osd.uuid)

    return movements
def get_lpt_osd(self, osd_information, ratio_parameter, folder_size):
    """
    determine the OSD that would have the smallest load if a folder of size folder_size was added to it.
    the load of an OSD is (total_folder_size + folder_size) divided by osd_information[uuid][ratio_parameter].

    returns the tuple (OSD with the smallest such load, the smallest load). ties are resolved in favor of the
    first OSD in iteration order of self.OSDs. without any OSDs the result is (None, 1).
    """
    chosen_osd = None
    chosen_load = 1

    for candidate in self.OSDs.values():
        candidate_capacity = osd_information[candidate.uuid][ratio_parameter]
        candidate_load = (candidate.total_folder_size + folder_size) / candidate_capacity
        # the first OSD is always taken; afterwards only a strictly smaller load replaces the current pick
        if chosen_osd is not None and not candidate_load < chosen_load:
            continue
        chosen_osd, chosen_load = candidate, candidate_load

    return chosen_osd, chosen_load
def create_osd_ratios(self, osd_information, ratio_parameter):
    """
    calculate the share of each OSD in the sum of all osd_information[uuid][ratio_parameter] values.

    returns a dictionary that maps each OSD uuid of osd_information to its value divided by the sum of
    the values of all OSDs.
    """
    total_osd_size = sum(entry[ratio_parameter] for entry in osd_information.values())
    denominator = float(total_osd_size)
    return {osd_uuid: float(entry[ratio_parameter]) / denominator
            for osd_uuid, entry in osd_information.items()}
def update_folder(self, folder, size):
"""
updates the size of a given folder
......@@ -259,6 +343,14 @@ class DataDistribution(object):
for one_osd in self.OSDs.values():
if folder in one_osd.folders.keys():
one_osd.update_folder(folder, size)
break
def get_equal_sized_fake_osd_information(self, capacity):
    """
    build an osd_information dictionary in which every OSD of self.get_osd_list() has the value 1 stored
    under the key capacity, i.e., all OSDs are treated as equally sized.
    """
    # a fresh inner dictionary per OSD, such that the entries do not share state
    return {osd_uuid: {capacity: 1} for osd_uuid in self.get_osd_list()}
def description(self):
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment