dataDistribution.py 16.1 KB
Newer Older
1
2
import random

Felix Seibert's avatar
Felix Seibert committed
3
from xtreemfs_client import osd
4
from xtreemfs_client import folder
5

6

Felix Seibert's avatar
Felix Seibert committed
7
8
9
10
class DataDistribution(object):
    """
    class to keep track of the osd (object storage device) locations of different folders, i.e.,
    their physical location.
11

Felix Seibert's avatar
Felix Seibert committed
12
13
    this class also allows to calculate several data distributions, e.g., mappings from folders to OSDs (each folder
    gets mapped to one OSD).
14
15

    the load is defined as the quotient from the total_folder_size of an OSD divided by its capacity.
Felix Seibert's avatar
Felix Seibert committed
16
    """
17

18
19
    # TODO introduce consistent handling of (missing) OSD capacities / osd_information

20
    def __init__(self):
21
        self.OSDs = {}
22

23
    def add_new_osd(self, osd_uuid):
Felix Seibert's avatar
Felix Seibert committed
24
25
26
        """
        create a new empty osd and add it to the existing OSDs.
        """
27
28
        if osd_uuid in self.OSDs:
            print("key: " + osd_uuid + " is already present!")
29
            return
30
31
        new_osd = osd.OSD(osd_uuid)
        self.OSDs[osd_uuid] = new_osd
32

33
    def add_osd(self, new_osd):
Felix Seibert's avatar
Felix Seibert committed
34
35
36
        """
        add the given OSD (object) to the existing OSDs.
        """
37
38
        if new_osd.uuid in self.OSDs:
            print("key: " + new_osd.uuid + " is already present!")
39
            return
40
        self.OSDs[new_osd.uuid] = new_osd
41

42
    def add_osd_list(self, osd_list):
Felix Seibert's avatar
Felix Seibert committed
43
44
45
        """
        add the given list of OSDs (objects) to the existing OSDs.
        """
46
47
48
49
        for osd_uuid in osd_list:
            if osd_uuid not in self.OSDs:
                new_osd = osd.OSD(osd_uuid)
                self.OSDs[osd_uuid] = new_osd
50
51

    def get_osd_list(self):
Felix Seibert's avatar
Felix Seibert committed
52
        """
53
        get a list of all existing OSD uuids.
Felix Seibert's avatar
Felix Seibert committed
54
        """
55
        osd_list = []
56
        for osd_name in self.OSDs.keys():
57
58
59
            osd_list.append(osd_name)
        return osd_list

60
    def get_containing_osd(self, folder_id):
Felix Seibert's avatar
Felix Seibert committed
61
62
63
        """
        get the OSD containing the given folder_id, or None if the folder is not assigned to any OSD.
        """
64
65
66
        for checked_osd in self.OSDs.values():
            if checked_osd.contains_folder(folder_id):
                return checked_osd
67
68
        return None

Felix Seibert's avatar
Felix Seibert committed
69
70
71
72
73
74
75
76
77
78
79
    def assign_new_osd(self, folder_id, new_osd):
        """
        assign folder_id to new_osd. if folder_id already is assigned to an OSD, this old assignment is deleted.
        """
        old_osd = self.get_containing_osd(folder_id)
        if old_osd is None:
            self.OSDs[new_osd].add_folder(folder_id, self.get_average_folder_size())
        else:
            self.OSDs[new_osd].add_folder(folder_id, self.OSDs[old_osd.uuid].folders[folder_id])
            self.OSDs[old_osd.uuid].remove_folder(folder_id)

80
    def get_average_folder_size(self):
Felix Seibert's avatar
Felix Seibert committed
81
82
83
        """
        get the average folder size of all folders of all OSDs.
        """
84
        total_size = 0
85
        total_number_of_folders = 0
86
        for one_osd in self.OSDs.values():
Felix Seibert's avatar
Felix Seibert committed
87
            total_size += one_osd.total_folder_size
88
            total_number_of_folders += len(one_osd.folders)
89
90
        if total_number_of_folders == 0:
            return 0
91
92
        return total_size / total_number_of_folders

93
94
95
96
97
98
99
100
101
102
103
104
    def get_average_osd_load(self, osd_information, capacity):
        """
        calculate the average OSD load, that is,
        the ratio between the sum of all folder sizes and the total OSD capacity.
        """
        total_folder_size = 0
        total_osd_capacity = 0
        for osd_uuid in self.OSDs.keys():
            total_folder_size += self.OSDs[osd_uuid].total_folder_size
            total_osd_capacity += osd_information[osd_uuid][capacity]
        return total_folder_size / total_osd_capacity

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    def get_maximum_osd_load(self, osd_information, capacity):
        """
        calculate the maximum OSD load.
        """
        assert osd_information is not None
        assert capacity != ''
        maximum_load = 0
        maximum_osd = None
        for osd in self.OSDs.values():
            load = osd.total_folder_size / osd_information[osd.uuid][capacity]
            if maximum_osd is None or load > maximum_load:
                maximum_load = load
                maximum_osd = osd
        return maximum_osd, maximum_load

120
121
122
123
124
125
126
127
128
129
130
131
    def get_average_total_folder_size(self):
        """
        calculate the average total_folder_size of the OSDs.
        """
        total_folder_size = 0
        num_osds = 0
        for osd in self.OSDs.values():
            total_folder_size += osd.total_folder_size
            num_osds += 1

        return total_folder_size / num_osds

132
133
134
135
    def add_folders(self, folders,
                    osd_information=None, ratio_parameter='', capacity='',
                    ignore_osd_capacities=True,
                    random_osd_assignment=False,
136
137
                    ignore_folder_sizes=False,
                    debug=False):
Felix Seibert's avatar
Felix Seibert committed
138
139
        """
        adds a list of folders to the data distribution.
Felix Seibert's avatar
Felix Seibert committed
140
        if not specified otherwise, the assignments are calculated using the LPT algorithm.
Felix Seibert's avatar
Felix Seibert committed
141
142
        returns a list of assignments from folders to OSDs, for which (folders) there was previously no assignment.

143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
        if osd_information and ratio_parameter are given,
        OSDs are assigned data proportionally to their ratio_parameter.

        osd_information is a map (that we now call outer map) that contains, for each OSD, an inner_map.
        outer_map[osd_uuid][ratio_parameter] is used to calculate the proportion of data assigned to OSD with uuid
        osd_uuid.

        if ignore_osd_capacities=False,
        outer_map[osd_uuid][capacity] is used (only in combination with random_osd_assignment=True)
        to generate random assignments that do not surpass the capacities of the OSDs.
        (random assignment respecting OSD capacities)

        if random_osd_assignment=True and ignore_osd_capacities=True, a totally random OSD assignment generated.

        if random_osd_assignment=True and ignore_folder_sizes=True,
        folders are randomly assigned to OSDs such that all OSDs have the same number of folders (if possible).
159
160

        the assignment is stable (i.e., folders already assigned to an OSD are not reassigned to another OSD).
Felix Seibert's avatar
Felix Seibert committed
161
        """
162
163

        # find out which folders are not assigned yet
164
        new_folders = []
165
166
        for a_folder in folders:
            containing_osd = self.get_containing_osd(a_folder.id)
167
            if containing_osd is not None:
168
                containing_osd.add_folder(a_folder.id, a_folder.size)
169
            else:
170
                new_folders.append(a_folder)
171

172
173
        if debug:
            print("dataDistribution: random_osd_assignment: " + str(random_osd_assignment))
Felix Seibert's avatar
Felix Seibert committed
174

175
176
        # keep track of which unassigned folder gets assigned to which OSD.
        # this information must be returned
177
178
        osds_for_new_folders = []

179
180
        # totally random OSD assignment, even ignoring OSD capacities
        # (might lead to I/O errors when too many groups are assigned to an OSD)
181
        if random_osd_assignment and ignore_osd_capacities and not ignore_folder_sizes:
182
183
            if debug:
                print("using totally random osd assignment")
184
            for a_folder in new_folders:
Felix Seibert's avatar
Felix Seibert committed
185
                random_osd = random.choice(list(self.OSDs.values()))
186
187
                random_osd.add_folder(a_folder.id, a_folder.size)
                osds_for_new_folders.append((a_folder.id,
Felix Seibert's avatar
Felix Seibert committed
188
                                             random_osd.uuid))
Felix Seibert's avatar
Felix Seibert committed
189
            return osds_for_new_folders
190

191
192
        # random OSD assignment respecting OSD capacities
        elif random_osd_assignment and not ignore_osd_capacities:
193
            if osd_information is None or capacity == '':
194
195
                raise ValueError("ignore_osd_capacities=False is not possible if osd_information or capacity is"
                                 "not given!")
196
197
            if debug:
                print("using random osd assignment, respecting osd capacities")
198
            for a_folder in new_folders:
199
                suitable_osds = []  # list of OSDs with enough capacity
200
                for one_osd in self.OSDs.values():
201
                    if osd_information[one_osd.uuid][capacity] - one_osd.total_folder_size - a_folder.size >= 0:
202
203
                        suitable_osds.append(one_osd)
                suitable_random_osd = random.choice(suitable_osds)
204
205
                suitable_random_osd.add_folder(a_folder.id, a_folder.size)
                osds_for_new_folders.append((a_folder.id,
206
207
208
                                             suitable_random_osd.uuid))
            return osds_for_new_folders

209
210
        # random OSD assignment ignoring folder sizes
        elif random_osd_assignment and ignore_folder_sizes:
211
212
213
214
215
216
217
218
219
220
            if debug:
                print("using random osd assignment ignoring folder sizes")

            average_folder_size = self.get_average_folder_size()
            if average_folder_size == 0:
                average_folder_size = 1

            modified_folders = list(map(lambda f: folder.Folder(f.id, average_folder_size, f.origin), folders))
            random.shuffle(modified_folders)
            return self.add_folders(modified_folders)
221

222
223
        # balanced deterministic OSD assignment (LPT)
        # (following largest processing time first, also called post-greedy approach)
224
225
        list.sort(new_folders, key=lambda x: x.size, reverse=True)

226
227
228
229
230
        # if osd_information is None, use the fake osd_information, which assumes that all OSDs have the same capacity
        # otherwise use the given osd_information
        if osd_information is None:
            ratio_parameter = 'dummy_value'
            osd_information = self.get_equal_sized_fake_osd_information(ratio_parameter)
231

232
        # for each folder calculate the best OSD and add it to it
233
        for a_folder in new_folders:
234
            least_used_osd, _ = self.get_lpt_osd(osd_information, ratio_parameter, a_folder.size)
235
236
            least_used_osd.add_folder(a_folder.id, a_folder.size)
            osds_for_new_folders.append((a_folder.id,
237
                                         least_used_osd.uuid))
238
239
        return osds_for_new_folders

240
    def rebalance_lpt(self, rebalance_factor=1, osd_information=None, capacity=''):
241
242
243
244
245
246
        """
        rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
                1. 'unroll' the assignment. this means that, for each OSD, folders are removed until the OSD has less
                total_folder_size than the average total folder size of this distribution multiplied by rebalance_factor.
                2. reassign the removed folders using the LPT strategy.
        """
247
248
        movements = {}
        folders_to_be_reassigned = []
249
        reassignment_factor = self.get_average_osd_load(osd_information, capacity) * rebalance_factor
250
251

        # for each OSD, remove the smallest folder until its total_folder_size does not exceed the reassignment_limit
252
        # unrolling
253
        for osd in self.OSDs.values():
254
            while osd.total_folder_size > reassignment_factor * osd_information[osd.uuid][capacity]:
255
256
257
258
259
                folder_id, folder_size = osd.get_smallest_folder()
                folders_to_be_reassigned.append(folder.Folder(folder_id, folder_size, None))
                movements[folder_id] = osd.uuid
                osd.remove_folder(folder_id)

260
        # reassignment
261
262
263
264
265
266
267
268
        new_assignments = self.add_folders(folders_to_be_reassigned,
                                           osd_information=osd_information, ratio_parameter=capacity)

        for folder_id, target in new_assignments:
            movements[folder_id] = (movements[folder_id], target)

        return movements

269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
    def rebalance_one_folder(self, osd_information=None, capacity=''):
        """
        rebalance folders to OSDs by assigning folders to new OSDs using the following strategy:
                1. find OSD with the highest load
                2. get folder with smallest size on this OSD
                3. find new OSD for this folder using get_lpt_osd
                4. if the load on the new OSD is lower than on the original OSD, move the folder to the new OSD.
                otherwise, return.
        one open question is whether getting the folder with smallest size in step 2 is a clever choice
        (in principle, all folders of the OSD with the highest load are eligible).

        this optimization scheme classifies as local search. two distributions are neighbors if one can be transformed
        into the other by moving one folder from one OSD to another. note, however, that we do not search the whole
        neighborhood of a distribution.
        but it might be possible to show that if there is no improvement step of the type that we check for,
        there is no improvement step at all.
        """
        if osd_information is None:
            capacity_key = 'capacity'
            osd_information = self.get_equal_sized_fake_osd_information(capacity_key)
            capacity = capacity_key

        movements = {}

        while True:
            # find OSD with the highest load (origin)
            origin_osd, maximum_load = self.get_maximum_osd_load(osd_information, capacity)

            # pick a folder of this OSD
            # there are several ways to pick a folder (like largest, smallest, constrained by the resulting load of the
            # origin OSD, random...), it is not clear which way is a good way
            # for now pick the smallest folder on origin OSD
            smallest_folder_id, smallest_folder_size = self.OSDs[origin_osd.uuid].get_smallest_folder()

            # find other OSD best suited for the picked folder (target)
            # check whether moving folder from origin to target decreases the maximum load of all OSDs (makespan).
            best_osd, best_osd_load = self.get_lpt_osd(osd_information, capacity, smallest_folder_size)

            if best_osd_load < maximum_load:
                self.assign_new_osd(smallest_folder_id, best_osd.uuid)
                movements[smallest_folder_id] = (origin_osd.uuid, best_osd.uuid)
            else:
                break

        return movements

    def get_lpt_osd(self, osd_information, ratio_parameter, folder_size):
        """
        calculate the load of all OSDs, using the sum of their current total_folder_size and folder_size.
        return (OSD with the smallest such value, the smallest value)
        """
        least_used_osd = None
        best_load_so_far = 1
        for one_osd in self.OSDs.values():
            one_osd_load = (one_osd.total_folder_size + folder_size) / osd_information[one_osd.uuid][ratio_parameter]
            if (least_used_osd is None) or one_osd_load < best_load_so_far:
                least_used_osd = one_osd
                best_load_so_far = one_osd_load
        return least_used_osd, best_load_so_far

    def create_osd_ratios(self, osd_information, ratio_parameter):
        osd_ratios = {}  # ratios are given - use them to assign proportionally
        total_osd_size = 0
        for osd_size in osd_information.values():
            total_osd_size += osd_size[ratio_parameter]
        for osd_uuid, osd_size in osd_information.items():
            osd_ratios[osd_uuid] = float(osd_size[ratio_parameter]) / float(total_osd_size)
        return osd_ratios

338
    def update_folder(self, folder, size):
Felix Seibert's avatar
Felix Seibert committed
339
340
341
        """
        updates the size of a given folder
        """
342
343
344
        for one_osd in self.OSDs.values():
            if folder in one_osd.folders.keys():
                one_osd.update_folder(folder, size)
345
346
347
348
349
350
351
352
                break

    def get_equal_sized_fake_osd_information(self, capacity):
        osd_information = {}
        for osd_uuid in self.get_osd_list():
            osd_information[osd_uuid] = {}
            osd_information[osd_uuid][capacity] = 1
        return osd_information
353
354

    def description(self):
Felix Seibert's avatar
Felix Seibert committed
355
356
357
        """
        generates a string describing this data distribution
        """
358
        string = ""
359
360
        for one_osd in self.OSDs.values():
            string += str(one_osd)
361
            string += "\n"
362
            string += "folders : " + str(one_osd.folders)
363
            string += "\n"
364
        string += "average folder size: " + str(self.get_average_folder_size())
365
366
367
        return string

    def __str__(self):
368
369
370
371
372
        string_representation = "DataDistribution has " + str(len(self.OSDs)) \
                                + " osds: \n"
        for key, value in self.OSDs.items():
            string_representation += str(value) + " \n"
        return string_representation