#include <ATen/mps/MPSEvent.h>
#include <ATen/mps/MPSStream.h>

+ #include <c10/util/flat_hash_map.h>
+ #include <mach/vm_page_size.h>
#include <cstdio>
#include <mutex>
#include <set>
#include <unordered_set>
- #include <mach/vm_page_size.h>
- #include <c10/util/flat_hash_map.h>

// this implementation is based on CUDACachingAllocator.
// It utilizes Metal Heaps to improve the performance with buffer allocation.
// Do not include this header. Use MPSAllocatorInterface.h instead.
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
namespace at::mps::HeapAllocator {

- static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB
- static const size_t kMinLargeAlloc = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
- static const size_t kRoundLarge    = MB(2);    // round up large allocations to 2 MiB
- static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
- static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
- static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
- static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
+ static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
+ static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
+ static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
+ static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
+ static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
+ static const size_t kXLargeHeapD =
+     MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
+ static const size_t kXLargeHeapU =
+     MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
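Note (illustrative, not part of the diff): the MB() helper is defined elsewhere in the allocator sources. A plausible definition, together with the 2 MiB round-up rule that the kRoundLarge comment describes, is sketched below; both are assumptions for illustration.

#define MB(x) ((size_t)(x) * 1048576UL) // assumed definition: x mebibytes in bytes

// Round a "large" request up to the next kRoundLarge (2 MiB) boundary,
// e.g. 3 MiB -> 4 MiB, 4 MiB -> 4 MiB, 5 MiB -> 6 MiB.
static size_t roundLargeAlloc(size_t size) {
  return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
}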

// buffer pools could be customized with a combination of usage flags
enum UsageFlags : uint32_t {
  PRIVATE = 0,
- SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
- SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
+ SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
+ SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
  MANAGED = (1 << 2), // managed storage mode
- HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
- SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
+ HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
+ SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
};
// debug verbosity flags
enum DebugVerbosity : uint32_t {
- SILENT      = 0,
- PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
+ SILENT = 0,
+ PROFILING = (1 << 0), // print generic profiling data for total system memory usage
  ALLOCATIONS = (1 << 1), // print buffer allocations
- RECYCLES    = (1 << 2), // print buffer recycling
- RELEASES    = (1 << 3), // print buffer releases
- LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
+ RECYCLES = (1 << 2), // print buffer recycling
+ RELEASES = (1 << 3), // print buffer releases
+ LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
};
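Note (illustrative, not part of the diff): both enums are plain bitmasks, so flags combine with bitwise OR and are tested with bitwise AND, exactly as getOptions() does with UsageFlags::HAZARD further down. A hypothetical combination:

uint32_t usage = UsageFlags::SMALL | UsageFlags::SHARED | UsageFlags::HAZARD;
bool tracked = (usage & UsageFlags::HAZARD) != 0; // true: hazard tracking requested

uint32_t verbosity = DebugVerbosity::PROFILING | DebugVerbosity::LARGE_ONLY;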

struct HeapBlock;
@@ -67,10 +69,8 @@ struct BufferBlock {
  // Metal events used to sync GPU/CPU operations on the shared-storage buffers
  MPSEventPtr event;

- BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
-             HeapBlock* Heap = nullptr) :
-             buffer(Buffer), size(Size), requested_size(RequestedSize),
-             heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
+ BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
+     : buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {}

  static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
    return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
@@ -79,15 +79,19 @@ struct BufferBlock {
    assert(((Alignment - 1) & Alignment) == 0);
    return ((Size + Alignment - 1) & ~(Alignment - 1));
  }
- uint32_t retainCount() const { return [buffer retainCount]; }
+ uint32_t retainCount() const {
+   return [buffer retainCount];
+ }
};
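Note (illustrative, not part of the diff): alignUp() uses the standard power-of-two trick of adding Alignment - 1 and then masking off the low bits; the assert above guards the power-of-two precondition. A quick numeric check, assuming the 16 KiB page size of Apple Silicon:

// alignUp(1, 16384)     -> 16384  (1 + 16383 = 16384; low 14 bits cleared)
// alignUp(16384, 16384) -> 16384  (already aligned, unchanged)
// alignUp(16385, 16384) -> 32768  (rounds up to the next page boundary)
static_assert(((16385 + 16384 - 1) & ~(size_t)(16384 - 1)) == 32768, "alignUp example");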
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);

struct BufferPool;
struct AllocParams {
- AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
-     search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
- size_t size() const { return search_key.size; }
+ AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
+     : search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
+ size_t size() const {
+   return search_key.size;
+ }

  BufferBlock search_key;
  BufferPool* pool;
@@ -102,7 +106,9 @@ struct AllocParams {

struct HeapBlock {
  id<MTLHeap> heap;
- struct { size_t total, available; } size;
+ struct {
+   size_t total, available;
+ } size;
  BufferPool* pool;
  unsigned int n_buffers = 0;
  id_t heap_id;
@@ -111,9 +117,12 @@ struct HeapBlock {
  // counter to assign unique ids to heap blocks
  static uint64_t heap_counter;

- HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
-     heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
-     heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
+ HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool* Pool = nullptr)
+     : heap(Heap),
+       size({.total = Size, .available = Size}),
+       pool(Pool),
+       heap_id(Heap ? ++heap_counter : 0),
+       is_split(true) {}

  static MTLResourceOptions getOptions(uint32_t usage) {
    // TODO: check the caching performance of write-combined mode
@@ -126,16 +135,17 @@ struct HeapBlock {
    else
      options |= MTLResourceStorageModePrivate;

- options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
+ options |=
+     (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;

    return options;
  }

  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
- HeapBlock *heapBlock = nullptr;
+ HeapBlock* heapBlock = nullptr;
    bool is_split = true;
    const size_t size = params.size();
- MTLHeapDescriptor *d = [MTLHeapDescriptor new];
+ MTLHeapDescriptor* d = [MTLHeapDescriptor new];
    if (d) {
      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
      if (size <= kMaxSmallAlloc) {
@@ -152,10 +162,11 @@ struct HeapBlock {
      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
      // this automatically handles Metal buffer access synchronizations at the
      // cost of slightly lower performance.
- d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
+ d.hazardTrackingMode =
+     (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
      d.resourceOptions = getOptions(usage);
      d.type = MTLHeapTypeAutomatic;
- id<MTLHeap> heap = [device newHeapWithDescriptor: d];
+ id<MTLHeap> heap = [device newHeapWithDescriptor:d];
      if (heap) {
        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
        const size_t heap_size = heapAvailableSize(heap);
@@ -169,8 +180,8 @@ struct HeapBlock {
    return heapBlock;
  }
  static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
- return (a->size.available != b->size.available) ? a->size.available < b->size.available :
-     (uintptr_t)a->heap < (uintptr_t)b->heap;
+ return (a->size.available != b->size.available) ? a->size.available < b->size.available
+                                                 : (uintptr_t)a->heap < (uintptr_t)b->heap;
  }
  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
    return [heap maxAvailableSizeWithAlignment:Alignment];
@@ -205,8 +216,12 @@ struct HeapBlock {
    size.available = 0;
    return retainCount;
  }
- uint32_t retainCount() const { return [heap retainCount]; }
- void updateAvailableSize() { size.available = heapAvailableSize(heap); }
+ uint32_t retainCount() const {
+   return [heap retainCount];
+ }
+ void updateAvailableSize() {
+   size.available = heapAvailableSize(heap);
+ }
};
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
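Note (illustrative, not part of the diff): Comparator orders heaps by available size, with the heap address as tie-breaker, so an ordered set of HeapBlock* behaves as a best-fit structure. A sketch of the lookup, assuming a std::set such as the heaps member that BufferPool declares below:

// Find the smallest heap that can hold `size` bytes: build a search key whose
// available size equals the request, then lower_bound into the ordered set.
HeapBlock* findBestFitHeap(std::set<HeapBlock*, HeapComparison>& heaps, size_t size) {
  HeapBlock search_key(size); // the constructor sets available = total = size
  auto it = heaps.lower_bound(&search_key);
  return (it != heaps.end()) ? *it : nullptr; // nullptr -> a new heap is needed
}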
@@ -219,9 +234,8 @@ struct BufferPool {
    SCALAR,
  };

- BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
-     device(Device), usage(Usage),
-     heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
+ BufferPool(const id<MTLDevice> Device, uint32_t Usage)
+     : device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}

  const id<MTLDevice> device;
  // usage flags to customize the pool for various purposes (see UsageFlags enum)
@@ -248,12 +262,12 @@ struct BufferPool {
};

class MPSHeapAllocatorImpl {
- public:
- explicit MPSHeapAllocatorImpl() :
-     m_device(at::mps::MPSDevice::getInstance()->device()),
-     m_max_buffer_size([m_device maxBufferLength]),
-     m_stream(getDefaultMPSStream()),
-     m_event_pool(getMPSEventPool()) {
+  public:
+ explicit MPSHeapAllocatorImpl()
+     : m_device(at::mps::MPSDevice::getInstance()->device()),
+       m_max_buffer_size([m_device maxBufferLength]),
+       m_stream(getDefaultMPSStream()),
+       m_event_pool(getMPSEventPool()) {
    init_allocator();
  }
  ~MPSHeapAllocatorImpl() {
@@ -298,34 +312,50 @@ class MPSHeapAllocatorImpl {
  // (see m_high_watermark_ratio for description)
  void setHighWatermarkRatio(double ratio);
  // (see m_low_watermark_limit for description)
- size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+ size_t getLowWatermarkLimit() const {
+   return m_low_watermark_limit;
+ }
  // (see m_max_total_allowed_size for description)
- size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+ size_t getHighWatermarkLimit() const {
+   return m_max_total_allowed_size;
+ }
  // (see m_total_allocated_memory for description)
- size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+ size_t getTotalAllocatedMemory() const {
+   return m_total_allocated_memory;
+ }
  // (see m_current_allocated_memory for description)
- size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+ size_t getCurrentAllocatedMemory() const {
+   return m_current_allocated_memory;
+ }
  // total GPU memory allocated in the process by Metal driver; including
  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
- size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+ size_t getDriverAllocatedMemory() const {
+   return current_allocated_size();
+ }
  // recommended Max memory for Metal
- size_t getRecommendedMaxMemory() const { return max_device_size(); }
+ size_t getRecommendedMaxMemory() const {
+   return max_device_size();
+ }
  // (see enum DebugVerbosity for description)
- uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+ uint32_t getDebugVerbosity() const {
+   return m_debug_verbosity;
+ }
  // returns the device that we allocate from
- inline id<MTLDevice> Device() const { return m_device; }
+ inline id<MTLDevice> Device() const {
+   return m_device;
+ }

  // TODO: make a common function to do size unit conversions in PyTorch.
  inline std::string format_size(uint64_t size) const;

- private:
+  private:
  // (see m_high_watermark_ratio for description)
  constexpr static double default_high_watermark_ratio = 1.7;
  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
  constexpr static double default_high_watermark_upper_bound = 2.0;
  // (see m_low_watermark_ratio for description)
  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
- constexpr static double default_low_watermark_ratio_unified  = 1.4;
+ constexpr static double default_low_watermark_ratio_unified = 1.4;

  const id<MTLDevice> m_device;
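Note (illustrative, not part of the diff): these ratios scale the device's recommendedMaxWorkingSetSize (typically about 75% of system memory, per the comment near max_device_size() below) into byte limits; the actual computation lives in MPSAllocator.mm. A hypothetical back-of-the-envelope, assuming the limit is the ratio times the recommended size:

// On a 64 GiB unified-memory machine, recommendedMaxWorkingSetSize ~= 48 GiB:
//   high watermark (hard cap):  1.7 * 48 GiB ~= 81.6 GiB -> m_max_total_allowed_size
//   low watermark (soft limit): 1.4 * 48 GiB ~= 67.2 GiB -> m_low_watermark_limit
size_t limit = static_cast<size_t>(default_high_watermark_ratio * (double)max_device_size());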
@@ -387,14 +417,19 @@ class MPSHeapAllocatorImpl {
  size_t get_allocation_size(size_t size, uint32_t usage) const;
  // maximum size of device memory available for allocation in current process
  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
- size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
+ size_t max_device_size() const {
+   return [m_device recommendedMaxWorkingSetSize];
+ }
  // there are implicit allocations from MPS backend, so we need to query the 'device' for
  // total allocated size instead of manually tracking in MPSAllocator
- size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
+ size_t current_allocated_size() const {
+   return [m_device currentAllocatedSize];
+ }

  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
- MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
+ MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
+     buffer_block ? buffer_block->buffer : nullptr, event);
    }
    return true;
  }
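Note (illustrative, not part of the diff): trigger_memory_callbacks() walks every key registered with MPSAllocatorCallbacksRegistry, so consumers observe allocator events by subclassing IMpsAllocatorCallback. A sketch, assuming the REGISTER_MPS_ALLOCATOR_CALLBACK macro declared in MPSAllocatorInterface.h; the class and key names are examples:

// Hypothetical observer that reacts to allocator events.
struct LoggingCallback : public at::mps::IMpsAllocatorCallback {
  void executeMPSAllocatorCallback(void* ptr, EventType event) override {
    // inspect `event` (e.g. allocated/recycled/freed) and the buffer pointer here
  }
};
REGISTER_MPS_ALLOCATOR_CALLBACK("logging_callback", LoggingCallback);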