Skip to content

Commit afcbd9d

Browse files
committed
Move config/util to AllocatorConfig for cross-allocator sharing
ghstack-source-id: a1120ba Pull Request resolved: #159553
1 parent 2ffb510 commit afcbd9d

File tree

4 files changed

+63
-90
lines changed

4 files changed

+63
-90
lines changed

c10/core/AllocatorConfig.h

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,17 @@
1313
namespace c10::CachingAllocator {
1414

1515
// "large" allocations may be packed in 20 MiB blocks
16-
const size_t kLargeBuffer = 20971520;
16+
constexpr size_t kLargeBuffer = 20971520;
17+
// "small" allocations are packed in 2 MiB blocks
18+
constexpr size_t kSmallBuffer = 2097152;
19+
// all sizes are rounded to at least 512 bytes
20+
constexpr size_t kMinBlockSize = 512;
21+
// largest "small" allocation is 1 MiB
22+
constexpr size_t kSmallSize = 1048576;
23+
// allocations between 1 and 10 MiB may use kLargeBuffer
24+
constexpr size_t kMinLargeAlloc = 10485760;
25+
// round up large allocations to 2 MiB
26+
constexpr size_t kRoundLarge = 2097152;
1727

1828
// A utility class for tokenizing allocator configuration strings into discrete
1929
// parts. For example, the config string:
@@ -369,4 +379,30 @@ struct DeviceConfigParserHookRegistry {
369379
parser_cls::getKeys()); \
370380
}
371381

382+
// This function takes the size and number of divisions argument and rounds
383+
// up the size argument for the nearest power-of-2 division.
384+
// For example, if we need to round-up 1200 and number of divisions is 4,
385+
// the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
386+
// them, the values are 1024, 1280, 1536, and 1792. So the function will
387+
// return 1280 as the nearest ceiling of power-2 division.
388+
inline size_t roundup_power2_next_division(size_t size, size_t divisions) {
389+
if (llvm::isPowerOf2_64(size)) {
390+
return size;
391+
}
392+
393+
TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
394+
395+
// divide the space between these 2's power into equal divisions
396+
// If division is zero, return the power-of-2 ceiling.
397+
size_t power2_floor = llvm::PowerOf2Floor(size);
398+
// power2_divison is the division size between power2_floor and power2_floor*2
399+
size_t power2_divison =
400+
power2_floor >> (63 - llvm::countLeadingZeros(divisions));
401+
if (C10_UNLIKELY(power2_divison == 0)) {
402+
return (power2_floor << 1);
403+
}
404+
size_t round_size_floor = size & (~(power2_divison - 1));
405+
return (round_size_floor == size) ? size : round_size_floor + power2_divison;
406+
}
407+
372408
} // namespace c10::CachingAllocator

c10/core/CachingDeviceAllocator.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include <c10/core/Allocator.h>
4+
#include <c10/core/AllocatorConfig.h>
45

56
namespace c10::CachingDeviceAllocator {
67

@@ -58,4 +59,27 @@ struct DeviceStats {
5859
int64_t max_split_size = 0;
5960
};
6061

62+
inline size_t get_round_size(size_t size) {
63+
if (size < kMinBlockSize) {
64+
return kMinBlockSize;
65+
}
66+
67+
auto divisions = AcceleratorAllocatorConfig::roundup_power2_divisions(size);
68+
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
69+
return roundup_power2_next_division(size, divisions);
70+
} else {
71+
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
72+
}
73+
}
74+
75+
inline size_t get_allocation_size(size_t size) {
76+
if (size <= kSmallSize) {
77+
return kSmallBuffer;
78+
} else if (size < kMinLargeAlloc) {
79+
return kLargeBuffer;
80+
} else {
81+
return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
82+
}
83+
}
84+
6185
} // namespace c10::CachingDeviceAllocator

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -130,15 +130,6 @@ namespace Native {
130130
* notifyCaptureDestroy.
131131
*/
132132

133-
constexpr size_t kMinBlockSize =
134-
512; // all sizes are rounded to at least 512 bytes
135-
constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
136-
constexpr size_t kSmallBuffer =
137-
2097152; // "small" allocations are packed in 2 MiB blocks
138-
constexpr size_t kMinLargeAlloc =
139-
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
140-
constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
141-
142133
static char SHAREABLE_HANDLE_VERSION = 2;
143134
enum ShareableHandleType : char {
144135
SHAREABLE_CUDA_MALLOC = 'c',
@@ -1332,7 +1323,7 @@ class DeviceCachingAllocator {
13321323
// effect on memory use during capture should be small.
13331324
process_events(context);
13341325
}
1335-
size_t size = round_size(orig_size);
1326+
const size_t size = get_round_size(orig_size);
13361327
auto& pool = get_pool(size, stream);
13371328
const size_t alloc_size = get_allocation_size(size);
13381329
AllocParams params(device, size, stream, &pool, alloc_size);
@@ -2173,46 +2164,6 @@ class DeviceCachingAllocator {
21732164
return result;
21742165
}
21752166

2176-
// This function takes the size and number of divisions argument and rounds
2177-
// up the size argument for the nearest power-of-2 division.
2178-
// For example, if we need to round-up 1200 and number of divisions is 4,
2179-
// the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
2180-
// them, the values are 1024, 1280, 1536, and 1792. So the function will
2181-
// return 1280 as the nearest ceiling of power-2 division.
2182-
static size_t roundup_power2_next_division(size_t size, size_t divisions) {
2183-
if (llvm::isPowerOf2_64(size)) {
2184-
return size;
2185-
}
2186-
2187-
TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
2188-
2189-
// divide the space between these 2's power into equal divisions
2190-
// If division is zero, return the power-of-2 ceiling.
2191-
size_t power2_floor = llvm::PowerOf2Floor(size);
2192-
size_t power2_divison =
2193-
power2_floor >> (63 - llvm::countLeadingZeros(divisions));
2194-
if (C10_UNLIKELY(power2_divison == 0)) {
2195-
return (power2_floor << 1);
2196-
}
2197-
size_t round_size_floor = size & (~(power2_divison - 1));
2198-
return (round_size_floor == size) ? size
2199-
: round_size_floor + power2_divison;
2200-
}
2201-
2202-
static size_t round_size(size_t size) {
2203-
if (size < kMinBlockSize) {
2204-
return kMinBlockSize;
2205-
} else {
2206-
auto divisions =
2207-
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
2208-
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
2209-
return roundup_power2_next_division(size, divisions);
2210-
} else {
2211-
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
2212-
}
2213-
}
2214-
}
2215-
22162167
void createOrIncrefPool(MempoolId_t mempool_id, CUDAAllocator* allocator) {
22172168
// Create a PrivatePool object if it does not exist yet
22182169
// and increment its use_count
@@ -2699,16 +2650,6 @@ class DeviceCachingAllocator {
26992650
}
27002651
}
27012652

2702-
static size_t get_allocation_size(size_t size) {
2703-
if (size <= kSmallSize) {
2704-
return kSmallBuffer;
2705-
} else if (size < kMinLargeAlloc) {
2706-
return kLargeBuffer;
2707-
} else {
2708-
return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
2709-
}
2710-
}
2711-
27122653
bool get_free_block(AllocParams& p) {
27132654
BlockPool& pool = *p.pool;
27142655

c10/xpu/XPUCachingAllocator.cpp

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,6 @@ using namespace c10::CachingDeviceAllocator;
1515

1616
// newly allocated memory with 512-byte alignment.
1717
constexpr size_t kDeviceAlignment = 512;
18-
// all sizes are rounded to at least 512 bytes
19-
constexpr size_t kMinBlockSize = 512;
20-
// largest "small" allocation is 1 MiB
21-
constexpr size_t kSmallSize = 1048576;
22-
// "small" allocations are packed in 2 MiB blocks
23-
constexpr size_t kSmallBuffer = 2097152;
24-
// allocations between 1 and 10 MiB may use kLargeBuffer
25-
constexpr size_t kMinLargeAlloc = 10485760;
26-
// round up large allocations to 2 MiB
27-
constexpr size_t kRoundLarge = 2097152;
2818

2919
namespace {
3020
using stream_set = ska::flat_hash_set<xpu::XPUStream>;
@@ -213,24 +203,6 @@ class DeviceCachingAllocator {
213203
}
214204
}
215205

216-
static size_t round_size(size_t size) {
217-
if (size < kMinBlockSize) {
218-
return kMinBlockSize;
219-
} else {
220-
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
221-
}
222-
}
223-
224-
static size_t get_allocation_size(size_t size) {
225-
if (size <= kSmallSize) {
226-
return kSmallBuffer;
227-
} else if (size < kMinLargeAlloc) {
228-
return kLargeBuffer;
229-
} else {
230-
return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
231-
}
232-
}
233-
234206
BlockPool& get_pool(size_t size) {
235207
if (size < kSmallSize) {
236208
return small_blocks;
@@ -417,7 +389,7 @@ class DeviceCachingAllocator {
417389
Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
418390
std::scoped_lock<std::recursive_mutex> lock(mutex);
419391
process_events();
420-
size_t size = round_size(orig_size);
392+
const size_t size = get_round_size(orig_size);
421393
auto& pool = get_pool(size);
422394
const size_t alloc_size = get_allocation_size(size);
423395
AllocParams params(device, size, &queue, &pool, alloc_size);

0 commit comments

Comments
 (0)