Skip to content

Commit a2cbc94

Browse files
committed
Move config/util to AllocatorConfig for cross-allocator sharing
ghstack-source-id: 52749a1 Pull Request resolved: #159553
1 parent 20b5f69 commit a2cbc94

File tree

3 files changed

+52
-70
lines changed

3 files changed

+52
-70
lines changed

c10/core/AllocatorConfig.h

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,17 @@
1313
namespace c10::CachingAllocator {
1414

1515
// "large" allocations may be packed in 20 MiB blocks
16-
const size_t kLargeBuffer = 20971520;
16+
constexpr size_t kLargeBuffer = 20971520;
17+
// "small" allocations are packed in 2 MiB blocks
18+
constexpr size_t kSmallBuffer = 2097152;
19+
// all sizes are rounded to at least 512 bytes
20+
constexpr size_t kMinBlockSize = 512;
21+
// largest "small" allocation is 1 MiB
22+
constexpr size_t kSmallSize = 1048576;
23+
// allocations between 1 and 10 MiB may use kLargeBuffer
24+
constexpr size_t kMinLargeAlloc = 10485760;
25+
// round up large allocations to 2 MiB
26+
constexpr size_t kRoundLarge = 2097152;
1727

1828
// A utility class for tokenizing allocator configuration strings into discrete
1929
// parts. For example, the config string:
@@ -369,4 +379,43 @@ struct DeviceConfigParserHookRegistry {
369379
parser_cls::getKeys()); \
370380
}
371381

382+
// This function takes the size and number of divisions argument and rounds
383+
// up the size argument for the nearest power-of-2 division.
384+
// For example, if we need to round-up 1200 and number of divisions is 4,
385+
// the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
386+
// them, the values are 1024, 1280, 1536, and 1792. So the function will
387+
// return 1280 as the nearest ceiling of power-2 division.
388+
inline size_t roundup_power2_next_division(size_t size, size_t divisions) {
389+
if (llvm::isPowerOf2_64(size)) {
390+
return size;
391+
}
392+
393+
TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
394+
395+
// divide the space between these 2's power into equal divisions
396+
// If division is zero, return the power-of-2 ceiling.
397+
size_t power2_floor = llvm::PowerOf2Floor(size);
398+
// power2_divison is the division size between power2_floor and power2_floor*2
399+
size_t power2_divison =
400+
power2_floor >> (63 - llvm::countLeadingZeros(divisions));
401+
if (C10_UNLIKELY(power2_divison == 0)) {
402+
return (power2_floor << 1);
403+
}
404+
size_t round_size_floor = size & (~(power2_divison - 1));
405+
return (round_size_floor == size) ? size : round_size_floor + power2_divison;
406+
}
407+
408+
inline size_t round_size(size_t size) {
409+
if (size < kMinBlockSize) {
410+
return kMinBlockSize;
411+
} else {
412+
auto divisions = AcceleratorAllocatorConfig::roundup_power2_divisions(size);
413+
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
414+
return roundup_power2_next_division(size, divisions);
415+
} else {
416+
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
417+
}
418+
}
419+
}
420+
372421
} // namespace c10::CachingAllocator

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 1 addition & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -130,15 +130,6 @@ namespace Native {
130130
* notifyCaptureDestroy.
131131
*/
132132

133-
constexpr size_t kMinBlockSize =
134-
512; // all sizes are rounded to at least 512 bytes
135-
constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
136-
constexpr size_t kSmallBuffer =
137-
2097152; // "small" allocations are packed in 2 MiB blocks
138-
constexpr size_t kMinLargeAlloc =
139-
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
140-
constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
141-
142133
static char SHAREABLE_HANDLE_VERSION = 2;
143134
enum ShareableHandleType : char {
144135
SHAREABLE_CUDA_MALLOC = 'c',
@@ -1332,7 +1323,7 @@ class DeviceCachingAllocator {
13321323
// effect on memory use during capture should be small.
13331324
process_events(context);
13341325
}
1335-
size_t size = round_size(orig_size);
1326+
size_t size = c10::CachingAllocator::round_size(orig_size);
13361327
auto& pool = get_pool(size, stream);
13371328
const size_t alloc_size = get_allocation_size(size);
13381329
AllocParams params(device, size, stream, &pool, alloc_size);
@@ -2173,46 +2164,6 @@ class DeviceCachingAllocator {
21732164
return result;
21742165
}
21752166

2176-
// This function takes the size and number of divisions argument and rounds
2177-
// up the size argument for the nearest power-of-2 division.
2178-
// For example, if we need to round-up 1200 and number of divisions is 4,
2179-
// the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
2180-
// them, the values are 1024, 1280, 1536, and 1792. So the function will
2181-
// return 1280 as the nearest ceiling of power-2 division.
2182-
static size_t roundup_power2_next_division(size_t size, size_t divisions) {
2183-
if (llvm::isPowerOf2_64(size)) {
2184-
return size;
2185-
}
2186-
2187-
TORCH_CHECK(divisions >= 2, "Only 2 or more divisions are supported");
2188-
2189-
// divide the space between these 2's power into equal divisions
2190-
// If division is zero, return the power-of-2 ceiling.
2191-
size_t power2_floor = llvm::PowerOf2Floor(size);
2192-
size_t power2_divison =
2193-
power2_floor >> (63 - llvm::countLeadingZeros(divisions));
2194-
if (C10_UNLIKELY(power2_divison == 0)) {
2195-
return (power2_floor << 1);
2196-
}
2197-
size_t round_size_floor = size & (~(power2_divison - 1));
2198-
return (round_size_floor == size) ? size
2199-
: round_size_floor + power2_divison;
2200-
}
2201-
2202-
static size_t round_size(size_t size) {
2203-
if (size < kMinBlockSize) {
2204-
return kMinBlockSize;
2205-
} else {
2206-
auto divisions =
2207-
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
2208-
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
2209-
return roundup_power2_next_division(size, divisions);
2210-
} else {
2211-
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
2212-
}
2213-
}
2214-
}
2215-
22162167
void createOrIncrefPool(MempoolId_t mempool_id, CUDAAllocator* allocator) {
22172168
// Create a PrivatePool object if it does not exist yet
22182169
// and increment its use_count

c10/xpu/XPUCachingAllocator.cpp

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,6 @@ using namespace c10::CachingDeviceAllocator;
1515

1616
// newly allocated memory with 512-byte alignment.
1717
constexpr size_t kDeviceAlignment = 512;
18-
// all sizes are rounded to at least 512 bytes
19-
constexpr size_t kMinBlockSize = 512;
20-
// largest "small" allocation is 1 MiB
21-
constexpr size_t kSmallSize = 1048576;
22-
// "small" allocations are packed in 2 MiB blocks
23-
constexpr size_t kSmallBuffer = 2097152;
24-
// allocations between 1 and 10 MiB may use kLargeBuffer
25-
constexpr size_t kMinLargeAlloc = 10485760;
26-
// round up large allocations to 2 MiB
27-
constexpr size_t kRoundLarge = 2097152;
2818

2919
namespace {
3020
using stream_set = ska::flat_hash_set<xpu::XPUStream>;
@@ -213,14 +203,6 @@ class DeviceCachingAllocator {
213203
}
214204
}
215205

216-
static size_t round_size(size_t size) {
217-
if (size < kMinBlockSize) {
218-
return kMinBlockSize;
219-
} else {
220-
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
221-
}
222-
}
223-
224206
static size_t get_allocation_size(size_t size) {
225207
if (size <= kSmallSize) {
226208
return kSmallBuffer;
@@ -417,7 +399,7 @@ class DeviceCachingAllocator {
417399
Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
418400
std::scoped_lock<std::recursive_mutex> lock(mutex);
419401
process_events();
420-
size_t size = round_size(orig_size);
402+
size_t size = c10::CachingAllocator::round_size(orig_size);
421403
auto& pool = get_pool(size);
422404
const size_t alloc_size = get_allocation_size(size);
423405
AllocParams params(device, size, &queue, &pool, alloc_size);

0 commit comments

Comments
 (0)