Skip to content

Commit 1e8e9f7

Browse files
guangyeypytorchmergebot
authored and committed
Introduce AcceleratorAllocatorConfig as the common class (#149601)
# Motivation This PR aims to generalize `AllocatorConfig` to be device-agnostic. Introduce the class `AcceleratorAllocatorConfig` to clarify its scope as a configuration manager for accelerator backends (e.g., CUDA, XPU). The another name `AllocatorConfig` is now reserved for a potential future base class that can unify configuration handling for both CPU and accelerator allocators, should similar requirements arise for the CPU path. # Design Rule ## Overall This class configures memory allocation for both device and host memory. A single `AcceleratorAllocatorConfig` instance is shared across all accelerator backends, such as CUDA and XPU, under the assumption that relevant environment variables apply uniformly to all accelerators. Device-specific configuration extensions are supported via hooks (see `registerDeviceConfigParserHook`). Introduce a new class `ConfigTokenizer` to help process the env variable config key-value pair ## Naming Convention: - Public API names in `AcceleratorAllocatorConfig` should be device-generic. - Members prefixed with `pinned_` are specific to the host/pinned allocator. - Environment variable names should be generic across backends. - Comma-separated key-value pairs in the format: `key:value`. Use square brackets `[]` for list values Example: `key1:123, key2:[val1,val2]` ## Environment Variables: - The default environment variable for configuration is `PYTORCH_ALLOC_CONF`. - For backward compatibility, `PYTORCH_CUDA_ALLOC_CONF` and `PYTORCH_HIP_ALLOC_CONF` are also supported with lower priority. Pull Request resolved: #149601 Approved by: https://github.com/albanD
1 parent af3d069 commit 1e8e9f7

File tree

3 files changed

+693
-0
lines changed

3 files changed

+693
-0
lines changed

c10/core/AllocatorConfig.cpp

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
#include <c10/core/AllocatorConfig.h>
2+
#include <c10/core/DeviceType.h>
3+
#include <c10/util/env.h>
4+
#include <c10/util/irange.h>
5+
6+
namespace c10::CachingAllocator {
7+
8+
namespace {
// Number of power-of-two size buckets tracked by roundup_power2_divisions_.
// Must equal log2(kRoundUpPowerOfTwoEnd) - log2(kRoundUpPowerOfTwoStart);
// roundup_power2_divisions() TORCH_CHECKs this invariant.
constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
// One mebibyte in bytes; env-var sizes are expressed in MB.
constexpr size_t kMB = 1024 * 1024ul;
// The bucketed interval range starts at 1MB and ends at 64GB.
constexpr size_t kRoundUpPowerOfTwoStart = 1 * kMB; // 1MB
constexpr size_t kRoundUpPowerOfTwoEnd = 64 * 1024ul * kMB; // 64GB
} // anonymous namespace
14+
15+
// Returns the process-wide singleton configuration, lazily parsing the
// allocator environment variables exactly once on first use (thread-safe via
// C++ static local initialization).
AcceleratorAllocatorConfig& AcceleratorAllocatorConfig::instance() {
  static AcceleratorAllocatorConfig instance;
// The macro expands to an early `return true` out of the lambda below, so
// only the highest-priority environment variable that is set gets parsed.
// Deprecated device-specific names emit a one-time warning.
#define C10_ALLOCATOR_CONFIG_PARSE_ENV(env, deprecated)                       \
  auto env##_name = c10::utils::get_env(#env);                                \
  if (env##_name.has_value()) {                                               \
    if (deprecated) {                                                         \
      TORCH_WARN_ONCE(#env " is deprecated, use PYTORCH_ALLOC_CONF instead"); \
    }                                                                         \
    instance.parseArgs(env##_name.value());                                   \
    return true;                                                              \
  }
  // env_flag is only used to trigger the one-time parse; its value (whether
  // any env var was found) is intentionally unused.
  static bool env_flag [[maybe_unused]] = []() {
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_ALLOC_CONF, false)
    // Keep this for backwards compatibility
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_CUDA_ALLOC_CONF, /*deprecated=*/true)
    C10_ALLOCATOR_CONFIG_PARSE_ENV(PYTORCH_HIP_ALLOC_CONF, /*deprecated=*/true)
    return false;
  }();
#undef C10_ALLOCATOR_CONFIG_PARSE_ENV
  return instance;
}
36+
37+
// Start with every power-of-two interval mapped to 0, i.e. rounding-up
// disabled until configured via roundup_power2_divisions.
AcceleratorAllocatorConfig::AcceleratorAllocatorConfig()
    : roundup_power2_divisions_(kRoundUpPowerOfTwoIntervals, 0) {}
40+
41+
size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) {
42+
size_t log_size = (63 - llvm::countLeadingZeros(size));
43+
44+
// Our intervals start at 1MB and end at 64GB
45+
const size_t interval_start =
46+
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart);
47+
const size_t interval_end =
48+
63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd);
49+
TORCH_CHECK(
50+
interval_end - interval_start == kRoundUpPowerOfTwoIntervals,
51+
"kRoundUpPowerOfTwoIntervals mismatch");
52+
53+
size_t index =
54+
(log_size > interval_start) ? (log_size - interval_start) : 0ul;
55+
index = std::min(index, kRoundUpPowerOfTwoIntervals - 1);
56+
return instance().roundup_power2_divisions_[index];
57+
}
58+
59+
size_t AcceleratorAllocatorConfig::parseMaxSplitSize(
60+
const ConfigTokenizer& tokenizer,
61+
size_t i) {
62+
tokenizer.checkToken(++i, ":");
63+
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
64+
constexpr size_t max_allowed_split_size_mb =
65+
std::numeric_limits<size_t>::max() / kMB;
66+
67+
size_t val_env = tokenizer.toSizeT(++i);
68+
TORCH_CHECK(
69+
val_env >= min_allowed_split_size_mb,
70+
"CachingAllocator option max_split_size_mb too small, must be >= ",
71+
min_allowed_split_size_mb);
72+
val_env = std::min(val_env, max_allowed_split_size_mb);
73+
max_split_size_ = val_env * kMB;
74+
75+
return i;
76+
}
77+
78+
size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize(
79+
const ConfigTokenizer& tokenizer,
80+
size_t i) {
81+
tokenizer.checkToken(++i, ":");
82+
constexpr size_t min_allowed_split_size_mb = kLargeBuffer / kMB;
83+
constexpr size_t max_allowed_split_size_mb =
84+
std::numeric_limits<size_t>::max() / kMB;
85+
86+
size_t val_env = tokenizer.toSizeT(++i);
87+
TORCH_CHECK(
88+
val_env >= min_allowed_split_size_mb,
89+
"CachingAllocator option max_non_split_rounding_mb too small, must be >= ",
90+
min_allowed_split_size_mb);
91+
val_env = std::min(val_env, max_allowed_split_size_mb);
92+
max_non_split_rounding_size_ = val_env * kMB;
93+
94+
return i;
95+
}
96+
97+
size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold(
98+
const ConfigTokenizer& tokenizer,
99+
size_t i) {
100+
tokenizer.checkToken(++i, ":");
101+
double val_env = tokenizer.toDouble(++i);
102+
TORCH_CHECK(
103+
val_env > 0 && val_env < 1.0,
104+
"garbage_collect_threshold is invalid, set it in (0.0, 1.0)");
105+
garbage_collection_threshold_ = val_env;
106+
107+
return i;
108+
}
109+
110+
// Parses "roundup_power2_divisions:<value>" or
// "roundup_power2_divisions:[<size>:<value>,...,>:<value>]" starting at token
// index i, filling roundup_power2_divisions_ per size interval, and returns
// the index of the last token consumed.
size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions(
    const ConfigTokenizer& tokenizer,
    size_t i) {
  tokenizer.checkToken(++i, ":");
  bool first_value = true;

  if (tokenizer[++i] == "[") {
    // List form: each entry is "<boundary>:<divisions>"; the special
    // boundary ">" applies the value to all intervals after the last one
    // configured so far.
    size_t last_index = 0;
    // NOLINTNEXTLINE(bugprone-inc-dec-in-conditions)
    while (++i < tokenizer.size() && tokenizer[i] != "]") {
      size_t value_index = i;
      tokenizer.checkToken(++i, ":");
      size_t value = tokenizer.toSizeT(++i);
      TORCH_CHECK(
          value == 0 || llvm::isPowerOf2_64(value),
          "For roundups, the divisions has to be power of 2 or 0 to disable roundup ");

      if (tokenizer[value_index] == ">") {
        // Fill every bucket past the most recently set one with this value.
        std::fill(
            std::next(
                roundup_power2_divisions_.begin(),
                static_cast<std::vector<size_t>::difference_type>(
                    last_index + 1)),
            roundup_power2_divisions_.end(),
            value);
      } else {
        // Explicit size boundary: must itself be a power of two.
        size_t boundary = tokenizer.toSizeT(value_index);
        TORCH_CHECK(
            llvm::isPowerOf2_64(boundary),
            "For roundups, the intervals have to be power of 2 ");

        // Map the boundary to its bucket index, clamped to the valid range.
        size_t index = 63 - llvm::countLeadingZeros(boundary);
        index =
            std::clamp(index, size_t{0}, roundup_power2_divisions_.size() - 1);

        // The first explicit entry also back-fills all buckets below it.
        if (first_value) {
          std::fill(
              roundup_power2_divisions_.begin(),
              std::next(
                  roundup_power2_divisions_.begin(),
                  static_cast<std::vector<size_t>::difference_type>(index)),
              value);
          first_value = false;
        }
        roundup_power2_divisions_[index] = value;
        last_index = index;
      }

      // Entries inside the brackets are comma-separated.
      if (tokenizer[i + 1] != "]") {
        tokenizer.checkToken(++i, ",");
      }
    }
    TORCH_INTERNAL_ASSERT(
        i < tokenizer.size(),
        "Expected closing bracket ']' in ConfigTokenizer but reached end of config");
  } else { // Keep this for backwards compatibility
    // Scalar form: one power-of-two value applied to every interval.
    size_t value = tokenizer.toSizeT(i);
    TORCH_CHECK(
        llvm::isPowerOf2_64(value),
        "For roundups, the divisions has to be power of 2 ");
    std::fill(
        roundup_power2_divisions_.begin(),
        roundup_power2_divisions_.end(),
        value);
  }
  return i;
}
177+
178+
size_t AcceleratorAllocatorConfig::parseExpandableSegments(
179+
const ConfigTokenizer& tokenizer,
180+
size_t i) {
181+
tokenizer.checkToken(++i, ":");
182+
use_expandable_segments_ = tokenizer.toBool(++i);
183+
184+
return i;
185+
}
186+
187+
size_t AcceleratorAllocatorConfig::parsePinnedUseBackgroundThreads(
188+
const ConfigTokenizer& tokenizer,
189+
size_t i) {
190+
tokenizer.checkToken(++i, ":");
191+
pinned_use_background_threads_ = tokenizer.toBool(++i);
192+
193+
return i;
194+
}
195+
196+
void AcceleratorAllocatorConfig::parseArgs(const std::string& env) {
197+
// The following option will be reset to its default value if not explicitly
198+
// set each time.
199+
max_split_size_ = std::numeric_limits<size_t>::max();
200+
roundup_power2_divisions_.assign(kRoundUpPowerOfTwoIntervals, 0);
201+
garbage_collection_threshold_ = 0;
202+
203+
{
204+
std::lock_guard<std::mutex> lock(last_allocator_settings_mutex_);
205+
last_allocator_settings_ = env;
206+
}
207+
208+
ConfigTokenizer tokenizer(env);
209+
for (size_t i = 0; i < tokenizer.size(); i++) {
210+
const auto& key = tokenizer[i];
211+
if (key == "max_split_size_mb") {
212+
i = parseMaxSplitSize(tokenizer, i);
213+
} else if (key == "max_non_split_rounding_mb") {
214+
i = parseMaxNonSplitRoundingSize(tokenizer, i);
215+
} else if (key == "garbage_collection_threshold") {
216+
i = parseGarbageCollectionThreshold(tokenizer, i);
217+
} else if (key == "roundup_power2_divisions") {
218+
i = parseRoundUpPower2Divisions(tokenizer, i);
219+
} else if (key == "expandable_segments") {
220+
i = parseExpandableSegments(tokenizer, i);
221+
} else if (key == "pinned_use_background_threads") {
222+
i = parsePinnedUseBackgroundThreads(tokenizer, i);
223+
} else {
224+
i = tokenizer.skipKey(i);
225+
}
226+
227+
if (i + 1 < tokenizer.size()) {
228+
tokenizer.checkToken(++i, ",");
229+
}
230+
}
231+
}
232+
233+
} // namespace c10::CachingAllocator

0 commit comments

Comments
 (0)