Unit tests and benchmark for subgroup2 and workgroup2 stuff #192
base: master
Conversation
static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
{
    using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
    using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
Don't use JIT traits; you need to benchmark both (compile the pipeline twice), see #190 (comment).
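A minimal sketch of compiling twice, assuming the `options` and `compiler` objects from the compilation excerpt further down, and a hypothetical USE_NATIVE_SUBGROUP_OPS define consumed by the shader in place of the JIT traits:

// hedged sketch: compile the same source once per capability set
// instead of relying on nbl::hlsl::jit::device_capabilities
for (const bool nativeSubgroupOps : {false, true})
{
    const IShaderCompiler::SMacroDefinition capDefine = { "USE_NATIVE_SUBGROUP_OPS", nativeSubgroupOps ? "1" : "0" }; // define name hypothetical
    options.preprocessorOptions.extraDefines = { &capDefine, &capDefine + 1 };
    auto spirv = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
    // ... create one pipeline per compilation and benchmark both
}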
// method emulations on the CPU, to verify the results of the GPU methods
template<class Binop>
struct emulatedReduction
{
    using type_t = typename Binop::type_t;

    static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
    {
        const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop());
        std::fill(out,out+itemCount,red);
    }

    static inline constexpr const char* name = "reduction";
};
template<class Binop>
struct emulatedScanInclusive
{
    using type_t = typename Binop::type_t;

    static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
    {
        std::inclusive_scan(in,in+itemCount,out,Binop());
    }
    static inline constexpr const char* name = "inclusive_scan";
};
template<class Binop>
struct emulatedScanExclusive
{
    using type_t = typename Binop::type_t;

    static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount)
    {
        std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop());
    }
    static inline constexpr const char* name = "exclusive_scan";
};
Example 23 does that. Don't check results, so you can benchmark faster.
ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
if (!swapchainParams.deduceFormat(m_physicalDevice))
    return logFail("Could not choose a Surface Format for the Swapchain!");
To achieve #190 (comment) you just need to add EUF_STORAGE usage here.
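Something like the following, assuming the swapchain's shared creation params expose an imageUsage bitflag (field names assumed):

// hedged sketch: request storage usage so compute can write the swapchain images
swapchainParams.sharedParams.imageUsage |= IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT; // field path assumed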
// TODO: get the element count from argv
const uint32_t elementCount = Output<>::ScanElementCount;
// populate our random data buffer on the CPU and create a GPU copy
inputData = new uint32_t[elementCount];
{
    std::mt19937 randGenerator(0xdeadbeefu);
    for (uint32_t i = 0u; i < elementCount; i++)
        inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all

    IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
    inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount;
    inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
    m_utils->createFilledDeviceLocalBufferOnDedMem(
        SIntendedSubmitInfo{.queue=getTransferUpQueue()},
        std::move(inputDataBufferCreationParams),
        inputData
    ).move_into(gpuinputDataBuffer);
}
Use PCG or another hash in the shader and get rid of the input buffer; it will reduce the data access overhead.
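For reference, the well-known 32-bit PCG hash is tiny and works verbatim in both C++ and Nabla's HLSL dialect; each invocation can derive its input from globalIndex() instead of reading a buffer:

// the standard pcg_hash: permuted LCG step + output mixing
uint32_t pcg_hash(uint32_t v)
{
    const uint32_t state = v * 747796405u + 2891336453u;
    const uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
    return (word >> 22u) ^ word;
}

// e.g. in the shader: const uint32_t sourceVal = pcg_hash(globalIndex());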
// create buffer to store BDA of output buffers
{
    std::array<uint64_t, OutputBufferCount> outputAddresses;
    for (uint32_t i = 0; i < OutputBufferCount; i++)
        outputAddresses[i] = outputBuffers[i]->getDeviceAddress();

    IGPUBuffer::SCreationParams params;
    params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_INLINE_UPDATE_VIA_CMDBUF | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
    params.size = OutputBufferCount * sizeof(uint64_t);
    m_utils->createFilledDeviceLocalBufferOnDedMem(SIntendedSubmitInfo{ .queue = getTransferUpQueue() }, std::move(params), outputAddresses.data()).move_into(gpuOutputAddressesBuffer);
}
pc.inputBufAddress = gpuinputDataBuffer->getDeviceAddress();
pc.outputAddressBufAddress = gpuOutputAddressesBuffer->getDeviceAddress();
I'd prefer the push constants to hold an array of output addresses; this way you're doing one extra BDA load just to find where you're storing to (huge latency, as it makes every store dependent on that load).
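A sketch of the suggested layout; struct and field names follow the existing code, the array replacing outputAddressBufAddress is the new part:

// hedged sketch: addresses inlined in the push constants, no extra BDA load
struct PushConstantData
{
    uint64_t inputBufAddress;
    uint64_t outputBufAddresses[OutputBufferCount]; // filled from outputBuffers[i]->getDeviceAddress()
};

With this, gpuOutputAddressesBuffer and the whole filled-buffer upload above can be dropped.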
binding[1].count = OutputBufferCount;
dsLayout = m_device->createDescriptorSetLayout(binding);
huh, aren't you using BDA for output?
smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout;
{
    IGPUDescriptorSetLayout::SBinding binding[1];
    binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
Nitpick: if you make it an array large enough to hold all your swapchain images, then it only needs ECF_PARTIALLY_BOUND_BIT (because the swapchain may be created with fewer images than your max).
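Roughly like this, assuming ISwapchain::MaxImages (or any other compile-time upper bound) exists for the array size:

// hedged sketch: one binding sized for the maximum possible swapchain image count
binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
    IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT, // actual swapchain may have fewer images
    IShader::E_SHADER_STAGE::ESS_COMPUTE,ISwapchain::MaxImages,nullptr }; // constant name assumed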
// const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations;
const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
unused var?
        benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
}
else
{
    for (uint32_t i = 0; i < workgroupSizes.size(); i++)
        benchSets[i] = createBenchmarkPipelines<ArithmeticOp, DoWorkgroupBenchmarks>(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
benchmark with MAX subgroup size, not MIN
template<class BinOp>
using ArithmeticOp = emulatedReduction<BinOp>; // change this to test other arithmetic ops
Bench all 3 within a single application run (separate pipelines, of course).
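I.e. instead of the compile-time alias, something along these lines, with runBenchesFor a hypothetical helper wrapping the current per-op pipeline setup:

// hedged sketch: one set of pipelines per op, all in a single run
runBenchesFor<emulatedReduction>();     // helper name hypothetical
runBenchesFor<emulatedScanInclusive>();
runBenchesFor<emulatedScanExclusive>();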
uint32_t* inputData = nullptr;
smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
constexpr static inline uint32_t OutputBufferCount = 8u;
smart_refctd_ptr<IGPUBuffer> outputBuffers[OutputBufferCount];
smart_refctd_ptr<IGPUBuffer> gpuOutputAddressesBuffer;
PushConstantData pc;

smart_refctd_ptr<ISemaphore> sema;
uint64_t timelineValue = 0;
smart_refctd_ptr<ICPUBuffer> resultsBuffer;
You don't compare results, so get rid of inputData and resultsBuffer.
void logTestOutcome(bool passed, uint32_t workgroupSize)
{
    if (passed)
        m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
    else
    {
        totalFailCount++;
        m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
    }
}
remove
options.preprocessorOptions.includeFinder = includeFinder;

const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow
Make an NBL_CONSTEXPR, non-templated variant of the config to use in C++. I was about to ask for that, because I realized that from the C++ side the utility is cumbersome to use.
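A rough shape of what that could look like; all names are hypothetical, with plain constexpr standing in for whatever NBL_CONSTEXPR macro is appropriate, and the math mirroring the calculateItemsPerWorkgroup helper quoted later:

// hedged sketch of a non-templated, host-usable config
struct SArithmeticConfiguration
{
    constexpr uint32_t computeItemsPerWorkgroup() const
    {
        const uint32_t workgroupSize = 1u << workgroupSizeLog2;
        const uint32_t subgroupSize = 1u << subgroupSizeLog2;
        if (workgroupSize <= subgroupSize)
            return workgroupSize * itemsPerInvocation;
        // same level count selection as the templated workgroup2::ArithmeticConfiguration
        const uint32_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1u :
            (workgroupSizeLog2 > subgroupSizeLog2 * 2u + 2u) ? 3u : 2u;
        const uint32_t virtualWorkgroupSizeLog2 = (subgroupSizeLog2 * levels > workgroupSizeLog2) ? subgroupSizeLog2 * levels : workgroupSizeLog2;
        return itemsPerInvocation << virtualWorkgroupSizeLog2;
    }

    uint32_t workgroupSizeLog2;
    uint32_t subgroupSizeLog2;
    uint32_t itemsPerInvocation;
};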
const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
const std::string definitions[7] = {
    "workgroup2::" + arith_name,
    std::to_string(workgroupSizeLog2),
    std::to_string(itemsPerWG),
    std::to_string(itemsPerInvoc),
    std::to_string(subgroupSizeLog2),
    std::to_string(numLoops),
    std::to_string(arith_name=="reduction")
};

const IShaderCompiler::SMacroDefinition defines[7] = {
    { "OPERATION", definitions[0] },
    { "WORKGROUP_SIZE_LOG2", definitions[1] },
    { "ITEMS_PER_WG", definitions[2] },
    { "ITEMS_PER_INVOCATION", definitions[3] },
    { "SUBGROUP_SIZE_LOG2", definitions[4] },
    { "NUM_LOOPS", definitions[5] },
    { "IS_REDUCTION", definitions[6] }
};
options.preprocessorOptions.extraDefines = { defines, defines + 7 };

overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
}
else
{
    const std::string definitions[5] = {
        "subgroup2::" + arith_name,
        std::to_string(workgroupSize),
        std::to_string(itemsPerInvoc),
        std::to_string(subgroupSizeLog2),
        std::to_string(numLoops)
    };
Also, it would be awesome if the said NBL_CONSTEXPR config had a method to spit out its templated counterpart's instantiated type as a std::string.
Can be like:
template<template<typename> class Binop>
using {PREFIX}Config{SUFFIX} = nbl::hlsl::workgroup2::Config<Binop, ...stuff from the runtime struct...>;
Make one for subgroup too.
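Concretely, on the runtime config sketched earlier, the string-emitting method could look something like this; the template parameter order is an assumption:

// hedged sketch: emit the instantiated templated type as text for the HLSL side
std::string getWorkgroupConfigTypeString() const
{
    return "nbl::hlsl::workgroup2::ArithmeticConfiguration<" +
        std::to_string(workgroupSizeLog2) + "," +
        std::to_string(subgroupSizeLog2) + "," +
        std::to_string(itemsPerInvocation) + ">"; // parameter order assumed
}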
// barrier transition to GENERAL
{
    IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
    imageBarriers[0].barrier = {
        .dep = {
            .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
            .srcAccessMask = ACCESS_FLAGS::NONE,
            .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
            .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
        }
    };
    imageBarriers[0].image = dummyImg.get();
    imageBarriers[0].subresourceRange = {
        .aspectMask = IImage::EAF_COLOR_BIT,
        .baseMipLevel = 0u,
        .levelCount = 1u,
        .baseArrayLayer = 0u,
        .layerCount = 1u
    };
    imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
    imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL;

    cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
}
Does Nsight stop capturing if you don't transition the image?
I already mentioned this in #190 (comment).
// bind dummy image
IGPUImageView::SCreationParams viewParams = {
    .flags = IGPUImageView::ECF_NONE,
    .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
    .image = dummyImg,
    .viewType = IGPUImageView::ET_2D,
    .format = dummyImg->getCreationParameters().format
};
auto dummyImgView = m_device->createImageView(std::move(viewParams));

video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
dsInfo.desc = dummyImgView;

IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] =
{
    {
        .dstSet = benchDs.get(),
        .binding = 2u,
        .arrayElement = 0u,
        .count = 1u,
        .info = &dsInfo,
    }
};
m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr);
This needs to go; the swapchain images need to be written into the descriptor set at application startup.
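A sketch of the startup write, reusing the descriptor-write API shown above; the image count accessor chain and the swapchainImageViews array are assumptions:

// hedged sketch: write every swapchain image view once, right after swapchain creation
const uint32_t imageCount = m_surface->getSwapchainResources()->getSwapchain()->getImageCount(); // accessor chain assumed
std::vector<IGPUDescriptorSet::SDescriptorInfo> infos(imageCount);
std::vector<IGPUDescriptorSet::SWriteDescriptorSet> writes(imageCount);
for (uint32_t i = 0u; i < imageCount; i++)
{
    infos[i].info.image.imageLayout = IImage::LAYOUT::GENERAL;
    infos[i].desc = swapchainImageViews[i]; // views created at startup, name hypothetical
    writes[i] = { .dstSet = benchDs.get(), .binding = 2u, .arrayElement = i, .count = 1u, .info = infos.data() + i };
}
m_device->updateDescriptorSets(imageCount, writes.data(), 0u, nullptr);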
I already mentioned this in #190 (comment).
resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() });
smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
{
    smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
    if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1}))
    {
        logFail("Failed to create Command Buffers!\n");
        return false;
    }
}
You don't even use this command buffer.
const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize;
const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;

const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize);
you should be benchmarking with max subgroup size, not min
std::string caption = "[Nabla Engine] Geometry Creator";
{
    caption += ", displaying [all objects]";
    m_window->setCaption(caption);
}
m_surface->present(m_currentImageAcquire.imageIndex, rendered);
Don't change the caption every frame; it affects your benchmark. I already mentioned this in #190.
// reflects calculations in workgroup2::ArithmeticConfiguration
uint32_t calculateItemsPerWorkgroup(const uint32_t workgroupSize, const uint32_t subgroupSize, const uint32_t itemsPerInvocation)
{
    if (workgroupSize <= subgroupSize)
        return workgroupSize * itemsPerInvocation;

    const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize);
    const uint8_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);

    const uint16_t levels = (workgroupSizeLog2 == subgroupSizeLog2) ? 1 :
        (workgroupSizeLog2 > subgroupSizeLog2 * 2 + 2) ? 3 : 2;

    const uint16_t itemsPerInvocationProductLog2 = max(workgroupSizeLog2 - subgroupSizeLog2 * levels, 0);
    uint16_t itemsPerInvocation1 = (levels == 3) ? min(itemsPerInvocationProductLog2, 2) : itemsPerInvocationProductLog2;
    itemsPerInvocation1 = uint16_t(1u) << itemsPerInvocation1;

    uint32_t virtualWorkgroupSize = 1u << max(subgroupSizeLog2 * levels, workgroupSizeLog2);

    return itemsPerInvocation * virtualWorkgroupSize;
}
We definitely need a workgroup2::CxprArithmeticConfiguration that does everything the templated one does, but with NBL_CONSTEXPR-labelled methods.
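With the runtime config sketched earlier (names hypothetical), the hand-rolled helper above would reduce to:

// hedged sketch: delegate to the constexpr config instead of duplicating its math
const SArithmeticConfiguration cfg = { (uint32_t)hlsl::findMSB(workgroupSize), (uint32_t)hlsl::findMSB(subgroupSize), itemsPerInvocation };
const uint32_t itemsPerWG = cfg.computeItemsPerWorkgroup();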
    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
}

bool canStore() {return true;}
unused
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename Binop::base_t, N, nbl::hlsl::jit::device_capabilities>;
type_t value = sourceVal;

const uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t), sizeof(uint64_t));
Dependent store (it depends on this read); store all output addresses in the push constants instead.
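With the addresses inlined in the push constants (see the PushConstantData sketch above), the dependent load disappears:

// hedged sketch: direct lookup, no RawBufferLoad before the store
const uint64_t outputBufAddr = pc.outputBufAddresses[Binop::BindingIndex];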
typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;

// final (level 1/2) scan needs to fit in one subgroup exactly
groupshared uint32_t scratch[config_t::SharedScratchElementCount];

struct ScratchProxy
{
    template<typename AccessType, typename IndexType>
    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
    {
        value = scratch[ix];
    }
    template<typename AccessType, typename IndexType>
    void set(const IndexType ix, const AccessType value)
    {
        scratch[ix] = value;
    }

    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
    {
        return nbl::hlsl::glsl::atomicOr(scratch[ix],value);
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        nbl::hlsl::glsl::barrier();
        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
    }
};

template<class Config, class Binop>
struct DataProxy
{
    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);

    // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv
    template<typename AccessType, typename IndexType>
    void get(const IndexType ix, NBL_REF_ARG(dtype_t) value)
    {
        // value = inputValue[ix];
        value = nbl::hlsl::promote<dtype_t>(globalIndex());
    }
    template<typename AccessType, typename IndexType>
    void set(const IndexType ix, const dtype_t value)
    {
        // output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value);
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        nbl::hlsl::glsl::barrier();
        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
    }
};

template<class Config, class Binop>
struct PreloadedDataProxy
{
    using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>;
    static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>);

    NBL_CONSTEXPR_STATIC_INLINE uint32_t PreloadedDataCount = Config::VirtualWorkgroupSize / Config::WorkgroupSize;

    template<typename AccessType, typename IndexType>
    void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
    {
        value = preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2];
    }
    template<typename AccessType, typename IndexType>
    void set(const IndexType ix, const AccessType value)
    {
        preloaded[(ix-nbl::hlsl::workgroup::SubgroupContiguousIndex())>>Config::WorkgroupSizeLog2] = value;
    }

    void preload()
    {
        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
        [unroll]
        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
            preloaded[idx] = vk::RawBufferLoad<dtype_t>(pc.inputBufAddress + (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()) * sizeof(dtype_t));
    }
    void unload()
    {
        const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
        uint64_t outputBufAddr = vk::RawBufferLoad<uint64_t>(pc.outputAddressBufAddress + Binop::BindingIndex * sizeof(uint64_t));
        [unroll]
        for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
            vk::RawBufferStore<dtype_t>(outputBufAddr + sizeof(uint32_t) + sizeof(dtype_t) * (workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()), preloaded[idx], sizeof(uint32_t));
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        nbl::hlsl::glsl::barrier();
        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
    }

    dtype_t preloaded[PreloadedDataCount];
};
Maybe you want to make a common header for ex23 and ex29 with this?
if (globalIndex()==0u)
    vk::RawBufferStore<uint32_t>(outputBufAddr, nbl::hlsl::glsl::gl_SubgroupSize());
Don't store this; it's only for the test.
PreloadedDataProxy<config_t,Binop> dataAccessor;
dataAccessor.preload();
Best to generate the data from a hash.
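For instance, preload() could synthesize its values instead of touching memory, using pcg_hash as sketched earlier:

// hedged sketch: generate inputs procedurally, no input buffer traffic
void preload()
{
    const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x * Config::VirtualWorkgroupSize;
    [unroll]
    for (uint32_t idx = 0; idx < PreloadedDataCount; idx++)
        preloaded[idx] = nbl::hlsl::promote<dtype_t>(pcg_hash(workgroupOffset + idx * Config::WorkgroupSize + nbl::hlsl::workgroup::SubgroupContiguousIndex()));
}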
type_t benchmark()
{
    const type_t sourceVal = vk::RawBufferLoad<type_t>(pc.inputBufAddress + globalIndex() * sizeof(type_t));

    subbench<bit_and<uint32_t> >(sourceVal);
    subbench<bit_xor<uint32_t> >(sourceVal);
    subbench<bit_or<uint32_t> >(sourceVal);
    subbench<plus<uint32_t> >(sourceVal);
    subbench<multiplies<uint32_t> >(sourceVal);
    subbench<minimum<uint32_t> >(sourceVal);
    subbench<maximum<uint32_t> >(sourceVal);
    return sourceVal;
}

uint32_t globalIndex()
{
    return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex();
}

bool canStore()
{
    return nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG;
}
See #190 for how to clean up.