Skip to content

Instantly share code, notes, and snippets.

@mattfsourcecode
Last active May 8, 2025 02:14
Show Gist options
  • Save mattfsourcecode/0e5318db3b631b4290d3bc69e865e9d3 to your computer and use it in GitHub Desktop.
Save mattfsourcecode/0e5318db3b631b4290d3bc69e865e9d3 to your computer and use it in GitHub Desktop.
Analysis using SYCL with Intel oneAPI to compare CPU and GPU capabilities, with each processor computing approximately 1.07 billion floating-point operations in neural network-like workloads. Logs provide insights into speed differences, GFLOPS performance, and detailed metrics on timing, error, and accuracy.
/*
* CPU vs GPU benchmark with a neural network-like workload involving matrix multiplication and ReLU activation.
*
* Compile and run on Linux:
* icpx -fsycl -O2 cpu_gpu_nn_compare.cpp -o cpu_gpu_nn_compare && ./cpu_gpu_nn_compare
*/
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <vector>

#include <sycl/sycl.hpp>
// Rectified linear unit: positive inputs pass through unchanged; every other
// input (zero, negatives, and NaN, for which the comparison is false) yields 0.
float relu(float x) {
    if (x > 0.0f) {
        return x;
    }
    return 0.0f;
}
// Error analysis utilities

/// Summary of the element-wise absolute differences between two float arrays.
/// All members default to zero so a default-constructed value is well defined.
struct ErrorStats {
    float mean_error = 0.0f;          ///< Mean of |ref[i] - test[i]|.
    float std_dev = 0.0f;             ///< Population standard deviation of the absolute differences.
    float max_diff = 0.0f;            ///< Largest absolute difference.
    float min_diff = 0.0f;            ///< Smallest absolute difference.
    float max_relative_error = 0.0f;  ///< Largest |diff| / |ref[i]| over elements where ref[i] != 0.
    size_t max_diff_index = 0;        ///< Index of the first element that attains max_diff.
};

/// Compares `test` against `ref` element by element.
///
/// @param ref   Reference values; must point to at least `size` floats.
/// @param test  Values under test; must point to at least `size` floats.
/// @param size  Number of elements to compare.
/// @return Statistics of the absolute differences. When `size` is 0 an
///         all-zero ErrorStats is returned and neither pointer is read.
ErrorStats compute_error_stats(const float* ref, const float* test, size_t size) {
    // Nothing to compare: avoid 0/0 and a meaningless FLT_MAX minimum.
    if (size == 0) {
        return ErrorStats{};
    }

    // Accumulate in double: summing hundreds of thousands of small floats in
    // single precision loses low-order bits and skews the mean and variance.
    double diff_sum = 0.0;
    float max_diff = 0.0f;
    float min_diff = std::numeric_limits<float>::max();
    float max_relative = 0.0f;
    size_t max_idx = 0;

    for (size_t i = 0; i < size; ++i) {
        const float diff = std::abs(ref[i] - test[i]);
        diff_sum += diff;
        if (diff > max_diff) {
            max_diff = diff;
            max_idx = i;
        }
        min_diff = std::min(min_diff, diff);
        // Relative error is undefined for a zero reference value; skip those.
        if (ref[i] != 0.0f) {
            max_relative = std::max(max_relative, diff / std::abs(ref[i]));
        }
    }

    const double count = static_cast<double>(size);
    const double mean = diff_sum / count;

    // Second pass recomputes each difference instead of storing them, which
    // avoids a scratch allocation as large as the inputs.
    double squared_dev_sum = 0.0;
    for (size_t i = 0; i < size; ++i) {
        const double deviation = static_cast<double>(std::abs(ref[i] - test[i])) - mean;
        squared_dev_sum += deviation * deviation;
    }

    return ErrorStats{
        static_cast<float>(mean),
        static_cast<float>(std::sqrt(squared_dev_sum / count)),
        max_diff,
        min_diff,
        max_relative,
        max_idx
    };
}
// Matrix multiplication and ReLU
//
// Computes output = ReLU(input * weights) on the host.
//
//   input   : batch_size x input_dim,  row-major
//   weights : input_dim  x output_dim, row-major
//   output  : batch_size x output_dim, row-major (fully overwritten)
//
// Progress is written to std::cout every 16 batch rows. If the OpenMP pragma
// is honoured by the compiler, rows run on several threads and the printed
// percentages may appear out of order.
void matrix_multiply_relu(const float* input, const float* weights, float* output,
                          size_t batch_size, size_t input_dim, size_t output_dim) {
    std::cout << "\rCPU Progress: 0%" << std::flush;
    #pragma omp parallel for
    for (size_t b = 0; b < batch_size; b++) {
        if (b % 16 == 0) {
            float progress = (b * 100.0f) / batch_size;
            std::cout << "\rCPU Progress: " << progress << "%" << std::flush;
        }

        const float* in_row = input + b * input_dim;
        float* out_row = output + b * output_dim;
        std::fill(out_row, out_row + output_dim, 0.0f);

        // Loop order (j outer, i inner) walks one contiguous row of `weights`
        // per step instead of striding down a column. Each output element
        // still accumulates its products in the order j = 0..input_dim-1.
        for (size_t j = 0; j < input_dim; j++) {
            const float in_val = in_row[j];
            const float* w_row = weights + j * output_dim;
            for (size_t i = 0; i < output_dim; i++) {
                out_row[i] += in_val * w_row[i];
            }
        }

        // ReLU, written with the same expression the GPU kernel uses.
        for (size_t i = 0; i < output_dim; i++) {
            out_row[i] = out_row[i] > 0.0f ? out_row[i] : 0.0f;
        }
    }
    std::cout << "\rCPU Progress: 100%" << std::endl;
}
// Writes a short summary of `dev` (name, vendor, compute units, memory sizes,
// maximum work-group size) to std::cout. Global memory is shown in units of
// 1024*1024 bytes and local memory in units of 1024 bytes, both truncated by
// integer division.
void print_device_info(const sycl::device& dev) {
    std::cout << "\nDevice Information:" << std::endl;

    const auto device_name = dev.get_info<sycl::info::device::name>();
    std::cout << "  Name: " << device_name << std::endl;

    const auto vendor_name = dev.get_info<sycl::info::device::vendor>();
    std::cout << "  Vendor: " << vendor_name << std::endl;

    const auto compute_units = dev.get_info<sycl::info::device::max_compute_units>();
    std::cout << "  Max Compute Units: " << compute_units << std::endl;

    const auto global_mem_mb = dev.get_info<sycl::info::device::global_mem_size>() / (1024*1024);
    std::cout << "  Global Mem Size: " << global_mem_mb << " MB" << std::endl;

    const auto local_mem_kb = dev.get_info<sycl::info::device::local_mem_size>() / 1024;
    std::cout << "  Local Mem Size: " << local_mem_kb << " KB" << std::endl;

    const auto max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
    std::cout << "  Max Work Group Size: " << max_wg_size << std::endl;
}
int main() {
try {
const size_t batch_size = 256;
const size_t input_dim = 2048;
const size_t output_dim = 1024;
const size_t weights_size = input_dim * output_dim;
const size_t input_size = batch_size * input_dim;
const size_t output_size = batch_size * output_dim;
// Calculate memory requirements
const size_t total_mem = (input_size + weights_size + output_size) * sizeof(float);
std::cout << "Memory Requirements:" << std::endl;
std::cout << " Input: " << (input_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
std::cout << " Weights: " << (weights_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
std::cout << " Output: " << (output_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
std::cout << " Total: " << total_mem / (1024*1024) << " MB" << std::endl;
// Initialize with proper neural network distributions
std::random_device rd{};
std::mt19937 gen{rd()};
float weight_scale = std::sqrt(2.0f / (input_dim + output_dim)); // Xavier initialization
std::normal_distribution<float> input_dist(0.0f, 1.0f); // Standard normal for inputs
std::normal_distribution<float> weight_dist(0.0f, weight_scale); // Xavier scaled normal for weights
std::vector<float> input_data(input_size);
std::vector<float> weights(weights_size);
std::vector<float> output_cpu(output_size);
std::vector<float> output_gpu(output_size);
// Generate random inputs and weights
for(size_t i = 0; i < input_size; i++) {
input_data[i] = input_dist(gen);
}
for(size_t i = 0; i < weights_size; i++) {
weights[i] = weight_dist(gen);
}
// Calculate input statistics
float input_mean = std::accumulate(input_data.begin(), input_data.end(), 0.0f) / input_size;
float input_var = 0.0f;
for(const auto& val : input_data) {
input_var += (val - input_mean) * (val - input_mean);
}
input_var /= input_size;
// Calculate weight statistics
float weight_mean = std::accumulate(weights.begin(), weights.end(), 0.0f) / weights_size;
float weight_var = 0.0f;
for(const auto& val : weights) {
weight_var += (val - weight_mean) * (val - weight_mean);
}
weight_var /= weights_size;
std::cout << "\nInitialization Statistics:" << std::endl;
std::cout << " Input Distribution: mean=" << input_mean
<< ", std=" << std::sqrt(input_var) << std::endl;
std::cout << " Weight Distribution: mean=" << weight_mean
<< ", std=" << std::sqrt(weight_var)
<< " (target=" << weight_scale << ")" << std::endl;
// CPU Computation
auto cpu_start = std::chrono::high_resolution_clock::now();
matrix_multiply_relu(input_data.data(), weights.data(), output_cpu.data(),
batch_size, input_dim, output_dim);
auto cpu_end = std::chrono::high_resolution_clock::now();
auto cpu_duration = std::chrono::duration_cast<std::chrono::milliseconds>(cpu_end - cpu_start);
// GPU Computation
sycl::queue Q{sycl::gpu_selector_v};
print_device_info(Q.get_device());
// Number of warmup iterations
const int num_warmup = 3;
const int num_iterations = 10;
// Allocate USM memory
float* input_usm = sycl::malloc_device<float>(input_size, Q);
float* weights_usm = sycl::malloc_device<float>(weights_size, Q);
float* output_usm = sycl::malloc_device<float>(output_size, Q);
// Warm up transfers
std::cout << "\nWarming up transfers..." << std::endl;
for(int i = 0; i < num_warmup; i++) {
Q.memcpy(input_usm, input_data.data(), input_size * sizeof(float));
Q.memcpy(weights_usm, weights.data(), weights_size * sizeof(float));
Q.memcpy(output_gpu.data(), output_usm, output_size * sizeof(float));
Q.wait();
}
// Measure H2D transfer
std::cout << "Measuring Host to Device transfer..." << std::endl;
std::vector<double> h2d_times;
for(int i = 0; i < num_iterations; i++) {
auto h2d_start = std::chrono::high_resolution_clock::now();
Q.memcpy(input_usm, input_data.data(), input_size * sizeof(float));
Q.memcpy(weights_usm, weights.data(), weights_size * sizeof(float));
Q.wait();
auto h2d_end = std::chrono::high_resolution_clock::now();
h2d_times.push_back(std::chrono::duration<double, std::milli>(h2d_end - h2d_start).count());
}
// Calculate H2D statistics
double h2d_mean = std::accumulate(h2d_times.begin(), h2d_times.end(), 0.0) / num_iterations;
double h2d_size_gb = (input_size + weights_size) * sizeof(float) / 1e9;
double h2d_bandwidth = h2d_size_gb / (h2d_mean / 1000.0);
auto gpu_start = std::chrono::high_resolution_clock::now();
Q.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<2>{batch_size, output_dim}, [=](auto idx) {
const size_t b = idx[0]; // batch index
const size_t i = idx[1]; // output dimension index
float sum = 0.0f;
for(size_t j = 0; j < input_dim; j++) {
sum += input_usm[b * input_dim + j] * weights_usm[j * output_dim + i];
}
output_usm[b * output_dim + i] = sum > 0.0f ? sum : 0.0f; // ReLU
});
}).wait();
// Measure D2H transfer
std::cout << "Measuring Device to Host transfer..." << std::endl;
std::vector<double> d2h_times;
for(int i = 0; i < num_iterations; i++) {
auto d2h_start = std::chrono::high_resolution_clock::now();
Q.memcpy(output_gpu.data(), output_usm, output_size * sizeof(float));
Q.wait();
auto d2h_end = std::chrono::high_resolution_clock::now();
d2h_times.push_back(std::chrono::duration<double, std::milli>(d2h_end - d2h_start).count());
}
// Calculate D2H statistics
double d2h_mean = std::accumulate(d2h_times.begin(), d2h_times.end(), 0.0) / num_iterations;
double d2h_size_gb = output_size * sizeof(float) / 1e9;
double d2h_bandwidth = d2h_size_gb / (d2h_mean / 1000.0);
// Free USM memory
sycl::free(input_usm, Q);
sycl::free(weights_usm, Q);
sycl::free(output_usm, Q);
auto gpu_end = std::chrono::high_resolution_clock::now();
auto gpu_duration = std::chrono::duration_cast<std::chrono::milliseconds>(gpu_end - gpu_start);
auto compute_duration = std::chrono::duration_cast<std::chrono::milliseconds>(gpu_end - gpu_start);
// Verify results
// Compute error statistics
auto stats = compute_error_stats(output_cpu.data(), output_gpu.data(), output_size);
std::cout << "\nPerformance Metrics:" << std::endl;
double total_ops = 2.0 * batch_size * input_dim * output_dim; // multiply-add counts as 2 ops
std::cout << " Total Operations: " << total_ops << std::endl;
std::cout << " Effective GFLOPS: " << (total_ops * 1e-9) / (gpu_duration.count() * 1e-3) << std::endl;
std::cout << "\nMemory Requirements:" << std::endl;
double total_bytes = (input_size + weights_size + output_size) * sizeof(float);
std::cout << " Total Memory: " << total_bytes / (1024*1024) << " MB" << std::endl;
std::cout << "\nMemory Transfer Performance:" << std::endl;
std::cout << " H2D Transfer:" << std::endl;
std::cout << " Time: " << h2d_mean << " ± " << h2d_mean * 0.1 << " ms" << std::endl;
std::cout << " Data Size: " << h2d_size_gb * 1000 << " MB" << std::endl;
std::cout << " Bandwidth: " << h2d_bandwidth << " GB/s" << std::endl;
std::cout << " D2H Transfer:" << std::endl;
std::cout << " Time: " << d2h_mean << " ± " << d2h_mean * 0.1 << " ms" << std::endl;
std::cout << " Data Size: " << d2h_size_gb * 1000 << " MB" << std::endl;
std::cout << " Bandwidth: " << d2h_bandwidth << " GB/s" << std::endl;
std::cout << "\nCompute Performance:" << std::endl;
std::cout << " Compute Time: " << compute_duration.count() << " ms" << std::endl;
std::cout << " CPU Time: " << cpu_duration.count() << " ms" << std::endl;
std::cout << " GPU Time (Total): " << gpu_duration.count() << " ms" << std::endl;
std::cout << "\nError Analysis:" << std::endl;
std::cout << " Mean Error: " << stats.mean_error << std::endl;
std::cout << " Std Dev: " << stats.std_dev << std::endl;
std::cout << " Max Difference: " << stats.max_diff << " (at index " << stats.max_diff_index << ")" << std::endl;
std::cout << " Min Difference: " << stats.min_diff << std::endl;
std::cout << " Max Relative Error: " << stats.max_relative_error * 100 << "%" << std::endl;
// Print sample values for comparison
std::cout << "\nSample Values Comparison:" << std::endl;
for(size_t i = 0; i < 5; i++) {
size_t idx = i * (output_size / 5);
std::cout << " Index " << idx << ": CPU=" << output_cpu[idx]
<< ", GPU=" << output_gpu[idx]
<< ", Diff=" << std::abs(output_cpu[idx] - output_gpu[idx]) << std::endl;
}
std::cout << "\nGFLOPS Comparison:" << std::endl;
float ops = (2.0f * batch_size * input_dim * output_dim) / 1e9; // Multiply-add counts as 2 ops
std::cout << "CPU GFLOPS: " << ops/(cpu_duration.count()/1000.0f) << std::endl;
std::cout << "GPU GFLOPS: " << ops/(compute_duration.count()/1000.0f) << std::endl;
std::cout << "\nTiming Comparison:" << std::endl;
std::cout << "CPU Time: " << cpu_duration.count() << "ms" << std::endl;
std::cout << "GPU Time: " << gpu_duration.count() << "ms" << std::endl;
std::cout << "Speedup: " << float(cpu_duration.count())/gpu_duration.count() << "x" << std::endl;
if (cpu_duration.count() < gpu_duration.count()) {
std::cout << "CPU was faster by " << gpu_duration.count() - cpu_duration.count() << "ms" << std::endl;
} else {
std::cout << "GPU was faster by " << cpu_duration.count() - gpu_duration.count() << "ms" << std::endl;
}
return 0;
} catch (const sycl::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
}
@mattfsourcecode
Copy link
Author

~ source /opt/intel/oneapi/setvars.sh --force && ./nn_compare
 
:: initializing oneAPI environment ...
   zsh: ZSH_VERSION = 5.8.1
   args: Using "$@" for setvars.sh arguments: --force
:: advisor -- latest
:: ccl -- latest
:: compiler -- latest
:: dal -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: intelpython -- latest
:: ipp -- latest
:: ippcp -- latest
:: mkl -- latest
:: modelzoo -- latest
:: mpi -- latest
:: neural-compressor -- latest
:: pti -- latest
:: tbb -- latest
:: umf -- latest
:: vtune -- latest
:: oneAPI environment initialized ::
 
Device: Intel(R) UHD Graphics 620
CPU Time: 2147ms
GPU Time: 35ms
Speedup: 61.3429x
GPU was faster by 2112ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment