A CPU-vs-GPU benchmark using SYCL with Intel oneAPI: each device performs approximately 1.07 billion floating-point operations in a neural network-like workload (a 256×2048 input multiplied by a 2048×1024 weight matrix, followed by ReLU). The program logs timing, GFLOPS, memory-transfer bandwidth, and detailed numerical-error statistics.
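Before running the benchmark, it can help to confirm which devices the SYCL runtime actually sees. A minimal standalone sketch (hypothetical file name devices.cpp, compiled the same way with icpx -fsycl) that enumerates all visible devices:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
    // Enumerate every device known to the SYCL runtime and print its name and vendor.
    for (const auto& dev : sycl::device::get_devices()) {
        std::cout << (dev.is_gpu() ? "[GPU] " : (dev.is_cpu() ? "[CPU] " : "[other] "))
                  << dev.get_info<sycl::info::device::name>() << " ("
                  << dev.get_info<sycl::info::device::vendor>() << ")\n";
    }
}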
/*
 * CPU vs GPU benchmark with a neural network-like workload: matrix multiplication followed by ReLU activation.
 *
 * Compile and run on Linux (the -qopenmp flag enables the OpenMP pragma on the CPU path):
 * icpx -fsycl -qopenmp -O2 cpu_gpu_nn_compare.cpp -o cpu_gpu_nn_compare && ./cpu_gpu_nn_compare
 */
#include <sycl/sycl.hpp>
#include <iostream>
#include <chrono>
#include <vector>
#include <random>
#include <numeric>
#include <cmath>
#include <iomanip>
#include <algorithm> // std::min, std::max
#include <limits>    // std::numeric_limits
// ReLU activation function
float relu(float x) {
    return (x > 0.0f) ? x : 0.0f;
}
// Error analysis utilities
struct ErrorStats {
    float mean_error;
    float std_dev;
    float max_diff;
    float min_diff;
    float max_relative_error;
    size_t max_diff_index;
};
ErrorStats compute_error_stats(const float* ref, const float* test, size_t size) {
    std::vector<float> diffs(size);
    float sum = 0.0f;
    float max_diff = 0.0f;
    float min_diff = std::numeric_limits<float>::max();
    float max_relative = 0.0f;
    size_t max_idx = 0;
    for(size_t i = 0; i < size; i++) {
        float diff = std::abs(ref[i] - test[i]);
        diffs[i] = diff;
        sum += diff;
        if(diff > max_diff) {
            max_diff = diff;
            max_idx = i;
        }
        min_diff = std::min(min_diff, diff);
        if(ref[i] != 0.0f) {
            float rel_error = diff / std::abs(ref[i]);
            max_relative = std::max(max_relative, rel_error);
        }
    }
    float mean = sum / size;
    float variance = 0.0f;
    for(size_t i = 0; i < size; i++) {
        variance += (diffs[i] - mean) * (diffs[i] - mean);
    }
    variance /= size;
    return ErrorStats{
        mean,
        std::sqrt(variance),
        max_diff,
        min_diff,
        max_relative,
        max_idx
    };
}
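// Note: CPU and GPU results are not expected to match bit-for-bit. The GPU may
// reorder the dot-product reduction and contract multiply+add pairs into FMAs,
// and floating-point addition is not associative, so small differences accumulate.
// The statistics above quantify that divergence rather than treating it as failure.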
// Matrix multiplication followed by ReLU, parallelized over the batch with OpenMP
void matrix_multiply_relu(const float* input, const float* weights, float* output,
                          size_t batch_size, size_t input_dim, size_t output_dim) {
    std::cout << "\rCPU Progress: 0%" << std::flush;
    #pragma omp parallel for
    for(size_t b = 0; b < batch_size; b++) {
        // Progress output from multiple threads may interleave and is only approximate.
        if (b % 16 == 0) {
            float progress = (b * 100.0f) / batch_size;
            std::cout << "\rCPU Progress: " << progress << "%" << std::flush;
        }
        for(size_t i = 0; i < output_dim; i++) {
            float sum = 0.0f;
            for(size_t j = 0; j < input_dim; j++) {
                sum += input[b * input_dim + j] * weights[j * output_dim + i];
            }
            output[b * output_dim + i] = relu(sum);
        }
    }
    std::cout << "\rCPU Progress: 100%" << std::endl;
}
void print_device_info(const sycl::device& dev) {
    std::cout << "\nDevice Information:" << std::endl;
    std::cout << " Name: " << dev.get_info<sycl::info::device::name>() << std::endl;
    std::cout << " Vendor: " << dev.get_info<sycl::info::device::vendor>() << std::endl;
    std::cout << " Max Compute Units: " << dev.get_info<sycl::info::device::max_compute_units>() << std::endl;
    std::cout << " Global Mem Size: " << dev.get_info<sycl::info::device::global_mem_size>() / (1024*1024) << " MB" << std::endl;
    std::cout << " Local Mem Size: " << dev.get_info<sycl::info::device::local_mem_size>() / 1024 << " KB" << std::endl;
    std::cout << " Max Work Group Size: " << dev.get_info<sycl::info::device::max_work_group_size>() << std::endl;
}
int main() {
    try {
        const size_t batch_size = 256;
        const size_t input_dim = 2048;
        const size_t output_dim = 1024;
        const size_t weights_size = input_dim * output_dim;
        const size_t input_size = batch_size * input_dim;
        const size_t output_size = batch_size * output_dim;
        // Calculate memory requirements
        const size_t total_mem = (input_size + weights_size + output_size) * sizeof(float);
        std::cout << "Memory Requirements:" << std::endl;
        std::cout << " Input: " << (input_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
        std::cout << " Weights: " << (weights_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
        std::cout << " Output: " << (output_size * sizeof(float)) / (1024*1024) << " MB" << std::endl;
        std::cout << " Total: " << total_mem / (1024*1024) << " MB" << std::endl;
        // Initialize with proper neural network distributions
        std::random_device rd{};
        std::mt19937 gen{rd()};
        float weight_scale = std::sqrt(2.0f / (input_dim + output_dim)); // Xavier initialization
        std::normal_distribution<float> input_dist(0.0f, 1.0f); // Standard normal for inputs
        std::normal_distribution<float> weight_dist(0.0f, weight_scale); // Xavier-scaled normal for weights
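        // Xavier/Glorot initialization draws weights with std = sqrt(2 / (fan_in + fan_out)),
        // here sqrt(2 / (2048 + 1024)) ≈ 0.0255, which keeps the variance of activations
        // roughly stable across the layer.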
        std::vector<float> input_data(input_size);
        std::vector<float> weights(weights_size);
        std::vector<float> output_cpu(output_size);
        std::vector<float> output_gpu(output_size);
        // Generate random inputs and weights
        for(size_t i = 0; i < input_size; i++) {
            input_data[i] = input_dist(gen);
        }
        for(size_t i = 0; i < weights_size; i++) {
            weights[i] = weight_dist(gen);
        }
        // Calculate input statistics
        float input_mean = std::accumulate(input_data.begin(), input_data.end(), 0.0f) / input_size;
        float input_var = 0.0f;
        for(const auto& val : input_data) {
            input_var += (val - input_mean) * (val - input_mean);
        }
        input_var /= input_size;
        // Calculate weight statistics
        float weight_mean = std::accumulate(weights.begin(), weights.end(), 0.0f) / weights_size;
        float weight_var = 0.0f;
        for(const auto& val : weights) {
            weight_var += (val - weight_mean) * (val - weight_mean);
        }
        weight_var /= weights_size;
        std::cout << "\nInitialization Statistics:" << std::endl;
        std::cout << " Input Distribution: mean=" << input_mean
                  << ", std=" << std::sqrt(input_var) << std::endl;
        std::cout << " Weight Distribution: mean=" << weight_mean
                  << ", std=" << std::sqrt(weight_var)
                  << " (target=" << weight_scale << ")" << std::endl;
        // CPU computation
        auto cpu_start = std::chrono::high_resolution_clock::now();
        matrix_multiply_relu(input_data.data(), weights.data(), output_cpu.data(),
                             batch_size, input_dim, output_dim);
        auto cpu_end = std::chrono::high_resolution_clock::now();
        auto cpu_duration = std::chrono::duration_cast<std::chrono::milliseconds>(cpu_end - cpu_start);
        // GPU computation
        sycl::queue Q{sycl::gpu_selector_v};
        print_device_info(Q.get_device());
        // Number of warmup and timed iterations
        const int num_warmup = 3;
        const int num_iterations = 10;
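        // The first transfers on a fresh queue can carry one-time costs (context creation,
        // lazy allocation), so a few warmup rounds run before anything is timed.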
        // Allocate device USM memory
        float* input_usm = sycl::malloc_device<float>(input_size, Q);
        float* weights_usm = sycl::malloc_device<float>(weights_size, Q);
        float* output_usm = sycl::malloc_device<float>(output_size, Q);
        // Warm up transfers (output_usm is still uninitialized here; the D2H copy only
        // exercises the transfer path, and output_gpu is overwritten again after the kernel)
        std::cout << "\nWarming up transfers..." << std::endl;
        for(int i = 0; i < num_warmup; i++) {
            Q.memcpy(input_usm, input_data.data(), input_size * sizeof(float));
            Q.memcpy(weights_usm, weights.data(), weights_size * sizeof(float));
            Q.memcpy(output_gpu.data(), output_usm, output_size * sizeof(float));
            Q.wait();
        }
        // Measure H2D transfer
        std::cout << "Measuring Host to Device transfer..." << std::endl;
        std::vector<double> h2d_times;
        for(int i = 0; i < num_iterations; i++) {
            auto h2d_start = std::chrono::high_resolution_clock::now();
            Q.memcpy(input_usm, input_data.data(), input_size * sizeof(float));
            Q.memcpy(weights_usm, weights.data(), weights_size * sizeof(float));
            Q.wait();
            auto h2d_end = std::chrono::high_resolution_clock::now();
            h2d_times.push_back(std::chrono::duration<double, std::milli>(h2d_end - h2d_start).count());
        }
        // Calculate H2D statistics (mean and standard deviation over the timed runs)
        double h2d_mean = std::accumulate(h2d_times.begin(), h2d_times.end(), 0.0) / num_iterations;
        double h2d_var = 0.0;
        for(double t : h2d_times) h2d_var += (t - h2d_mean) * (t - h2d_mean);
        double h2d_std = std::sqrt(h2d_var / num_iterations);
        double h2d_size_gb = (input_size + weights_size) * sizeof(float) / 1e9;
        double h2d_bandwidth = h2d_size_gb / (h2d_mean / 1000.0);
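        // Kernel launch: one work-item per output element, a 256 x 1024 global range
        // (262,144 work-items), each computing a 2048-element dot product. Work-group
        // sizing is left to the SYCL runtime since a plain range (not an nd_range) is used.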
        auto gpu_start = std::chrono::high_resolution_clock::now();
        Q.submit([&](sycl::handler& h) {
            h.parallel_for(sycl::range<2>{batch_size, output_dim}, [=](auto idx) {
                const size_t b = idx[0]; // batch index
                const size_t i = idx[1]; // output dimension index
                float sum = 0.0f;
                for(size_t j = 0; j < input_dim; j++) {
                    sum += input_usm[b * input_dim + j] * weights_usm[j * output_dim + i];
                }
                output_usm[b * output_dim + i] = sum > 0.0f ? sum : 0.0f; // ReLU
            });
        }).wait();
        // Time the kernel alone, using a floating-point millisecond duration so a
        // sub-millisecond kernel does not round down to 0 ms.
        auto compute_end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> compute_duration = compute_end - gpu_start;
        // Measure D2H transfer
        std::cout << "Measuring Device to Host transfer..." << std::endl;
        std::vector<double> d2h_times;
        for(int i = 0; i < num_iterations; i++) {
            auto d2h_start = std::chrono::high_resolution_clock::now();
            Q.memcpy(output_gpu.data(), output_usm, output_size * sizeof(float));
            Q.wait();
            auto d2h_end = std::chrono::high_resolution_clock::now();
            d2h_times.push_back(std::chrono::duration<double, std::milli>(d2h_end - d2h_start).count());
        }
        // Calculate D2H statistics (mean and standard deviation over the timed runs)
        double d2h_mean = std::accumulate(d2h_times.begin(), d2h_times.end(), 0.0) / num_iterations;
        double d2h_var = 0.0;
        for(double t : d2h_times) d2h_var += (t - d2h_mean) * (t - d2h_mean);
        double d2h_std = std::sqrt(d2h_var / num_iterations);
        double d2h_size_gb = output_size * sizeof(float) / 1e9;
        double d2h_bandwidth = d2h_size_gb / (d2h_mean / 1000.0);
        // Free USM memory
        sycl::free(input_usm, Q);
        sycl::free(weights_usm, Q);
        sycl::free(output_usm, Q);
        auto gpu_end = std::chrono::high_resolution_clock::now();
        // Total GPU-side wall time: kernel plus the D2H measurement loop and USM frees.
        // compute_duration (measured above) covers the kernel alone.
        std::chrono::duration<double, std::milli> gpu_duration = gpu_end - gpu_start;
        // Verify results: compute error statistics between the CPU and GPU outputs
        auto stats = compute_error_stats(output_cpu.data(), output_gpu.data(), output_size);
        std::cout << "\nPerformance Metrics:" << std::endl;
        // Each multiply-add counts as 2 ops: 2 * 256 * 2048 * 1024 = 2^30 ≈ 1.07e9 operations
        double total_ops = 2.0 * batch_size * input_dim * output_dim;
        std::cout << " Total Operations: " << total_ops << std::endl;
        std::cout << " Effective GFLOPS (kernel + transfers): " << (total_ops * 1e-9) / (gpu_duration.count() * 1e-3) << std::endl;
std::cout << "\nMemory Requirements:" << std::endl; | |
double total_bytes = (input_size + weights_size + output_size) * sizeof(float); | |
std::cout << " Total Memory: " << total_bytes / (1024*1024) << " MB" << std::endl; | |
std::cout << "\nMemory Transfer Performance:" << std::endl; | |
std::cout << " H2D Transfer:" << std::endl; | |
std::cout << " Time: " << h2d_mean << " ± " << h2d_mean * 0.1 << " ms" << std::endl; | |
std::cout << " Data Size: " << h2d_size_gb * 1000 << " MB" << std::endl; | |
std::cout << " Bandwidth: " << h2d_bandwidth << " GB/s" << std::endl; | |
std::cout << " D2H Transfer:" << std::endl; | |
std::cout << " Time: " << d2h_mean << " ± " << d2h_mean * 0.1 << " ms" << std::endl; | |
std::cout << " Data Size: " << d2h_size_gb * 1000 << " MB" << std::endl; | |
std::cout << " Bandwidth: " << d2h_bandwidth << " GB/s" << std::endl; | |
std::cout << "\nCompute Performance:" << std::endl; | |
std::cout << " Compute Time: " << compute_duration.count() << " ms" << std::endl; | |
std::cout << " CPU Time: " << cpu_duration.count() << " ms" << std::endl; | |
std::cout << " GPU Time (Total): " << gpu_duration.count() << " ms" << std::endl; | |
std::cout << "\nError Analysis:" << std::endl; | |
std::cout << " Mean Error: " << stats.mean_error << std::endl; | |
std::cout << " Std Dev: " << stats.std_dev << std::endl; | |
std::cout << " Max Difference: " << stats.max_diff << " (at index " << stats.max_diff_index << ")" << std::endl; | |
std::cout << " Min Difference: " << stats.min_diff << std::endl; | |
std::cout << " Max Relative Error: " << stats.max_relative_error * 100 << "%" << std::endl; | |
        // Print sample values for comparison
        std::cout << "\nSample Values Comparison:" << std::endl;
        for(size_t i = 0; i < 5; i++) {
            size_t idx = i * (output_size / 5);
            std::cout << " Index " << idx << ": CPU=" << output_cpu[idx]
                      << ", GPU=" << output_gpu[idx]
                      << ", Diff=" << std::abs(output_cpu[idx] - output_gpu[idx]) << std::endl;
        }
std::cout << "\nGFLOPS Comparison:" << std::endl; | |
float ops = (2.0f * batch_size * input_dim * output_dim) / 1e9; // Multiply-add counts as 2 ops | |
std::cout << "CPU GFLOPS: " << ops/(cpu_duration.count()/1000.0f) << std::endl; | |
std::cout << "GPU GFLOPS: " << ops/(compute_duration.count()/1000.0f) << std::endl; | |
std::cout << "\nTiming Comparison:" << std::endl; | |
std::cout << "CPU Time: " << cpu_duration.count() << "ms" << std::endl; | |
std::cout << "GPU Time: " << gpu_duration.count() << "ms" << std::endl; | |
std::cout << "Speedup: " << float(cpu_duration.count())/gpu_duration.count() << "x" << std::endl; | |
        if (cpu_duration.count() < gpu_duration.count()) {
            std::cout << "CPU was faster by " << gpu_duration.count() - cpu_duration.count() << "ms" << std::endl;
        } else {
            std::cout << "GPU was faster by " << cpu_duration.count() - gpu_duration.count() << "ms" << std::endl;
        }
        return 0;
    } catch (const sycl::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
}
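If several devices are visible, recent oneAPI runtimes let you steer gpu_selector_v toward a specific backend via the ONEAPI_DEVICE_SELECTOR environment variable, e.g. ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./cpu_gpu_nn_compare (assuming the Level Zero runtime is installed).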