// Allocate device memory
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
// Copy result back to host
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
return 0;

# Compile
nvcc -o vector_add vector_add.cu

# Run
./vector_add

# Makefile for larger projects
CUDA_PATH ?= /usr/local/cuda
NVCC = $(CUDA_PATH)/bin/nvcc
NVCC_FLAGS = -arch=sm_75 -O3 -std=c++17
TARGET = vector_add
SOURCES = vector_add.cu

cuda toolkit
all: $(TARGET)
run: $(TARGET)
	./$(TARGET)
// Verify result
bool correct = true;
for (int i = 0; i < n; i++) {
    if (abs(h_c[i] - (h_a[i] + h_b[i])) > 1e-5) {
        correct = false;
        break;
    }
}
std::cout << (correct ? "SUCCESS" : "FAILURE") << std::endl; // Allocate device memory float *d_a, *d_b, *d_c;
// Copy data to device
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);