12th October 2025

A CUDA Program for Linear Regression

Optimizing CUDA code to compute OLS from scratch

In this article, I present a set of CUDA kernels to compute Ordinary Least Squares (OLS) regression.
In OLS, we have a matrix X with dimensions N x p, where N is the number of samples and p is the number of parameters, and we minimize the squared L2 norm of (Xb - y).
The closed-form solution is b = (X^T X)^-1 X^T y.
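For readers who want the derivation, the closed form follows from setting the gradient of the squared error to zero:

```latex
\begin{aligned}
\|Xb - y\|_2^2 &= (Xb - y)^\top (Xb - y) \\
\nabla_b \|Xb - y\|_2^2 &= 2X^\top X b - 2X^\top y = 0 \\
\Rightarrow \quad b &= (X^\top X)^{-1} X^\top y
\end{aligned}
```

assuming X^T X is invertible, i.e., X has full column rank.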
Writing efficient CUDA kernels for this is non-trivial because the computation involves several distinct steps. At the time of writing, I'm the 4th person on LeetGPU to solve this problem. The steps are:

  1. Matrix multiplication
  2. Matrix and vector multiplication
  3. Matrix inversion
You could argue that (1) and (2) are the same operation, but optimizing them is quite different.
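A quick way to see why is arithmetic intensity. For square N x N inputs:

```latex
\text{matmul:}\quad \frac{2N^3 \text{ FLOPs}}{3N^2 \text{ values}} = O(N) \text{ FLOPs per value},
\qquad
\text{matvec:}\quad \frac{2N^2 \text{ FLOPs}}{N^2 + 2N \text{ values}} \approx 2 \text{ FLOPs per value}.
```

Matrix multiplication has enough data reuse to benefit from tiling and caching in shared memory, while a matrix-vector product is memory-bandwidth-bound, so the useful optimizations there are coalescing and parallel reductions.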

Matrix Multiplication

How do we optimize matrix multiplication? The obvious baseline is to have each thread compute a single output element. I had ChatGPT write the naive kernel below: each thread walks the corresponding row of A and column of B, multiplying element pairs and accumulating the sum.

__global__ void matmul_naive(const float* __restrict__ A,
                             const float* __restrict__ B,
                             float* __restrict__ C,
                             int N, int M, int K)
{
    // Row and column of output cell
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= N || col >= K) return;
    float acc = 0.0f; // accumulate output in here
    for (int e = 0; e < M; ++e)
        acc += A[row * M + e] * B[e * K + col]; // CUDA arrays are flattened (row-major)
    C[row * K + col] = acc;
}

But this kernel performs a lot of global memory accesses, which are very slow (hundreds of clock cycles). Adjacent output elements share many of the same input reads, so we can exploit that: break both input matrices and the output into patches of size TILE x TILE, and launch blocks of shape TILE x TILE. Each block keeps a shared memory array to cache one patch of A and one patch of B, with each thread loading its own corresponding element. We then multiply the cached patches and accumulate. Reading from shared memory is MUCH faster than from global memory.
The kernel below is my implementation for this tiled technique.

#define TILE 32
__global__ void matmul(const float* A, const float* B, float* out, int N, int M, int K){
    // Shared memory tiles to cache patches of A and B
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    float s = 0; // accumulate the output in s
    for(int x = 0; x < (M + TILE - 1) / TILE; x++){
        int cA = x * TILE + threadIdx.x;
        int rB = x * TILE + threadIdx.y;
        // Threads collaborate to load in a single patch of data
        As[threadIdx.y][threadIdx.x] = (i < N && cA < M) ? A[i * M + cA] : 0;
        Bs[threadIdx.y][threadIdx.x] = (rB < M && j < K) ? B[rB * K + j] : 0;
        __syncthreads();
        // With the loaded data, collect the partial products into s
        #pragma unroll
        for(int k = 0; k < TILE; k++){
            s += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }
        __syncthreads();
    }
    // Write to output
    if(i < N && j < K){
        out[i * K + j] = s;
    }
}

But we also need the transpose, and as you may have noticed, I didn't include a step for it. The truth is that materializing X^T would waste time and memory, so we instead fold the transpose into the multiplication itself.
This results in a similar kernel with slightly different indexing.

__global__ void transpose_matmul(const float* A, const float* B, float *out, int N, int M, int K){
    // Treat A as a transposed matrix, so compute A^T * B
    // Size of A is M x N and of B is M x K
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    float s = 0;
    for(int x = 0; x < (M + TILE - 1) / TILE; x++){
        int rA = x * TILE + threadIdx.x;
        int rB = x * TILE + threadIdx.y;
        // Loading index method changes!
        As[threadIdx.y][threadIdx.x] = (rA < M && i < N) ? A[rA * N + i] : 0;
        Bs[threadIdx.y][threadIdx.x] = (rB < M && j < K) ? B[rB * K + j] : 0;
        __syncthreads();
        #pragma unroll
        for(int k = 0; k < TILE; k++){
            s += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }
        __syncthreads();
    }
    if(i < N && j < K){
        out[i * K + j] = s;
    }
}

Matrix and Vector Multiplication

Here, I implemented the naive solution rather than a reduction tree: each thread computes one output element. It could be parallelized further, but on the platform I was targeting, the number of features is at most 1024, so this was acceptable.

__global__ void transpose_matprod(const float* A, const float* y, float *out, int N, int M){
    // Treat A as a transposed matrix, so compute A^T * y
    // Size of A is N x M and of y is N x 1
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    float s = 0;
    if(i < M){
        // Each thread computes one output element.
        // Can be optimized; see the article "Fastest BatchNorm Kernel on LeetGPU"
        for(int j = 0; j < N; j++){
            s += y[j] * A[j * M + i];
        }
        out[i] = s;
    }
}
__global__ void matprod(const float* A, const float* x, float* y, int N, int M){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    float s = 0;
    // The same optimization can be applied here.
    // Note: we can return early because threads do not collaborate
    // (no synchronization such as __syncthreads() is used)
    for (int k = 0; k < M; ++k) {
        s += A[i * M + k] * x[k];
    }
    y[i] = s;
}

Matrix Inversion

Matrix inversion was the hardest part by far. I implemented the standard Gauss-Jordan elimination on an augmented matrix. The parallelization here covers only the row-reduction steps; the remaining steps are fairly sequential.
The optimizations are:

  • Load current row into shared memory
  • Load current pivot column into shared memory
This greatly reduces global memory accesses.
One thing that helped: since (X^T X) has dimensions num_features x num_features, I knew it would fit in a single block. Otherwise, more complex multi-block techniques would be required.

__global__ void matrix_inverse(float *A, float *out, int N){
    // Single block expected
    int i = threadIdx.x;
    // Initialize out as the identity (the augmented half)
    for (int j = 0; j < N; j++) {
        if (i < N) {
            out[j * N + i] = (i == j) ? 1 : 0;
        }
    }
    __syncthreads();
    // We re-read not only the current row but also the elements
    // in the column we're zeroing, so load them into shared memory
    __shared__ float As[1024]; // row j of A
    __shared__ float Ac[1024]; // column j of A
    __shared__ float Os[1024]; // row j of out
    __shared__ int spivot;     // pivot row index
    // Forward elimination with partial pivoting
    for (int j = 0; j < N; j++) {
        if (i == 0) {
            int p = j;
            float maxv = fabsf(A[j * N + j]);
            for (int r = j + 1; r < N; ++r) {
                float v = fabsf(A[r * N + j]);
                if (v > maxv) { maxv = v; p = r; }
            }
            spivot = p;
        }
        __syncthreads();
        // Swap rows j and spivot in both matrices
        if (spivot != j && i < N) {
            float tA = A[j * N + i];
            A[j * N + i] = A[spivot * N + i];
            A[spivot * N + i] = tA;
            float tO = out[j * N + i];
            out[j * N + i] = out[spivot * N + i];
            out[spivot * N + i] = tO;
        }
        __syncthreads();
        // Load into shared mem
        if (i < N) {
            As[i] = A[j * N + i];
            Ac[i] = A[i * N + j];
            Os[i] = out[j * N + i];
        }
        __syncthreads();
        // Row reduction: every thread reads the pivot value before
        // any thread overwrites As[j], so a barrier is needed here
        float red = As[j];
        __syncthreads();
        if (i < N) {
            As[i] /= red;
            Os[i] /= red;
        }
        __syncthreads();
        // Eliminate below the pivot
        for (int k = j + 1; k < N; k++) {
            if (i < N) {
                float f = Ac[k];
                A[k * N + i] -= f * As[i];
                out[k * N + i] -= f * Os[i];
            }
        }
        __syncthreads();
        // Write out row reduction results
        if (i < N) {
            A[j * N + i] = As[i];
            out[j * N + i] = Os[i];
        }
        __syncthreads();
    }
    // We have the upper triangular matrix
    // Now we're reducing upwards!
    for (int j = N - 1; j >= 0; j--) {
        if (i < N) {
            As[i] = A[j * N + i];
            Ac[i] = A[i * N + j];
            Os[i] = out[j * N + i];
        }
        __syncthreads();
        for (int k = j - 1; k >= 0; k--) {
            if (i < N) {
                float f = Ac[k];
                A[k * N + i] -= f * As[i];
                out[k * N + i] -= f * Os[i];
            }
        }
        __syncthreads();
        if (i < N) {
            A[j * N + i] = As[i];
            out[j * N + i] = Os[i];
        }
        __syncthreads();
    }
}

OLS function

Putting it all together, we can write a single OLS function, assuming X, y, and beta are already allocated appropriately on the device.

extern "C" void ols(const float* X, const float* y, float* beta, int samples, int features) {
    const int N = samples;
    const int M = features;
    // Allocate space for intermediate results
    float *d_XtX = nullptr;
    float *d_Inv = nullptr;
    float *d_z = nullptr;
    cudaMalloc(&d_XtX, sizeof(float) * M * M);
    cudaMalloc(&d_Inv, sizeof(float) * M * M);
    cudaMalloc(&d_z, sizeof(float) * M);
    // Compute X^T X
    dim3 block(TILE, TILE);
    dim3 grid_XtX((M + TILE - 1) / TILE, (M + TILE - 1) / TILE);
    transpose_matmul<<<grid_XtX, block>>>(X, X, d_XtX, M, N, M);
    // Compute (X^T X)^-1
    matrix_inverse<<<1, M>>>(d_XtX, d_Inv, M);
    {
        // Compute X^T y
        int threads = 256;
        int blocks = (M + threads - 1) / threads;
        transpose_matprod<<<blocks, threads>>>(X, y, d_z, N, M);
    }
    {
        // Compute (X^T X)^-1 * (X^T y)
        int threads = 256;
        int blocks = (M + threads - 1) / threads;
        matprod<<<blocks, threads>>>(d_Inv, d_z, beta, M, M);
    }
    // Free allocated buffers
    cudaFree(d_XtX);
    cudaFree(d_Inv);
    cudaFree(d_z);
}
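As a usage sketch, a minimal host-side driver might look like the following. The toy data here is hypothetical (it fits y = x0 + 2*x1 exactly, so the solve should recover beta of approximately [1, 2]), and error checking is omitted for brevity:

```cuda
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int samples = 4, features = 2;
    // Hypothetical toy data generated from y = 1*x0 + 2*x1
    float h_X[] = {1, 0,  0, 1,  1, 1,  2, 1};
    float h_y[] = {1, 2, 3, 4};
    float h_beta[features];

    // Move inputs to the device
    float *d_X, *d_y, *d_beta;
    cudaMalloc(&d_X, sizeof(h_X));
    cudaMalloc(&d_y, sizeof(h_y));
    cudaMalloc(&d_beta, sizeof(float) * features);
    cudaMemcpy(d_X, h_X, sizeof(h_X), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, sizeof(h_y), cudaMemcpyHostToDevice);

    // Run the full OLS pipeline and fetch the coefficients
    ols(d_X, d_y, d_beta, samples, features);
    cudaMemcpy(h_beta, d_beta, sizeof(float) * features, cudaMemcpyDeviceToHost);
    printf("beta = [%f, %f]\n", h_beta[0], h_beta[1]);

    cudaFree(d_X); cudaFree(d_y); cudaFree(d_beta);
    return 0;
}
```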

Results and Going Further

The result: I was the 4th person to solve this problem on LeetGPU!
To extend this, the indexing in transpose_matmul can be reordered to make better use of coalescing, and transpose_matprod can be improved with a reduction pattern.
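As a sketch of the reduction idea (a hypothetical variant, not the submitted kernel): assign one block per output feature and let the block's threads cooperatively reduce over the N samples with warp shuffles.

```cuda
// Hypothetical reduction variant of transpose_matprod: computes out = A^T * y
// where A is N x M row-major. Launch with one block per feature, e.g.
// transpose_matprod_reduce<<<M, 256>>>(A, y, out, N, M);
__global__ void transpose_matprod_reduce(const float* A, const float* y,
                                         float* out, int N, int M) {
    int col = blockIdx.x; // one block per output feature
    if (col >= M) return;
    float s = 0.0f;
    // Each thread strides over the rows, accumulating a partial dot product.
    // Note these per-column loads are strided by M, so a production version
    // would also restructure the access pattern for coalescing.
    for (int j = threadIdx.x; j < N; j += blockDim.x)
        s += y[j] * A[j * M + col];
    // Warp-level tree reduction of the partials
    for (int offset = 16; offset > 0; offset >>= 1)
        s += __shfl_down_sync(0xffffffff, s, offset);
    // Lane 0 of each warp stores its warp's sum in shared memory
    __shared__ float warp_sums[32];
    int lane = threadIdx.x & 31;
    int warp = threadIdx.x >> 5;
    if (lane == 0) warp_sums[warp] = s;
    __syncthreads();
    // The first warp reduces the per-warp sums and writes the result
    if (warp == 0) {
        int nwarps = (blockDim.x + 31) >> 5;
        s = (lane < nwarps) ? warp_sums[lane] : 0.0f;
        for (int offset = 16; offset > 0; offset >>= 1)
            s += __shfl_down_sync(0xffffffff, s, offset);
        if (lane == 0) out[col] = s;
    }
}
```

This helps most when N is large relative to M, since the naive kernel only keeps M threads busy while the reduction version uses M * blockDim.x.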

If my CUDA work was interesting, please check out my GitHub for more:
CUDA Practice Repo

I hope you enjoyed reading the article.

Any feedback is greatly appreciated!
