This is the simplest (naive) CUDA program for transposing a square matrix, written for the Parallel Programming Seminar at IMI. To compile it, run the following command:
make -f matrix_transpose_naive.mk
To run, type:
bin/darwin/release/matrix_transpose_naive
Below is the kernel source code for this matrix_transpose variant. Click here for the complete source code from Day 2.
#include "matrix_transpose_naive_kernel.h"
#include <assert.h>
#define BLOCK_SIZE 16
/*
 * Naive transpose of a square dim x dim matrix of floats.
 *
 * Each thread copies one element: the value at input[x + dim*y] is written
 * to output[y + dim*x], where (x, y) is the thread's global 2D index.
 *
 * Launch layout: a 2D grid of 2D blocks. Any block shape works because the
 * global index is derived from blockDim; threads whose index falls outside
 * the matrix do nothing, so the grid may overshoot dim.
 *
 * Parameters:
 *   output - destination matrix, dim*dim floats (written)
 *   input  - source matrix, dim*dim floats (read only)
 *   dim    - number of rows/columns; values <= 0 make the kernel a no-op
 *
 * NOTE(review): output and input are assumed to be distinct device buffers;
 * confirm against the caller.
 */
__global__
void
kernel_matrix_transpose_naive( float * output, float const * input, int dim )
{
    // Clamp non-positive sizes to zero so the guard below rejects every thread.
    size_t const n = dim > 0 ? (size_t)dim : 0;

    // Global 2D coordinates of this thread.
    size_t const x = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
    size_t const y = (size_t)blockIdx.y*blockDim.y + threadIdx.y;

    // Threads past the matrix edge must not touch memory.
    if ( x >= n || y >= n )
        return;

    // Index in size_t: dim*dim can exceed INT_MAX for large matrices.
    // Threads adjacent in x read consecutive input elements but write
    // output elements that are dim apart.
    output[y + n*x] = input[x + n*y];
}
/*
 * Host-side launcher for kernel_matrix_transpose_naive.
 *
 * Validates its arguments with assert, launches the kernel on a
 * (dim/BLOCK_SIZE) x (dim/BLOCK_SIZE) grid of BLOCK_SIZE x BLOCK_SIZE
 * thread blocks, and asserts that no CUDA error is pending after the launch.
 *
 * Parameters:
 *   output - destination matrix, dim*dim floats; must be non-null
 *   input  - source matrix, dim*dim floats; must be non-null
 *   dim    - number of rows/columns; must be positive and a multiple of
 *            BLOCK_SIZE
 *
 * This function does not synchronize with the device after the launch.
 * NOTE(review): output and input are handed straight to the kernel, so they
 * are presumably device pointers; confirm against the caller.
 */
void
cuda_matrix_transpose_naive( float * output, float const * input, int dim )
{
    assert(output!=0);
    assert(input!=0);
    assert(dim>0);
    // The grid size below uses integer division, so a dim that is not an
    // exact multiple of BLOCK_SIZE would leave part of the matrix uncovered.
    assert(!(dim%BLOCK_SIZE));

    dim3 block(BLOCK_SIZE,BLOCK_SIZE);
    dim3 grid(dim/block.x,dim/block.y);
    kernel_matrix_transpose_naive<<<grid,block>>>(output,input,dim);

    // A kernel launch returns no status itself; an invalid configuration
    // only shows up through the runtime's last-error state. Peek rather
    // than get, so the error stays visible to the caller's own checks.
    cudaError_t const launch_status = cudaPeekAtLastError();
    assert(launch_status==cudaSuccess);
    (void)launch_status; // keeps NDEBUG builds free of unused-variable warnings
}
Other matrix_transpose variants used: