Parallel Programming Seminar

August 5-7 2009

Interdisciplinary Mathematics Institute
University of South Carolina

matrix_transpose_naive

This is the simplest CUDA program for transposing a square matrix, written for the Parallel Programming Seminar at IMI. To compile it, run the following command:

make -f matrix_transpose_naive.mk

To run, type:

bin/darwin/release/matrix_transpose_naive

Below is the kernel source code for this matrix_transpose variant. Click here for the complete source code from Day 2.

#include "matrix_transpose_naive_kernel.h"
#include <assert.h>

// Edge length, in threads, of the square thread block used to launch the
// transpose kernel. The host wrapper requires the matrix dimension to be a
// multiple of this value.
#define BLOCK_SIZE 16

// Naive out-of-place transpose of a dim x dim matrix stored contiguously
// with row stride dim.
//
// Each thread handles one element: the thread with global coordinates
// (x, y) copies input[x + dim*y] to output[y + dim*x].
//
//   output  destination matrix, dim*dim floats
//   input   source matrix, dim*dim floats
//   dim     number of rows (and columns) of the matrix
//
// Launch layout: a 2D grid of 2D blocks covering at least dim threads in
// both x and y. Any block shape works because the indices are derived from
// blockDim; threads that fall outside the matrix do nothing. No shared
// memory is used.
//
// Threads adjacent in x read adjacent input elements but write output
// elements that are dim floats apart.
//
// NOTE(review): with output == input, one thread would overwrite an element
// that another thread still has to read, with no synchronization between
// them; callers presumably pass distinct buffers.
__global__
void
kernel_matrix_transpose_naive( float * output, float const * input, int dim )
    {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    // Skip threads outside the matrix so a grid that over-covers it (or a
    // non-positive dim) cannot touch memory out of bounds.
    if( x>=dim || y>=dim )
        return;
    // Index in size_t: dim*dim exceeds INT_MAX once dim > 46340, which would
    // overflow the int arithmetic used for the flat offsets.
    size_t stride = (size_t)dim;
    output[y + stride*x] = input[x + stride*y];
    }

// Host-side entry point: launches kernel_matrix_transpose_naive to write the
// transpose of the dim x dim matrix `input` into `output`.
//
//   output  destination matrix, dim*dim floats; handed straight to the
//           kernel, so it must be memory the device can write
//   input   source matrix, dim*dim floats; must be readable by the device
//   dim     matrix dimension; must be positive and a multiple of BLOCK_SIZE
//
// Preconditions and the launch status are checked with assert(), so the
// checks disappear when NDEBUG is defined. The launch goes to the default
// stream and this function does not synchronize afterwards.
void
cuda_matrix_transpose_naive( float * output, float const * input, int dim )
    {
    assert(output!=0);
    assert(input!=0);
    assert(dim>0);
    // The grid is sized with truncating division, so a partial tile at the
    // matrix edge would be left untransposed; require whole tiles.
    assert(!(dim%BLOCK_SIZE));
    dim3 block(BLOCK_SIZE,BLOCK_SIZE);
    dim3 grid(dim/block.x,dim/block.y);
    kernel_matrix_transpose_naive<<<grid,block>>>(output,input,dim);
    // A kernel launch returns no status; a rejected launch (e.g. an invalid
    // grid size) is only visible through the runtime's last-error state.
    // Peek rather than get, so the error stays set for the caller to inspect.
    assert(cudaPeekAtLastError()==cudaSuccess);
    }

Other matrix_transpose variants used: