Parallel Programming Seminar

August 5-7 2009

Interdisciplinary Mathematics Institute
University of South Carolina

matrix_transpose_texture_1d

This is a simple CUDA program for transposing a square matrix as part of the Parallel Programming Seminar at IMI. To compile it, enter the following command line:

make -f matrix_transpose_texture_1d.mk

To run, type:

bin/darwin/release/matrix_transpose_texture_1d

Below is the kernel source code for this matrix_transpose variant. It reads the input through the 1D texture cache, which is somewhat inefficient but allows plain device memory to be bound directly to a CUDA texture reference. This technique is somewhat obsolete as of version 2.2 of the CUDA SDK, which allows plain device memory to be bound directly to 2D texture references as well. Previously, 2D texture references could only be bound to cudaArrays, which are opaque regions of device memory that cannot be written to directly by kernel code.

Click here for the complete source code from Day 2.

#include "matrix_transpose_texture_1d_kernel.h"
#include <assert.h>
#include <stdio.h>

// Edge length, in threads, of the square thread block. The kernel's index
// arithmetic and the host wrapper's launch configuration both use this value.
#define BLOCK_SIZE 16

// File-scope 1D texture reference over float elements, read as-is
// (cudaReadModeElementType). The host wrapper in this file binds the input
// matrix to it and the kernel fetches its input through it with tex1Dfetch.
texture<float,1,cudaReadModeElementType> matrix_transpose_texture_1d_texref;

/*
 * Transposes a dim x dim float matrix. The input is read through
 * matrix_transpose_texture_1d_texref, which must be bound before launch.
 *
 * Each thread handles one element: with x and y the thread's global column
 * and row, it fetches input element (y + dim*x) and stores it to
 * output[dim*y + x].
 *
 * Launch layout: 2D grid of BLOCK_SIZE x BLOCK_SIZE thread blocks. The index
 * arithmetic uses BLOCK_SIZE rather than blockDim, so other block shapes are
 * not supported.
 *
 * output  destination matrix, at least dim*dim floats, written by the kernel
 * dim     number of rows (and columns) of the square matrix
 */
__global__
void
matrix_transpose_texture_1d_kernel( float * output, int dim )
    {
    int x = blockIdx.x*BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y*BLOCK_SIZE + threadIdx.y;
    // Guard against threads that fall outside the matrix so that a grid
    // which overshoots dim can never read or write out of range.
    // No block barrier is needed: the kernel uses no shared memory and the
    // threads exchange no data with each other.
    if( x<dim && y<dim )
        {
        float v=tex1Dfetch(matrix_transpose_texture_1d_texref,y + dim*x);
        output[dim*y + x] = v;
        }
    }

/*
 * Host entry point: binds input to the file-scope 1D texture reference and
 * launches matrix_transpose_texture_1d_kernel to write the transpose of the
 * dim x dim matrix into output.
 *
 * output  destination for the transposed matrix; passed to the kernel, so it
 *         must be memory the device can write (dim*dim floats)
 * input   source matrix; bound to the texture reference, so it must be
 *         device memory (dim*dim floats)
 * dim     matrix edge length; must be positive and a multiple of BLOCK_SIZE
 *
 * Preconditions are checked with assert and are therefore only enforced in
 * builds without NDEBUG. The launch is not followed by any synchronization
 * here, and the texture is left bound on return. CUDA errors are reported on
 * stderr; the runtime's error state is left untouched so the caller can
 * still query it.
 */
void
cuda_matrix_transpose_texture_1d( float * output, float const * input, int dim )
    {
    assert(output!=0);
    assert(input!=0);
    assert(dim>0);
    assert(!(dim%BLOCK_SIZE));

    // The bind can fail (for example when input does not satisfy the
    // device's texture alignment while a NULL offset is passed). Launching
    // anyway would fetch through an unbound texture, so stop here instead.
    cudaError_t status = cudaBindTexture(0,matrix_transpose_texture_1d_texref,input,cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindFloat));
    if( status!=cudaSuccess )
        {
        fprintf(stderr,"cuda_matrix_transpose_texture_1d: cudaBindTexture failed: %s\n",cudaGetErrorString(status));
        return;
        }

    // One thread per matrix element; dim is a multiple of BLOCK_SIZE, so
    // the grid tiles the matrix exactly.
    dim3 block(BLOCK_SIZE,BLOCK_SIZE);
    dim3 grid(dim/block.x,dim/block.y);
    matrix_transpose_texture_1d_kernel<<<grid,block>>>(output,dim);

    // A kernel launch returns no status of its own; a bad launch
    // configuration only shows up in the runtime's last-error slot. Peek
    // rather than get, so the error is not cleared for the caller.
    status = cudaPeekAtLastError();
    if( status!=cudaSuccess )
        {
        fprintf(stderr,"cuda_matrix_transpose_texture_1d: kernel launch failed: %s\n",cudaGetErrorString(status));
        }
    }

Other matrix_transpose variants used: