University of South Carolina

This is a simple CUDA program for transposing a square matrix as part of the Parallel Programming Seminar at IMI. To compile it, enter the following command line:

make -f matrix_transpose_texture_2d.mk

To run, type:

bin/darwin/release/matrix_transpose_texture_2d

Below is the kernel source code for this matrix_transpose variant. It maps a 2D texture reference directly to a 2D device memory buffer. This technique is new as of version 2.2 of the CUDA SDK; previously, 2D texture references could only be mapped to cudaArrays, which are opaque regions of device memory that cannot be written into directly by kernel code.

Click here for the complete source code from Day 2.

#include "matrix_transpose_texture_2d_kernel.h"

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

// Edge length, in threads, of the square thread block used by the launch
// below. The kernel's index arithmetic is hard-wired to this value.
#define BLOCK_SIZE 16

// 2D texture reference through which the kernel reads the input matrix.
// The host wrapper binds it to the caller's buffer before every launch.
texture<float, 2, cudaReadModeElementType> matrix_transpose_texture_2d_texref;

/*
 * Kernel: writes the transpose of the matrix bound to
 * matrix_transpose_texture_2d_texref into `output`.
 *
 * Each thread produces exactly one element: it fetches the texel at
 * (column y, row x) and stores it at row y, column x of `output`.
 *
 * Launch requirements (no bounds check is performed):
 *   - block dimensions must be BLOCK_SIZE x BLOCK_SIZE;
 *   - the grid must cover exactly dim x dim threads, i.e. dim must be a
 *     multiple of BLOCK_SIZE and grid = (dim/BLOCK_SIZE, dim/BLOCK_SIZE).
 * No shared memory is used.
 *
 * output  destination buffer holding at least dim*dim floats, row-major
 * dim     number of rows (and columns) of the square matrix
 */
__global__ void matrix_transpose_texture_2d_kernel(float *output, int dim)
{
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    // The coordinates are deliberately swapped: the thread that owns output
    // element (row y, column x) reads input element (row x, column y).
    float v = tex2D(matrix_transpose_texture_2d_texref, y, x);

    // Block-wide barrier between the texture fetches and the global stores.
    // NOTE(review): nothing is exchanged through shared memory here, so this
    // looks unnecessary for correctness - confirm before removing, since it
    // may affect the timings this variant is compared on.
    __syncthreads();

    // Index in size_t: dim * y overflows a 32-bit int once dim > 46340.
    output[(size_t)dim * y + x] = v;
}

/*
 * Host entry point: transposes the dim x dim row-major float matrix `input`
 * into `output` by binding `input` to the 2D texture reference and launching
 * matrix_transpose_texture_2d_kernel on the default stream.
 *
 * output  destination buffer for dim*dim floats; passed to the kernel as-is
 * input   source buffer of dim*dim floats, tightly packed (the row pitch
 *         given to the bind call is dim*sizeof(float))
 * dim     matrix edge length; must be positive and a multiple of BLOCK_SIZE
 *
 * Preconditions are enforced with assert(), so they vanish under NDEBUG.
 * The function does not synchronize with the device after the launch and it
 * leaves the texture reference bound to `input` on return.
 *
 * Errors: if the bind fails, a message is printed to stderr and the kernel
 * is NOT launched (it would otherwise read an unbound or stale texture).
 * A launch failure is likewise reported on stderr. The launch check peeks at
 * the error without clearing it, so the caller's own cudaGetLastError() check
 * still observes it.
 */
void cuda_matrix_transpose_texture_2d(float *output, float const *input, int dim)
{
    assert(output != 0);
    assert(input != 0);
    assert(dim > 0);
    assert(!(dim % BLOCK_SIZE));

    // A null offset pointer means a misaligned `input` is reported as an
    // error instead of being compensated for, so the result must be checked.
    cudaError_t status = cudaBindTexture2D(
        0,
        matrix_transpose_texture_2d_texref,
        input,
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat),
        dim,
        dim,
        dim * sizeof(float));
    if (status != cudaSuccess) {
        fprintf(stderr,
                "cuda_matrix_transpose_texture_2d: cudaBindTexture2D failed: %s\n",
                cudaGetErrorString(status));
        return;
    }

    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(dim / block.x, dim / block.y);
    matrix_transpose_texture_2d_kernel<<<grid, block>>>(output, dim);

    // Kernel launches return nothing; configuration errors surface here.
    status = cudaPeekAtLastError();
    if (status != cudaSuccess) {
        fprintf(stderr,
                "cuda_matrix_transpose_texture_2d: kernel launch failed: %s\n",
                cudaGetErrorString(status));
    }
}

Other matrix_transpose variants used:

matrix_transpose_naive

matrix_transpose_shared

matrix_transpose_swizzle_texture_1d

matrix_transpose_texture_1d

matrix_transpose_texture_2d

matrix_transpose_shared

matrix_transpose_swizzle_texture_1d

matrix_transpose_texture_1d

matrix_transpose_texture_2d