CUDA:纹理内存入门到精通--纹理对象
导读CUDA:纹理内存入门到精通–纹理内存概要CUDA:纹理内存入门到精通–纹理参考背景纹理对象是CUDA针对纹理参考缺点而提出的升级版,其作用和纹理参考完全一致,但是使用方法更加灵活。纹理对象与纹理参考相比,CUDA对其进行各方面的升级,一方面是可以再代码中申请和销毁,另一方面则可以作为设备函数的参数进行传入;可以满足一些特殊的需求。纹理对象API使用纹理对象...
导读
CUDA:纹理内存入门到精通–纹理内存概要
CUDA:纹理内存入门到精通–纹理参考
CUDA:纹理内存入门到精通–纹理对象
背景
纹理对象是CUDA针对纹理参考缺点而提出的升级版,其作用和纹理参考完全一致,但是使用方法更加灵活。
纹理对象
与纹理参考相比,CUDA对其进行各方面的升级,一方面是可以再代码中申请和销毁,另一方面则可以作为设备函数的参数进行传入;可以满足一些特殊的需求。
纹理对象API
使用纹理对象主要包括纹理对象创建、纹理访问和纹理对象销毁。
######纹理对象创建
纹理对象创建之前首先要分别对纹理资源和纹理对象属性进行确定,分别对应cudaResoruceDesc和cudaTextureDesc;然后即可利用cudaCreateTextureObject来创建纹理对象。本文通过代码来解释这个过程:
// 纹理资源
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
//resType指定对应设备内存的形式,主要包括
//cudaResourceTypeArray(二维纹理内存和二维纹理对象)
//cudaResourceTypeMipmappedArray(不常用)
//cudaResourceTypeLinear(一维纹理内存和一维纹理对象)
//cudaResourceTypePitch2D(一维纹理内存和二维纹理对象)
resDesc.resType = cudaResourceTypeArray;
//res是一个枚举变量,针对不同内存也有不同的形式
//cudaResourceTypeArray 对应 res.array.array
//cudaResourceTypeMipmappedArray 对应res.mipmap.mipmap
//cudaResourceTypeLinear 对应 res.linear.devPtr(同时还需要设置res.linear.sizeInBytes和res.linear.desc)
//cudaResourceTypePitch2D 对应 res.pitch2D.devPtr(同时需要设定res.pitch2D.pitchInBytes,res.pitch2D.width,res.pitch2D.height,res.pitch2D.ddesc)
resDesc.res.array.array = cuArray;//指定需要绑定的二维纹理内存
// 纹理对象的属性
// 由于与纹理参考类似,不再重复介绍
// 再次提醒一次:cudaFilterModeLinear必须配合cudaReadModeNormalizedFloat使用
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeNormalizedFloat;
texDesc.normalizedCoords = 1;
// 创建纹理对象
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
######纹理访问
纹理对象的纹理访问也和纹理参考一样,也是使用tex1D或tex2D等函数进行操作。
//一维纹理
template<class T>
T tex1D(cudaTextureObject_t texObj, float x);
//二维纹理
template<class T>
T tex2D(cudaTextureObject_t texObj, float x, float y);
######纹理对象销毁
纹理对象的销毁直接使用cudaDestroyTextureObject即可。值得注意的是应该先销毁对象,然后再释放对应的设备内存。
__host__ cudaError_t cudaDestroyTextureObject ( cudaTextureObject_t texObject )
示例代码
//C++
#include <time.h>
#include <iostream>
using namespace std;
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <cmath>
// 核函数
__global__ void transformKernel(float* output,
cudaTextureObject_t texObj,
int width, int height)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if ( x<0 || x>width || y<0 || y>height )
return;
// 获取纹理
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#linear-filtering
output[ y*width+x ] = tex2D<float>(texObj, x+0.5f, y+0.5f);
}
int main()
{
//实验数据
int width = 10;
int height = 10;
int size = width*height*sizeof(float);
float *h_data = new float[width*height];
for (int y = 0; y<height; y++)
{
for (int x = 0; x<width; x++)
{
h_data[y*width + x] = x;
}
}
for (int y = 0; y<height; y++)
{
for (int x = 0; x<width; x++)
{
printf("%f ", h_data[y*width + x]);
}
printf("\n");
}
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, width, height);
cudaMemcpyToArray(cuArray, 0, 0, h_data, size,cudaMemcpyHostToDevice);
// 创建纹理对象
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeBorder;
texDesc.addressMode[1] = cudaAddressModeBorder;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
float* output;
cudaMalloc((void**)&output, size);
// 调用核函数
dim3 dimBlock(4, 4);
dim3 dimGrid( max( (width + dimBlock.x - 1) / dimBlock.x,1),
max( (height + dimBlock.y - 1) / dimBlock.y,1) );
transformKernel <<<dimGrid, dimBlock >>>(output,
texObj,
width, height);
cudaMemcpy(h_data, output, size, cudaMemcpyDeviceToHost);
for (int y = 0; y<height; y++)
{
for (int x = 0; x<width; x++)
{
printf("%f ",h_data[y*width + x]);
}
printf("\n");
}
// 销毁纹理对象
cudaDestroyTextureObject(texObj);
// 释放设备内存
cudaFreeArray(cuArray);
cudaFree(output);
delete[]h_data;
return 0;
}
额外注意
在使用纹理对象时还有一些特殊注意。通常而言,使用纹理对象的目的就是使用多个纹理对象,因此会申请一个cudaTextureObject_t 数组。注意在使用时必须将其拷贝到device端才能正常使用。下边给出一个示例。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
__global__ void kernel_set_value(const cudaTextureObject_t *texObj,float *dev_result,int width,int height)
{
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
if(x<0 || x>width || y<0 || y>height)
{
return;
}
float sum = 0;
for(int i=0;i<4;i++)
{
sum += tex2D<float>(texObj[i],x,y);
}
int pos = y*width + x;
dev_result[pos]= sum;
}
int main()
{
const int array_size_width = 10;
const int array_size_height = 10;
float random_array[array_size_width*array_size_height];
for(int i=0;i<array_size_width*array_size_height;i++)
{
random_array[i] = 1;
}
//error status
cudaError_t cuda_status;
//only chose one GPU
cuda_status = cudaSetDevice(0);
if(cuda_status != cudaSuccess)
{
fprintf(stderr,"cudaSetDevice failed! Do you have a CUDA-Capable GPU installed?");
return 1;
}
cudaArray *dev_random_array[4];
cudaTextureObject_t texObj[4];
for(int i=0;i<4;i++)
{
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
//allocate memory on the GPU
cuda_status = cudaMallocArray(&dev_random_array[i],
&channelDesc,
array_size_width,
array_size_height);
if(cuda_status != cudaSuccess)
{
fprintf(stderr,"cudaMallocArray Failed");
exit( EXIT_FAILURE );
}
cuda_status = cudaMemcpyToArray(dev_random_array[i],
0,
0,
random_array,
sizeof(float)*array_size_height*array_size_width,
cudaMemcpyHostToDevice);
if(cuda_status != cudaSuccess)
{
fprintf(stderr,"cudaMemcpyToArray Failed");
exit( EXIT_FAILURE );
}
// Specify texture
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = dev_random_array[i];
// Specify texture object parameters
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;
cudaCreateTextureObject(&texObj[i], &resDesc, &texDesc, NULL);
}
//将纹理对象拷贝到设备端
cudaTextureObject_t *dev_texObj;
cudaMalloc((void**)&dev_texObj,sizeof(cudaTextureObject_t)*4);
cudaMemcpy(dev_texObj,texObj,sizeof(cudaTextureObject_t)*4,cudaMemcpyHostToDevice);
float *dev_result;
cudaMalloc((void**)&dev_result,sizeof(float)*array_size_height*array_size_width);
dim3 threads(16,16);
dim3 grid((array_size_width+threads.x-1)/threads.x,(array_size_height+threads.y-1)/threads.y);
kernel_set_value<<<grid,threads>>>(dev_texObj,dev_result,array_size_width,array_size_height);
cuda_status = cudaGetLastError();
if(cuda_status != cudaSuccess)
{
fprintf(stderr,"kernel_set_value Failed");
exit( EXIT_FAILURE );
}
cuda_status = cudaMemcpy(random_array,dev_result,sizeof(float)*array_size_width*array_size_height,cudaMemcpyDeviceToHost);//dev_depthMap
if(cuda_status != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy Failed");
exit( EXIT_FAILURE );
}
for(int i=0;i<array_size_width*array_size_height;i++)
{
printf("%f\n",random_array[i]);
}
//free
cudaFree(dev_texObj);
cudaFree(dev_result);
for(int i=0;i<4;i++)
{
cudaFreeArray(dev_random_array[i]);
cudaDestroyTextureObject(texObj[i]);
}
return 0;
}
更多推荐


所有评论(0)