测试程序kernel是参考官方例程写的为:
__global__ void LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *output, cudaTextureObject_t tex_obj,
int width, int height, float theta) {
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x / static_cast<float>(width);
float v = y / static_cast<float>(height);
// Read from texture and write to global memory
output[y * width + x] = tex2D<float>(tex_obj, u, v);
}
void LtsTBytesEquivL1sectormissPipeTexMemTexture() {
const int test_num = 1;
const int loop_num = 1;
int grid_num[test_num]{32};
int thread_num[test_num]{32};
float *h_src[test_num] = {nullptr};
float *d_res[test_num] = {nullptr};
float angle = 0.5;
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray_t cu_array[test_num];
std::srand(time(nullptr)); // use current time as seed for random generator
CUDA_CHECK(cudaSetDevice(0));
for (int i = 0; i < loop_num; i++) {
dim3 block(thread_num[i]);
dim3 grid(grid_num[i]);
// alloc cpu memory
h_src[i] = new float[thread_num[i] * grid_num[i]];
for (int j = 0; j < grid_num[i] * thread_num[i]; j++) {
h_src[i][j] = (std::rand() % 255);
}
// Allocate CUDA array in device memory
CUDA_CHECK(cudaMallocArray(&cu_array[i], &channel_desc, thread_num[i], grid_num[i]));
// Set pitch of the source (the width in memory in bytes of the 2D array pointed
// to by src, including padding), we dont have any padding
const size_t spitch = thread_num[i] * sizeof(float);
// Copy data located at address h_data in host memory to device memory
CUDA_CHECK(cudaMemcpy2DToArray(cu_array[i], 0, 0, h_src[i], spitch, thread_num[i] * sizeof(float), grid_num[i],
cudaMemcpyHostToDevice));
// Specify texture
struct cudaResourceDesc res_desc;
memset(&res_desc, 0, sizeof(res_desc));
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = cu_array[i];
// Specify texture object parameters
struct cudaTextureDesc tex_desc;
memset(&tex_desc, 0, sizeof(tex_desc));
tex_desc.addressMode[0] = cudaAddressModeWrap;
tex_desc.addressMode[1] = cudaAddressModeWrap;
tex_desc.filterMode = cudaFilterModeLinear;
tex_desc.readMode = cudaReadModeElementType;
tex_desc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t tex_obj = 0;
cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
// Allocate result of transformation in device memory
CUDA_CHECK(cudaMalloc(&d_res[i], grid_num[i] * thread_num[i] * sizeof(float)));
// kernel
switch (i) {
case 0: {
LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0<<<grid, block>>>(d_res[i], tex_obj, thread_num[i],
grid_num[i], angle);
break;
}
default:
break;
}
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaGetLastError());
// Destroy texture object
cudaDestroyTextureObject(tex_obj);
cudaFreeArray(cu_array[i]);
delete[] h_src[i];
CUDA_FREE(d_res[i]);
}
}
程序实现的功能为:从texture中读取32*32的数据写入到global memory中,然后测试了lts__t_bytes_equiv_l1sectormiss_pipe_tex、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld和lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex这四个metrics,测试结果为:
LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *, unsigned long long, int, int, float), 2022-Apr-15 10:38:35, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
lts__t_bytes_equiv_l1sectormiss_pipe_tex.avg byte 512
lts__t_bytes_equiv_l1sectormiss_pipe_tex.max byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex.min byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex.sum Kbyte 16.38
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.avg byte 512
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.max byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.min byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.sum Kbyte 16.38
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.avg byte 0
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.max byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.min byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.sum byte 0
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.avg byte 512
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.max byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.min byte nan
lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.sum Kbyte 16.38
---------------------------------------------------------------------- --------------- ------------------------------
以lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex为例,该metrics的解释为:
# of bytes requested for TEX instructions
使用sass代码的确是有这个指令,但是我比较疑惑的是为什么数据量是16384byte,
因为我的数据总量为32*32*4=4096byte,请问这是为什么呢?