访问texture和global memory的区别

  • 1 replies
  • 3245 views
访问texture和global memory的区别
« 于: 四月 15, 2022, 06:48:03 pm »
测试程序kernel是参考官方例程写的为:
程序代码: [选择]
__global__ void LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *output, cudaTextureObject_t tex_obj,
                                                                   int width, int height, float theta) {
    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / static_cast<float>(width);
    float v = y / static_cast<float>(height);


    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(tex_obj, u, v);
}

void LtsTBytesEquivL1sectormissPipeTexMemTexture() {
    const int test_num = 1;
    const int loop_num = 1;
    int grid_num[test_num]{32};
    int thread_num[test_num]{32};
    float *h_src[test_num] = {nullptr};
    float *d_res[test_num] = {nullptr};

    float angle = 0.5;
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t cu_array[test_num];

    std::srand(time(nullptr));  // use current time as seed for random generator
    CUDA_CHECK(cudaSetDevice(0));
    for (int i = 0; i < loop_num; i++) {
        dim3 block(thread_num[i]);
        dim3 grid(grid_num[i]);

        // alloc cpu memory
        h_src[i] = new float[thread_num[i] * grid_num[i]];
        for (int j = 0; j < grid_num[i] * thread_num[i]; j++) {
            h_src[i][j] = (std::rand() % 255);
        }

        // Allocate CUDA array in device memory
        CUDA_CHECK(cudaMallocArray(&cu_array[i], &channel_desc, thread_num[i], grid_num[i]));

        // Set pitch of the source (the width in memory in bytes of the 2D array pointed
        // to by src, including padding), we dont have any padding
        const size_t spitch = thread_num[i] * sizeof(float);

        // Copy data located at address h_data in host memory to device memory
        CUDA_CHECK(cudaMemcpy2DToArray(cu_array[i], 0, 0, h_src[i], spitch, thread_num[i] * sizeof(float), grid_num[i],
                                       cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc res_desc;
        memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeArray;
        res_desc.res.array.array = cu_array[i];

        // Specify texture object parameters
        struct cudaTextureDesc tex_desc;
        memset(&tex_desc, 0, sizeof(tex_desc));
        tex_desc.addressMode[0] = cudaAddressModeWrap;
        tex_desc.addressMode[1] = cudaAddressModeWrap;
        tex_desc.filterMode = cudaFilterModeLinear;
        tex_desc.readMode = cudaReadModeElementType;
        tex_desc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t tex_obj = 0;
        cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);

        // Allocate result of transformation in device memory
        CUDA_CHECK(cudaMalloc(&d_res[i], grid_num[i] * thread_num[i] * sizeof(float)));

        // kernel
        switch (i) {
            case 0: {
                LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0<<<grid, block>>>(d_res[i], tex_obj, thread_num[i],
                                                                                    grid_num[i], angle);
                break;
            }
            default:
                break;
        }

        CUDA_CHECK(cudaStreamSynchronize(0));
        CUDA_CHECK(cudaGetLastError());

        // Destroy texture object
        cudaDestroyTextureObject(tex_obj);
        cudaFreeArray(cu_array[i]);

        delete[] h_src[i];
        CUDA_FREE(d_res[i]);
    }
}

程序实现的功能为:从texture中读取32*32的数据写入到global memory中,然后测试了lts__t_bytes_equiv_l1sectormiss_pipe_tex、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld和lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex这四个metrics,测试结果为:
程序代码: [选择]
  LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *, unsigned long long, int, int, float), 2022-Apr-15 10:38:35, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.avg                                      byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.max                                      byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.min                                      byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.sum                                     Kbyte                          16.38
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.avg                          byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.max                          byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.min                          byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.sum                         Kbyte                          16.38
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.avg                    byte                              0
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.max                    byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.min                    byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.sum                    byte                              0
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.avg                   byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.max                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.min                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.sum                  Kbyte                          16.38
    ---------------------------------------------------------------------- --------------- ------------------------------

以lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex为例,该metrics的解释为:
# of bytes requested for TEX instructions
使用sass代码的确是有这个指令,但是我比较疑惑的是为什么数据量是16384byte,
因为我的数据总量为32*32*4=4096byte,请问这是为什么呢?

Re: 访问texture和global memory的区别
« 回复 #1 于: 五月 19, 2022, 07:16:40 pm »
测试程序kernel是参考官方例程写的为:
程序代码: [选择]
__global__ void LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *output, cudaTextureObject_t tex_obj,
                                                                   int width, int height, float theta) {
    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / static_cast<float>(width);
    float v = y / static_cast<float>(height);


    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(tex_obj, u, v);
}

void LtsTBytesEquivL1sectormissPipeTexMemTexture() {
    const int test_num = 1;
    const int loop_num = 1;
    int grid_num[test_num]{32};
    int thread_num[test_num]{32};
    float *h_src[test_num] = {nullptr};
    float *d_res[test_num] = {nullptr};

    float angle = 0.5;
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t cu_array[test_num];

    std::srand(time(nullptr));  // use current time as seed for random generator
    CUDA_CHECK(cudaSetDevice(0));
    for (int i = 0; i < loop_num; i++) {
        dim3 block(thread_num[i]);
        dim3 grid(grid_num[i]);

        // alloc cpu memory
        h_src[i] = new float[thread_num[i] * grid_num[i]];
        for (int j = 0; j < grid_num[i] * thread_num[i]; j++) {
            h_src[i][j] = (std::rand() % 255);
        }

        // Allocate CUDA array in device memory
        CUDA_CHECK(cudaMallocArray(&cu_array[i], &channel_desc, thread_num[i], grid_num[i]));

        // Set pitch of the source (the width in memory in bytes of the 2D array pointed
        // to by src, including padding), we dont have any padding
        const size_t spitch = thread_num[i] * sizeof(float);

        // Copy data located at address h_data in host memory to device memory
        CUDA_CHECK(cudaMemcpy2DToArray(cu_array[i], 0, 0, h_src[i], spitch, thread_num[i] * sizeof(float), grid_num[i],
                                       cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc res_desc;
        memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeArray;
        res_desc.res.array.array = cu_array[i];

        // Specify texture object parameters
        struct cudaTextureDesc tex_desc;
        memset(&tex_desc, 0, sizeof(tex_desc));
        tex_desc.addressMode[0] = cudaAddressModeWrap;
        tex_desc.addressMode[1] = cudaAddressModeWrap;
        tex_desc.filterMode = cudaFilterModeLinear;
        tex_desc.readMode = cudaReadModeElementType;
        tex_desc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t tex_obj = 0;
        cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);

        // Allocate result of transformation in device memory
        CUDA_CHECK(cudaMalloc(&d_res[i], grid_num[i] * thread_num[i] * sizeof(float)));

        // kernel
        switch (i) {
            case 0: {
                LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0<<<grid, block>>>(d_res[i], tex_obj, thread_num[i],
                                                                                    grid_num[i], angle);
                break;
            }
            default:
                break;
        }

        CUDA_CHECK(cudaStreamSynchronize(0));
        CUDA_CHECK(cudaGetLastError());

        // Destroy texture object
        cudaDestroyTextureObject(tex_obj);
        cudaFreeArray(cu_array[i]);

        delete[] h_src[i];
        CUDA_FREE(d_res[i]);
    }
}

程序实现的功能为:从texture中读取32*32的数据写入到global memory中,然后测试了lts__t_bytes_equiv_l1sectormiss_pipe_tex、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture、lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld和lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex这四个metrics,测试结果为:
程序代码: [选择]
  LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *, unsigned long long, int, int, float), 2022-Apr-15 10:38:35, Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.avg                                      byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.max                                      byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.min                                      byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex.sum                                     Kbyte                          16.38
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.avg                          byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.max                          byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.min                          byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture.sum                         Kbyte                          16.38
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.avg                    byte                              0
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.max                    byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.min                    byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_ld.sum                    byte                              0
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.avg                   byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.max                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.min                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.sum                  Kbyte                          16.38
    ---------------------------------------------------------------------- --------------- ------------------------------

以lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex为例,该metrics的解释为:
# of bytes requested for TEX instructions
使用sass代码的确是有这个指令,但是我比较疑惑的是为什么数据量是16384byte,
因为我的数据总量为32*32*4=4096byte,请问这是为什么呢?

可能存在插值时候的读取放大效应,例如你可能在2D的归一化坐标的线性滤波的时候,某个坐标需要读取临近的4个点;1D的时候,可能需要读取临近的2个点(或者精确的说,纹元,texel,texture elment)。

放大的读取也可能被CUDA Array的不透明的数据重排列方式+cache的读取而导致的。关于这点,2013年的<CUDA Handbook>里面解释了这个不透明的数据安排方式(2D下4个点一组)。可能这也会贡献了读取放大的一部分可能因素。

欢迎继续回帖,我回复的太晚了。