11
« 于: 十二月 14, 2021, 11:43:55 am »
// Main.cu
#define _CRT_SECURE_NO_WARNINGS
#define nFrame 2000
// Prints a diagnostic on stderr for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool mainCudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Program entry point: generates the source symbols, encodes them, and reports
// the elapsed time of those two steps measured with CUDA events.
//
// Returns 0 on success and 1 when any CUDA event call fails. Checking the
// return codes matters here: elapsedTime is only written by
// cudaEventElapsedTime, so an unchecked failure anywhere in the sequence
// silently leaves the reported time at its initial value of 0.
int main()
{
    // Timing uses CUDA events; an earlier gettimeofday()-based host timer
    // was disabled in favour of them.
    cudaEvent_t startAll = 0;
    cudaEvent_t stopAll = 0;
    float elapsedTime = 0.0f;

    if (!mainCudaOk(cudaEventCreate(&startAll), "cudaEventCreate(startAll)")) {
        return 1;
    }
    if (!mainCudaOk(cudaEventCreate(&stopAll), "cudaEventCreate(stopAll)")) {
        cudaEventDestroy(startAll);
        return 1;
    }

    bool timed = mainCudaOk(cudaEventRecord(startAll, 0), "cudaEventRecord(startAll)");

    genSource(src,mCol);                    // generate the source symbols
    rcmEncode(P,V, src, encOut,i,nFrame);   // encode

    // Both events are recorded on stream 0, so waiting for stopAll is enough;
    // a separate wait on startAll is not needed.
    timed = timed
         && mainCudaOk(cudaEventRecord(stopAll, 0), "cudaEventRecord(stopAll)")
         && mainCudaOk(cudaEventSynchronize(stopAll), "cudaEventSynchronize(stopAll)")
         && mainCudaOk(cudaEventElapsedTime(&elapsedTime, startAll, stopAll),
                       "cudaEventElapsedTime");

    // Release the events on every path, including the failure path.
    cudaEventDestroy(startAll);
    cudaEventDestroy(stopAll);

    if (!timed) {
        return 1;
    }

    printf("程序运行时间time = %f(ms)\n", elapsedTime);
    printf("Processing Finish!!");
    return 0;
}
// rcmEncode的实现
// Encoding kernel: each thread computes one measurement as a weighted sum of
// eight source symbols (the original author describes this as multiplying the
// measurement matrix by the weight vector).
//
// Parameters:
//   d_P  index table; row i starts at d_P[i * wr] and its first eight entries
//        are positions into d_S (assumes wr >= 8 and every entry is a valid
//        index into d_S -- TODO confirm against the code that builds P)
//   d_S  source symbols
//   d_C  output; d_C[i] receives measurement i
//   n    number of measurements to compute; defaults to mea so the existing
//        three-argument launch keeps working
//
// Launch layout: 1-D grid of 1-D blocks, one thread per measurement. The grid
// may be rounded up past n; surplus threads exit without touching memory.
__global__ void encode1(int *d_P,int* d_S,int* d_C, int n = mea)
{
    const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard the grid tail: without this, threads with i >= n would read d_P /
    // d_S and write d_C outside the allocated buffers.
    if (n <= 0 || i >= (unsigned int)n) {
        return;
    }

    // Widen before multiplying so the row offset cannot wrap in 32 bits.
    const int* row = d_P + (size_t)i * wr;

    // Fixed weight pattern: -4, -4, -2, -1, +1, +2, +4, +4.
    d_C[i] = -4 * d_S[row[0]] - 4 * d_S[row[1]] - 2 * d_S[row[2]] - d_S[row[3]]
           + d_S[row[4]] + 2 * d_S[row[5]] + 4 * d_S[row[6]] + 4 * d_S[row[7]];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool rcmCudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "rcmEncode: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host wrapper for the encode1 kernel: uploads the source symbols and the
// index table, launches one thread per measurement, and downloads the result.
//
// Parameters:
//   P       host index table, mea * wr ints
//   V       unused by this implementation (kept for interface compatibility)
//   src     host source symbols; frameLen ints are read, clamped to mCol
//   encOut  host output buffer receiving mea ints
//   fi      unused by this implementation
//   frame   unused by this implementation
//
// On any CUDA failure a diagnostic is written to stderr, the device buffers
// are released, and encOut is left unmodified.
void rcmEncode(int* P, int* V, int* src, int* encOut,int fi,int frame) {
    (void)V;
    (void)fi;
    (void)frame;

    // Byte counts are computed in size_t so large mea * wr cannot overflow int.
    const size_t srcCapacity = (size_t)(mCol);
    const size_t srcBytes    = srcCapacity * sizeof(int);
    const size_t meaBytes    = (size_t)(mea) * sizeof(int);
    const size_t tableBytes  = (size_t)(mea) * (size_t)(wr) * sizeof(int);

    // The device source buffer holds mCol ints, while the upload is frameLen
    // ints; clamp the upload so it can never run past the allocation.
    // NOTE(review): frameLen and mCol are presumably meant to be equal -- confirm.
    const size_t srcCount =
        (size_t)(frameLen) < srcCapacity ? (size_t)(frameLen) : srcCapacity;

    // Device global-memory buffers; NULL so the cleanup below is always safe.
    int *d_S = NULL;   // source symbols
    int *d_C = NULL;   // measurements
    int *d_P = NULL;   // index table

    do {
        if (!rcmCudaOk(cudaMalloc((void**)&d_S, srcBytes), "cudaMalloc(d_S)")) break;
        if (!rcmCudaOk(cudaMalloc((void**)&d_C, meaBytes), "cudaMalloc(d_C)")) break;
        if (!rcmCudaOk(cudaMalloc((void**)&d_P, tableBytes), "cudaMalloc(d_P)")) break;

        // Zero both buffers so symbols beyond the uploaded prefix and any
        // measurement the kernel does not write have a defined value.
        if (!rcmCudaOk(cudaMemset(d_S, 0, srcBytes), "cudaMemset(d_S)")) break;
        if (!rcmCudaOk(cudaMemset(d_C, 0, meaBytes), "cudaMemset(d_C)")) break;

        // Host -> device copies (blocking).
        if (!rcmCudaOk(cudaMemcpy(d_S, src, srcCount * sizeof(int), cudaMemcpyHostToDevice),
                       "cudaMemcpy(src -> d_S)")) break;
        if (!rcmCudaOk(cudaMemcpy(d_P, P, tableBytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(P -> d_P)")) break;

        // One thread per measurement; the grid is rounded up to whole blocks.
        dim3 block(512);
        dim3 grid((mea+block.x-1)/block.x);
        encode1 <<<grid, block >>>(d_P, d_S,d_C);

        // A launch reports configuration errors only through cudaGetLastError;
        // faults during execution surface at the following synchronisation.
        if (!rcmCudaOk(cudaGetLastError(), "encode1 launch")) break;
        if (!rcmCudaOk(cudaDeviceSynchronize(), "encode1 execution")) break;

        // Device -> host copy of the measurements.
        rcmCudaOk(cudaMemcpy(encOut, d_C, meaBytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(d_C -> encOut)");
    } while (0);

    // Release device memory on every path; cudaFree(NULL) is a no-op.
    cudaFree(d_S);
    cudaFree(d_C);
    cudaFree(d_P);
}