kernel运行速度太慢的问题,请[名词6]看看怎么优化

  • 0 replies
  • 253 views
__global__ void propagate(int t, int xl, int yl, float nu_R, float nu_B, float angle, float hh, float kk, float beta,
   int *walls, int *flag, float *fnorm, float *M_inv,
   float *nx, float *ny,float *Forcex,float *Forcey,
   float *r0old, float *r1old, float *r2old, float *r3old, float *r4old, float *r5old, float *r6old, float *r7old, float *r8old,
   float *r0new, float *r1new, float *r2new, float *r3new, float *r4new, float *r5new, float *r6new, float *r7new, float *r8new,
   float *b0old, float *b1old, float *b2old, float *b3old, float *b4old, float *b5old, float *b6old, float *b7old, float *b8old,
   float *b0new, float *b1new, float *b2new, float *b3new, float *b4new, float *b5new, float *b6new, float *b7new, float *b8new)
{
   
   float R0, R1, R2, R3, R4, R5, R6, R7, R8,
      B0, B1, B2, B3, B4, B5, B6, B7, B8;
   float RHO_R, RHO_B, RHO_N, RHO, VX, VY, NX, NY, forcex, forcey;//FNORM
   //int x, y;
   int WALLS,FLAG;
   float nu, su;
   float mi[Q], meq[Q],fk[Q];
   //shared memeory for propagation with east and west part
   //red fluid
   __shared__ float F_R1[THREAD_NUM];
   __shared__ float F_R5[THREAD_NUM];
   __shared__ float F_R8[THREAD_NUM];
   __shared__ float F_R3[THREAD_NUM];
   __shared__ float F_R6[THREAD_NUM];
   __shared__ float F_R7[THREAD_NUM];
   //blue fluid
   __shared__ float F_B1[THREAD_NUM];
   __shared__ float F_B5[THREAD_NUM];
   __shared__ float F_B8[THREAD_NUM];
   __shared__ float F_B3[THREAD_NUM];
   __shared__ float F_B6[THREAD_NUM];
   __shared__ float F_B7[THREAD_NUM];
   //index
   int tx = threadIdx.x;
   int x = blockIdx.x * THREAD_NUM + tx;
   int y = blockIdx.y;
   int k = y * xl + x;
   //Load data
   R0 = r0old[k]; R1 = r1old[k], R2 = r2old[k], R3 = r3old[k], R4 = r4old[k], R5 = r5old[k], R6 = r6old[k], R7 = r7old[k], R8 = r8old[k];
   B0 = b0old[k]; B1 = b1old[k], B2 = b2old[k], B3 = b3old[k], B4 = b4old[k], B5 = b5old[k], B6 = b6old[k], B7 = b7old[k], B8 = b8old[k];
   NX = nx[k], NY = ny[k];
   WALLS = walls[k], FLAG = flag[k];
   forcex = Forcex[k], forcey = Forcey[k];//FNORM = fnorm[k],
   if (WALLS <= 0)
   {
      //Project
      //计算宏观量      
      VX = B1 + R1 + B5 + R5 + B8 + R8 - B3 - R3 - B6 - R6 - B7 - R7;
      VY = B2 + R2 + B5 + R5 + B6 + R6 - B4 - R4 - B7 - R7 - B8 - R8;
      RHO_R = R0 + R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8;
      RHO_B = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 + B8;
      RHO = RHO_B + RHO_R;
      RHO_N = (RHO_R - RHO_B) / RHO;   
      //method 1
      nu = 2.0 / ((1.0 + RHO_N) / nu_R + (1.0 - RHO_N) / nu_B);
      su = 1.0 / (3 * nu + 0.5);
      float si[Q] = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,su,su };

      //外力对速度的作用以及准三维修正
      VX = (VX + 0.5*forcex) / (RHO + nu / (2 * kk));
      VY = (VY + 0.5*forcey) / (RHO + nu / (2 * kk));
   
      for (int i = 0; i < Q; i++)
         mi = M[0] * (B0 + R0) + M[1] * (B1 + R1) + M[2] * (B2 + R2) + M[3] * (B3 + R3) +
                 M[4] * (B4 + R4) + M[5] * (B5 + R5) + M[6] * (B6 + R6) + M[7] * (B7 + R7) + M[8] * (B8 + R8);
      
      forcex -= nu / kk * VX;   //准三维修正项,debug *VX
      forcey -= nu / kk * VY;

      //M*feq
      meq[0] = RHO;
      meq[1] = 3 * RHO*(VX*VX + VY * VY) - 2 * RHO;
      meq[2] = RHO - 3 * RHO*(VX*VX + VY * VY);
      meq[3] = RHO * VX;
      meq[4] = (-1.0)*RHO * VX;
      meq[5] = RHO * VY;
      meq[6] = (-1.0)*RHO * VY;
      meq[7] = RHO * (VX*VX - VY * VY);
      meq[8] = RHO * VX*VY;
      //M*F
      fk[0] = 0;
      fk[1] = 6 * (VX*forcex + VY * forcey);
      fk[2] = (-6) * (VX*forcex + VY * forcey);
      fk[3] = forcex;
      fk[4] = (-1) * forcex;
      fk[5] = forcey;
      fk[6] = (-1) * forcey;
      fk[7] = 2 * (VX*forcex - VY * forcey);
      fk[8] = VX * forcey + VY * forcex;         //found a bug  ux*fy+uy*fx
      for (int i = 0; i < Q; i++)
         mi = mi - si * (mi - meq) + (1 - 0.5*si)*fk;
      //fk代替为下一时刻分布函数
      for (int i = 0; i < Q; i++)
      {
         fk = 0;
         for (int j = 0; j < Q; j++)
            fk += M_inv[i*Q + j] * mi[j];
      }
      //Recolor
      R0 = RHO_R / RHO * fk[0];
      R1 = RHO_R / RHO * fk[1];
      R2 = RHO_R / RHO * fk[2];
      R3 = RHO_R / RHO * fk[3];
      R4 = RHO_R / RHO * fk[4];
      R5 = RHO_R / RHO * fk[5];
      R6 = RHO_R / RHO * fk[6];
      R7 = RHO_R / RHO * fk[7];
      R8 = RHO_R / RHO * fk[8];
      if (FLAG)             //两相界面
      {
         R1 = R1 + RHO_R * RHO_B / RHO * beta*w[1] * NX;
         R2 = R2 + RHO_R * RHO_B / RHO * beta*w[2] * NY;
         R3 = R3 + RHO_R * RHO_B / RHO * beta*w[3] * (NX * ix[3]);
         R4 = R4 + RHO_R * RHO_B / RHO * beta*w[4] * (NY * iy[4]);
         R5 = R5 + RHO_R * RHO_B / RHO * beta*w[5] * (NX + NY) / sqrt(2.0);
         R6 = R6 + RHO_R * RHO_B / RHO * beta*w[6] * (NY - NX) / sqrt(2.0);
         R7 = R7 + RHO_R * RHO_B / RHO * beta*w[7] * (-1.0)*(NX + NY) / sqrt(2.0);
         R8 = R8 + RHO_R * RHO_B / RHO * beta*w[8] * (NX - NY) / sqrt(2.0);
      }
      B0 = fk[0] - R0;
      B1 = fk[1] - R1;
      B2 = fk[2] - R2;
      B3 = fk[3] - R3;
      B4 = fk[4] - R4;
      B5 = fk[5] - R5;
      B6 = fk[6] - R6;
      B7 = fk[7] - R7;
      B8 = fk[8] - R8;
   }
   else
   {
      float temp;
      temp = R1; R1 = R3; R3 = temp;//red fluid
      temp = R2; R2 = R4; R4 = temp;
      temp = R5; R5 = R7; R7 = temp;
      temp = R6; R6 = R8; R8 = temp;

      temp = B1; B1 = B3; B3 = temp;//blue fluid
      temp = B2; B2 = B4; B4 = temp;
      temp = B5; B5 = B7; B7 = temp;
      temp = B6; B6 = B8; B8 = temp;
   }

   //Exchange boundary fluid
   if (k < xl)//bottom
   {
      float temp = R4; R4 = B4; B4 = temp;
      temp = R7; R7 = B7; B7 = temp;
      temp = R8; R8 = B8; B8 = temp;
   }
   if (k >= xl * (yl - 1))//top
   {
      float temp = B2; B2 = R2; R2 = temp;
      temp = B5; B5 = R5; R5 = temp;
      temp = B6; B6 = R6; R6 = temp;
   }
   //propagate
   //左右迁移
   F_R1[(tx + 1) % THREAD_NUM] = R1;
   F_R5[(tx + 1) % THREAD_NUM] = R5;
   F_R8[(tx + 1) % THREAD_NUM] = R8;

   F_R3[(tx - 1 + THREAD_NUM) % THREAD_NUM] = R3;
   F_R6[(tx - 1 + THREAD_NUM) % THREAD_NUM] = R6;
   F_R7[(tx - 1 + THREAD_NUM) % THREAD_NUM] = R7;

   F_B1[(tx + 1) % THREAD_NUM] = B1;
   F_B5[(tx + 1) % THREAD_NUM] = B5;
   F_B8[(tx + 1) % THREAD_NUM] = B8;

   F_B3[(tx - 1 + THREAD_NUM) % THREAD_NUM] = B3;
   F_B6[(tx - 1 + THREAD_NUM) % THREAD_NUM] = B6;
   F_B7[(tx - 1 + THREAD_NUM) % THREAD_NUM] = B7;
   __syncthreads();

   //write back data
   r0new[k] = R0;
   r1new[k] = F_R1[tx];
   r3new[k] = F_R3[tx];

   b0new[k] = B0;
   b1new[k] = F_B1[tx];
   b3new[k] = F_B3[tx];
   int ts = xl * ((y + 1) % yl) + x;
   r2new[ts] = R2;
   r5new[ts] = F_R5[tx];
   r6new[ts] = F_R6[tx];

   b2new[ts] = B2;
   b5new[ts] = F_B5[tx];
   b6new[ts] = F_B6[tx];
   ts = xl * ((y - 1 + yl) % yl) + x;
   r4new[ts] = R4;
   r7new[ts] = F_R7[tx];
   r8new[ts] = F_R8[tx];

   b4new[ts] = B4;
   b7new[ts] = F_B7[tx];
   b8new[ts] = F_B8[tx];
}

代码如上。一开始速度太慢,在下用visual profiler 分析发现寄存器数量过多导致占有率太低,于是限制寄存器后该函数加快一倍,但是仍然有缺陷。由于寄存器被限制,导致global memory load 速率很慢,还有指令延迟变严重了,不知道是什么原因。
请哪位[名词6]指点一下,感激不尽!