Cuda Shared Memory配列変数

Question

次のように、行列乗算の変数を宣言しようとしています。

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

ユーザーが行列のサイズを入力して計算できるようにしたいのですが、そのためにはBLOCK_SIZEを変更できる必要があります。変更したところ、コンパイラエラー「エラー：定数値が不明です」が発生しました。調べてみたところ、このスレッドと似た問題のようです。そこで次のように試しました：

__shared__ int buf [];

しかし、「エラー：不完全な型は許可されていません」と表示されます。

よろしくお願いします。Dan　追記としてコードを載せます（ほぼこのガイドとCUDA入門ガイドに従っています）：ブロックサイズは、ユーザーに行列のサイズを尋ねて渡しています。ユーザーはxとyを入力します。ブロックサイズにはxのみを使用しており、現時点ではxとyに同じ値を入力する必要があります。

/**
 * Tiled matrix multiplication C = A * B that keeps both input tiles in a
 * single dynamically sized shared-memory buffer.
 *
 * Launch requirements (they follow from the indexing below):
 *   - blockDim must be (block_size, block_size): every thread loads one
 *     element of each tile and produces one element of C.
 *   - The third launch-configuration argument must be at least
 *     2 * block_size * block_size * sizeof(float) bytes. The first half of
 *     the buffer holds the tile of A, the second half the tile of B.
 *   - There are no bounds checks, so the matrix dimensions must be exact
 *     multiples of block_size.
 *
 * @param C          output matrix, indexed with row stride wB
 * @param A          left operand, indexed with row stride wA
 * @param B          right operand, indexed with row stride wB
 * @param wA         row stride (width) of A
 * @param wB         row stride (width) of B
 * @param block_size edge length of the square tile handled by one block
 */
__global__ void matrixMul( float* C, float* A, float* B, int wA, int wB,size_t block_size)
{
    // A kernel has only one dynamic shared-memory allocation: every
    // `extern __shared__` array starts at the same address. Declare it once
    // and carve the two tiles out of it by offset.
    extern __shared__ float smem[];

    // Convert once so all index arithmetic stays in `int` and the inner loop
    // does not compare signed with unsigned values.
    const int bs = static_cast<int>(block_size);

    float* tileA = smem;            // bs x bs tile of A
    float* tileB = smem + bs * bs;  // bs x bs tile of B

    // Block index
    const int bx = blockIdx.x;
    const int by = blockIdx.y;

    // Thread index
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    const int aBegin = wA * bs * by;

    // Index of the last sub-matrix of A processed by the block
    const int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    const int aStep = bs;

    // Index of the first sub-matrix of B processed by the block
    const int bBegin = bs * bx;

    // Step size used to iterate through the sub-matrices of B
    const int bStep = bs * wB;

    // Running dot product for this thread's element of C
    float Csub = 0.0f;

    // Loop over all the sub-matrices of A and B required to compute the
    // block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Load the tiles from global memory to shared memory; each thread
        // loads one element of each matrix
        tileA[ty * bs + tx] = A[a + wA * ty + tx];
        //cuPrintf("\n\nWhat are the memory locations?\n");
        //cuPrintf("The shared memory(A) is: %.2f\n", tileA[ty * bs + tx]);
        tileB[ty * bs + tx] = B[b + wB * ty + tx];
        //cuPrintf("The shared memory(B) is: %.2f\n", tileB[ty * bs + tx]);

        // Synchronize to make sure both tiles are fully loaded
        __syncthreads();

        // Multiply the two tiles together; each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < bs; ++k)
        {
            Csub += tileA[ty * bs + k] * tileB[k * bs + tx];
            //cuPrintf("Csub is currently: %.2f\n", Csub);
        }
        //cuPrintf("\n\n\n");
        //cuPrintf("the results are csub: %.2f\n", Csub);

        // Synchronize to make sure that the preceding computation is done
        // before the next iteration overwrites the tiles
        __syncthreads();
    }

    // Write the block sub-matrix to device memory; each thread writes one
    // element
    const int c = wB * bs * by + bs * bx;
    C[c + wB * ty + tx] = Csub;
}

brano · Accepted Answer

extern __shared__ int buf[];

カーネルを起動するときは、この方法で起動する必要があります。

kernel<<<blocks,threads,numbytes_for_shared>>>(...);

共有の複数のextern宣言がある場合：

extern __shared__ float As[];

extern __shared__ float Bs[];

これにより、AsがBsと同じアドレスを指すようになります。

AsとBsを1D配列内に保持する必要があります。

extern __shared__ float smem[];

カーネルを呼び出すときは、2*BLOCK_SIZE*BLOCK_SIZE*sizeof(float)で起動する必要があります。

Asにインデックスする場合は`smem[y*BLOCK_SIZE+x]`を使用し、Bsにインデックスする場合は`smem[BLOCK_SIZE*BLOCK_SIZE+y*BLOCK_SIZE+x]`を使用します。

talonmies · Answer

カーネル内で共有メモリを宣言するには、静的または動的の2つの選択肢があります。現在のコードは、おそらく次のようになっていると思います：

// Tile edge length; must be a compile-time constant for static shared memory.
#define BLOCK_SIZE (16)

// Kernel stub showing a statically allocated shared-memory tile whose
// dimensions are fixed at compile time by BLOCK_SIZE.
__global__ void sgemm0(const float *A, const float *B, float *C)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
}

また、BLOCK_SIZEを簡単に変更できるようにしたいとします。

1つの可能性は、静的共有メモリ割り当てを引き続き使用することですが、次のように、割り当てサイズをテンプレートパラメータにします。

// Kernel stub: the tile edge length is a template parameter, so the shared
// array is still statically sized, once per instantiation.
template <int blocksize = 16>
__global__ void sgemm1(const float *A, const float *B, float *C)
{
    __shared__ float As[blocksize][blocksize];
}

// Explicit instantiation of the 16x16 variant.
template void sgemm1<16>(const float *, const float *, float *C);

その後、必要なだけコンパイル時にさまざまなブロックサイズのバリアントをインスタンス化できます。

メモリを動的に割り当てる場合は、次のように定義します。

// Kernel stub: the shared array has no size in the declaration; its size in
// bytes is supplied by the third launch-configuration argument.
__global__ void sgemm2(const float *A, const float *B, float *C)
{
    extern __shared__ float As[];
}

次に、割り当てのサイズを引数としてカーネル呼び出しに追加します。

// Number of floats in one BLOCK_SIZE x BLOCK_SIZE tile.
size_t tileElems = BLOCK_SIZE * BLOCK_SIZE;

// The third launch-configuration argument is the dynamic shared-memory size
// in bytes.
sgemm2<<< gridDim, blockDim, sizeof(float) * tileElems >>>(....);

動的に割り当てられた共有メモリで置き換える静的に宣言された配列が複数ある場合は、カーネルごとに動的共有メモリの割り当てが1つしかないため、そのメモリセグメント内に複数のアイテムが存在する（共有される）ことに注意してください。したがって、次のようなものがある場合：

// Tile edge length; must be a compile-time constant for static shared memory.
#define BLOCK_SIZE (16)

// Kernel stub with two statically allocated shared-memory tiles, each
// BLOCK_SIZE x BLOCK_SIZE floats.
__global__ void sgemm0(const float *A, const float *B, float *C)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
}

これを次のように置き換えることができます：

// Tile edge length used to split the dynamic shared-memory buffer.
#define BLOCK_SIZE (16)

// Kernel stub: a single dynamic shared-memory buffer is split by pointer
// offset into two tiles. The launch must therefore supply at least
// 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(float) bytes of dynamic shared memory.
__global__ void sgemm3(const float *A, const float *B, float *C)
{
    extern __shared__ float buffer[];

    float *As = &buffer[0];                        // first tile
    float *Bs = &buffer[BLOCK_SIZE * BLOCK_SIZE];  // second tile, right after As
}

次のようにカーネルを起動します。

// Number of floats in two BLOCK_SIZE x BLOCK_SIZE tiles (As followed by Bs).
size_t tileElems = 2 * BLOCK_SIZE * BLOCK_SIZE;

// The third launch-configuration argument is the dynamic shared-memory size
// in bytes, covering both tiles.
sgemm3<<< gridDim, blockDim, sizeof(float) * tileElems >>>(....);

どの方法も同じように有効ですが、テンプレート版では、動的版では追加の作業なしには行えない自動ループ展開などのコンパイラ最適化が可能になるため、私は個人的にテンプレート版を好みます。

Nash · Answer

正解ですね。

通常、この場合は、何かをmallocする必要があります。

ここでのポイントは2つあります。1つ目は、Cは2次元配列というものを知らない（配列の配列にすぎない）ということ、2つ目は、配列サイズはコンパイル時定数（またはコンパイラがコンパイル時に計算できるもの）でなければならないということです。

C99を使用している場合は、関数のパラメーターを使用して配列サイズを宣言できますが、C99のサポートは...せいぜいむらがあります。