ユーザーが書いたカーネルの内部を推し進める

Question

私はスラストの初心者です。 Thrustのすべてのプレゼンテーションと例にホストコードのみが表示されていることがわかります。

自分のカーネルにdevice_vectorを渡すことができるかどうか知りたいのですが？どうやって？はいの場合、カーネル/デバイスコード内で許可されている操作は何ですか？

talonmies · Answer

最初に書かれたように、Thrustは純粋にホスト側の抽象化です。カーネル内では使用できません。次のように、thrust::device_vector内にカプセル化されたデバイスメモリを独自のカーネルに渡すことができます。

thrust::device_vector< Foo > fooVector; // Do something thrust-y with fooVector Foo* fooArray = thrust::raw_pointer_cast( &fooVector[0] ); // Pass raw array and its size to kernel someKernelCall<<< x, y >>>( fooArray, fooVector.size() );

また、スラストアルゴリズム内でスラストによって割り当てられていないデバイスメモリを使用するには、ベアcudaデバイスメモリポインターでthrust :: device_ptrをインスタンス化します。

4年半後に編集 @JackOLanternの回答に従って追加すると、スラスト1.8は、デバイスでスラストのアルゴリズムのシングルスレッドバージョンを実行できることを意味する順次実行ポリシーを追加します。ただし、スラストデバイスベクトルをカーネルに直接渡すことはまだ不可能であり、デバイスベクトルをデバイスコードで直接使用することはできません。

場合によっては、thrust::device実行ポリシーを使用して、子グリッドとしてカーネルによって並列推力実行を起動することもできます。これには、個別のコンパイル/デバイスリンケージと動的並列処理をサポートするハードウェアが必要です。これが実際にすべての推力アルゴリズムでサポートされているかどうかはわかりませんが、一部のアルゴリズムでは確実に機能します。

JackOLantern · Answer

これは私の以前の答えの更新です。

Thrust 1.8.1以降、CUDA Thrustプリミティブをthrust::device実行ポリシーと組み合わせて、CUDA動的並列処理を利用する単一のCUDAスレッド内で並列に実行できます。以下に例を示します。

#include <stdio.h> #include <thrust/reduce.h> #include <thrust/execution_policy.h> #include "TimingGPU.cuh" #include "Utilities.cuh" #define BLOCKSIZE_1D 256 #define BLOCKSIZE_2D_X 32 #define BLOCKSIZE_2D_Y 32 /*************************/ /* TEST KERNEL FUNCTIONS */ /*************************/ __global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) { const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::seq, d_data + tid * Ncols, d_data + (tid + 1) * Ncols); } __global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) { const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::device, d_data + tid * Ncols, d_data + (tid + 1) * Ncols); } /********/ /* MAIN */ /********/ int main() { const int Nrows = 64; const int Ncols = 2048; gpuErrchk(cudaFree(0)); // size_t DevQueue; // gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount)); // DevQueue *= 128; // gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue)); float *h_data = (float *)malloc(Nrows * Ncols * sizeof(float)); float *h_results = (float *)malloc(Nrows * sizeof(float)); float *h_results1 = (float *)malloc(Nrows * sizeof(float)); float *h_results2 = (float *)malloc(Nrows * sizeof(float)); float sum = 0.f; for (int i=0; i<Nrows; i++) { h_results[i] = 0.f; for (int j=0; j<Ncols; j++) { h_data[i*Ncols+j] = i; h_results[i] = h_results[i] + h_data[i*Ncols+j]; } } TimingGPU timerGPU; float *d_data; gpuErrchk(cudaMalloc((void**)&d_data, Nrows * Ncols * sizeof(float))); float *d_results1; gpuErrchk(cudaMalloc((void**)&d_results1, Nrows * sizeof(float))); float *d_results2; gpuErrchk(cudaMalloc((void**)&d_results2, Nrows * sizeof(float))); gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice)); timerGPU.StartCounter(); test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); printf("Timing approach nr. 1 = %f
", timerGPU.GetCounter()); gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost)); for (int i=0; i<Nrows; i++) { if (h_results1[i] != h_results[i]) { printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]); return 0; } } timerGPU.StartCounter(); test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); printf("Timing approach nr. 2 = %f
", timerGPU.GetCounter()); gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost)); for (int i=0; i<Nrows; i++) { if (h_results1[i] != h_results[i]) { printf("Approach nr. 2; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]); return 0; } } printf("Test passed!
"); }

上記の例は、マトリックスの行の削減を Reduce matrix rows with CUDA と同じように実行しますが、上記の投稿とは異なり、つまり、ユーザーがCUDA推力プリミティブを直接呼び出すことで行われますカーネル。また、上記の例は、2つの実行ポリシー、つまりthrust::seqとthrust::deviceを使用して実行した場合の同じ操作のパフォーマンスを比較するのに役立ちます。以下に、パフォーマンスの違いを示すいくつかのグラフを示します。

パフォーマンスは、Kepler K20cおよびMaxwell GeForce GTX 850Mで評価されています。

JackOLantern · Answer

この質問に対する更新された回答を提供したいと思います。

Thrust 1.8から、CUDA Thrustプリミティブはthrust::seq単一のCUDAスレッド内で順次実行する（または単一のCPUスレッド内で順次実行する）実行ポリシー。以下に例を示します。

スレッド内で並列実行する場合は、カードが有効であれば、スレッドブロック内から呼び出すことができるリダクションルーチンを提供する [〜＃〜] cub [〜＃〜] の使用を検討できます。動的並列処理。

これがスラストの例です

#include <stdio.h> #include <thrust/reduce.h> #include <thrust/execution_policy.h> /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d
", cudaGetErrorString(code), file, line); if (abort) exit(code); } } __global__ void test(float *d_A, int N) { float sum = thrust::reduce(thrust::seq, d_A, d_A + N); printf("Device side result = %f
", sum); } int main() { const int N = 16; float *h_A = (float*)malloc(N * sizeof(float)); float sum = 0.f; for (int i=0; i<N; i++) { h_A[i] = i; sum = sum + h_A[i]; } printf("Host side result = %f
", sum); float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float))); gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice)); test<<<1,1>>>(d_A, N); }

fabrizioM · Answer

Thrust yesで割り当て/処理されたデータを使用する場合は、割り当てられたデータの生のポインタを取得してください。

int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);

カーネルに推力ベクトルを割り当てたい場合、私は試したことはありませんが、うまくいくとは思いませんし、うまくいくとしたら、何のメリットもないと思います。