CUDAで並列文字列生成 - nishio-dens's diary

CUDA勉強のため，現在実行中のスレッドIDとブロックIDをメモリに書き出す簡単なプログラムを作成．実行結果はこんな感じ

I'm thread 0. My Block ID is 0
I'm thread 1. My Block ID is 0
I'm thread 2. My Block ID is 0
I'm thread 0. My Block ID is 1
I'm thread 1. My Block ID is 1
I'm thread 2. My Block ID is 1
I'm thread 0. My Block ID is 2
I'm thread 1. My Block ID is 2
I'm thread 2. My Block ID is 2
I'm thread 0. My Block ID is 3
I'm thread 1. My Block ID is 3
I'm thread 2. My Block ID is 3
I'm thread 0. My Block ID is 4
I'm thread 1. My Block ID is 4
I'm thread 2. My Block ID is 4

こんな単純なプログラムを作っただけだが，色々CUDAプログラミングを行う上でわかったことがあった．
一つは標準関数がほとんど用意されていないので，必要な関数は自分で作る必要があるということ．今回はGPU側で整数を文字列に変換する関数（itoaやsprintfに相当するもの）を使いたかったのだが，デバイスコードから使えるものは標準では用意されていない模様．なので自作する必要があった（下記コードのgpuAtoiがそれで，名前はAtoiだが実際に行っているのは数値→文字列の変換である）．

もう一つ学んだことは，ポインタのポインタがうまく使えない場合があるということ．下記のプログラムのgenText関数内の一部コメントアウトされている部分は正常に動かない．

char* text[4] = {text1, threadIDStr, text2, blockIDStr};

このように，文字列へのポインタの配列を作ったのだが，

CUDA Cannot tell what pointer points to, assuming global memory space

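なる警告が発生．警告だけなら無視すれば，と思ったのだが，結果もうまく出なかった．警告の内容から，ポインタが指している先のメモリ空間をコンパイラが特定できない，ということなのだろう．text1やtext2は__device__変数（グローバルメモリ）で，threadIDStrやblockIDStrはカーネル内のローカル配列なので，これらを一つのポインタ配列に混在させると，コンパイル時にはどのメモリ空間へのアクセスなのか決められない．その結果グローバルメモリと仮定したコードが生成され，ローカル配列の中身が正しく読めなかったのだと思われる（確証はない）．いずれにせよ，指す先のメモリ空間が異なるポインタを混ぜたポインタの配列（ポインタのポインタ）はうまく動かない可能性があるということは学んだ．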

#include <stdio.h>

// Fixed text fragments that genText copies into every output record.
__device__ char text1[] = "I'm thread ";
__device__ char text2[] =  ". My Block ID is "; 
// Digit lookup table used by gpuAtoi; it is indexed with (value % radix).
__device__ char convNum[] = "0123456789ABCDEF";

// Lengths of the fixed fragments, excluding the terminating NUL.
#define TEXT1_SIZE (sizeof(text1) - 1)
#define TEXT2_SIZE (sizeof(text2) - 1)
// Bytes per record beyond the fixed fragments; genText fills them with the
// two ID numbers, space padding and the trailing newline.
#define EXTRA_SIZE 10
// Total number of bytes each thread writes (one fixed-width record).
#define THREAD_DATA_SIZE ( TEXT1_SIZE + TEXT2_SIZE + EXTRA_SIZE )
// Number of bits in a byte.
#define BYTE_SIZE (8) 
// Number of bits in an int; gpuAtoi sizes its digit scratch buffer from this.
#define INT_SIZE (sizeof(int) * BYTE_SIZE)

/**
 * Convert an integer to its NUL-terminated text representation.
 *
 * Despite the name, this is an itoa-style routine (number -> string).
 *
 * @param value  Number to convert. Negative values get a leading '-'.
 * @param str    Destination buffer. It must hold the sign, every digit and
 *               the terminating NUL: 12 bytes are enough for radix 10 and
 *               INT_SIZE + 2 bytes are enough for any supported radix.
 * @param radix  Numeric base, 2..16 (limited by the convNum digit table).
 *               For any other radix an empty string is written.
 */
__device__ void gpuAtoi(int value, char* str, int radix)
{
  // Digits are produced least-significant first; one digit per bit is the
  // worst case (radix 2).
  char convData[INT_SIZE];
  int len = 0;
  int head = 0;
  int i;

  // radix 0 would divide by zero, radix 1 cannot represent the value, and
  // anything above 16 would index past the end of convNum.
  if( radix < 2 || radix > 16 ) {
    str[0] = '\0';
    return;
  }

  // Work on the magnitude as unsigned so that negating INT_MIN is well defined.
  unsigned int base = (unsigned int)radix;
  unsigned int v = (unsigned int)value;
  if( value < 0 ) {
    str[head] = '-';
    head++;
    v = 0u - v;
  }

  do{
    convData[len] = convNum[ v % base ];
    len++;
    v /= base;
  }while( v > 0 && len < (int)INT_SIZE);

  // Copy the digits back in reverse order, after the optional sign.
  // Exactly len digits are copied, so convData is never read out of range.
  for(i=0; i < len; i++) {
      str[head + i] = convData[ len - i - 1 ];
  }
  str[head + len] = '\0';
}

/**
 * Write one fixed-width text record per thread:
 *   "I'm thread <threadIdx.x>. My Block ID is <blockIdx.x>"
 * padded with spaces and terminated by '\n'.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Shared memory: none.
 *
 * @param address  Output buffer of at least
 *                 gridDim.x * blockDim.x * THREAD_DATA_SIZE bytes.
 *                 Record k = blockIdx.x * blockDim.x + threadIdx.x starts at
 *                 byte k * THREAD_DATA_SIZE. If a message does not fit in a
 *                 record it is truncated; neighbouring records are never
 *                 overwritten.
 */
__global__ void genText(char* address) {
  int threadID = threadIdx.x;
  int blockID = blockIdx.x;
  // Compute the offset in size_t so large grids cannot overflow 32-bit math.
  size_t recordIndex = (size_t)blockID * blockDim.x + (size_t)threadID;
  char* writeAddress = address + recordIndex * THREAD_DATA_SIZE;

  // Up to 10 decimal digits plus an optional sign plus the terminating NUL.
  char threadIDStr[12];
  char blockIDStr[12];

  gpuAtoi( threadID, threadIDStr, 10);
  gpuAtoi( blockID, blockIDStr, 10);

  // The last byte of the record is reserved for the newline.
  const int textLimit = (int)THREAD_DATA_SIZE - 1;
  int currentPos = 0;
  int i;
  // int j;

  // Kept for reference: mixing pointers to __device__ globals and to local
  // arrays in one pointer array triggered "Cannot tell what pointer points
  // to, assuming global memory space" and produced wrong output, so each
  // fragment is copied by its own loop below.
  // char* text[4] = {text1, threadIDStr, text2, blockIDStr};

  // for(i=0; i < 4; i++) {
  //   for(j=0; text[i][j] != '\0'; j++) {
  //      writeAddress[j + currentPos] = text[i][j];
  //   }
  //   currentPos +=  j;
  // }

  for(i=0; text1[i] != '\0' && currentPos + i < textLimit; i++) {
    writeAddress[i + currentPos] = text1[i];
  }
  currentPos += i;

  for(i=0; threadIDStr[i] != '\0' && currentPos + i < textLimit; i++) {
    writeAddress[i + currentPos] = threadIDStr[i];
  }
  currentPos += i;

  for(i=0; text2[i] != '\0' && currentPos + i < textLimit; i++) {
    writeAddress[i + currentPos] = text2[i];
  }
  currentPos += i;

  for(i=0; blockIDStr[i] != '\0' && currentPos + i < textLimit; i++) {
    writeAddress[i + currentPos] = blockIDStr[i];
  }
  currentPos += i;

  // Pad the remainder of the record with spaces, then end it with a newline.
  for(; currentPos < textLimit; currentPos++) {
    writeAddress[currentPos] = ' ';
  }
  writeAddress[textLimit] = '\n';
}

// Report a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
  if( status == cudaSuccess ) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Launch genText with 5 blocks of 3 threads, copy the generated records back
// to the host and print them. Returns 0 on success, 1 on any failure.
int main()
{
  dim3 threadNum(3);
  dim3 blockNum(5);
  // One fixed-width record per launched thread.
  size_t textSize = THREAD_DATA_SIZE * threadNum.x * blockNum.x;

  // The host copy gets one extra byte for the terminating NUL needed by puts.
  char* buffer = (char*)malloc(textSize + 1);
  if( buffer == NULL ) {
    fprintf(stderr, "malloc failed\n");
    return 1;
  }

  char* deviceBuffer = NULL;
  if( !cudaOk(cudaMalloc(&deviceBuffer, textSize), "cudaMalloc") ) {
    free(buffer);
    return 1;
  }

  genText<<<blockNum, threadNum>>>(deviceBuffer);
  // A kernel launch returns nothing: configuration errors are reported by
  // cudaGetLastError, and faults during execution surface at the blocking
  // cudaMemcpy that follows.
  bool ok = cudaOk(cudaGetLastError(), "genText launch")
         && cudaOk(cudaMemcpy( buffer, deviceBuffer, textSize, cudaMemcpyDeviceToHost), "cudaMemcpy");
  // Always release the device buffer, even if the kernel or the copy failed.
  ok = cudaOk(cudaFree(deviceBuffer), "cudaFree") && ok;

  if( ok ) {
    buffer[textSize] = '\0';
    puts(buffer);
  }
  free(buffer);

  return ok ? 0 : 1;
}