CUDA 3D, 2D data processing.

2009. 12. 18. 17:10·Computer/CUDA
728x90
728x90

In CUDA, kernels can only operate out of device memory.

Device memory can be allocated either as linear memory or CUDA arrays.

 

In this sample, we observe how to use 3D and 2D linear memory in CUDA.

Linear memory exists on the device in a 32-bit address space.

 

Typically, the following functions are used for device memory allocation.

cudaError_t cudaMalloc (void** devPtr,size_t size)

cudaError_t cudaMallocPitch (void** devPtr, size_t* pitch, size_t width, size_t height)

cudaError_t cudaMalloc3D (struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent)

 

To transfer data between the host and the device, the following functions are used.

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind)

cudaError_t cudaMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)

cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms* p)

 

The following example shows how to use 3D and 2D linear memory in CUDA.

This example calculates the sum of each row, so 3D data is used as the input and 2D data is the result.

  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <math.h>
  5. #include <conio.h>
  6. #include <cutil_inline.h>

  7. //-----------------------------------------------------------
  8. // Device code
  // SumRow: sums every row (x direction) of a pitched 3D float volume.
  //
  // Launch layout expected by the indexing below:
  //   blockIdx.x  selects the slice (z), so the grid needs extent.depth blocks.
  //   threadIdx.x selects the row (y),   so a block needs extent.height threads.
  // Surplus blocks/threads exit immediately.
  //
  // Parameters:
  //   devPitchedPtr : pitched input; .ptr is the base address and .pitch the
  //                   row stride in bytes.
  //   extent        : width is used as the number of floats per row, height as
  //                   rows per slice, depth as the number of slices.
  //   devPtr_out    : pitched 2D output, one row of sums per slice.
  //   pitch_out     : stride in bytes between output rows.
  __global__ void SumRow(cudaPitchedPtr devPitchedPtr, cudaExtent extent, float* devPtr_out,int pitch_out)
  {
      // Guard against a launch that is larger than the data.
      if (blockIdx.x >= extent.depth || threadIdx.x >= extent.height) {
          return;
      }

      // Do all byte-offset arithmetic in size_t so large volumes cannot
      // overflow a 32-bit int.
      const char*  base       = (const char*)devPitchedPtr.ptr;
      const size_t pitch      = devPitchedPtr.pitch;
      const size_t slicePitch = pitch * extent.height;

      const char*  slice = base + (size_t)blockIdx.x * slicePitch;
      const float* row   = (const float*)(slice + (size_t)threadIdx.x * pitch);
      float*       sum   = (float*)((char*)devPtr_out + (size_t)blockIdx.x * (size_t)pitch_out);

      // Accumulate in a local variable and store once, instead of doing a
      // read-modify-write on the output element for every input element.
      float acc = 0.f;
      for (size_t x = 0; x < extent.width; x++) {
          acc += row[x];
      }
      sum[threadIdx.x] = acc;
  }

  24. //-----------------------------------------------------------
  25. // host code

  26. int main_impl();

  // Program entry point. The command-line arguments are not used; the result
  // of main_impl() is passed on as the process exit code.
  int main( int argc, char** argv)  {
      (void)argc;
      (void)argv;
      return main_impl();
  }

  // Prints a diagnostic and terminates the program if a CUDA runtime call failed.
  static void checkCuda(cudaError_t err, const char* what)
  {
      if (err != cudaSuccess) {
          fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
          exit(EXIT_FAILURE);
      }
  }

  // Builds a 3D float volume on the host, sums each row on the GPU with SumRow,
  // repeats the summation on the CPU as a reference, and compares both results.
  //
  // Returns 0 when every GPU row sum matches the CPU reference within a
  // tolerance, 1 on a host allocation failure or when a mismatch is found.
  // CUDA runtime failures terminate the program through checkCuda().
  int main_impl()
  {
      //print_data();
      getch();  // wait for a key press before starting (kept from the original)

      const size_t s[3]     = {256, 256, 209};  // x (width), y (height), z (depth) in elements
      const size_t N        = s[2] * s[1] * s[0];
      const size_t size     = N * sizeof(float);
      const size_t N_out    = s[2] * s[1];
      const size_t size_out = N_out * sizeof(float);
      printf("size[%f]GB\n", size / 1024.f / 1024.f / 1024.f);

      // memory allocation and init. in the host memory.
      float* h_A = (float*)malloc(size);      // input volume
      float* h_C = (float*)malloc(size_out);  // row sums computed on the GPU
      float* h_D = (float*)malloc(size_out);  // row sums computed on the CPU
      if (h_A == NULL || h_C == NULL || h_D == NULL) {
          fprintf(stderr, "host memory allocation failed\n");
          free(h_A);
          free(h_C);
          free(h_D);
          return 1;
      }

      for (size_t n = 0; n < N; n++) {
          h_A[n] = (float)n;
      }

      // memory allocation for device memory
      // For linear memory the extent width given to cudaMalloc3D is in BYTES,
      // so the allocation extent differs from the element-count extent that
      // the kernel uses as its loop bound.
      cudaPitchedPtr devPitchedPtr;
      cudaExtent     allocExtent = make_cudaExtent(s[0] * sizeof(float), s[1], s[2]);
      cudaExtent     extent      = make_cudaExtent(s[0], s[1], s[2]);
      checkCuda(cudaMalloc3D(&devPitchedPtr, allocExtent), "cudaMalloc3D");

      float* devPtr = NULL;  // 2D result. (y,z)
      size_t pitch_out = 0;  // This variable will be set by cudaMallocPitch
      checkCuda(cudaMallocPitch((void**)&devPtr, &pitch_out, s[1] * sizeof(float), s[2]),
                "cudaMallocPitch");

      unsigned int timer = 0;
      cutilCheckError( cutCreateTimer( &timer));
      cutilCheckError( cutStartTimer( timer));

      // copy from host memory data to device memory data
      cudaMemcpy3DParms p = {0};
      p.srcPtr.ptr    = h_A;
      p.srcPtr.pitch  = s[0] * sizeof(float);
      p.srcPtr.xsize  = s[0];
      p.srcPtr.ysize  = s[1];
      p.dstPtr.ptr    = devPitchedPtr.ptr;
      p.dstPtr.pitch  = devPitchedPtr.pitch;
      p.dstPtr.xsize  = s[0];
      p.dstPtr.ysize  = s[1];
      p.extent.width  = s[0] * sizeof(float);  // bytes per row to copy
      p.extent.height = s[1];
      p.extent.depth  = s[2];
      p.kind = cudaMemcpyHostToDevice;
      checkCuda(cudaMemcpy3D(&p), "cudaMemcpy3D");

      // One block per slice (z), one thread per row (y).
      const int threadsPerBlock = (int)s[1];
      const int blocksPerGrid   = (int)s[2];
      SumRow<<<blocksPerGrid,threadsPerBlock>>>(devPitchedPtr, extent, devPtr, (int)pitch_out);
      checkCuda(cudaGetLastError(), "SumRow launch");

      // Blocking copy: also waits for the kernel and reports its execution errors.
      checkCuda(cudaMemcpy2D(h_C, s[1] * sizeof(float), devPtr, pitch_out,
                             s[1] * sizeof(float), s[2], cudaMemcpyDeviceToHost),
                "cudaMemcpy2D");

      // free device memory.
      checkCuda(cudaFree(devPitchedPtr.ptr), "cudaFree(input)");
      checkCuda(cudaFree(devPtr), "cudaFree(output)");

      // finish Timer
      cutilCheckError( cutStopTimer( timer));
      printf( "Processing time1: %f (ms)\n", cutGetTimerValue( timer));
      cutilCheckError( cutDeleteTimer( timer));

      // for reference.
      timer = 0;
      cutilCheckError( cutCreateTimer( &timer));
      cutilCheckError( cutStartTimer( timer));
      size_t in  = 0;  // running index into h_A
      size_t out = 0;  // running index into h_D
      for (size_t z = 0; z < s[2]; z++) {
          for (size_t y = 0; y < s[1]; y++) {
              h_D[out] = 0.f;
              for (size_t x = 0; x < s[0]; x++) {
                  h_D[out] += h_A[in];
                  in++;
              }
              out++;
          }
      }
      cutilCheckError( cutStopTimer( timer));
      printf( "Processing time2: %f (ms)\n", cutGetTimerValue( timer));
      cutilCheckError( cutDeleteTimer( timer));

      // comparison
      // Compare the absolute difference against a tolerance so that deviations
      // in either direction (and NaN) are reported.
      int mismatches = 0;
      for (size_t k = 0; k < N_out; k++) {
          // printf("test N[%d][%6.5f/%6.5f]\n",(int)k,h_C[k],h_D[k]);
          const float diff = fabsf(h_C[k] - h_D[k]);
          const float tol  = 1e-4f * fabsf(h_D[k]) + 1e-6f;
          if (!(diff <= tol)) {
              printf("error N[%d][%6.5f/%6.5f]\n", (int)k, h_C[k], h_D[k]);
              mismatches++;
          }
      }

      free(h_A);
      free(h_C);
      free(h_D);

      return mismatches == 0 ? 0 : 1;
  }

 

 

 

이 글은 스프링노트에서 작성되었습니다.

'Computer > CUDA' 카테고리의 다른 글

[ML] WSL2 : Install Tensorflow (GPU)  (0) 2022.07.17
CUDA Compile Environment with VS 2008  (0) 2009.12.18
'Computer/CUDA' 카테고리의 다른 글
  • [ML] WSL2 : Install Tensorflow (GPU)
  • CUDA Compile Environment with VS 2008
dsaint31x
dsaint31x
    반응형
    250x250
  • dsaint31x
    Dsaint31's blog
    dsaint31x
  • 전체
    오늘
    어제
    • 분류 전체보기 (748)
      • Private Life (13)
      • Programming (56)
        • DIP (112)
        • ML (26)
      • Computer (119)
        • CE (53)
        • ETC (33)
        • CUDA (3)
        • Blog, Markdown, Latex (4)
        • Linux (9)
      • ... (351)
        • Signals and Systems (103)
        • Math (172)
        • Linear Algebra (33)
        • Physics (42)
        • 인성세미나 (1)
      • 정리필요. (54)
        • 의료기기의 이해 (6)
        • PET, MRI and so on. (1)
        • PET Study 2009 (1)
        • 방사선 장해방호 (4)
        • 방사선 생물학 (3)
        • 방사선 계측 (9)
        • 기타 방사능관련 (3)
        • 고시 (9)
        • 정리 (18)
      • RI (0)
      • 원자력,방사능 관련법 (2)
  • 블로그 메뉴

    • Math
    • Programming
    • SS
    • DIP
  • 링크

    • Convex Optimization For All
  • 공지사항

    • Test
    • PET Study 2009
    • 기타 방사능관련.
  • 인기 글

  • 태그

    function
    Convolution
    numpy
    인허가제도
    SS
    Vector
    signal_and_system
    Term
    fourier transform
    opencv
    Optimization
    Programming
    linear algebra
    Probability
    Python
    DIP
    signals_and_systems
    SIGNAL
    math
    cv2
  • 최근 댓글

  • 최근 글

  • hELLO· Designed By정상우.v4.10.3
dsaint31x
CUDA 3D, 2D data processing.
상단으로

티스토리툴바