CUDA API 메모리 종류

Programming/openCL & CUDA2012. 5. 18. 21:22

CUDA API 메모리 종류

CUDA device에서 제공하는 메모리의 종류는 다음과 같다.

5.3.2 Device Memory Accesses .................................................................... 70

5.3.2.1 Global Memory ............................................................................ 70

5.3.2.2 Local Memory .............................................................................. 72

5.3.2.3 Shared Memory ........................................................................... 72

5.3.2.4 Constant Memory ........................................................................ 73

5.3.2.5 Texture and Surface Memory ........................................................ 73

[출처 : CUDA C Programming guide.pdf]

Local memory 와 Global memory는 그래픽 카드의 비디오 메모리(통상 512MB / 1기가 이런식으로 말하는)에 존재하고
Shared memory는 GPU 내의 Multi-Processor에 통합되어있다.

Devicequery를 비교하면서 보자면
8800GT 512MB 짜리에서
Global memory와 Local memory는 512MB 까지 가능하며
Shared memory는 블럭당 16KB 까지 가능하다.

Device 0: "GeForce 8800 GT"
CUDA Driver Version:                           3.20
CUDA Runtime Version:                          3.10
CUDA Capability Major revision number:         1
CUDA Capability Minor revision number:         1
Total amount of global memory:                 536543232 bytes
Number of multiprocessors:                     14
Number of cores:                               112

Total amount of constant memory:               65536 bytes
Total amount of shared memory per block:       16384 bytes
Total number of registers available per block: 8192
Warp size:                                     32
  Maximum number of threads per block:           512
  Maximum sizes of each dimension of a block:    512 x 512 x 64
Maximum sizes of each dimension of a grid:     65535 x 65535 x 1

2011/01/02 - [Programming/openCL / CUDA] - deviceQuery on 8600GT 512MB + CUDA 하드웨어 구조

예제로 들어있는 행렬곱 예제에서
shared memory를 사용하고 사용하지 않는 차이점은 아래의 그림처럼
Global memory에 직접 한 바이트씩 읽어서 계산하는지

아니면 global memory의 블럭을
shared memory로 일정 영역만(블럭 사이즈 만큼) 복사해서 계산을 하는지의 차이점이 있다.

다른 책에 의하면 global memory는 700~900 cuda clock에 읽어오고
shared memory는 거의 1 cuda clock에 읽어 온다고 하니
되도록이면 shared memory에 복사해서 더욱 빠르게 연산하는게 유리하다고 한다.

// Matrices are stored in row-major order:

// M(row, col) = *(M.elements + row * M.width + col)

typedef struct {

int width;

int height;

float* elements;

} Matrix;

// Thread block size

#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code

// Matrix dimensions are assumed to be multiples of BLOCK_SIZE

void MatMul(const Matrix A, const Matrix B, Matrix C)

{

// Load A and B to device memory

Matrix d_A;

d_A.width = A.width; d_A.height = A.height;

size_t size = A.width * A.height * sizeof(float);

cudaMalloc(&d_A.elements, size);

cudaMemcpy(d_A.elements, A.elements, size,

cudaMemcpyHostToDevice);

Matrix d_B;

d_B.width = B.width; d_B.height = B.height;

size = B.width * B.height * sizeof(float);

cudaMalloc(&d_B.elements, size);

cudaMemcpy(d_B.elements, B.elements, size,

cudaMemcpyHostToDevice);

// Allocate C in device memory

Matrix d_C;

d_C.width = C.width; d_C.height = C.height;

size = C.width * C.height * sizeof(float);

cudaMalloc(&d_C.elements, size);

// Invoke kernel

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

// Read C from device memory

cudaMemcpy(C.elements, Cd.elements, size,

cudaMemcpyDeviceToHost);

// Free device memory

cudaFree(d_A.elements);

cudaFree(d_B.elements);

cudaFree(d_C.elements);

}

// Matrix multiplication kernel called by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

// Each thread computes one element of C

// by accumulating results into Cvalue

float Cvalue = 0;

int row = blockIdx.y * blockDim.y + threadIdx.y;

int col = blockIdx.x * blockDim.x + threadIdx.x;

for (int e = 0; e < A.width; ++e)

Cvalue += A.elements[row * A.width + e]

* B.elements[e * B.width + col];

C.elements[row * C.width + col] = Cvalue;

}

// Matrices are stored in row-major order:

// M(row, col) = *(M.elements + row * M.stride + col)

typedef struct {

int width;

int height;

int stride;

float* elements;

} Matrix;

// Get a matrix element

__device__ float GetElement(const Matrix A, int row, int col)

{

return A.elements[row * A.stride + col];

}

// Set a matrix element

__device__ void SetElement(Matrix A, int row, int col,

float value)

{

A.elements[row * A.stride + col] = value;

}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is

// located col sub-matrices to the right and row sub-matrices down

// from the upper-left corner of A

__device__ Matrix GetSubMatrix(Matrix A, int row, int col)

{

Matrix Asub;

Asub.width = BLOCK_SIZE;

Asub.height = BLOCK_SIZE;

Asub.stride = A.stride;

Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row

+ BLOCK_SIZE * col];

return Asub;

}

// Thread block size

#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel

__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code

// Matrix dimensions are assumed to be multiples of BLOCK_SIZE

void MatMul(const Matrix A, const Matrix B, Matrix C)

{

// Load A and B to device memory

Matrix d_A;

d_A.width = d_A.stride = A.width; d_A.height = A.height;

size_t size = A.width * A.height * sizeof(float);

cudaMalloc(&d_A.elements, size);

cudaMemcpy(d_A.elements, A.elements, size,

cudaMemcpyHostToDevice);

Matrix d_B;

d_B.width = d_B.stride = B.width; d_B.height = B.height;

size = B.width * B.height * sizeof(float);

cudaMalloc(&d_B.elements, size);

cudaMemcpy(d_B.elements, B.elements, size,

cudaMemcpyHostToDevice);

// Allocate C in device memory

Matrix d_C;

d_C.width = d_C.stride = C.width; d_C.height = C.height;

size = C.width * C.height * sizeof(float);

cudaMalloc(&d_C.elements, size);

// Invoke kernel

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

// Read C from device memory

cudaMemcpy(C.elements, d_C.elements, size,

cudaMemcpyDeviceToHost);

// Free device memory

cudaFree(d_A.elements);

cudaFree(d_B.elements);

cudaFree(d_C.elements);

}

// Matrix multiplication kernel called by MatMul()

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)

{

// Block row and column

int blockRow = blockIdx.y;

int blockCol = blockIdx.x;

// Each thread block computes one sub-matrix Csub of C

Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

// Each thread computes one element of Csub

// by accumulating results into Cvalue

float Cvalue = 0;

// Thread row and column within Csub

int row = threadIdx.y;

int col = threadIdx.x;

// Loop over all the sub-matrices of A and B that are

// required to compute Csub

// Multiply each pair of sub-matrices together

// and accumulate the results

for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {

// Get sub-matrix Asub of A

Matrix Asub = GetSubMatrix(A, blockRow, m);

// Get sub-matrix Bsub of B

Matrix Bsub = GetSubMatrix(B, m, blockCol);

// Shared memory used to store Asub and Bsub respectively

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load Asub and Bsub from device memory to shared memory

// Each thread loads one element of each sub-matrix

As[row][col] = GetElement(Asub, row, col);

Bs[row][col] = GetElement(Bsub, row, col);

// Synchronize to make sure the sub-matrices are loaded

// before starting the computation

__syncthreads();

// Multiply Asub and Bsub together

for (int e = 0; e < BLOCK_SIZE; ++e)

Cvalue += As[row][e] * Bs[e][col];

// Synchronize to make sure that the preceding

// computation is done before loading two new

// sub-matrices of A and B in the next iteration

__syncthreads();

}

// Write Csub to device memory

// Each thread writes one element

SetElement(Csub, row, col, Cvalue);

}

'Programming > openCL & CUDA' 카테고리의 다른 글

cuda 5 preview (0)	2012.06.02
nvidia ion cuda core와 h.264 library (0)	2012.05.22
Interoperability (상호운용성) (0)	2012.05.04
cuda 내장변수 (0)	2012.04.30
kernel block 과 thread (0)	2012.04.26

Posted by 구차니

구차니의 잡동사니 모음

CUDA API 메모리 종류

'Programming > openCL & CUDA' 카테고리의 다른 글

카테고리

공지사항

태그목록

최근에 올라온 글

최근에 달린 댓글

티스토리툴바