cuda + openmp 적용 예제

Programming/openCL & CUDA2014. 1. 17. 19:41

cuda + openmp 적용 예제

어제 했던 소스에서 대충 openmp 적용해봄
2014/01/16 - [Programming/openCL / CUDA] - cuda 1차원 데이터, 2차원 처리 예제

CUDA Runtime API - Host - Extra C++ Options 에서 /openmp 를 추가하면 된다.

[링크 : https://devtalk.nvidia.com/.../vs2010-cuda4rc2-howto-compile-openmp-host-code...]

물론 #inclued <omp.h>를 추가하지 않으면
컴파일 시에는 문제가 없으나 실행시에 이런 문제가 발생한다.

[링크 : http://blog.naver.com/changfull7/70110120004]

openmp 적용

cpu Time : 0.048000
gpu Time : 0.001000

openmp 미적용

cpu Time : 0.131000
gpu Time : 0.001000

#include < stdio.h >
#include < stdlib.h >
#include < time.h >

#include < omp.h >

#include "cuda_runtime.h"

#define BLOCK_WID	128
#define THREAD_WID	32
#define ARRAY_SIZE	 (THREAD_WID * THREAD_WID * BLOCK_WID * BLOCK_WID)

__global__ void kernel_test(int *a, int *b, int *c)
{
	int idx = threadIdx.x +blockIdx.x * blockDim.x + (gridDim.x * blockDim.x) * (blockIdx.y * blockDim.y + threadIdx.y);
	c[idx] = a[idx] + b[idx];
}

void main()
{
	clock_t start_time, end_time;
	int *a, *b, *c, *res;
	int *dev_a,*dev_b,*dev_c;
	int idx = 0;
	dim3 block(BLOCK_WID,BLOCK_WID);
	dim3 thread(THREAD_WID,THREAD_WID);

	a = (int *)malloc(ARRAY_SIZE * sizeof(int));
	b = (int *)malloc(ARRAY_SIZE * sizeof(int));
	c = (int *)malloc(ARRAY_SIZE * sizeof(int));
	res = (int *)malloc(ARRAY_SIZE * sizeof(int));

	// initialize
	srand (time(NULL));

#pragma omp parallel for
	for(idx = 0;idx < ARRAY_SIZE ; idx++)
	{
		a[idx] = rand() & 0xFFFF;
		b[idx] = rand() & 0xFFFF;
		c[idx] = 0;
	}

	start_time = clock();

#pragma omp parallel for
	for(idx = 0;idx < ARRAY_SIZE ; idx++)
	{
		res[idx] = a[idx] + b[idx];
	}
	end_time = clock();
	printf("cpu Time : %f\n", ((double)(end_time-start_time)) / CLOCKS_PER_SEC); 

	cudaMalloc(&dev_a, ARRAY_SIZE * sizeof(int));
	cudaMalloc(&dev_b, ARRAY_SIZE * sizeof(int));
	cudaMalloc(&dev_c, ARRAY_SIZE * sizeof(int));

	cudaMemcpy(dev_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b, b, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);

	start_time = clock();
	kernel_test<<<block,thread>>>(dev_a,dev_b,dev_c);
	end_time = clock();
	printf("gpu Time : %f\n", ((double)(end_time-start_time)) / CLOCKS_PER_SEC); 

	cudaMemcpy(c, dev_c, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);

	for(idx = 0;idx < ARRAY_SIZE ; idx++)
	{
		if(res[idx] != c[idx])
		{
			printf("%5d a:%5d b:%5d c:%5d", idx, a[idx], b[idx], c[idx]);
			if(res[idx] != c[idx])
					printf(" != ");
			else	printf(" == ");
			printf("r:%5d\n",res[idx]);
			break;
		}
	}

	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);

	free(a);
	free(b);
	free(c);
	free(res);
}

'Programming > openCL & CUDA' 카테고리의 다른 글

cuda 6.0 rc (0)	2014.02.15
cuda + openmp 프로젝트 생성은.... (0)	2014.01.17
cuda 1차원 데이터, 2차원 처리 예제 (4)	2014.01.16
CUDA 쓰레드 계산 (0)	2014.01.15
visual studio 2008 nsight 실행..된거 맞나? (0)	2014.01.15

Posted by 구차니

구차니의 잡동사니 모음

cuda + openmp 적용 예제

'Programming > openCL & CUDA' 카테고리의 다른 글

카테고리

공지사항

태그목록

최근에 올라온 글

최근에 달린 댓글

티스토리툴바