Programming/openCL & CUDA — 2014. 1. 14. 19:53
visual studio 2008 / cuda 5.5 / GTX650 기준 작성

    0 a:26326 b:14567 c:40893 == r:40893
    1 a:20769 b:29469 c:50238 == r:50238
    2 a:19293 b:19828 c:39121 == r:39121
    3 a: 5720 b:16164 c:21884 == r:21884
    4 a:10116 b:16010 c:26126 == r:26126
    5 a:24503 b: 1380 c:25883 == r:25883
    6 a: 1261 b:20500 c:21761 == r:21761
    7 a:32527 b:14265 c:46792 == r:46792
    8 a: 6165 b: 1639 c: 7804 == r: 7804
    9 a:16881 b: 7619 c:24500 == r:24500
   10 a:14636 b: 3016 c:17652 == r:17652
   11 a:20766 b: 1675 c:22441 == r:22441
   12 a:24356 b: 3886 c:28242 == r:28242
   13 a: 9279 b:15721 c:25000 == r:25000
   14 a:20744 b:   74 c:20818 == r:20818
   15 a:24023 b:17957 c:41980 == r:41980
   16 a:  399 b:19653 c:20052 == r:20052
   17 a: 9077 b: 9308 c:18385 == r:18385
   18 a:18673 b:  713 c:19386 == r:19386
   19 a:17966 b:12837 c:30803 == r:30803
   20 a:28921 b:31938 c:60859 == r:60859
   21 a:20298 b:18933 c:39231 == r:39231
   22 a:18267 b:31334 c:49601 == r:49601
   23 a:17726 b:18368 c:36094 == r:36094
   24 a:10825 b:19187 c:30012 == r:30012
   25 a:15579 b: 9569 c:25148 == r:25148
   26 a:17217 b:27831 c:45048 == r:45048
   27 a: 2756 b:13884 c:16640 == r:16640
   28 a:25641 b:17878 c:43519 == r:43519
   29 a:10533 b:17954 c:28487 == r:28487
   30 a:15005 b:23112 c:38117 == r:38117
   31 a: 9634 b: 8053 c:17687 == r:17687 


#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_runtime.h"

// Element count of each host/device array; main() also uses it as the
// number of threads in the single block it launches.
#define ARRAY_SIZE	32

// Element-wise integer addition: each thread writes c[i] = a[i] + b[i],
// where i is the thread's x index inside its block.
//
// a, b : input arrays (read only by this kernel)
// c    : output array
//
// The index ignores blockIdx and there is no bounds check, so the caller
// must launch no more threads in x than the arrays have elements.
__global__ void kernel_test(int *a, int *b, int *c)
{
	const int tid = threadIdx.x;
	const int sum = a[tid] + b[tid];
	c[tid] = sum;
}

// Prints a diagnostic and terminates the process when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step.
static void check_cuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Host driver for the vector-add example.
//
// Fills two host arrays with random values, computes the expected sums on
// the CPU, runs kernel_test on the GPU with one block of ARRAY_SIZE threads,
// copies the result back and prints one comparison line per element.
//
// Returns EXIT_SUCCESS when every GPU result matches the CPU reference,
// EXIT_FAILURE otherwise (or when any CUDA call fails).
int main()
{
	int a[ARRAY_SIZE],b[ARRAY_SIZE],c[ARRAY_SIZE],res[ARRAY_SIZE];
	int *dev_a = NULL,*dev_b = NULL,*dev_c = NULL;
	int idx = 0;
	int mismatches = 0;
	const size_t bytes = ARRAY_SIZE * sizeof(int);
	dim3 block(1);
	dim3 thread(ARRAY_SIZE);

	// initialize
	srand ((unsigned int)time(NULL));
	for(idx = 0;idx < ARRAY_SIZE ; idx++)
	{
		// RAND_MAX is implementation-defined and may be as large as INT_MAX;
		// keeping each operand to 15 bits guarantees the sum cannot overflow
		// a signed int and still fits the %5d columns printed below.
		a[idx] = rand() & 0x7FFF;
		b[idx] = rand() & 0x7FFF;
		res[idx] = a[idx] + b[idx];
		c[idx] = 0;
	}

	check_cuda(cudaMalloc(&dev_a, bytes), "cudaMalloc(dev_a)");
	check_cuda(cudaMalloc(&dev_b, bytes), "cudaMalloc(dev_b)");
	check_cuda(cudaMalloc(&dev_c, bytes), "cudaMalloc(dev_c)");

	check_cuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)");
	check_cuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

	kernel_test<<<block,thread>>>(dev_a,dev_b,dev_c);
	// A launch does not return a status itself: configuration errors are
	// reported by cudaGetLastError(), execution errors by the next sync.
	check_cuda(cudaGetLastError(), "kernel_test launch");
	check_cuda(cudaDeviceSynchronize(), "kernel_test execution");

	check_cuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");

	for(idx = 0;idx < ARRAY_SIZE ; idx++)
	{
		printf("%5d a:%5d b:%5d c:%5d", idx, a[idx], b[idx], c[idx]);
		if(res[idx] != c[idx])
		{
			printf(" != ");
			mismatches++;
		}
		else	printf(" == ");
		printf("r:%5d\n",res[idx]);
	}

	check_cuda(cudaFree(dev_a), "cudaFree(dev_a)");
	check_cuda(cudaFree(dev_b), "cudaFree(dev_b)");
	check_cuda(cudaFree(dev_c), "cudaFree(dev_c)");

	return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}


// 2-D variant: 64 elements, matching the 8 x 8 thread block configured in
// the main() excerpt below.
#define ARRAY_SIZE	64

// Element-wise integer addition for a 2-D thread block: each thread
// flattens its (x, y) position inside the block into a row-major index i
// and writes c[i] = a[i] + b[i].
//
// a, b : input arrays (read only by this kernel)
// c    : output array
//
// The row stride is taken from blockDim.x instead of a hard-coded 8, so the
// index stays correct for any block width; for an 8 x 8 launch it yields
// exactly the same indices as before. The index ignores blockIdx and there
// is no bounds check, so the arrays must hold at least
// blockDim.x * blockDim.y elements.
__global__ void kernel_test(int *a, int *b, int *c)
{
	int idx = threadIdx.x + threadIdx.y * blockDim.x;
	c[idx] = a[idx] + b[idx];
}

// Host-side excerpt for the 2-D variant: only the launch configuration is
// shown here.
// NOTE(review): allocation, copies, the kernel launch and the result check
// are not present in this excerpt — presumably they are the same as in the
// 1-D example; confirm before reusing.
void main()
{
	dim3 block(1);		// a single block
	dim3 thread(8,8);	// 8 x 8 threads in that block (64 in total)
}
Posted by 구차니

댓글을 달아 주세요