Visual Studio 2008 / CUDA 5.5 / GTX650 기준으로 작성
0 a:26326 b:14567 c:40893 == r:40893
1 a:20769 b:29469 c:50238 == r:50238
2 a:19293 b:19828 c:39121 == r:39121
3 a: 5720 b:16164 c:21884 == r:21884
4 a:10116 b:16010 c:26126 == r:26126
5 a:24503 b: 1380 c:25883 == r:25883
6 a: 1261 b:20500 c:21761 == r:21761
7 a:32527 b:14265 c:46792 == r:46792
8 a: 6165 b: 1639 c: 7804 == r: 7804
9 a:16881 b: 7619 c:24500 == r:24500
10 a:14636 b: 3016 c:17652 == r:17652
11 a:20766 b: 1675 c:22441 == r:22441
12 a:24356 b: 3886 c:28242 == r:28242
13 a: 9279 b:15721 c:25000 == r:25000
14 a:20744 b: 74 c:20818 == r:20818
15 a:24023 b:17957 c:41980 == r:41980
16 a: 399 b:19653 c:20052 == r:20052
17 a: 9077 b: 9308 c:18385 == r:18385
18 a:18673 b: 713 c:19386 == r:19386
19 a:17966 b:12837 c:30803 == r:30803
20 a:28921 b:31938 c:60859 == r:60859
21 a:20298 b:18933 c:39231 == r:39231
22 a:18267 b:31334 c:49601 == r:49601
23 a:17726 b:18368 c:36094 == r:36094
24 a:10825 b:19187 c:30012 == r:30012
25 a:15579 b: 9569 c:25148 == r:25148
26 a:17217 b:27831 c:45048 == r:45048
27 a: 2756 b:13884 c:16640 == r:16640
28 a:25641 b:17878 c:43519 == r:43519
29 a:10533 b:17954 c:28487 == r:28487
30 a:15005 b:23112 c:38117 == r:38117
31 a: 9634 b: 8053 c:17687 == r:17687
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_runtime.h"

#define ARRAY_SIZE 32

/*
 * Evaluate a CUDA runtime call and abort with file/line and the runtime's
 * error string if it did not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Element-wise integer add: c[i] = a[i] + b[i].
 *
 * Launch layout: a single block whose x dimension supplies one thread per
 * element (main() launches <<<1, ARRAY_SIZE>>>). Only threadIdx.x is used,
 * so extra blocks would just repeat the same work.
 *
 * a, b : device pointers to ARRAY_SIZE input ints
 * c    : device pointer to ARRAY_SIZE output ints
 */
__global__ void kernel_test(int *a, int *b, int *c)
{
    int idx = threadIdx.x;

    /* Guard so a launch with more than ARRAY_SIZE threads cannot write
       past the end of the buffers. */
    if (idx < ARRAY_SIZE)
        c[idx] = a[idx] + b[idx];
}

/*
 * Fills two host arrays with random values, adds them on the GPU, and prints
 * each element next to a CPU-computed reference ("==" on match, "!=" on
 * mismatch). Returns EXIT_SUCCESS only when every element matches.
 */
int main()
{
    int a[ARRAY_SIZE], b[ARRAY_SIZE], c[ARRAY_SIZE], res[ARRAY_SIZE];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int idx = 0;
    int mismatches = 0;
    const size_t bytes = ARRAY_SIZE * sizeof(int);

    dim3 block(1);
    dim3 thread(ARRAY_SIZE);

    /* initialize */
    srand((unsigned int)time(NULL));
    for (idx = 0; idx < ARRAY_SIZE; idx++)
    {
        /* The C standard only guarantees RAND_MAX >= 32767; where it is
           larger, the sum of two raw rand() values can overflow int.
           Keeping 15 bits leaves the result unchanged when RAND_MAX is
           32767 and keeps every sum within the 5-digit print columns. */
        a[idx] = rand() & 0x7FFF;
        b[idx] = rand() & 0x7FFF;
        res[idx] = a[idx] + b[idx];
        c[idx] = 0;
    }

    CUDA_CHECK(cudaMalloc(&dev_a, bytes));
    CUDA_CHECK(cudaMalloc(&dev_b, bytes));
    CUDA_CHECK(cudaMalloc(&dev_c, bytes));

    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    kernel_test<<<block, thread>>>(dev_a, dev_b, dev_c);
    /* A launch returns no status itself: fetch launch-configuration errors
       here, then synchronize so execution errors are reported before the
       result is copied back. */
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (idx = 0; idx < ARRAY_SIZE; idx++)
    {
        printf("%5d a:%5d b:%5d c:%5d", idx, a[idx], b[idx], c[idx]);
        if (res[idx] != c[idx])
        {
            printf(" != ");
            mismatches++;
        }
        else
        {
            printf(" == ");
        }
        printf("r:%5d\n", res[idx]);
    }

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));

    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
#define ARRAY_SIZE 64

/*
 * Element-wise integer add for a 2-D thread block: c[i] = a[i] + b[i].
 *
 * Each thread flattens its (x, y) position row-major into one index. The row
 * stride is read from blockDim.x rather than hard-coded, so the index stays
 * correct if the block shape changes; with the dim3 thread(8,8) launch shown
 * in main() it evaluates to the same x + y * 8 as before.
 *
 * a, b : device pointers to ARRAY_SIZE input ints
 * c    : device pointer to ARRAY_SIZE output ints
 */
__global__ void kernel_test(int *a, int *b, int *c)
{
    int idx = threadIdx.x + threadIdx.y * blockDim.x;

    /* Guard so a block with more than ARRAY_SIZE threads cannot write past
       the end of the buffers. */
    if (idx < ARRAY_SIZE)
        c[idx] = a[idx] + b[idx];
}

/*
 * Launch geometry for the 2-D variant: one block of 8 x 8 = 64 threads.
 * NOTE(review): this excerpt shows only the launch dimensions; the
 * allocation, copy, launch and print steps are not part of it and are
 * presumably the same as in the 1-D listing - confirm before building.
 */
int main()
{
    dim3 block(1);
    dim3 thread(8, 8);

    return 0;
}
'Programming > openCL & CUDA' 카테고리의 다른 글
visual studio 2008 nsight 실행..된거 맞나? (0) | 2014.01.15 |
---|---|
nsight 설치... -_- (0) | 2014.01.15 |
cuda 5.5 조금은 더 빠르게 컴파일 하기 (0) | 2014.01.14 |
GTX650 / ion devicequery (4) | 2014.01.13 |
cuda dim3 변수 초기화 (0) | 2014.01.13 |