'2021/06/30'에 해당되는 글 7건

  1. 2021.06.30 aarch, armv8 asimd build (neon)
  2. 2021.06.30 rpi 4b 32bit vs 64bit?
  3. 2021.06.30 tensorflow lite on rpi4 64bit
  4. 2021.06.30 tensorflow delegate
  5. 2021.06.30 rpi 4 32bit / 64bit cpuinfo
  6. 2021.06.30 AArch64 linux cpu features
  7. 2021.06.30 gcc vectorized loop
embeded/raspberry pi · 2021. 6. 30. 18:25

 

5.7.2 Advanced SIMD Mnemonics. Although derived from the AArch32 Advanced SIMD syntax, a number of changes have been made to harmonise with the AArch64 core integer and floating point instruction set syntax, and to unify AArch32’s divergent “architectural” and “programmers’” notations: • The ‘V’ mnemonic prefix has been removed, and S/U/F/P added to indicate signed/unsigned/floating-point/polynomial data type. The mnemonic always indicates the data type(s) of the operation. • The vector organisation (element size and number of lanes) is described by the register qualifiers and never by a mnemonic qualifier. See the description of the vector register syntax in §4.4.2 above. • The ‘P’ prefix for “pairwise” operations becomes a suffix. • A ‘V’ suffix has been added for the new reduction (across-all-lanes) operations. • A ‘2’ suffix has been added for the new widening/narrowing “second part” instructions, described below. • Vector compares now use the integer condition code names to indicate whether an integer comparison is signed or unsigned (e.g. CMLT, CMLO, CMGE, CMHI, etc). • Some mnemonics have been renamed where the removal of the V prefix caused clash with the core instruction set mnemonics.

ADD Vd.<T>, Vn.<T>, Vm.<T> 
Integer add (vector). Where <T> is 8B, 16B, 4H, 8H, 2S, 4S or 2D

[링크 : https://www.element14.com/community/servlet/JiveServlet/previewBody/41836-102-1-229511/ARM.Reference_Manual.pdf]

 

$ gcc neon.c -fopt-info-vec -O3
neon.c:10:9: note: loop vectorized

 

vadd 이런게 안보이네? (AArch64에서는 'V' 접두어가 빠져서, 아래 덤프의 add v0.4s, v0.4s, v1.4s 처럼 나온다)

$ objdump -d a.out

a.out:     file format elf64-littleaarch64


Disassembly of section .init:

00000000000005d0 <_init>:
 5d0:   a9bf7bfd        stp     x29, x30, [sp, #-16]!
 5d4:   910003fd        mov     x29, sp
 5d8:   94000043        bl      6e4 <call_weak_fn>
 5dc:   a8c17bfd        ldp     x29, x30, [sp], #16
 5e0:   d65f03c0        ret

Disassembly of section .plt:

00000000000005f0 <.plt>:
 5f0:   a9bf7bf0        stp     x16, x30, [sp, #-16]!
 5f4:   90000090        adrp    x16, 10000 <__FRAME_END__+0xf680>
 5f8:   f947fe11        ldr     x17, [x16, #4088]
 5fc:   913fe210        add     x16, x16, #0xff8
 600:   d61f0220        br      x17
 604:   d503201f        nop
 608:   d503201f        nop
 60c:   d503201f        nop

0000000000000610 <__cxa_finalize@plt>:
 610:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 614:   f9400211        ldr     x17, [x16]
 618:   91000210        add     x16, x16, #0x0
 61c:   d61f0220        br      x17

0000000000000620 <__libc_start_main@plt>:
 620:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 624:   f9400611        ldr     x17, [x16, #8]
 628:   91002210        add     x16, x16, #0x8
 62c:   d61f0220        br      x17

0000000000000630 <__gmon_start__@plt>:
 630:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 634:   f9400a11        ldr     x17, [x16, #16]
 638:   91004210        add     x16, x16, #0x10
 63c:   d61f0220        br      x17

0000000000000640 <abort@plt>:
 640:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 644:   f9400e11        ldr     x17, [x16, #24]
 648:   91006210        add     x16, x16, #0x18
 64c:   d61f0220        br      x17

0000000000000650 <printf@plt>:
 650:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 654:   f9401211        ldr     x17, [x16, #32]
 658:   91008210        add     x16, x16, #0x20
 65c:   d61f0220        br      x17

Disassembly of section .text:

0000000000000660 <main>:
 660:   d13003ff        sub     sp, sp, #0xc00
 664:   912003e0        add     x0, sp, #0x800
 668:   911003e2        add     x2, sp, #0x400
 66c:   910003e1        mov     x1, sp
 670:   913003e3        add     x3, sp, #0xc00
 674:   d503201f        nop
 678:   3cc10401        ldr     q1, [x0], #16
 67c:   3cc10440        ldr     q0, [x2], #16
 680:   eb03001f        cmp     x0, x3
 684:   4ea18400        add     v0.4s, v0.4s, v1.4s
 688:   3c810420        str     q0, [x1], #16
 68c:   54ffff61        b.ne    678 <main+0x18>  // b.any
 690:   b94003e1        ldr     w1, [sp]
 694:   90000000        adrp    x0, 0 <_init-0x5d0>
 698:   b94403e2        ldr     w2, [sp, #1024]
 69c:   91214000        add     x0, x0, #0x850
 6a0:   b94803e3        ldr     w3, [sp, #2048]
 6a4:   913003ff        add     sp, sp, #0xc00
 6a8:   17ffffea        b       650 <printf@plt>

00000000000006ac <_start>:
 6ac:   d280001d        mov     x29, #0x0                       // #0
 6b0:   d280001e        mov     x30, #0x0                       // #0
 6b4:   aa0003e5        mov     x5, x0
 6b8:   f94003e1        ldr     x1, [sp]
 6bc:   910023e2        add     x2, sp, #0x8
 6c0:   910003e6        mov     x6, sp
 6c4:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 6c8:   f947ec00        ldr     x0, [x0, #4056]
 6cc:   90000083        adrp    x3, 10000 <__FRAME_END__+0xf680>
 6d0:   f947e863        ldr     x3, [x3, #4048]
 6d4:   90000084        adrp    x4, 10000 <__FRAME_END__+0xf680>
 6d8:   f947d884        ldr     x4, [x4, #4016]
 6dc:   97ffffd1        bl      620 <__libc_start_main@plt>
 6e0:   97ffffd8        bl      640 <abort@plt>

00000000000006e4 <call_weak_fn>:
 6e4:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 6e8:   f947e400        ldr     x0, [x0, #4040]
 6ec:   b4000040        cbz     x0, 6f4 <call_weak_fn+0x10>
 6f0:   17ffffd0        b       630 <__gmon_start__@plt>
 6f4:   d65f03c0        ret

00000000000006f8 <deregister_tm_clones>:
 6f8:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 6fc:   9100e000        add     x0, x0, #0x38
 700:   b0000081        adrp    x1, 11000 <__cxa_finalize@GLIBC_2.17>
 704:   9100e021        add     x1, x1, #0x38
 708:   eb00003f        cmp     x1, x0
 70c:   540000a0        b.eq    720 <deregister_tm_clones+0x28>  // b.none
 710:   90000081        adrp    x1, 10000 <__FRAME_END__+0xf680>
 714:   f947dc21        ldr     x1, [x1, #4024]
 718:   b4000041        cbz     x1, 720 <deregister_tm_clones+0x28>
 71c:   d61f0020        br      x1
 720:   d65f03c0        ret
 724:   d503201f        nop

0000000000000728 <register_tm_clones>:
 728:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 72c:   9100e000        add     x0, x0, #0x38
 730:   b0000081        adrp    x1, 11000 <__cxa_finalize@GLIBC_2.17>
 734:   9100e021        add     x1, x1, #0x38
 738:   cb000021        sub     x1, x1, x0
 73c:   9343fc21        asr     x1, x1, #3
 740:   8b41fc21        add     x1, x1, x1, lsr #63
 744:   9341fc21        asr     x1, x1, #1
 748:   b40000a1        cbz     x1, 75c <register_tm_clones+0x34>
 74c:   90000082        adrp    x2, 10000 <__FRAME_END__+0xf680>
 750:   f947f042        ldr     x2, [x2, #4064]
 754:   b4000042        cbz     x2, 75c <register_tm_clones+0x34>
 758:   d61f0040        br      x2
 75c:   d65f03c0        ret

0000000000000760 <__do_global_dtors_aux>:
 760:   a9be7bfd        stp     x29, x30, [sp, #-32]!
 764:   910003fd        mov     x29, sp
 768:   f9000bf3        str     x19, [sp, #16]
 76c:   b0000093        adrp    x19, 11000 <__cxa_finalize@GLIBC_2.17>
 770:   3940e260        ldrb    w0, [x19, #56]
 774:   35000140        cbnz    w0, 79c <__do_global_dtors_aux+0x3c>
 778:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 77c:   f947e000        ldr     x0, [x0, #4032]
 780:   b4000080        cbz     x0, 790 <__do_global_dtors_aux+0x30>
 784:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 788:   f9401800        ldr     x0, [x0, #48]
 78c:   97ffffa1        bl      610 <__cxa_finalize@plt>
 790:   97ffffda        bl      6f8 <deregister_tm_clones>
 794:   52800020        mov     w0, #0x1                        // #1
 798:   3900e260        strb    w0, [x19, #56]
 79c:   f9400bf3        ldr     x19, [sp, #16]
 7a0:   a8c27bfd        ldp     x29, x30, [sp], #32
 7a4:   d65f03c0        ret

00000000000007a8 <frame_dummy>:
 7a8:   17ffffe0        b       728 <register_tm_clones>
 7ac:   d503201f        nop

00000000000007b0 <__libc_csu_init>:
 7b0:   a9bc7bfd        stp     x29, x30, [sp, #-64]!
 7b4:   910003fd        mov     x29, sp
 7b8:   a90153f3        stp     x19, x20, [sp, #16]
 7bc:   90000094        adrp    x20, 10000 <__FRAME_END__+0xf680>
 7c0:   91370294        add     x20, x20, #0xdc0
 7c4:   a9025bf5        stp     x21, x22, [sp, #32]
 7c8:   90000095        adrp    x21, 10000 <__FRAME_END__+0xf680>
 7cc:   9136e2b5        add     x21, x21, #0xdb8
 7d0:   cb150294        sub     x20, x20, x21
 7d4:   2a0003f6        mov     w22, w0
 7d8:   a90363f7        stp     x23, x24, [sp, #48]
 7dc:   aa0103f7        mov     x23, x1
 7e0:   aa0203f8        mov     x24, x2
 7e4:   9343fe94        asr     x20, x20, #3
 7e8:   97ffff7a        bl      5d0 <_init>
 7ec:   b4000174        cbz     x20, 818 <__libc_csu_init+0x68>
 7f0:   d2800013        mov     x19, #0x0                       // #0
 7f4:   d503201f        nop
 7f8:   f8737aa3        ldr     x3, [x21, x19, lsl #3]
 7fc:   aa1803e2        mov     x2, x24
 800:   91000673        add     x19, x19, #0x1
 804:   aa1703e1        mov     x1, x23
 808:   2a1603e0        mov     w0, w22
 80c:   d63f0060        blr     x3
 810:   eb13029f        cmp     x20, x19
 814:   54ffff21        b.ne    7f8 <__libc_csu_init+0x48>  // b.any
 818:   a94153f3        ldp     x19, x20, [sp, #16]
 81c:   a9425bf5        ldp     x21, x22, [sp, #32]
 820:   a94363f7        ldp     x23, x24, [sp, #48]
 824:   a8c47bfd        ldp     x29, x30, [sp], #64
 828:   d65f03c0        ret
 82c:   d503201f        nop

0000000000000830 <__libc_csu_fini>:
 830:   d65f03c0        ret

Disassembly of section .fini:

0000000000000834 <_fini>:
 834:   a9bf7bfd        stp     x29, x30, [sp, #-16]!
 838:   910003fd        mov     x29, sp
 83c:   a8c17bfd        ldp     x29, x30, [sp], #16
 840:   d65f03c0        ret

'embeded > raspberry pi' 카테고리의 다른 글

rpi pico c  (0) 2021.07.07
rpi pico USB 키보드 코드 수정  (0) 2021.07.01
aarch, armv8 asimd build (neon)  (0) 2021.06.30
rpi 4b 32bit vs 64bit?  (0) 2021.06.30
rpi 4 32bit / 64bit cpuinfo  (0) 2021.06.30
AArch64 linux cpu features  (0) 2021.06.30
Posted by 구차니

댓글을 달아 주세요

embeded/raspberry pi · 2021. 6. 30. 18:12

라고 적기에는 좀 애매한데

아무튼 NEON 가속을 통해 동일한 프로그램을 돌리는데

32bit 에서는 초당 7.5 프레임

64bit 에서는 초당 11 프레임까지 연산이 가능해지는게 신기해서 찾아보는 중

 

 

다른분에게 여쭤보니

메모리 대역폭이 늘어나거나 neon bit width 영향이 아닐까 라고 말씀하셔서 찾아보는데

보기에는 어차피 float 변수라 single precision으로 차이가 없고

int 형이 연산에 시간을 많이 빼앗길 부분이라 up to 16x8bit operations per instruction 이라

속도에 영향을 주는건 그럼.. 전송 대역폭 혹은 메모리 복사에 따른 속도 차이 정도 이려나?

[링크 : https://community.arm.com/developer/tools-software/oss-platforms/b/android-blog/posts/arm-neon-programming-quick-reference]

 

 

[링크 : https://developer.arm.com/documentation/dht0002/a/Introducing-NEON/NEON-architecture-overview/NEON-registers]

'embeded > raspberry pi' 카테고리의 다른 글

rpi pico USB 키보드 코드 수정  (0) 2021.07.01
aarch, armv8 asimd build (neon)  (0) 2021.06.30
rpi 4b 32bit vs 64bit?  (0) 2021.06.30
rpi 4 32bit / 64bit cpuinfo  (0) 2021.06.30
AArch64 linux cpu features  (0) 2021.06.30
citcuitpyrhon joystick  (0) 2021.06.28
Posted by 구차니

댓글을 달아 주세요

build_rpi_lib.sh를 통해 빌드 하려는데 에러 뿜뿜 -_-

그래서 makefile을 고쳐야 하나 멘붕이 와서 다시 찾아 보는데

/bin/bash: arm-linux-gnueabihf-g++: command not found

 

./tensorflow/lite/tools/make/build_aarch64_lib.sh

[링크 : https://www.tensorflow.org/lite/guide/build_arm64?hl=ko]

'프로그램 사용 > yolo_tensorflow_golo' 카테고리의 다른 글

tensorflow cross compile  (0) 2021.07.01
tensorflow lite on rpi4 64bit  (0) 2021.06.30
tensorflow delegate  (0) 2021.06.30
tflite typed_tensor(), tensor()  (0) 2021.06.25
tflite yolov4  (0) 2021.06.14
tensorflow lite yolov4  (0) 2021.06.10
Posted by 구차니

댓글을 달아 주세요

전에 찾아볼때 대리자 이런게 있길래 그냥 넘겼는데

그래프 실행의 "일부" 또는 "전체"를 다른 실행자에게 넘기는 방법

오.. 그러면 전체 연산중에 일부만을 넘길수 있다면.. delegate를 이용해 분산 처리도 가능할지도?

 

TensorFlow Lite 대리자는 그래프 실행의 일부 또는 전체를 다른 executor에 위임하는 방법입니다.

 

NPU도 지원이 가능하다는데

  • 최신 Android 기기용 NNAPI 대리자 - NNAPI 대리자를 사용하여 GPU, DSP 및/또는 NPU를 사용할 수 있는 Android 기기에서 모델을 가속화할 수 있습니다. Android 8.1(API 27+) 이상에서 사용할 수 있습니다. NNAPI 대리자 개요, 단계별 지침 및 모범 사례는 TensorFlow Lite NNAPI 대리자를 참조하세요.

[링크 : https://www.tensorflow.org/lite/performance/delegates?hl=ko]

 

cpp 로는 안되나.. android 8.1 이상 기기에서 쓸 수 있다고만 나오네

Android Neural Networks API(NNAPI)는 Android 8.1(API 레벨 27) 이상을 실행하는 모든 Android 기기에서 사용할 수 있습니다. 다음과 같은 하드웨어 가속기를 지원하는 Android 기기에서 TensorFlow Lite 모델의 속도를 향상합니다.
  • 그래픽 처리 장치(GPU)
  • 디지털 신호 프로세서(DSP)
  • 신경 처리 장치(NPU)

[링크 : https://www.tensorflow.org/lite/performance/nnapi?hl=ko]

'프로그램 사용 > yolo_tensorflow_golo' 카테고리의 다른 글

tensorflow cross compile  (0) 2021.07.01
tensorflow lite on rpi4 64bit  (0) 2021.06.30
tensorflow delegate  (0) 2021.06.30
tflite typed_tensor(), tensor()  (0) 2021.06.25
tflite yolov4  (0) 2021.06.14
tensorflow lite yolov4  (0) 2021.06.10
Posted by 구차니

댓글을 달아 주세요

embeded/raspberry pi · 2021. 6. 30. 16:30

32bit kernel info

$ cat /proc/cpuinfo
processor       : 3
model name      : ARMv7 Processor rev 3 (v7l)
BogoMIPS        : 108.00
Features        : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae evtstrm crc32
CPU implementer : 0x41
CPU architecture: 7
CPU variant     : 0x0
CPU part        : 0xd08
CPU revision    : 3

Hardware        : BCM2711
Revision        : c03112
Serial          : 100000003e728750
Model           : Raspberry Pi 4 Model B Rev 1.2

$ uname -a
Linux raspberrypi 5.10.17-v7l+ #1403 SMP Mon Feb 22 11:33:35 GMT 2021 armv7l GNU/Linux

 

64bit kernel info

$ cat /proc/cpuinfo
processor       : 3
BogoMIPS        : 108.00
Features        : fp asimd evtstrm crc32 cpuid
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x0
CPU part        : 0xd08
CPU revision    : 3

Hardware        : BCM2835
Revision        : c03112
Serial          : 100000003e728750
Model           : Raspberry Pi 4 Model B Rev 1.2

$ uname -a
Linux raspberrypi 5.10.17-v8+ #1403 SMP PREEMPT Mon Feb 22 11:37:54 GMT 2021 aarch64 GNU/Linux

'embeded > raspberry pi' 카테고리의 다른 글

aarch, armv8 asimd build (neon)  (0) 2021.06.30
rpi 4b 32bit vs 64bit?  (0) 2021.06.30
rpi 4 32bit / 64bit cpuinfo  (0) 2021.06.30
AArch64 linux cpu features  (0) 2021.06.30
citcuitpyrhon joystick  (0) 2021.06.28
rpi pico, circuitpython, USB HID  (0) 2021.06.28
Posted by 구차니

댓글을 달아 주세요

embeded/raspberry pi · 2021. 6. 30. 16:23

cpuinfo 플래그가 희한하게 떠서 찾아보니 aarch64 에서는 명칭들이 죄다 바뀐 듯

asimd는 Advanced SIMD 줄여서 표현한건데 기존의 neon을 대체하고

fp는 vfp이런걸 전부 다 대체하는건가?

Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32 cpuid

 

[링크 : https://unix.stackexchange.com/questions/43539/what-do-the-flags-in-proc-cpuinfo-mean]

[링크 : https://stackoverflow.com/questions/59379848/]

 

[링크 : http://javathunderx.blogspot.com/2018/11/cheat-sheet-for-cpuinfo-features-on.html]

'embeded > raspberry pi' 카테고리의 다른 글

rpi 4b 32bit vs 64bit?  (0) 2021.06.30
rpi 4 32bit / 64bit cpuinfo  (0) 2021.06.30
AArch64 linux cpu features  (0) 2021.06.30
citcuitpyrhon joystick  (0) 2021.06.28
rpi pico, circuitpython, USB HID  (0) 2021.06.28
rpi pico pinout  (0) 2021.06.28
Posted by 구차니

댓글을 달아 주세요

프로그램 사용/gcc · 2021. 6. 30. 11:43

-O3 하면 자동으로 -ftree-vectorize가 추가되었다고.

아무튼 연산만 하고 출력을 안하니 사용하지 않는 코드로 해서 vadd가 안나와서 한참을 헤맸네..

 

$ g++ -O3 -mavx autovector.cpp -fopt-info-vec-all
autovector.cpp:22:22: missed: couldn't vectorize loop
autovector.cpp:25:19: missed: not vectorized: complicated access pattern.
autovector.cpp:23:21: missed: couldn't vectorize loop
autovector.cpp:25:14: missed: not vectorized: complicated access pattern.
autovector.cpp:16:23: optimized: loop vectorized using 32 byte vectors
autovector.cpp:10:5: note: vectorized 1 loops in function.
autovector.cpp:15:43: missed: statement clobbers memory: now = std::chrono::_V2::system_clock::now ();
autovector.cpp:27:77: missed: statement clobbers memory: D.189348 = std::chrono::_V2::system_clock::now ();
autovector.cpp:28:2: missed: statement clobbers memory: __assert_fail ("result[2] == ( 2.0f + 0.1335f)+( 1.50f*2.0f + 0.9383f)-(0.33f*2.0f+0.1172f)+3*(float)(noTests-1)", "autovector.cpp", 28, "int main()");
/usr/include/c++/9/ostream:570:18: missed: statement clobbers memory: std::__ostream_insert<char, std::char_traits<char> > (&cout, "CG> message -channel \"exercise results\" Time used: ", 51);
/usr/include/c++/9/ostream:221:29: missed: statement clobbers memory: _46 = std::basic_ostream<char>::_M_insert<double> (&cout, _42);
/usr/include/c++/9/ostream:570:18: missed: statement clobbers memory: std::__ostream_insert<char, std::char_traits<char> > (_46, "s, N * noTests=", 15);
autovector.cpp:29:112: missed: statement clobbers memory: _35 = std::basic_ostream<char>::operator<< (_46, 2000000000);
/usr/include/c++/9/ostream:113:13: missed: statement clobbers memory: std::endl<char, std::char_traits<char> > (_35);
/usr/include/c++/9/iostream:74:25: missed: statement clobbers memory: std::ios_base::Init::Init (&__ioinit);
/usr/include/c++/9/iostream:74:25: missed: statement clobbers memory: __cxa_atexit (__dt_comp , &__ioinit, &__dso_handle);
$ gcc -mcpu=native -march=native -Q --help=target
The following options are target specific:
  -mabi=                                aapcs-linux
  -mabort-on-noreturn                   [disabled]
  -mandroid                             [disabled]
  -mapcs                                [disabled]
  -mapcs-frame                          [disabled]
  -mapcs-reentrant                      [disabled]
  -mapcs-stack-check                    [disabled]
  -march=                               armv7ve+vfpv3-d16
  -marm                                 [enabled]
  -masm-syntax-unified                  [disabled]
  -mbe32                                [enabled]
  -mbe8                                 [disabled]
  -mbig-endian                          [disabled]
  -mbionic                              [disabled]
  -mbranch-cost=                        -1
  -mcallee-super-interworking           [disabled]
  -mcaller-super-interworking           [disabled]
  -mcmse                                [disabled]
  -mcpu=                                cortex-a7
  -mfix-cortex-m3-ldrd                  [disabled]
  -mflip-thumb                          [disabled]
  -mfloat-abi=                          hard
  -mfp16-format=                        none
  -mfpu=                                vfp
  -mglibc                               [enabled]
  -mhard-float
  -mlittle-endian                       [enabled]
  -mlong-calls                          [disabled]
  -mmusl                                [disabled]
  -mneon-for-64bits                     [disabled]
  -mpic-data-is-text-relative           [enabled]
  -mpic-register=
  -mpoke-function-name                  [disabled]
  -mprint-tune-info                     [disabled]
  -mpure-code                           [disabled]
  -mrestrict-it                         [disabled]
  -msched-prolog                        [enabled]
  -msingle-pic-base                     [disabled]
  -mslow-flash-data                     [disabled]
  -msoft-float
  -mstructure-size-boundary=            8
  -mthumb                               [disabled]
  -mthumb-interwork                     [disabled]
  -mtls-dialect=                        gnu
  -mtp=                                 cp15
  -mtpcs-frame                          [disabled]
  -mtpcs-leaf-frame                     [disabled]
  -mtune=
  -muclibc                              [disabled]
  -munaligned-access                    [enabled]
  -mvectorize-with-neon-double          [disabled]
  -mvectorize-with-neon-quad            [enabled]
  -mword-relocations                    [disabled]

  Known ARM ABIs (for use with the -mabi= option):
    aapcs aapcs-linux apcs-gnu atpcs iwmmxt

  Known __fp16 formats (for use with the -mfp16-format= option):
    alternative ieee none

  Known ARM FPUs (for use with the -mfpu= option):
    auto crypto-neon-fp-armv8 fp-armv8 fpv4-sp-d16 fpv5-d16 fpv5-sp-d16 neon neon-fp-armv8 neon-fp16 neon-vfpv3 neon-vfpv4 vfp vfp3 vfpv2 vfpv3 vfpv3-d16 vfpv3-d16-fp16 vfpv3-fp16 vfpv3xd
    vfpv3xd-fp16 vfpv4 vfpv4-d16

  Valid arguments to -mtp=:
    auto cp15 soft

  Known floating-point ABIs (for use with the -mfloat-abi= option):
    hard soft softfp

  TLS dialect to use:
    gnu gnu2


[링크 : https://www.raspberrypi.org/forums/viewtopic.php?t=155461]

[링크 : https://www.codingame.com/playgrounds/283/sse-avx-vectorization/autovectorization]

 

+

$ cat neon.c
#include <stdio.h>

void main()     /* Adds b[] and c[] element-wise into a[], then prints element 0 of each array. */
{               /* NOTE(review): `void main()` is non-standard; ISO C specifies `int main(void)`. */
        int a[256];     /* destination of the element-wise sum */
        int b[256];     /* NOTE(review): never written before the loop reads it */
        int c[256];     /* NOTE(review): never written before the loop reads it */

        int i;
        for(i = 0; i < 256; i++)        /* fixed trip count of 256, one int add per iteration */
        {
                a[i] = b[i] + c[i];
        }

        printf("%d %d %d\n", a[0], b[0], c[0]); /* uses the loop result; presumably keeps the compiler from discarding the loop -- confirm */
}

 

$ gcc -O3 neon.c -mfpu=neon
$ objdump -d a.out  | grep v
   10320:       e1a01000        mov     r1, r0
   10328:       e1a0300d        mov     r3, sp
   1032c:       f4610add        vld1.64 {d16-d17}, [r1 :64]!
   10330:       f4622add        vld1.64 {d18-d19}, [r2 :64]!
   10334:       f26008e2        vadd.i32        q8, q8, q9
   10338:       f4430add        vst1.64 {d16-d17}, [r3 :64]!
   10360:       e3a0b000        mov     fp, #0
   10364:       e3a0e000        mov     lr, #0
   1036c:       e1a0200d        mov     r2, sp
   1043c:       e3a03001        mov     r3, #1
   10454:       e1a07000        mov     r7, r0
   1046c:       e1a08001        mov     r8, r1
   10470:       e1a09002        mov     r9, r2
   10480:       e3a04000        mov     r4, #0
   1048c:       e1a02009        mov     r2, r9
   10490:       e1a01008        mov     r1, r8
   10494:       e1a00007        mov     r0, r7

 

$ gcc neon.c -mfpu=neon
$ objdump -d a.out  | grep v
   10318:       e3a0b000        mov     fp, #0
   1031c:       e3a0e000        mov     lr, #0
   10324:       e1a0200d        mov     r2, sp
   103f4:       e3a03001        mov     r3, #1
   10418:       e3a03000        mov     r3, #0
   10490:       e1a00000        nop                     ; (mov r0, r0)
   104a4:       e1a07000        mov     r7, r0
   104bc:       e1a08001        mov     r8, r1
   104c0:       e1a09002        mov     r9, r2
   104d0:       e3a04000        mov     r4, #0
   104dc:       e1a02009        mov     r2, r9
   104e0:       e1a01008        mov     r1, r8
   104e4:       e1a00007        mov     r0, r7

 

-fopt-info-vec-all 추가. -all 때문인지 어마어마하게 나오네

-fopt-info-vec 으로만 하니 깔끔하게 vectorized 라고 뜬다.

$ gcc neon.c -mfpu=neon -fopt-info-vec -O3
neon.c:10:2: note: loop vectorized

'프로그램 사용 > gcc' 카테고리의 다른 글

gcc unsigned to signed upcast 테스트  (0) 2021.07.08
gcc vectorized loop  (0) 2021.06.30
gcc unsigned to signed cast  (0) 2021.06.22
gcc %p (nil)  (0) 2021.05.07
gcc -D 옵션 인자를 printf로 출력하기  (0) 2021.04.08
Auto-vectorization in GCC  (0) 2021.03.25
Posted by 구차니

댓글을 달아 주세요