#include <stdio.h>
#include "arm_neon.h"
void rgb2bgr_with_neon(unsigned char *img, int height, int width)
{
const int total_bytes = height * width * 3;
const int stride_bytes = 48;
for (int i = 0; i < total_bytes; i += stride_bytes)
{
uint8_t *target = img + i;
// swap R and B channel with NEON
uint8x16x3_t a = vld3q_u8(target);
uint8x16x3_t b;
b.val[0] = a.val[2];
b.val[1] = a.val[1];
b.val[2] = a.val[0];
vst3q_u8(target, b);
}
}
void main(){
unsigned char buf[100*100];
rgb2bgr_with_neon(buf, 100, 100);
}
aarch64-linux-gnu-gcc main.c -g -O0
aarch64-linux-gnu-objdump -S -d a.out > a.s
a.out: file format elf64-littleaarch64
Disassembly of section .init:
0000000000400408 <_init>:
400408: a9bf7bfd stp x29, x30, [sp, #-16]!
40040c: 910003fd mov x29, sp
400410: 94000029 bl 4004b4 <call_weak_fn>
400414: a8c17bfd ldp x29, x30, [sp], #16
400418: d65f03c0 ret
Disassembly of section .plt:
0000000000400420 <.plt>:
400420: a9bf7bf0 stp x16, x30, [sp, #-16]!
400424: 90000090 adrp x16, 410000 <__FRAME_END__+0xf6a8>
400428: f947fe11 ldr x17, [x16, #4088]
40042c: 913fe210 add x16, x16, #0xff8
400430: d61f0220 br x17
400434: d503201f nop
400438: d503201f nop
40043c: d503201f nop
0000000000400440 <__libc_start_main@plt>:
400440: b0000090 adrp x16, 411000 <__libc_start_main@GLIBC_2.17>
400444: f9400211 ldr x17, [x16]
400448: 91000210 add x16, x16, #0x0
40044c: d61f0220 br x17
0000000000400450 <__gmon_start__@plt>:
400450: b0000090 adrp x16, 411000 <__libc_start_main@GLIBC_2.17>
400454: f9400611 ldr x17, [x16, #8]
400458: 91002210 add x16, x16, #0x8
40045c: d61f0220 br x17
0000000000400460 <abort@plt>:
400460: b0000090 adrp x16, 411000 <__libc_start_main@GLIBC_2.17>
400464: f9400a11 ldr x17, [x16, #16]
400468: 91004210 add x16, x16, #0x10
40046c: d61f0220 br x17
Disassembly of section .text:
0000000000400470 <_start>:
400470: d280001d mov x29, #0x0 // #0
400474: d280001e mov x30, #0x0 // #0
400478: aa0003e5 mov x5, x0
40047c: f94003e1 ldr x1, [sp]
400480: 910023e2 add x2, sp, #0x8
400484: 910003e6 mov x6, sp
400488: 90000000 adrp x0, 400000 <_init-0x408>
40048c: 9112a000 add x0, x0, #0x4a8
400490: 90000003 adrp x3, 400000 <_init-0x408>
400494: 911d0063 add x3, x3, #0x740
400498: 90000004 adrp x4, 400000 <_init-0x408>
40049c: 911f0084 add x4, x4, #0x7c0
4004a0: 97ffffe8 bl 400440 <__libc_start_main@plt>
4004a4: 97ffffef bl 400460 <abort@plt>
00000000004004a8 <__wrap_main>:
4004a8: 14000097 b 400704 <main>
4004ac: d503201f nop
00000000004004b0 <_dl_relocate_static_pie>:
4004b0: d65f03c0 ret
00000000004004b4 <call_weak_fn>:
4004b4: 90000080 adrp x0, 410000 <__FRAME_END__+0xf6a8>
4004b8: f947f000 ldr x0, [x0, #4064]
4004bc: b4000040 cbz x0, 4004c4 <call_weak_fn+0x10>
4004c0: 17ffffe4 b 400450 <__gmon_start__@plt>
4004c4: d65f03c0 ret
4004c8: d503201f nop
4004cc: d503201f nop
00000000004004d0 <deregister_tm_clones>:
4004d0: b0000080 adrp x0, 411000 <__libc_start_main@GLIBC_2.17>
4004d4: 9100a000 add x0, x0, #0x28
4004d8: b0000081 adrp x1, 411000 <__libc_start_main@GLIBC_2.17>
4004dc: 9100a021 add x1, x1, #0x28
4004e0: eb00003f cmp x1, x0
4004e4: 540000c0 b.eq 4004fc <deregister_tm_clones+0x2c> // b.none
4004e8: 90000001 adrp x1, 400000 <_init-0x408>
4004ec: f943f021 ldr x1, [x1, #2016]
4004f0: b4000061 cbz x1, 4004fc <deregister_tm_clones+0x2c>
4004f4: aa0103f0 mov x16, x1
4004f8: d61f0200 br x16
4004fc: d65f03c0 ret
0000000000400500 <register_tm_clones>:
400500: b0000080 adrp x0, 411000 <__libc_start_main@GLIBC_2.17>
400504: 9100a000 add x0, x0, #0x28
400508: b0000081 adrp x1, 411000 <__libc_start_main@GLIBC_2.17>
40050c: 9100a021 add x1, x1, #0x28
400510: cb000021 sub x1, x1, x0
400514: d37ffc22 lsr x2, x1, #63
400518: 8b810c41 add x1, x2, x1, asr #3
40051c: 9341fc21 asr x1, x1, #1
400520: b40000c1 cbz x1, 400538 <register_tm_clones+0x38>
400524: 90000002 adrp x2, 400000 <_init-0x408>
400528: f943f442 ldr x2, [x2, #2024]
40052c: b4000062 cbz x2, 400538 <register_tm_clones+0x38>
400530: aa0203f0 mov x16, x2
400534: d61f0200 br x16
400538: d65f03c0 ret
40053c: d503201f nop
0000000000400540 <__do_global_dtors_aux>:
400540: a9be7bfd stp x29, x30, [sp, #-32]!
400544: 910003fd mov x29, sp
400548: f9000bf3 str x19, [sp, #16]
40054c: b0000093 adrp x19, 411000 <__libc_start_main@GLIBC_2.17>
400550: 3940a260 ldrb w0, [x19, #40]
400554: 35000080 cbnz w0, 400564 <__do_global_dtors_aux+0x24>
400558: 97ffffde bl 4004d0 <deregister_tm_clones>
40055c: 52800020 mov w0, #0x1 // #1
400560: 3900a260 strb w0, [x19, #40]
400564: f9400bf3 ldr x19, [sp, #16]
400568: a8c27bfd ldp x29, x30, [sp], #32
40056c: d65f03c0 ret
0000000000400570 <frame_dummy>:
400570: 17ffffe4 b 400500 <register_tm_clones>
0000000000400574 <rgb2bgr_with_neon>:
#include <stdio.h>
#include "arm_neon.h"
void rgb2bgr_with_neon(unsigned char *img, int height, int width)
{
400574: d104c3ff sub sp, sp, #0x130
400578: f90007e0 str x0, [sp, #8]
40057c: b90007e1 str w1, [sp, #4]
400580: b90003e2 str w2, [sp]
const int total_bytes = height * width * 3;
400584: b94007e1 ldr w1, [sp, #4]
400588: b94003e0 ldr w0, [sp]
40058c: 1b007c21 mul w1, w1, w0
400590: 2a0103e0 mov w0, w1
400594: 531f7800 lsl w0, w0, #1
400598: 0b010000 add w0, w0, w1
40059c: b9012be0 str w0, [sp, #296]
const int stride_bytes = 48;
4005a0: 52800600 mov w0, #0x30 // #48
4005a4: b90127e0 str w0, [sp, #292]
for (int i = 0; i < total_bytes; i += stride_bytes)
4005a8: b9012fff str wzr, [sp, #300]
4005ac: 1400004e b 4006e4 <rgb2bgr_with_neon+0x170>
{
uint8_t *target = img + i;
4005b0: b9812fe0 ldrsw x0, [sp, #300]
4005b4: f94007e1 ldr x1, [sp, #8]
4005b8: 8b000020 add x0, x1, x0
4005bc: f9008fe0 str x0, [sp, #280]
4005c0: f9408fe0 ldr x0, [sp, #280]
4005c4: f9006fe0 str x0, [sp, #216]
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld3q_u8 (const uint8_t * __a)
{
uint8x16x3_t ret;
__builtin_aarch64_simd_ci __o;
__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
4005c8: 910283e0 add x0, sp, #0xa0
4005cc: f9406fe1 ldr x1, [sp, #216]
4005d0: 4c404021 ld3 {v1.16b-v3.16b}, [x1]
4005d4: 4c006001 st1 {v1.16b-v3.16b}, [x0]
ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
4005d8: 910283e0 add x0, sp, #0xa0
4005dc: 4c406001 ld1 {v1.16b-v3.16b}, [x0]
4005e0: 4ea11c20 mov v0.16b, v1.16b
4005e4: 9101c3e0 add x0, sp, #0x70
4005e8: 3d800000 str q0, [x0]
ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
4005ec: 910283e0 add x0, sp, #0xa0
4005f0: 4c406001 ld1 {v1.16b-v3.16b}, [x0]
4005f4: 4ea21c40 mov v0.16b, v2.16b
4005f8: 9101c3e0 add x0, sp, #0x70
4005fc: 3d800400 str q0, [x0, #16]
ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
400600: 910283e0 add x0, sp, #0xa0
400604: 4c406001 ld1 {v1.16b-v3.16b}, [x0]
400608: 4ea31c60 mov v0.16b, v3.16b
40060c: 9101c3e0 add x0, sp, #0x70
400610: 3d800800 str q0, [x0, #32]
return ret;
400614: 9101c3e0 add x0, sp, #0x70
400618: 4c406001 ld1 {v1.16b-v3.16b}, [x0]
40061c: 910043e0 add x0, sp, #0x10
400620: 4c006001 st1 {v1.16b-v3.16b}, [x0]
// swap R and B channel with NEON
uint8x16x3_t a = vld3q_u8(target);
uint8x16x3_t b;
b.val[0] = a.val[2];
400624: 910043e0 add x0, sp, #0x10
400628: 3dc00800 ldr q0, [x0, #32]
40062c: 910103e0 add x0, sp, #0x40
400630: 3d800000 str q0, [x0]
b.val[1] = a.val[1];
400634: 910043e0 add x0, sp, #0x10
400638: 3dc00400 ldr q0, [x0, #16]
40063c: 910103e0 add x0, sp, #0x40
400640: 3d800400 str q0, [x0, #16]
b.val[2] = a.val[0];
400644: 910043e0 add x0, sp, #0x10
400648: 3dc00000 ldr q0, [x0]
40064c: 910103e0 add x0, sp, #0x40
400650: 3d800800 str q0, [x0, #32]
400654: f9408fe0 ldr x0, [sp, #280]
400658: f9008be0 str x0, [sp, #272]
40065c: 9101c3e0 add x0, sp, #0x70
400660: 910103e1 add x1, sp, #0x40
400664: 4c406021 ld1 {v1.16b-v3.16b}, [x1]
400668: 4c006001 st1 {v1.16b-v3.16b}, [x0]
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst3q_u8 (uint8_t * __a, uint8x16x3_t __val)
{
__builtin_aarch64_simd_ci __o;
__o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0);
40066c: 9101c3e0 add x0, sp, #0x70
400670: 3dc00000 ldr q0, [x0]
400674: 910383e0 add x0, sp, #0xe0
400678: 910383e1 add x1, sp, #0xe0
40067c: 4c406021 ld1 {v1.16b-v3.16b}, [x1]
400680: 4ea01c01 mov v1.16b, v0.16b
400684: 4c006001 st1 {v1.16b-v3.16b}, [x0]
__o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1);
400688: 9101c3e0 add x0, sp, #0x70
40068c: 3dc00400 ldr q0, [x0, #16]
400690: 910383e0 add x0, sp, #0xe0
400694: 910383e1 add x1, sp, #0xe0
400698: 4c406021 ld1 {v1.16b-v3.16b}, [x1]
40069c: 4ea01c02 mov v2.16b, v0.16b
4006a0: 4c006001 st1 {v1.16b-v3.16b}, [x0]
__o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2);
4006a4: 9101c3e0 add x0, sp, #0x70
4006a8: 3dc00800 ldr q0, [x0, #32]
4006ac: 910383e0 add x0, sp, #0xe0
4006b0: 910383e1 add x1, sp, #0xe0
4006b4: 4c406021 ld1 {v1.16b-v3.16b}, [x1]
4006b8: 4ea01c03 mov v3.16b, v0.16b
4006bc: 4c006001 st1 {v1.16b-v3.16b}, [x0]
__builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
4006c0: f9408be0 ldr x0, [sp, #272]
4006c4: 910383e1 add x1, sp, #0xe0
4006c8: 4c406021 ld1 {v1.16b-v3.16b}, [x1]
4006cc: 4c004001 st3 {v1.16b-v3.16b}, [x0]
}
4006d0: d503201f nop
for (int i = 0; i < total_bytes; i += stride_bytes)
4006d4: b9412fe1 ldr w1, [sp, #300]
4006d8: b94127e0 ldr w0, [sp, #292]
4006dc: 0b000020 add w0, w1, w0
4006e0: b9012fe0 str w0, [sp, #300]
4006e4: b9412fe1 ldr w1, [sp, #300]
4006e8: b9412be0 ldr w0, [sp, #296]
4006ec: 6b00003f cmp w1, w0
4006f0: 54fff60b b.lt 4005b0 <rgb2bgr_with_neon+0x3c> // b.tstop
vst3q_u8(target, b);
}
}
4006f4: d503201f nop
4006f8: d503201f nop
4006fc: 9104c3ff add sp, sp, #0x130
400700: d65f03c0 ret
0000000000400704 <main>:
void main(){
400704: d284e40c mov x12, #0x2720 // #10016
400708: cb2c63ff sub sp, sp, x12
40070c: a9007bfd stp x29, x30, [sp]
400710: 910003fd mov x29, sp
unsigned char buf[100*100];
rgb2bgr_with_neon(buf, 100, 100);
400714: 910043e0 add x0, sp, #0x10
400718: 52800c82 mov w2, #0x64 // #100
40071c: 52800c81 mov w1, #0x64 // #100
400720: 97ffff95 bl 400574 <rgb2bgr_with_neon>
}
400724: d503201f nop
400728: a9407bfd ldp x29, x30, [sp]
40072c: d284e40c mov x12, #0x2720 // #10016
400730: 8b2c63ff add sp, sp, x12
400734: d65f03c0 ret
400738: d503201f nop
40073c: d503201f nop
0000000000400740 <__libc_csu_init>:
400740: a9bc7bfd stp x29, x30, [sp, #-64]!
400744: 910003fd mov x29, sp
400748: a90153f3 stp x19, x20, [sp, #16]
40074c: 90000094 adrp x20, 410000 <__FRAME_END__+0xf6a8>
400750: 9137c294 add x20, x20, #0xdf0
400754: a9025bf5 stp x21, x22, [sp, #32]
400758: 90000095 adrp x21, 410000 <__FRAME_END__+0xf6a8>
40075c: 9137a2b5 add x21, x21, #0xde8
400760: cb150294 sub x20, x20, x21
400764: 2a0003f6 mov w22, w0
400768: a90363f7 stp x23, x24, [sp, #48]
40076c: aa0103f7 mov x23, x1
400770: aa0203f8 mov x24, x2
400774: 9343fe94 asr x20, x20, #3
400778: 97ffff24 bl 400408 <_init>
40077c: b4000174 cbz x20, 4007a8 <__libc_csu_init+0x68>
400780: d2800013 mov x19, #0x0 // #0
400784: d503201f nop
400788: f8737aa3 ldr x3, [x21, x19, lsl #3]
40078c: aa1803e2 mov x2, x24
400790: 91000673 add x19, x19, #0x1
400794: aa1703e1 mov x1, x23
400798: 2a1603e0 mov w0, w22
40079c: d63f0060 blr x3
4007a0: eb13029f cmp x20, x19
4007a4: 54ffff21 b.ne 400788 <__libc_csu_init+0x48> // b.any
4007a8: a94153f3 ldp x19, x20, [sp, #16]
4007ac: a9425bf5 ldp x21, x22, [sp, #32]
4007b0: a94363f7 ldp x23, x24, [sp, #48]
4007b4: a8c47bfd ldp x29, x30, [sp], #64
4007b8: d65f03c0 ret
4007bc: d503201f nop
00000000004007c0 <__libc_csu_fini>:
4007c0: d65f03c0 ret
Disassembly of section .fini:
00000000004007c4 <_fini>:
4007c4: a9bf7bfd stp x29, x30, [sp, #-16]!
4007c8: 910003fd mov x29, sp
4007cc: a8c17bfd ldp x29, x30, [sp], #16
4007d0: d65f03c0 ret