diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2009-12-23 04:32:55 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-10 06:31:21 +0200 |
commit | d134ba037329b11d2ba8a9d160989967b2bfa86e (patch) | |
tree | a0222cc85508102e0db3e7afd7823ce1d49f54c3 | |
parent | 4eaa1f42e360681c7b9919c1e10614636d524aa6 (diff) |
ARM NEON optimized version of 'jpeg_idct_4x4'
It is approximately 4x faster than the original C variant.
-rw-r--r-- | jdct.h | 6 | ||||
-rw-r--r-- | jidctred.c | 283 |
2 files changed, 289 insertions, 0 deletions
#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8)

/*
 * ARM NEON optimized version of 'jpeg_idct_4x4'.
 *
 * Dequantizes one 8x8 coefficient block and performs the reduced-size
 * inverse DCT, producing a 4x4 pixel output block.  The entire pipeline
 * (dequantize, two IDCT passes with register transposes, range limiting,
 * and the final byte stores) lives in a single inline-asm statement so
 * all intermediate data stays in NEON registers.
 *
 * cinfo      - decompressor state (unused here; kept so the function
 *              matches the common jpeg_idct_* signature)
 * compptr    - component info; compptr->dct_table supplies the
 *              dequantization multipliers
 * coef_block - input DCT coefficients (8x8 row-major, 16-bit)
 * output_buf - array of output row pointers; the first 4 rows are written
 * output_col - offset of the output block within each row
 */
GLOBAL(void)
jpeg_idct_4x4_neon (j_decompress_ptr cinfo, jpeg_component_info * compptr,
                    JCOEFPTR coef_block,
                    JSAMPARRAY output_buf, JDIMENSION output_col)
{
  JCOEFPTR inptr = coef_block;
  ISLOW_MULT_TYPE * quantptr = compptr->dct_table;
  /* Scratch for the current output row pointer inside the asm.
   * NOTE(review): declared 'int' but holds a JSAMPROW pointer loaded by
   * 'ldr' - fine on 32-bit ARM (the only target of this code path), but
   * worth confirming if this is ever revisited. */
  int tmp;

  /* Fixed-point multiplication constants, preloaded into d0-d2 below.
   * The d-register/lane comments give the NEON location of each value. */
  const static short c[12] = {
      FIX_1_847759065,          /* d0[0] */
      -FIX_0_765366865,         /* d0[1] */
      -FIX_0_211164243,         /* d0[2] */
      FIX_1_451774981,          /* d0[3] */
      -FIX_2_172734803,         /* d1[0] */
      FIX_1_061594337,          /* d1[1] */
      -FIX_0_509795579,         /* d1[2] */
      -FIX_0_601344887,         /* d1[3] */
      FIX_0_899976223,          /* d2[0] */
      FIX_2_562915447,          /* d2[1] */
      1 << (CONST_BITS+1),      /* d2[2] */
      0};                       /* d2[3] */

  asm volatile (
      /* load constants */
      "vld1.16     {d0, d1, d2}, [%[c]]\n"
      /* load all coef block:
       *  0 | d4  d5
       *  1 | d6  d7
       *  2 | d8  d9
       *  3 | d10 d11
       *  4 |
       *  5 | d12 d13
       *  6 | d14 d15
       *  7 | d16 d17
       */
      "vld1.16     {d4, d5, d6, d7}, [%[inptr]]!\n"
      "vld1.16     {d8, d9, d10, d11}, [%[inptr]]!\n"
      /* row 4 is never used by the 4x4 reduced IDCT: skip it (8 shorts) */
      "add         %[inptr], %[inptr], #16\n"
      "vld1.16     {d12, d13, d14, d15}, [%[inptr]]!\n"
      "vld1.16     {d16, d17}, [%[inptr]]!\n"
      /* dequantize (multiplier loads interleaved with the multiplies;
       * the quant table's unused row 4 is skipped the same way) */
      "vld1.16     {d18, d19, d20, d21}, [%[quantptr]]!\n"
      "vmul.s16    q2, q2, q9\n"
      "vld1.16     {d22, d23, d24, d25}, [%[quantptr]]!\n"
      "vmul.s16    q3, q3, q10\n"
      "vmul.s16    q4, q4, q11\n"
      "add         %[quantptr], %[quantptr], #16\n"
      "vld1.16     {d26, d27, d28, d29}, [%[quantptr]]!\n"
      "vmul.s16    q5, q5, q12\n"
      "vmul.s16    q6, q6, q13\n"
      "vld1.16     {d30, d31}, [%[quantptr]]!\n"
      "vmul.s16    q7, q7, q14\n"
      "vmul.s16    q8, q8, q15\n"
      /*
       * Register roles inside the helper macro:
       *   tmp0  : q12
       *   tmp2  : q13
       *   tmp10 : q14
       *   tmp12 : q15
       *
       * One invocation does the 1-D 4-point IDCT on four columns at once:
       * x4..x16 are the input rows 0,1,2,3,5,6,7 and y26..y29 receive the
       * four output rows, already in 0,3,1,2 store order (matching
       * wsptr[0]/[3]/[1]/[2] of the C reference).  'shift' > 16 needs the
       * two-step vrshr+vmovn because vrshrn only encodes shifts up to 16.
       *
       * NOTE(review): the assembler macro is defined inside the asm block
       * and never removed with '.purgem'; if the compiler ever emits this
       * asm statement twice in one translation unit (e.g. function
       * cloning), gas would report a duplicate-macro error - confirm this
       * cannot happen with the supported toolchains.
       */
      ".macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift,"
      "                   y26, y27, y28, y29\n"
      "vmull.s16   q14, \\x4,  d2[2]\n"
      "vmlal.s16   q14, \\x8,  d0[0]\n"
      "vmlal.s16   q14, \\x14, d0[1]\n"

      "vmull.s16   q13, \\x16, d1[2]\n"
      "vmlal.s16   q13, \\x12, d1[3]\n"
      "vmlal.s16   q13, \\x10, d2[0]\n"
      "vmlal.s16   q13, \\x6,  d2[1]\n"

      "vmull.s16   q15, \\x4,  d2[2]\n"
      "vmlsl.s16   q15, \\x8,  d0[0]\n"
      "vmlsl.s16   q15, \\x14, d0[1]\n"

      "vmull.s16   q12, \\x16, d0[2]\n"
      "vmlal.s16   q12, \\x12, d0[3]\n"
      "vmlal.s16   q12, \\x10, d1[0]\n"
      "vmlal.s16   q12, \\x6,  d1[1]\n"

      "vadd.s32    q10, q14, q13\n"
      "vsub.s32    q14, q14, q13\n"
      ".if \\shift > 16\n"
      "  vrshr.s32  q10, q10, #\\shift\n"
      "  vrshr.s32  q14, q14, #\\shift\n"
      "  vmovn.s32  \\y26, q10\n"
      "  vmovn.s32  \\y27, q14\n"
      ".else\n"
      "  vrshrn.s32 \\y26, q10, #\\shift\n"
      "  vrshrn.s32 \\y27, q14, #\\shift\n"
      ".endif\n"
      "vadd.s32    q10, q15, q12\n"
      "vsub.s32    q15, q15, q12\n"
      ".if \\shift > 16\n"
      "  vrshr.s32  q10, q10, #\\shift\n"
      "  vrshr.s32  q15, q15, #\\shift\n"
      "  vmovn.s32  \\y28, q10\n"
      "  vmovn.s32  \\y29, q15\n"
      ".else\n"
      "  vrshrn.s32 \\y28, q10, #\\shift\n"
      "  vrshrn.s32 \\y29, q15, #\\shift\n"
      ".endif\n"
      ".endm\n"
      /* do idct, transposing results after each step */
      /* pass 1: shift 12 = CONST_BITS - PASS1_BITS + 1 of the C code */
      "idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10\n"
      "vtrn.16     d4, d8\n"
      "vtrn.16     d10, d6\n"
      "vtrn.32     d4, d10\n"
      "vtrn.32     d8, d6\n"
      "idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11\n"
      "vtrn.16     d5, d9\n"
      "vtrn.16     d11, d7\n"
      "vtrn.32     d5, d11\n"
      "vtrn.32     d9, d7\n"
      /* pass 2: shift 19 = CONST_BITS + PASS1_BITS + 3 + 1 of the C code */
      "idct_helper d4, d8, d10, d6, d9, d11, d7, 19, d26, d27, d28, d29\n"
      "vtrn.16     d26, d28\n"
      "vtrn.16     d29, d27\n"
      "vtrn.32     d26, d29\n"
      "vtrn.32     d28, d27\n"
      /* range limit: add CENTERJSAMPLE (0x80) then saturate to [0,255] */
      "vmov.u16    q15, #0x80\n"
      "vadd.s16    q13, q13, q15\n"
      "vadd.s16    q14, q14, q15\n"
      "vqmovun.s16 d26, q13\n"
      "vqmovun.s16 d27, q14\n"
      /* store results to the output buffer; after the transposes the
       * rows live in: row0 = d26[0..3], row1 = d27[0..3],
       * row2 = d27[4..7], row3 = d26[4..7] */
      "ldr         %[tmp], [%[output_buf]], #4\n"
      "add         %[tmp], %[tmp], %[output_col]\n"
      "vst1.8      {d26[0]}, [%[tmp]]!\n"
      "vst1.8      {d26[1]}, [%[tmp]]!\n"
      "vst1.8      {d26[2]}, [%[tmp]]!\n"
      "vst1.8      {d26[3]}, [%[tmp]]!\n"

      "ldr         %[tmp], [%[output_buf]], #4\n"
      "add         %[tmp], %[tmp], %[output_col]\n"
      "vst1.8      {d27[0]}, [%[tmp]]!\n"
      "vst1.8      {d27[1]}, [%[tmp]]!\n"
      "vst1.8      {d27[2]}, [%[tmp]]!\n"
      "vst1.8      {d27[3]}, [%[tmp]]!\n"

      "ldr         %[tmp], [%[output_buf]], #4\n"
      "add         %[tmp], %[tmp], %[output_col]\n"
      "vst1.8      {d27[4]}, [%[tmp]]!\n"
      "vst1.8      {d27[5]}, [%[tmp]]!\n"
      "vst1.8      {d27[6]}, [%[tmp]]!\n"
      "vst1.8      {d27[7]}, [%[tmp]]!\n"

      "ldr         %[tmp], [%[output_buf]], #4\n"
      "add         %[tmp], %[tmp], %[output_col]\n"
      "vst1.8      {d26[4]}, [%[tmp]]!\n"
      "vst1.8      {d26[5]}, [%[tmp]]!\n"
      "vst1.8      {d26[6]}, [%[tmp]]!\n"
      "vst1.8      {d26[7]}, [%[tmp]]!\n"

      : [inptr] "+&r" (inptr),
        [quantptr] "+&r" (quantptr),
        [tmp] "=&r" (tmp),
        [output_buf] "+&r" (output_buf)
      : [c] "r" (c),
        [output_col] "r" (output_col)
      /* all 32 NEON d-registers are used, including the callee-saved
       * d8-d15, so every one must be in the clobber list */
      : "cc", "memory",
        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
        "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
}

/* Disabled reference implementation follows. */
#if 0
/*
 * A slightly modified C code (which maps to NEON instructions better),
 * which was used as a reference implementation for converting to NEON.
 *
 * Differences from the stock jidctred.c version visible here: all 64
 * coefficients are dequantized up front in a separate pass (pass 0), and
 * the final descale uses CONST_BITS+PASS1_BITS+3+1 / a 1-bit larger
 * pass-1 shift, matching the shift values (12 and 19) hard-coded in the
 * NEON asm above.  Kept under '#if 0' purely for reference.
 */
GLOBAL(void)
jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
               JCOEFPTR coef_block,
               JSAMPARRAY output_buf, JDIMENSION output_col)
{
  INT32 tmp0, tmp2, tmp10, tmp12;
  INT32 z1, z2, z3, z4;           /* NOTE(review): unused in this variant */
  JCOEFPTR inptr;
  ISLOW_MULT_TYPE * quantptr;
  short * wsptr;
  JSAMPROW outptr;
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
  int ctr;
  short workspace[DCTSIZE*8];     /* buffers data between passes */
  JCOEF dequantized_input[DCTSIZE*8];
  int i, tmp;                     /* NOTE(review): unused in this variant */
  SHIFT_TEMPS

  /* Pass 0: dequantize data.
   * Done for the whole block at once (the stock version dequantizes on
   * the fly), mirroring the NEON code's up-front vmul dequantization. */
  quantptr = compptr->dct_table;
  inptr = coef_block;
  for (ctr = 0; ctr < 64; ctr++)
    dequantized_input[ctr] = DEQUANTIZE(inptr[ctr], quantptr[ctr]);

  /* Pass 1: process columns from input, store into work array.
   * Each column yields 4 outputs; row 4 of the input is never read. */
  inptr = dequantized_input;
  wsptr = workspace;
  for (ctr = DCTSIZE; ctr > 0; inptr++, wsptr+=DCTSIZE, ctr--) {

    /* Even part */
    tmp10 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
          + MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
          + MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
    tmp12 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
          - MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
          - MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);

    /* Odd part */
    tmp0 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_211164243)
         + MULTIPLY(inptr[DCTSIZE*5], FIX_1_451774981)
         + MULTIPLY(inptr[DCTSIZE*3], - FIX_2_172734803)
         + MULTIPLY(inptr[DCTSIZE*1], FIX_1_061594337);

    tmp2 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_509795579)
         + MULTIPLY(inptr[DCTSIZE*5], - FIX_0_601344887)
         + MULTIPLY(inptr[DCTSIZE*3], FIX_0_899976223)
         + MULTIPLY(inptr[DCTSIZE*1], FIX_2_562915447);

    /* Final output stage */
    wsptr[0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
    wsptr[3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
    wsptr[1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
    wsptr[2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
  }

  /* Pass 2: process 4 rows from work array, store into output array. */
  inptr = workspace;
  for (ctr = 0; ctr < 4; ctr++, inptr++) {
    outptr = output_buf[ctr] + output_col;

    /* Even part */
    tmp10 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
          + MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
          + MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
    tmp12 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
          - MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
          - MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);

    /* Odd part */
    tmp0 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_211164243)
         + MULTIPLY(inptr[DCTSIZE*5], FIX_1_451774981)
         + MULTIPLY(inptr[DCTSIZE*3], - FIX_2_172734803)
         + MULTIPLY(inptr[DCTSIZE*1], FIX_1_061594337);

    tmp2 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_509795579)
         + MULTIPLY(inptr[DCTSIZE*5], - FIX_0_601344887)
         + MULTIPLY(inptr[DCTSIZE*3], FIX_0_899976223)
         + MULTIPLY(inptr[DCTSIZE*1], FIX_2_562915447);

    /* Final output stage: descale, re-center around CENTERJSAMPLE via
     * range_limit, and clamp to [0, MAXJSAMPLE] */
    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
                                          CONST_BITS+PASS1_BITS+3+1)
                            & RANGE_MASK];
    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
                                          CONST_BITS+PASS1_BITS+3+1)
                            & RANGE_MASK];
    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
                                          CONST_BITS+PASS1_BITS+3+1)
                            & RANGE_MASK];
    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
                                          CONST_BITS+PASS1_BITS+3+1)
                            & RANGE_MASK];
  }
}

#endif /* 0 */