aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiarhei Siamashka <siarhei.siamashka@nokia.com>2009-12-23 04:32:55 +0200
committerSiarhei Siamashka <siarhei.siamashka@nokia.com>2010-11-10 06:31:21 +0200
commitd134ba037329b11d2ba8a9d160989967b2bfa86e (patch)
treea0222cc85508102e0db3e7afd7823ce1d49f54c3
parent4eaa1f42e360681c7b9919c1e10614636d524aa6 (diff)
ARM NEON optimized version of 'jpeg_idct_4x4'
It is approximately 4x faster than the original C variant.
-rw-r--r--jdct.h6
-rw-r--r--jidctred.c283
2 files changed, 289 insertions, 0 deletions
diff --git a/jdct.h b/jdct.h
index 7b49a97..5e26a7b 100644
--- a/jdct.h
+++ b/jdct.h
@@ -100,6 +100,12 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
#define jpeg_idct_1x1 jRD1x1
#endif /* NEED_SHORT_EXTERNAL_NAMES */
+/* Ensure '_neon' suffixes for optimized functions to simplify profiling */
+
+#if defined(WITH_SIMD) && defined(__ARM_NEON__)
+#define jpeg_idct_4x4 jpeg_idct_4x4_neon
+#endif
+
/* Extern declarations for the forward and inverse DCT routines. */
EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
diff --git a/jidctred.c b/jidctred.c
index 421f3c7..95817f9 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -2,6 +2,11 @@
* jidctred.c
*
* Copyright (C) 1994-1998, Thomas G. Lane.
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -114,6 +119,283 @@
* producing a reduced-size 4x4 output block.
*/
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && (BITS_IN_JSAMPLE == 8)
+
+/* ARM NEON optimized version of 'jpeg_idct_4x4' */
+GLOBAL(void)
+jpeg_idct_4x4_neon (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ /* Computes the reduced-size 4x4 inverse DCT of one dequantized 8x8
+ * coefficient block and stores 4 rows of 4 samples starting at
+ * output_buf[0..3] + output_col. */
+ JCOEFPTR inptr = coef_block;
+ ISLOW_MULT_TYPE * quantptr = compptr->dct_table;
+ int tmp; /* scratch register for the per-row output address */
+
+ /* Fixed-point multiplier table, loaded into NEON registers d0-d2
+ * below; only its address is passed into the asm block. */
+ const static short c[12] = {
+ FIX_1_847759065, /* d0[0] */
+ -FIX_0_765366865, /* d0[1] */
+ -FIX_0_211164243, /* d0[2] */
+ FIX_1_451774981, /* d0[3] */
+ -FIX_2_172734803, /* d1[0] */
+ FIX_1_061594337, /* d1[1] */
+ -FIX_0_509795579, /* d1[2] */
+ -FIX_0_601344887, /* d1[3] */
+ FIX_0_899976223, /* d2[0] */
+ FIX_2_562915447, /* d2[1] */
+ 1 << (CONST_BITS+1), /* d2[2] */
+ 0}; /* d2[3] */
+
+ asm volatile (
+ /* load constants */
+ "vld1.16 {d0, d1, d2}, [%[c]]\n"
+ /* load all coef block:
+ * 0 | d4 d5
+ * 1 | d6 d7
+ * 2 | d8 d9
+ * 3 | d10 d11
+ * 4 |
+ * 5 | d12 d13
+ * 6 | d14 d15
+ * 7 | d16 d17
+ */
+ "vld1.16 {d4, d5, d6, d7}, [%[inptr]]!\n"
+ "vld1.16 {d8, d9, d10, d11}, [%[inptr]]!\n"
+ /* skip row 4: it does not contribute to the 4x4 output */
+ "add %[inptr], %[inptr], #16\n"
+ "vld1.16 {d12, d13, d14, d15}, [%[inptr]]!\n"
+ "vld1.16 {d16, d17}, [%[inptr]]!\n"
+ /* dequantize */
+ "vld1.16 {d18, d19, d20, d21}, [%[quantptr]]!\n"
+ "vmul.s16 q2, q2, q9\n"
+ "vld1.16 {d22, d23, d24, d25}, [%[quantptr]]!\n"
+ "vmul.s16 q3, q3, q10\n"
+ "vmul.s16 q4, q4, q11\n"
+ /* skip quantization row 4 to stay in step with the coefficients */
+ "add %[quantptr], %[quantptr], #16\n"
+ "vld1.16 {d26, d27, d28, d29}, [%[quantptr]]!\n"
+ "vmul.s16 q5, q5, q12\n"
+ "vmul.s16 q6, q6, q13\n"
+ "vld1.16 {d30, d31}, [%[quantptr]]!\n"
+ "vmul.s16 q7, q7, q14\n"
+ "vmul.s16 q8, q8, q15\n"
+ /*
+ * tmp0 : q12
+ * tmp2 : q13
+ * tmp10 : q14
+ * tmp12 : q15
+ */
+ /* NOTE(review): 'idct_helper' is defined with .macro but never
+ * removed with .purgem; if the compiler emits this asm block more
+ * than once in a translation unit the assembler will fail with a
+ * duplicate-macro error -- TODO confirm this function is never
+ * cloned or inlined more than once. */
+ ".macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift,"
+ " y26, y27, y28, y29\n"
+ "vmull.s16 q14, \\x4, d2[2]\n"
+ "vmlal.s16 q14, \\x8, d0[0]\n"
+ "vmlal.s16 q14, \\x14, d0[1]\n"
+
+ "vmull.s16 q13, \\x16, d1[2]\n"
+ "vmlal.s16 q13, \\x12, d1[3]\n"
+ "vmlal.s16 q13, \\x10, d2[0]\n"
+ "vmlal.s16 q13, \\x6, d2[1]\n"
+
+ "vmull.s16 q15, \\x4, d2[2]\n"
+ "vmlsl.s16 q15, \\x8, d0[0]\n"
+ "vmlsl.s16 q15, \\x14, d0[1]\n"
+
+ "vmull.s16 q12, \\x16, d0[2]\n"
+ "vmlal.s16 q12, \\x12, d0[3]\n"
+ "vmlal.s16 q12, \\x10, d1[0]\n"
+ "vmlal.s16 q12, \\x6, d1[1]\n"
+
+ "vadd.s32 q10, q14, q13\n"
+ "vsub.s32 q14, q14, q13\n"
+ /* vrshrn's narrowing-shift immediate only goes up to 16, so larger
+ * shifts fall back to vrshr followed by vmovn */
+ ".if \\shift > 16\n"
+ " vrshr.s32 q10, q10, #\\shift\n"
+ " vrshr.s32 q14, q14, #\\shift\n"
+ " vmovn.s32 \\y26, q10\n"
+ " vmovn.s32 \\y27, q14\n"
+ ".else\n"
+ " vrshrn.s32 \\y26, q10, #\\shift\n"
+ " vrshrn.s32 \\y27, q14, #\\shift\n"
+ ".endif\n"
+ "vadd.s32 q10, q15, q12\n"
+ "vsub.s32 q15, q15, q12\n"
+ ".if \\shift > 16\n"
+ " vrshr.s32 q10, q10, #\\shift\n"
+ " vrshr.s32 q15, q15, #\\shift\n"
+ " vmovn.s32 \\y28, q10\n"
+ " vmovn.s32 \\y29, q15\n"
+ ".else\n"
+ " vrshrn.s32 \\y28, q10, #\\shift\n"
+ " vrshrn.s32 \\y29, q15, #\\shift\n"
+ ".endif\n"
+ ".endm\n"
+ /* do idct, transposing results after each step */
+ /* pass 1 */
+ "idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10\n"
+ "vtrn.16 d4, d8\n"
+ "vtrn.16 d10, d6\n"
+ "vtrn.32 d4, d10\n"
+ "vtrn.32 d8, d6\n"
+ "idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11\n"
+ "vtrn.16 d5, d9\n"
+ "vtrn.16 d11, d7\n"
+ "vtrn.32 d5, d11\n"
+ "vtrn.32 d9, d7\n"
+ /* pass 2 */
+ "idct_helper d4, d8, d10, d6, d9, d11, d7, 19, d26, d27, d28, d29\n"
+ "vtrn.16 d26, d28\n"
+ "vtrn.16 d29, d27\n"
+ "vtrn.32 d26, d29\n"
+ "vtrn.32 d28, d27\n"
+ /* range limit: recenter around 128 and saturate to unsigned 8 bit */
+ "vmov.u16 q15, #0x80\n"
+ "vadd.s16 q13, q13, q15\n"
+ "vadd.s16 q14, q14, q15\n"
+ "vqmovun.s16 d26, q13\n"
+ "vqmovun.s16 d27, q14\n"
+ /* store results to the output buffer, one 4-sample row at a time;
+ * output_buf itself is advanced by the post-indexed ldr */
+ "ldr %[tmp], [%[output_buf]], #4\n"
+ "add %[tmp], %[tmp], %[output_col]\n"
+ "vst1.8 {d26[0]}, [%[tmp]]!\n"
+ "vst1.8 {d26[1]}, [%[tmp]]!\n"
+ "vst1.8 {d26[2]}, [%[tmp]]!\n"
+ "vst1.8 {d26[3]}, [%[tmp]]!\n"
+
+ "ldr %[tmp], [%[output_buf]], #4\n"
+ "add %[tmp], %[tmp], %[output_col]\n"
+ "vst1.8 {d27[0]}, [%[tmp]]!\n"
+ "vst1.8 {d27[1]}, [%[tmp]]!\n"
+ "vst1.8 {d27[2]}, [%[tmp]]!\n"
+ "vst1.8 {d27[3]}, [%[tmp]]!\n"
+
+ "ldr %[tmp], [%[output_buf]], #4\n"
+ "add %[tmp], %[tmp], %[output_col]\n"
+ "vst1.8 {d27[4]}, [%[tmp]]!\n"
+ "vst1.8 {d27[5]}, [%[tmp]]!\n"
+ "vst1.8 {d27[6]}, [%[tmp]]!\n"
+ "vst1.8 {d27[7]}, [%[tmp]]!\n"
+
+ "ldr %[tmp], [%[output_buf]], #4\n"
+ "add %[tmp], %[tmp], %[output_col]\n"
+ "vst1.8 {d26[4]}, [%[tmp]]!\n"
+ "vst1.8 {d26[5]}, [%[tmp]]!\n"
+ "vst1.8 {d26[6]}, [%[tmp]]!\n"
+ "vst1.8 {d26[7]}, [%[tmp]]!\n"
+
+ : [inptr] "+&r" (inptr),
+ [quantptr] "+&r" (quantptr),
+ [tmp] "=&r" (tmp),
+ [output_buf] "+&r" (output_buf)
+ : [c] "r" (c),
+ [output_col] "r" (output_col)
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+}
+
+#if 0
+
+/*
+ * A slightly modified C code (which maps to NEON instructions better),
+ * which was used as a reference implementation for converting to NEON.
+ */
+GLOBAL(void)
+jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ /* Reference C implementation (disabled by the surrounding '#if 0'):
+ * restructured so each step maps 1:1 onto the NEON code -- the whole
+ * block is dequantized up front in a separate pass 0, mirroring the
+ * vmul.s16 dequantization in the asm version. */
+ INT32 tmp0, tmp2, tmp10, tmp12;
+ INT32 z1, z2, z3, z4; /* NOTE(review): declared but never used here */
+ JCOEFPTR inptr;
+ ISLOW_MULT_TYPE * quantptr;
+ short * wsptr;
+ JSAMPROW outptr;
+ JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+ int ctr;
+ short workspace[DCTSIZE*8]; /* buffers data between passes */
+ JCOEF dequantized_input[DCTSIZE*8];
+ int i, tmp; /* NOTE(review): also unused in this variant */
+ SHIFT_TEMPS
+
+ /* Pass 0: dequantize data. */
+ quantptr = compptr->dct_table;
+ inptr = coef_block;
+ for (ctr = 0; ctr < 64; ctr++)
+ dequantized_input[ctr] = DEQUANTIZE(inptr[ctr], quantptr[ctr]);
+
+ /* Pass 1: process columns from input, store into work array. */
+ inptr = dequantized_input;
+ wsptr = workspace;
+ for (ctr = DCTSIZE; ctr > 0; inptr++, wsptr+=DCTSIZE, ctr--) {
+
+ /* Even part */
+ tmp10 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
+ + MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
+ + MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
+ tmp12 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
+ - MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
+ - MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
+
+ /* Odd part */
+ tmp0 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_211164243)
+ + MULTIPLY(inptr[DCTSIZE*5], FIX_1_451774981)
+ + MULTIPLY(inptr[DCTSIZE*3], - FIX_2_172734803)
+ + MULTIPLY(inptr[DCTSIZE*1], FIX_1_061594337);
+
+ tmp2 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_509795579)
+ + MULTIPLY(inptr[DCTSIZE*5], - FIX_0_601344887)
+ + MULTIPLY(inptr[DCTSIZE*3], FIX_0_899976223)
+ + MULTIPLY(inptr[DCTSIZE*1], FIX_2_562915447);
+
+ /* Final output stage -- descale matches the '12' shift used in
+ * NEON pass 1 (presumably CONST_BITS=13, PASS1_BITS=2 -- confirm
+ * against jidctred.c's definitions) */
+ wsptr[0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
+ wsptr[3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
+ wsptr[1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
+ wsptr[2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
+ }
+
+ /* Pass 2: process 4 rows from work array, store into output array. */
+ inptr = workspace;
+ for (ctr = 0; ctr < 4; ctr++, inptr++) {
+ outptr = output_buf[ctr] + output_col;
+
+ /* Even part */
+ tmp10 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
+ + MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
+ + MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
+ tmp12 = (inptr[DCTSIZE*0] << (CONST_BITS+1))
+ - MULTIPLY(inptr[DCTSIZE*2], FIX_1_847759065)
+ - MULTIPLY(inptr[DCTSIZE*6], - FIX_0_765366865);
+
+ /* Odd part */
+ tmp0 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_211164243)
+ + MULTIPLY(inptr[DCTSIZE*5], FIX_1_451774981)
+ + MULTIPLY(inptr[DCTSIZE*3], - FIX_2_172734803)
+ + MULTIPLY(inptr[DCTSIZE*1], FIX_1_061594337);
+
+ tmp2 = MULTIPLY(inptr[DCTSIZE*7], - FIX_0_509795579)
+ + MULTIPLY(inptr[DCTSIZE*5], - FIX_0_601344887)
+ + MULTIPLY(inptr[DCTSIZE*3], FIX_0_899976223)
+ + MULTIPLY(inptr[DCTSIZE*1], FIX_2_562915447);
+
+ /* Final output stage -- total descale matches the '19' shift used
+ * in NEON pass 2 */
+ outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
+ CONST_BITS+PASS1_BITS+3+1)
+ & RANGE_MASK];
+ outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
+ CONST_BITS+PASS1_BITS+3+1)
+ & RANGE_MASK];
+ outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
+ CONST_BITS+PASS1_BITS+3+1)
+ & RANGE_MASK];
+ outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
+ CONST_BITS+PASS1_BITS+3+1)
+ & RANGE_MASK];
+ }
+}
+
+#endif
+
+#else
+
GLOBAL(void)
jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
JCOEFPTR coef_block,
@@ -261,6 +543,7 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
}
}
+#endif
/*
* Perform dequantization and inverse DCT on one block of coefficients,