aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Siarhei Siamashka <siarhei.siamashka@nokia.com> 2009-12-21 17:36:45 +0200
committer Siarhei Siamashka <siarhei.siamashka@nokia.com> 2010-11-10 06:33:30 +0200
commit f95b0a1b126a7530a3b9d9fa0717dcdf9d2f2ca3 (patch)
tree 76f1ee11332984c3d09dc5660f65c0a2e4805db2
parent 90a4196134aac33af8a5b2c33fdeb3e916fc2cc1 (diff)
ARM NEON optimized version of 'ycc_rgb_convert'
It is approximately 6x faster than the original C variant.
-rw-r--r-- jdcolor.c 175
1 files changed, 175 insertions, 0 deletions
diff --git a/jdcolor.c b/jdcolor.c
index e02ea4f..079240c 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -4,6 +4,11 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2009, D. R. Commander.
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -119,6 +124,169 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
* offset required on that side.
*/
+#if defined(__ARM_NEON__) && defined(WITH_SIMD) && \
+ (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8)
+
+LOCAL(void)
+yuv444_to_rgb24_neon(unsigned char *rgb,
+ unsigned char *y, unsigned char *u, unsigned char *v,
+ int n)
+{ /* Converts n pixels read from the y, u and v byte arrays into interleaved 3-byte output at rgb, using NEON. */
+ const static signed short c[12] = {
+ 22971, -11277, -23401, 29033, /* fixed-point multipliers, loaded into d1[0..3] */
+ -128, -128, -128, -128, -128, -128, -128, -128 /* bias added to u and v, loaded into q1 (d2, d3) */
+ };
+ asm volatile (
+ ".fpu neon\n"
+ ".macro do_yuv_to_rgb\n" /* in: d0 = y, d4 = u, d5 = v; out: d10, d11, d12 */
+ " vaddw.u8 q3, q1, d4\n" /* q3 = u - 128 */
+ " vaddw.u8 q4, q1, d5\n" /* q4 = v - 128 */
+ " vmull.s16 q10, d6, d1[1]\n" /* multiply by -11277 */
+ " vmull.s16 q11, d7, d1[1]\n" /* multiply by -11277 */
+ " vmlal.s16 q10, d8, d1[2]\n" /* multiply by -23401 */
+ " vmlal.s16 q11, d9, d1[2]\n" /* multiply by -23401 */
+ " vmull.s16 q12, d8, d1[0]\n" /* multiply by 22971 */
+ " vmull.s16 q13, d9, d1[0]\n" /* multiply by 22971 */
+ " vmull.s16 q14, d6, d1[3]\n" /* multiply by 29033 */
+ " vmull.s16 q15, d7, d1[3]\n" /* multiply by 29033 */
+ " vrshrn.s32 d20, q10, #15\n" /* q10 = (u and v terms) >> 15, rounded, narrowed to 16 bits */
+ " vrshrn.s32 d21, q11, #15\n"
+ " vrshrn.s32 d24, q12, #14\n" /* q12 = (v term) >> 14, rounded, narrowed to 16 bits */
+ " vrshrn.s32 d25, q13, #14\n"
+ " vrshrn.s32 d28, q14, #14\n" /* q14 = (u term) >> 14, rounded, narrowed to 16 bits */
+ " vrshrn.s32 d29, q15, #14\n"
+ " vaddw.u8 q10, q10, d0\n" /* add y to each of the three terms */
+ " vaddw.u8 q12, q12, d0\n"
+ " vaddw.u8 q14, q14, d0\n"
+ " vqmovun.s16 d11, q10\n" /* saturate to unsigned 8 bits: d11 = second output byte */
+ " vqmovun.s16 d10, q12\n" /* d10 = first output byte */
+ " vqmovun.s16 d12, q14\n" /* d12 = third output byte */
+ ".endm\n"
+ ".macro do_load size\n" /* loads 'size' samples from each of u, v, y and advances the pointers */
+ ".if \\size == 8\n"
+ "vld1.8 {d4}, [%[u]]!\n"
+ "vld1.8 {d5}, [%[v]]!\n"
+ "vld1.8 {d0}, [%[y]]!\n"
+ "pld [%[y], #64]\n" /* prefetch hints for the following input data */
+ "pld [%[u], #64]\n"
+ "pld [%[v], #64]\n"
+ ".elseif \\size == 4\n" /* lanes 0..3 */
+ "vld1.8 {d4[0]}, [%[u]]!\n"
+ "vld1.8 {d4[1]}, [%[u]]!\n"
+ "vld1.8 {d4[2]}, [%[u]]!\n"
+ "vld1.8 {d4[3]}, [%[u]]!\n"
+ "vld1.8 {d5[0]}, [%[v]]!\n"
+ "vld1.8 {d5[1]}, [%[v]]!\n"
+ "vld1.8 {d5[2]}, [%[v]]!\n"
+ "vld1.8 {d5[3]}, [%[v]]!\n"
+ "vld1.8 {d0[0]}, [%[y]]!\n"
+ "vld1.8 {d0[1]}, [%[y]]!\n"
+ "vld1.8 {d0[2]}, [%[y]]!\n"
+ "vld1.8 {d0[3]}, [%[y]]!\n"
+ ".elseif \\size == 2\n" /* lanes 4..5, disjoint from the size 4 lanes */
+ "vld1.8 {d4[4]}, [%[u]]!\n"
+ "vld1.8 {d4[5]}, [%[u]]!\n"
+ "vld1.8 {d5[4]}, [%[v]]!\n"
+ "vld1.8 {d5[5]}, [%[v]]!\n"
+ "vld1.8 {d0[4]}, [%[y]]!\n"
+ "vld1.8 {d0[5]}, [%[y]]!\n"
+ ".elseif \\size == 1\n" /* lane 6, disjoint from the size 4 and size 2 lanes */
+ "vld1.8 {d4[6]}, [%[u]]!\n"
+ "vld1.8 {d5[6]}, [%[v]]!\n"
+ "vld1.8 {d0[6]}, [%[y]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+ ".endm\n"
+ ".macro do_store size\n" /* stores 'size' interleaved 3-byte pixels, using the same lanes as do_load */
+ ".if \\size == 8\n"
+ "vst3.8 {d10, d11, d12}, [%[rgb]]!\n"
+ ".elseif \\size == 4\n"
+ "vst3.8 {d10[0], d11[0], d12[0]}, [%[rgb]]!\n"
+ "vst3.8 {d10[1], d11[1], d12[1]}, [%[rgb]]!\n"
+ "vst3.8 {d10[2], d11[2], d12[2]}, [%[rgb]]!\n"
+ "vst3.8 {d10[3], d11[3], d12[3]}, [%[rgb]]!\n"
+ ".elseif \\size == 2\n"
+ "vst3.8 {d10[4], d11[4], d12[4]}, [%[rgb]]!\n"
+ "vst3.8 {d10[5], d11[5], d12[5]}, [%[rgb]]!\n"
+ ".elseif \\size == 1\n"
+ "vst3.8 {d10[6], d11[6], d12[6]}, [%[rgb]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+ ".endm\n"
+ "vld1.16 {d1, d2, d3}, [%[c]]\n" /* load constants */
+ "subs %[n], %[n], #8\n" /* n -= 8; negative means fewer than 8 pixels remain */
+ "blt 2f\n"
+ "1:\n" /* main loop: 8 pixels per iteration */
+ "do_load 8\n"
+ "do_yuv_to_rgb\n"
+ "do_store 8\n"
+ "subs %[n], %[n], #8\n"
+ "bge 1b\n"
+ "tst %[n], #7\n" /* low 3 bits of n hold the leftover pixel count */
+ "beq 8f\n"
+ "2:\n" /* tail: gather 4, 2 and 1 leftover pixels as selected by the bits of n */
+ "tst %[n], #4\n"
+ "beq 3f\n"
+ "do_load 4\n"
+ "3:\n"
+ "tst %[n], #2\n"
+ "beq 4f\n"
+ "do_load 2\n"
+ "4:\n"
+ "tst %[n], #1\n"
+ "beq 5f\n"
+ "do_load 1\n"
+ "5:\n"
+ "do_yuv_to_rgb\n" /* converts all lanes; only the lanes loaded above are stored below */
+ "tst %[n], #4\n"
+ "beq 6f\n"
+ "do_store 4\n"
+ "6:\n"
+ "tst %[n], #2\n"
+ "beq 7f\n"
+ "do_store 2\n"
+ "7:\n"
+ "tst %[n], #1\n"
+ "beq 8f\n"
+ "do_store 1\n"
+ "8:\n" /* done */
+ ".purgem do_load\n" /* drop the macro definitions emitted by this asm statement */
+ ".purgem do_yuv_to_rgb\n"
+ ".purgem do_store\n"
+ : [rgb] "+&r" (rgb), [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), /* read-write: advanced by the asm */
+ [n] "+&r" (n)
+ : [c] "r" (&c[0])
+ : "cc", "memory",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+}
+
+/* Convert num_rows rows of three-plane input, starting at input_row, into
+ * the rows of output_buf, one yuv444_to_rgb24_neon call per row. */
+METHODDEF(void)
+ycc_rgb_convert_neon (j_decompress_ptr cinfo,
+ JSAMPIMAGE input_buf, JDIMENSION input_row,
+ JSAMPARRAY output_buf, int num_rows)
+{
+  JDIMENSION num_cols = cinfo->output_width;
+
+  /* The helper takes the row pointers by value, so no pointer needs to be
+   * advanced here after the call; each iteration fetches fresh rows. */
+  while (--num_rows >= 0) {
+    JSAMPROW inptr0 = input_buf[0][input_row];
+    JSAMPROW inptr1 = input_buf[1][input_row];
+    JSAMPROW inptr2 = input_buf[2][input_row];
+    input_row++;
+    yuv444_to_rgb24_neon(*output_buf++, inptr0, inptr1, inptr2, num_cols);
+  }
+}
+
+#endif
+
METHODDEF(void)
ycc_rgb_convert (j_decompress_ptr cinfo,
JSAMPIMAGE input_buf, JDIMENSION input_row,
@@ -365,6 +533,13 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
case JCS_RGB:
case JCS_EXT_RGB:
+#if defined(__ARM_NEON__) && defined(WITH_SIMD) && \
+ (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8)
+ if (cinfo->jpeg_color_space == JCS_YCbCr) {
+ cconvert->pub.color_convert = ycc_rgb_convert_neon;
+ break;
+ }
+#endif
case JCS_EXT_RGBX:
case JCS_EXT_BGR:
case JCS_EXT_BGRX: