diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2009-12-21 17:36:45 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-10 06:33:30 +0200 |
commit | f95b0a1b126a7530a3b9d9fa0717dcdf9d2f2ca3 (patch) | |
tree | 76f1ee11332984c3d09dc5660f65c0a2e4805db2 | |
parent | 90a4196134aac33af8a5b2c33fdeb3e916fc2cc1 (diff) |
ARM NEON optimized version of 'ycc_rgb_convert'
It is approximately 6x faster than the original C variant.
-rw-r--r-- | jdcolor.c | 175 |
1 file changed, 175 insertions, 0 deletions
#if defined(__ARM_NEON__) && defined(WITH_SIMD) && \
    (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8)

/*
 * Convert n pixels of fully-sampled (4:4:4) YCbCr to packed 24-bit RGB
 * using ARM NEON.  Eight pixels are processed per main-loop iteration;
 * a tail of 1-7 pixels is handled by loading/storing individual vector
 * lanes (sizes 4, 2 and 1 fill lanes [0..3], [4..5] and [6] respectively,
 * then convert once).
 *
 * Fixed-point coefficients (the shifts #14/#15 below give the scale):
 *   R = Y + 22971/2^14 * (Cr-128)                        (~ 1.40200)
 *   G = Y - 11277/2^15 * (Cb-128) - 23401/2^15 * (Cr-128) (~ 0.34414/0.71414)
 *   B = Y + 29033/2^14 * (Cb-128)                        (~ 1.77200)
 * These match the BT.601 constants used elsewhere in this file
 * (build_ycc_rgb_table) — presumably bit-exact up to rounding; the
 * rounding narrowing shift (vrshrn) may differ by 1 LSB from the C
 * table-based path.
 *
 * rgb   : output buffer, 3 * n bytes, R/G/B interleaved
 * y,u,v : input component rows, n bytes each (advanced by the asm)
 * n     : pixel count
 */
LOCAL(void)
yuv444_to_rgb24_neon(unsigned char *rgb,
                     unsigned char *y, unsigned char *u, unsigned char *v,
                     int n)
{
  /* c[0..3] is loaded into d1 (the four multipliers); c[4..11] into
     q1 = {d2, d3} (eight copies of -128 used to center Cb/Cr). */
  static const signed short c[12] = {
    22971, -11277, -23401, 29033,
    -128, -128, -128, -128, -128, -128, -128, -128
  };
  asm volatile (
    ".fpu neon\n"
    /* Convert 8 pixels: inputs d0 = Y, d4 = Cb, d5 = Cr;
       outputs d10 = R, d11 = G, d12 = B. */
    ".macro do_yuv_to_rgb\n"
    " vaddw.u8 q3, q1, d4\n"         /* q3 = u - 128 */
    " vaddw.u8 q4, q1, d5\n"         /* q4 = v - 128 */
    " vmull.s16 q10, d6, d1[1]\n"    /* multiply by -11277 */
    " vmull.s16 q11, d7, d1[1]\n"    /* multiply by -11277 */
    " vmlal.s16 q10, d8, d1[2]\n"    /* multiply by -23401 */
    " vmlal.s16 q11, d9, d1[2]\n"    /* multiply by -23401 */
    " vmull.s16 q12, d8, d1[0]\n"    /* multiply by 22971 */
    " vmull.s16 q13, d9, d1[0]\n"    /* multiply by 22971 */
    " vmull.s16 q14, d6, d1[3]\n"    /* multiply by 29033 */
    " vmull.s16 q15, d7, d1[3]\n"    /* multiply by 29033 */
    " vrshrn.s32 d20, q10, #15\n"    /* chroma terms, rounded back to s16 */
    " vrshrn.s32 d21, q11, #15\n"
    " vrshrn.s32 d24, q12, #14\n"
    " vrshrn.s32 d25, q13, #14\n"
    " vrshrn.s32 d28, q14, #14\n"
    " vrshrn.s32 d29, q15, #14\n"
    " vaddw.u8 q10, q10, d0\n"       /* add Y */
    " vaddw.u8 q12, q12, d0\n"
    " vaddw.u8 q14, q14, d0\n"
    " vqmovun.s16 d11, q10\n"        /* saturate to [0,255]: G */
    " vqmovun.s16 d10, q12\n"        /* R */
    " vqmovun.s16 d12, q14\n"        /* B */
    ".endm\n"
    /* Load \size pixels of Y/Cb/Cr, post-incrementing the pointers. */
    ".macro do_load size\n"
    ".if \\size == 8\n"
    "vld1.8 {d4}, [%[u]]!\n"
    "vld1.8 {d5}, [%[v]]!\n"
    "vld1.8 {d0}, [%[y]]!\n"
    "pld [%[y], #64]\n"              /* prefetch the next cache lines */
    "pld [%[u], #64]\n"
    "pld [%[v], #64]\n"
    ".elseif \\size == 4\n"
    "vld1.8 {d4[0]}, [%[u]]!\n"
    "vld1.8 {d4[1]}, [%[u]]!\n"
    "vld1.8 {d4[2]}, [%[u]]!\n"
    "vld1.8 {d4[3]}, [%[u]]!\n"
    "vld1.8 {d5[0]}, [%[v]]!\n"
    "vld1.8 {d5[1]}, [%[v]]!\n"
    "vld1.8 {d5[2]}, [%[v]]!\n"
    "vld1.8 {d5[3]}, [%[v]]!\n"
    "vld1.8 {d0[0]}, [%[y]]!\n"
    "vld1.8 {d0[1]}, [%[y]]!\n"
    "vld1.8 {d0[2]}, [%[y]]!\n"
    "vld1.8 {d0[3]}, [%[y]]!\n"
    ".elseif \\size == 2\n"
    "vld1.8 {d4[4]}, [%[u]]!\n"
    "vld1.8 {d4[5]}, [%[u]]!\n"
    "vld1.8 {d5[4]}, [%[v]]!\n"
    "vld1.8 {d5[5]}, [%[v]]!\n"
    "vld1.8 {d0[4]}, [%[y]]!\n"
    "vld1.8 {d0[5]}, [%[y]]!\n"
    ".elseif \\size == 1\n"
    "vld1.8 {d4[6]}, [%[u]]!\n"
    "vld1.8 {d5[6]}, [%[v]]!\n"
    "vld1.8 {d0[6]}, [%[y]]!\n"
    ".else\n"
    ".error \"unsupported macroblock size\"\n"
    ".endif\n"
    ".endm\n"
    /* Store \size converted pixels, interleaved R,G,B. */
    ".macro do_store size\n"
    ".if \\size == 8\n"
    "vst3.8 {d10, d11, d12}, [%[rgb]]!\n"
    ".elseif \\size == 4\n"
    "vst3.8 {d10[0], d11[0], d12[0]}, [%[rgb]]!\n"
    "vst3.8 {d10[1], d11[1], d12[1]}, [%[rgb]]!\n"
    "vst3.8 {d10[2], d11[2], d12[2]}, [%[rgb]]!\n"
    "vst3.8 {d10[3], d11[3], d12[3]}, [%[rgb]]!\n"
    ".elseif \\size == 2\n"
    "vst3.8 {d10[4], d11[4], d12[4]}, [%[rgb]]!\n"
    "vst3.8 {d10[5], d11[5], d12[5]}, [%[rgb]]!\n"
    ".elseif \\size == 1\n"
    "vst3.8 {d10[6], d11[6], d12[6]}, [%[rgb]]!\n"
    ".else\n"
    ".error \"unsupported macroblock size\"\n"
    ".endif\n"
    ".endm\n"
    "vld1.16 {d1, d2, d3}, [%[c]]\n" /* load constants */
    "subs %[n], %[n], #8\n"
    "blt 2f\n"
    /* Main loop: 8 pixels per iteration. */
    "1:\n"
    "do_load 8\n"
    "do_yuv_to_rgb\n"
    "do_store 8\n"
    "subs %[n], %[n], #8\n"
    "bge 1b\n"
    "tst %[n], #7\n"
    "beq 8f\n"
    /* Tail: gather the remaining 1-7 pixels into lanes, convert once,
       then scatter them back out. */
    "2:\n"
    "tst %[n], #4\n"
    "beq 3f\n"
    "do_load 4\n"
    "3:\n"
    "tst %[n], #2\n"
    "beq 4f\n"
    "do_load 2\n"
    "4:\n"
    "tst %[n], #1\n"
    "beq 5f\n"
    "do_load 1\n"
    "5:\n"
    "do_yuv_to_rgb\n"
    "tst %[n], #4\n"
    "beq 6f\n"
    "do_store 4\n"
    "6:\n"
    "tst %[n], #2\n"
    "beq 7f\n"
    "do_store 2\n"
    "7:\n"
    "tst %[n], #1\n"
    "beq 8f\n"
    "do_store 1\n"
    "8:\n"
    ".purgem do_load\n"
    ".purgem do_yuv_to_rgb\n"
    ".purgem do_store\n"
    : [rgb] "+&r" (rgb), [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v),
      [n] "+&r" (n)
    : [c] "r" (&c[0])
    : "cc", "memory",
      "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
      "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
}

/*
 * color_convert method: convert num_rows rows of 4:4:4 YCbCr starting at
 * input_row into packed RGB, one output row per call to the NEON helper.
 * Installed by jinit_color_deconverter only for the 8-bit, 3-byte RGB
 * output format (see the #if guard above).
 *
 * NOTE(review): num_cols is JDIMENSION (unsigned) narrowed to the
 * helper's int parameter — fine for any realistic JPEG width.
 */
METHODDEF(void)
ycc_rgb_convert_neon (j_decompress_ptr cinfo,
                      JSAMPIMAGE input_buf, JDIMENSION input_row,
                      JSAMPARRAY output_buf, int num_rows)
{
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2;
  JDIMENSION num_cols = cinfo->output_width;

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    /* The helper advances its own pointer copies; outptr is reassigned
       at the top of each iteration, so no post-call adjustment is
       needed (the original's "outptr += 3 * num_cols;" was dead code
       and has been removed). */
    yuv444_to_rgb24_neon(outptr, inptr0, inptr1, inptr2, num_cols);
  }
}

#endif