diff options
author | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-01-21 02:35:17 +0200 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@nokia.com> | 2010-11-10 06:33:45 +0200 |
commit | d7f750fa9fb6b55e2ffb3a6d6cafa3cb4494cec4 (patch) | |
tree | a0b959f1f50f4dc29bcddb1071bd7419707d7fd6 | |
parent | f95b0a1b126a7530a3b9d9fa0717dcdf9d2f2ca3 (diff) |
ARM NEON optimized version of 'rgb_ycc_convert'
It is approximately 10x faster than the original C variant.
-rw-r--r-- | jccolor.c | 194 |
1 files changed, 194 insertions, 0 deletions
@@ -4,6 +4,11 @@ * Copyright (C) 1991-1996, Thomas G. Lane. * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright 2009 D. R. Commander + * + * ARM NEON optimizations + * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. + * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com> + * * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. * @@ -197,6 +202,187 @@ rgb_ycc_start (j_compress_ptr cinfo) * offset required on that side. */ +#if defined(WITH_SIMD) && defined(__ARM_NEON__) && \ + (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8) + +LOCAL(void) +rgb24_to_yuv444_neon (unsigned char *rgb, + unsigned char *y, unsigned char *u, unsigned char *v, + int n) +{ + const static unsigned short c[16] = { + FIX(0.29900), FIX(0.58700), FIX(0.11400), /* d0[0], d0[1], d0[2] */ + FIX(0.16874), FIX(0.33126), FIX(0.50000), /* -d0[3], -d1[0], d1[1] */ + FIX(0.41869), FIX(0.08131), /* d1[1], -d1[2], -d1[3] */ + 0x7FFF, 128, 0x7FFF, 128, 0x7FFF, 128, 0x7FFF, 128 + }; + asm volatile ( + /* r = d10, g = d11, b = d12 */ + ".macro do_rgb_to_yuv\n" + " vmovl.u8 q2, d10\n" /* r = { d4, d5 } */ + " vmovl.u8 q3, d11\n" /* g = { d6, d7 } */ + " vmovl.u8 q4, d12\n" /* b = { d8, d9 } */ + + " vmull.u16 q10, d4, d0[0]\n" + " vmlal.u16 q10, d6, d0[1]\n" + " vmlal.u16 q10, d8, d0[2]\n" + " vmull.u16 q11, d5, d0[0]\n" + " vmlal.u16 q11, d7, d0[1]\n" + " vmlal.u16 q11, d9, d0[2]\n" + + " vmov.u32 q12, q1\n" + " vmov.u32 q13, q1\n" + " vmlsl.u16 q12, d4, d0[3]\n" + " vmlsl.u16 q12, d6, d1[0]\n" + " vmlal.u16 q12, d8, d1[1]\n" + " vmlsl.u16 q13, d5, d0[3]\n" + " vmlsl.u16 q13, d7, d1[0]\n" + " vmlal.u16 q13, d9, d1[1]\n" + + " vmov.u32 q14, q1\n" + " vmov.u32 q15, q1\n" + " vmlal.u16 q14, d4, d1[1]\n" + " vmlsl.u16 q14, d6, d1[2]\n" + " vmlsl.u16 q14, d8, d1[3]\n" + " vmlal.u16 q15, d5, d1[1]\n" + " vmlsl.u16 q15, d7, d1[2]\n" + " vmlsl.u16 q15, d9, d1[3]\n" + + 
" vrshrn.u32 d20, q10, #16\n" + " vrshrn.u32 d21, q11, #16\n" + " vshrn.u32 d24, q12, #16\n" + " vshrn.u32 d25, q13, #16\n" + " vshrn.u32 d28, q14, #16\n" + " vshrn.u32 d29, q15, #16\n" + + " vmovn.u16 d20, q10\n" /* d20 = y */ + " vmovn.u16 d21, q12\n" /* d21 = u */ + " vmovn.u16 d22, q14\n" /* d22 = v */ + ".endm\n" + ".macro do_load size\n" + ".if \\size == 8\n" + "vld3.8 {d10, d11, d12}, [%[rgb]]!\n" + "pld [%[rgb], #128]\n" + ".elseif \\size == 4\n" + "vld3.8 {d10[0], d11[0], d12[0]}, [%[rgb]]!\n" + "vld3.8 {d10[1], d11[1], d12[1]}, [%[rgb]]!\n" + "vld3.8 {d10[2], d11[2], d12[2]}, [%[rgb]]!\n" + "vld3.8 {d10[3], d11[3], d12[3]}, [%[rgb]]!\n" + ".elseif \\size == 2\n" + "vld3.8 {d10[4], d11[4], d12[4]}, [%[rgb]]!\n" + "vld3.8 {d10[5], d11[5], d12[5]}, [%[rgb]]!\n" + ".elseif \\size == 1\n" + "vld3.8 {d10[6], d11[6], d12[6]}, [%[rgb]]!\n" + ".else\n" + ".error \"unsupported macroblock size\"\n" + ".endif\n" + ".endm\n" + ".macro do_store size\n" + ".if \\size == 8\n" + "vst1.8 {d21}, [%[u]]!\n" + "vst1.8 {d22}, [%[v]]!\n" + "vst1.8 {d20}, [%[y]]!\n" + ".elseif \\size == 4\n" + "vst1.8 {d21[0]}, [%[u]]!\n" + "vst1.8 {d21[1]}, [%[u]]!\n" + "vst1.8 {d21[2]}, [%[u]]!\n" + "vst1.8 {d21[3]}, [%[u]]!\n" + "vst1.8 {d22[0]}, [%[v]]!\n" + "vst1.8 {d22[1]}, [%[v]]!\n" + "vst1.8 {d22[2]}, [%[v]]!\n" + "vst1.8 {d22[3]}, [%[v]]!\n" + "vst1.8 {d20[0]}, [%[y]]!\n" + "vst1.8 {d20[1]}, [%[y]]!\n" + "vst1.8 {d20[2]}, [%[y]]!\n" + "vst1.8 {d20[3]}, [%[y]]!\n" + ".elseif \\size == 2\n" + "vst1.8 {d21[4]}, [%[u]]!\n" + "vst1.8 {d21[5]}, [%[u]]!\n" + "vst1.8 {d22[4]}, [%[v]]!\n" + "vst1.8 {d22[5]}, [%[v]]!\n" + "vst1.8 {d20[4]}, [%[y]]!\n" + "vst1.8 {d20[5]}, [%[y]]!\n" + ".elseif \\size == 1\n" + "vst1.8 {d21[6]}, [%[u]]!\n" + "vst1.8 {d22[6]}, [%[v]]!\n" + "vst1.8 {d20[6]}, [%[y]]!\n" + ".else\n" + ".error \"unsupported macroblock size\"\n" + ".endif\n" + ".endm\n" + + "vld1.16 {d0, d1, d2, d3}, [%[c]]\n" /* load constants */ + "subs %[n], %[n], #8\n" + "blt 2f\n" + "1:\n" + 
"do_load 8\n" + "do_rgb_to_yuv\n" + "do_store 8\n" + "subs %[n], %[n], #8\n" + "bge 1b\n" + "tst %[n], #7\n" + "beq 8f\n" + "2:\n" + "tst %[n], #4\n" + "beq 3f\n" + "do_load 4\n" + "3:\n" + "tst %[n], #2\n" + "beq 4f\n" + "do_load 2\n" + "4:\n" + "tst %[n], #1\n" + "beq 5f\n" + "do_load 1\n" + "5:\n" + "do_rgb_to_yuv\n" + "tst %[n], #4\n" + "beq 6f\n" + "do_store 4\n" + "6:\n" + "tst %[n], #2\n" + "beq 7f\n" + "do_store 2\n" + "7:\n" + "tst %[n], #1\n" + "beq 8f\n" + "do_store 1\n" + "8:\n" + ".purgem do_load\n" + ".purgem do_rgb_to_yuv\n" + ".purgem do_store\n" + : [rgb] "+&r" (rgb), [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), + [n] "+&r" (n) + : [c] "r" (&c[0]) + : "cc", "memory", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"); +} + +METHODDEF(void) +rgb_ycc_convert_neon (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ + register JSAMPROW inptr; + register JSAMPROW outptr0, outptr1, outptr2; + JDIMENSION num_cols = cinfo->image_width; + + while (--num_rows >= 0) { + inptr = *input_buf++; + outptr0 = output_buf[0][output_row]; + outptr1 = output_buf[1][output_row]; + outptr2 = output_buf[2][output_row]; + output_row++; + + rgb24_to_yuv444_neon(inptr, outptr0, outptr1, outptr2, num_cols); + } +} + +#endif + METHODDEF(void) rgb_ycc_convert (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf, @@ -517,6 +703,14 @@ jinit_color_converter (j_compress_ptr cinfo) case JCS_YCbCr: if (cinfo->num_components != 3) ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); +#if defined(WITH_SIMD) && defined(__ARM_NEON__) && \ + (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8) + if (cinfo->in_color_space == JCS_RGB || + cinfo->in_color_space == JCS_EXT_RGB) { + cconvert->pub.color_convert = rgb_ycc_convert_neon; + break; + } +#endif if 
(cinfo->in_color_space == JCS_RGB || cinfo->in_color_space == JCS_EXT_RGB || cinfo->in_color_space == JCS_EXT_RGBX || |