aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Siarhei Siamashka <siarhei.siamashka@nokia.com>, 2010-01-21 02:35:17 +0200
committer: Siarhei Siamashka <siarhei.siamashka@nokia.com>, 2010-11-10 06:33:45 +0200
commit: d7f750fa9fb6b55e2ffb3a6d6cafa3cb4494cec4 (patch)
tree: a0b959f1f50f4dc29bcddb1071bd7419707d7fd6
parent: f95b0a1b126a7530a3b9d9fa0717dcdf9d2f2ca3 (diff)
ARM NEON optimized version of 'rgb_ycc_convert'
Is approximately 10x faster than original C variant.
-rw-r--r-- jccolor.c 194
1 files changed, 194 insertions, 0 deletions
diff --git a/jccolor.c b/jccolor.c
index 2e2bfd2..69412b3 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -4,6 +4,11 @@
* Copyright (C) 1991-1996, Thomas G. Lane.
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright 2009 D. R. Commander
+ *
+ * ARM NEON optimizations
+ * Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+ * Contact: Alexander Bokovoy <alexander.bokovoy@nokia.com>
+ *
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
@@ -197,6 +202,187 @@ rgb_ycc_start (j_compress_ptr cinfo)
* offset required on that side.
*/
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && \
+ (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8)
+
+LOCAL(void)
+rgb24_to_yuv444_neon (unsigned char *rgb,
+ unsigned char *y, unsigned char *u, unsigned char *v,
+ int n)
+{ /*
+ * Converts n pixels from the interleaved three-byte-per-pixel buffer at rgb
+ * into three separate one-byte-per-pixel output buffers.  The first byte of
+ * each pixel is treated as r, the second as g and the third as b.
+ *
+ *   rgb      source pixels, read 3 bytes at a time
+ *   y, u, v  destinations for the three results, n bytes written to each
+ *   n        pixel count
+ *
+ * Pixels are processed eight at a time; a remainder of 1..7 is loaded
+ * into spare vector lanes (4, 2, then 1 pixel), converted in one more
+ * pass and stored lane by lane.
+ *
+ * NOTE(review): FIX() is defined outside this block; the 16-bit shifts
+ * below suggest it yields 16-bit fractional fixed point - confirm.
+ */
+ const static unsigned short c[16] = { /* loaded into d0-d3 by the vld1.16 below */
+ FIX(0.29900), FIX(0.58700), FIX(0.11400), /* d0[0], d0[1], d0[2] */
+ FIX(0.16874), FIX(0.33126), FIX(0.50000), /* -d0[3], -d1[0], d1[1] */
+ FIX(0.41869), FIX(0.08131), /* stored in d1[2], d1[3]; V uses d1[1], -d1[2], -d1[3] */
+ 0x7FFF, 128, 0x7FFF, 128, 0x7FFF, 128, 0x7FFF, 128 /* d2-d3 (q1): start value of the U and V accumulators; assuming a little-endian target each 32-bit lane is (128 << 16) + 0x7FFF */
+ };
+ asm volatile (
+ /* r = d10, g = d11, b = d12 */
+ ".macro do_rgb_to_yuv\n" /* converts the 8 lanes held in d10-d12; results in d20-d22 */
+ " vmovl.u8 q2, d10\n" /* r = { d4, d5 } */
+ " vmovl.u8 q3, d11\n" /* g = { d6, d7 } */
+ " vmovl.u8 q4, d12\n" /* b = { d8, d9 } */
+
+ " vmull.u16 q10, d4, d0[0]\n" /* q10 = Y sum for lanes 0-3: r * d0[0] */
+ " vmlal.u16 q10, d6, d0[1]\n" /* + g * d0[1] */
+ " vmlal.u16 q10, d8, d0[2]\n" /* + b * d0[2] */
+ " vmull.u16 q11, d5, d0[0]\n" /* q11 = same Y sum for lanes 4-7 */
+ " vmlal.u16 q11, d7, d0[1]\n"
+ " vmlal.u16 q11, d9, d0[2]\n"
+
+ " vmov.u32 q12, q1\n" /* U accumulators (q12: lanes 0-3, q13: lanes 4-7) start from q1 */
+ " vmov.u32 q13, q1\n"
+ " vmlsl.u16 q12, d4, d0[3]\n" /* - r * d0[3] */
+ " vmlsl.u16 q12, d6, d1[0]\n" /* - g * d1[0] */
+ " vmlal.u16 q12, d8, d1[1]\n" /* + b * d1[1] */
+ " vmlsl.u16 q13, d5, d0[3]\n"
+ " vmlsl.u16 q13, d7, d1[0]\n"
+ " vmlal.u16 q13, d9, d1[1]\n"
+
+ " vmov.u32 q14, q1\n" /* V accumulators (q14: lanes 0-3, q15: lanes 4-7) start from q1 */
+ " vmov.u32 q15, q1\n"
+ " vmlal.u16 q14, d4, d1[1]\n" /* + r * d1[1] */
+ " vmlsl.u16 q14, d6, d1[2]\n" /* - g * d1[2] */
+ " vmlsl.u16 q14, d8, d1[3]\n" /* - b * d1[3] */
+ " vmlal.u16 q15, d5, d1[1]\n"
+ " vmlsl.u16 q15, d7, d1[2]\n"
+ " vmlsl.u16 q15, d9, d1[3]\n"
+
+ " vrshrn.u32 d20, q10, #16\n" /* Y: rounding narrow, drops the low 16 bits */
+ " vrshrn.u32 d21, q11, #16\n"
+ " vshrn.u32 d24, q12, #16\n" /* U and V: truncating narrow, drops the low 16 bits */
+ " vshrn.u32 d25, q13, #16\n"
+ " vshrn.u32 d28, q14, #16\n"
+ " vshrn.u32 d29, q15, #16\n"
+
+ " vmovn.u16 d20, q10\n" /* d20 = y */
+ " vmovn.u16 d21, q12\n" /* d21 = u */
+ " vmovn.u16 d22, q14\n" /* d22 = v */
+ ".endm\n"
+ ".macro do_load size\n" /* loads 'size' pixels and advances rgb; sizes 4, 2 and 1 fill lanes 0-3, 4-5 and 6 so they can be combined */
+ ".if \\size == 8\n"
+ "vld3.8 {d10, d11, d12}, [%[rgb]]!\n" /* de-interleave 8 pixels into d10, d11, d12 */
+ "pld [%[rgb], #128]\n" /* prefetch hint for upcoming input */
+ ".elseif \\size == 4\n"
+ "vld3.8 {d10[0], d11[0], d12[0]}, [%[rgb]]!\n"
+ "vld3.8 {d10[1], d11[1], d12[1]}, [%[rgb]]!\n"
+ "vld3.8 {d10[2], d11[2], d12[2]}, [%[rgb]]!\n"
+ "vld3.8 {d10[3], d11[3], d12[3]}, [%[rgb]]!\n"
+ ".elseif \\size == 2\n"
+ "vld3.8 {d10[4], d11[4], d12[4]}, [%[rgb]]!\n"
+ "vld3.8 {d10[5], d11[5], d12[5]}, [%[rgb]]!\n"
+ ".elseif \\size == 1\n"
+ "vld3.8 {d10[6], d11[6], d12[6]}, [%[rgb]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+ ".endm\n"
+ ".macro do_store size\n" /* stores 'size' results from the same lanes do_load filled, advancing u, v and y */
+ ".if \\size == 8\n"
+ "vst1.8 {d21}, [%[u]]!\n"
+ "vst1.8 {d22}, [%[v]]!\n"
+ "vst1.8 {d20}, [%[y]]!\n"
+ ".elseif \\size == 4\n"
+ "vst1.8 {d21[0]}, [%[u]]!\n"
+ "vst1.8 {d21[1]}, [%[u]]!\n"
+ "vst1.8 {d21[2]}, [%[u]]!\n"
+ "vst1.8 {d21[3]}, [%[u]]!\n"
+ "vst1.8 {d22[0]}, [%[v]]!\n"
+ "vst1.8 {d22[1]}, [%[v]]!\n"
+ "vst1.8 {d22[2]}, [%[v]]!\n"
+ "vst1.8 {d22[3]}, [%[v]]!\n"
+ "vst1.8 {d20[0]}, [%[y]]!\n"
+ "vst1.8 {d20[1]}, [%[y]]!\n"
+ "vst1.8 {d20[2]}, [%[y]]!\n"
+ "vst1.8 {d20[3]}, [%[y]]!\n"
+ ".elseif \\size == 2\n"
+ "vst1.8 {d21[4]}, [%[u]]!\n"
+ "vst1.8 {d21[5]}, [%[u]]!\n"
+ "vst1.8 {d22[4]}, [%[v]]!\n"
+ "vst1.8 {d22[5]}, [%[v]]!\n"
+ "vst1.8 {d20[4]}, [%[y]]!\n"
+ "vst1.8 {d20[5]}, [%[y]]!\n"
+ ".elseif \\size == 1\n"
+ "vst1.8 {d21[6]}, [%[u]]!\n"
+ "vst1.8 {d22[6]}, [%[v]]!\n"
+ "vst1.8 {d20[6]}, [%[y]]!\n"
+ ".else\n"
+ ".error \"unsupported macroblock size\"\n"
+ ".endif\n"
+ ".endm\n"
+
+ "vld1.16 {d0, d1, d2, d3}, [%[c]]\n" /* load constants */
+ "subs %[n], %[n], #8\n" /* n -= 8; negative means fewer than 8 pixels in total */
+ "blt 2f\n" /* so go straight to the tail handling */
+ "1:\n" /* main loop: 8 pixels per iteration */
+ "do_load 8\n"
+ "do_rgb_to_yuv\n"
+ "do_store 8\n"
+ "subs %[n], %[n], #8\n"
+ "bge 1b\n" /* repeat while at least 8 more pixels remain */
+ "tst %[n], #7\n" /* n is negative here; its low three bits hold the leftover count */
+ "beq 8f\n" /* no leftover pixels: done */
+ "2:\n" /* tail: gather 4, 2 and 1 leftover pixels into separate lanes */
+ "tst %[n], #4\n"
+ "beq 3f\n"
+ "do_load 4\n"
+ "3:\n"
+ "tst %[n], #2\n"
+ "beq 4f\n"
+ "do_load 2\n"
+ "4:\n"
+ "tst %[n], #1\n"
+ "beq 5f\n"
+ "do_load 1\n"
+ "5:\n"
+ "do_rgb_to_yuv\n" /* one pass converts every lane; lanes that were not loaded are never stored */
+ "tst %[n], #4\n" /* store back using the same bit tests as the loads */
+ "beq 6f\n"
+ "do_store 4\n"
+ "6:\n"
+ "tst %[n], #2\n"
+ "beq 7f\n"
+ "do_store 2\n"
+ "7:\n"
+ "tst %[n], #1\n"
+ "beq 8f\n"
+ "do_store 1\n"
+ "8:\n"
+ ".purgem do_load\n" /* remove the assembler macros defined above */
+ ".purgem do_rgb_to_yuv\n"
+ ".purgem do_store\n"
+ : [rgb] "+&r" (rgb), [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), /* read-write: the asm advances the pointers and changes n */
+ [n] "+&r" (n)
+ : [c] "r" (&c[0]) /* address of the constant table, read only */
+ : "cc", "memory", /* condition flags and memory are modified; every NEON d register is declared clobbered */
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+ "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+ "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+ "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
+}
+
+METHODDEF(void)
+rgb_ycc_convert_neon (j_compress_ptr cinfo,
+                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                      JDIMENSION output_row, int num_rows)
+{
+  /*
+   * Row driver for the NEON kernel: for each of the num_rows input rows,
+   * hand the source row and the matching row of each of the three output
+   * component planes (starting at output_row) to rgb24_to_yuv444_neon().
+   * A num_rows of zero or less converts nothing.
+   */
+  JDIMENSION width = cinfo->image_width;   /* same for every row */
+  int row;
+
+  for (row = 0; row < num_rows; row++) {
+    JSAMPROW src = input_buf[row];
+    JSAMPROW dst0 = output_buf[0][output_row + row];
+    JSAMPROW dst1 = output_buf[1][output_row + row];
+    JSAMPROW dst2 = output_buf[2][output_row + row];
+
+    rgb24_to_yuv444_neon(src, dst0, dst1, dst2, width);
+  }
+}
+
+#endif
+
METHODDEF(void)
rgb_ycc_convert (j_compress_ptr cinfo,
JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
@@ -517,6 +703,14 @@ jinit_color_converter (j_compress_ptr cinfo)
case JCS_YCbCr:
if (cinfo->num_components != 3)
ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
+#if defined(WITH_SIMD) && defined(__ARM_NEON__) && \
+ (RGB_PIXELSIZE == 3) && (BITS_IN_JSAMPLE == 8)
+ if (cinfo->in_color_space == JCS_RGB ||
+ cinfo->in_color_space == JCS_EXT_RGB) {
+ cconvert->pub.color_convert = rgb_ycc_convert_neon;
+ break;
+ }
+#endif
if (cinfo->in_color_space == JCS_RGB ||
cinfo->in_color_space == JCS_EXT_RGB ||
cinfo->in_color_space == JCS_EXT_RGBX ||