Optimise the blending algorithm on the Pandora.

The code now conditionally uses ARM NEON intrinsics, giving a nice
speedup when blending surfaces on processors that support this
instruction set.
Mark de Wever 2012-03-31 14:54:41 +00:00
parent b5240e1bf9
commit 6fed611b12
3 changed files with 260 additions and 0 deletions
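
For reference, the per-channel operation that the NEON path speeds up can be sketched as below; this is an illustrative snippet only (blend_channel and its parameter names are not part of the commit), mirroring the scalar arithmetic visible in the blend_surface() hunk further down.

#include <stdint.h>

// Blend one 8-bit channel toward a fixed colour in 8.8 fixed point.
// 'weight' (assumed 1..255 here) is the weight of the blend colour; the
// source pixel keeps the remaining 256 - weight, so the result stays in 0..255.
inline uint8_t blend_channel(uint8_t source, uint8_t colour, unsigned weight)
{
    const unsigned colour_part = weight * colour;          // hoisted out of the pixel loop
    const unsigned source_part = (256 - weight) * source;  // computed per pixel
    return static_cast<uint8_t>((source_part + colour_part) >> 8);
}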


@@ -202,6 +202,8 @@ Version 1.11.0-svn:
* Changed: Add a small cache for reading files, giving a minor improvement.
* Added: Helper tool to create images for unit tests.
* Added: Image manipulation unit tests.
* Changed: Rewrote the surface blending algorithm, improving its speed
by using NEON intrinsics on processors supporting NEON (the Pandora).
Version 1.10.0:
* Campaigns:

src/neon.hpp (new file, 225 lines)

@@ -0,0 +1,225 @@
/* $Id$ */
/*
Copyright (C) 2012 by Mark de Wever <koraq@xs4all.nl>
Part of the Battle for Wesnoth Project http://www.wesnoth.org/
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY.
See the COPYING file for more details.
*/
/**
* @file
* Helper class for ARM NEON support.
*
 * When using g++ on an ARM processor that supports NEON, the gcc intrinsics
 * [1] are used; for all other platforms an emulation is used. The emulation
 * is based on the RealView Compilation Tools Assembler Guide (ARM DUI 0204J
 * (ID101213)) [2]. The emulation follows the latter's conventions instead of
 * the former's. The numbers in the section headers below refer to the section
 * numbers in [2].
*
 * Not everything is implemented; only the functions that are actually used
 * are implemented.
*
* Common template parameters are:
* * Td type of the destination.
* * Tn type of the first operand.
* * Tm type of the second operand.
* * Ts type of the source (Tm and Tn).
* * S number of vector elements.
* * D number of matrix vectors.
*
* [1]
* http://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/ARM-NEON-Intrinsics.html
* [2]
* http://infocenter.arm.com/help/topic/com.arm.doc.dui0204i/DUI0204I_rvct_assembler_guide.pdf
*/
#ifndef NEON_HPP_INCLUDED
#define NEON_HPP_INCLUDED
#if defined __GNUC__ && defined __ARM_NEON__
#include <arm_neon.h>
#else
#include <inttypes.h>
/***** ***** ***** ***** types ***** ***** ***** *****/
/**
* Emulates a vector.
*
 * GCC also supports __attribute__ ((__vector_size__ (8))), but that only works
 * with gcc, _not_ with g++. It is also not portable.
*
* @tparam T The base type of the vector.
* @tparam S The size of the vector.
*/
template<class T, unsigned S>
struct tvector
{
    const T&
    operator[](unsigned i) const
    {
        return data[i];
    }

    T&
    operator[](unsigned i)
    {
        return data[i];
    }

    T data[S];
};
typedef tvector<uint8_t, 8> uint8x8_t;
typedef tvector<uint16_t, 8> uint16x8_t;
/**
* Emulates a matrix.
*
* The guide [2] doesn't use the term matrix, but uses various terms for it;
* e.g. table in the VTBL instructions (5.8.9) and lanes in the VLDn
* instructions (5.12).
*/
template<class T, unsigned S, unsigned D>
struct tmatrix
{
    tvector<T, S> val[D];
};
typedef tmatrix<uint8_t, 8, 4> uint8x8x4_t;
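/*
 * Illustration (not part of the original commit): vld4_u8()/vst4_u8() below
 * de-interleave and re-interleave four-byte structures. Given 8 four-byte
 * pixels stored as x0 y0 z0 w0 x1 y1 z1 w1 ... x7 y7 z7 w7, vld4_u8() yields
 *
 *   val[0] = { x0, x1, ..., x7 }
 *   val[1] = { y0, y1, ..., y7 }
 *   val[2] = { z0, z1, ..., z7 }
 *   val[3] = { w0, w1, ..., w7 }
 *
 * and vst4_u8() writes such a matrix back in the interleaved order. This is
 * exactly what the emulated vld()/vst() further down implement.
 */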
/***** ***** ***** ***** 5.8.3 VDUP ***** ***** ***** *****/
/* The imm is actually the Rm. */
template<class Td, unsigned S>
inline tvector<Td, S>
vdup_n(Td imm)
{
    tvector<Td, S> d;
    for(unsigned i = 0; i < S; ++i) {
        d[i] = imm;
    }
    return d;
}

inline uint16x8_t
vdupq_n_u16(uint16_t imm)
{
    return vdup_n<uint16_t, 8>(imm);
}

inline uint8x8_t
vdup_n_u8(uint8_t imm)
{
    return vdup_n<uint8_t, 8>(imm);
}
/***** ***** ***** ***** 5.9.3 VSHR ***** ***** ***** *****/
template<class Td, class Tm, unsigned S>
inline tvector<Td, S>
vshr(tvector<Tm, S> m, const unsigned imm)
{
    tvector<Td, S> d;
    for(unsigned i = 0; i < S; ++i) {
        d[i] = m[i] >> imm;
    }
    return d;
}

inline uint8x8_t
vshrn_n_u16(uint16x8_t m, const unsigned imm)
{
    return vshr<uint8_t, uint16_t, 8>(m, imm);
}
/***** ***** ***** ***** 5.10.3 VADD ***** ***** ***** *****/
template<class Td, class Tn, class Tm, unsigned S>
inline tvector<Td, S>
vadd(tvector<Tn, S> n, tvector<Tm, S> m)
{
    tvector<Td, S> d;
    for(unsigned i = 0; i < S; ++i) {
        d[i] = n[i] + m[i];
    }
    return d;
}

inline uint16x8_t
vaddq_u16(uint16x8_t n, uint16x8_t m)
{
    return vadd<uint16_t, uint16_t, uint16_t, 8>(n, m);
}
/***** ***** ***** ***** 5.11.1 VMUL ***** ***** ***** *****/
template<class Td, class Ts, unsigned S>
inline tvector<Td, S>
vmul(tvector<Ts, S> n, tvector<Ts, S> m)
{
    tvector<Td, S> d;
    for(unsigned i = 0; i < S; ++i) {
        d[i] = n[i] * m[i];
    }
    return d;
}

inline uint16x8_t
vmull_u8(uint8x8_t n, uint8x8_t m)
{
    return vmul<uint16_t, uint8_t, 8>(n, m);
}
/***** ***** ***** ***** 5.12.3 VLDn and VSTn ***** ***** ***** *****/
template<class Td, unsigned S, unsigned D>
inline tmatrix<Td, S, D>
vld(Td* base)
{
    tmatrix<Td, S, D> d;
    for(unsigned i = 0; i < S; ++i) {
        for(unsigned j = 0; j < D; ++j) {
            d.val[j][i] = static_cast<Td>(base[i * D + j]);
        }
    }
    return d;
}

inline uint8x8x4_t
vld4_u8(uint8_t* base)
{
    return vld<uint8_t, 8, 4>(base);
}

template<class Td, unsigned S, unsigned D>
inline void
vst(Td* base, tmatrix<Td, S, D> list)
{
    for(unsigned i = 0; i < S; ++i) {
        for(unsigned j = 0; j < D; ++j) {
            base[i * D + j] = list.val[j][i];
        }
    }
}

inline void
vst4_u8(uint8_t* base, uint8x8x4_t list)
{
    vst<uint8_t, 8, 4>(base, list);
}
#endif /* !(defined __GNUC__ && defined __ARM_NEON__) */
#endif /* NEON_HPP_INCLUDED */
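
To illustrate how the header is meant to be used, here is a minimal, self-contained sketch that blends 8 interleaved four-byte pixels toward a fixed colour with the same sequence of intrinsics as the blend_surface() hunk below. blend8 and its parameter names are illustrative, not part of the commit; the snippet should compile against either the real intrinsics or the emulation above.

#include <stdint.h>
#include "neon.hpp"

// Blend 8 four-byte pixels in place. 'weight' (assumed 1..255) is the weight
// of the blend colour (c0, c1, c2); the source keeps 256 - weight. The fourth
// byte of every pixel (alpha in blend_surface) is left untouched.
inline void blend8(uint8_t* pixels, uint8_t c0, uint8_t c1, uint8_t c2, unsigned weight)
{
    // Broadcast the pre-multiplied colour terms and the source weight.
    const uint16x8_t vc0 = vdupq_n_u16(weight * c0);
    const uint16x8_t vc1 = vdupq_n_u16(weight * c1);
    const uint16x8_t vc2 = vdupq_n_u16(weight * c2);
    const uint8x8_t vratio = vdup_n_u8(256 - weight);

    uint8x8x4_t p = vld4_u8(pixels);              // de-interleave 8 pixels

    uint16x8_t ch0 = vmull_u8(p.val[0], vratio);  // widening multiply by the source weight
    uint16x8_t ch1 = vmull_u8(p.val[1], vratio);
    uint16x8_t ch2 = vmull_u8(p.val[2], vratio);

    ch0 = vaddq_u16(ch0, vc0);                    // add the pre-multiplied colour term
    ch1 = vaddq_u16(ch1, vc1);
    ch2 = vaddq_u16(ch2, vc2);

    p.val[0] = vshrn_n_u16(ch0, 8);               // shift back to 8 bits per channel
    p.val[1] = vshrn_n_u16(ch1, 8);
    p.val[2] = vshrn_n_u16(ch2, 8);

    vst4_u8(pixels, p);                           // re-interleave and store
}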


@@ -23,6 +23,7 @@
#include "sdl_utils.hpp"
#include "floating_point_emulation.hpp"
#include "neon.hpp"
#include "video.hpp"
#include <algorithm>
@@ -1488,6 +1489,38 @@ surface blend_surface(
    const Uint16 blue = ratio * static_cast<Uint8>(color);
    ratio = 256 - ratio;
#ifdef PANDORA
    /*
     * Use an optimised version of the generic algorithm. The optimised
     * version processes 8 pixels at a time. If the number of pixels is not
     * an exact multiple of 8 it falls back to the generic algorithm to
     * handle the last pixels.
     */
    uint16x8_t vred = vdupq_n_u16(red);
    uint16x8_t vgreen = vdupq_n_u16(green);
    uint16x8_t vblue = vdupq_n_u16(blue);

    uint8x8_t vratio = vdup_n_u8(ratio);

    const int div = (nsurf->w * surf->h) / 8;
    for(int i = 0; i < div; ++i, beg += 8) {
        // De-interleave 8 pixels into one vector per channel.
        uint8x8x4_t rgba = vld4_u8(reinterpret_cast<Uint8*>(beg));

        // Widening multiply of each colour channel by the source weight.
        uint16x8_t b = vmull_u8(rgba.val[0], vratio);
        uint16x8_t g = vmull_u8(rgba.val[1], vratio);
        uint16x8_t r = vmull_u8(rgba.val[2], vratio);

        // Add the pre-multiplied blend colour terms.
        b = vaddq_u16(b, vblue);
        g = vaddq_u16(g, vgreen);
        r = vaddq_u16(r, vred);

        // Shift back to 8 bits per channel; rgba.val[3] (alpha) is not modified.
        rgba.val[0] = vshrn_n_u16(b, 8);
        rgba.val[1] = vshrn_n_u16(g, 8);
        rgba.val[2] = vshrn_n_u16(r, 8);

        // Re-interleave and store the 8 blended pixels.
        vst4_u8(reinterpret_cast<Uint8*>(beg), rgba);
    }
#endif
    while(beg != end) {
        Uint8 a = static_cast<Uint8>(*beg >> 24);
        Uint8 r = (ratio * static_cast<Uint8>(*beg >> 16) + red) >> 8;