Optimise the blending algorithm on the Pandora.
The code now conditionally supports the ARM NEON intrinsics giving a nice speedup when blending surfaces on processors supporting this instruction set.
This commit is contained in:
parent
b5240e1bf9
commit
6fed611b12
3 changed files with 260 additions and 0 deletions
|
@ -202,6 +202,8 @@ Version 1.11.0-svn:
|
|||
* Changed: Add a small cache for reading files, giving a minor improvement.
|
||||
* Added: Helper tool to create images for unit tests.
|
||||
* Added: Image manipulation unit tests.
|
||||
* Changed: Rewrote the surface blending algorithm, and improving its speed
|
||||
using NEON intrinsics on processors supporting NEON (the Pandora).
|
||||
|
||||
Version 1.10.0:
|
||||
* Campaigns:
|
||||
|
|
225
src/neon.hpp
Normal file
225
src/neon.hpp
Normal file
|
@ -0,0 +1,225 @@
|
|||
/* $Id$ */
|
||||
/*
|
||||
Copyright (C) 2012 by Mark de Wever <koraq@xs4all.nl>
|
||||
Part of the Battle for Wesnoth Project http://www.wesnoth.org/
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY.
|
||||
|
||||
See the COPYING file for more details.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Helper class for ARM NEON support.
|
||||
*
|
||||
* When using g++ on an ARM that support the NEON it uses the gcc intrinsics
|
||||
* [1], for all other platforms an emulation is used. The emulation is based on
|
||||
* the RealView Compilation Tool Assembler Guide (ARM DUI 0204J (ID101213)) [2].
|
||||
* The emulation follows the latter convensions instead of the former. The
|
||||
* numbers in the sections refer to the section numers in [2].
|
||||
*
|
||||
* Not everything is implemented, only functions used are implemented.
|
||||
*
|
||||
* Common template parameters are:
|
||||
* * Td type of the destination.
|
||||
* * Tn type of the first operand.
|
||||
* * Tm type of the second operand.
|
||||
* * Ts type of the source (Tm and Tn).
|
||||
* * S number of vector elements.
|
||||
* * D number of matrix vectors.
|
||||
*
|
||||
* [1]
|
||||
* http://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/ARM-NEON-Intrinsics.html
|
||||
* [2]
|
||||
* http://infocenter.arm.com/help/topic/com.arm.doc.dui0204i/DUI0204I_rvct_assembler_guide.pdf
|
||||
*/
|
||||
|
||||
#ifndef NEON_HPP_INCLUDED
|
||||
#define NEON_HPP_INCLUDED
|
||||
|
||||
#if defined __GNUC__ && defined __ARM_NEON__
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#else
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
/***** ***** ***** ***** types ***** ***** ***** *****/
|
||||
|
||||
/**
|
||||
* Emulates a vector.
|
||||
*
|
||||
* Gcc also supports __attribute__ ((__vector_size__ (8))) but that only works
|
||||
* with gcc, _not_ with g++. It also is not portable.
|
||||
*
|
||||
* @tparam T The base type of the vector.
|
||||
* @tparam S The size of the vector.
|
||||
*/
|
||||
template<class T, unsigned S>
|
||||
struct tvector
|
||||
{
|
||||
const T&
|
||||
operator[](unsigned i) const
|
||||
{
|
||||
return data[i];
|
||||
}
|
||||
|
||||
T&
|
||||
operator[](unsigned i)
|
||||
{
|
||||
return data[i];
|
||||
}
|
||||
|
||||
T data[S];
|
||||
};
|
||||
|
||||
typedef tvector<uint8_t, 8> uint8x8_t;
|
||||
typedef tvector<uint16_t, 8> uint16x8_t;
|
||||
|
||||
/**
|
||||
* Emulates a matrix.
|
||||
*
|
||||
* The guide [2] doesn't use the term matrix, but uses various terms for it;
|
||||
* e.g. table in the VTBL instructions (5.8.9) and lanes in the VLDn
|
||||
* instructions (5.12).
|
||||
*/
|
||||
template<class T, unsigned S, unsigned D>
|
||||
struct tmatrix
|
||||
{
|
||||
tvector<T, S> val[D];
|
||||
};
|
||||
|
||||
typedef tmatrix<uint8_t, 8, 4> uint8x8x4_t;
|
||||
|
||||
|
||||
/***** ***** ***** ***** 5.8.3 VDUP ***** ***** ***** *****/
|
||||
|
||||
/* The imm is actually the Rm. */
|
||||
template<class Td, unsigned S>
|
||||
inline tvector<Td, S>
|
||||
vdup_n(Td imm)
|
||||
{
|
||||
tvector<Td, S> d;
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
d[i] = imm;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
inline uint16x8_t
|
||||
vdupq_n_u16(uint16_t imm)
|
||||
{
|
||||
return vdup_n<uint16_t, 8>(imm);
|
||||
}
|
||||
|
||||
inline uint8x8_t
|
||||
vdup_n_u8(uint8_t imm)
|
||||
{
|
||||
return vdup_n<uint8_t, 8>(imm);
|
||||
}
|
||||
|
||||
/***** ***** ***** ***** 5.9.3 VSHR ***** ***** ***** *****/
|
||||
|
||||
template<class Td, class Tm, unsigned S>
|
||||
inline tvector<Td, S>
|
||||
vshr(tvector<Tm, S> m, const unsigned imm)
|
||||
{
|
||||
tvector<Td, S> d;
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
d[i] = m[i] >> imm;
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
inline uint8x8_t
|
||||
vshrn_n_u16(uint16x8_t m, const unsigned imm)
|
||||
{
|
||||
return vshr<uint8_t, uint16_t, 8>(m, imm);
|
||||
}
|
||||
|
||||
/***** ***** ***** ***** 5.10.3 VADD ***** ***** ***** *****/
|
||||
|
||||
template<class Td, class Tn, class Tm, unsigned S>
|
||||
inline tvector<Td, S>
|
||||
vadd(tvector<Tn, S> n, tvector<Tm, S> m)
|
||||
{
|
||||
tvector<Td, S> d;
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
d[i] = n[i] + m[i];
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
inline uint16x8_t
|
||||
vaddq_u16(uint16x8_t n, uint16x8_t m)
|
||||
{
|
||||
return vadd<uint16_t, uint16_t, uint16_t, 8>(n, m);
|
||||
}
|
||||
|
||||
/***** ***** ***** ***** 5.11.1 VMUL ***** ***** ***** *****/
|
||||
|
||||
template<class Td, class Ts, unsigned S>
|
||||
inline tvector<Td, S>
|
||||
vmul(tvector<Ts, S> n, tvector<Ts, S> m)
|
||||
{
|
||||
tvector<Td, S> d;
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
d[i] = n[i] * m[i];
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
inline uint16x8_t
|
||||
vmull_u8(uint8x8_t n, uint8x8_t m)
|
||||
{
|
||||
return vmul<uint16_t, uint8_t, 8>(n, m);
|
||||
}
|
||||
|
||||
/***** ***** ***** ***** 5.12.3 VLDn and VSTn ***** ***** ***** *****/
|
||||
|
||||
template<class Td, unsigned S, unsigned D>
|
||||
inline tmatrix<Td, S, D>
|
||||
vld(Td* base)
|
||||
{
|
||||
tmatrix<Td, S, D> d;
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
for(unsigned j = 0; j < D; ++j) {
|
||||
d.val[j][i] = static_cast<Td>(base[i * D + j]);
|
||||
}
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
inline uint8x8x4_t
|
||||
vld4_u8(uint8_t* base)
|
||||
{
|
||||
return vld<uint8_t, 8, 4>(base);
|
||||
}
|
||||
|
||||
template<class Td, unsigned S, unsigned D>
|
||||
inline void
|
||||
vst(Td* base, tmatrix<Td, S, D> list)
|
||||
{
|
||||
for(unsigned i = 0; i < S; ++i) {
|
||||
for(unsigned j = 0; j < D; ++j) {
|
||||
base[i * D + j] = list.val[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
vst4_u8(uint8_t* base, uint8x8x4_t list)
|
||||
{
|
||||
vst<uint8_t, 8, 4>(base, list);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -23,6 +23,7 @@
|
|||
#include "sdl_utils.hpp"
|
||||
|
||||
#include "floating_point_emulation.hpp"
|
||||
#include "neon.hpp"
|
||||
#include "video.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -1488,6 +1489,38 @@ surface blend_surface(
|
|||
const Uint16 blue = ratio * static_cast<Uint8>(color);
|
||||
ratio = 256 - ratio;
|
||||
|
||||
#ifdef PANDORA
|
||||
/*
|
||||
* Use an optimised version of the generic algorithm. The optimised
|
||||
* version processes 8 pixels a time. If the number of pixels is not an
|
||||
* exact multiple of 8 it falls back to the generic algorithm to handle
|
||||
* the last pixels.
|
||||
*/
|
||||
uint16x8_t vred = vdupq_n_u16(red);
|
||||
uint16x8_t vgreen = vdupq_n_u16(green);
|
||||
uint16x8_t vblue = vdupq_n_u16(blue);
|
||||
|
||||
uint8x8_t vratio = vdup_n_u8(ratio);
|
||||
|
||||
const int div = (nsurf->w * surf->h) / 8;
|
||||
for(int i = 0; i < div; ++i, beg += 8) {
|
||||
uint8x8x4_t rgba = vld4_u8(reinterpret_cast<Uint8*>(beg));
|
||||
|
||||
uint16x8_t b = vmull_u8(rgba.val[0], vratio);
|
||||
uint16x8_t g = vmull_u8(rgba.val[1], vratio);
|
||||
uint16x8_t r = vmull_u8(rgba.val[2], vratio);
|
||||
|
||||
b = vaddq_u16(b, vblue);
|
||||
g = vaddq_u16(g, vgreen);
|
||||
r = vaddq_u16(r, vred);
|
||||
|
||||
rgba.val[0] = vshrn_n_u16(b, 8);
|
||||
rgba.val[1] = vshrn_n_u16(g, 8);
|
||||
rgba.val[2] = vshrn_n_u16(r, 8);
|
||||
|
||||
vst4_u8(reinterpret_cast<Uint8*>(beg), rgba);
|
||||
}
|
||||
#endif
|
||||
while(beg != end) {
|
||||
Uint8 a = static_cast<Uint8>(*beg >> 24);
|
||||
Uint8 r = (ratio * static_cast<Uint8>(*beg >> 16) + red) >> 8;
|
||||
|
|
Loading…
Add table
Reference in a new issue