Просмотр исходного кода

LibSoftGPU: Vectorize texture sampling and shading

Stephan Unverwerth 3 года назад
Родитель
Коммит
034dc480d2

+ 1 - 0
Userland/Libraries/LibSoftGPU/CMakeLists.txt

@@ -6,5 +6,6 @@ set(SOURCES
     Sampler.cpp
 )
 
+add_compile_options(-Wno-psabi)
 serenity_lib(LibSoftGPU softgpu)
 target_link_libraries(LibSoftGPU LibM LibCore LibGfx)

+ 67 - 46
Userland/Libraries/LibSoftGPU/Device.cpp

@@ -6,12 +6,16 @@
  */
 
 #include <AK/Function.h>
+#include <AK/SIMDExtras.h>
+#include <AK/SIMDMath.h>
 #include <LibCore/ElapsedTimer.h>
 #include <LibGfx/Painter.h>
 #include <LibGfx/Vector2.h>
 #include <LibGfx/Vector3.h>
 #include <LibSoftGPU/Config.h>
 #include <LibSoftGPU/Device.h>
+#include <LibSoftGPU/PixelQuad.h>
+#include <LibSoftGPU/SIMD.h>
 
 namespace SoftGPU {
 
@@ -24,13 +28,17 @@ static long long g_num_sampler_calls;
 using IntVector2 = Gfx::Vector2<int>;
 using IntVector3 = Gfx::Vector3<int>;
 
+using AK::SIMD::exp;
+using AK::SIMD::expand4;
+using AK::SIMD::f32x4;
+
 constexpr static int edge_function(const IntVector2& a, const IntVector2& b, const IntVector2& c)
 {
     return ((c.x() - a.x()) * (b.y() - a.y()) - (c.y() - a.y()) * (b.x() - a.x()));
 }
 
-template<typename T>
-constexpr static T interpolate(const T& v0, const T& v1, const T& v2, const FloatVector3& barycentric_coords)
+template<typename T, typename U>
+constexpr static auto interpolate(const T& v0, const T& v1, const T& v2, const Vector3<U>& barycentric_coords)
 {
     return v0 * barycentric_coords.x() + v1 * barycentric_coords.y() + v2 * barycentric_coords.z();
 }
@@ -369,47 +377,56 @@ static void rasterize_triangle(const RasterizerOptions& options, Gfx::Bitmap& re
 
             // Draw the pixels according to the previously generated mask
             auto coords = b0;
-            for (int y = 0; y < RASTERIZER_BLOCK_SIZE; y++, coords += step_y) {
-                if (pixel_mask[y] == 0) {
-                    coords += dbdx * RASTERIZER_BLOCK_SIZE;
-                    continue;
-                }
+            for (int y = 0; y < RASTERIZER_BLOCK_SIZE; y += 2, coords += step_y + dbdy) {
+                for (int x = 0; x < RASTERIZER_BLOCK_SIZE; x += 2, coords += dbdx + dbdx) {
 
-                auto* pixel = pixel_staging[y];
-                for (int x = 0; x < RASTERIZER_BLOCK_SIZE; x++, coords += dbdx, pixel++) {
-                    if (~pixel_mask[y] & (1 << x))
-                        continue;
+                    PixelQuad quad;
+
+                    auto a = coords;
+                    auto b = coords + dbdx;
+                    auto c = coords + dbdy;
+                    auto d = coords + dbdx + dbdy;
 
                     // Perspective correct barycentric coordinates
-                    auto barycentric = FloatVector3(coords.x(), coords.y(), coords.z()) * one_over_area;
-                    auto const w_coordinates = FloatVector3 {
-                        vertex0.window_coordinates.w(),
-                        vertex1.window_coordinates.w(),
-                        vertex2.window_coordinates.w(),
+                    auto barycentric = Vector3<f32x4> {
+                        f32x4 { float(a.x()), float(b.x()), float(c.x()), float(d.x()) },
+                        f32x4 { float(a.y()), float(b.y()), float(c.y()), float(d.y()) },
+                        f32x4 { float(a.z()), float(b.z()), float(c.z()), float(d.z()) },
+                    } * one_over_area;
+
+                    auto const w_coordinates = Vector3<f32x4> {
+                        expand4(vertex0.window_coordinates.w()),
+                        expand4(vertex1.window_coordinates.w()),
+                        expand4(vertex2.window_coordinates.w()),
                     };
-                    float const interpolated_reciprocal_w = interpolate(w_coordinates.x(), w_coordinates.y(), w_coordinates.z(), barycentric);
-                    float const interpolated_w = 1 / interpolated_reciprocal_w;
+
+                    auto const interpolated_reciprocal_w = interpolate(w_coordinates.x(), w_coordinates.y(), w_coordinates.z(), barycentric);
+                    auto const interpolated_w = 1.0f / interpolated_reciprocal_w;
                     barycentric = barycentric * w_coordinates * interpolated_w;
 
                     // FIXME: make this more generic. We want to interpolate more than just color and uv
-                    FloatVector4 vertex_color;
                     if (options.shade_smooth) {
-                        vertex_color = interpolate(vertex0.color, vertex1.color, vertex2.color, barycentric);
+                        quad.vertex_color = interpolate(expand4(vertex0.color), expand4(vertex1.color), expand4(vertex2.color), barycentric);
                     } else {
-                        vertex_color = vertex0.color;
+                        quad.vertex_color = expand4(vertex0.color);
                     }
 
-                    auto uv = interpolate(vertex0.tex_coord, vertex1.tex_coord, vertex2.tex_coord, barycentric);
+                    quad.uv = interpolate(expand4(vertex0.tex_coord), expand4(vertex1.tex_coord), expand4(vertex2.tex_coord), barycentric);
 
                     // Calculate depth of fragment for fog
                     //
                     // OpenGL 1.5 spec chapter 3.10: "An implementation may choose to approximate the
                     // eye-coordinate distance from the eye to each fragment center by |Ze|."
 
-                    float fog_fragment_depth = interpolate(vertex0_eye_absz, vertex1_eye_absz, vertex2_eye_absz, barycentric);
+                    quad.fog_depth = interpolate(expand4(vertex0_eye_absz), expand4(vertex1_eye_absz), expand4(vertex2_eye_absz), barycentric);
+
+                    pixel_shader(quad);
 
-                    *pixel = pixel_shader(uv, vertex_color, fog_fragment_depth);
                     INCREASE_STATISTICS_COUNTER(g_num_pixels_shaded, 1);
+                    pixel_staging[y][x] = { quad.out_color.x()[0], quad.out_color.y()[0], quad.out_color.z()[0], quad.out_color.w()[0] };
+                    pixel_staging[y][x + 1] = { quad.out_color.x()[1], quad.out_color.y()[1], quad.out_color.z()[1], quad.out_color.w()[1] };
+                    pixel_staging[y + 1][x] = { quad.out_color.x()[2], quad.out_color.y()[2], quad.out_color.z()[2], quad.out_color.w()[2] };
+                    pixel_staging[y + 1][x + 1] = { quad.out_color.x()[3], quad.out_color.y()[3], quad.out_color.z()[3], quad.out_color.w()[3] };
                 }
             }
 
@@ -797,29 +814,29 @@ void Device::draw_primitives(PrimitiveType primitive_type, FloatMatrix4x4 const&
 
 void Device::submit_triangle(const Triangle& triangle, Vector<size_t> const& enabled_texture_units)
 {
-    rasterize_triangle(m_options, *m_render_target, *m_depth_buffer, triangle, [this, &enabled_texture_units](FloatVector4 const& uv, FloatVector4 const& color, float fog_depth) -> FloatVector4 {
-        FloatVector4 fragment = color;
+    rasterize_triangle(m_options, *m_render_target, *m_depth_buffer, triangle, [this, &enabled_texture_units](PixelQuad& quad) {
+        quad.out_color = quad.vertex_color;
 
         for (size_t i : enabled_texture_units) {
             // FIXME: implement GL_TEXTURE_1D, GL_TEXTURE_3D and GL_TEXTURE_CUBE_MAP
             auto const& sampler = m_samplers[i];
 
-            FloatVector4 texel = sampler.sample_2d({ uv.x(), uv.y() });
+            auto texel = sampler.sample_2d({ quad.uv.x(), quad.uv.y() });
             INCREASE_STATISTICS_COUNTER(g_num_sampler_calls, 1);
 
             // FIXME: Implement more blend modes
             switch (sampler.config().fixed_function_texture_env_mode) {
             case TextureEnvMode::Modulate:
-                fragment = fragment * texel;
+                quad.out_color = quad.out_color * texel;
                 break;
             case TextureEnvMode::Replace:
-                fragment = texel;
+                quad.out_color = texel;
                 break;
             case TextureEnvMode::Decal: {
-                float src_alpha = fragment.w();
-                fragment.set_x(mix(fragment.x(), texel.x(), src_alpha));
-                fragment.set_y(mix(fragment.y(), texel.y(), src_alpha));
-                fragment.set_z(mix(fragment.z(), texel.z(), src_alpha));
+                auto src_alpha = quad.out_color.w();
+                quad.out_color.set_x(mix(quad.out_color.x(), texel.x(), src_alpha));
+                quad.out_color.set_y(mix(quad.out_color.y(), texel.y(), src_alpha));
+                quad.out_color.set_z(mix(quad.out_color.z(), texel.z(), src_alpha));
                 break;
             }
             default:
@@ -829,29 +846,33 @@ void Device::submit_triangle(const Triangle& triangle, Vector<size_t> const& ena
 
         // Calculate fog
         // Math from here: https://opengl-notes.readthedocs.io/en/latest/topics/texturing/aliasing.html
+
+        // FIXME: exponential fog is not vectorized, we should add a SIMD exp function that calculates an approximation.
         if (m_options.fog_enabled) {
-            float factor = 0.0f;
+            auto factor = expand4(0.0f);
             switch (m_options.fog_mode) {
             case FogMode::Linear:
-                factor = (m_options.fog_end - fog_depth) / (m_options.fog_end - m_options.fog_start);
-                break;
-            case FogMode::Exp:
-                factor = expf(-m_options.fog_density * fog_depth);
-                break;
-            case FogMode::Exp2:
-                factor = expf(-((m_options.fog_density * fog_depth) * (m_options.fog_density * fog_depth)));
+                factor = (m_options.fog_end - quad.fog_depth) / (m_options.fog_end - m_options.fog_start);
                 break;
+            case FogMode::Exp: {
+                auto argument = -m_options.fog_density * quad.fog_depth;
+                factor = exp(argument);
+            } break;
+            case FogMode::Exp2: {
+                auto argument = m_options.fog_density * quad.fog_depth;
+                argument *= -argument;
+                factor = exp(argument);
+            } break;
             default:
                 VERIFY_NOT_REACHED();
             }
 
             // Mix texel's RGB with fog's RGB - leave alpha alone
-            fragment.set_x(mix(m_options.fog_color.x(), fragment.x(), factor));
-            fragment.set_y(mix(m_options.fog_color.y(), fragment.y(), factor));
-            fragment.set_z(mix(m_options.fog_color.z(), fragment.z(), factor));
+            auto fog_color = expand4(m_options.fog_color);
+            quad.out_color.set_x(mix(fog_color.x(), quad.out_color.x(), factor));
+            quad.out_color.set_y(mix(fog_color.y(), quad.out_color.y(), factor));
+            quad.out_color.set_z(mix(fog_color.z(), quad.out_color.z(), factor));
         }
-
-        return fragment;
     });
 }
 

+ 106 - 50
Userland/Libraries/LibSoftGPU/Sampler.cpp

@@ -4,43 +4,55 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/SIMDExtras.h>
+#include <AK/SIMDMath.h>
 #include <LibSoftGPU/Config.h>
 #include <LibSoftGPU/Image.h>
+#include <LibSoftGPU/SIMD.h>
 #include <LibSoftGPU/Sampler.h>
 #include <math.h>
 
 namespace SoftGPU {
 
-static constexpr float fracf(float value)
+using AK::SIMD::f32x4;
+using AK::SIMD::i32x4;
+using AK::SIMD::u32x4;
+
+using AK::SIMD::clamp;
+using AK::SIMD::expand4;
+using AK::SIMD::floor_int_range;
+using AK::SIMD::frac_int_range;
+using AK::SIMD::maskbits;
+using AK::SIMD::to_f32x4;
+using AK::SIMD::to_i32x4;
+using AK::SIMD::to_u32x4;
+using AK::SIMD::truncate_int_range;
+
+static f32x4 wrap_repeat(f32x4 value)
 {
-    return value - floorf(value);
+    return frac_int_range(value);
 }
 
-static constexpr float wrap_repeat(float value)
+[[maybe_unused]] static f32x4 wrap_clamp(f32x4 value)
 {
-    return fracf(value);
+    return clamp(value, expand4(0.0f), expand4(1.0f));
 }
 
-[[maybe_unused]] static constexpr float wrap_clamp(float value)
+static f32x4 wrap_clamp_to_edge(f32x4 value, u32x4 num_texels)
 {
-    return clamp(value, 0.0f, 1.0f);
-}
-
-static constexpr float wrap_clamp_to_edge(float value, unsigned num_texels)
-{
-    float const clamp_limit = 1.f / (2 * num_texels);
+    f32x4 const clamp_limit = 1.f / to_f32x4(2 * num_texels);
     return clamp(value, clamp_limit, 1.0f - clamp_limit);
 }
 
-static constexpr float wrap_mirrored_repeat(float value, unsigned num_texels)
+static f32x4 wrap_mirrored_repeat(f32x4 value, u32x4 num_texels)
 {
-    float integer = floorf(value);
-    float frac = value - integer;
-    bool iseven = fmodf(integer, 2.0f) == 0.0f;
-    return wrap_clamp_to_edge(iseven ? frac : 1 - frac, num_texels);
+    f32x4 integer = floor_int_range(value);
+    f32x4 frac = value - integer;
+    auto is_odd = to_i32x4(integer) & 1;
+    return wrap_clamp_to_edge(is_odd ? 1 - frac : frac, num_texels);
 }
 
-static constexpr float wrap(float value, TextureWrapMode mode, unsigned num_texels)
+static f32x4 wrap(f32x4 value, TextureWrapMode mode, u32x4 num_texels)
 {
     switch (mode) {
     case TextureWrapMode::Repeat:
@@ -60,59 +72,103 @@ static constexpr float wrap(float value, TextureWrapMode mode, unsigned num_texe
     }
 }
 
-FloatVector4 Sampler::sample_2d(FloatVector2 const& uv) const
+ALWAYS_INLINE static Vector4<f32x4> texel4(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z)
+{
+    auto t0 = image.texel(layer[0], level[0], x[0], y[0], z[0]);
+    auto t1 = image.texel(layer[1], level[1], x[1], y[1], z[1]);
+    auto t2 = image.texel(layer[2], level[2], x[2], y[2], z[2]);
+    auto t3 = image.texel(layer[3], level[3], x[3], y[3], z[3]);
+
+    return Vector4<f32x4> {
+        f32x4 { t0.x(), t1.x(), t2.x(), t3.x() },
+        f32x4 { t0.y(), t1.y(), t2.y(), t3.y() },
+        f32x4 { t0.z(), t1.z(), t2.z(), t3.z() },
+        f32x4 { t0.w(), t1.w(), t2.w(), t3.w() },
+    };
+}
+
+ALWAYS_INLINE static Vector4<f32x4> texel4border(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z, FloatVector4 const& border, u32x4 w, u32x4 h)
+{
+    auto border_mask = maskbits(x < 0 || x >= w || y < 0 || y >= h);
+
+    auto t0 = border_mask & 1 ? border : image.texel(layer[0], level[0], x[0], y[0], z[0]);
+    auto t1 = border_mask & 2 ? border : image.texel(layer[1], level[1], x[1], y[1], z[1]);
+    auto t2 = border_mask & 4 ? border : image.texel(layer[2], level[2], x[2], y[2], z[2]);
+    auto t3 = border_mask & 8 ? border : image.texel(layer[3], level[3], x[3], y[3], z[3]);
+
+    return Vector4<f32x4> {
+        f32x4 { t0.x(), t1.x(), t2.x(), t3.x() },
+        f32x4 { t0.y(), t1.y(), t2.y(), t3.y() },
+        f32x4 { t0.z(), t1.z(), t2.z(), t3.z() },
+        f32x4 { t0.w(), t1.w(), t2.w(), t3.w() },
+    };
+}
+
+Vector4<AK::SIMD::f32x4> Sampler::sample_2d(Vector2<AK::SIMD::f32x4> const& uv) const
 {
     if (m_config.bound_image.is_null())
-        return { 0, 0, 0, 1 };
+        return expand4(FloatVector4 { 1, 0, 0, 1 });
 
     auto const& image = *m_config.bound_image;
 
-    unsigned const layer = 0;
+    u32x4 const layer = expand4(0u);
     // FIXME: calculate actual mipmap level to use
-    unsigned const level = 0;
-
-    unsigned width = image.level_width(level);
-    unsigned height = image.level_height(level);
-
-    float s = wrap(uv.x(), m_config.texture_wrap_u, width);
-    float t = wrap(uv.y(), m_config.texture_wrap_v, height);
-
-    float u = s * width;
-    float v = t * height;
+    u32x4 const level = expand4(0u);
+
+    u32x4 const width = {
+        image.level_width(level[0]),
+        image.level_width(level[1]),
+        image.level_width(level[2]),
+        image.level_width(level[3]),
+    };
+    u32x4 const height = {
+        image.level_height(level[0]),
+        image.level_height(level[1]),
+        image.level_height(level[2]),
+        image.level_height(level[3]),
+    };
+
+    f32x4 s = wrap(uv.x(), m_config.texture_wrap_u, width);
+    f32x4 t = wrap(uv.y(), m_config.texture_wrap_v, height);
+
+    f32x4 u = s * to_f32x4(width);
+    f32x4 v = t * to_f32x4(height);
 
     if (m_config.texture_mag_filter == TextureFilter::Nearest) {
-        unsigned i = min(static_cast<unsigned>(u), width - 1);
-        unsigned j = min(static_cast<unsigned>(v), height - 1);
-        return image.texel(layer, level, i, j, 0);
+        u32x4 i = to_i32x4(u) % width;
+        u32x4 j = to_i32x4(v) % height;
+        u32x4 k = expand4(0u);
+
+        return texel4(image, layer, level, i, j, k);
     }
 
     u -= 0.5f;
     v -= 0.5f;
 
-    int i0 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? static_cast<unsigned>(floorf(u)) % width : floorf(u);
-    int j0 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? static_cast<unsigned>(floorf(v)) % height : floorf(v);
+    i32x4 i0 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? to_i32x4(to_u32x4(floor_int_range(u)) % width) : to_i32x4(floor_int_range(u));
+    i32x4 j0 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? to_i32x4(to_u32x4(floor_int_range(v)) % height) : to_i32x4(floor_int_range(v));
+
+    i32x4 i1 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? to_i32x4((i0 + 1) % width) : i0 + 1;
+    i32x4 j1 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? to_i32x4((j0 + 1) % height) : j0 + 1;
 
-    int i1 = m_config.texture_wrap_u == TextureWrapMode::Repeat ? (i0 + 1) % width : i0 + 1;
-    int j1 = m_config.texture_wrap_v == TextureWrapMode::Repeat ? (j0 + 1) % height : j0 + 1;
+    u32x4 k = expand4(0u);
 
-    FloatVector4 t0, t1, t2, t3;
+    Vector4<f32x4> t0, t1, t2, t3;
 
     if (m_config.texture_wrap_u == TextureWrapMode::Repeat && m_config.texture_wrap_v == TextureWrapMode::Repeat) {
-        t0 = image.texel(layer, level, i0, j0, 0);
-        t1 = image.texel(layer, level, i1, j0, 0);
-        t2 = image.texel(layer, level, i0, j1, 0);
-        t3 = image.texel(layer, level, i1, j1, 0);
+        t0 = texel4(image, layer, level, to_u32x4(i0), to_u32x4(j0), k);
+        t1 = texel4(image, layer, level, to_u32x4(i1), to_u32x4(j0), k);
+        t2 = texel4(image, layer, level, to_u32x4(i0), to_u32x4(j1), k);
+        t3 = texel4(image, layer, level, to_u32x4(i1), to_u32x4(j1), k);
     } else {
-        int w = static_cast<int>(width);
-        int h = static_cast<int>(height);
-        t0 = (i0 < 0 || i0 >= w || j0 < 0 || j0 >= h) ? m_config.border_color : image.texel(layer, level, i0, j0, 0);
-        t1 = (i1 < 0 || i1 >= w || j0 < 0 || j0 >= h) ? m_config.border_color : image.texel(layer, level, i1, j0, 0);
-        t2 = (i0 < 0 || i0 >= w || j1 < 0 || j1 >= h) ? m_config.border_color : image.texel(layer, level, i0, j1, 0);
-        t3 = (i1 < 0 || i1 >= w || j1 < 0 || j1 >= h) ? m_config.border_color : image.texel(layer, level, i1, j1, 0);
+        t0 = texel4border(image, layer, level, to_u32x4(i0), to_u32x4(j0), k, m_config.border_color, width, height);
+        t1 = texel4border(image, layer, level, to_u32x4(i1), to_u32x4(j0), k, m_config.border_color, width, height);
+        t2 = texel4border(image, layer, level, to_u32x4(i0), to_u32x4(j1), k, m_config.border_color, width, height);
+        t3 = texel4border(image, layer, level, to_u32x4(i1), to_u32x4(j1), k, m_config.border_color, width, height);
     }
 
-    float const alpha = fracf(u);
-    float const beta = fracf(v);
+    f32x4 const alpha = frac_int_range(u);
+    f32x4 const beta = frac_int_range(v);
 
     auto const lerp_0 = mix(t0, t1, alpha);
     auto const lerp_1 = mix(t2, t3, alpha);

+ 2 - 1
Userland/Libraries/LibSoftGPU/Sampler.h

@@ -7,6 +7,7 @@
 #pragma once
 
 #include <AK/RefPtr.h>
+#include <AK/SIMD.h>
 #include <LibGfx/Vector2.h>
 #include <LibGfx/Vector4.h>
 #include <LibSoftGPU/Image.h>
@@ -52,7 +53,7 @@ struct SamplerConfig final {
 
 class Sampler final {
 public:
-    FloatVector4 sample_2d(FloatVector2 const& uv) const;
+    Vector4<AK::SIMD::f32x4> sample_2d(Vector2<AK::SIMD::f32x4> const& uv) const;
 
     void set_config(SamplerConfig const& config) { m_config = config; }
     SamplerConfig const& config() const { return m_config; }