// Userland/Libraries/LibSoftGPU/Sampler.cpp

/*
 * Copyright (c) 2021, Stephan Unverwerth <s.unverwerth@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/SIMDExtras.h>
#include <AK/SIMDMath.h>
#include <LibSoftGPU/Config.h>
#include <LibSoftGPU/Image.h>
#include <LibSoftGPU/SIMD.h>
#include <LibSoftGPU/Sampler.h>
#include <math.h>

namespace SoftGPU {

using AK::SIMD::f32x4;
using AK::SIMD::i32x4;
using AK::SIMD::u32x4;

using AK::SIMD::clamp;
using AK::SIMD::expand4;
using AK::SIMD::floor_int_range;
using AK::SIMD::frac_int_range;
using AK::SIMD::maskbits;
using AK::SIMD::to_f32x4;
using AK::SIMD::to_i32x4;
using AK::SIMD::to_u32x4;

// Repeat wrapping: the result is whatever frac_int_range() yields for each lane of the coordinate.
static f32x4 wrap_repeat(f32x4 value)
{
    f32x4 const fractional_part = frac_int_range(value);
    return fractional_part;
}

// Clamps every lane of the coordinate to the closed range [0, 1].
// Only referenced by wrap() under CLAMP_DEPRECATED_BEHAVIOR, hence [[maybe_unused]].
[[maybe_unused]] static f32x4 wrap_clamp(f32x4 value)
{
    f32x4 const lower_bound = expand4(0.0f);
    f32x4 const upper_bound = expand4(1.0f);
    return clamp(value, lower_bound, upper_bound);
}

// Clamps every lane of the coordinate to [0.5 / num_texels, 1 - 0.5 / num_texels], i.e. half a
// texel (in normalized units) inside each end of the [0, 1] range.
static f32x4 wrap_clamp_to_edge(f32x4 value, f32x4 num_texels)
{
    f32x4 const half_texel = .5f / num_texels;
    f32x4 const upper_limit = 1.f - half_texel;
    return clamp(value, half_texel, upper_limit);
}

// Mirrored repeat: splits the coordinate into a floored part and the remainder, flips the
// remainder (1 - remainder) in lanes whose floored part is odd, and finally applies the
// clamp-to-edge limits to the result.
static f32x4 wrap_mirrored_repeat(f32x4 value, f32x4 num_texels)
{
    f32x4 whole_part = floor_int_range(value);
    f32x4 remainder = value - whole_part;

    // Non-zero in exactly those lanes whose floored part has its lowest bit set.
    auto odd_lanes = to_i32x4(whole_part) & 1;

    f32x4 mirrored = odd_lanes ? 1 - remainder : remainder;
    return wrap_clamp_to_edge(mirrored, num_texels);
}

// Applies the given texture wrap mode to a vector of normalized coordinates.
// `num_texels` is the texture size along the wrapped axis; only the edge-clamping modes use it.
static f32x4 wrap(f32x4 value, GPU::TextureWrapMode mode, f32x4 num_texels)
{
    switch (mode) {
    case GPU::TextureWrapMode::Repeat:
        return wrap_repeat(value);
    case GPU::TextureWrapMode::MirroredRepeat:
        return wrap_mirrored_repeat(value, num_texels);
    case GPU::TextureWrapMode::Clamp:
        // Clamp is either the plain [0, 1] clamp or an alias of ClampToEdge, decided at compile time.
        if constexpr (CLAMP_DEPRECATED_BEHAVIOR)
            return wrap_clamp(value);
        else
            return wrap_clamp_to_edge(value, num_texels);
    // ClampToBorder has no dedicated mapping here; it shares the ClampToEdge one.
    case GPU::TextureWrapMode::ClampToBorder:
    case GPU::TextureWrapMode::ClampToEdge:
        return wrap_clamp_to_edge(value, num_texels);
    default:
        VERIFY_NOT_REACHED();
    }
}

// Fetches one texel per lane from `image` and transposes the four results, so that each component
// of the returned Vector4 holds that component (x, y, z or w) of all four texels.
ALWAYS_INLINE static Vector4<f32x4> texel4(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z)
{
    // One fetch per lane, in lane order.
    auto lane0 = image.texel(layer[0], level[0], x[0], y[0], z[0]);
    auto lane1 = image.texel(layer[1], level[1], x[1], y[1], z[1]);
    auto lane2 = image.texel(layer[2], level[2], x[2], y[2], z[2]);
    auto lane3 = image.texel(layer[3], level[3], x[3], y[3], z[3]);

    // Transpose from "one vector per texel" to "one vector per component".
    f32x4 const all_x { lane0.x(), lane1.x(), lane2.x(), lane3.x() };
    f32x4 const all_y { lane0.y(), lane1.y(), lane2.y(), lane3.y() };
    f32x4 const all_z { lane0.z(), lane1.z(), lane2.z(), lane3.z() };
    f32x4 const all_w { lane0.w(), lane1.w(), lane2.w(), lane3.w() };

    return Vector4<f32x4> { all_x, all_y, all_z, all_w };
}

// Like texel4(), but any lane whose (x, y) lies outside the `w` x `h` extent yields `border`
// instead of a texel fetched from `image`.
//
// The coordinates are unsigned, so a "negative" index can only arrive here as a wrapped-around
// large value; the `>= w` / `>= h` tests therefore cover both sides of the valid range and no
// separate `< 0` test is needed (it would always be false for unsigned lanes).
ALWAYS_INLINE static Vector4<f32x4> texel4border(Image const& image, u32x4 layer, u32x4 level, u32x4 x, u32x4 y, u32x4 z, FloatVector4 const& border, u32x4 w, u32x4 h)
{
    // The code below treats bit n of the mask as "lane n is out of bounds".
    auto border_mask = maskbits(x >= w || y >= h);

    // The image is only accessed for in-bounds lanes.
    auto t0 = border_mask & 1 ? border : image.texel(layer[0], level[0], x[0], y[0], z[0]);
    auto t1 = border_mask & 2 ? border : image.texel(layer[1], level[1], x[1], y[1], z[1]);
    auto t2 = border_mask & 4 ? border : image.texel(layer[2], level[2], x[2], y[2], z[2]);
    auto t3 = border_mask & 8 ? border : image.texel(layer[3], level[3], x[3], y[3], z[3]);

    // Transpose from "one vector per texel" to "one vector per component".
    return Vector4<f32x4> {
        f32x4 { t0.x(), t1.x(), t2.x(), t3.x() },
        f32x4 { t0.y(), t1.y(), t2.y(), t3.y() },
        f32x4 { t0.z(), t1.z(), t2.z(), t3.z() },
        f32x4 { t0.w(), t1.w(), t2.w(), t3.w() },
    };
}

// Samples the bound image at the four normalized coordinates in `uv`.
//
// The magnification or minification filter and, if mipmapping is enabled, the mipmap level(s) are
// chosen from the derivatives of the texel coordinates (ddx / ddy). When no image is bound, the
// constant (1, 0, 0, 1) is returned in every lane.
Vector4<AK::SIMD::f32x4> Sampler::sample_2d(Vector2<AK::SIMD::f32x4> const& uv) const
{
    if (m_config.bound_image.is_null())
        return expand4(FloatVector4 { 1, 0, 0, 1 });

    auto const& image = *static_ptr_cast<Image>(m_config.bound_image);

    // FIXME: Make base level configurable with glTexParameteri(GL_TEXTURE_BASE_LEVEL, base_level)
    constexpr unsigned base_level = 0;

    // Determine the texture scale factor. See OpenGL 1.5 spec chapter 3.8.8.
    // FIXME: Static casting from u32 to float could silently truncate here.
    // u16 should be plenty enough for texture dimensions and would allow textures of up to 65536x65536x65536 pixels.
    float const base_width = static_cast<float>(image.level_width(base_level));
    float const base_height = static_cast<float>(image.level_height(base_level));

    auto scaled_uv = uv;
    scaled_uv.set_x(scaled_uv.x() * base_width);
    scaled_uv.set_y(scaled_uv.y() * base_height);

    // The scale factor is the larger of the two squared derivative lengths.
    auto horizontal_delta = ddx(scaled_uv);
    auto vertical_delta = ddy(scaled_uv);
    auto scale_factor = max(horizontal_delta.dot(horizontal_delta), vertical_delta.dot(vertical_delta));

    // FIXME: Here we simply determine the filter based on the single scale factor of the upper left pixel.
    // Actually, we could end up with different scale factors for each pixel. This however would break our
    // parallelisation as we could also end up with different filter modes per pixel.

    // Note: a scale factor of at most 1 is treated as texture magnification and always samples the base level.
    bool const is_magnifying = scale_factor[0] <= 1.f;
    if (is_magnifying)
        return sample_2d_lod(uv, expand4(base_level), m_config.texture_mag_filter);

    // Minification without mipmapping also samples the base level, but with the minification filter.
    bool const mipmapping_disabled = m_config.mipmap_filter == GPU::MipMapFilter::None;
    if (mipmapping_disabled)
        return sample_2d_lod(uv, expand4(base_level), m_config.texture_min_filter);

    // FIXME: add texture-level support for GL_TEXTURE_LOD_BIAS; below is only texture unit-level
    auto lod_bias = AK::clamp(m_config.level_of_detail_bias, -MAX_TEXTURE_LOD_BIAS, MAX_TEXTURE_LOD_BIAS);

    // FIXME: Instead of clamping to num_levels - 1, actually make the max mipmap level configurable with glTexParameteri(GL_TEXTURE_MAX_LEVEL, max_level)
    auto lowest_level = expand4(static_cast<float>(base_level));
    auto highest_level = expand4(static_cast<float>(image.num_levels()) - 1.f);

    // Halving the logarithm accounts for the scale factor being a squared length.
    auto lambda = log2_approximate(scale_factor) * .5f + lod_bias;
    auto level = clamp(lambda, lowest_level, highest_level);

    auto nearer_mip_texel = sample_2d_lod(uv, to_u32x4(level), m_config.texture_min_filter);

    if (m_config.mipmap_filter == GPU::MipMapFilter::Nearest)
        return nearer_mip_texel;

    // Otherwise blend with the next level (capped at the last one) by the fractional part of the level.
    auto next_level = min(level + 1.f, highest_level);
    auto farther_mip_texel = sample_2d_lod(uv, to_u32x4(next_level), m_config.texture_min_filter);

    return mix(nearer_mip_texel, farther_mip_texel, frac_int_range(level));
}

// Samples the bound image at an explicit, per-lane mipmap `level` using `filter`.
//
// `uv` holds four normalized texture coordinates. Each one is passed through wrap() for the
// configured U/V wrap mode and scaled by the dimensions of its lane's mipmap level. With the
// Nearest filter a single texel per lane is returned; any other filter is handled as bilinear:
// the 2x2 texels around each sample point are fetched and blended by the fractional position.
//
// The bound image is dereferenced unconditionally, so the caller must ensure one is bound.
Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const& uv, AK::SIMD::u32x4 level, GPU::TextureFilter filter) const
{
    auto const& image = *static_ptr_cast<Image>(m_config.bound_image);

    // Layer and z coordinate are fixed at 0 for every lane.
    u32x4 const layer = expand4(0u);
    u32x4 const k = expand4(0u);

    // Each lane may address a different mipmap level, so look up the dimensions per lane.
    u32x4 const width = {
        image.level_width(level[0]),
        image.level_width(level[1]),
        image.level_width(level[2]),
        image.level_width(level[3]),
    };
    u32x4 const height = {
        image.level_height(level[0]),
        image.level_height(level[1]),
        image.level_height(level[2]),
        image.level_height(level[3]),
    };

    auto f_width = to_f32x4(width);
    auto f_height = to_f32x4(height);

    // Only meaningful as bit masks when the corresponding dimension is a power of two.
    u32x4 width_mask = width - 1;
    u32x4 height_mask = height - 1;

    // Wrapped coordinates in texel units of the selected level.
    f32x4 u = wrap(uv.x(), m_config.texture_wrap_u, f_width) * f_width;
    f32x4 v = wrap(uv.y(), m_config.texture_wrap_v, f_height) * f_height;

    if (filter == GPU::TextureFilter::Nearest) {
        u32x4 i = to_u32x4(u);
        u32x4 j = to_u32x4(v);

        // Fold indices back into the level's extent regardless of wrap mode.
        i = image.width_is_power_of_two() ? i & width_mask : i % width;
        j = image.height_is_power_of_two() ? j & height_mask : j % height;

        return texel4(image, layer, level, i, j, k);
    }

    // Shift by half a texel so that flooring yields the lower-left texel of the 2x2 neighbourhood.
    u -= 0.5f;
    v -= 0.5f;

    u32x4 i0 = to_u32x4(floor_int_range(u));
    u32x4 i1 = i0 + 1;
    u32x4 j0 = to_u32x4(floor_int_range(v));
    u32x4 j1 = j0 + 1;

    // With Repeat wrapping, u/v can be as low as -0.5 here, so the floored index can be -1.
    // NOTE(review): like the power-of-two mask path, the modulo path below assumes to_u32x4()
    // turns that -1 into 0xFFFFFFFF (unsigned wrap-around) - confirm for all supported targets.
    if (m_config.texture_wrap_u == GPU::TextureWrapMode::Repeat) {
        if (image.width_is_power_of_two()) {
            i0 = i0 & width_mask;
            i1 = i1 & width_mask;
        } else {
            // A plain `0xFFFFFFFF % width` is generally not `width - 1` for non-power-of-two
            // widths (e.g. it is 0 for width 3). Adding `width` first wraps the index back to
            // `width - 1` and leaves in-range indices unchanged after the modulo.
            i0 = (i0 + width) % width;
            i1 = i1 % width;
        }
    }

    if (m_config.texture_wrap_v == GPU::TextureWrapMode::Repeat) {
        if (image.height_is_power_of_two()) {
            j0 = j0 & height_mask;
            j1 = j1 & height_mask;
        } else {
            // Same wrap-around correction as for the U axis above.
            j0 = (j0 + height) % height;
            j1 = j1 % height;
        }
    }

    Vector4<f32x4> t0, t1, t2, t3;

    if (m_config.texture_wrap_u == GPU::TextureWrapMode::Repeat && m_config.texture_wrap_v == GPU::TextureWrapMode::Repeat) {
        // Both axes were folded into range above, so no bounds checks are needed.
        t0 = texel4(image, layer, level, i0, j0, k);
        t1 = texel4(image, layer, level, i1, j0, k);
        t2 = texel4(image, layer, level, i0, j1, k);
        t3 = texel4(image, layer, level, i1, j1, k);
    } else {
        // At least one axis may index outside the level; such lanes read the border color instead.
        t0 = texel4border(image, layer, level, i0, j0, k, m_config.border_color, width, height);
        t1 = texel4border(image, layer, level, i1, j0, k, m_config.border_color, width, height);
        t2 = texel4border(image, layer, level, i0, j1, k, m_config.border_color, width, height);
        t3 = texel4border(image, layer, level, i1, j1, k, m_config.border_color, width, height);
    }

    // Bilinear blend: first along U within each row, then along V between the rows.
    f32x4 const alpha = frac_int_range(u);
    f32x4 const beta = frac_int_range(v);

    auto const lerp_0 = mix(t0, t1, alpha);
    auto const lerp_1 = mix(t2, t3, alpha);
    return mix(lerp_0, lerp_1, beta);
}

}