LibVideo/VP9: Clamp reference frame prediction coords outside loops

Moving the clamping of the reference frame sample coordinates, as well as some bounds checks, outside of the loop significantly reduces the number of branches needed in `predict_inter_block()`. This results in a whopping ~41% improvement in decode performance of an inter-prediction-heavy YouTube video (~35.4s -> ~20.8s).
author: Zaggy1024 <zaggy1024@gmail.com> 2023-04-13 14:03:02 -0500
committer: Tim Flynn <trflynn89@pm.me> 2023-04-14 07:11:45 -0400
commit: 08b90bb2d02c8e960ce271673346dede287370f8 (patch)
tree: 6ccada4f4d7724d10c39fa3589a50636701da25a /Userland/Libraries/LibVideo
parent: bc49af08b4b4ff1dffbb882d9ba0e79c53d9fdba (diff)
download: serenity-08b90bb2d02c8e960ce271673346dede287370f8.zip
1 files changed, 12 insertions, 5 deletions
diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.cpp b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
index 70f098742d..7c740b1425 100644
--- a/Userland/Libraries/LibVideo/VP9/Decoder.cpp
+++ b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
@@ -939,15 +939,19 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
         return intermediate_buffer[row * width + column];
     };
 
+    // Check our reference frame bounds before starting the loop.
+    reference_frame_buffer_at(scaled_bottom, scaled_right);
+
     for (auto row = 0u; row < intermediate_height; row++) {
+        auto clamped_row = static_cast<size_t>(clip_3(0, scaled_bottom, (offset_scaled_block_y >> 4) + static_cast<i32>(row) - 3));
+        u16 const* scan_line = &reference_frame_buffer_at(clamped_row, 0);
+
         for (auto column = 0u; column < width; column++) {
             auto samples_start = offset_scaled_block_x + static_cast<i32>(scaled_step_x * column);
 
             i32 accumulated_samples = 0;
             for (auto t = 0u; t < 8u; t++) {
-                auto sample = reference_frame_buffer_at(
-                    clip_3(0, scaled_bottom, (offset_scaled_block_y >> 4) + static_cast<i32>(row) - 3),
-                    clip_3(0, scaled_right, (samples_start >> 4) + static_cast<i32>(t) - 3));
+                auto sample = scan_line[clip_3(0, scaled_right, (samples_start >> 4) + static_cast<i32>(t) - 3)];
                 accumulated_samples += subpel_filters[block_context.interpolation_filter][samples_start & 15][t] * sample;
             }
             intermediate_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
@@ -957,11 +961,14 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
     for (auto row = 0u; row < height; row++) {
         for (auto column = 0u; column < width; column++) {
             auto samples_start = (offset_scaled_block_y & 15) + static_cast<i32>(scaled_step_y * row);
+            auto const* scan_column = &intermediate_buffer_at(samples_start >> 4, column);
+            auto const* subpel_filters_for_samples = subpel_filters[block_context.interpolation_filter][samples_start & 15];
 
             i32 accumulated_samples = 0;
             for (auto t = 0u; t < 8u; t++) {
-                auto sample = intermediate_buffer_at((samples_start >> 4) + t, column);
-                accumulated_samples += subpel_filters[block_context.interpolation_filter][samples_start & 15][t] * sample;
+                auto sample = *scan_column;
+                accumulated_samples += subpel_filters_for_samples[t] * sample;
+                scan_column += width;
             }
             block_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
         }
author	Zaggy1024 <zaggy1024@gmail.com>	2023-04-13 14:03:02 -0500
committer	Tim Flynn <trflynn89@pm.me>	2023-04-14 07:11:45 -0400
commit	08b90bb2d02c8e960ce271673346dede287370f8 (patch)
tree	6ccada4f4d7724d10c39fa3589a50636701da25a /Userland/Libraries/LibVideo
parent	bc49af08b4b4ff1dffbb882d9ba0e79c53d9fdba (diff)
download	serenity-08b90bb2d02c8e960ce271673346dede287370f8.zip