commit 8ad0dff5c2bf687a0c9cb2739ddeca88602367f6
parent 8cd72ad1ed00ea09133284d71e662429e5b21be3
tree   1d52aca3ddc7de2396308ce2da56cf63cdf4e7c7
author    Zaggy1024 <zaggy1024@gmail.com>  2023-04-16 08:39:05 -0500
committer Tim Flynn <trflynn89@pm.me>      2023-04-25 17:44:36 -0400
LibVideo/VP9: Implement unscaled fast paths in inter prediction
Inter-prediction convolution filters are selected based on the subpixel position determined for the motion vector relative to the block being predicted. The subpixel position 0 uses only a single sample in the center of the convolution, not averaging any other samples. Let's call this a copy.

Reference frames can also be a different size relative to the frame being predicted, but in almost every case, that scale will be 1:1 for every frame in a video.

Taking these facts into account, we can create multiple fast paths for inter prediction. These fast paths are only active when scaling is 1:1.

If we are doing a copy in both dimensions, we can do a straight memcpy from the reference frame to the output block buffer. In videos where there is no motion, this is a dramatic speedup.

If we are doing a copy in only one dimension, we can do just one convolution, averaging directly into the output block buffer.

If we aren't doing a copy in either dimension, we can still cut a few operations out of the convolution loops, since we only need to advance our samples by whole pixels instead of subpixels.

These fast paths result in about a 34% improvement (~31.2s -> ~20.6s) in a video that relies heavily on intra-predicted blocks due to high motion. In videos with less motion, the improvement will be even greater.

Also note that the accumulators in these faster loops are only 16-bit. High bit-depth videos would overflow them, so for now the fast paths are only used for 8-bit videos.
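The "copy" claim follows directly from the phase-0 row of the subpel_filters table (see the LookupTables.h hunk below): only the center tap is nonzero, so the convolution collapses to the identity. A quick worked check using the decoder's Q7 rounding:

    // Phase-0 filter row: { 0, 0, 0, 128, 0, 0, 0, 0 } (Q7 fixed point).
    // For any sample s, the 8-tap convolution plus rounded shift yields:
    //   (128 * s + 64) >> 7 == s     (e.g. s = 100: (12800 + 64) >> 7 == 100)
    // so a whole-pel motion vector needs no filtering at all, only a copy.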
Diffstat (limited to 'Userland/Libraries/LibVideo/VP9')
 Userland/Libraries/LibVideo/VP9/Decoder.cpp      | 198
 Userland/Libraries/LibVideo/VP9/Decoder.h        |   3
 Userland/Libraries/LibVideo/VP9/LookupTables.h   |   2
 3 files changed, 154 insertions(+), 49 deletions(-)
diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.cpp b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
index b177b1ffcd..de83efba05 100644
--- a/Userland/Libraries/LibVideo/VP9/Decoder.cpp
+++ b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
@@ -207,6 +207,13 @@ DecoderErrorOr<NonnullOwnPtr<VideoFrame>> Decoder::get_decoded_frame()
return m_video_frame_queue.dequeue();
}
+template<typename T>
+static inline i32 rounded_right_shift(T value, u8 bits)
+{
+ value = (value + static_cast<T>(1u << (bits - 1u))) >> bits;
+ return static_cast<i32>(value);
+}
+
u8 Decoder::merge_prob(u8 pre_prob, u32 count_0, u32 count_1, u8 count_sat, u8 max_update_factor)
{
auto total_decode_count = count_0 + count_1;
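rounded_right_shift() is the decoder's round-to-nearest division by a power of two; the convolutions below call it with bits = 7 to undo the Q7 filter gain. A quick check of the arithmetic:

    // rounded_right_shift(v, bits) == (v + (1 << (bits - 1))) >> bits
    //   rounded_right_shift(12800, 7) == (12800 + 64) >> 7 == 100  // exact multiple of 128
    //   rounded_right_shift(  191, 7) == (  191 + 64) >> 7 ==   1  // 1.49 rounds down
    //   rounded_right_shift(  192, 7) == (  192 + 64) >> 7 ==   2  // 1.50 rounds up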
@@ -863,6 +870,7 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
auto x_scale = reference_frame.x_scale;
auto y_scale = reference_frame.y_scale;
+ // The amount of subpixels between each sample of this block. Non-16 values will cause the output to be scaled.
auto scaled_step_x = reference_frame.scaled_step_x;
auto scaled_step_y = reference_frame.scaled_step_y;
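For orientation: positions here are in 1/16-pel fixed point (SUBPEL_BITS == 4), so a step of 16 advances exactly one reference pixel per output sample. Roughly, in terms of the variables above:

    // Position of output sample `column` along a row, in 1/16-pel units:
    //   pos   = offset_scaled_block_x + column * scaled_step_x;
    //   pixel = pos >> SUBPEL_BITS;  // whole-pel index into the reference row
    //   phase = pos & SUBPEL_MASK;   // 1/16-pel fraction; selects the filter row
    // When scaled_step_x == 16, `phase` is identical for every column, which is
    // exactly what the unscaled fast paths in the next hunk exploit.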
@@ -901,68 +909,175 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
auto& reference_frame_buffer = reference_frame.frame_planes[plane];
auto reference_frame_width = y_size_to_uv_size(subsampling_x, reference_frame.size.width()) + MV_BORDER * 2;
- auto block_buffer_at = [&](u32 row, u32 column) -> u16& {
- return block_buffer[row * width + column];
- };
-
// The variable lastX is set equal to ( (RefFrameWidth[ refIdx ] + subX) >> subX) - 1.
// The variable lastY is set equal to ( (RefFrameHeight[ refIdx ] + subY) >> subY) - 1.
// (lastX and lastY specify the coordinates of the bottom right sample of the reference plane.)
// Ad-hoc: These variables are not needed, since the reference frame is expanded to contain the samples that
// may be referenced by motion vectors on the edge of the frame.
- // The variable intermediateHeight specifying the height required for the intermediate array is set equal to (((h -
- // 1) * yStep + 15) >> 4) + 8.
- static constexpr auto maximum_intermediate_height = (((maximum_block_dimensions - 1) * maximum_scaled_step + 15) >> 4) + 8;
- auto intermediate_height = (((height - 1) * scaled_step_y + 15) >> 4) + 8;
- VERIFY(intermediate_height <= maximum_intermediate_height);
// The sub-sample interpolation is effected via two one-dimensional convolutions. First a horizontal filter is used
// to build up a temporary array, and then this array is vertically filtered to obtain the final prediction. The
// fractional parts of the motion vectors determine the filtering process. If the fractional part is zero, then the
// filtering is equivalent to a straight sample copy.
// The filtering is applied as follows:
+
+ constexpr auto sample_offset = 3;
+
+ auto subpixel_row_from_reference_row = [offset_scaled_block_y](u32 row) {
+ return (offset_scaled_block_y >> SUBPEL_BITS) + static_cast<i32>(row);
+ };
+ auto reference_index_for_row = [reference_frame_width](i32 row) {
+ return static_cast<size_t>(MV_BORDER + row) * reference_frame_width;
+ };
+
+ // The variable intermediateHeight specifying the height required for the intermediate array is set equal to (((h -
+ // 1) * yStep + 15) >> 4) + 8.
+ static constexpr auto maximum_intermediate_height = (((maximum_block_dimensions - 1) * maximum_scaled_step + 15) >> 4) + 8;
+ auto const intermediate_height = (((height - 1) * scaled_step_y + 15) >> 4) + 8;
+ VERIFY(intermediate_height <= maximum_intermediate_height);
+ // Check our reference frame bounds before starting the loop.
+ auto const last_possible_reference_index = reference_index_for_row(subpixel_row_from_reference_row(intermediate_height - sample_offset));
+ VERIFY(reference_frame_buffer.size() >= last_possible_reference_index);
+
+ VERIFY(block_buffer.size() >= static_cast<size_t>(width) * height);
+
+ auto const reference_block_x = MV_BORDER + (offset_scaled_block_x >> SUBPEL_BITS);
+ auto const reference_block_y = MV_BORDER + (offset_scaled_block_y >> SUBPEL_BITS);
+ auto const reference_subpixel_x = offset_scaled_block_x & SUBPEL_MASK;
+ auto const reference_subpixel_y = offset_scaled_block_y & SUBPEL_MASK;
+
+ // OPTIMIZATION: If the fractional part of a component of the motion vector is 0, we want to do a fast path
+ // skipping one or both of the convolutions.
+ bool const copy_x = reference_subpixel_x == 0;
+ bool const copy_y = reference_subpixel_y == 0;
+ bool const unscaled_x = scaled_step_x == 16;
+ bool const unscaled_y = scaled_step_y == 16;
+
// The array intermediate is specified as follows:
// Note: Height is specified by `intermediate_height`, width is specified by `width`
Array<u16, maximum_intermediate_height * maximum_block_dimensions> intermediate_buffer;
- auto intermediate_buffer_at = [&](u32 row, u32 column) -> u16& {
- return intermediate_buffer[row * width + column];
- };
+ auto const bit_depth = block_context.frame_context.color_config.bit_depth;
+ auto const* reference_start = reference_frame_buffer.data() + reference_block_y * reference_frame_width + reference_block_x;
+
+ // FIXME: We are using 16-bit accumulators for speed in these loops, but when accumulating for a high bit-depth video, they will overflow.
+ // Instead of hardcoding them, the Decoder class should have the bit depth as a template parameter, and the accumulators can select
+ // a size based on whether the bit depth > 8.
+ if (unscaled_x && unscaled_y && bit_depth == 8) {
+ if (copy_x && copy_y) {
+ // We can memcpy here to avoid doing any real work.
+ auto const* reference_scan_line = &reference_frame_buffer[reference_block_y * reference_frame_width + reference_block_x];
+ auto* destination_scan_line = block_buffer.data();
+
+ for (auto row = 0u; row < height; row++) {
+ memcpy(destination_scan_line, reference_scan_line, width * sizeof(*destination_scan_line));
+ reference_scan_line += reference_frame_width;
+ destination_scan_line += width;
+ }
- // Check our reference frame bounds before starting the loop.
- auto last_possible_reference = (MV_BORDER + (offset_scaled_block_y >> 4) + static_cast<i32>(intermediate_height - 1) - 3) * reference_frame_width;
- VERIFY(reference_frame_buffer.size() >= last_possible_reference);
+ return {};
+ }
+
+ auto horizontal_convolution_unscaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_x) {
+ source -= sample_offset;
+ auto const source_end_skip = source_stride - width;
- for (auto row = 0u; row < intermediate_height; row++) {
- auto reference_row = (offset_scaled_block_y >> 4) + static_cast<i32>(row) - 3;
- u16 const* scan_line = &reference_frame_buffer[static_cast<size_t>(MV_BORDER + reference_row) * reference_frame_width];
+ for (auto row = 0u; row < height; row++) {
+ for (auto column = 0u; column < width; column++) {
+ i16 accumulated_samples = 0;
+ for (auto t = 0; t < 8; t++) {
+ auto sample = source[t];
+ accumulated_samples += subpel_filters[filter][subpixel_x][t] * sample;
+ }
+
+ *destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
+ source++;
+ destination++;
+ }
+ source += source_end_skip;
+ }
+ };
- for (auto column = 0u; column < width; column++) {
- auto samples_start = offset_scaled_block_x + static_cast<i32>(scaled_step_x * column);
+ if (copy_y) {
+ horizontal_convolution_unscaled(bit_depth, block_buffer.data(), width, height, reference_start, reference_frame_width, block_context.interpolation_filter, reference_subpixel_x);
+ return {};
+ }
- i32 accumulated_samples = 0;
- for (auto t = 0u; t < 8u; t++) {
- auto sample = scan_line[MV_BORDER + (samples_start >> 4) + static_cast<i32>(t) - 3];
- accumulated_samples += subpel_filters[block_context.interpolation_filter][samples_start & 15][t] * sample;
+ auto vertical_convolution_unscaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_y) {
+ auto const source_end_skip = source_stride - width;
+
+ for (auto row = 0u; row < height; row++) {
+ for (auto column = 0u; column < width; column++) {
+ auto const* scan_column = source;
+ i16 accumulated_samples = 0;
+ for (auto t = 0; t < 8; t++) {
+ auto sample = *scan_column;
+ accumulated_samples += subpel_filters[filter][subpixel_y][t] * sample;
+ scan_column += source_stride;
+ }
+ *destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
+ source++;
+ destination++;
+ }
+ source += source_end_skip;
}
- intermediate_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
+ };
+
+ if (copy_x) {
+ vertical_convolution_unscaled(bit_depth, block_buffer.data(), width, height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, reference_subpixel_y);
+ return {};
}
+
+ horizontal_convolution_unscaled(bit_depth, intermediate_buffer.data(), width, intermediate_height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, reference_subpixel_x);
+ vertical_convolution_unscaled(bit_depth, block_buffer.data(), width, height, intermediate_buffer.data(), width, block_context.interpolation_filter, reference_subpixel_y);
+ return {};
}
- for (auto row = 0u; row < height; row++) {
- for (auto column = 0u; column < width; column++) {
- auto samples_start = (offset_scaled_block_y & 15) + static_cast<i32>(scaled_step_y * row);
- auto const* scan_column = &intermediate_buffer_at(samples_start >> 4, column);
- auto const* subpel_filters_for_samples = subpel_filters[block_context.interpolation_filter][samples_start & 15];
-
- i32 accumulated_samples = 0;
- for (auto t = 0u; t < 8u; t++) {
- auto sample = *scan_column;
- accumulated_samples += subpel_filters_for_samples[t] * sample;
- scan_column += width;
+ // NOTE: Accumulators below are 32-bit to allow high bit-depth videos to decode without overflows.
+ // These should be changed when the accumulators above are.
+
+ auto horizontal_convolution_scaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_x, auto scale_x) {
+ source -= sample_offset;
+
+ for (auto row = 0u; row < height; row++) {
+ auto scan_subpixel = subpixel_x;
+ for (auto column = 0u; column < width; column++) {
+ auto const* scan_line = source + (scan_subpixel >> 4);
+ i32 accumulated_samples = 0;
+ for (auto t = 0; t < 8; t++) {
+ auto sample = scan_line[t];
+ accumulated_samples += subpel_filters[filter][scan_subpixel & SUBPEL_MASK][t] * sample;
+ }
+
+ *destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
+ destination++;
+ scan_subpixel += scale_x;
}
- block_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
+ source += source_stride;
}
- }
+ };
+
+ auto vertical_convolution_scaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_y, auto scale_y) {
+ for (auto row = 0u; row < height; row++) {
+ auto const* source_column_base = source + (subpixel_y >> SUBPEL_BITS) * source_stride;
+
+ for (auto column = 0u; column < width; column++) {
+ auto const* scan_column = source_column_base + column;
+ i32 accumulated_samples = 0;
+ for (auto t = 0; t < 8; t++) {
+ auto sample = *scan_column;
+ accumulated_samples += subpel_filters[filter][subpixel_y & SUBPEL_MASK][t] * sample;
+ scan_column += source_stride;
+ }
+
+ *destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
+ destination++;
+ }
+ subpixel_y += scale_y;
+ }
+ };
+
+ horizontal_convolution_scaled(bit_depth, intermediate_buffer.data(), width, intermediate_height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, offset_scaled_block_x & SUBPEL_MASK, scaled_step_x);
+ vertical_convolution_scaled(bit_depth, block_buffer.data(), width, height, intermediate_buffer.data(), width, block_context.interpolation_filter, reference_subpixel_y, scaled_step_y);
return {};
}
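Taken together, the hunk above reduces to a five-way dispatch. A condensed sketch of that control flow, with hypothetical names (select_inter_path and InterPath are illustrative, not part of the decoder):

    enum class InterPath { WholePelCopy, HorizontalOnly, VerticalOnly, UnscaledTwoPass, ScaledTwoPass };

    // Pick the cheapest prediction path for a motion vector whose fractional
    // parts are `subpixel_x`/`subpixel_y` (1/16-pel units). A scaled step of 16
    // in both dimensions means the reference frame is unscaled (1:1).
    static InterPath select_inter_path(u8 subpixel_x, u8 subpixel_y, u32 scaled_step_x, u32 scaled_step_y, u8 bit_depth)
    {
        if (scaled_step_x != 16 || scaled_step_y != 16 || bit_depth != 8)
            return InterPath::ScaledTwoPass;   // general path, 32-bit accumulators
        if (subpixel_x == 0 && subpixel_y == 0)
            return InterPath::WholePelCopy;    // straight memcpy, row by row
        if (subpixel_y == 0)
            return InterPath::HorizontalOnly;  // one pass, directly into the block buffer
        if (subpixel_x == 0)
            return InterPath::VerticalOnly;    // one pass, directly into the block buffer
        return InterPath::UnscaledTwoPass;     // both passes, but with a fixed filter phase
    }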
@@ -1193,13 +1308,6 @@ inline i32 Decoder::sin64(u8 angle)
return cos64(angle - 32u);
}
-template<typename T>
-inline i32 Decoder::rounded_right_shift(T value, u8 bits)
-{
- value = (value + static_cast<T>(1u << (bits - 1u))) >> bits;
- return static_cast<i32>(value);
-}
-
// (8.7.1.1) The function B( a, b, angle, 0 ) performs a butterfly rotation.
inline void Decoder::butterfly_rotation_in_place(Span<Intermediate> data, size_t index_a, size_t index_b, u8 angle, bool flip)
{
diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.h b/Userland/Libraries/LibVideo/VP9/Decoder.h
index 9d219bbe1c..f861e67468 100644
--- a/Userland/Libraries/LibVideo/VP9/Decoder.h
+++ b/Userland/Libraries/LibVideo/VP9/Decoder.h
@@ -103,9 +103,6 @@ private:
template<typename S, typename D>
inline void hadamard_rotation(Span<S> source, Span<D> destination, size_t index_a, size_t index_b);
- template<typename T>
- inline i32 rounded_right_shift(T value, u8 bits);
-
// (8.7.1.10) This process does an in-place Walsh-Hadamard transform of the array T (of length 4).
inline DecoderErrorOr<void> inverse_walsh_hadamard_transform(Span<Intermediate> data, u8 log2_of_block_size, u8 shift);
diff --git a/Userland/Libraries/LibVideo/VP9/LookupTables.h b/Userland/Libraries/LibVideo/VP9/LookupTables.h
index 0756719465..1f523d94a3 100644
--- a/Userland/Libraries/LibVideo/VP9/LookupTables.h
+++ b/Userland/Libraries/LibVideo/VP9/LookupTables.h
@@ -336,7 +336,7 @@ static constexpr u8 counter_to_context[19] = {
};
// Coefficients used by predict_inter
-static constexpr i32 subpel_filters[4][16][8] = {
+static constexpr i16 subpel_filters[4][16][8] = {
{ { 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 1, -5, 126, 8, -3, 1, 0 },
{ -1, 3, -10, 122, 18, -6, 2, 0 },
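A note on the narrowing from i32 to i16: every tap fits easily in an i16, and each 8-tap row sums to 128, making the filters unity-gain in Q7 fixed point (presumably the narrower type is also what pairs with the 16-bit accumulation in the fast paths above). The rows shown here check out:

    static_assert(0 + 1 + -5 + 126 + 8 + -3 + 1 + 0 == 128);    // subpel phase 1
    static_assert(-1 + 3 + -10 + 122 + 18 + -6 + 2 + 0 == 128); // subpel phase 2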