// rav1e/lrf.rs — loop restoration filters (self-guided / sgrproj)
1// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10use std::cmp;
11use std::iter::FusedIterator;
12use std::ops::{Index, IndexMut};
13
14use crate::api::SGRComplexityLevel;
15use crate::color::ChromaSampling::Cs400;
16use crate::context::{MAX_PLANES, SB_SIZE};
17use crate::encoder::FrameInvariants;
18use crate::frame::{
19  AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
20};
21use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
22use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
23
24cfg_if::cfg_if! {
25  if #[cfg(nasm_x86_64)] {
26    use crate::asm::x86::lrf::*;
27  } else {
28    use self::rust::*;
29  }
30}
31
/// log2 of the maximum loop-restoration unit size (so units are up to
/// 256x256 pixels).
pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

// Restoration-type codes, as signalled in the bitstream.
pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

// Per-tap minimum, default (mid) and maximum values for the three stored
// Wiener filter taps.
pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
// Per-tap coding constants — presumably the subexp K parameters; unused until
// Wiener support lands (TODO confirm when it does).
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

// Search limits and default (mid) values for the two sgrproj projection
// coefficients (xqd).
pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
// Fixed-point precision constants used throughout the self-guided filter
// arithmetic below (see sgrproj_sum_finish and the box-filter kernels).
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
pub const SGRPROJ_PRJ_BITS: u8 = 7;
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
pub const SGRPROJ_RST_BITS: u8 = 4;
// Strength pairs [s(r=2), s(r=1)] for each of the 16 signalable sgr parameter
// sets. A zero entry disables the box filter of that radius for that set.
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];

// Lists of indices into SGRPROJ_PARAMS_S that are searched at a given
// complexity level. SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters have 3 discontinuous groups. The first has
// both parameters non-zero. The other two are distinguishable by which of the
// two parameters is zero. There is an even number of each of these groups and
// the non-zero parameters grow as the indices increase. This array uses the
// 1st, 3rd, ... smallest params of each group.
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];
87
88pub const fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
89  match complexity {
90    SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
91    SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
92  }
93}
94
95pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
96pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
97pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
98pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;
99
100pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
101  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
102pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
103pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
104pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;
105
106pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
107  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];
108
/// Scratch storage shared by `sgrproj_stripe_filter()` and `sgrproj_solve()`:
/// one integral image of the source pixels and one of their squares.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  pub integral_image: Vec<u32>,
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Allocates a buffer with `size` entries per image, all initialized to
  /// zero.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    let integral_image = vec![0u32; size];
    let sq_integral_image = vec![0u32; size];
    Self { integral_image, sq_integral_image }
  }
}
123
#[allow(unused)] // Wiener coming soon!
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum RestorationFilter {
  #[default]
  None,
  Wiener {
    coeffs: [[i8; 3]; 2],
  },
  Sgrproj {
    set: u8,
    xqd: [i8; 2],
  },
}

impl RestorationFilter {
  /// Field-by-field inequality test, written out by hand because
  /// `PartialEq::eq` cannot be called from a `const fn`. Returns `true`
  /// iff the two filters differ in variant or in any field.
  pub const fn notequal(self, cmp: RestorationFilter) -> bool {
    match (self, cmp) {
      (RestorationFilter::None, RestorationFilter::None) => false,
      (
        RestorationFilter::Sgrproj { set, xqd },
        RestorationFilter::Sgrproj { set: set2, xqd: xqd2 },
      ) => set != set2 || xqd[0] != xqd2[0] || xqd[1] != xqd2[1],
      (
        RestorationFilter::Wiener { coeffs },
        RestorationFilter::Wiener { coeffs: coeffs2 },
      ) => {
        coeffs[0][0] != coeffs2[0][0]
          || coeffs[0][1] != coeffs2[0][1]
          || coeffs[0][2] != coeffs2[0][2]
          || coeffs[1][0] != coeffs2[1][0]
          || coeffs[1][1] != coeffs2[1][1]
          || coeffs[1][2] != coeffs2[1][2]
      }
      // Mismatched variants always compare unequal.
      _ => true,
    }
  }
}
164
/// Pure-Rust (non-SIMD) implementations of the sgrproj box-filter kernels,
/// used when no assembly version is available for the target.
pub(crate) mod rust {
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::PlaneSlice;
  use crate::lrf::{
    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
    SGRPROJ_SGR_BITS,
  };
  use crate::util::CastFromPrimitive;
  use crate::Pixel;

  // Computes one row of (a, b) intermediate coefficients for a box filter of
  // radius `r`, reading box sums from the integral images of the pixels
  // (`iimg`) and of their squares (`iimg_sq`).
  #[inline(always)]
  pub(crate) fn sgrproj_box_ab_internal<const BD: usize>(
    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
  ) {
    // Box diameter and pixel count.
    let d: usize = r * 2 + 1;
    let n: usize = d * d;
    // Rounded fixed-point reciprocal of n at SGRPROJ_RECIP_BITS (12-bit)
    // precision: 455 ~= 4096 / 9 (r = 1), 164 ~= 4096 / 25 (r = 2).
    let one_over_n = if r == 1 { 455 } else { 164 };

    // Bounds checks hoisted out of the loop so the body can use unchecked
    // accesses below.
    assert!(iimg.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
    assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
    assert!(af.len() > stripe_w + 1);
    assert!(bf.len() > stripe_w + 1);

    for x in start_x..stripe_w + 2 {
      // SAFETY: We perform the bounds checks above, once for the whole loop
      unsafe {
        let sum = get_integral_square(iimg, iimg_stride, x, y, d);
        let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
        let (reta, retb) =
          sgrproj_sum_finish::<BD>(ssq, sum, n as u32, one_over_n, s);
        *af.get_unchecked_mut(x) = reta;
        *bf.get_unchecked_mut(x) = retb;
      }
    }
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y,
  // using the radius-1 (3x3) box filter
  pub(crate) fn sgrproj_box_ab_r1<const BD: usize>(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal::<BD>(
      1,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
    );
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y,
  // using the radius-2 (5x5) box filter
  pub(crate) fn sgrproj_box_ab_r2<const BD: usize>(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal::<BD>(
      2,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
    );
  }

  // Radius-0 (no filtering) output row: passes the cdeffed pixels through.
  pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
    f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
    f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let line = cdeffed.row(y);
    // Copy the source row into f at SGRPROJ_RST_BITS fixed-point precision.
    for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
      *fp = u32::cast_from(v) << SGRPROJ_RST_BITS;
    }
  }

  // Radius-1 filtered output row: weights the 3x3 neighborhood of (a, b)
  // coefficient rows around y (3 for corners, 4 for edges and center) and
  // applies them to the cdeffed pixels.
  pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
    cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize,
    y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    for x in start_x..w {
      // Weighted sums over the 3x3 window; af/bf rows are offset by one
      // column, hence the x, x + 1, x + 2 indexing.
      let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2])
        + 4
          * (af[1][x]
            + af[0][x + 1]
            + af[1][x + 1]
            + af[2][x + 1]
            + af[1][x + 2]);
      let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2])
        + 4
          * (bf[1][x]
            + bf[0][x + 1]
            + bf[1][x + 1]
            + bf[2][x + 1]
            + bf[1][x + 2]);
      let v = a * u32::cast_from(line[x]) + b;
      // Round and drop back to SGRPROJ_RST_BITS precision.
      f[x] = (v + (1 << shift >> 1)) >> shift;
    }
  }

  // Radius-2 filtered output for two rows at once (y and y + 1): the r=2
  // coefficients only exist for even rows, so the even output row mixes both
  // coefficient rows while the odd output row uses only the second.
  pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    // Different normalization shifts: the even row sums two coefficient rows,
    // the odd row only one.
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    let line1 = cdeffed.row(y + 1);

    // 3-wide sliding windows over the coefficient rows (offset one column
    // relative to the output, as in the r=1 case).
    let af0 = af[0][start_x..w + 3].windows(3);
    let af1 = af[1][start_x..w + 3].windows(3);
    let bf0 = bf[0][start_x..w + 3].windows(3);
    let bf1 = bf[1][start_x..w + 3].windows(3);

    let af_it = af0.zip(af1);
    let bf_it = bf0.zip(bf1);

    let in0 = line[start_x..w].iter();
    let in1 = line1[start_x..w].iter();

    let o0 = f0[start_x..w].iter_mut();
    let o1 = f1[start_x..w].iter_mut();

    let in_iter = in0.zip(in1);
    let out_iter = o0.zip(o1);

    let io_iter = out_iter.zip(in_iter);

    for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
      io_iter.zip(af_it.zip(bf_it))
    {
      // Horizontal 5/6/5 weighting of each coefficient row.
      let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1];
      let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1];
      let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1];
      let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1];
      // Even row: both coefficient rows contribute.
      let v = (a + ao) * u32::cast_from(p0) + b + bo;
      *o0 = (v + (1 << shift >> 1)) >> shift;
      // Odd row: only the second coefficient row contributes.
      let vo = ao * u32::cast_from(p1) + bo;
      *o1 = (vo + (1 << shifto >> 1)) >> shifto;
    }
  }
}
343
// Turns the box sums of pixels (`sum`) and squared pixels (`ssq`) over an
// n-pixel window into the self-guided filter's (a, b) coefficient pair, for
// strength `s`. `one_over_n` is the rounded fixed-point reciprocal of n at
// SGRPROJ_RECIP_BITS precision (see sgrproj_box_ab_internal).
#[inline(always)]
fn sgrproj_sum_finish<const BD: usize>(
  ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32,
) -> (u32, u32) {
  // Scale the sums down to an 8-bit-equivalent range (with rounding) so the
  // remaining math is independent of bit depth.
  let bdm8 = BD - 8;
  let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8);
  let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8;
  // p = n * E[x^2] - E[x]^2 scaled by n^2, i.e. n^2 times the window
  // variance; saturate in case rounding pushes the difference negative.
  let p = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum);
  // z is the variance scaled by the strength s, rounded at
  // SGRPROJ_MTABLE_BITS precision.
  let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS;
  // a ~= 256 * z / (z + 1), rounded; clamped to 256 for z >= 255 and floored
  // at 1 for z == 0.
  let a = if z >= 255 {
    256
  } else if z == 0 {
    1
  } else {
    ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1)
  };
  // b carries the complementary weight applied to the window mean
  // (sum * one_over_n), rounded down from SGRPROJ_RECIP_BITS precision.
  let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
  (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS)
}
363
// Using an integral image, compute the sum of the `size` x `size` square
// whose top-left corner is at (x, y), via the standard four-corner lookup:
// sum = top_left + bottom_right - bottom_left - top_right. The integral
// images are accumulated with wrapping adds, so wrapping arithmetic here
// cancels any overflow that occurred during accumulation.
//
// SAFETY: The size of `iimg` must be at least `(y + size) * stride + x + size`
#[inline(always)]
unsafe fn get_integral_square(
  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> u32 {
  // Cancel out overflow in iimg by using wrapping arithmetic
  let top_left = *iimg.get_unchecked(y * stride + x);
  let top_right = *iimg.get_unchecked(y * stride + x + size);
  let bottom_left = *iimg.get_unchecked((y + size) * stride + x);
  let bottom_right = *iimg.get_unchecked((y + size) * stride + x + size);
  top_left
    .wrapping_add(bottom_right)
    .wrapping_sub(bottom_left)
    .wrapping_sub(top_right)
}
380
/// Iterator over the rows feeding one stripe's integral image, vertically
/// padded by repeating edge rows. Each yielded row comes from either the
/// cdef output (inside the stripe) or the deblocked frame (outside it).
struct VertPaddedIter<'a, T: Pixel> {
  // The two sources that can be selected when clipping
  deblocked: &'a Plane<T>,
  cdeffed: &'a Plane<T>,
  // x index to choose where on the row to start
  x: isize,
  // y index that will be mutated as the iterator advances
  y: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: isize,
  // Used for source buffer choice/clipping. May (and regularly will)
  // be negative.
  stripe_begin: isize,
  // Also used for source buffer choice/clipping. May specify a stripe boundary
  // less than, equal to, or larger than the buffers we're accessing.
  stripe_end: isize,
  // Active area cropping is done by specifying a value smaller than the height
  // of the plane.
  crop: isize,
}
401
402impl<'a, T: Pixel> VertPaddedIter<'a, T> {
403  fn new(
404    cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
405    stripe_h: usize, crop: usize,
406  ) -> VertPaddedIter<'a, T> {
407    // cdeffed and deblocked must start at the same coordinates from their
408    // underlying planes. Since cropping is provided via a separate params, the
409    // height of the underlying planes do not need to match.
410    assert_eq!(cdeffed.x, deblocked.x);
411    assert_eq!(cdeffed.y, deblocked.y);
412
413    // To share integral images, always use the max box filter radius of 2
414    let r = 2;
415
416    // The number of rows outside the stripe are needed
417    let rows_above = r + 2;
418    let rows_below = 2;
419
420    // Offset crop and stripe_h so they are relative to the underlying plane
421    // and not the plane slice.
422    let crop = crop as isize + deblocked.y;
423    let stripe_end = stripe_h as isize + deblocked.y;
424
425    // Move y up the number rows above.
426    // If y is negative we repeat the first row
427    let y = deblocked.y - rows_above as isize;
428
429    VertPaddedIter {
430      deblocked: deblocked.plane,
431      cdeffed: cdeffed.plane,
432      x: deblocked.x,
433      y,
434      end: (rows_above + stripe_h + rows_below) as isize + y,
435      stripe_begin: deblocked.y,
436      stripe_end,
437      crop,
438    }
439  }
440}
441
442impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
443  type Item = &'a [T];
444
445  #[inline(always)]
446  fn next(&mut self) -> Option<Self::Item> {
447    if self.end > self.y {
448      // clamp before deciding the source
449      // clamp vertically to storage at top and passed-in height at bottom
450      let cropped_y = clamp(self.y, 0, self.crop - 1);
451      // clamp vertically to stripe limits
452      let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1);
453
454      // decide if we're vertically inside or outside the strip
455      let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end {
456        self.cdeffed
457      } else {
458        self.deblocked
459      };
460      // cannot directly return self.ps.row(row) due to lifetime issue
461      let range = src_plane.row_range(self.x, ly);
462      self.y += 1;
463      Some(&src_plane.data[range])
464    } else {
465      None
466    }
467  }
468
469  fn size_hint(&self) -> (usize, Option<usize>) {
470    let remaining = self.end - self.y;
471    debug_assert!(remaining >= 0);
472    let remaining = remaining as usize;
473
474    (remaining, Some(remaining))
475  }
476}
477
// size_hint() is exact, and next() keeps returning None once exhausted.
impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}
480
/// Iterator over one row of pixels, horizontally padded by repeating the
/// first and last samples when the requested span extends past the slice.
struct HorzPaddedIter<'a, T: Pixel> {
  // Active area cropping is done using the length of the slice
  slice: &'a [T],
  // x index of the iterator.
  // When less than 0, repeat the first element. When past the end of the
  // slice, repeat the last element.
  index: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: usize,
}
491
492impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
493  fn new(
494    slice: &'a [T], start_index: isize, width: usize,
495  ) -> HorzPaddedIter<'a, T> {
496    HorzPaddedIter {
497      slice,
498      index: start_index,
499      end: (width as isize + start_index) as usize,
500    }
501  }
502}
503
504impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
505  type Item = &'a T;
506
507  #[inline(always)]
508  fn next(&mut self) -> Option<Self::Item> {
509    if self.index < self.end as isize {
510      // clamp to the edges of the frame
511      let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize;
512      self.index += 1;
513      Some(&self.slice[x])
514    } else {
515      None
516    }
517  }
518
519  #[inline(always)]
520  fn size_hint(&self) -> (usize, Option<usize>) {
521    let size: usize = (self.end as isize - self.index) as usize;
522    (size, Some(size))
523  }
524}
525
// size_hint() is exact, and next() keeps returning None once exhausted.
impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
528
/// Fills `integral_image_buffer` with the 2-D running prefix sums (integral
/// image) of the pixels feeding one stripe, and likewise for the squared
/// pixels. Rows inside the stripe come from `cdeffed`, rows outside from
/// `deblocked` (via `VertPaddedIter`); edges are padded by repetition (via
/// `HorzPaddedIter`). Sums use wrapping arithmetic — overflow cancels when
/// differencing in `get_integral_square`.
#[profiling::function]
pub fn setup_integral_image<T: Pixel>(
  integral_image_buffer: &mut IntegralImageBuffer,
  integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
  stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
) {
  let integral_image = &mut integral_image_buffer.integral_image;
  let sq_integral_image = &mut integral_image_buffer.sq_integral_image;

  // Number of elements outside the stripe
  let left_w = 4; // max radius of 2 + 2 padding
  let right_w = 3; // max radius of 2 + 1 padding

  assert_eq!(cdeffed.x, deblocked.x);

  // Find how many unique elements to use to the left and right
  let left_uniques = if cdeffed.x == 0 { 0 } else { left_w };
  let right_uniques = right_w.min(crop_w - stripe_w);

  // Find the total number of unique elements used
  let row_uniques = left_uniques + stripe_w + right_uniques;

  // Negative start indices result in repeating the first element of the row
  let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 };

  let mut rows_iter = VertPaddedIter::new(
    // Move left to encompass all the used data
    &cdeffed.go_left(left_uniques),
    &deblocked.go_left(left_uniques),
    // since r2 uses every other row, we need an extra row if stripe_h is odd
    stripe_h + (stripe_h & 1),
    crop_h,
  )
  .map(|row: &[T]| {
    HorzPaddedIter::new(
      // Limit how many unique elements we use
      &row[..row_uniques],
      start_index_x,
      left_w + stripe_w + right_w,
    )
  });

  // Setup the first row: a plain horizontal prefix sum, since there is no
  // row above to accumulate from.
  {
    let mut sum: u32 = 0;
    let mut sq_sum: u32 = 0;
    // Remove the first row and use it outside of the main loop
    let row = rows_iter.next().unwrap();
    for (src, (integral, sq_integral)) in
      row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
    {
      let current = u32::cast_from(*src);

      // Wrap adds to prevent undefined behaviour on overflow. Overflow is
      // cancelled out when calculating the sum of a region.
      sum = sum.wrapping_add(current);
      *integral = sum;
      sq_sum = sq_sum.wrapping_add(current * current);
      *sq_integral = sq_sum;
    }
  }
  // Calculate all other rows: each entry is the horizontal running sum of
  // the current row plus the entry directly above it.
  let mut integral_slice = &mut integral_image[..];
  let mut sq_integral_slice = &mut sq_integral_image[..];
  for row in rows_iter {
    let mut sum: u32 = 0;
    let mut sq_sum: u32 = 0;

    // Split the data between the previous row and future rows.
    // This allows us to mutate the current row while accessing the
    // previous row.
    let (integral_row_prev, integral_row) =
      integral_slice.split_at_mut(integral_image_stride);
    let (sq_integral_row_prev, sq_integral_row) =
      sq_integral_slice.split_at_mut(integral_image_stride);
    for (
      src,
      ((integral_above, sq_integral_above), (integral, sq_integral)),
    ) in row.zip(
      integral_row_prev
        .iter()
        .zip(sq_integral_row_prev.iter())
        .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
    ) {
      let current = u32::cast_from(*src);
      // Wrap adds to prevent undefined behaviour on overflow. Overflow is
      // cancelled out when calculating the sum of a region.
      sum = sum.wrapping_add(current);
      *integral = sum.wrapping_add(*integral_above);
      sq_sum = sq_sum.wrapping_add(current * current);
      *sq_integral = sq_sum.wrapping_add(*sq_integral_above);
    }

    // The current row also contains all future rows. Replacing the slice with
    // it moves down a row.
    integral_slice = integral_row;
    sq_integral_slice = sq_integral_row;
  }
}
628
/// Applies the self-guided restoration filter for parameter set `set`, with
/// projection coefficients `xqd`, to one stripe of `cdeffed`, writing the
/// filtered pixels into `out`. The integral images in `integral_image_buffer`
/// must already have been prepared for this stripe via
/// `setup_integral_image`.
#[profiling::function]
pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
  set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
  integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
  cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
) {
  let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
  // Ring buffers of intermediate (a, b) coefficient rows: 2 rows for the
  // radius-2 filter (used on even rows only) and 3 rows for radius 1.
  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
    [[0; IMAGE_WIDTH_MAX + 2]; 2];
  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
    [[0; IMAGE_WIDTH_MAX + 2]; 3];
  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];

  // Strengths for the two box-filter radii; zero disables that radius.
  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];

  // Monomorphize the (a, b) kernels for the sequence bit depth once, up
  // front, instead of dispatching inside the loops.
  let fn_ab_r1 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r1::<8>,
    10 => sgrproj_box_ab_r1::<10>,
    12 => sgrproj_box_ab_r1::<12>,
    _ => unimplemented!(),
  };
  let fn_ab_r2 = match fi.sequence.bit_depth {
    8 => sgrproj_box_ab_r2::<8>,
    10 => sgrproj_box_ab_r2::<10>,
    12 => sgrproj_box_ab_r2::<12>,
    _ => unimplemented!(),
  };

  /* prime the intermediate arrays */
  // One oddness about the radius=2 intermediate array computations that
  // the spec doesn't make clear: Although the spec defines computation
  // of every row (of a, b and f), only half of the rows (every-other
  // row) are actually used.
  let integral_image = &integral_image_buffer.integral_image;
  let sq_integral_image = &integral_image_buffer.sq_integral_image;
  if s_r2 > 0 {
    fn_ab_r2(
      &mut a_r2[0],
      &mut b_r2[0],
      integral_image,
      sq_integral_image,
      integral_image_stride,
      0,
      stripe_w,
      s_r2,
      fi.cpu_feature_level,
    );
  }
  if s_r1 > 0 {
    // The r=1 filter reads the integral image shifted down-right by one
    // element relative to r=2, hence the extra offset.
    let integral_image_offset = integral_image_stride + 1;
    fn_ab_r1(
      &mut a_r1[0],
      &mut b_r1[0],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      integral_image_stride,
      0,
      stripe_w,
      s_r1,
      fi.cpu_feature_level,
    );
    fn_ab_r1(
      &mut a_r1[1],
      &mut b_r1[1],
      &integral_image[integral_image_offset..],
      &sq_integral_image[integral_image_offset..],
      integral_image_stride,
      1,
      stripe_w,
      s_r1,
      fi.cpu_feature_level,
    );
  }

  /* iterate by row */
  // Increment by two to handle the use of even rows by r=2 and run a nested
  //  loop to handle increments of one.
  for y in (0..stripe_h).step_by(2) {
    // get results to use y and y+1
    let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
      // Advance the 2-row r=2 ring buffer and produce filtered rows y and
      // y + 1 in one pass.
      fn_ab_r2(
        &mut a_r2[(y / 2 + 1) % 2],
        &mut b_r2[(y / 2 + 1) % 2],
        integral_image,
        sq_integral_image,
        integral_image_stride,
        y + 2,
        stripe_w,
        s_r2,
        fi.cpu_feature_level,
      );
      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
      sgrproj_box_f_r2(
        &ap0,
        &bp0,
        &mut f_r2_0,
        &mut f_r2_1,
        y,
        stripe_w,
        cdeffed,
        fi.cpu_feature_level,
      );
      [&f_r2_0, &f_r2_1]
    } else {
      // r=2 disabled: pass the source through unfiltered.
      sgrproj_box_f_r0(
        &mut f_r2_0,
        y,
        stripe_w,
        cdeffed,
        fi.cpu_feature_level,
      );
      // share results for both rows
      [&f_r2_0, &f_r2_0]
    };
    // Inner loop handles rows y and y + 1 (or just y at a short last stripe).
    for dy in 0..(2.min(stripe_h - y)) {
      let y = y + dy;
      if s_r1 > 0 {
        let integral_image_offset = integral_image_stride + 1;
        // Advance the 3-row r=1 ring buffer by one row.
        fn_ab_r1(
          &mut a_r1[(y + 2) % 3],
          &mut b_r1[(y + 2) % 3],
          &integral_image[integral_image_offset..],
          &sq_integral_image[integral_image_offset..],
          integral_image_stride,
          y + 2,
          stripe_w,
          s_r1,
          fi.cpu_feature_level,
        );
        let ap1: [&[u32]; 3] =
          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
        let bp1: [&[u32]; 3] =
          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
        sgrproj_box_f_r1(
          &ap1,
          &bp1,
          &mut f_r1,
          y,
          stripe_w,
          cdeffed,
          fi.cpu_feature_level,
        );
      } else {
        // r=1 disabled: pass the source through unfiltered.
        sgrproj_box_f_r0(
          &mut f_r1,
          y,
          stripe_w,
          cdeffed,
          fi.cpu_feature_level,
        );
      }

      /* apply filter */
      // Project the source and the two filtered rows with the signalled
      // weights; the third weight is derived so the weights sum to
      // 1 << SGRPROJ_PRJ_BITS.
      let w0 = xqd[0] as i32;
      let w1 = xqd[1] as i32;
      let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1;

      let line = &cdeffed[y];

      // Blends one output row from the source row and the two filtered rows,
      // rounding back down from SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS
      // fixed-point precision and clamping to the valid pixel range.
      #[inline(always)]
      fn apply_filter<U: Pixel>(
        out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
        stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
      ) {
        let line_it = line[..stripe_w].iter();
        let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
        let f_r1_it = f_r1[..stripe_w].iter();
        let out_it = out[..stripe_w].iter_mut();

        for ((o, &u), (&f_r2_ab, &f_r1)) in
          out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
        {
          let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
          let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
          let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
            >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
          *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
        }
      }

      apply_filter(
        &mut out[y],
        line,
        &f_r1,
        f_r2_ab[dy],
        stripe_w,
        fi.sequence.bit_depth,
        w0,
        w1,
        w2,
      );
    }
  }
}
831
832// Frame inputs below aren't all equal, and will change as work
833// continues.  There's no deblocked reconstruction available at this
834// point of RDO, so we use the non-deblocked reconstruction, cdef and
835// input.  The input can be a full-sized frame. Cdef input is a partial
836// frame constructed specifically for RDO.
837
838// For simplicity, this ignores stripe segmentation (it's possible the
839// extra complexity isn't worth it and we'll ignore stripes
840// permanently during RDO, but that's not been tested yet). Data
841// access inside the cdef frame is monolithic and clipped to the cdef
842// borders.
843
844// Input params follow the same rules as sgrproj_stripe_filter.
845// Inputs are relative to the colocated slice views.
846#[profiling::function]
847pub fn sgrproj_solve<T: Pixel>(
848  set: u8, fi: &FrameInvariants<T>,
849  integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
850  cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
851) -> (i8, i8) {
852  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
853    [[0; IMAGE_WIDTH_MAX + 2]; 2];
854  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
855    [[0; IMAGE_WIDTH_MAX + 2]; 2];
856  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
857  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
858  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
859    [[0; IMAGE_WIDTH_MAX + 2]; 3];
860  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
861    [[0; IMAGE_WIDTH_MAX + 2]; 3];
862  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
863
864  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
865  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
866
867  let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
868  let mut c: [f64; 2] = [0., 0.];
869
870  let fn_ab_r1 = match fi.sequence.bit_depth {
871    8 => sgrproj_box_ab_r1::<8>,
872    10 => sgrproj_box_ab_r1::<10>,
873    12 => sgrproj_box_ab_r1::<12>,
874    _ => unimplemented!(),
875  };
876  let fn_ab_r2 = match fi.sequence.bit_depth {
877    8 => sgrproj_box_ab_r2::<8>,
878    10 => sgrproj_box_ab_r2::<10>,
879    12 => sgrproj_box_ab_r2::<12>,
880    _ => unimplemented!(),
881  };
882
883  /* prime the intermediate arrays */
884  // One oddness about the radius=2 intermediate array computations that
885  // the spec doesn't make clear: Although the spec defines computation
886  // of every row (of a, b and f), only half of the rows (every-other
887  // row) are actually used.
888  let integral_image = &integral_image_buffer.integral_image;
889  let sq_integral_image = &integral_image_buffer.sq_integral_image;
890  if s_r2 > 0 {
891    fn_ab_r2(
892      &mut a_r2[0],
893      &mut b_r2[0],
894      integral_image,
895      sq_integral_image,
896      SOLVE_IMAGE_STRIDE,
897      0,
898      cdef_w,
899      s_r2,
900      fi.cpu_feature_level,
901    );
902  }
903  if s_r1 > 0 {
904    let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
905    fn_ab_r1(
906      &mut a_r1[0],
907      &mut b_r1[0],
908      &integral_image[integral_image_offset..],
909      &sq_integral_image[integral_image_offset..],
910      SOLVE_IMAGE_STRIDE,
911      0,
912      cdef_w,
913      s_r1,
914      fi.cpu_feature_level,
915    );
916    fn_ab_r1(
917      &mut a_r1[1],
918      &mut b_r1[1],
919      &integral_image[integral_image_offset..],
920      &sq_integral_image[integral_image_offset..],
921      SOLVE_IMAGE_STRIDE,
922      1,
923      cdef_w,
924      s_r1,
925      fi.cpu_feature_level,
926    );
927  }
928
929  /* iterate by row */
930  // Increment by two to handle the use of even rows by r=2 and run a nested
931  //  loop to handle increments of one.
932  for y in (0..cdef_h).step_by(2) {
933    // get results to use y and y+1
934    let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
935      fn_ab_r2(
936        &mut a_r2[(y / 2 + 1) % 2],
937        &mut b_r2[(y / 2 + 1) % 2],
938        integral_image,
939        sq_integral_image,
940        SOLVE_IMAGE_STRIDE,
941        y + 2,
942        cdef_w,
943        s_r2,
944        fi.cpu_feature_level,
945      );
946      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
947      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
948      sgrproj_box_f_r2(
949        &ap0,
950        &bp0,
951        &mut f_r2_0,
952        &mut f_r2_1,
953        y,
954        cdef_w,
955        cdeffed,
956        fi.cpu_feature_level,
957      );
958      [&f_r2_0, &f_r2_1]
959    } else {
960      sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
961      // share results for both rows
962      [&f_r2_0, &f_r2_0]
963    };
964    for dy in 0..(2.min(cdef_h - y)) {
965      let y = y + dy;
966      if s_r1 > 0 {
967        let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
968        fn_ab_r1(
969          &mut a_r1[(y + 2) % 3],
970          &mut b_r1[(y + 2) % 3],
971          &integral_image[integral_image_offset..],
972          &sq_integral_image[integral_image_offset..],
973          SOLVE_IMAGE_STRIDE,
974          y + 2,
975          cdef_w,
976          s_r1,
977          fi.cpu_feature_level,
978        );
979        let ap1: [&[u32]; 3] =
980          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
981        let bp1: [&[u32]; 3] =
982          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
983        sgrproj_box_f_r1(
984          &ap1,
985          &bp1,
986          &mut f_r1,
987          y,
988          cdef_w,
989          cdeffed,
990          fi.cpu_feature_level,
991        );
992      } else {
993        sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
994      }
995
996      #[inline(always)]
997      fn process_line<T: Pixel>(
998        h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
999        f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
1000      ) {
1001        let cdeffed_it = cdeffed[..cdef_w].iter();
1002        let input_it = input[..cdef_w].iter();
1003        let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
1004        let f_r1_it = f_r1[..cdef_w].iter();
1005
1006        #[derive(Debug, Copy, Clone)]
1007        struct Sums {
1008          h: [[i64; 2]; 2],
1009          c: [i64; 2],
1010        }
1011
1012        let sums: Sums = cdeffed_it
1013          .zip(input_it)
1014          .zip(f_r2_ab_it.zip(f_r1_it))
1015          .map(|((&u, &i), (&f2, &f1))| {
1016            let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
1017            let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
1018            let f2 = f2 as i32 - u;
1019            let f1 = f1 as i32 - u;
1020            (s as i64, f1 as i64, f2 as i64)
1021          })
1022          .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
1023            let mut ret: Sums = sums;
1024            ret.h[0][0] += f2 * f2;
1025            ret.h[1][1] += f1 * f1;
1026            ret.h[0][1] += f1 * f2;
1027            ret.c[0] += f2 * s;
1028            ret.c[1] += f1 * s;
1029            ret
1030          });
1031
1032        h[0][0] += sums.h[0][0] as f64;
1033        h[1][1] += sums.h[1][1] as f64;
1034        h[0][1] += sums.h[0][1] as f64;
1035        c[0] += sums.c[0] as f64;
1036        c[1] += sums.c[1] as f64;
1037      }
1038
1039      process_line(
1040        &mut h,
1041        &mut c,
1042        &cdeffed[y],
1043        &input[y],
1044        &f_r1,
1045        f_r2_01[dy],
1046        cdef_w,
1047      );
1048    }
1049  }
1050
1051  // this is lifted almost in-tact from libaom
1052  let n = cdef_w as f64 * cdef_h as f64;
1053  h[0][0] /= n;
1054  h[0][1] /= n;
1055  h[1][1] /= n;
1056  h[1][0] = h[0][1];
1057  c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1058  c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1059  let (xq0, xq1) = if s_r2 == 0 {
1060    // H matrix is now only the scalar h[1][1]
1061    // C vector is now only the scalar c[1]
1062    if h[1][1] == 0. {
1063      (0, 0)
1064    } else {
1065      (0, (c[1] / h[1][1]).round() as i32)
1066    }
1067  } else if s_r1 == 0 {
1068    // H matrix is now only the scalar h[0][0]
1069    // C vector is now only the scalar c[0]
1070    if h[0][0] == 0. {
1071      (0, 0)
1072    } else {
1073      ((c[0] / h[0][0]).round() as i32, 0)
1074    }
1075  } else {
1076    let det = h[0][0].mul_add(h[1][1], -h[0][1] * h[1][0]);
1077    if det == 0. {
1078      (0, 0)
1079    } else {
1080      // If scaling up dividend would overflow, instead scale down the divisor
1081      let div1 = h[1][1].mul_add(c[0], -h[0][1] * c[1]);
1082      let div2 = h[0][0].mul_add(c[1], -h[1][0] * c[0]);
1083      ((div1 / det).round() as i32, (div2 / det).round() as i32)
1084    }
1085  };
1086  {
1087    let xqd0 =
1088      clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32);
1089    let xqd1 = clamp(
1090      (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
1091      SGRPROJ_XQD_MIN[1] as i32,
1092      SGRPROJ_XQD_MAX[1] as i32,
1093    );
1094    (xqd0 as i8, xqd1 as i8)
1095  }
1096}
1097
/// Applies the separable, symmetric 7-tap Wiener restoration filter to one
/// stripe of a plane.
///
/// `coeffs[0]` holds the three outer taps of the vertical filter and
/// `coeffs[1]` those of the horizontal filter; the remaining taps are
/// produced by mirror symmetry, with the center tap chosen so each 7-tap
/// filter sums to 128 (i.e. `1 << WIENER_BITS`).
///
/// Rows above the stripe and at/below its bottom edge are read from
/// `deblocked` (the pre-CDEF plane), rows inside the stripe from `cdeffed`;
/// the result is written into `out`. All pixel accesses are clipped to the
/// original frame dimensions `crop_w` x `crop_h`, and out-of-frame columns
/// are edge-replicated. `stripe_y` may be negative (the first stripe starts
/// 8 pixels above the frame) and the stripe may run past the bottom.
#[profiling::function]
fn wiener_stripe_filter<T: Pixel>(
  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
) {
  let bit_depth = fi.sequence.bit_depth;
  // 12-bit input uses different intermediate rounding shifts so the
  // horizontal-pass accumulator stays within `limit` below.
  let round_h = if bit_depth == 12 { 5 } else { 3 };
  let round_v = if bit_depth == 12 { 9 } else { 11 };
  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;

  // Widen the signaled i8 taps to i32 for the filter arithmetic.
  let mut coeffs_ = [[0; 3]; 2];
  for i in 0..2 {
    for j in 0..3 {
      coeffs_[i][j] = i32::from(coeffs[i][j]);
    }
  }

  // Column buffer for the horizontal pass: stripe height plus the 7-tap
  // vertical margin.
  let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7];
  // Mirror the three outer taps around a center tap that normalizes the
  // filter to sum to 128.
  let vfilter: [i32; 7] = [
    coeffs_[0][0],
    coeffs_[0][1],
    coeffs_[0][2],
    128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]),
    coeffs_[0][2],
    coeffs_[0][1],
    coeffs_[0][0],
  ];
  let hfilter: [i32; 7] = [
    coeffs_[1][0],
    coeffs_[1][1],
    coeffs_[1][2],
    128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]),
    coeffs_[1][2],
    coeffs_[1][1],
    coeffs_[1][0],
  ];

  // unlike x, our y can be negative to start as the first stripe
  // starts off the top of the frame by 8 pixels, and can also run off the end of the frame
  let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize;
  let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize;
  // Number of output rows actually written: the stripe clipped to the
  // bottom of the frame.
  let end_i = cmp::max(
    0,
    if stripe_h as isize + stripe_y > crop_h as isize {
      crop_h as isize - stripe_y - start_wi as isize
    } else {
      stripe_h as isize - start_wi as isize
    },
  ) as usize;

  let mut out_slice =
    out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize });

  // Process column by column: horizontal pass fills `work`, then the
  // vertical pass produces the output pixels for this column.
  for xi in stripe_x..stripe_x + stripe_w {
    // Number of horizontal taps that land inside the frame for this column.
    let n = cmp::min(7, crop_w as isize + 3 - xi as isize);
    for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 {
      let mut acc = 0;
      // Select the source row: rows above the stripe come from the
      // deblocked plane (clamped to at most two rows above the stripe),
      // rows inside the stripe from the cdeffed plane, and rows below
      // again from the deblocked plane (clamped just past the stripe).
      let src = if yi < stripe_y {
        let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2);
        deblocked.row(ly)
      } else if yi < stripe_y + stripe_h as isize {
        let ly = clamp(yi, 0, crop_h as isize - 1);
        cdeffed.row(ly)
      } else {
        let ly = cmp::min(
          clamp(yi, 0, crop_h as isize - 1),
          stripe_y + stripe_h as isize + 1,
        );
        deblocked.row(ly)
      };
      // Edge replication: taps falling left of the frame use the first
      // pixel of the row, taps falling right of it use the last.
      let start = i32::cast_from(src[0]);
      let end = i32::cast_from(src[crop_w - 1]);
      for i in 0..3 - xi as isize {
        acc += hfilter[i as usize] * start;
      }

      // In-frame portion of the horizontal filter.
      let off = 3 - (xi as isize);
      let s = cmp::max(0, off) as usize;
      let s1 = (s as isize - off) as usize;
      let n1 = (n - off) as usize;

      for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
        acc += hf * i32::cast_from(v);
      }

      for i in n..7 {
        acc += hfilter[i as usize] * end;
      }

      // Round, shift and clamp the horizontal result into intermediate
      // precision before the vertical pass.
      acc = (acc + (1 << round_h >> 1)) >> round_h;
      work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset);
    }

    // Vertical pass over the intermediate column, clamped to the pixel
    // range of the configured bit depth.
    for (wi, dst) in (start_wi..start_wi + end_i)
      .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i))
    {
      let mut acc = 0;
      for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) {
        acc += vfilter[i] * *src;
      }
      *dst = T::cast_from(clamp(
        (acc + (1 << round_v >> 1)) >> round_v,
        0,
        (1 << bit_depth) - 1,
      ));
    }
  }
}
1208
/// One loop-restoration unit: records the restoration filter selected for
/// this unit of the frame.
#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationUnit {
  pub filter: RestorationFilter,
}
1213
/// Frame-sized grid of restoration units for one plane.
#[derive(Clone, Debug)]
pub struct FrameRestorationUnits {
  // Row-major storage; row `r` occupies `units[r * cols..(r + 1) * cols]`.
  units: Box<[RestorationUnit]>,
  pub cols: usize,
  pub rows: usize,
}
1220
1221impl FrameRestorationUnits {
1222  pub fn new(cols: usize, rows: usize) -> Self {
1223    Self {
1224      units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
1225      cols,
1226      rows,
1227    }
1228  }
1229}
1230
1231impl Index<usize> for FrameRestorationUnits {
1232  type Output = [RestorationUnit];
1233  #[inline(always)]
1234  fn index(&self, index: usize) -> &Self::Output {
1235    &self.units[index * self.cols..(index + 1) * self.cols]
1236  }
1237}
1238
1239impl IndexMut<usize> for FrameRestorationUnits {
1240  #[inline(always)]
1241  fn index_mut(&mut self, index: usize) -> &mut Self::Output {
1242    &mut self.units[index * self.cols..(index + 1) * self.cols]
1243  }
1244}
1245
/// Per-plane loop-restoration configuration: filter type, unit and stripe
/// geometry, and the dimensions of the restoration-unit grid.
#[derive(Clone, Debug)]
pub struct RestorationPlaneConfig {
  // restoration filter type for this plane (one of the RESTORE_* constants)
  pub lrf_type: u8,
  // width/height of a restoration unit, in this plane's pixels
  pub unit_size: usize,
  // (1 << sb_x_shift) gives the number of superblocks horizontally or
  // vertically in a restoration unit, not accounting for RU stretching
  pub sb_h_shift: usize,
  pub sb_v_shift: usize,
  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
  // stripe height is 64 in all cases except 4:2:0 chroma planes where
  // it is 32.  This is independent of all other setup parameters
  pub stripe_height: usize,
  // restoration units per row / per column in this plane
  pub cols: usize,
  pub rows: usize,
}
1262
/// Loop-restoration state for a single plane: its configuration plus the
/// grid of per-unit filter selections.
#[derive(Clone, Debug)]
pub struct RestorationPlane {
  pub cfg: RestorationPlaneConfig,
  pub units: FrameRestorationUnits,
}
1268
1269impl RestorationPlane {
1270  pub fn new(
1271    lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
1272    sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
1273    rows: usize,
1274  ) -> RestorationPlane {
1275    let stripe_height = if stripe_decimate != 0 { 32 } else { 64 };
1276    RestorationPlane {
1277      cfg: RestorationPlaneConfig {
1278        lrf_type,
1279        unit_size,
1280        sb_h_shift,
1281        sb_v_shift,
1282        sb_cols,
1283        sb_rows,
1284        stripe_height,
1285        cols,
1286        rows,
1287      },
1288      units: FrameRestorationUnits::new(cols, rows),
1289    }
1290  }
1291
1292  // Stripes are always 64 pixels high in a non-subsampled
1293  // frame, and decimated from 64 pixels in chroma.  When
1294  // filtering, they are not co-located on Y with superblocks.
1295  fn restoration_unit_index_by_stripe(
1296    &self, stripenum: usize, rux: usize,
1297  ) -> (usize, usize) {
1298    (
1299      cmp::min(rux, self.cfg.cols - 1),
1300      cmp::min(
1301        stripenum * self.cfg.stripe_height / self.cfg.unit_size,
1302        self.cfg.rows - 1,
1303      ),
1304    )
1305  }
1306
1307  pub fn restoration_unit_by_stripe(
1308    &self, stripenum: usize, rux: usize,
1309  ) -> &RestorationUnit {
1310    let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
1311    &self.units[y][x]
1312  }
1313}
1314
/// Loop-restoration state for all planes of a frame.
#[derive(Clone, Debug)]
pub struct RestorationState {
  pub planes: [RestorationPlane; MAX_PLANES],
}
1319
impl RestorationState {
  /// Builds the frame's loop-restoration state: chooses the LRU (loop
  /// restoration unit) sizes for luma and chroma — constrained by tiling
  /// and chroma sampling — then allocates a per-plane grid of restoration
  /// units sized to cover the frame.
  pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
    let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg;
    // stripe size is decimated in 4:2:0 (and only 4:2:0)
    let stripe_uv_decimate = usize::from(xdec > 0 && ydec > 0);
    let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 };
    let uv_sb_h_log2 = y_sb_log2 - xdec;
    let uv_sb_v_log2 = y_sb_log2 - ydec;

    // Shifts below RESTORATION_TILESIZE_MAX_LOG2: larger shift == smaller LRU.
    let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
      && fi.sequence.enable_restoration
    {
      assert!(
        fi.width > 1 && fi.height > 1,
        "Width and height must be higher than 1 for LRF setup"
      );

      // Specific content does affect optimal LRU size choice, but the
      // quantizer in use is a surprisingly strong selector.
      let lrf_base_shift = if fi.base_q_idx > 200 {
        0 // big
      } else if fi.base_q_idx > 160 {
        1
      } else {
        2 // small
      };
      let lrf_chroma_shift = if stripe_uv_decimate > 0 {
        // 4:2:0 only
        if lrf_base_shift == 2 {
          1 // smallest chroma LRU is a win at low quant
        } else {
          // Will a down-shifted chroma LRU eliminate stretch in chroma?
          // If so, that's generally a win.
          let lrf_unit_size =
            1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
          let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size
            <= lrf_unit_size / 2
            || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2;
          let shifted_stretch = ((fi.width >> xdec) - 1)
            % (lrf_unit_size >> 1)
            <= lrf_unit_size / 4
            || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1)
              <= lrf_unit_size / 4;
          // shift to eliminate stretch if needed,
          // otherwise do not shift and save the signaling bits
          usize::from(unshifted_stretch && !shifted_stretch)
        }
      } else {
        0
      };
      (lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
    } else {
      // Explicit request to tie LRU size to superblock size ==
      // smallest possible LRU size
      let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 };
      (lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
    };

    let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
    let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);

    let tiling = fi.sequence.tiling;
    // Right now we defer to tiling setup: don't choose an LRU size
    // large enough that a tile is not an integer number of LRUs
    // wide/high.
    if tiling.cols > 1 || tiling.rows > 1 {
      // despite suggestions to the contrary, tiles can be
      // non-powers-of-2.
      let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
      let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
      let tile_aligned_y_unit_size =
        1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
      let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros);
      let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros);
      y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
      uv_unit_size = uv_unit_size
        .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));

      // But it's actually worse: LRUs can't span tiles (in our
      // one-pass design that is, spec allows it).  However, the spec
      // mandates the last LRU stretches forward into any
      // less-than-half-LRU span of superblocks at the right and
      // bottom of a frame.  These superblocks may well be in a
      // different tile!  Even if LRUs are minimum size (one
      // superblock), when the right or bottom edge of the frame is a
      // superblock that's less than half the width/height of a normal
      // superblock, the LRU is forced by the spec to span into it
      // (and thus a different tile).  Tiling is under no such
      // restriction; it could decide the right/left sliver will be in
      // its own tile row/column.  We can't disallow the combination
      // here.  The tiling code will have to either prevent it or
      // tolerate it.  (prayer mechanic == Issue #1629).
    }

    // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
    // to be the same*. If they differ at this
    // point, it's due to a tiling restriction enforcing a maximum
    // size, so force both to the smaller value.
    //
    // *see sec 5.9.20, "Loop restoration params syntax".  The
    // bitstream provides means of coding a different UV LRU size only
    // when chroma is in use and both x and y are subsampled in the
    // chroma planes.
    if ydec == 0 && y_unit_size != uv_unit_size {
      y_unit_size = uv_unit_size.min(y_unit_size);
      uv_unit_size = y_unit_size;
    }

    // derive the rest
    let y_unit_log2 = y_unit_size.ilog() - 1;
    let uv_unit_log2 = uv_unit_size.ilog() - 1;
    // Unit counts round to nearest (half-unit offset) so the last unit
    // absorbs any less-than-half-unit remainder, and are at least 1.
    let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1);
    let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1);
    let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);
    let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);

    RestorationState {
      planes: [
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          y_unit_size,
          y_unit_log2 - y_sb_log2,
          y_unit_log2 - y_sb_log2,
          fi.sb_width,
          fi.sb_height,
          0,
          y_cols,
          y_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
      ],
    }
  }

  /// Applies the selected restoration filter (Wiener or self-guided) to
  /// `out` in place, stripe by stripe and restoration-unit by unit.
  /// `pre_cdef` supplies the deblocked (pre-CDEF) pixels needed at stripe
  /// boundaries; a clone of `out` taken before filtering stands in for the
  /// CDEF output so filtering reads unfiltered neighbors.
  #[profiling::function]
  pub fn lrf_filter_frame<T: Pixel>(
    &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
    fi: &FrameInvariants<T>,
  ) {
    let cdeffed = out.clone();
    let planes =
      if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES };

    // unlike the other loop filters that operate over the padded
    // frame dimensions, restoration filtering and source pixel
    // accesses are clipped to the original frame dimensions
    // that's why we use fi.width and fi.height instead of PlaneConfig fields

    // number of stripes (counted according to colocated Y luma position)
    let stripe_n = (fi.height + 7) / 64 + 1;

    // Buffers for the stripe filter.
    let mut stripe_filter_buffer =
      IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);

    for pli in 0..planes {
      let rp = &self.planes[pli];
      let xdec = out.planes[pli].cfg.xdec;
      let ydec = out.planes[pli].cfg.ydec;
      // Plane dimensions rounded for chroma subsampling.
      let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec;
      let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec;

      for si in 0..stripe_n {
        // The first stripe is 8 luma pixels shorter; later stripes start
        // 8 luma pixels above a multiple of 64 and may be clipped at the
        // bottom of the frame.
        let (stripe_start_y, stripe_size) = if si == 0 {
          (0, (64 - 8) >> ydec)
        } else {
          let start = (si * 64 - 8) >> ydec;
          (
            start as isize,
            // one past, unlike spec
            (64 >> ydec).min(crop_h - start),
          )
        };

        // horizontally, go rdu-by-rdu
        for rux in 0..rp.cfg.cols {
          // stripe x pixel locations must be clipped to frame, last may need to stretch
          let x = rux * rp.cfg.unit_size;
          let size =
            if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size };
          let ru = rp.restoration_unit_by_stripe(si, rux);
          match ru.filter {
            RestorationFilter::Wiener { coeffs } => {
              wiener_stripe_filter(
                coeffs,
                fi,
                crop_w,
                crop_h,
                size,
                stripe_size,
                x,
                stripe_start_y,
                &cdeffed.planes[pli],
                &pre_cdef.planes[pli],
                &mut out.planes[pli],
              );
            }
            RestorationFilter::Sgrproj { set, xqd } => {
              if !fi.sequence.enable_cdef {
                continue;
              }

              // Build the integral images over this stripe's source
              // pixels, then run the self-guided filter over the unit.
              setup_integral_image(
                &mut stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                crop_w - x,
                (crop_h as isize - stripe_start_y) as usize,
                size,
                stripe_size,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &pre_cdef.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
              );

              sgrproj_stripe_filter(
                set,
                xqd,
                fi,
                &stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &mut out.planes[pli].region_mut(Area::Rect {
                  x: x as isize,
                  y: stripe_start_y,
                  width: size,
                  height: stripe_size,
                }),
              );
            }
            RestorationFilter::None => {
              // do nothing
            }
          }
        }
      }
    }
  }
}