rav1e/
lrf.rs

1// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10cfg_if::cfg_if! {
11  if #[cfg(nasm_x86_64)] {
12    use crate::asm::x86::lrf::*;
13  } else {
14    use self::rust::*;
15  }
16}
17
18use crate::api::SGRComplexityLevel;
19use crate::color::ChromaSampling::Cs400;
20use crate::context::{MAX_PLANES, SB_SIZE};
21use crate::encoder::FrameInvariants;
22use crate::frame::{
23  AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
24};
25use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
26use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
27use std::cmp;
28use std::iter::FusedIterator;
29use std::ops::{Index, IndexMut};
30
31pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;
32
33pub const RESTORE_NONE: u8 = 0;
34pub const RESTORE_SWITCHABLE: u8 = 1;
35pub const RESTORE_WIENER: u8 = 2;
36pub const RESTORE_SGRPROJ: u8 = 3;
37
38pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
39pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
40pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
41#[allow(unused)]
42pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
43pub const WIENER_BITS: usize = 7;
44
45pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
46pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
47pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
48pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
49pub const SGRPROJ_PRJ_BITS: u8 = 7;
50pub const SGRPROJ_PARAMS_BITS: u8 = 4;
51pub const SGRPROJ_MTABLE_BITS: u8 = 20;
52pub const SGRPROJ_SGR_BITS: u8 = 8;
53pub const SGRPROJ_RECIP_BITS: u8 = 12;
54pub const SGRPROJ_RST_BITS: u8 = 4;
55pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
56  [140, 3236],
57  [112, 2158],
58  [93, 1618],
59  [80, 1438],
60  [70, 1295],
61  [58, 1177],
62  [47, 1079],
63  [37, 996],
64  [30, 925],
65  [25, 863],
66  [0, 2589],
67  [0, 1618],
68  [0, 1177],
69  [0, 925],
70  [56, 0],
71  [22, 0],
72];
73
74// List of indices to SGRPROJ_PARAMS_S values that at a given complexity level.
75// SGRPROJ_ALL_SETS contains every possible index
76const SGRPROJ_ALL_SETS: &[u8] =
77  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
78// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
79// most of the gains from sgr. The decision of which values to use is somewhat
80// arbitrary. The sgr parameters has 3 discontinuous groups. The first has both
81// parameters as non-zero. The other two are distinguishable by which of the
82// two parameters is zero. There are an even number of each of these groups and
83// the non-zero parameters grow as the indices increase. This array uses the
84// 1st, 3rd, ... smallest params of each group.
85const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];
86
87pub const fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
88  match complexity {
89    SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
90    SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
91  }
92}
93
94pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
95pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
96pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
97pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;
98
99pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
100  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
101pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
102pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
103pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;
104
105pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
106  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];
107
108/// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`.
109#[derive(Debug)]
110pub struct IntegralImageBuffer {
111  pub integral_image: Vec<u32>,
112  pub sq_integral_image: Vec<u32>,
113}
114
115impl IntegralImageBuffer {
116  /// Creates a new buffer with the given size, filled with zeros.
117  #[inline]
118  pub fn zeroed(size: usize) -> Self {
119    Self { integral_image: vec![0; size], sq_integral_image: vec![0; size] }
120  }
121}
122
123#[allow(unused)] // Wiener coming soon!
124#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
125pub enum RestorationFilter {
126  #[default]
127  None,
128  Wiener {
129    coeffs: [[i8; 3]; 2],
130  },
131  Sgrproj {
132    set: u8,
133    xqd: [i8; 2],
134  },
135}
136
137impl RestorationFilter {
138  pub const fn notequal(self, cmp: RestorationFilter) -> bool {
139    match self {
140      RestorationFilter::None {} => !matches!(cmp, RestorationFilter::None {}),
141      RestorationFilter::Sgrproj { set, xqd } => {
142        if let RestorationFilter::Sgrproj { set: set2, xqd: xqd2 } = cmp {
143          !(set == set2 && xqd[0] == xqd2[0] && xqd[1] == xqd2[1])
144        } else {
145          true
146        }
147      }
148      RestorationFilter::Wiener { coeffs } => {
149        if let RestorationFilter::Wiener { coeffs: coeffs2 } = cmp {
150          !(coeffs[0][0] == coeffs2[0][0]
151            && coeffs[0][1] == coeffs2[0][1]
152            && coeffs[0][2] == coeffs2[0][2]
153            && coeffs[1][0] == coeffs2[1][0]
154            && coeffs[1][1] == coeffs2[1][1]
155            && coeffs[1][2] == coeffs2[1][2])
156        } else {
157          true
158        }
159      }
160    }
161  }
162}
163
164pub(crate) mod rust {
165  use crate::cpu_features::CpuFeatureLevel;
166  use crate::frame::PlaneSlice;
167  use crate::lrf::{
168    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
169    SGRPROJ_SGR_BITS,
170  };
171  use crate::util::CastFromPrimitive;
172  use crate::Pixel;
173
174  #[inline(always)]
175  pub(crate) fn sgrproj_box_ab_internal<const BD: usize>(
176    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
177    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
178  ) {
179    let d: usize = r * 2 + 1;
180    let n: usize = d * d;
181    let one_over_n = if r == 1 { 455 } else { 164 };
182
183    assert!(iimg.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
184    assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + 1 + d);
185    assert!(af.len() > stripe_w + 1);
186    assert!(bf.len() > stripe_w + 1);
187
188    for x in start_x..stripe_w + 2 {
189      // SAFETY: We perform the bounds checks above, once for the whole loop
190      unsafe {
191        let sum = get_integral_square(iimg, iimg_stride, x, y, d);
192        let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
193        let (reta, retb) =
194          sgrproj_sum_finish::<BD>(ssq, sum, n as u32, one_over_n, s);
195        *af.get_unchecked_mut(x) = reta;
196        *bf.get_unchecked_mut(x) = retb;
197      }
198    }
199  }
200
201  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
202  pub(crate) fn sgrproj_box_ab_r1<const BD: usize>(
203    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
204    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
205    _cpu: CpuFeatureLevel,
206  ) {
207    sgrproj_box_ab_internal::<BD>(
208      1,
209      af,
210      bf,
211      iimg,
212      iimg_sq,
213      iimg_stride,
214      0,
215      y,
216      stripe_w,
217      s,
218    );
219  }
220
221  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
222  pub(crate) fn sgrproj_box_ab_r2<const BD: usize>(
223    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
224    iimg_stride: usize, y: usize, stripe_w: usize, s: u32,
225    _cpu: CpuFeatureLevel,
226  ) {
227    sgrproj_box_ab_internal::<BD>(
228      2,
229      af,
230      bf,
231      iimg,
232      iimg_sq,
233      iimg_stride,
234      0,
235      y,
236      stripe_w,
237      s,
238    );
239  }
240
241  pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
242    f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
243    _cpu: CpuFeatureLevel,
244  ) {
245    sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed);
246  }
247
248  #[inline(always)]
249  pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
250    f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
251  ) {
252    let line = cdeffed.row(y);
253    for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
254      *fp = u32::cast_from(v) << SGRPROJ_RST_BITS;
255    }
256  }
257
258  pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
259    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
260    cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
261  ) {
262    sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed);
263  }
264
265  #[inline(always)]
266  pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
267    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize,
268    y: usize, w: usize, cdeffed: &PlaneSlice<T>,
269  ) {
270    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
271    let line = cdeffed.row(y);
272    for x in start_x..w {
273      let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2])
274        + 4
275          * (af[1][x]
276            + af[0][x + 1]
277            + af[1][x + 1]
278            + af[2][x + 1]
279            + af[1][x + 2]);
280      let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2])
281        + 4
282          * (bf[1][x]
283            + bf[0][x + 1]
284            + bf[1][x + 1]
285            + bf[2][x + 1]
286            + bf[1][x + 2]);
287      let v = a * u32::cast_from(line[x]) + b;
288      f[x] = (v + (1 << shift >> 1)) >> shift;
289    }
290  }
291
292  pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
293    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
294    y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
295  ) {
296    sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed);
297  }
298
299  #[inline(always)]
300  pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
301    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
302    start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
303  ) {
304    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
305    let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
306    let line = cdeffed.row(y);
307    let line1 = cdeffed.row(y + 1);
308
309    let af0 = af[0][start_x..w + 3].windows(3);
310    let af1 = af[1][start_x..w + 3].windows(3);
311    let bf0 = bf[0][start_x..w + 3].windows(3);
312    let bf1 = bf[1][start_x..w + 3].windows(3);
313
314    let af_it = af0.zip(af1);
315    let bf_it = bf0.zip(bf1);
316
317    let in0 = line[start_x..w].iter();
318    let in1 = line1[start_x..w].iter();
319
320    let o0 = f0[start_x..w].iter_mut();
321    let o1 = f1[start_x..w].iter_mut();
322
323    let in_iter = in0.zip(in1);
324    let out_iter = o0.zip(o1);
325
326    let io_iter = out_iter.zip(in_iter);
327
328    for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
329      io_iter.zip(af_it.zip(bf_it))
330    {
331      let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1];
332      let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1];
333      let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1];
334      let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1];
335      let v = (a + ao) * u32::cast_from(p0) + b + bo;
336      *o0 = (v + (1 << shift >> 1)) >> shift;
337      let vo = ao * u32::cast_from(p1) + bo;
338      *o1 = (vo + (1 << shifto >> 1)) >> shifto;
339    }
340  }
341}
342
343#[inline(always)]
344fn sgrproj_sum_finish<const BD: usize>(
345  ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32,
346) -> (u32, u32) {
347  let bdm8 = BD - 8;
348  let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8);
349  let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8;
350  let p = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum);
351  let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS;
352  let a = if z >= 255 {
353    256
354  } else if z == 0 {
355    1
356  } else {
357    ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1)
358  };
359  let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
360  (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS)
361}
362
363// Using an integral image, compute the sum of a square region
364// SAFETY: The size of `iimg` must be at least `(y + size) * stride + x + size`
365#[inline(always)]
366unsafe fn get_integral_square(
367  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
368) -> u32 {
369  // Cancel out overflow in iimg by using wrapping arithmetic
370  let top_left = *iimg.get_unchecked(y * stride + x);
371  let top_right = *iimg.get_unchecked(y * stride + x + size);
372  let bottom_left = *iimg.get_unchecked((y + size) * stride + x);
373  let bottom_right = *iimg.get_unchecked((y + size) * stride + x + size);
374  top_left
375    .wrapping_add(bottom_right)
376    .wrapping_sub(bottom_left)
377    .wrapping_sub(top_right)
378}
379
380struct VertPaddedIter<'a, T: Pixel> {
381  // The two sources that can be selected when clipping
382  deblocked: &'a Plane<T>,
383  cdeffed: &'a Plane<T>,
384  // x index to choice where on the row to start
385  x: isize,
386  // y index that will be mutated
387  y: isize,
388  // The index at which to terminate. Can be larger than the slice length.
389  end: isize,
390  // Used for source buffer choice/clipping. May (and regularly will)
391  // be negative.
392  stripe_begin: isize,
393  // Also used for source buffer choice/clipping. May specify a stripe boundary
394  // less than, equal to, or larger than the buffers we're accessing.
395  stripe_end: isize,
396  // Active area cropping is done by specifying a value smaller than the height
397  // of the plane.
398  crop: isize,
399}
400
401impl<'a, T: Pixel> VertPaddedIter<'a, T> {
402  fn new(
403    cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
404    stripe_h: usize, crop: usize,
405  ) -> VertPaddedIter<'a, T> {
406    // cdeffed and deblocked must start at the same coordinates from their
407    // underlying planes. Since cropping is provided via a separate params, the
408    // height of the underlying planes do not need to match.
409    assert_eq!(cdeffed.x, deblocked.x);
410    assert_eq!(cdeffed.y, deblocked.y);
411
412    // To share integral images, always use the max box filter radius of 2
413    let r = 2;
414
415    // The number of rows outside the stripe are needed
416    let rows_above = r + 2;
417    let rows_below = 2;
418
419    // Offset crop and stripe_h so they are relative to the underlying plane
420    // and not the plane slice.
421    let crop = crop as isize + deblocked.y;
422    let stripe_end = stripe_h as isize + deblocked.y;
423
424    // Move y up the number rows above.
425    // If y is negative we repeat the first row
426    let y = deblocked.y - rows_above as isize;
427
428    VertPaddedIter {
429      deblocked: deblocked.plane,
430      cdeffed: cdeffed.plane,
431      x: deblocked.x,
432      y,
433      end: (rows_above + stripe_h + rows_below) as isize + y,
434      stripe_begin: deblocked.y,
435      stripe_end,
436      crop,
437    }
438  }
439}
440
441impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
442  type Item = &'a [T];
443
444  #[inline(always)]
445  fn next(&mut self) -> Option<Self::Item> {
446    if self.end > self.y {
447      // clamp before deciding the source
448      // clamp vertically to storage at top and passed-in height at bottom
449      let cropped_y = clamp(self.y, 0, self.crop - 1);
450      // clamp vertically to stripe limits
451      let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1);
452
453      // decide if we're vertically inside or outside the strip
454      let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end {
455        self.cdeffed
456      } else {
457        self.deblocked
458      };
459      // cannot directly return self.ps.row(row) due to lifetime issue
460      let range = src_plane.row_range(self.x, ly);
461      self.y += 1;
462      Some(&src_plane.data[range])
463    } else {
464      None
465    }
466  }
467
468  fn size_hint(&self) -> (usize, Option<usize>) {
469    let remaining = self.end - self.y;
470    debug_assert!(remaining >= 0);
471    let remaining = remaining as usize;
472
473    (remaining, Some(remaining))
474  }
475}
476
477impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
478impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}
479
480struct HorzPaddedIter<'a, T: Pixel> {
481  // Active area cropping is done using the length of the slice
482  slice: &'a [T],
483  // x index of the iterator
484  // When less than 0, repeat the first element. When greater than end, repeat
485  // the last element
486  index: isize,
487  // The index at which to terminate. Can be larger than the slice length.
488  end: usize,
489}
490
491impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
492  fn new(
493    slice: &'a [T], start_index: isize, width: usize,
494  ) -> HorzPaddedIter<'a, T> {
495    HorzPaddedIter {
496      slice,
497      index: start_index,
498      end: (width as isize + start_index) as usize,
499    }
500  }
501}
502
503impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
504  type Item = &'a T;
505
506  #[inline(always)]
507  fn next(&mut self) -> Option<Self::Item> {
508    if self.index < self.end as isize {
509      // clamp to the edges of the frame
510      let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize;
511      self.index += 1;
512      Some(&self.slice[x])
513    } else {
514      None
515    }
516  }
517
518  #[inline(always)]
519  fn size_hint(&self) -> (usize, Option<usize>) {
520    let size: usize = (self.end as isize - self.index) as usize;
521    (size, Some(size))
522  }
523}
524
525impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
526impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
527
528#[profiling::function]
529pub fn setup_integral_image<T: Pixel>(
530  integral_image_buffer: &mut IntegralImageBuffer,
531  integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
532  stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
533) {
534  let integral_image = &mut integral_image_buffer.integral_image;
535  let sq_integral_image = &mut integral_image_buffer.sq_integral_image;
536
537  // Number of elements outside the stripe
538  let left_w = 4; // max radius of 2 + 2 padding
539  let right_w = 3; // max radius of 2 + 1 padding
540
541  assert_eq!(cdeffed.x, deblocked.x);
542
543  // Find how many unique elements to use to the left and right
544  let left_uniques = if cdeffed.x == 0 { 0 } else { left_w };
545  let right_uniques = right_w.min(crop_w - stripe_w);
546
547  // Find the total number of unique elements used
548  let row_uniques = left_uniques + stripe_w + right_uniques;
549
550  // Negative start indices result in repeating the first element of the row
551  let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 };
552
553  let mut rows_iter = VertPaddedIter::new(
554    // Move left to encompass all the used data
555    &cdeffed.go_left(left_uniques),
556    &deblocked.go_left(left_uniques),
557    // since r2 uses every other row, we need an extra row if stripe_h is odd
558    stripe_h + (stripe_h & 1),
559    crop_h,
560  )
561  .map(|row: &[T]| {
562    HorzPaddedIter::new(
563      // Limit how many unique elements we use
564      &row[..row_uniques],
565      start_index_x,
566      left_w + stripe_w + right_w,
567    )
568  });
569
570  // Setup the first row
571  {
572    let mut sum: u32 = 0;
573    let mut sq_sum: u32 = 0;
574    // Remove the first row and use it outside of the main loop
575    let row = rows_iter.next().unwrap();
576    for (src, (integral, sq_integral)) in
577      row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
578    {
579      let current = u32::cast_from(*src);
580
581      // Wrap adds to prevent undefined behaviour on overflow. Overflow is
582      // cancelled out when calculating the sum of a region.
583      sum = sum.wrapping_add(current);
584      *integral = sum;
585      sq_sum = sq_sum.wrapping_add(current * current);
586      *sq_integral = sq_sum;
587    }
588  }
589  // Calculate all other rows
590  let mut integral_slice = &mut integral_image[..];
591  let mut sq_integral_slice = &mut sq_integral_image[..];
592  for row in rows_iter {
593    let mut sum: u32 = 0;
594    let mut sq_sum: u32 = 0;
595
596    // Split the data between the previous row and future rows.
597    // This allows us to mutate the current row while accessing the
598    // previous row.
599    let (integral_row_prev, integral_row) =
600      integral_slice.split_at_mut(integral_image_stride);
601    let (sq_integral_row_prev, sq_integral_row) =
602      sq_integral_slice.split_at_mut(integral_image_stride);
603    for (
604      src,
605      ((integral_above, sq_integral_above), (integral, sq_integral)),
606    ) in row.zip(
607      integral_row_prev
608        .iter()
609        .zip(sq_integral_row_prev.iter())
610        .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
611    ) {
612      let current = u32::cast_from(*src);
613      // Wrap adds to prevent undefined behaviour on overflow. Overflow is
614      // cancelled out when calculating the sum of a region.
615      sum = sum.wrapping_add(current);
616      *integral = sum.wrapping_add(*integral_above);
617      sq_sum = sq_sum.wrapping_add(current * current);
618      *sq_integral = sq_sum.wrapping_add(*sq_integral_above);
619    }
620
621    // The current row also contains all future rows. Replacing the slice with
622    // it moves down a row.
623    integral_slice = integral_row;
624    sq_integral_slice = sq_integral_row;
625  }
626}
627
628#[profiling::function]
629pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
630  set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
631  integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
632  cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
633) {
634  let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
635  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
636    [[0; IMAGE_WIDTH_MAX + 2]; 2];
637  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
638    [[0; IMAGE_WIDTH_MAX + 2]; 2];
639  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
640  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
641  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
642    [[0; IMAGE_WIDTH_MAX + 2]; 3];
643  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
644    [[0; IMAGE_WIDTH_MAX + 2]; 3];
645  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
646
647  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
648  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
649
650  let fn_ab_r1 = match fi.sequence.bit_depth {
651    8 => sgrproj_box_ab_r1::<8>,
652    10 => sgrproj_box_ab_r1::<10>,
653    12 => sgrproj_box_ab_r1::<12>,
654    _ => unimplemented!(),
655  };
656  let fn_ab_r2 = match fi.sequence.bit_depth {
657    8 => sgrproj_box_ab_r2::<8>,
658    10 => sgrproj_box_ab_r2::<10>,
659    12 => sgrproj_box_ab_r2::<12>,
660    _ => unimplemented!(),
661  };
662
663  /* prime the intermediate arrays */
664  // One oddness about the radius=2 intermediate array computations that
665  // the spec doesn't make clear: Although the spec defines computation
666  // of every row (of a, b and f), only half of the rows (every-other
667  // row) are actually used.
668  let integral_image = &integral_image_buffer.integral_image;
669  let sq_integral_image = &integral_image_buffer.sq_integral_image;
670  if s_r2 > 0 {
671    fn_ab_r2(
672      &mut a_r2[0],
673      &mut b_r2[0],
674      integral_image,
675      sq_integral_image,
676      integral_image_stride,
677      0,
678      stripe_w,
679      s_r2,
680      fi.cpu_feature_level,
681    );
682  }
683  if s_r1 > 0 {
684    let integral_image_offset = integral_image_stride + 1;
685    fn_ab_r1(
686      &mut a_r1[0],
687      &mut b_r1[0],
688      &integral_image[integral_image_offset..],
689      &sq_integral_image[integral_image_offset..],
690      integral_image_stride,
691      0,
692      stripe_w,
693      s_r1,
694      fi.cpu_feature_level,
695    );
696    fn_ab_r1(
697      &mut a_r1[1],
698      &mut b_r1[1],
699      &integral_image[integral_image_offset..],
700      &sq_integral_image[integral_image_offset..],
701      integral_image_stride,
702      1,
703      stripe_w,
704      s_r1,
705      fi.cpu_feature_level,
706    );
707  }
708
709  /* iterate by row */
710  // Increment by two to handle the use of even rows by r=2 and run a nested
711  //  loop to handle increments of one.
712  for y in (0..stripe_h).step_by(2) {
713    // get results to use y and y+1
714    let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
715      fn_ab_r2(
716        &mut a_r2[(y / 2 + 1) % 2],
717        &mut b_r2[(y / 2 + 1) % 2],
718        integral_image,
719        sq_integral_image,
720        integral_image_stride,
721        y + 2,
722        stripe_w,
723        s_r2,
724        fi.cpu_feature_level,
725      );
726      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
727      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
728      sgrproj_box_f_r2(
729        &ap0,
730        &bp0,
731        &mut f_r2_0,
732        &mut f_r2_1,
733        y,
734        stripe_w,
735        cdeffed,
736        fi.cpu_feature_level,
737      );
738      [&f_r2_0, &f_r2_1]
739    } else {
740      sgrproj_box_f_r0(
741        &mut f_r2_0,
742        y,
743        stripe_w,
744        cdeffed,
745        fi.cpu_feature_level,
746      );
747      // share results for both rows
748      [&f_r2_0, &f_r2_0]
749    };
750    for dy in 0..(2.min(stripe_h - y)) {
751      let y = y + dy;
752      if s_r1 > 0 {
753        let integral_image_offset = integral_image_stride + 1;
754        fn_ab_r1(
755          &mut a_r1[(y + 2) % 3],
756          &mut b_r1[(y + 2) % 3],
757          &integral_image[integral_image_offset..],
758          &sq_integral_image[integral_image_offset..],
759          integral_image_stride,
760          y + 2,
761          stripe_w,
762          s_r1,
763          fi.cpu_feature_level,
764        );
765        let ap1: [&[u32]; 3] =
766          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
767        let bp1: [&[u32]; 3] =
768          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
769        sgrproj_box_f_r1(
770          &ap1,
771          &bp1,
772          &mut f_r1,
773          y,
774          stripe_w,
775          cdeffed,
776          fi.cpu_feature_level,
777        );
778      } else {
779        sgrproj_box_f_r0(
780          &mut f_r1,
781          y,
782          stripe_w,
783          cdeffed,
784          fi.cpu_feature_level,
785        );
786      }
787
788      /* apply filter */
789      let w0 = xqd[0] as i32;
790      let w1 = xqd[1] as i32;
791      let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1;
792
793      let line = &cdeffed[y];
794
795      #[inline(always)]
796      fn apply_filter<U: Pixel>(
797        out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
798        stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
799      ) {
800        let line_it = line[..stripe_w].iter();
801        let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
802        let f_r1_it = f_r1[..stripe_w].iter();
803        let out_it = out[..stripe_w].iter_mut();
804
805        for ((o, &u), (&f_r2_ab, &f_r1)) in
806          out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
807        {
808          let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
809          let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
810          let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
811            >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
812          *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
813        }
814      }
815
816      apply_filter(
817        &mut out[y],
818        line,
819        &f_r1,
820        f_r2_ab[dy],
821        stripe_w,
822        fi.sequence.bit_depth,
823        w0,
824        w1,
825        w2,
826      );
827    }
828  }
829}
830
831// Frame inputs below aren't all equal, and will change as work
832// continues.  There's no deblocked reconstruction available at this
833// point of RDO, so we use the non-deblocked reconstruction, cdef and
834// input.  The input can be a full-sized frame. Cdef input is a partial
835// frame constructed specifically for RDO.
836
837// For simplicity, this ignores stripe segmentation (it's possible the
838// extra complexity isn't worth it and we'll ignore stripes
839// permanently during RDO, but that's not been tested yet). Data
840// access inside the cdef frame is monolithic and clipped to the cdef
841// borders.
842
843// Input params follow the same rules as sgrproj_stripe_filter.
844// Inputs are relative to the colocated slice views.
845#[profiling::function]
846pub fn sgrproj_solve<T: Pixel>(
847  set: u8, fi: &FrameInvariants<T>,
848  integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
849  cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
850) -> (i8, i8) {
851  let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
852    [[0; IMAGE_WIDTH_MAX + 2]; 2];
853  let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
854    [[0; IMAGE_WIDTH_MAX + 2]; 2];
855  let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
856  let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
857  let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
858    [[0; IMAGE_WIDTH_MAX + 2]; 3];
859  let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
860    [[0; IMAGE_WIDTH_MAX + 2]; 3];
861  let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
862
863  let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
864  let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
865
866  let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
867  let mut c: [f64; 2] = [0., 0.];
868
869  let fn_ab_r1 = match fi.sequence.bit_depth {
870    8 => sgrproj_box_ab_r1::<8>,
871    10 => sgrproj_box_ab_r1::<10>,
872    12 => sgrproj_box_ab_r1::<12>,
873    _ => unimplemented!(),
874  };
875  let fn_ab_r2 = match fi.sequence.bit_depth {
876    8 => sgrproj_box_ab_r2::<8>,
877    10 => sgrproj_box_ab_r2::<10>,
878    12 => sgrproj_box_ab_r2::<12>,
879    _ => unimplemented!(),
880  };
881
882  /* prime the intermediate arrays */
883  // One oddness about the radius=2 intermediate array computations that
884  // the spec doesn't make clear: Although the spec defines computation
885  // of every row (of a, b and f), only half of the rows (every-other
886  // row) are actually used.
887  let integral_image = &integral_image_buffer.integral_image;
888  let sq_integral_image = &integral_image_buffer.sq_integral_image;
889  if s_r2 > 0 {
890    fn_ab_r2(
891      &mut a_r2[0],
892      &mut b_r2[0],
893      integral_image,
894      sq_integral_image,
895      SOLVE_IMAGE_STRIDE,
896      0,
897      cdef_w,
898      s_r2,
899      fi.cpu_feature_level,
900    );
901  }
902  if s_r1 > 0 {
903    let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
904    fn_ab_r1(
905      &mut a_r1[0],
906      &mut b_r1[0],
907      &integral_image[integral_image_offset..],
908      &sq_integral_image[integral_image_offset..],
909      SOLVE_IMAGE_STRIDE,
910      0,
911      cdef_w,
912      s_r1,
913      fi.cpu_feature_level,
914    );
915    fn_ab_r1(
916      &mut a_r1[1],
917      &mut b_r1[1],
918      &integral_image[integral_image_offset..],
919      &sq_integral_image[integral_image_offset..],
920      SOLVE_IMAGE_STRIDE,
921      1,
922      cdef_w,
923      s_r1,
924      fi.cpu_feature_level,
925    );
926  }
927
928  /* iterate by row */
929  // Increment by two to handle the use of even rows by r=2 and run a nested
930  //  loop to handle increments of one.
931  for y in (0..cdef_h).step_by(2) {
932    // get results to use y and y+1
933    let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
934      fn_ab_r2(
935        &mut a_r2[(y / 2 + 1) % 2],
936        &mut b_r2[(y / 2 + 1) % 2],
937        integral_image,
938        sq_integral_image,
939        SOLVE_IMAGE_STRIDE,
940        y + 2,
941        cdef_w,
942        s_r2,
943        fi.cpu_feature_level,
944      );
945      let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
946      let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
947      sgrproj_box_f_r2(
948        &ap0,
949        &bp0,
950        &mut f_r2_0,
951        &mut f_r2_1,
952        y,
953        cdef_w,
954        cdeffed,
955        fi.cpu_feature_level,
956      );
957      [&f_r2_0, &f_r2_1]
958    } else {
959      sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
960      // share results for both rows
961      [&f_r2_0, &f_r2_0]
962    };
963    for dy in 0..(2.min(cdef_h - y)) {
964      let y = y + dy;
965      if s_r1 > 0 {
966        let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
967        fn_ab_r1(
968          &mut a_r1[(y + 2) % 3],
969          &mut b_r1[(y + 2) % 3],
970          &integral_image[integral_image_offset..],
971          &sq_integral_image[integral_image_offset..],
972          SOLVE_IMAGE_STRIDE,
973          y + 2,
974          cdef_w,
975          s_r1,
976          fi.cpu_feature_level,
977        );
978        let ap1: [&[u32]; 3] =
979          [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
980        let bp1: [&[u32]; 3] =
981          [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
982        sgrproj_box_f_r1(
983          &ap1,
984          &bp1,
985          &mut f_r1,
986          y,
987          cdef_w,
988          cdeffed,
989          fi.cpu_feature_level,
990        );
991      } else {
992        sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
993      }
994
995      #[inline(always)]
996      fn process_line<T: Pixel>(
997        h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
998        f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
999      ) {
1000        let cdeffed_it = cdeffed[..cdef_w].iter();
1001        let input_it = input[..cdef_w].iter();
1002        let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
1003        let f_r1_it = f_r1[..cdef_w].iter();
1004
1005        #[derive(Debug, Copy, Clone)]
1006        struct Sums {
1007          h: [[i64; 2]; 2],
1008          c: [i64; 2],
1009        }
1010
1011        let sums: Sums = cdeffed_it
1012          .zip(input_it)
1013          .zip(f_r2_ab_it.zip(f_r1_it))
1014          .map(|((&u, &i), (&f2, &f1))| {
1015            let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
1016            let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
1017            let f2 = f2 as i32 - u;
1018            let f1 = f1 as i32 - u;
1019            (s as i64, f1 as i64, f2 as i64)
1020          })
1021          .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
1022            let mut ret: Sums = sums;
1023            ret.h[0][0] += f2 * f2;
1024            ret.h[1][1] += f1 * f1;
1025            ret.h[0][1] += f1 * f2;
1026            ret.c[0] += f2 * s;
1027            ret.c[1] += f1 * s;
1028            ret
1029          });
1030
1031        h[0][0] += sums.h[0][0] as f64;
1032        h[1][1] += sums.h[1][1] as f64;
1033        h[0][1] += sums.h[0][1] as f64;
1034        c[0] += sums.c[0] as f64;
1035        c[1] += sums.c[1] as f64;
1036      }
1037
1038      process_line(
1039        &mut h,
1040        &mut c,
1041        &cdeffed[y],
1042        &input[y],
1043        &f_r1,
1044        f_r2_01[dy],
1045        cdef_w,
1046      );
1047    }
1048  }
1049
1050  // this is lifted almost in-tact from libaom
1051  let n = cdef_w as f64 * cdef_h as f64;
1052  h[0][0] /= n;
1053  h[0][1] /= n;
1054  h[1][1] /= n;
1055  h[1][0] = h[0][1];
1056  c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1057  c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1058  let (xq0, xq1) = if s_r2 == 0 {
1059    // H matrix is now only the scalar h[1][1]
1060    // C vector is now only the scalar c[1]
1061    if h[1][1] == 0. {
1062      (0, 0)
1063    } else {
1064      (0, (c[1] / h[1][1]).round() as i32)
1065    }
1066  } else if s_r1 == 0 {
1067    // H matrix is now only the scalar h[0][0]
1068    // C vector is now only the scalar c[0]
1069    if h[0][0] == 0. {
1070      (0, 0)
1071    } else {
1072      ((c[0] / h[0][0]).round() as i32, 0)
1073    }
1074  } else {
1075    let det = h[0][0].mul_add(h[1][1], -h[0][1] * h[1][0]);
1076    if det == 0. {
1077      (0, 0)
1078    } else {
1079      // If scaling up dividend would overflow, instead scale down the divisor
1080      let div1 = h[1][1].mul_add(c[0], -h[0][1] * c[1]);
1081      let div2 = h[0][0].mul_add(c[1], -h[1][0] * c[0]);
1082      ((div1 / det).round() as i32, (div2 / det).round() as i32)
1083    }
1084  };
1085  {
1086    let xqd0 =
1087      clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32);
1088    let xqd1 = clamp(
1089      (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
1090      SGRPROJ_XQD_MIN[1] as i32,
1091      SGRPROJ_XQD_MAX[1] as i32,
1092    );
1093    (xqd0 as i8, xqd1 as i8)
1094  }
1095}
1096
1097#[profiling::function]
1098fn wiener_stripe_filter<T: Pixel>(
1099  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
1100  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
1101  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
1102) {
1103  let bit_depth = fi.sequence.bit_depth;
1104  let round_h = if bit_depth == 12 { 5 } else { 3 };
1105  let round_v = if bit_depth == 12 { 9 } else { 11 };
1106  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
1107  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;
1108
1109  let mut coeffs_ = [[0; 3]; 2];
1110  for i in 0..2 {
1111    for j in 0..3 {
1112      coeffs_[i][j] = i32::from(coeffs[i][j]);
1113    }
1114  }
1115
1116  let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7];
1117  let vfilter: [i32; 7] = [
1118    coeffs_[0][0],
1119    coeffs_[0][1],
1120    coeffs_[0][2],
1121    128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]),
1122    coeffs_[0][2],
1123    coeffs_[0][1],
1124    coeffs_[0][0],
1125  ];
1126  let hfilter: [i32; 7] = [
1127    coeffs_[1][0],
1128    coeffs_[1][1],
1129    coeffs_[1][2],
1130    128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]),
1131    coeffs_[1][2],
1132    coeffs_[1][1],
1133    coeffs_[1][0],
1134  ];
1135
1136  // unlike x, our y can be negative to start as the first stripe
1137  // starts off the top of the frame by 8 pixels, and can also run off the end of the frame
1138  let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize;
1139  let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize;
1140  let end_i = cmp::max(
1141    0,
1142    if stripe_h as isize + stripe_y > crop_h as isize {
1143      crop_h as isize - stripe_y - start_wi as isize
1144    } else {
1145      stripe_h as isize - start_wi as isize
1146    },
1147  ) as usize;
1148
1149  let mut out_slice =
1150    out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize });
1151
1152  for xi in stripe_x..stripe_x + stripe_w {
1153    let n = cmp::min(7, crop_w as isize + 3 - xi as isize);
1154    for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 {
1155      let mut acc = 0;
1156      let src = if yi < stripe_y {
1157        let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2);
1158        deblocked.row(ly)
1159      } else if yi < stripe_y + stripe_h as isize {
1160        let ly = clamp(yi, 0, crop_h as isize - 1);
1161        cdeffed.row(ly)
1162      } else {
1163        let ly = cmp::min(
1164          clamp(yi, 0, crop_h as isize - 1),
1165          stripe_y + stripe_h as isize + 1,
1166        );
1167        deblocked.row(ly)
1168      };
1169      let start = i32::cast_from(src[0]);
1170      let end = i32::cast_from(src[crop_w - 1]);
1171      for i in 0..3 - xi as isize {
1172        acc += hfilter[i as usize] * start;
1173      }
1174
1175      let off = 3 - (xi as isize);
1176      let s = cmp::max(0, off) as usize;
1177      let s1 = (s as isize - off) as usize;
1178      let n1 = (n - off) as usize;
1179
1180      for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
1181        acc += hf * i32::cast_from(v);
1182      }
1183
1184      for i in n..7 {
1185        acc += hfilter[i as usize] * end;
1186      }
1187
1188      acc = (acc + (1 << round_h >> 1)) >> round_h;
1189      work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset);
1190    }
1191
1192    for (wi, dst) in (start_wi..start_wi + end_i)
1193      .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i))
1194    {
1195      let mut acc = 0;
1196      for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) {
1197        acc += vfilter[i] * *src;
1198      }
1199      *dst = T::cast_from(clamp(
1200        (acc + (1 << round_v >> 1)) >> round_v,
1201        0,
1202        (1 << bit_depth) - 1,
1203      ));
1204    }
1205  }
1206}
1207
1208#[derive(Copy, Clone, Debug, Default)]
1209pub struct RestorationUnit {
1210  pub filter: RestorationFilter,
1211}
1212
1213#[derive(Clone, Debug)]
1214pub struct FrameRestorationUnits {
1215  units: Box<[RestorationUnit]>,
1216  pub cols: usize,
1217  pub rows: usize,
1218}
1219
1220impl FrameRestorationUnits {
1221  pub fn new(cols: usize, rows: usize) -> Self {
1222    Self {
1223      units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
1224      cols,
1225      rows,
1226    }
1227  }
1228}
1229
1230impl Index<usize> for FrameRestorationUnits {
1231  type Output = [RestorationUnit];
1232  #[inline(always)]
1233  fn index(&self, index: usize) -> &Self::Output {
1234    &self.units[index * self.cols..(index + 1) * self.cols]
1235  }
1236}
1237
1238impl IndexMut<usize> for FrameRestorationUnits {
1239  #[inline(always)]
1240  fn index_mut(&mut self, index: usize) -> &mut Self::Output {
1241    &mut self.units[index * self.cols..(index + 1) * self.cols]
1242  }
1243}
1244
1245#[derive(Clone, Debug)]
1246pub struct RestorationPlaneConfig {
1247  pub lrf_type: u8,
1248  pub unit_size: usize,
1249  // (1 << sb_x_shift) gives the number of superblocks horizontally or
1250  // vertically in a restoration unit, not accounting for RU stretching
1251  pub sb_h_shift: usize,
1252  pub sb_v_shift: usize,
1253  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
1254  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
1255  // stripe height is 64 in all cases except 4:2:0 chroma planes where
1256  // it is 32.  This is independent of all other setup parameters
1257  pub stripe_height: usize,
1258  pub cols: usize,
1259  pub rows: usize,
1260}
1261
1262#[derive(Clone, Debug)]
1263pub struct RestorationPlane {
1264  pub cfg: RestorationPlaneConfig,
1265  pub units: FrameRestorationUnits,
1266}
1267
1268#[derive(Clone, Default)]
1269pub struct RestorationPlaneOffset {
1270  pub row: usize,
1271  pub col: usize,
1272}
1273
1274impl RestorationPlane {
1275  pub fn new(
1276    lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
1277    sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
1278    rows: usize,
1279  ) -> RestorationPlane {
1280    let stripe_height = if stripe_decimate != 0 { 32 } else { 64 };
1281    RestorationPlane {
1282      cfg: RestorationPlaneConfig {
1283        lrf_type,
1284        unit_size,
1285        sb_h_shift,
1286        sb_v_shift,
1287        sb_cols,
1288        sb_rows,
1289        stripe_height,
1290        cols,
1291        rows,
1292      },
1293      units: FrameRestorationUnits::new(cols, rows),
1294    }
1295  }
1296
1297  // Stripes are always 64 pixels high in a non-subsampled
1298  // frame, and decimated from 64 pixels in chroma.  When
1299  // filtering, they are not co-located on Y with superblocks.
1300  fn restoration_unit_index_by_stripe(
1301    &self, stripenum: usize, rux: usize,
1302  ) -> (usize, usize) {
1303    (
1304      cmp::min(rux, self.cfg.cols - 1),
1305      cmp::min(
1306        stripenum * self.cfg.stripe_height / self.cfg.unit_size,
1307        self.cfg.rows - 1,
1308      ),
1309    )
1310  }
1311
1312  pub fn restoration_unit_by_stripe(
1313    &self, stripenum: usize, rux: usize,
1314  ) -> &RestorationUnit {
1315    let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
1316    &self.units[y][x]
1317  }
1318}
1319
1320#[derive(Clone, Debug)]
1321pub struct RestorationState {
1322  pub planes: [RestorationPlane; MAX_PLANES],
1323}
1324
1325impl RestorationState {
1326  pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
1327    let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg;
1328    // stripe size is decimated in 4:2:0 (and only 4:2:0)
1329    let stripe_uv_decimate = usize::from(xdec > 0 && ydec > 0);
1330    let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 };
1331    let uv_sb_h_log2 = y_sb_log2 - xdec;
1332    let uv_sb_v_log2 = y_sb_log2 - ydec;
1333
1334    let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
1335      && fi.sequence.enable_restoration
1336    {
1337      assert!(
1338        fi.width > 1 && fi.height > 1,
1339        "Width and height must be higher than 1 for LRF setup"
1340      );
1341
1342      // Specific content does affect optimal LRU size choice, but the
1343      // quantizer in use is a surprisingly strong selector.
1344      let lrf_base_shift = if fi.base_q_idx > 200 {
1345        0 // big
1346      } else if fi.base_q_idx > 160 {
1347        1
1348      } else {
1349        2 // small
1350      };
1351      let lrf_chroma_shift = if stripe_uv_decimate > 0 {
1352        // 4:2:0 only
1353        if lrf_base_shift == 2 {
1354          1 // smallest chroma LRU is a win at low quant
1355        } else {
1356          // Will a down-shifted chroma LRU eliminate stretch in chroma?
1357          // If so, that's generally a win.
1358          let lrf_unit_size =
1359            1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
1360          let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size
1361            <= lrf_unit_size / 2
1362            || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2;
1363          let shifted_stretch = ((fi.width >> xdec) - 1)
1364            % (lrf_unit_size >> 1)
1365            <= lrf_unit_size / 4
1366            || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1)
1367              <= lrf_unit_size / 4;
1368          // shift to eliminate stretch if needed,
1369          // otherwise do not shift and save the signaling bits
1370          usize::from(unshifted_stretch && !shifted_stretch)
1371        }
1372      } else {
1373        0
1374      };
1375      (lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
1376    } else {
1377      // Explicit request to tie LRU size to superblock size ==
1378      // smallest possible LRU size
1379      let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 };
1380      (lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
1381    };
1382
1383    let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
1384    let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);
1385
1386    let tiling = fi.sequence.tiling;
1387    // Right now we defer to tiling setup: don't choose an LRU size
1388    // large enough that a tile is not an integer number of LRUs
1389    // wide/high.
1390    if tiling.cols > 1 || tiling.rows > 1 {
1391      // despite suggestions to the contrary, tiles can be
1392      // non-powers-of-2.
1393      let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
1394      let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
1395      let tile_aligned_y_unit_size =
1396        1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
1397      let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros);
1398      let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros);
1399      y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
1400      uv_unit_size = uv_unit_size
1401        .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));
1402
1403      // But it's actually worse: LRUs can't span tiles (in our
1404      // one-pass design that is, spec allows it).  However, the spec
1405      // mandates the last LRU stretches forward into any
1406      // less-than-half-LRU span of superblocks at the right and
1407      // bottom of a frame.  These superblocks may well be in a
1408      // different tile!  Even if LRUs are minimum size (one
1409      // superblock), when the right or bottom edge of the frame is a
1410      // superblock that's less than half the width/height of a normal
1411      // superblock, the LRU is forced by the spec to span into it
1412      // (and thus a different tile).  Tiling is under no such
1413      // restriction; it could decide the right/left sliver will be in
1414      // its own tile row/column.  We can't disallow the combination
1415      // here.  The tiling code will have to either prevent it or
1416      // tolerate it.  (prayer mechanic == Issue #1629).
1417    }
1418
1419    // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
1420    // to be the same*. If they differ at this
1421    // point, it's due to a tiling restriction enforcing a maximum
1422    // size, so force both to the smaller value.
1423    //
1424    // *see sec 5.9.20, "Loop restoration params syntax".  The
1425    // bitstream provides means of coding a different UV LRU size only
1426    // when chroma is in use and both x and y are subsampled in the
1427    // chroma planes.
1428    if ydec == 0 && y_unit_size != uv_unit_size {
1429      y_unit_size = uv_unit_size.min(y_unit_size);
1430      uv_unit_size = y_unit_size;
1431    }
1432
1433    // derive the rest
1434    let y_unit_log2 = y_unit_size.ilog() - 1;
1435    let uv_unit_log2 = uv_unit_size.ilog() - 1;
1436    let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1);
1437    let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1);
1438    let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec)
1439      + (uv_unit_size >> 1))
1440      / uv_unit_size)
1441      .max(1);
1442    let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec)
1443      + (uv_unit_size >> 1))
1444      / uv_unit_size)
1445      .max(1);
1446
1447    RestorationState {
1448      planes: [
1449        RestorationPlane::new(
1450          RESTORE_SWITCHABLE,
1451          y_unit_size,
1452          y_unit_log2 - y_sb_log2,
1453          y_unit_log2 - y_sb_log2,
1454          fi.sb_width,
1455          fi.sb_height,
1456          0,
1457          y_cols,
1458          y_rows,
1459        ),
1460        RestorationPlane::new(
1461          RESTORE_SWITCHABLE,
1462          uv_unit_size,
1463          uv_unit_log2 - uv_sb_h_log2,
1464          uv_unit_log2 - uv_sb_v_log2,
1465          fi.sb_width,
1466          fi.sb_height,
1467          stripe_uv_decimate,
1468          uv_cols,
1469          uv_rows,
1470        ),
1471        RestorationPlane::new(
1472          RESTORE_SWITCHABLE,
1473          uv_unit_size,
1474          uv_unit_log2 - uv_sb_h_log2,
1475          uv_unit_log2 - uv_sb_v_log2,
1476          fi.sb_width,
1477          fi.sb_height,
1478          stripe_uv_decimate,
1479          uv_cols,
1480          uv_rows,
1481        ),
1482      ],
1483    }
1484  }
1485
1486  #[profiling::function]
1487  pub fn lrf_filter_frame<T: Pixel>(
1488    &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
1489    fi: &FrameInvariants<T>,
1490  ) {
1491    let cdeffed = out.clone();
1492    let planes =
1493      if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES };
1494
1495    // unlike the other loop filters that operate over the padded
1496    // frame dimensions, restoration filtering and source pixel
1497    // accesses are clipped to the original frame dimensions
1498    // that's why we use fi.width and fi.height instead of PlaneConfig fields
1499
1500    // number of stripes (counted according to colocated Y luma position)
1501    let stripe_n = (fi.height + 7) / 64 + 1;
1502
1503    // Buffers for the stripe filter.
1504    let mut stripe_filter_buffer =
1505      IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);
1506
1507    for pli in 0..planes {
1508      let rp = &self.planes[pli];
1509      let xdec = out.planes[pli].cfg.xdec;
1510      let ydec = out.planes[pli].cfg.ydec;
1511      let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec;
1512      let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec;
1513
1514      for si in 0..stripe_n {
1515        let (stripe_start_y, stripe_size) = if si == 0 {
1516          (0, (64 - 8) >> ydec)
1517        } else {
1518          let start = (si * 64 - 8) >> ydec;
1519          (
1520            start as isize,
1521            // one past, unlike spec
1522            (64 >> ydec).min(crop_h - start),
1523          )
1524        };
1525
1526        // horizontally, go rdu-by-rdu
1527        for rux in 0..rp.cfg.cols {
1528          // stripe x pixel locations must be clipped to frame, last may need to stretch
1529          let x = rux * rp.cfg.unit_size;
1530          let size =
1531            if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size };
1532          let ru = rp.restoration_unit_by_stripe(si, rux);
1533          match ru.filter {
1534            RestorationFilter::Wiener { coeffs } => {
1535              wiener_stripe_filter(
1536                coeffs,
1537                fi,
1538                crop_w,
1539                crop_h,
1540                size,
1541                stripe_size,
1542                x,
1543                stripe_start_y,
1544                &cdeffed.planes[pli],
1545                &pre_cdef.planes[pli],
1546                &mut out.planes[pli],
1547              );
1548            }
1549            RestorationFilter::Sgrproj { set, xqd } => {
1550              if !fi.sequence.enable_cdef {
1551                continue;
1552              }
1553
1554              setup_integral_image(
1555                &mut stripe_filter_buffer,
1556                STRIPE_IMAGE_STRIDE,
1557                crop_w - x,
1558                (crop_h as isize - stripe_start_y) as usize,
1559                size,
1560                stripe_size,
1561                &cdeffed.planes[pli]
1562                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1563                &pre_cdef.planes[pli]
1564                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1565              );
1566
1567              sgrproj_stripe_filter(
1568                set,
1569                xqd,
1570                fi,
1571                &stripe_filter_buffer,
1572                STRIPE_IMAGE_STRIDE,
1573                &cdeffed.planes[pli]
1574                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
1575                &mut out.planes[pli].region_mut(Area::Rect {
1576                  x: x as isize,
1577                  y: stripe_start_y,
1578                  width: size,
1579                  height: stripe_size,
1580                }),
1581              );
1582            }
1583            RestorationFilter::None => {
1584              // do nothing
1585            }
1586          }
1587        }
1588      }
1589    }
1590  }
1591}