rav1e/
rdo.rs

1// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
2// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
3//
4// This source code is subject to the terms of the BSD 2 Clause License and
5// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6// was not distributed with this source code in the LICENSE file, you can
7// obtain it at www.aomedia.org/license/software. If the Alliance for Open
8// Media Patent License 1.0 was not distributed with this source code in the
9// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
11#![allow(non_camel_case_types)]
12
13use crate::api::*;
14use crate::cdef::*;
15use crate::context::*;
16use crate::cpu_features::CpuFeatureLevel;
17use crate::deblock::*;
18use crate::dist::*;
19use crate::ec::{Writer, WriterCounter, OD_BITRES};
20use crate::encode_block_with_modes;
21use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE};
22use crate::frame::Frame;
23use crate::frame::*;
24use crate::header::ReferenceMode;
25use crate::lrf::*;
26use crate::mc::MotionVector;
27use crate::me::estimate_motion;
28use crate::me::MVSamplingMode;
29use crate::me::MotionSearchResult;
30use crate::motion_compensate;
31use crate::partition::PartitionType::*;
32use crate::partition::RefType::*;
33use crate::partition::*;
34use crate::predict::{
35  luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode,
36  RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES,
37};
38use crate::rdo_tables::*;
39use crate::tiling::*;
40use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES};
41use crate::util::{init_slice_repeat_mut, Aligned, Pixel};
42use crate::write_tx_blocks;
43use crate::write_tx_tree;
44use crate::Tune;
45use crate::{encode_block_post_cdef, encode_block_pre_cdef};
46
47use arrayvec::*;
48use itertools::izip;
49use std::fmt;
50use std::mem::MaybeUninit;
51
/// Selects which distortion measure and which rate measure an RDO search
/// uses.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate
  TxDistEstRate,
}

impl RDOType {
  /// Returns `true` for the variants that measure distortion in the
  /// transform domain.
  #[inline]
  pub const fn needs_tx_dist(self) -> bool {
    match self {
      Self::TxDistRealRate | Self::TxDistEstRate => true,
      Self::PixelDistRealRate => false,
    }
  }

  /// Returns `true` for every variant except `TxDistEstRate`, which relies on
  /// a txdist-based rate instead.
  #[inline]
  pub const fn needs_coeff_rate(self) -> bool {
    match self {
      Self::PixelDistRealRate | Self::TxDistRealRate => true,
      Self::TxDistEstRate => false,
    }
  }
}
80
/// An RD cost and partition type bundled with the parameters of the blocks
/// that make up that partition.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost associated with this group.
  // NOTE(review): presumably the combined cost of `part_modes` — confirm
  // against the code that builds this struct.
  pub rd_cost: f64,
  /// Partition type this group was evaluated for.
  pub part_type: PartitionType,
  /// Per-block parameters; the `ArrayVec` holds at most four entries.
  pub part_modes: ArrayVec<PartitionParameters, 4>,
}
87
88#[derive(Clone, Debug)]
89pub struct PartitionParameters {
90  pub rd_cost: f64,
91  pub bo: TileBlockOffset,
92  pub bsize: BlockSize,
93  pub pred_mode_luma: PredictionMode,
94  pub pred_mode_chroma: PredictionMode,
95  pub pred_cfl_params: CFLParams,
96  pub angle_delta: AngleDelta,
97  pub ref_frames: [RefType; 2],
98  pub mvs: [MotionVector; 2],
99  pub skip: bool,
100  pub has_coeff: bool,
101  pub tx_size: TxSize,
102  pub tx_type: TxType,
103  pub sidx: u8,
104}
105
106impl Default for PartitionParameters {
107  fn default() -> Self {
108    PartitionParameters {
109      rd_cost: std::f64::MAX,
110      bo: TileBlockOffset::default(),
111      bsize: BlockSize::BLOCK_32X32,
112      pred_mode_luma: PredictionMode::default(),
113      pred_mode_chroma: PredictionMode::default(),
114      pred_cfl_params: CFLParams::default(),
115      angle_delta: AngleDelta::default(),
116      ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME],
117      mvs: [MotionVector::default(); 2],
118      skip: false,
119      has_coeff: true,
120      tx_size: TxSize::TX_4X4,
121      tx_type: TxType::DCT_DCT,
122      sidx: 0,
123    }
124  }
125}
126
127pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
128  let bs_index = ts as usize;
129  let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV;
130  let bin_idx_down =
131    ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64);
132  let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64);
133  let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64;
134  let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64;
135  let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64;
136  let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64;
137  let slope = ((y1 - y0) << 8) / (x1 - x0);
138  (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64
139}
140
141#[allow(unused)]
142pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
143  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
144  bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel,
145) -> Distortion {
146  debug_assert!(src1.plane_cfg.xdec == 0);
147  debug_assert!(src1.plane_cfg.ydec == 0);
148  debug_assert!(src2.plane_cfg.xdec == 0);
149  debug_assert!(src2.plane_cfg.ydec == 0);
150
151  let mut sum = Distortion::zero();
152  for y in (0..h).step_by(8) {
153    for x in (0..w).step_by(8) {
154      let kernel_h = (h - y).min(8);
155      let kernel_w = (w - x).min(8);
156      let area = Area::StartingAt { x: x as isize, y: y as isize };
157
158      let value = RawDistortion(cdef_dist_kernel(
159        &src1.subregion(area),
160        &src2.subregion(area),
161        kernel_w,
162        kernel_h,
163        bit_depth,
164        cpu,
165      ) as u64);
166
167      // cdef is always called on non-subsampled planes, so BLOCK_8X8 is
168      // correct here.
169      sum += value * compute_bias(area, BlockSize::BLOCK_8X8);
170    }
171  }
172  sum
173}
174
175/// Sum of Squared Error for a wxh block
176/// Currently limited to w and h of valid blocks
177pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
178  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
179  compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
180) -> Distortion {
181  // See get_weighted_sse in src/dist.rs.
182  // Provide a scale to get_weighted_sse for each square region of this size.
183  const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1;
184
185  // To bias the distortion correctly, compute it in blocks up to the size
186  // importance block size in a non-subsampled plane.
187  let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec;
188  let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec;
189
190  let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h);
191
192  let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE;
193  let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE;
194
195  // TODO: Copying biases into a buffer is slow. It would be best if biases were
196  // passed directly. To do this, we would need different versions of the
197  // weighted sse function for decimated/subsampled data. Also requires
198  // eliminating use of unbiased sse.
199  // It should also be noted that the current copy code does not auto-vectorize.
200
201  // Copy biases into a buffer.
202  let mut buf_storage = Aligned::new(
203    [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE],
204  );
205  let buf_stride = n_imp_blocks_w.next_power_of_two();
206  let buf = init_slice_repeat_mut(
207    &mut buf_storage.data[..buf_stride * n_imp_blocks_h],
208    0,
209  );
210
211  for block_y in 0..n_imp_blocks_h {
212    for block_x in 0..n_imp_blocks_w {
213      let block = Area::StartingAt {
214        x: (block_x * CHUNK_SIZE) as isize,
215        y: (block_y * CHUNK_SIZE) as isize,
216      };
217      buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0;
218    }
219  }
220
221  Distortion(get_weighted_sse(
222    src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
223  ))
224}
225
226pub const fn clip_visible_bsize(
227  frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize,
228) -> (usize, usize) {
229  let blk_w = bsize.width();
230  let blk_h = bsize.height();
231
232  let visible_w: usize = if x + blk_w <= frame_w {
233    blk_w
234  } else if x >= frame_w {
235    0
236  } else {
237    frame_w - x
238  };
239
240  let visible_h: usize = if y + blk_h <= frame_h {
241    blk_h
242  } else if y >= frame_h {
243    0
244  } else {
245    frame_h - y
246  };
247
248  (visible_w, visible_h)
249}
250
251// Compute the pixel-domain distortion for an encode
252fn compute_distortion<T: Pixel>(
253  fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
254  is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
255) -> ScaledDistortion {
256  let area = Area::BlockStartingAt { bo: tile_bo.0 };
257  let input_region = ts.input_tile.planes[0].subregion(area);
258  let rec_region = ts.rec.planes[0].subregion(area);
259
260  // clip a block to have visible pixles only
261  let frame_bo = ts.to_frame_block_offset(tile_bo);
262  let (visible_w, visible_h) = clip_visible_bsize(
263    fi.width,
264    fi.height,
265    bsize,
266    frame_bo.0.x << MI_SIZE_LOG2,
267    frame_bo.0.y << MI_SIZE_LOG2,
268  );
269
270  if visible_w == 0 || visible_h == 0 {
271    return ScaledDistortion::zero();
272  }
273
274  let mut distortion = match fi.config.tune {
275    Tune::Psychovisual => cdef_dist_wxh(
276      &input_region,
277      &rec_region,
278      visible_w,
279      visible_h,
280      fi.sequence.bit_depth,
281      |bias_area, bsize| {
282        distortion_scale(
283          fi,
284          input_region.subregion(bias_area).frame_block_offset(),
285          bsize,
286        )
287      },
288      fi.cpu_feature_level,
289    ),
290    Tune::Psnr => sse_wxh(
291      &input_region,
292      &rec_region,
293      visible_w,
294      visible_h,
295      |bias_area, bsize| {
296        distortion_scale(
297          fi,
298          input_region.subregion(bias_area).frame_block_offset(),
299          bsize,
300        )
301      },
302      fi.sequence.bit_depth,
303      fi.cpu_feature_level,
304    ),
305  } * fi.dist_scale[0];
306
307  if is_chroma_block
308    && !luma_only
309    && fi.sequence.chroma_sampling != ChromaSampling::Cs400
310  {
311    let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
312    let chroma_w = if bsize.width() >= 8 || xdec == 0 {
313      (visible_w + xdec) >> xdec
314    } else {
315      (4 + visible_w + xdec) >> xdec
316    };
317    let chroma_h = if bsize.height() >= 8 || ydec == 0 {
318      (visible_h + ydec) >> ydec
319    } else {
320      (4 + visible_h + ydec) >> ydec
321    };
322
323    for p in 1..3 {
324      let input_region = ts.input_tile.planes[p].subregion(area);
325      let rec_region = ts.rec.planes[p].subregion(area);
326      distortion += sse_wxh(
327        &input_region,
328        &rec_region,
329        chroma_w,
330        chroma_h,
331        |bias_area, bsize| {
332          distortion_scale(
333            fi,
334            input_region.subregion(bias_area).frame_block_offset(),
335            bsize,
336          )
337        },
338        fi.sequence.bit_depth,
339        fi.cpu_feature_level,
340      ) * fi.dist_scale[p];
341    }
342  }
343  distortion
344}
345
346// Compute the transform-domain distortion for an encode
347fn compute_tx_distortion<T: Pixel>(
348  fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
349  is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
350  skip: bool, luma_only: bool,
351) -> ScaledDistortion {
352  assert!(fi.config.tune == Tune::Psnr);
353  let area = Area::BlockStartingAt { bo: tile_bo.0 };
354  let input_region = ts.input_tile.planes[0].subregion(area);
355  let rec_region = ts.rec.planes[0].subregion(area);
356
357  let (visible_w, visible_h) = if !skip {
358    (bsize.width(), bsize.height())
359  } else {
360    let frame_bo = ts.to_frame_block_offset(tile_bo);
361    clip_visible_bsize(
362      fi.width,
363      fi.height,
364      bsize,
365      frame_bo.0.x << MI_SIZE_LOG2,
366      frame_bo.0.y << MI_SIZE_LOG2,
367    )
368  };
369
370  if visible_w == 0 || visible_h == 0 {
371    return ScaledDistortion::zero();
372  }
373
374  let mut distortion = if skip {
375    sse_wxh(
376      &input_region,
377      &rec_region,
378      visible_w,
379      visible_h,
380      |bias_area, bsize| {
381        distortion_scale(
382          fi,
383          input_region.subregion(bias_area).frame_block_offset(),
384          bsize,
385        )
386      },
387      fi.sequence.bit_depth,
388      fi.cpu_feature_level,
389    ) * fi.dist_scale[0]
390  } else {
391    tx_dist
392  };
393
394  if is_chroma_block
395    && !luma_only
396    && skip
397    && fi.sequence.chroma_sampling != ChromaSampling::Cs400
398  {
399    let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
400    let chroma_w = if bsize.width() >= 8 || xdec == 0 {
401      (visible_w + xdec) >> xdec
402    } else {
403      (4 + visible_w + xdec) >> xdec
404    };
405    let chroma_h = if bsize.height() >= 8 || ydec == 0 {
406      (visible_h + ydec) >> ydec
407    } else {
408      (4 + visible_h + ydec) >> ydec
409    };
410
411    for p in 1..3 {
412      let input_region = ts.input_tile.planes[p].subregion(area);
413      let rec_region = ts.rec.planes[p].subregion(area);
414      distortion += sse_wxh(
415        &input_region,
416        &rec_region,
417        chroma_w,
418        chroma_h,
419        |bias_area, bsize| {
420          distortion_scale(
421            fi,
422            input_region.subregion(bias_area).frame_block_offset(),
423            bsize,
424          )
425        },
426        fi.sequence.bit_depth,
427        fi.cpu_feature_level,
428      ) * fi.dist_scale[p];
429    }
430  }
431  distortion
432}
433
434/// Compute a scaling factor to multiply the distortion of a block by,
435/// this factor is determined using temporal RDO.
436///
437/// # Panics
438///
439/// - If called with `bsize` of 8x8 or smaller
440/// - If the coded frame data doesn't exist on the `FrameInvariants`
441pub fn distortion_scale<T: Pixel>(
442  fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
443) -> DistortionScale {
444  if !fi.config.temporal_rdo() {
445    return DistortionScale::default();
446  }
447  // EncoderConfig::temporal_rdo() should always return false in situations
448  // where distortion is computed on > 8x8 blocks, so we should never hit this
449  // assert.
450  assert!(bsize <= BlockSize::BLOCK_8X8);
451
452  let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
453  let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
454
455  let coded_data = fi.coded_frame_data.as_ref().unwrap();
456  coded_data.distortion_scales[y * coded_data.w_in_imp_b + x]
457}
458
459/// # Panics
460///
461/// - If the coded frame data doesn't exist on the `FrameInvariants`
462pub fn spatiotemporal_scale<T: Pixel>(
463  fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
464) -> DistortionScale {
465  if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual {
466    return DistortionScale::default();
467  }
468
469  let coded_data = fi.coded_frame_data.as_ref().unwrap();
470
471  let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
472  let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
473  let x1 = (x0 + bsize.width_imp_b()).min(coded_data.w_in_imp_b);
474  let y1 = (y0 + bsize.height_imp_b()).min(coded_data.h_in_imp_b);
475  let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT;
476
477  // calling this on each slice individually improves autovectorization
478  // compared to using `Iterator::take`
479  #[inline(always)]
480  fn take_slice<T>(slice: &[T], n: usize) -> &[T] {
481    slice.get(..n).unwrap_or(slice)
482  }
483
484  let mut sum = 0;
485  for y in y0..y1 {
486    sum += take_slice(
487      &coded_data.distortion_scales[y * coded_data.w_in_imp_b..][x0..x1],
488      MAX_SB_IN_IMP_B,
489    )
490    .iter()
491    .zip(
492      take_slice(
493        &coded_data.activity_scales[y * coded_data.w_in_imp_b..][x0..x1],
494        MAX_SB_IN_IMP_B,
495      )
496      .iter(),
497    )
498    .map(|(d, a)| d.0 as u64 * a.0 as u64)
499    .sum::<u64>();
500  }
501  DistortionScale(((sum + (den >> 1)) / den) as u32)
502}
503
504pub fn distortion_scale_for(
505  propagate_cost: f64, intra_cost: f64,
506) -> DistortionScale {
507  // The mbtree paper \cite{mbtree} uses the following formula:
508  //
509  //     QP_delta = -strength * log2(1 + (propagate_cost / intra_cost))
510  //
511  // Since this is H.264, this corresponds to the following quantizer:
512  //
513  //     Q' = Q * 2^(QP_delta/6)
514  //
515  // Since lambda is proportial to Q^2, this means we want to minimize:
516  //
517  //     D + lambda' * R
518  //   = D + 2^(QP_delta / 3) * lambda * R
519  //
520  // If we want to keep lambda fixed, we can instead scale distortion and
521  // minimize:
522  //
523  //     D * scale + lambda * R
524  //
525  // where:
526  //
527  //     scale = 2^(QP_delta / -3)
528  //           = (1 + (propagate_cost / intra_cost))^(strength / 3)
529  //
530  //  The original paper empirically chooses strength = 2.0, but strength = 1.0
531  //  seems to work best in rav1e currently, this may have something to do with
532  //  the fact that they use 16x16 blocks whereas our "importance blocks" are
533  //  8x8, but everything should be scale invariant here so that's weird.
534  //
535  // @article{mbtree,
536  //   title={A novel macroblock-tree algorithm for high-performance
537  //    optimization of dependent video coding in H.264/AVC},
538  //   author={Garrett-Glaser, Jason},
539  //   journal={Tech. Rep.},
540  //   year={2009},
541  //   url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf}
542  // }
543
544  if intra_cost == 0. {
545    return DistortionScale::default(); // no scaling
546  }
547
548  let strength = 1.0; // empirical, see comment above
549  let frac = (intra_cost + propagate_cost) / intra_cost;
550  frac.powf(strength / 3.0).into()
551}
552
/// Fixed point arithmetic version of distortion scale
///
/// The wrapped `u32` is the scale with `DistortionScale::SHIFT` fractional
/// bits, so `1 << SHIFT` represents 1.0.
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);
557
/// A distortion value to which no `DistortionScale` has been applied yet;
/// multiplying it by a `DistortionScale` yields a `Distortion`.
#[repr(transparent)]
pub struct RawDistortion(u64);
560
/// A distortion value; multiplying it by a `DistortionScale` yields a
/// `ScaledDistortion`.
#[repr(transparent)]
pub struct Distortion(pub u64);
563
/// The result of multiplying a `Distortion` by a `DistortionScale`; this is
/// the distortion type consumed by `compute_rd_cost`.
#[repr(transparent)]
pub struct ScaledDistortion(u64);
566
567impl DistortionScale {
568  /// Bits past the radix point
569  const SHIFT: u32 = 14;
570  /// Number of bits used. Determines the max value.
571  /// 28 bits is quite excessive.
572  const BITS: u32 = 28;
573  /// Maximum internal value
574  const MAX: u64 = (1 << Self::BITS) - 1;
575
576  #[inline]
577  pub const fn new(num: u64, den: u64) -> Self {
578    let raw = (num << Self::SHIFT).saturating_add(den / 2) / den;
579    let mask = (raw <= Self::MAX) as u64;
580    Self((mask * raw + (1 - mask) * Self::MAX) as u32)
581  }
582
583  pub fn inv_mean(slice: &[Self]) -> Self {
584    use crate::util::{bexp64, blog32_q11};
585    let sum = slice.iter().map(|&s| blog32_q11(s.0) as i64).sum::<i64>();
586    let log_inv_mean_q11 =
587      (Self::SHIFT << 11) as i64 - sum / slice.len() as i64;
588    Self(
589      bexp64((log_inv_mean_q11 + (Self::SHIFT << 11) as i64) << (57 - 11))
590        .clamp(1, (1 << Self::BITS) - 1) as u32,
591    )
592  }
593
594  /// Binary logarithm in Q11
595  #[inline]
596  pub const fn blog16(self) -> i16 {
597    use crate::util::blog32_q11;
598    (blog32_q11(self.0) - ((Self::SHIFT as i32) << 11)) as i16
599  }
600
601  /// Binary logarithm in Q57
602  #[inline]
603  pub const fn blog64(self) -> i64 {
604    use crate::util::{blog64, q57};
605    blog64(self.0 as i64) - q57(Self::SHIFT as i32)
606  }
607
608  /// Multiply, round and shift
609  /// Internal implementation, so don't use multiply trait.
610  #[inline]
611  pub const fn mul_u64(self, dist: u64) -> u64 {
612    (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT
613  }
614}
615
616impl std::ops::Mul for DistortionScale {
617  type Output = Self;
618
619  /// Multiply, round and shift
620  #[inline]
621  fn mul(self, rhs: Self) -> Self {
622    Self(
623      (((self.0 as u64 * rhs.0 as u64) + (1 << (Self::SHIFT - 1)))
624        >> Self::SHIFT)
625        .clamp(1, (1 << Self::BITS) - 1) as u32,
626    )
627  }
628}
629
630impl std::ops::MulAssign for DistortionScale {
631  fn mul_assign(&mut self, rhs: Self) {
632    *self = *self * rhs;
633  }
634}
635
636// Default value for DistortionScale is a fixed point 1
637impl Default for DistortionScale {
638  #[inline]
639  fn default() -> Self {
640    Self(1 << Self::SHIFT)
641  }
642}
643
644impl fmt::Debug for DistortionScale {
645  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
646    write!(f, "{}", f64::from(*self))
647  }
648}
649
650impl From<f64> for DistortionScale {
651  #[inline]
652  fn from(scale: f64) -> Self {
653    let den = 1 << (Self::SHIFT + 1);
654    Self::new((scale * den as f64) as u64, den)
655  }
656}
657
658impl From<DistortionScale> for f64 {
659  #[inline]
660  fn from(scale: DistortionScale) -> Self {
661    scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64
662  }
663}
664
665impl RawDistortion {
666  #[inline]
667  pub const fn new(dist: u64) -> Self {
668    Self(dist)
669  }
670}
671
672impl std::ops::Mul<DistortionScale> for RawDistortion {
673  type Output = Distortion;
674  #[inline]
675  fn mul(self, rhs: DistortionScale) -> Distortion {
676    Distortion(rhs.mul_u64(self.0))
677  }
678}
679
680impl Distortion {
681  #[inline]
682  pub const fn zero() -> Self {
683    Self(0)
684  }
685}
686
687impl std::ops::Mul<DistortionScale> for Distortion {
688  type Output = ScaledDistortion;
689  #[inline]
690  fn mul(self, rhs: DistortionScale) -> ScaledDistortion {
691    ScaledDistortion(rhs.mul_u64(self.0))
692  }
693}
694
695impl std::ops::AddAssign for Distortion {
696  #[inline]
697  fn add_assign(&mut self, other: Self) {
698    self.0 += other.0;
699  }
700}
701
702impl ScaledDistortion {
703  #[inline]
704  pub const fn zero() -> Self {
705    Self(0)
706  }
707}
708
709impl std::ops::AddAssign for ScaledDistortion {
710  #[inline]
711  fn add_assign(&mut self, other: Self) {
712    self.0 += other.0;
713  }
714}
715
716pub fn compute_rd_cost<T: Pixel>(
717  fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion,
718) -> f64 {
719  let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64);
720  fi.lambda.mul_add(rate_in_bits, distortion.0 as f64)
721}
722
723pub fn rdo_tx_size_type<T: Pixel>(
724  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
725  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
726  luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
727  skip: bool,
728) -> (TxSize, TxType) {
729  let is_inter = !luma_mode.is_intra();
730  let mut tx_size = max_txsize_rect_lookup[bsize as usize];
731
732  if fi.enable_inter_txfm_split && is_inter && !skip {
733    tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
734  }
735
736  let mut best_tx_type = TxType::DCT_DCT;
737  let mut best_tx_size = tx_size;
738  let mut best_rd = std::f64::MAX;
739
740  let do_rdo_tx_size = fi.tx_mode_select
741    && fi.config.speed_settings.transform.rdo_tx_decision
742    && !is_inter;
743  let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
744  let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;
745
746  for _ in 0..=rdo_tx_depth {
747    let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);
748
749    let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
750      && fi.config.speed_settings.transform.rdo_tx_decision
751      && !is_inter
752      && !skip;
753
754    if !do_rdo_tx_size && !do_rdo_tx_type {
755      return (best_tx_size, best_tx_type);
756    };
757
758    let tx_types =
759      if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
760
761    // Luma plane transform type decision
762    let (tx_type, rd_cost) = rdo_tx_type_decision(
763      fi,
764      ts,
765      cw,
766      &mut cw_checkpoint,
767      luma_mode,
768      ref_frames,
769      mvs,
770      bsize,
771      tile_bo,
772      tx_size,
773      tx_set,
774      tx_types,
775      best_rd,
776    );
777
778    if rd_cost < best_rd {
779      best_tx_size = tx_size;
780      best_tx_type = tx_type;
781      best_rd = rd_cost;
782    }
783
784    debug_assert!(tx_size.width_log2() <= bsize.width_log2());
785    debug_assert!(tx_size.height_log2() <= bsize.height_log2());
786    debug_assert!(
787      tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
788    );
789
790    let next_tx_size = sub_tx_size_map[tx_size as usize];
791
792    if next_tx_size == tx_size {
793      break;
794    } else {
795      tx_size = next_tx_size;
796    };
797  }
798
799  (best_tx_size, best_tx_type)
800}
801
802#[inline]
803const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
804  let diff_row = mv.row as i32 - ref_mv.row as i32;
805  let diff_col = mv.col as i32 - ref_mv.col as i32;
806  diff_row >= MV_LOW
807    && diff_row <= MV_UPP
808    && diff_col >= MV_LOW
809    && diff_col <= MV_UPP
810}
811
812#[inline]
813#[profiling::function]
814fn luma_chroma_mode_rdo<T: Pixel>(
815  luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
816  tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
817  cw: &mut ContextWriter, rdo_type: RDOType,
818  cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
819  mvs: [MotionVector; 2], ref_frames: [RefType; 2],
820  mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
821  mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>,
822  angle_delta: AngleDelta,
823) {
824  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
825
826  let is_chroma_block =
827    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
828
829  if !luma_mode_is_intra {
830    let ref_mvs = if mv_stack.is_empty() {
831      [MotionVector::default(); 2]
832    } else {
833      [mv_stack[0].this_mv, mv_stack[0].comp_mv]
834    };
835
836    if (luma_mode == PredictionMode::NEWMV
837      || luma_mode == PredictionMode::NEW_NEWMV
838      || luma_mode == PredictionMode::NEW_NEARESTMV)
839      && !dmv_in_range(mvs[0], ref_mvs[0])
840    {
841      return;
842    }
843
844    if (luma_mode == PredictionMode::NEW_NEWMV
845      || luma_mode == PredictionMode::NEAREST_NEWMV)
846      && !dmv_in_range(mvs[1], ref_mvs[1])
847    {
848      return;
849    }
850  }
851
852  // Find the best chroma prediction mode for the current luma prediction mode
853  let mut chroma_rdo = |skip: bool| -> bool {
854    use crate::segmentation::select_segment;
855
856    let mut zero_distortion = false;
857
858    for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
859      cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);
860
861      let (tx_size, tx_type) = rdo_tx_size_type(
862        fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
863      );
864      for &chroma_mode in mode_set_chroma.iter() {
865        let wr = &mut WriterCounter::new();
866        let tell = wr.tell_frac();
867
868        if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
869          cw.write_partition(
870            wr,
871            tile_bo,
872            PartitionType::PARTITION_NONE,
873            bsize,
874          );
875        }
876
877        // TODO(yushin): luma and chroma would have different decision based on chroma format
878        let need_recon_pixel =
879          luma_mode_is_intra && tx_size.block_size() != bsize;
880
881        encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
882        let (has_coeff, tx_dist) = encode_block_post_cdef(
883          fi,
884          ts,
885          cw,
886          wr,
887          luma_mode,
888          chroma_mode,
889          angle_delta,
890          ref_frames,
891          mvs,
892          bsize,
893          tile_bo,
894          skip,
895          CFLParams::default(),
896          tx_size,
897          tx_type,
898          mode_context,
899          mv_stack,
900          rdo_type,
901          need_recon_pixel,
902          None,
903        );
904
905        let rate = wr.tell_frac() - tell;
906        let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
907          compute_tx_distortion(
908            fi,
909            ts,
910            bsize,
911            is_chroma_block,
912            tile_bo,
913            tx_dist,
914            skip,
915            false,
916          )
917        } else {
918          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
919        };
920        let is_zero_dist = distortion.0 == 0;
921        let rd = compute_rd_cost(fi, rate, distortion);
922        if rd < best.rd_cost {
923          //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV {
924          best.rd_cost = rd;
925          best.pred_mode_luma = luma_mode;
926          best.pred_mode_chroma = chroma_mode;
927          best.angle_delta = angle_delta;
928          best.ref_frames = ref_frames;
929          best.mvs = mvs;
930          best.skip = skip;
931          best.has_coeff = has_coeff;
932          best.tx_size = tx_size;
933          best.tx_type = tx_type;
934          best.sidx = sidx;
935          zero_distortion = is_zero_dist;
936        }
937
938        cw.rollback(cw_checkpoint);
939      }
940    }
941
942    zero_distortion
943  };
944
945  // Don't skip when using intra modes
946  let zero_distortion =
947    if !luma_mode_is_intra { chroma_rdo(true) } else { false };
948  // early skip
949  if !zero_distortion {
950    chroma_rdo(false);
951  }
952}
953
954/// RDO-based mode decision
955///
956/// # Panics
957///
958/// - If the best RD found is negative.
959///   This should never happen and indicates a development error.
960#[profiling::function]
961pub fn rdo_mode_decision<T: Pixel>(
962  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
963  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
964  inter_cfg: &InterConfig,
965) -> PartitionParameters {
966  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
967  let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
968
969  let rdo_type = if fi.use_tx_domain_rate {
970    RDOType::TxDistEstRate
971  } else if fi.use_tx_domain_distortion {
972    RDOType::TxDistRealRate
973  } else {
974    RDOType::PixelDistRealRate
975  };
976
977  let mut best = if fi.frame_type.has_inter() {
978    assert!(fi.frame_type != FrameType::KEY);
979
980    inter_frame_rdo_mode_decision(
981      fi,
982      ts,
983      cw,
984      bsize,
985      tile_bo,
986      inter_cfg,
987      &cw_checkpoint,
988      rdo_type,
989    )
990  } else {
991    PartitionParameters::default()
992  };
993
994  let is_chroma_block =
995    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
996
997  if !best.skip {
998    best = intra_frame_rdo_mode_decision(
999      fi,
1000      ts,
1001      cw,
1002      bsize,
1003      tile_bo,
1004      &cw_checkpoint,
1005      rdo_type,
1006      best,
1007      is_chroma_block,
1008    );
1009  }
1010
1011  if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() {
1012    cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx);
1013
1014    let chroma_mode = PredictionMode::UV_CFL_PRED;
1015    let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1016    let mut wr = WriterCounter::new();
1017    let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
1018
1019    write_tx_blocks(
1020      fi,
1021      ts,
1022      cw,
1023      &mut wr,
1024      best.pred_mode_luma,
1025      best.pred_mode_luma,
1026      angle_delta,
1027      tile_bo,
1028      bsize,
1029      best.tx_size,
1030      best.tx_type,
1031      false,
1032      CFLParams::default(),
1033      true,
1034      rdo_type,
1035      true,
1036    );
1037    cw.rollback(&cw_checkpoint);
1038    if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
1039      if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
1040        let mut wr = WriterCounter::new();
1041        let tell = wr.tell_frac();
1042
1043        encode_block_pre_cdef(
1044          &fi.sequence,
1045          ts,
1046          cw,
1047          &mut wr,
1048          bsize,
1049          tile_bo,
1050          best.skip,
1051        );
1052        let (has_coeff, _) = encode_block_post_cdef(
1053          fi,
1054          ts,
1055          cw,
1056          &mut wr,
1057          best.pred_mode_luma,
1058          chroma_mode,
1059          angle_delta,
1060          best.ref_frames,
1061          best.mvs,
1062          bsize,
1063          tile_bo,
1064          best.skip,
1065          cfl,
1066          best.tx_size,
1067          best.tx_type,
1068          0,
1069          &[],
1070          rdo_type,
1071          true, // For CFL, luma should be always reconstructed.
1072          None,
1073        );
1074
1075        let rate = wr.tell_frac() - tell;
1076
1077        // For CFL, tx-domain distortion is not an option.
1078        let distortion =
1079          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
1080        let rd = compute_rd_cost(fi, rate, distortion);
1081        if rd < best.rd_cost {
1082          best.rd_cost = rd;
1083          best.pred_mode_chroma = chroma_mode;
1084          best.angle_delta = angle_delta;
1085          best.has_coeff = has_coeff;
1086          best.pred_cfl_params = cfl;
1087        }
1088
1089        cw.rollback(&cw_checkpoint);
1090      }
1091    }
1092  }
1093
1094  cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma);
1095  cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames);
1096  cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs);
1097
1098  assert!(best.rd_cost >= 0_f64);
1099
1100  PartitionParameters {
1101    bo: tile_bo,
1102    bsize,
1103    pred_mode_luma: best.pred_mode_luma,
1104    pred_mode_chroma: best.pred_mode_chroma,
1105    pred_cfl_params: best.pred_cfl_params,
1106    angle_delta: best.angle_delta,
1107    ref_frames: best.ref_frames,
1108    mvs: best.mvs,
1109    rd_cost: best.rd_cost,
1110    skip: best.skip,
1111    has_coeff: best.has_coeff,
1112    tx_size: best.tx_size,
1113    tx_type: best.tx_type,
1114    sidx: best.sidx,
1115  }
1116}
1117
1118#[profiling::function]
1119fn inter_frame_rdo_mode_decision<T: Pixel>(
1120  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1121  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1122  inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
1123  rdo_type: RDOType,
1124) -> PartitionParameters {
1125  let mut best = PartitionParameters::default();
1126
1127  // we can never have more than 7 reference frame sets
1128  let mut ref_frames_set = ArrayVec::<_, 7>::new();
1129  // again, max of 7 ref slots
1130  let mut ref_slot_set = ArrayVec::<_, 7>::new();
1131  // our implementation never returns more than 3 at the moment
1132  let mut mvs_from_me = ArrayVec::<_, 3>::new();
1133  let mut fwdref = None;
1134  let mut bwdref = None;
1135
1136  for i in inter_cfg.allowed_ref_frames().iter().copied() {
1137    // Don't search LAST3 since it's used only for probs
1138    if i == LAST3_FRAME {
1139      continue;
1140    }
1141
1142    if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) {
1143      if fwdref.is_none() && i.is_fwd_ref() {
1144        fwdref = Some(ref_frames_set.len());
1145      }
1146      if bwdref.is_none() && i.is_bwd_ref() {
1147        bwdref = Some(ref_frames_set.len());
1148      }
1149      ref_frames_set.push([i, NONE_FRAME]);
1150      let slot_idx = fi.ref_frames[i.to_index()];
1151      ref_slot_set.push(slot_idx);
1152    }
1153  }
1154  assert!(!ref_frames_set.is_empty());
1155
1156  let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new();
1157  let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new();
1158  let mut satds = ArrayVec::<u32, 20>::new();
1159  let mut mv_stacks = ArrayVec::<_, 20>::new();
1160  let mut mode_contexts = ArrayVec::<_, 7>::new();
1161
1162  for (i, &ref_frames) in ref_frames_set.iter().enumerate() {
1163    let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1164    mode_contexts.push(cw.find_mvrefs(
1165      tile_bo,
1166      ref_frames,
1167      &mut mv_stack,
1168      bsize,
1169      fi,
1170      false,
1171    ));
1172
1173    let mut pmv = [MotionVector::default(); 2];
1174    if !mv_stack.is_empty() {
1175      pmv[0] = mv_stack[0].this_mv;
1176    }
1177    if mv_stack.len() > 1 {
1178      pmv[1] = mv_stack[1].this_mv;
1179    }
1180
1181    let res = estimate_motion(
1182      fi,
1183      ts,
1184      bsize.width(),
1185      bsize.height(),
1186      tile_bo,
1187      ref_frames[0],
1188      Some(pmv),
1189      MVSamplingMode::CORNER { right: true, bottom: true },
1190      false,
1191      0,
1192      None,
1193    )
1194    .unwrap_or_else(MotionSearchResult::empty);
1195    let b_me = res.mv;
1196
1197    mvs_from_me.push([b_me, MotionVector::default()]);
1198
1199    for &x in RAV1E_INTER_MODES_MINIMAL {
1200      inter_mode_set.push((x, i));
1201    }
1202    if !mv_stack.is_empty() {
1203      inter_mode_set.push((PredictionMode::NEAR0MV, i));
1204    }
1205    if mv_stack.len() >= 2 {
1206      inter_mode_set.push((PredictionMode::GLOBALMV, i));
1207    }
1208    let include_near_mvs = fi.config.speed_settings.motion.include_near_mvs;
1209    if include_near_mvs {
1210      if mv_stack.len() >= 3 {
1211        inter_mode_set.push((PredictionMode::NEAR1MV, i));
1212      }
1213      if mv_stack.len() >= 4 {
1214        inter_mode_set.push((PredictionMode::NEAR2MV, i));
1215      }
1216    }
1217    let same_row_col = |x: &CandidateMV| {
1218      x.this_mv.row == mvs_from_me[i][0].row
1219        && x.this_mv.col == mvs_from_me[i][0].col
1220    };
1221    if !mv_stack
1222      .iter()
1223      .take(if include_near_mvs { 4 } else { 2 })
1224      .any(same_row_col)
1225      && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0)
1226    {
1227      inter_mode_set.push((PredictionMode::NEWMV, i));
1228    }
1229
1230    mv_stacks.push(mv_stack);
1231  }
1232
1233  let sz = bsize.width_mi().min(bsize.height_mi());
1234
1235  // To use non single reference modes, block width and height must be greater than 4.
1236  if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 {
1237    // Adding compound candidate
1238    if let Some(r0) = fwdref {
1239      if let Some(r1) = bwdref {
1240        let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]];
1241        ref_frames_set.push(ref_frames);
1242        let mv0 = mvs_from_me[r0][0];
1243        let mv1 = mvs_from_me[r1][0];
1244        mvs_from_me.push([mv0, mv1]);
1245        let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1246        mode_contexts.push(cw.find_mvrefs(
1247          tile_bo,
1248          ref_frames,
1249          &mut mv_stack,
1250          bsize,
1251          fi,
1252          true,
1253        ));
1254        for &x in RAV1E_INTER_COMPOUND_MODES {
1255          // exclude any NEAR mode based on speed setting
1256          if fi.config.speed_settings.motion.include_near_mvs
1257            || !x.has_nearmv()
1258          {
1259            let mv_stack_idx = ref_frames_set.len() - 1;
1260            // exclude NEAR modes if the mv_stack is too short
1261            if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) {
1262              inter_mode_set.push((x, mv_stack_idx));
1263            }
1264          }
1265        }
1266        mv_stacks.push(mv_stack);
1267      }
1268    }
1269  }
1270
1271  let num_modes_rdo = if fi.config.speed_settings.prediction.prediction_modes
1272    >= PredictionModesSetting::ComplexAll
1273  {
1274    inter_mode_set.len()
1275  } else {
1276    9 // This number is determined by AWCY test
1277  };
1278
1279  inter_mode_set.iter().for_each(|&(luma_mode, i)| {
1280    let mvs = match luma_mode {
1281      PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i],
1282      PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => {
1283        if !mv_stacks[i].is_empty() {
1284          [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv]
1285        } else {
1286          [MotionVector::default(); 2]
1287        }
1288      }
1289      PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => {
1290        if mv_stacks[i].len() > 1 {
1291          [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv]
1292        } else {
1293          [MotionVector::default(); 2]
1294        }
1295      }
1296      PredictionMode::NEAR1MV
1297      | PredictionMode::NEAR2MV
1298      | PredictionMode::NEAR_NEAR1MV
1299      | PredictionMode::NEAR_NEAR2MV => [
1300        mv_stacks[i][luma_mode.ref_mv_idx()].this_mv,
1301        mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv,
1302      ],
1303      PredictionMode::NEAREST_NEWMV => {
1304        [mv_stacks[i][0].this_mv, mvs_from_me[i][1]]
1305      }
1306      PredictionMode::NEW_NEARESTMV => {
1307        [mvs_from_me[i][0], mv_stacks[i][0].comp_mv]
1308      }
1309      PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => {
1310        [MotionVector::default(); 2]
1311      }
1312      _ => {
1313        unimplemented!();
1314      }
1315    };
1316    mvs_set.push(mvs);
1317
1318    // Calculate SATD for each mode
1319    if num_modes_rdo != inter_mode_set.len() {
1320      let tile_rect = ts.tile_rect();
1321      let rec = &mut ts.rec.planes[0];
1322      let po = tile_bo.plane_offset(rec.plane_cfg);
1323      let mut rec_region =
1324        rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1325
1326      luma_mode.predict_inter(
1327        fi,
1328        tile_rect,
1329        0,
1330        po,
1331        &mut rec_region,
1332        bsize.width(),
1333        bsize.height(),
1334        ref_frames_set[i],
1335        mvs,
1336        &mut ts.inter_compound_buffers,
1337      );
1338
1339      let plane_org = ts.input_tile.planes[0]
1340        .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1341      let plane_ref = rec_region.as_const();
1342
1343      let satd = get_satd(
1344        &plane_org,
1345        &plane_ref,
1346        bsize.width(),
1347        bsize.height(),
1348        fi.sequence.bit_depth,
1349        fi.cpu_feature_level,
1350      );
1351      satds.push(satd);
1352    } else {
1353      satds.push(0);
1354    }
1355  });
1356
1357  let mut sorted =
1358    izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>();
1359  if num_modes_rdo != sorted.len() {
1360    sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd);
1361  }
1362
1363  sorted.iter().take(num_modes_rdo).for_each(
1364    |&((luma_mode, i), mvs, _satd)| {
1365      let mode_set_chroma = ArrayVec::from([luma_mode]);
1366
1367      luma_chroma_mode_rdo(
1368        luma_mode,
1369        fi,
1370        bsize,
1371        tile_bo,
1372        ts,
1373        cw,
1374        rdo_type,
1375        cw_checkpoint,
1376        &mut best,
1377        mvs,
1378        ref_frames_set[i],
1379        &mode_set_chroma,
1380        false,
1381        mode_contexts[i],
1382        &mv_stacks[i],
1383        AngleDelta::default(),
1384      );
1385    },
1386  );
1387
1388  best
1389}
1390
1391#[profiling::function]
1392fn intra_frame_rdo_mode_decision<T: Pixel>(
1393  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1394  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1395  cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
1396  mut best: PartitionParameters, is_chroma_block: bool,
1397) -> PartitionParameters {
1398  let mut modes = ArrayVec::<_, INTRA_MODES>::new();
1399
1400  // Reduce number of prediction modes at higher speed levels
1401  let num_modes_rdo = if (fi.frame_type == FrameType::KEY
1402    && fi.config.speed_settings.prediction.prediction_modes
1403      >= PredictionModesSetting::ComplexKeyframes)
1404    || (fi.frame_type.has_inter()
1405      && fi.config.speed_settings.prediction.prediction_modes
1406        >= PredictionModesSetting::ComplexAll)
1407  {
1408    7
1409  } else {
1410    3
1411  };
1412
1413  let intra_mode_set = RAV1E_INTRA_MODES;
1414
1415  // Find mode with lowest rate cost
1416  {
1417    use crate::ec::cdf_to_pdf;
1418
1419    let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() {
1420      cw.get_cdf_intra_mode(bsize)
1421    } else {
1422      cw.get_cdf_intra_mode_kf(tile_bo)
1423    });
1424
1425    modes.try_extend_from_slice(intra_mode_set).unwrap();
1426    modes.sort_by_key(|&a| !probs_all[a as usize]);
1427  }
1428
1429  // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening
1430  // may be improved by emulating prediction for each tx block.
1431  {
1432    let satds = {
1433      // FIXME: If tx partition is used, this whole sads block should be fixed
1434      let tx_size = bsize.tx_size();
1435      let mut edge_buf = Aligned::uninit_array();
1436      let edge_buf = {
1437        let rec = &ts.rec.planes[0].as_const();
1438        let po = tile_bo.plane_offset(rec.plane_cfg);
1439        // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
1440        get_intra_edges(
1441          &mut edge_buf,
1442          rec,
1443          tile_bo,
1444          0,
1445          0,
1446          bsize,
1447          po,
1448          tx_size,
1449          fi.sequence.bit_depth,
1450          None,
1451          fi.sequence.enable_intra_edge_filter,
1452          IntraParam::None,
1453        )
1454      };
1455
1456      let ief_params = if fi.sequence.enable_intra_edge_filter {
1457        let above_block_info = ts.above_block_info(tile_bo, 0, 0);
1458        let left_block_info = ts.left_block_info(tile_bo, 0, 0);
1459        Some(IntraEdgeFilterParameters::new(
1460          0,
1461          above_block_info,
1462          left_block_info,
1463        ))
1464      } else {
1465        None
1466      };
1467
1468      let mut satds_all = [0; INTRA_MODES];
1469      for &luma_mode in modes.iter().skip(num_modes_rdo / 2) {
1470        let tile_rect = ts.tile_rect();
1471        let rec = &mut ts.rec.planes[0];
1472        let mut rec_region =
1473          rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1474        // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
1475        luma_mode.predict_intra(
1476          tile_rect,
1477          &mut rec_region,
1478          tx_size,
1479          fi.sequence.bit_depth,
1480          &[0i16; 2],
1481          IntraParam::None,
1482          if luma_mode.is_directional() { ief_params } else { None },
1483          &edge_buf,
1484          fi.cpu_feature_level,
1485        );
1486
1487        let plane_org = ts.input_tile.planes[0]
1488          .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1489        let plane_ref = rec_region.as_const();
1490
1491        satds_all[luma_mode as usize] = get_satd(
1492          &plane_org,
1493          &plane_ref,
1494          tx_size.width(),
1495          tx_size.height(),
1496          fi.sequence.bit_depth,
1497          fi.cpu_feature_level,
1498        );
1499      }
1500      satds_all
1501    };
1502
1503    modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]);
1504  }
1505
1506  debug_assert!(num_modes_rdo >= 1);
1507
1508  modes.iter().take(num_modes_rdo).for_each(|&luma_mode| {
1509    let mvs = [MotionVector::default(); 2];
1510    let ref_frames = [INTRA_FRAME, NONE_FRAME];
1511    let mut mode_set_chroma = ArrayVec::<_, 2>::new();
1512    mode_set_chroma.push(luma_mode);
1513    if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
1514      mode_set_chroma.push(PredictionMode::DC_PRED);
1515    }
1516    luma_chroma_mode_rdo(
1517      luma_mode,
1518      fi,
1519      bsize,
1520      tile_bo,
1521      ts,
1522      cw,
1523      rdo_type,
1524      cw_checkpoint,
1525      &mut best,
1526      mvs,
1527      ref_frames,
1528      &mode_set_chroma,
1529      true,
1530      0,
1531      &ArrayVec::<CandidateMV, 9>::new(),
1532      AngleDelta::default(),
1533    );
1534  });
1535
1536  if fi.config.speed_settings.prediction.fine_directional_intra
1537    && bsize >= BlockSize::BLOCK_8X8
1538  {
1539    // Find the best angle delta for the current best prediction mode
1540    let luma_deltas = best.pred_mode_luma.angle_delta_count();
1541    let chroma_deltas = best.pred_mode_chroma.angle_delta_count();
1542
1543    let mvs = [MotionVector::default(); 2];
1544    let ref_frames = [INTRA_FRAME, NONE_FRAME];
1545    let mode_set_chroma = [best.pred_mode_chroma];
1546    let mv_stack = ArrayVec::<_, 9>::new();
1547    let mut best_angle_delta = best.angle_delta;
1548    let mut angle_delta_rdo = |y, uv| -> AngleDelta {
1549      if best.angle_delta.y != y || best.angle_delta.uv != uv {
1550        luma_chroma_mode_rdo(
1551          best.pred_mode_luma,
1552          fi,
1553          bsize,
1554          tile_bo,
1555          ts,
1556          cw,
1557          rdo_type,
1558          cw_checkpoint,
1559          &mut best,
1560          mvs,
1561          ref_frames,
1562          &mode_set_chroma,
1563          true,
1564          0,
1565          &mv_stack,
1566          AngleDelta { y, uv },
1567        );
1568      }
1569      best.angle_delta
1570    };
1571
1572    for i in 0..luma_deltas {
1573      let angle_delta_y =
1574        if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 };
1575      best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv);
1576    }
1577    for j in 0..chroma_deltas {
1578      let angle_delta_uv =
1579        if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 };
1580      best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv);
1581    }
1582  }
1583
1584  best
1585}
1586
1587/// # Panics
1588///
1589/// - If the block size is invalid for subsampling.
1590#[profiling::function]
1591pub fn rdo_cfl_alpha<T: Pixel>(
1592  ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
1593  luma_tx_size: TxSize, fi: &FrameInvariants<T>,
1594) -> Option<CFLParams> {
1595  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1596  let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec);
1597  debug_assert!(
1598    bsize.subsampled_size(xdec, ydec).unwrap() == uv_tx_size.block_size()
1599  );
1600
1601  let frame_bo = ts.to_frame_block_offset(tile_bo);
1602  let (visible_tx_w, visible_tx_h) = clip_visible_bsize(
1603    (fi.width + xdec) >> xdec,
1604    (fi.height + ydec) >> ydec,
1605    uv_tx_size.block_size(),
1606    (frame_bo.0.x << MI_SIZE_LOG2) >> xdec,
1607    (frame_bo.0.y << MI_SIZE_LOG2) >> ydec,
1608  );
1609
1610  if visible_tx_w == 0 || visible_tx_h == 0 {
1611    return None;
1612  };
1613  let mut ac = Aligned::<[MaybeUninit<i16>; 32 * 32]>::uninit_array();
1614  let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
1615  let best_alpha: ArrayVec<i16, 2> = (1..3)
1616    .map(|p| {
1617      let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
1618      let tile_rect = ts.tile_rect().decimated(xdec, ydec);
1619      let rec = &mut ts.rec.planes[p];
1620      let input = &ts.input_tile.planes[p];
1621      let po = tile_bo.plane_offset(rec.plane_cfg);
1622      let mut edge_buf = Aligned::uninit_array();
1623      let edge_buf = get_intra_edges(
1624        &mut edge_buf,
1625        &rec.as_const(),
1626        tile_bo,
1627        0,
1628        0,
1629        bsize,
1630        po,
1631        uv_tx_size,
1632        fi.sequence.bit_depth,
1633        Some(PredictionMode::UV_CFL_PRED),
1634        fi.sequence.enable_intra_edge_filter,
1635        IntraParam::None,
1636      );
1637      let mut alpha_cost = |alpha: i16| -> u64 {
1638        let mut rec_region =
1639          rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1640        PredictionMode::UV_CFL_PRED.predict_intra(
1641          tile_rect,
1642          &mut rec_region,
1643          uv_tx_size,
1644          fi.sequence.bit_depth,
1645          ac,
1646          IntraParam::Alpha(alpha),
1647          None,
1648          &edge_buf,
1649          fi.cpu_feature_level,
1650        );
1651        sse_wxh(
1652          &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
1653          &rec_region.as_const(),
1654          visible_tx_w,
1655          visible_tx_h,
1656          |_, _| DistortionScale::default(), // We're not doing RDO here.
1657          fi.sequence.bit_depth,
1658          fi.cpu_feature_level,
1659        )
1660        .0
1661      };
1662      let mut best = (alpha_cost(0), 0);
1663      let mut count = 2;
1664      for alpha in 1i16..=16i16 {
1665        let cost = (alpha_cost(alpha), alpha_cost(-alpha));
1666        if cost.0 < best.0 {
1667          best = (cost.0, alpha);
1668          count += 2;
1669        }
1670        if cost.1 < best.0 {
1671          best = (cost.1, -alpha);
1672          count += 2;
1673        }
1674        if count < alpha {
1675          break;
1676        }
1677      }
1678      best.1
1679    })
1680    .collect();
1681
1682  if best_alpha[0] == 0 && best_alpha[1] == 0 {
1683    None
1684  } else {
1685    Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
1686  }
1687}
1688
1689/// RDO-based transform type decision
1690/// If `cw_checkpoint` is `None`, a checkpoint for cw's (`ContextWriter`) current
1691/// state is created and stored for later use.
1692///
1693/// # Panics
1694///
1695/// - If a writer checkpoint is never created before or within the function.
1696///   This should never happen and indicates a development error.
1697/// - If the best RD found is negative.
1698///   This should never happen and indicates a development error.
1699pub fn rdo_tx_type_decision<T: Pixel>(
1700  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1701  cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
1702  mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
1703  bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet,
1704  tx_types: &[TxType], cur_best_rd: f64,
1705) -> (TxType, f64) {
1706  let mut best_type = TxType::DCT_DCT;
1707  let mut best_rd = std::f64::MAX;
1708
1709  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1710  let is_chroma_block =
1711    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1712
1713  let is_inter = !mode.is_intra();
1714
1715  if cw_checkpoint.is_none() {
1716    // Only run the first call
1717    // Prevents creating multiple checkpoints for own version of cw
1718    *cw_checkpoint =
1719      Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling));
1720  }
1721
1722  let rdo_type = if fi.use_tx_domain_distortion {
1723    RDOType::TxDistRealRate
1724  } else {
1725    RDOType::PixelDistRealRate
1726  };
1727  let need_recon_pixel = tx_size.block_size() != bsize && !is_inter;
1728
1729  let mut first_iteration = true;
1730  for &tx_type in tx_types {
1731    // Skip unsupported transform types
1732    if av1_tx_used[tx_set as usize][tx_type as usize] == 0 {
1733      continue;
1734    }
1735
1736    if is_inter {
1737      motion_compensate(
1738        fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
1739      );
1740    }
1741
1742    let mut wr = WriterCounter::new();
1743    let tell = wr.tell_frac();
1744    let (_, tx_dist) = if is_inter {
1745      write_tx_tree(
1746        fi,
1747        ts,
1748        cw,
1749        &mut wr,
1750        mode,
1751        0,
1752        tile_bo,
1753        bsize,
1754        tx_size,
1755        tx_type,
1756        false,
1757        true,
1758        rdo_type,
1759        need_recon_pixel,
1760      )
1761    } else {
1762      write_tx_blocks(
1763        fi,
1764        ts,
1765        cw,
1766        &mut wr,
1767        mode,
1768        mode,
1769        AngleDelta::default(),
1770        tile_bo,
1771        bsize,
1772        tx_size,
1773        tx_type,
1774        false,
1775        CFLParams::default(), // Unused.
1776        true,
1777        rdo_type,
1778        need_recon_pixel,
1779      )
1780    };
1781
1782    let rate = wr.tell_frac() - tell;
1783    let distortion = if fi.use_tx_domain_distortion {
1784      compute_tx_distortion(
1785        fi,
1786        ts,
1787        bsize,
1788        is_chroma_block,
1789        tile_bo,
1790        tx_dist,
1791        false,
1792        true,
1793      )
1794    } else {
1795      compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
1796    };
1797    cw.rollback(cw_checkpoint.as_ref().unwrap());
1798
1799    let rd = compute_rd_cost(fi, rate, distortion);
1800
1801    if first_iteration {
1802      // We use an optimization to early exit after testing the first
1803      // transform type if the cost is higher than the existing best.
1804      // The idea is that if this transform size is not better than he
1805      // previous size, it is not worth testing remaining modes for this size.
1806      if rd > cur_best_rd {
1807        break;
1808      }
1809      first_iteration = false;
1810    }
1811
1812    if rd < best_rd {
1813      best_rd = rd;
1814      best_type = tx_type;
1815    }
1816  }
1817
1818  assert!(best_rd >= 0_f64);
1819
1820  (best_type, best_rd)
1821}
1822
1823pub fn get_sub_partitions(
1824  four_partitions: &[TileBlockOffset; 4], partition: PartitionType,
1825) -> ArrayVec<TileBlockOffset, 4> {
1826  let mut partition_offsets = ArrayVec::<TileBlockOffset, 4>::new();
1827
1828  partition_offsets.push(four_partitions[0]);
1829
1830  if partition == PARTITION_NONE {
1831    return partition_offsets;
1832  }
1833  if partition == PARTITION_VERT || partition == PARTITION_SPLIT {
1834    partition_offsets.push(four_partitions[1]);
1835  };
1836  if partition == PARTITION_HORZ || partition == PARTITION_SPLIT {
1837    partition_offsets.push(four_partitions[2]);
1838  };
1839  if partition == PARTITION_SPLIT {
1840    partition_offsets.push(four_partitions[3]);
1841  };
1842
1843  partition_offsets
1844}
1845
1846#[inline(always)]
1847fn rdo_partition_none<T: Pixel>(
1848  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1849  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1850  inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
1851) -> f64 {
1852  debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1853
1854  let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
1855  let cost = mode.rd_cost;
1856
1857  child_modes.push(mode);
1858
1859  cost
1860}
1861
1862// VERTICAL, HORIZONTAL or simple SPLIT
1863#[inline(always)]
1864fn rdo_partition_simple<T: Pixel, W: Writer>(
1865  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1866  cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1867  bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
1868  partition: PartitionType, rdo_type: RDOType, best_rd: f64,
1869  child_modes: &mut ArrayVec<PartitionParameters, 4>,
1870) -> Option<f64> {
1871  debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1872  let subsize = bsize.subsize(partition).unwrap();
1873
1874  let cost = if bsize >= BlockSize::BLOCK_8X8 {
1875    let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1876    let tell = w.tell_frac();
1877    cw.write_partition(w, tile_bo, partition, bsize);
1878    compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero())
1879  } else {
1880    0.0
1881  };
1882
1883  let hbsw = subsize.width_mi(); // Half the block size width in blocks
1884  let hbsh = subsize.height_mi(); // Half the block size height in blocks
1885  let four_partitions = [
1886    tile_bo,
1887    TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }),
1888    TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }),
1889    TileBlockOffset(BlockOffset {
1890      x: tile_bo.0.x + hbsw,
1891      y: tile_bo.0.y + hbsh,
1892    }),
1893  ];
1894
1895  let partitions = get_sub_partitions(&four_partitions, partition);
1896
1897  let mut rd_cost_sum = 0.0;
1898
1899  for offset in partitions {
1900    let hbs = subsize.width_mi() >> 1;
1901    let has_cols = offset.0.x + hbs < ts.mi_width;
1902    let has_rows = offset.0.y + hbs < ts.mi_height;
1903
1904    if has_cols && has_rows {
1905      let mode_decision =
1906        rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
1907
1908      rd_cost_sum += mode_decision.rd_cost;
1909
1910      if fi.enable_early_exit && rd_cost_sum > best_rd {
1911        return None;
1912      }
1913      if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() {
1914        let w: &mut W =
1915          if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1916        cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
1917      }
1918      encode_block_with_modes(
1919        fi,
1920        ts,
1921        cw,
1922        w_pre_cdef,
1923        w_post_cdef,
1924        subsize,
1925        offset,
1926        &mode_decision,
1927        rdo_type,
1928        None,
1929      );
1930      child_modes.push(mode_decision);
1931    } else {
1932      //rd_cost_sum += std::f64::MAX;
1933      return None;
1934    }
1935  }
1936
1937  Some(cost + rd_cost_sum)
1938}
1939
1940/// RDO-based single level partitioning decision
1941///
1942/// # Panics
1943///
1944/// - If the best RD found is negative.
1945///   This should never happen, and indicates a development error.
1946#[profiling::function]
1947pub fn rdo_partition_decision<T: Pixel, W: Writer>(
1948  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1949  cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1950  bsize: BlockSize, tile_bo: TileBlockOffset,
1951  cached_block: &PartitionGroupParameters, partition_types: &[PartitionType],
1952  rdo_type: RDOType, inter_cfg: &InterConfig,
1953) -> PartitionGroupParameters {
1954  let mut best_partition = cached_block.part_type;
1955  let mut best_rd = cached_block.rd_cost;
1956  let mut best_pred_modes = cached_block.part_modes.clone();
1957
1958  let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1959  let w_pre_checkpoint = w_pre_cdef.checkpoint();
1960  let w_post_checkpoint = w_post_cdef.checkpoint();
1961
1962  for &partition in partition_types {
1963    // Do not re-encode results we already have
1964    if partition == cached_block.part_type {
1965      continue;
1966    }
1967
1968    let mut child_modes = ArrayVec::<_, 4>::new();
1969
1970    let cost = match partition {
1971      PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
1972        Some(rdo_partition_none(
1973          fi,
1974          ts,
1975          cw,
1976          bsize,
1977          tile_bo,
1978          inter_cfg,
1979          &mut child_modes,
1980        ))
1981      }
1982      PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
1983        rdo_partition_simple(
1984          fi,
1985          ts,
1986          cw,
1987          w_pre_cdef,
1988          w_post_cdef,
1989          bsize,
1990          tile_bo,
1991          inter_cfg,
1992          partition,
1993          rdo_type,
1994          best_rd,
1995          &mut child_modes,
1996        )
1997      }
1998      _ => {
1999        unreachable!();
2000      }
2001    };
2002
2003    if let Some(rd) = cost {
2004      if rd < best_rd {
2005        best_rd = rd;
2006        best_partition = partition;
2007        best_pred_modes = child_modes.clone();
2008      }
2009    }
2010    cw.rollback(&cw_checkpoint);
2011    w_pre_cdef.rollback(&w_pre_checkpoint);
2012    w_post_cdef.rollback(&w_post_checkpoint);
2013  }
2014
2015  assert!(best_rd >= 0_f64);
2016
2017  PartitionGroupParameters {
2018    rd_cost: best_rd,
2019    part_type: best_partition,
2020    part_modes: best_pred_modes,
2021  }
2022}
2023
2024#[profiling::function]
2025fn rdo_loop_plane_error<T: Pixel>(
2026  base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
2027  sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
2028  blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
2029) -> ScaledDistortion {
2030  let sb_w_blocks =
2031    if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w;
2032  let sb_h_blocks =
2033    if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h;
2034  // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma
2035  // accumulating in-frame and unpadded
2036  let mut err = Distortion::zero();
2037  for by in 0..sb_h_blocks {
2038    for bx in 0..sb_w_blocks {
2039      let loop_bo = offset_sbo.block_offset(bx << 1, by << 1);
2040      if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() {
2041        let src_plane = &src.planes[pli];
2042        let test_plane = &test.planes[pli];
2043        let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg;
2044        debug_assert_eq!(xdec, test_plane.cfg.xdec);
2045        debug_assert_eq!(ydec, test_plane.cfg.ydec);
2046
2047        // Unfortunately, our distortion biases are only available via
2048        // Frame-absolute addressing, so we need a block offset
2049        // relative to the full frame origin (not the tile or analysis
2050        // area)
2051        let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1);
2052        let bias = distortion_scale(
2053          fi,
2054          ts.to_frame_block_offset(frame_bo),
2055          BlockSize::BLOCK_8X8,
2056        );
2057
2058        let src_region =
2059          src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 });
2060        let test_region =
2061          test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 });
2062
2063        err += if pli == 0 {
2064          // For loop filters, We intentionally use cdef_dist even with
2065          // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
2066          // significant negative impact on other metrics and visual quality.
2067          RawDistortion(cdef_dist_kernel(
2068            &src_region,
2069            &test_region,
2070            8,
2071            8,
2072            fi.sequence.bit_depth,
2073            fi.cpu_feature_level,
2074          ) as u64)
2075            * bias
2076        } else {
2077          sse_wxh(
2078            &src_region,
2079            &test_region,
2080            8 >> xdec,
2081            8 >> ydec,
2082            |_, _| bias,
2083            fi.sequence.bit_depth,
2084            fi.cpu_feature_level,
2085          )
2086        };
2087      }
2088    }
2089  }
2090  err * fi.dist_scale[pli]
2091}
2092
/// Jointly searches CDEF indices and loop restoration (LRF) filters for one
/// analysis area using RDO.
///
/// Passed in a superblock offset representing the upper left corner of
/// the LRU area we're optimizing.  This area covers the largest LRU in
/// any of the present planes, but may consist of a number of
/// superblocks and full, smaller LRUs in the other planes
///
/// The search alternates between a CDEF pass (per superblock) and an LRF
/// pass (per plane, per LRU) until a pass leaves the CDEF choices unchanged.
/// Chosen CDEF indices are written into the block context through
/// `set_cdef`; chosen restoration filters are stored in the matching
/// restoration units of `ts.restoration`. If every block in the area is
/// marked as skipped for both tools, the function returns without changing
/// anything.
///
/// When `deblock_p` is true, deblocking levels are estimated for the area
/// and applied to the scratch reconstruction before the search starts.
///
/// # Panics
///
/// - If both CDEF and LRF are disabled.
#[profiling::function]
pub fn rdo_loop_decision<T: Pixel, W: Writer>(
  base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
  ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
  deblock_p: bool,
) {
  // Monochrome (4:0:0) content only has the luma plane.
  let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
    1
  } else {
    MAX_PLANES
  };
  assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
  // Determine area of optimization: Which plane has the largest LRUs?
  // How many LRUs for each?
  let mut sb_w = 1; // how many superblocks wide the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut sb_h = 1; // how many superblocks tall the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
  let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
  // First pass: the analysis area is as large as the largest LRU of any plane.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    if sb_w < (1 << sb_h_shift) {
      sb_w = 1 << sb_h_shift;
    }
    if sb_h < (1 << sb_v_shift) {
      sb_h = 1 << sb_v_shift;
    }
  }
  // Second pass: how many LRUs of each plane fit into that area.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    lru_w[pli] = sb_w / (1 << sb_h_shift);
    lru_h[pli] = sb_h / (1 << sb_v_shift);
  }

  // The superblock width/height determinations may be calling for us
  // to compute over superblocks that do not actually exist in the
  // frame (off the right or lower edge).  Trim sb width/height down
  // to actual superblocks.  Note that these last superblocks on the
  // right/bottom may themselves still span the edge of the frame, but
  // they do hold at least some visible pixels.
  sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
  sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);

  // We have need to know the Y visible pixel limits as well (the
  // sb_w/sb_h figures above can be used to determine how many
  // allocated pixels, possibly beyond the visible frame, exist).
  let crop_w =
    fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
  let crop_h =
    fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);

  // Based on `RestorationState::new`
  const MAX_SB_SHIFT: usize = 4;
  const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
  const MAX_LRU_SIZE: usize = MAX_SB_SIZE;

  // Static allocation relies on the "minimal LRU area for all N planes" invariant.
  // `best_index` holds the chosen CDEF index per superblock (-1 = none yet)
  // and is addressed with a row stride of `sb_w`; `best_lrf` and
  // `best_lrf_cost` are addressed with a row stride of `lru_w[pli]`.
  let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut best_lrf =
    [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // due to imprecision in the reconstruction parameter solver, we
  // need to make sure we don't fall into a limit cycle.  Track our
  // best cost at LRF so that we can break if we get a solution that doesn't
  // improve at the reconstruction stage.
  // A negative entry means no cost has been recorded yet.
  let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // sub-setted region of the TileBlocks for our working frame area.
  // Note that the size of this subset is what signals CDEF as to the
  // actual coded size.
  let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
    base_sbo.block_offset(0, 0).0.x,
    base_sbo.block_offset(0, 0).0.y,
    sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
    sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
  );

  // cdef doesn't run on superblocks that are completely skipped.
  // Determine which super blocks are marked as skipped so we can avoid running
  // them. If all blocks are skipped, we can avoid some of the overhead related
  // to setting up for cdef.
  // Note: `cdef_skip` is addressed with a row stride of `MAX_SB_SIZE`.
  let mut cdef_skip = [true; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut cdef_skip_all = true;
  if fi.sequence.enable_cdef {
    for sby in 0..sb_h {
      for sbx in 0..sb_w {
        let blocks = tileblocks_subset.subregion(16 * sbx, 16 * sby, 16, 16);
        // A superblock counts as skipped only if every block in it is.
        let mut skip = true;
        for y in 0..blocks.rows() {
          for block in blocks[y].iter() {
            skip &= block.skip;
          }
        }
        cdef_skip[sby * MAX_SB_SIZE + sbx] = skip;
        cdef_skip_all &= skip;
      }
    }
  }

  // Unlike cdef, loop restoration will run regardless of whether blocks are
  // skipped or not. At the same time, the most significant improvement will
  // generally be from un-skipped blocks, so lru is only performed if there are
  // un-skipped blocks.
  // This should be the same as `cdef_skip_all`, except when cdef is disabled.
  // Note: `lru_skip` is addressed with a row stride of `MAX_LRU_SIZE`.
  let mut lru_skip_all = true;
  let mut lru_skip = [[true; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
  if fi.sequence.enable_restoration {
    if fi.config.speed_settings.lru_on_skip {
      // The speed setting asks for LRF on skipped blocks too: search everything.
      lru_skip_all = false;
      lru_skip = [[false; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
    } else {
      for pli in 0..planes {
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally

            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });

            // No restoration unit here: leave the entry marked as skipped.
            if !ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              continue;
            }

            // Block-unit origin and extent of this LRU inside the subset.
            let start = loop_sbo.block_offset(0, 0).0;
            let size = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_sb_w,
              y: lru_sb_h,
            })
            .block_offset(0, 0)
            .0;

            let blocks =
              tileblocks_subset.subregion(start.x, start.y, size.x, size.y);
            let mut skip = true;
            for y in 0..blocks.rows() {
              for block in blocks[y].iter() {
                skip &= block.skip;
              }
            }
            lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] = skip;
            lru_skip_all &= skip;
          }
        }
      }
    }
  }

  // Return early if all blocks are skipped for lru and cdef.
  if lru_skip_all && cdef_skip_all {
    return;
  }

  // Loop filter RDO is an iterative process and we need temporary
  // scratch data to hold the results of deblocking, cdef, and the
  // loop reconstruction filter so that each can be partially updated
  // without recomputing the entire stack.  Construct
  // largest-LRU-sized frames for each, accounting for padding
  // required by deblocking, cdef and [optionally] LR.
  // The visible size is rounded up to a multiple of 8 pixels.
  let mut rec_subset = ts
    .rec
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .scratch_copy();

  // const, no need to copy, just need the subregion (but do zero the
  // origin to match the other copies/new backing frames).
  let src_subset = ts
    .input_tile
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .home();

  if deblock_p {
    // Find a good deblocking filter solution for the passed in area.
    // This is not RDO of deblocking itself, merely a solution to get
    // better results from CDEF/LRF RDO.
    let deblock_levels = deblock_filter_optimize(
      fi,
      &rec_subset.as_tile(),
      &src_subset,
      &tileblocks_subset.as_const(),
      crop_w,
      crop_h,
    );

    // Deblock the contents of our reconstruction copy.
    if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
      // copy ts.deblock because we need to set some of our own values here
      let mut deblock_copy = *ts.deblock;
      deblock_copy.levels = deblock_levels;

      // finally, deblock the temp frame
      deblock_filter_frame(
        &deblock_copy,
        &mut rec_subset.as_tile_mut(),
        &tileblocks_subset.as_const(),
        crop_w,
        crop_h,
        fi.sequence.bit_depth,
        planes,
      );
    }
  }

  // Scratch frame for CDEF output; only allocated when some superblock
  // actually needs CDEF.
  let mut cdef_work =
    if !cdef_skip_all { Some(rec_subset.clone()) } else { None };
  // Scratch frame for LRF output, with the same plane geometry as
  // `rec_subset`; only allocated when some LRU needs searching.
  let mut lrf_work = if !lru_skip_all {
    Some(Frame {
      planes: {
        let new_plane = |pli: usize| {
          let PlaneConfig { xdec, ydec, width, height, .. } =
            rec_subset.planes[pli].cfg;
          Plane::new(width, height, xdec, ydec, 0, 0)
        };
        [new_plane(0), new_plane(1), new_plane(2)]
      },
    })
  } else {
    None
  };

  // Precompute directional analysis for CDEF
  let cdef_data = {
    if cdef_work.is_some() {
      Some((
        &rec_subset,
        cdef_analyze_superblock_range(
          fi,
          &rec_subset,
          &tileblocks_subset.as_const(),
          sb_w,
          sb_h,
        ),
      ))
    } else {
      None
    }
  };

  // CDEF/LRF decision iteration
  // Start with a default of CDEF 0 and RestorationFilter::None
  // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
  // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
  // If LRF choice changed for any plane, repeat until no changes
  // Limit iterations and where we break based on speed setting (in the TODO list ;-)
  let mut cdef_change = true;
  let mut lrf_change = true;
  while cdef_change || lrf_change {
    // search for improved cdef indices, superblock by superblock, if cdef is enabled.
    if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
      (&cdef_data, &mut cdef_work.as_mut())
    {
      for sby in 0..sb_h {
        for sbx in 0..sb_w {
          // determine whether this superblock can be skipped
          if cdef_skip[sby * MAX_SB_SIZE + sbx] {
            continue;
          }

          let prev_best_index = best_index[sby * sb_w + sbx];
          // Negative cost / index mean "nothing evaluated yet".
          let mut best_cost = -1.;
          let mut best_new_index = -1i8;

          /* offset of the superblock we're currently testing within the larger
          analysis area */
          let loop_sbo =
            TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });

          /* cdef index testing loop */
          for cdef_index in 0..(1 << fi.cdef_bits) {
            let mut err = ScaledDistortion::zero();
            let mut rate = 0;

            // Filter this superblock with the candidate index into the
            // CDEF scratch frame.
            cdef_filter_superblock(
              fi,
              &rec_subset,
              &mut cdef_ref.as_tile_mut(),
              &tileblocks_subset.as_const(),
              loop_sbo,
              cdef_index,
              &cdef_dirs[sby * sb_w + sbx],
            );
            // apply LRF if any
            for pli in 0..planes {
              // We need the cropped-to-visible-frame area of this SB
              let wh =
                if fi.sequence.use_128x128_superblock { 128 } else { 64 };
              let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
              let vis_width = (wh >> xdec).min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
                    as usize,
              );
              let vis_height = (wh >> ydec).min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
                    as usize,
              );
              // which LRU are we currently testing against?
              if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
                let rp = &ts.restoration.planes[pli];
                (
                  rp.restoration_unit_offset(base_sbo, loop_sbo, false),
                  &mut lrf_work,
                )
              } {
                // We have a valid LRU, apply LRF, compute error
                match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                  RestorationFilter::None {} => {
                    // No restoration filter selected: measure the CDEF
                    // output directly.
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      cdef_ref,
                      &src_subset,
                      pli,
                    );
                    rate += if fi.sequence.enable_restoration {
                      cw.fc.count_lrf_switchable(
                        w,
                        &ts.restoration.as_const(),
                        best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                        pli,
                      )
                    } else {
                      0 // no relative cost differences to different
                        // CDEF params.  If cdef is on, it's a wash.
                    };
                  }
                  RestorationFilter::Sgrproj { set, xqd } => {
                    // only run on this single superblock
                    let loop_po =
                      loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
                    // todo: experiment with borrowing border pixels
                    // rather than edge-extending. Right now this is
                    // hard-clipping to the superblock boundary.
                    setup_integral_image(
                      &mut ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      vis_width,
                      vis_height,
                      vis_width,
                      vis_height,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &cdef_ref.planes[pli].slice(loop_po),
                    );
                    // Apply the currently selected self-guided filter to the
                    // CDEF output, writing into the LRF scratch frame.
                    sgrproj_stripe_filter(
                      set,
                      xqd,
                      fi,
                      &ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                        x: loop_po.x,
                        y: loop_po.y,
                        width: vis_width,
                        height: vis_height,
                      }),
                    );
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      lrf_ref,
                      &src_subset,
                      pli,
                    );
                    rate += cw.fc.count_lrf_switchable(
                      w,
                      &ts.restoration.as_const(),
                      best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                      pli,
                    );
                  }
                  RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
                }
              } else {
                // No actual LRU here, compute error directly from CDEF output.
                err += rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  1,
                  1,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  cdef_ref,
                  &src_subset,
                  pli,
                );
                // no relative cost differences to different
                // CDEF params.  If cdef is on, it's a wash.
                // rate += 0;
              }
            }

            // Keep the cheapest index seen so far; the first candidate
            // always wins because `best_cost` starts negative.
            let cost = compute_rd_cost(fi, rate, err);
            if best_cost < 0. || cost < best_cost {
              best_cost = cost;
              best_new_index = cdef_index as i8;
            }
          }

          // Did we change any preexisting choices?
          if best_new_index != prev_best_index {
            cdef_change = true;
            best_index[sby * sb_w + sbx] = best_new_index;
            tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
          }

          // Tile view spanning the whole CDEF scratch frame.
          let mut cdef_ref_tm = TileMut::new(
            cdef_ref,
            TileRect {
              x: 0,
              y: 0,
              width: cdef_ref.planes[0].cfg.width,
              height: cdef_ref.planes[0].cfg.height,
            },
          );

          // Keep cdef output up to date; we need it for restoration
          // both below and above (padding)
          cdef_filter_superblock(
            fi,
            rec_copy,
            &mut cdef_ref_tm,
            &tileblocks_subset.as_const(),
            loop_sbo,
            best_index[sby * sb_w + sbx] as u8,
            &cdef_dirs[sby * sb_w + sbx],
          );
        }
      }
    }

    // Stop once a pass leaves the CDEF choices untouched. Both flags start
    // out `true`, so the LRF search below always runs at least once.
    if !cdef_change {
      break;
    }
    cdef_change = false;
    lrf_change = false;

    // search for improved restoration filter parameters if restoration is enabled
    if let Some(lrf_ref) = &mut lrf_work.as_mut() {
      let lrf_input = if cdef_work.is_some() {
        // When CDEF is enabled, we pull from the CDEF output
        cdef_work.as_ref().unwrap()
      } else {
        // When CDEF is disabled, we pull from the [optionally
        // deblocked] reconstruction
        &rec_subset
      };
      for pli in 0..planes {
        // Nominal size of LRU in pixels before clipping to visible frame
        let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally

            // determine whether this lru should be skipped
            if lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] {
              continue;
            }

            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });
            if ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
              let lrf_in_plane = &lrf_input.planes[pli];
              let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
              // Start from the filter and cost remembered from earlier
              // passes.
              let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
              let mut best_cost =
                best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];

              // Check the no filter option
              {
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_input,
                  &src_subset,
                  pli,
                );
                // NOTE(review): the rate is counted for `best_new_lrf`, which
                // at this point is the previously selected filter and not
                // necessarily `RestorationFilter::None` - confirm this is
                // intended.
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  best_new_lrf,
                  pli,
                );

                let cost = compute_rd_cost(fi, rate, err);
                // Was this choice actually an improvement?
                if best_cost < 0. || cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = RestorationFilter::None;
                }
              }

              // Look for a self guided filter
              // We need the cropped-to-visible-frame computation area of this LRU
              let vis_width = unit_size.min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
              );
              let vis_height = unit_size.min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
              );

              // todo: experiment with borrowing border pixels
              // rather than edge-extending. Right now this is
              // hard-clipping to the superblock boundary.
              setup_integral_image(
                &mut ts.integral_buffer,
                SOLVE_IMAGE_STRIDE,
                vis_width,
                vis_height,
                vis_width,
                vis_height,
                &lrf_in_plane.slice(lrf_po),
                &lrf_in_plane.slice(lrf_po),
              );

              // Try each self-guided parameter set allowed by the speed
              // settings: solve for its weights, apply it, and cost it.
              for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
              {
                let (xqd0, xqd1) = sgrproj_solve(
                  set,
                  fi,
                  &ts.integral_buffer,
                  &src_plane
                    .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
                  &lrf_in_plane.slice(lrf_po),
                  vis_width,
                  vis_height,
                );
                let current_lrf =
                  RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                // This pattern always matches: `current_lrf` was built as
                // `Sgrproj` just above.
                if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
                  sgrproj_stripe_filter(
                    set,
                    xqd,
                    fi,
                    &ts.integral_buffer,
                    SOLVE_IMAGE_STRIDE,
                    &lrf_in_plane.slice(lrf_po),
                    &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                      x: lrf_po.x,
                      y: lrf_po.y,
                      width: vis_width,
                      height: vis_height,
                    }),
                  );
                }
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_ref,
                  &src_subset,
                  pli,
                );
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  current_lrf,
                  pli,
                );
                let cost = compute_rd_cost(fi, rate, err);
                if cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = current_lrf;
                }
              }

              // Record a changed choice locally and in the restoration
              // unit, and request another iteration.
              if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
                .notequal(best_new_lrf)
              {
                best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
                lrf_change = true;
                if let Some(ru) = ts.restoration.planes[pli]
                  .restoration_unit_mut(base_sbo + loop_sbo)
                {
                  ru.filter = best_new_lrf;
                }
              }
            }
          }
        }
      }
    }
  }
}
2746
// The estimate for all-zero indices with a 4x4 transform must equal the
// first entry of `RDO_RATE_TABLE`.
#[test]
fn estimate_rate_test() {
  let estimated = estimate_rate(0, TxSize::TX_4X4, 0);
  let table_entry = RDO_RATE_TABLE[0][0][0];
  assert_eq!(estimated, table_entry);
}