// rav1e/src/dist.rs

1// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
// Select the distance-metric implementation at compile time: x86_64 NASM
// assembly, aarch64 NEON assembly, or the portable Rust fallback in
// `self::rust` below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::dist::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::dist::*;
  } else {
    pub use self::rust::*;
  }
}
19
20pub(crate) mod rust {
21  use crate::activity::apply_ssim_boost;
22  use crate::cpu_features::CpuFeatureLevel;
23  use crate::tiling::*;
24  use crate::util::*;
25
26  use crate::encoder::IMPORTANCE_BLOCK_SIZE;
27  use crate::rdo::DistortionScale;
28
29  /// Compute the sum of absolute differences over a block.
30  /// w and h can be at most 128, the size of the largest block.
31  pub fn get_sad<T: Pixel>(
32    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
33    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
34  ) -> u32 {
35    debug_assert!(w <= 128 && h <= 128);
36    let plane_org =
37      plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
38    let plane_ref =
39      plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
40
41    plane_org
42      .rows_iter()
43      .zip(plane_ref.rows_iter())
44      .map(|(src, dst)| {
45        src
46          .iter()
47          .zip(dst)
48          .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2)))
49          .sum::<u32>()
50      })
51      .sum()
52  }
53
54  #[inline(always)]
55  const fn butterfly(a: i32, b: i32) -> (i32, i32) {
56    ((a + b), (a - b))
57  }
58
59  #[inline(always)]
60  #[allow(clippy::identity_op, clippy::erasing_op)]
61  fn hadamard4_1d<
62    const LEN: usize,
63    const N: usize,
64    const STRIDE0: usize,
65    const STRIDE1: usize,
66  >(
67    data: &mut [i32; LEN],
68  ) {
69    for i in 0..N {
70      let sub: &mut [i32] = &mut data[i * STRIDE0..];
71      let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
72      let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
73      let (b0, b2) = butterfly(a0, a2);
74      let (b1, b3) = butterfly(a1, a3);
75      sub[0 * STRIDE1] = b0;
76      sub[1 * STRIDE1] = b1;
77      sub[2 * STRIDE1] = b2;
78      sub[3 * STRIDE1] = b3;
79    }
80  }
81
82  #[inline(always)]
83  #[allow(clippy::identity_op, clippy::erasing_op)]
84  fn hadamard8_1d<
85    const LEN: usize,
86    const N: usize,
87    const STRIDE0: usize,
88    const STRIDE1: usize,
89  >(
90    data: &mut [i32; LEN],
91  ) {
92    for i in 0..N {
93      let sub: &mut [i32] = &mut data[i * STRIDE0..];
94
95      let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
96      let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
97      let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]);
98      let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]);
99
100      let (b0, b2) = butterfly(a0, a2);
101      let (b1, b3) = butterfly(a1, a3);
102      let (b4, b6) = butterfly(a4, a6);
103      let (b5, b7) = butterfly(a5, a7);
104
105      let (c0, c4) = butterfly(b0, b4);
106      let (c1, c5) = butterfly(b1, b5);
107      let (c2, c6) = butterfly(b2, b6);
108      let (c3, c7) = butterfly(b3, b7);
109
110      sub[0 * STRIDE1] = c0;
111      sub[1 * STRIDE1] = c1;
112      sub[2 * STRIDE1] = c2;
113      sub[3 * STRIDE1] = c3;
114      sub[4 * STRIDE1] = c4;
115      sub[5 * STRIDE1] = c5;
116      sub[6 * STRIDE1] = c6;
117      sub[7 * STRIDE1] = c7;
118    }
119  }
120
121  #[inline(always)]
122  fn hadamard2d<const LEN: usize, const W: usize, const H: usize>(
123    data: &mut [i32; LEN],
124  ) {
125    /*Vertical transform.*/
126    let vert_func = if H == 4 {
127      hadamard4_1d::<LEN, W, 1, H>
128    } else {
129      hadamard8_1d::<LEN, W, 1, H>
130    };
131    vert_func(data);
132    /*Horizontal transform.*/
133    let horz_func = if W == 4 {
134      hadamard4_1d::<LEN, H, W, 1>
135    } else {
136      hadamard8_1d::<LEN, H, W, 1>
137    };
138    horz_func(data);
139  }
140
141  // SAFETY: The length of data must be 16.
142  unsafe fn hadamard4x4(data: &mut [i32]) {
143    hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
144  }
145
146  // SAFETY: The length of data must be 64.
147  unsafe fn hadamard8x8(data: &mut [i32]) {
148    hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
149  }
150
  /// Sum of absolute transformed differences over a block.
  /// w and h can be at most 128, the size of the largest block.
  /// Use the sum of 4x4 and 8x8 hadamard transforms for the transform, but
  /// revert to sad on edges when these transforms do not fit into w and h.
  /// 4x4 transforms instead of 8x8 transforms when width or height < 8.
  pub fn get_satd<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
    h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    assert!(w <= 128 && h <= 128);
    assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
    assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);

    // Size of hadamard transform should be 4x4 or 8x8
    // 4x* and *x4 use 4x4 and all other use 8x8
    let size: usize = w.min(h).min(8);
    let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };

    // Accumulate in u64; the per-chunk sums are added before normalization.
    let mut sum: u64 = 0;

    // Loop over chunks the size of the chosen transform
    for chunk_y in (0..h).step_by(size) {
      let chunk_h = (h - chunk_y).min(size);
      for chunk_x in (0..w).step_by(size) {
        let chunk_w = (w - chunk_x).min(size);
        let chunk_area: Area = Area::Rect {
          x: chunk_x as isize,
          y: chunk_y as isize,
          width: chunk_w,
          height: chunk_h,
        };
        let chunk_org = plane_org.subregion(chunk_area);
        let chunk_ref = plane_ref.subregion(chunk_area);

        // Revert to sad on edge blocks (frame edges)
        if chunk_w != size || chunk_h != size {
          sum += get_sad(
            &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
          ) as u64;
          continue;
        }

        // Stack buffer sized for the largest (8x8) transform, sliced down
        // to size*size for the 4x4 case.
        let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];

        // Move the difference of the transforms to a buffer
        for (row_diff, (row_org, row_ref)) in buf
          .chunks_mut(size)
          .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
        {
          for (diff, (a, b)) in
            row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
          {
            *diff = i32::cast_from(*a) - i32::cast_from(*b);
          }
        }

        // Perform the hadamard transform on the differences
        // SAFETY: A sufficient number elements exist for the size of the transform.
        unsafe {
          tx2d(buf);
        }

        // Sum the absolute values of the transformed differences
        sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::<u64>();
      }
    }

    // Normalize the results
    // Divide by `size` with rounding; `size` is 4 or 8, so for these
    // power-of-two values msb(size) is log2(size) and the division is a shift.
    let ln = msb(size as i32) as u64;
    ((sum + (1 << ln >> 1)) >> ln) as u32
  }
222
  /// Number of bits rounded off before summing in `get_weighted_sse`
  ///
  /// Each chunk's fixed-point product `sse * scale` is rounded down by this
  /// many bits before being accumulated.
  pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;
225
  /// Computes weighted sum of squared error.
  ///
  /// Each scale is applied to a 4x4 region in the provided inputs. Each scale
  /// value is a fixed point number, currently [`DistortionScale`].
  ///
  /// Implementations can require alignment (`bw` (block width) for [`src1`] and
  /// [`src2`] and `bw/4` for `scale`).
  #[inline(never)]
  pub fn get_weighted_sse<T: Pixel>(
    src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
    _cpu: CpuFeatureLevel,
  ) -> u64 {
    // Clip src1 to the w x h block; src2 is not clipped here because zipping
    // with src1's windows below bounds the iteration to the same area.
    let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    // Always chunk and apply scaling on the sse of squares the size of
    // decimated/sub-sampled importance block sizes.
    // Warning: Changing this will require changing/disabling assembly.
    let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;

    // Iterator of a row of scales, stretched out to be per row
    let scales = scale.chunks_exact(scale_stride);

    let sse = src1
      .vert_windows(chunk_size)
      .step_by(chunk_size)
      .zip(src2.vert_windows(chunk_size).step_by(chunk_size))
      .zip(scales)
      .map(|((row1, row2), scales)| {
        row1
          .horz_windows(chunk_size)
          .step_by(chunk_size)
          .zip(row2.horz_windows(chunk_size).step_by(chunk_size))
          .zip(scales)
          .map(|((chunk1, chunk2), &scale)| {
            // Plain (unweighted) SSE over this chunk.
            let sum = chunk1
              .rows_iter()
              .zip(chunk2.rows_iter())
              .map(|(chunk_row1, chunk_row2)| {
                chunk_row1
                  .iter()
                  .zip(chunk_row2)
                  .map(|(&a, &b)| {
                    let c = i32::cast_from(a) - i32::cast_from(b);
                    (c * c) as u32
                  })
                  .sum::<u32>()
              })
              .sum::<u32>();
            // Apply the fixed-point scale, rounding off
            // GET_WEIGHTED_SSE_SHIFT fractional bits.
            (sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1))
              >> GET_WEIGHTED_SSE_SHIFT
          })
          .sum::<u64>()
      })
      .sum::<u64>();

    // Rounded division by the remaining fixed-point unit so an identity
    // scale leaves the SSE unchanged.
    let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
    (sse + (den >> 1)) / den
  }
284
  /// Number of bits of precision used in `AREA_DIVISORS`
  const AREA_DIVISOR_BITS: u8 = 14;

  /// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x)
  ///
  /// Indexed by `area - 1`, this gives a Q14 fixed-point reciprocal of the
  /// area for areas 1..=64 (i.e. up to an 8x8 block).
  #[rustfmt::skip]
  const AREA_DIVISORS: [u16; 64] = [
    16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
     1260, 1170, 1092, 1024,  964,  910,  862,  819,  780,  745,  712,  683,
      655,  630,  607,  585,  565,  546,  529,  512,  496,  482,  468,  455,
      443,  431,  420,  410,  400,  390,  381,  372,  364,  356,  349,  341,
      334,  328,  321,  315,  309,  303,  298,  293,  287,  282,  278,  273,
      269,  264,  260,  256,
  ];
298
  /// Computes a distortion metric of the sum of squares weighted by activity.
  /// w and h should be <= 8.
  #[inline(never)]
  pub fn cdef_dist_kernel<T: Pixel>(
    src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    // TODO: Investigate using different constants in ssim boost for block sizes
    // smaller than 8x8.

    // This kernel only operates on non-subsampled (full resolution) planes.
    debug_assert!(src.plane_cfg.xdec == 0);
    debug_assert!(src.plane_cfg.ydec == 0);
    debug_assert!(dst.plane_cfg.xdec == 0);
    debug_assert!(dst.plane_cfg.ydec == 0);

    // Limit kernel to 8x8
    debug_assert!(w <= 8);
    debug_assert!(h <= 8);

    // Compute the following summations.
    let mut sum_s: u32 = 0; // sum(src_{i,j})
    let mut sum_d: u32 = 0; // sum(dst_{i,j})
    let mut sum_s2: u32 = 0; // sum(src_{i,j}^2)
    let mut sum_d2: u32 = 0; // sum(dst_{i,j}^2)
    let mut sum_sd: u32 = 0; // sum(src_{i,j} * dst_{i,j})
    for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) {
      for (s, d) in row1[..w].iter().zip(row2) {
        let s: u32 = u32::cast_from(*s);
        let d: u32 = u32::cast_from(*d);
        sum_s += s;
        sum_d += d;

        sum_s2 += s * s;
        sum_d2 += d * d;
        sum_sd += s * d;
      }
    }

    // To get the distortion, compute sum of squared error and apply a weight
    // based on the variance of the two planes.
    // sse = sum((s - d)^2), expanded algebraically from the running sums.
    let sse = sum_d2 + sum_s2 - 2 * sum_sd;

    // Convert to 64-bits to avoid overflow when squaring
    let sum_s = sum_s as u64;
    let sum_d = sum_d as u64;

    // Calculate the variance (more accurately variance*area) of each plane.
    // var[iance] = avg(X^2) - avg(X)^2 = sum(X^2) / n - sum(X)^2 / n^2
    //    (n = # samples i.e. area)
    // var * n = sum(X^2) - sum(X)^2 / n
    // When w and h are powers of two, this can be done via shifting.
    // `div` is a Q14 fixed-point approximation of 1 / (w * h).
    let div = AREA_DIVISORS[w * h - 1] as u64;
    let div_shift = AREA_DIVISOR_BITS;
    // Due to rounding, negative values can occur when w or h aren't powers of
    // two. Saturate to avoid underflow.
    let mut svar = sum_s2.saturating_sub(
      ((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );
    let mut dvar = sum_d2.saturating_sub(
      ((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );

    // Scale variances up to 8x8 size.
    //   scaled variance = var * (8x8) / wxh
    // For 8x8, this is a nop. For powers of 2, this is doable with shifting.
    // TODO: It should be possible and faster to do this adjustment in ssim boost
    // var * div >> (14 - 6) == var * (2^14 / area) / 2^8 == var * 64 / area.
    let scale_shift = AREA_DIVISOR_BITS - 6;
    svar =
      ((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
    dvar =
      ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;

    apply_ssim_boost(sse, svar, dvar, bit_depth)
  }
373}
374
375#[cfg(test)]
376pub mod test {
377  use super::*;
378  use crate::cpu_features::CpuFeatureLevel;
379  use crate::frame::*;
380  use crate::tiling::Area;
381  use crate::util::Pixel;
382
383  // Generate plane data for get_sad_same()
384  fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
385    // Two planes with different strides
386    let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8);
387    let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);
388
389    // Make the test pattern robust to data alignment
390    let xpad_off =
391      (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;
392
393    for (i, row) in
394      input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate()
395    {
396      for (j, pixel) in row.iter_mut().enumerate() {
397        let val = ((j + i) as i32 - xpad_off) & 255i32;
398        assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
399        *pixel = T::cast_from(val);
400      }
401    }
402
403    for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate()
404    {
405      for (j, pixel) in row.iter_mut().enumerate() {
406        let val = (j as i32 - i as i32 - xpad_off) & 255i32;
407        assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
408        *pixel = T::cast_from(val);
409      }
410    }
411
412    (input_plane, rec_plane)
413  }
414
415  // Regression and validation test for SAD computation
416  fn get_sad_same_inner<T: Pixel>() {
417    // dynamic allocation: test
418    let blocks: Vec<(usize, usize, u32)> = vec![
419      (4, 4, 1912),
420      (4, 8, 4296),
421      (8, 4, 3496),
422      (8, 8, 7824),
423      (8, 16, 16592),
424      (16, 8, 14416),
425      (16, 16, 31136),
426      (16, 32, 60064),
427      (32, 16, 59552),
428      (32, 32, 120128),
429      (32, 64, 186688),
430      (64, 32, 250176),
431      (64, 64, 438912),
432      (64, 128, 654272),
433      (128, 64, 1016768),
434      (128, 128, 1689792),
435      (4, 16, 8680),
436      (16, 4, 6664),
437      (8, 32, 31056),
438      (32, 8, 27600),
439      (16, 64, 93344),
440      (64, 16, 116384),
441    ];
442
443    let bit_depth: usize = 8;
444    let (input_plane, rec_plane) = setup_planes::<T>();
445
446    for (w, h, distortion) in blocks {
447      let area = Area::StartingAt { x: 32, y: 40 };
448
449      let input_region = input_plane.region(area);
450      let rec_region = rec_plane.region(area);
451
452      assert_eq!(
453        distortion,
454        get_sad(
455          &input_region,
456          &rec_region,
457          w,
458          h,
459          bit_depth,
460          CpuFeatureLevel::default()
461        )
462      );
463    }
464  }
465
  // Validate SAD against known values using 8-bit pixels.
  #[test]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }

  // Validate SAD against known values using 16-bit pixels.
  #[test]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }
475
476  fn get_satd_same_inner<T: Pixel>() {
477    let blocks: Vec<(usize, usize, u32)> = vec![
478      (4, 4, 1408),
479      (4, 8, 2016),
480      (8, 4, 1816),
481      (8, 8, 3984),
482      (8, 16, 5136),
483      (16, 8, 4864),
484      (16, 16, 9984),
485      (16, 32, 13824),
486      (32, 16, 13760),
487      (32, 32, 27952),
488      (32, 64, 37168),
489      (64, 32, 45104),
490      (64, 64, 84176),
491      (64, 128, 127920),
492      (128, 64, 173680),
493      (128, 128, 321456),
494      (4, 16, 3136),
495      (16, 4, 2632),
496      (8, 32, 7056),
497      (32, 8, 6624),
498      (16, 64, 18432),
499      (64, 16, 21312),
500    ];
501
502    let bit_depth: usize = 8;
503    let (input_plane, rec_plane) = setup_planes::<T>();
504
505    for (w, h, distortion) in blocks {
506      let area = Area::StartingAt { x: 32, y: 40 };
507
508      let input_region = input_plane.region(area);
509      let rec_region = rec_plane.region(area);
510
511      assert_eq!(
512        distortion,
513        get_satd(
514          &input_region,
515          &rec_region,
516          w,
517          h,
518          bit_depth,
519          CpuFeatureLevel::default()
520        )
521      );
522    }
523  }
524
  // Validate SATD against known values using 8-bit pixels.
  #[test]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }

  // Validate SATD against known values using 16-bit pixels.
  #[test]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
534}