// Dispatch to the fastest available implementation of the distortion
// functions: x86-64 assembly when built with nasm, aarch64 NEON assembly, or
// the portable Rust fallback below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::dist::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::dist::*;
  } else {
    pub use self::rust::*;
  }
}
19
20pub(crate) mod rust {
21 use crate::activity::apply_ssim_boost;
22 use crate::cpu_features::CpuFeatureLevel;
23 use crate::tiling::*;
24 use crate::util::*;
25
26 use crate::encoder::IMPORTANCE_BLOCK_SIZE;
27 use crate::rdo::DistortionScale;
28
29 pub fn get_sad<T: Pixel>(
32 plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
33 h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
34 ) -> u32 {
35 debug_assert!(w <= 128 && h <= 128);
36 let plane_org =
37 plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
38 let plane_ref =
39 plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
40
41 plane_org
42 .rows_iter()
43 .zip(plane_ref.rows_iter())
44 .map(|(src, dst)| {
45 src
46 .iter()
47 .zip(dst)
48 .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2)))
49 .sum::<u32>()
50 })
51 .sum()
52 }
53
54 #[inline(always)]
55 const fn butterfly(a: i32, b: i32) -> (i32, i32) {
56 ((a + b), (a - b))
57 }
58
59 #[inline(always)]
60 #[allow(clippy::identity_op, clippy::erasing_op)]
61 fn hadamard4_1d<
62 const LEN: usize,
63 const N: usize,
64 const STRIDE0: usize,
65 const STRIDE1: usize,
66 >(
67 data: &mut [i32; LEN],
68 ) {
69 for i in 0..N {
70 let sub: &mut [i32] = &mut data[i * STRIDE0..];
71 let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
72 let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
73 let (b0, b2) = butterfly(a0, a2);
74 let (b1, b3) = butterfly(a1, a3);
75 sub[0 * STRIDE1] = b0;
76 sub[1 * STRIDE1] = b1;
77 sub[2 * STRIDE1] = b2;
78 sub[3 * STRIDE1] = b3;
79 }
80 }
81
  /// One pass of unnormalized 8-point hadamard butterflies over `N` lanes of
  /// `data`. Lane `i` starts at offset `i * STRIDE0`; the eight points within
  /// a lane are `STRIDE1` elements apart. Three butterfly stages: pairs,
  /// quads, then the full octet.
  #[inline(always)]
  #[allow(clippy::identity_op, clippy::erasing_op)]
  fn hadamard8_1d<
    const LEN: usize,
    const N: usize,
    const STRIDE0: usize,
    const STRIDE1: usize,
  >(
    data: &mut [i32; LEN],
  ) {
    for i in 0..N {
      let sub: &mut [i32] = &mut data[i * STRIDE0..];

      // Stage 1: butterflies on adjacent pairs.
      let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]);
      let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]);
      let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]);
      let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]);

      // Stage 2: combine pairs into groups of four.
      let (b0, b2) = butterfly(a0, a2);
      let (b1, b3) = butterfly(a1, a3);
      let (b4, b6) = butterfly(a4, a6);
      let (b5, b7) = butterfly(a5, a7);

      // Stage 3: combine the two groups of four.
      let (c0, c4) = butterfly(b0, b4);
      let (c1, c5) = butterfly(b1, b5);
      let (c2, c6) = butterfly(b2, b6);
      let (c3, c7) = butterfly(b3, b7);

      // Write back; the second outputs of stage 3 land in the upper half.
      sub[0 * STRIDE1] = c0;
      sub[1 * STRIDE1] = c1;
      sub[2 * STRIDE1] = c2;
      sub[3 * STRIDE1] = c3;
      sub[4 * STRIDE1] = c4;
      sub[5 * STRIDE1] = c5;
      sub[6 * STRIDE1] = c6;
      sub[7 * STRIDE1] = c7;
    }
  }
120
  /// Applies a separable, unnormalized 2D hadamard transform in place to a
  /// `W`×`H` block stored row-major in `data` (`LEN == W * H`): a vertical
  /// 1D pass first, then a horizontal one. Each axis must be 4 or 8; the
  /// only callers in this file use square 4x4 and 8x8 blocks.
  #[inline(always)]
  fn hadamard2d<const LEN: usize, const W: usize, const H: usize>(
    data: &mut [i32; LEN],
  ) {
    // Vertical pass: W lanes (one per column), elements H apart.
    let vert_func = if H == 4 {
      hadamard4_1d::<LEN, W, 1, H>
    } else {
      hadamard8_1d::<LEN, W, 1, H>
    };
    vert_func(data);
    // Horizontal pass: H lanes (one per row), elements adjacent.
    let horz_func = if W == 4 {
      hadamard4_1d::<LEN, H, W, 1>
    } else {
      hadamard8_1d::<LEN, H, W, 1>
    };
    horz_func(data);
  }
140
141 unsafe fn hadamard4x4(data: &mut [i32]) {
143 hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16]));
144 }
145
146 unsafe fn hadamard8x8(data: &mut [i32]) {
148 hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64]));
149 }
150
151 pub fn get_satd<T: Pixel>(
157 plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize,
158 h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel,
159 ) -> u32 {
160 assert!(w <= 128 && h <= 128);
161 assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
162 assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);
163
164 let size: usize = w.min(h).min(8);
167 let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };
168
169 let mut sum: u64 = 0;
170
171 for chunk_y in (0..h).step_by(size) {
173 let chunk_h = (h - chunk_y).min(size);
174 for chunk_x in (0..w).step_by(size) {
175 let chunk_w = (w - chunk_x).min(size);
176 let chunk_area: Area = Area::Rect {
177 x: chunk_x as isize,
178 y: chunk_y as isize,
179 width: chunk_w,
180 height: chunk_h,
181 };
182 let chunk_org = plane_org.subregion(chunk_area);
183 let chunk_ref = plane_ref.subregion(chunk_area);
184
185 if chunk_w != size || chunk_h != size {
187 sum += get_sad(
188 &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu,
189 ) as u64;
190 continue;
191 }
192
193 let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];
194
195 for (row_diff, (row_org, row_ref)) in buf
197 .chunks_mut(size)
198 .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
199 {
200 for (diff, (a, b)) in
201 row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
202 {
203 *diff = i32::cast_from(*a) - i32::cast_from(*b);
204 }
205 }
206
207 unsafe {
210 tx2d(buf);
211 }
212
213 sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::<u64>();
215 }
216 }
217
218 let ln = msb(size as i32) as u64;
220 ((sum + (1 << ln >> 1)) >> ln) as u32
221 }
222
  /// Number of fractional bits in the per-chunk `scale` weights consumed by
  /// `get_weighted_sse`; each weighted chunk SSE is rounded and shifted down
  /// by this amount.
  pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8;
225
  /// Computes a weighted sum of squared errors between two regions.
  ///
  /// The regions are processed in square chunks of
  /// `IMPORTANCE_BLOCK_SIZE >> 1` pixels per side. Each chunk's raw SSE is
  /// weighted by the matching entry of `scale` (one entry per chunk, chunk
  /// rows `scale_stride` entries apart) and rounded down by
  /// `GET_WEIGHTED_SSE_SHIFT` fractional bits; the grand total is then
  /// renormalized by the distortion scale of a unit weight.
  ///
  /// `src1` is cropped to `w`×`h`; `src2` and `scale` are implicitly
  /// truncated to the same extent because every `zip` stops at its shorter
  /// operand.
  #[inline(never)]
  pub fn get_weighted_sse<T: Pixel>(
    src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
    scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
    _cpu: CpuFeatureLevel,
  ) -> u64 {
    let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h });
    // Side length of one importance-weighted chunk.
    let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1;

    // One row of scales per chunk row.
    let scales = scale.chunks_exact(scale_stride);

    let sse = src1
      .vert_windows(chunk_size)
      .step_by(chunk_size)
      .zip(src2.vert_windows(chunk_size).step_by(chunk_size))
      .zip(scales)
      .map(|((row1, row2), scales)| {
        row1
          .horz_windows(chunk_size)
          .step_by(chunk_size)
          .zip(row2.horz_windows(chunk_size).step_by(chunk_size))
          .zip(scales)
          .map(|((chunk1, chunk2), &scale)| {
            // Plain (unweighted) SSE over one chunk.
            let sum = chunk1
              .rows_iter()
              .zip(chunk2.rows_iter())
              .map(|(chunk_row1, chunk_row2)| {
                chunk_row1
                  .iter()
                  .zip(chunk_row2)
                  .map(|(&a, &b)| {
                    let c = i32::cast_from(a) - i32::cast_from(b);
                    (c * c) as u32
                  })
                  .sum::<u32>()
              })
              .sum::<u32>();
            // Apply the chunk weight with round-to-nearest.
            (sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1))
              >> GET_WEIGHTED_SSE_SHIFT
          })
          .sum::<u64>()
      })
      .sum::<u64>();

    // Divide (with rounding) by the scale that represents a weight of 1.0.
    let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64;
    (sse + (den >> 1)) / den
  }
284
  /// Fixed-point precision of the reciprocals in [`AREA_DIVISORS`].
  const AREA_DIVISOR_BITS: u8 = 14;

  /// `AREA_DIVISORS[n - 1] == round(2^AREA_DIVISOR_BITS / n)`: fixed-point
  /// reciprocals of the block areas 1..=64, so a division by the area can be
  /// done as a multiply followed by a shift.
  #[rustfmt::skip]
  const AREA_DIVISORS: [u16; 64] = [
    16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365,
    1260, 1170, 1092, 1024, 964, 910, 862, 819, 780, 745, 712, 683,
    655, 630, 607, 585, 565, 546, 529, 512, 496, 482, 468, 455,
    443, 431, 420, 410, 400, 390, 381, 372, 364, 356, 349, 341,
    334, 328, 321, 315, 309, 303, 298, 293, 287, 282, 278, 273,
    269, 264, 260, 256,
  ];
298
  /// Distortion kernel for blocks of at most 8x8 pixels: the block's SSE,
  /// adjusted by `apply_ssim_boost` using the per-block variances of `src`
  /// and `dst`.
  ///
  /// Both regions must come from non-subsampled planes (xdec == ydec == 0).
  /// NOTE(review): the u32 accumulators assume the sums of squares over at
  /// most 64 pixels fit in 32 bits — true for bit depths up to 12, but not
  /// for 16-bit pixel values; confirm the caller's bit-depth limit.
  #[inline(never)]
  pub fn cdef_dist_kernel<T: Pixel>(
    src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
    bit_depth: usize, _cpu: CpuFeatureLevel,
  ) -> u32 {
    debug_assert!(src.plane_cfg.xdec == 0);
    debug_assert!(src.plane_cfg.ydec == 0);
    debug_assert!(dst.plane_cfg.xdec == 0);
    debug_assert!(dst.plane_cfg.ydec == 0);

    debug_assert!(w <= 8);
    debug_assert!(h <= 8);

    // Accumulate pixel sums, sums of squares, and the cross product in one
    // pass over the block.
    let mut sum_s: u32 = 0;
    let mut sum_d: u32 = 0;
    let mut sum_s2: u32 = 0;
    let mut sum_d2: u32 = 0;
    let mut sum_sd: u32 = 0;
    for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) {
      for (s, d) in row1[..w].iter().zip(row2) {
        let s: u32 = u32::cast_from(*s);
        let d: u32 = u32::cast_from(*d);
        sum_s += s;
        sum_d += d;

        sum_s2 += s * s;
        sum_d2 += d * d;
        sum_sd += s * d;
      }
    }

    // sum of (s - d)^2, expanded so the differences are never materialized.
    let sse = sum_d2 + sum_s2 - 2 * sum_sd;

    // Widen before squaring below: (sum)^2 can exceed u32.
    let sum_s = sum_s as u64;
    let sum_d = sum_d as u64;

    // Variance = sum of squares minus mean-square; the division by the area
    // uses the fixed-point reciprocal table with round-to-nearest. The
    // saturating_sub guards against the rounded mean-square slightly
    // exceeding the sum of squares.
    let div = AREA_DIVISORS[w * h - 1] as u64;
    let div_shift = AREA_DIVISOR_BITS;
    let mut svar = sum_s2.saturating_sub(
      ((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );
    let mut dvar = sum_d2.saturating_sub(
      ((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32,
    );

    // Scale the variances by roughly 64 / area (div ~= 2^14 / area, shifted
    // down by 2^(14 - 6)), so sub-8x8 blocks are weighted like full 8x8 ones.
    let scale_shift = AREA_DIVISOR_BITS - 6;
    svar =
      ((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;
    dvar =
      ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32;

    apply_ssim_boost(sse, svar, dvar, bit_depth)
  }
373}
374
375#[cfg(test)]
376pub mod test {
377 use super::*;
378 use crate::cpu_features::CpuFeatureLevel;
379 use crate::frame::*;
380 use crate::tiling::Area;
381 use crate::util::Pixel;
382
383 fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
385 let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8);
387 let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);
388
389 let xpad_off =
391 (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;
392
393 for (i, row) in
394 input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate()
395 {
396 for (j, pixel) in row.iter_mut().enumerate() {
397 let val = ((j + i) as i32 - xpad_off) & 255i32;
398 assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
399 *pixel = T::cast_from(val);
400 }
401 }
402
403 for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate()
404 {
405 for (j, pixel) in row.iter_mut().enumerate() {
406 let val = (j as i32 - i as i32 - xpad_off) & 255i32;
407 assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
408 *pixel = T::cast_from(val);
409 }
410 }
411
412 (input_plane, rec_plane)
413 }
414
415 fn get_sad_same_inner<T: Pixel>() {
417 let blocks: Vec<(usize, usize, u32)> = vec![
419 (4, 4, 1912),
420 (4, 8, 4296),
421 (8, 4, 3496),
422 (8, 8, 7824),
423 (8, 16, 16592),
424 (16, 8, 14416),
425 (16, 16, 31136),
426 (16, 32, 60064),
427 (32, 16, 59552),
428 (32, 32, 120128),
429 (32, 64, 186688),
430 (64, 32, 250176),
431 (64, 64, 438912),
432 (64, 128, 654272),
433 (128, 64, 1016768),
434 (128, 128, 1689792),
435 (4, 16, 8680),
436 (16, 4, 6664),
437 (8, 32, 31056),
438 (32, 8, 27600),
439 (16, 64, 93344),
440 (64, 16, 116384),
441 ];
442
443 let bit_depth: usize = 8;
444 let (input_plane, rec_plane) = setup_planes::<T>();
445
446 for (w, h, distortion) in blocks {
447 let area = Area::StartingAt { x: 32, y: 40 };
448
449 let input_region = input_plane.region(area);
450 let rec_region = rec_plane.region(area);
451
452 assert_eq!(
453 distortion,
454 get_sad(
455 &input_region,
456 &rec_region,
457 w,
458 h,
459 bit_depth,
460 CpuFeatureLevel::default()
461 )
462 );
463 }
464 }
465
  /// `get_sad` must match the golden values for 8-bit pixels.
  #[test]
  fn get_sad_same_u8() {
    get_sad_same_inner::<u8>();
  }
470
  /// `get_sad` must match the golden values for 16-bit pixel storage.
  #[test]
  fn get_sad_same_u16() {
    get_sad_same_inner::<u16>();
  }
475
476 fn get_satd_same_inner<T: Pixel>() {
477 let blocks: Vec<(usize, usize, u32)> = vec![
478 (4, 4, 1408),
479 (4, 8, 2016),
480 (8, 4, 1816),
481 (8, 8, 3984),
482 (8, 16, 5136),
483 (16, 8, 4864),
484 (16, 16, 9984),
485 (16, 32, 13824),
486 (32, 16, 13760),
487 (32, 32, 27952),
488 (32, 64, 37168),
489 (64, 32, 45104),
490 (64, 64, 84176),
491 (64, 128, 127920),
492 (128, 64, 173680),
493 (128, 128, 321456),
494 (4, 16, 3136),
495 (16, 4, 2632),
496 (8, 32, 7056),
497 (32, 8, 6624),
498 (16, 64, 18432),
499 (64, 16, 21312),
500 ];
501
502 let bit_depth: usize = 8;
503 let (input_plane, rec_plane) = setup_planes::<T>();
504
505 for (w, h, distortion) in blocks {
506 let area = Area::StartingAt { x: 32, y: 40 };
507
508 let input_region = input_plane.region(area);
509 let rec_region = rec_plane.region(area);
510
511 assert_eq!(
512 distortion,
513 get_satd(
514 &input_region,
515 &rec_region,
516 w,
517 h,
518 bit_depth,
519 CpuFeatureLevel::default()
520 )
521 );
522 }
523 }
524
  /// `get_satd` must match the golden values for 8-bit pixels.
  #[test]
  fn get_satd_same_u8() {
    get_satd_same_inner::<u8>();
  }
529
  /// `get_satd` must match the golden values for 16-bit pixel storage.
  #[test]
  fn get_satd_same_u16() {
    get_satd_same_inner::<u16>();
  }
534}