Skip to main content

av_scenechange/analyze/
inter.rs

1use std::sync::Arc;
2
3use aligned::{Aligned, A64};
4use arrayvec::ArrayVec;
5use num_rational::Rational32;
6use rayon::iter::{IntoParallelIterator, ParallelIterator};
7use v_frame::{
8    frame::Frame,
9    math::{clamp, ILog},
10    pixel::{ChromaSampling, Pixel},
11    plane::{Plane, PlaneConfig, PlaneOffset},
12};
13
14use super::importance::{
15    IMPORTANCE_BLOCK_SIZE,
16    IMP_BLOCK_MV_UNITS_PER_PIXEL,
17    IMP_BLOCK_SIZE_IN_MV_UNITS,
18};
19use crate::{
20    cpu::CpuFeatureLevel,
21    data::{
22        block::{BlockOffset, BlockSize, MIB_SIZE_LOG2},
23        frame::{FrameInvariants, FrameState, RefType, ALLOWED_REF_FRAMES},
24        motion::{
25            MEStats,
26            MVSamplingMode,
27            MotionEstimationSubsets,
28            MotionVector,
29            ReadGuardMEStats,
30            RefMEStats,
31            TileMEStats,
32            MV_LOW,
33            MV_UPP,
34        },
35        plane::{Area, AsRegion, PlaneBlockOffset, PlaneRegion, PlaneRegionMut, Rect},
36        prediction::PredictionMode,
37        sad::get_sad,
38        satd::get_satd,
39        superblock::{
40            SuperBlockOffset,
41            TileSuperBlockOffset,
42            MAX_SB_SIZE_LOG2,
43            MI_SIZE,
44            MI_SIZE_LOG2,
45            SB_SIZE,
46        },
47        tile::{TileBlockOffset, TileRect, TileStateMut, TilingInfo},
48    },
49};
50
51/// Declares an array of motion vectors in structure of arrays syntax.
52macro_rules! search_pattern_subpel {
53    ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => {
54      [ $(MotionVector { $field_a: $ll_a, $field_b: $ll_b } ),*]
55    };
56}
57
58/// Declares an array of motion vectors in structure of arrays syntax.
59/// Compared to [`search_pattern_subpel`], this version creates motion vectors
60/// in fullpel resolution (x8).
61macro_rules! search_pattern {
62    ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => {
63      [ $(MotionVector { $field_a: $ll_a << 3, $field_b: $ll_b << 3 } ),*]
64    };
65}
66
67/// Diamond pattern of radius 1 as shown below. For fullpel search, use
68/// `DIAMOND_R1_PATTERN_FULLPEL` since it has been scaled for fullpel search.
69/// ```text
70///  X
71/// XoX
72///  X
73/// ```
74/// 'X's are motion candidates and the 'o' is the center.
75const DIAMOND_R1_PATTERN_SUBPEL: [MotionVector; 4] = search_pattern_subpel!(
76  col: [  0,  1,  0, -1],
77  row: [  1,  0, -1,  0]
78);
79
80/// Diamond pattern of radius 1 as shown below. Unlike `DIAMOND_R1_PATTERN`, the
81/// vectors have been shifted fullpel scale.
82/// ```text
83///  X
84/// XoX
85///  X
86/// ```
87/// 'X's are motion candidates and the 'o' is the center.
88const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!(
89  col: [  0,  1,  0, -1],
90  row: [  1,  0, -1,  0]
91);
92
93/// Uneven multi-hexagon search pattern around a center point. Used for locating
94/// irregular movement.
95/// ```text
96///      X
97///    X   X
98///  X       X
99///  X       X
100///  X   o   X
101///  X       X
102///  X       X
103///    X   X
104///      X
105/// ```
106/// 'X's are motion candidates and the 'o' is the center.
107const UMH_PATTERN: [MotionVector; 16] = search_pattern!(
108  col: [ -2, -1,  0,  1,  2,  3,  4,  3,  2,  1,  0, -1, -2,  3, -4, -3],
109  row: [  4,  4,  4,  4,  4,  2,  0, -2, -4, -4, -4, -4, -4, -2,  0,  2]
110);
111
112/// A hexagon pattern around a center point. The pattern is ordered so that the
113/// offsets circle around the center. This is done to allow pruning locations
114/// covered by the last iteration.
115/// ```text
116///   21012
117/// 2  X X
118/// 1
119/// 0 X o X
120/// 1
121/// 2  X X
122/// ```
123/// 'X's are motion candidates and the 'o' is the center.
124///
125/// The illustration below shows the process of a hexagon search.
126/// ```text
127/// Step 1    Step 2
128///  1 1       1 1 2
129///
130/// 1(0)1  => 1 0(1)2
131///
132///  1 1       1 1 2
133/// ```
134/// The search above has gone through the following steps.
135/// 1. Search '1' elements for better candidates than the center '0'.
136/// 2. Recenter around the best candidate ('(1)') and hexagon candidates that
137///    don't overlap with the previous search step (labeled '2').
138const HEXAGON_PATTERN: [MotionVector; 6] = search_pattern!(
139  col: [  0,  2,  2,  0, -2, -2],
140  row: [ -2, -1,  1,  2,  1, -1]
141);
142
143/// A small square pattern around a center point.
144/// ```text
145///   101
146/// 1 XXX
147/// 0 XoX
148/// 1 XXX
149/// ```
150/// 'X's are motion candidates and the 'o' is the center.
151const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!(
152  col: [ -1,  0,  1, -1,  1, -1,  0,  1],
153  row: [  1,  1,  1,  0,  0, -1, -1, -1]
154);
155
156pub(crate) fn estimate_inter_costs<T: Pixel>(
157    frame: Arc<Frame<T>>,
158    ref_frame: Arc<Frame<T>>,
159    bit_depth: usize,
160    frame_rate: Rational32,
161    chroma_sampling: ChromaSampling,
162    buffer: RefMEStats,
163    cpu_feature_level: CpuFeatureLevel,
164) -> f64 {
165    let last_fi =
166        FrameInvariants::new_key_frame(frame.planes[0].cfg.width, frame.planes[0].cfg.height);
167    let fi = FrameInvariants::new_inter_frame(&last_fi, 1).unwrap();
168
169    // Compute the motion vectors.
170    let mut fs = FrameState::new_with_frame_and_me_stats_and_rec(Arc::clone(&frame), buffer);
171    let mut tiling = TilingInfo::from_target_tiles(
172        frame.planes[0].cfg.width,
173        frame.planes[0].cfg.height,
174        *frame_rate.numer() as f64 / *frame_rate.denom() as f64,
175        TilingInfo::tile_log2(1, 0).unwrap(),
176        TilingInfo::tile_log2(1, 0).unwrap(),
177        chroma_sampling == ChromaSampling::Cs422,
178    );
179    compute_motion_vectors(&fi, &mut fs, &mut tiling, bit_depth, cpu_feature_level);
180
181    // Estimate inter costs
182    let plane_org = &frame.planes[0];
183    let plane_ref = &ref_frame.planes[0];
184    let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE;
185    let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE;
186    let stats = &fs.frame_me_stats.read().expect("poisoned lock")[0];
187    let bsize = BlockSize::from_width_and_height(IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE);
188
189    let mut inter_costs = 0;
190    (0..h_in_imp_b).for_each(|y| {
191        (0..w_in_imp_b).for_each(|x| {
192            let mv = stats[y * 2][x * 2].mv;
193
194            // Coordinates of the top-left corner of the reference block, in MV
195            // units.
196            let reference_x = x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64;
197            let reference_y = y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64;
198
199            let region_org = plane_org.region(Area::Rect(Rect {
200                x: (x * IMPORTANCE_BLOCK_SIZE) as isize,
201                y: (y * IMPORTANCE_BLOCK_SIZE) as isize,
202                width: IMPORTANCE_BLOCK_SIZE,
203                height: IMPORTANCE_BLOCK_SIZE,
204            }));
205
206            let region_ref = plane_ref.region(Area::Rect(Rect {
207                x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize,
208                y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize,
209                width: IMPORTANCE_BLOCK_SIZE,
210                height: IMPORTANCE_BLOCK_SIZE,
211            }));
212
213            inter_costs += get_satd(
214                &region_org,
215                &region_ref,
216                bsize.width(),
217                bsize.height(),
218                bit_depth,
219                cpu_feature_level,
220            ) as u64;
221        });
222    });
223    inter_costs as f64 / (w_in_imp_b * h_in_imp_b) as f64
224}
225
226fn compute_motion_vectors<T: Pixel>(
227    fi: &FrameInvariants<T>,
228    fs: &mut FrameState<T>,
229    tiling_info: &mut TilingInfo,
230    bit_depth: usize,
231    cpu_feature_level: CpuFeatureLevel,
232) {
233    tiling_info
234        .tile_iter_mut(fs)
235        .collect::<Vec<_>>()
236        .into_par_iter()
237        .for_each(|mut ctx| {
238            let ts = &mut ctx.ts;
239            estimate_tile_motion(fi, ts, bit_depth, cpu_feature_level);
240        });
241}
242
243fn estimate_tile_motion<T: Pixel>(
244    fi: &FrameInvariants<T>,
245    ts: &mut TileStateMut<'_, T>,
246    bit_depth: usize,
247    cpu_feature_level: CpuFeatureLevel,
248) {
249    let init_size = MIB_SIZE_LOG2;
250
251    let mut prev_ssdec: Option<u8> = None;
252    for mv_size_in_b_log2 in (2..=init_size).rev() {
253        let init = mv_size_in_b_log2 == init_size;
254
255        // Choose subsampling. Pass one is quarter res and pass two is at half res.
256        let ssdec = match init_size - mv_size_in_b_log2 {
257            0 => 2,
258            1 => 1,
259            _ => 0,
260        };
261
262        let new_subsampling = if let Some(prev) = prev_ssdec {
263            prev != ssdec
264        } else {
265            false
266        };
267        prev_ssdec = Some(ssdec);
268
269        // 0.5 and 0.125 are a fudge factors
270        let lambda = 0;
271
272        for sby in 0..ts.sb_height {
273            for sbx in 0..ts.sb_width {
274                let mut tested_frames_flags = 0;
275                for &ref_frame in ALLOWED_REF_FRAMES {
276                    let frame_flag = 1 << fi.ref_frames[ref_frame.to_index()];
277                    if tested_frames_flags & frame_flag == frame_flag {
278                        continue;
279                    }
280                    tested_frames_flags |= frame_flag;
281
282                    let tile_bo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby })
283                        .block_offset(0, 0);
284
285                    if new_subsampling {
286                        refine_subsampled_sb_motion(
287                            fi,
288                            ts,
289                            ref_frame,
290                            mv_size_in_b_log2 + 1,
291                            tile_bo,
292                            ssdec,
293                            lambda,
294                            bit_depth,
295                            cpu_feature_level,
296                        );
297                    }
298
299                    estimate_sb_motion(
300                        fi,
301                        ts,
302                        ref_frame,
303                        mv_size_in_b_log2,
304                        tile_bo,
305                        init,
306                        ssdec,
307                        lambda,
308                        bit_depth,
309                        cpu_feature_level,
310                    );
311                }
312            }
313        }
314    }
315}
316
317#[allow(clippy::too_many_arguments)]
318fn refine_subsampled_sb_motion<T: Pixel>(
319    fi: &FrameInvariants<T>,
320    ts: &mut TileStateMut<'_, T>,
321    ref_frame: RefType,
322    mv_size_in_b_log2: usize,
323    tile_bo: TileBlockOffset,
324    ssdec: u8,
325    lambda: u32,
326    bit_depth: usize,
327    cpu_feature_level: CpuFeatureLevel,
328) {
329    let pix_offset = tile_bo.to_luma_plane_offset();
330    let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize);
331    let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize);
332
333    let mv_size = MI_SIZE << mv_size_in_b_log2;
334
335    // Process in blocks, cropping at edges.
336    for y in (0..sb_h).step_by(mv_size) {
337        for x in (0..sb_w).step_by(mv_size) {
338            let sub_bo =
339                tile_bo.with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2);
340
341            // Clamp to frame edge, rounding up in the case of subsampling.
342            // The rounding makes some assumptions about how subsampling is done.
343            let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec;
344            let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec;
345
346            // Refine the existing motion estimate
347            if let Some(results) = refine_subsampled_motion_estimate(
348                fi,
349                ts,
350                w,
351                h,
352                sub_bo,
353                ref_frame,
354                ssdec,
355                lambda,
356                bit_depth,
357                cpu_feature_level,
358            ) {
359                // normalize sad to 128x128 block
360                let sad =
361                    (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32;
362                save_me_stats(ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats {
363                    mv: results.mv,
364                    normalized_sad: sad,
365                });
366            }
367        }
368    }
369}
370
371/// Refine motion estimation that was computed one level of subsampling up.
372#[allow(clippy::too_many_arguments)]
373fn refine_subsampled_motion_estimate<T: Pixel>(
374    fi: &FrameInvariants<T>,
375    ts: &TileStateMut<'_, T>,
376    w: usize,
377    h: usize,
378    tile_bo: TileBlockOffset,
379    ref_frame: RefType,
380    ssdec: u8,
381    lambda: u32,
382    bit_depth: usize,
383    cpu_feature_level: CpuFeatureLevel,
384) -> Option<MotionSearchResult> {
385    if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] {
386        let frame_bo = ts.to_frame_block_offset(tile_bo);
387        let (mvx_min, mvx_max, mvy_min, mvy_max) =
388            get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec);
389
390        let pmv = [MotionVector { row: 0, col: 0 }; 2];
391
392        let po = frame_bo.to_luma_plane_offset();
393        let (mvx_min, mvx_max, mvy_min, mvy_max) = (
394            mvx_min >> ssdec,
395            mvx_max >> ssdec,
396            mvy_min >> ssdec,
397            mvy_max >> ssdec,
398        );
399        let po = PlaneOffset {
400            x: po.x >> ssdec,
401            y: po.y >> ssdec,
402        };
403        let p_ref = match ssdec {
404            0 => &rec.frame.planes[0],
405            1 => &rec.input_hres,
406            2 => &rec.input_qres,
407            _ => unimplemented!(),
408        };
409
410        let org_region = &match ssdec {
411            0 => ts.input_tile.planes[0].subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
412            1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }),
413            2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }),
414            _ => unimplemented!(),
415        };
416
417        let mv = ts.me_stats[ref_frame.to_index()][tile_bo.0.y][tile_bo.0.x].mv >> ssdec;
418
419        // Given a motion vector at 0 at higher subsampling:
420        // |  -1   |   0   |   1   |
421        // then the vectors at -1 to 2 should be tested at the current subsampling.
422        //      |-------------|
423        // | -2 -1 |  0  1 |  2  3 |
424        // This corresponds to a 4x4 full search.
425        let x_lo = po.x + (mv.col as isize / 8 - 1).max(mvx_min / 8);
426        let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8);
427        let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8);
428        let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8);
429        let mut results = full_search(
430            x_lo,
431            x_hi,
432            y_lo,
433            y_hi,
434            w,
435            h,
436            org_region,
437            p_ref,
438            po,
439            1,
440            lambda,
441            pmv,
442            bit_depth,
443            cpu_feature_level,
444        );
445
446        // Scale motion vectors to full res size
447        results.mv = results.mv << ssdec;
448
449        Some(results)
450    } else {
451        None
452    }
453}
454
455fn get_mv_range(
456    w_in_b: usize,
457    h_in_b: usize,
458    bo: PlaneBlockOffset,
459    blk_w: usize,
460    blk_h: usize,
461) -> (isize, isize, isize, isize) {
462    let border_w = 128 + blk_w as isize * 8;
463    let border_h = 128 + blk_h as isize * 8;
464    let mvx_min = -(bo.0.x as isize) * (8 * MI_SIZE) as isize - border_w;
465    let mvx_max = ((w_in_b - bo.0.x) as isize - (blk_w / MI_SIZE) as isize)
466        * (8 * MI_SIZE) as isize
467        + border_w;
468    let mvy_min = -(bo.0.y as isize) * (8 * MI_SIZE) as isize - border_h;
469    let mvy_max = ((h_in_b - bo.0.y) as isize - (blk_h / MI_SIZE) as isize)
470        * (8 * MI_SIZE) as isize
471        + border_h;
472
473    // <https://aomediacodec.github.io/av1-spec/#assign-mv-semantics>
474    (
475        mvx_min.max(MV_LOW as isize + 1),
476        mvx_max.min(MV_UPP as isize - 1),
477        mvy_min.max(MV_LOW as isize + 1),
478        mvy_max.min(MV_UPP as isize - 1),
479    )
480}
481
482#[allow(clippy::too_many_arguments)]
483fn full_search<T: Pixel>(
484    x_lo: isize,
485    x_hi: isize,
486    y_lo: isize,
487    y_hi: isize,
488    w: usize,
489    h: usize,
490    org_region: &PlaneRegion<T>,
491    p_ref: &Plane<T>,
492    po: PlaneOffset,
493    step: usize,
494    lambda: u32,
495    pmv: [MotionVector; 2],
496    bit_depth: usize,
497    cpu_feature_level: CpuFeatureLevel,
498) -> MotionSearchResult {
499    let search_region = p_ref.region(Area::Rect(Rect {
500        x: x_lo,
501        y: y_lo,
502        width: (x_hi - x_lo) as usize + w,
503        height: (y_hi - y_lo) as usize + h,
504    }));
505
506    let mut best: MotionSearchResult = MotionSearchResult::empty();
507
508    // Select rectangular regions within search region with vert+horz windows
509    for vert_window in search_region.vert_windows(h).step_by(step) {
510        for ref_window in vert_window.horz_windows(w).step_by(step) {
511            let &Rect { x, y, .. } = ref_window.rect();
512
513            let mv = MotionVector {
514                row: 8 * (y as i16 - po.y as i16),
515                col: 8 * (x as i16 - po.x as i16),
516            };
517
518            let rd = compute_mv_rd(
519                pmv,
520                lambda,
521                false,
522                bit_depth,
523                w,
524                h,
525                mv,
526                org_region,
527                &ref_window,
528                cpu_feature_level,
529            );
530
531            if rd.cost < best.rd.cost {
532                best.rd = rd;
533                best.mv = mv;
534            }
535        }
536    }
537
538    best
539}
540
541/// Compute the rate distortion stats for a motion vector.
542#[allow(clippy::too_many_arguments)]
543fn compute_mv_rd<T: Pixel>(
544    pmv: [MotionVector; 2],
545    lambda: u32,
546    use_satd: bool,
547    bit_depth: usize,
548    w: usize,
549    h: usize,
550    cand_mv: MotionVector,
551    plane_org: &PlaneRegion<'_, T>,
552    plane_ref: &PlaneRegion<'_, T>,
553    cpu_feature_level: CpuFeatureLevel,
554) -> MVCandidateRD {
555    let sad = if use_satd {
556        get_satd(plane_org, plane_ref, w, h, bit_depth, cpu_feature_level)
557    } else {
558        get_sad(plane_org, plane_ref, w, h, bit_depth, cpu_feature_level)
559    };
560
561    let rate1 = get_mv_rate(cand_mv, pmv[0]);
562    let rate2 = get_mv_rate(cand_mv, pmv[1]);
563    let rate = rate1.min(rate2 + 1);
564
565    MVCandidateRD {
566        cost: 256 * sad as u64 + rate as u64 * lambda as u64,
567        sad,
568    }
569}
570
571fn diff_to_rate(diff: i16) -> u32 {
572    let d = diff >> 1;
573    2 * ILog::ilog(d.abs()) as u32
574}
575
576fn get_mv_rate(a: MotionVector, b: MotionVector) -> u32 {
577    diff_to_rate(a.row - b.row) + diff_to_rate(a.col - b.col)
578}
579
580/// Result of motion search.
581#[derive(Debug, Copy, Clone)]
582pub struct MotionSearchResult {
583    /// Motion vector chosen by the motion search.
584    pub mv: MotionVector,
585    /// Rate distortion data associated with `mv`.
586    pub rd: MVCandidateRD,
587}
588
589impl MotionSearchResult {
590    /// Creates an 'empty' value.
591    ///
592    /// To be considered empty, cost is set higher than any naturally occurring
593    /// cost value. The idea is that comparing to any valid rd output, the
594    /// search result will always be replaced.
595    pub fn empty() -> MotionSearchResult {
596        MotionSearchResult {
597            mv: MotionVector::default(),
598            rd: MVCandidateRD::empty(),
599        }
600    }
601
602    /// Check if the value should be considered to be empty.
603    const fn is_empty(&self) -> bool {
604        self.rd.cost == u64::MAX
605    }
606}
607
608/// Holds data from computing rate distortion of a motion vector.
609#[derive(Debug, Copy, Clone)]
610pub struct MVCandidateRD {
611    /// Rate distortion cost of the motion vector.
612    pub cost: u64,
613    /// Distortion metric value for the motion vector.
614    pub sad: u32,
615}
616
617impl MVCandidateRD {
618    /// Creates an 'empty' value.
619    ///
620    /// To be considered empty, cost is set higher than any naturally occurring
621    /// cost value. The idea is that comparing to any valid rd output, the
622    /// search result will always be replaced.
623    const fn empty() -> MVCandidateRD {
624        MVCandidateRD {
625            sad: u32::MAX,
626            cost: u64::MAX,
627        }
628    }
629}
630
631fn save_me_stats<T: Pixel>(
632    ts: &mut TileStateMut<'_, T>,
633    mv_size_in_b_log2: usize,
634    tile_bo: TileBlockOffset,
635    ref_frame: RefType,
636    stats: MEStats,
637) {
638    let size_in_b = 1 << mv_size_in_b_log2;
639    let tile_me_stats = &mut ts.me_stats[ref_frame.to_index()];
640    let tile_bo_x_end = (tile_bo.0.x + size_in_b).min(ts.mi_width);
641    let tile_bo_y_end = (tile_bo.0.y + size_in_b).min(ts.mi_height);
642    for mi_y in tile_bo.0.y..tile_bo_y_end {
643        for a in tile_me_stats[mi_y][tile_bo.0.x..tile_bo_x_end].iter_mut() {
644            *a = stats;
645        }
646    }
647}
648
649#[allow(clippy::too_many_arguments)]
650fn estimate_sb_motion<T: Pixel>(
651    fi: &FrameInvariants<T>,
652    ts: &mut TileStateMut<'_, T>,
653    ref_frame: RefType,
654    mv_size_in_b_log2: usize,
655    tile_bo: TileBlockOffset,
656    init: bool,
657    ssdec: u8,
658    lambda: u32,
659    bit_depth: usize,
660    cpu_feature_level: CpuFeatureLevel,
661) {
662    let pix_offset = tile_bo.to_luma_plane_offset();
663    let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize);
664    let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize);
665
666    let mv_size = MI_SIZE << mv_size_in_b_log2;
667
668    // Process in blocks, cropping at edges.
669    for y in (0..sb_h).step_by(mv_size) {
670        for x in (0..sb_w).step_by(mv_size) {
671            let corner: MVSamplingMode = if init {
672                MVSamplingMode::INIT
673            } else {
674                // Processing the block a size up produces data that can be used by
675                // the right and bottom corners.
676                MVSamplingMode::CORNER {
677                    right: x & mv_size == mv_size,
678                    bottom: y & mv_size == mv_size,
679                }
680            };
681
682            let sub_bo =
683                tile_bo.with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2);
684
685            // Clamp to frame edge, rounding up in the case of subsampling.
686            // The rounding makes some assumptions about how subsampling is done.
687            let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec;
688            let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec;
689
690            // Run motion estimation.
691            // Note that the initial search (init) instructs the called function to
692            // perform a more extensive search.
693            if let Some(results) = estimate_motion(
694                fi,
695                ts,
696                w,
697                h,
698                sub_bo,
699                ref_frame,
700                None,
701                corner,
702                init,
703                ssdec,
704                Some(lambda),
705                bit_depth,
706                cpu_feature_level,
707            ) {
708                // normalize sad to 128x128 block
709                let sad =
710                    (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32;
711                save_me_stats(ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats {
712                    mv: results.mv,
713                    normalized_sad: sad,
714                });
715            }
716        }
717    }
718}
719
720#[allow(clippy::too_many_arguments)]
721fn estimate_motion<T: Pixel>(
722    fi: &FrameInvariants<T>,
723    ts: &TileStateMut<'_, T>,
724    w: usize,
725    h: usize,
726    tile_bo: TileBlockOffset,
727    ref_frame: RefType,
728    pmv: Option<[MotionVector; 2]>,
729    corner: MVSamplingMode,
730    extensive_search: bool,
731    ssdec: u8,
732    lambda: Option<u32>,
733    bit_depth: usize,
734    cpu_feature_level: CpuFeatureLevel,
735) -> Option<MotionSearchResult> {
736    if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] {
737        let frame_bo = ts.to_frame_block_offset(tile_bo);
738        let (mvx_min, mvx_max, mvy_min, mvy_max) =
739            get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec);
740
741        let lambda = lambda.unwrap_or(0);
742
743        let global_mv = [MotionVector { row: 0, col: 0 }; 2];
744
745        let po = frame_bo.to_luma_plane_offset();
746        let (mvx_min, mvx_max, mvy_min, mvy_max) = (
747            mvx_min >> ssdec,
748            mvx_max >> ssdec,
749            mvy_min >> ssdec,
750            mvy_max >> ssdec,
751        );
752        let po = PlaneOffset {
753            x: po.x >> ssdec,
754            y: po.y >> ssdec,
755        };
756        let p_ref = match ssdec {
757            0 => &rec.frame.planes[0],
758            1 => &rec.input_hres,
759            2 => &rec.input_qres,
760            _ => unimplemented!(),
761        };
762
763        let org_region = &match ssdec {
764            0 => ts.input_tile.planes[0].subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
765            1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }),
766            2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }),
767            _ => unimplemented!(),
768        };
769
770        let mut best: MotionSearchResult = full_pixel_me(
771            fi,
772            ts,
773            org_region,
774            p_ref,
775            tile_bo,
776            po,
777            lambda,
778            pmv.unwrap_or(global_mv),
779            w,
780            h,
781            mvx_min,
782            mvx_max,
783            mvy_min,
784            mvy_max,
785            ref_frame,
786            corner,
787            extensive_search,
788            ssdec,
789            bit_depth,
790            cpu_feature_level,
791        );
792
793        if let Some(pmv) = pmv {
794            best.rd = get_fullpel_mv_rd(
795                po,
796                org_region,
797                p_ref,
798                bit_depth,
799                pmv,
800                lambda,
801                true,
802                mvx_min,
803                mvx_max,
804                mvy_min,
805                mvy_max,
806                w,
807                h,
808                best.mv,
809                cpu_feature_level,
810            );
811
812            sub_pixel_me(
813                fi,
814                po,
815                org_region,
816                p_ref,
817                lambda,
818                pmv,
819                mvx_min,
820                mvx_max,
821                mvy_min,
822                mvy_max,
823                w,
824                h,
825                true,
826                &mut best,
827                ref_frame,
828                bit_depth,
829                cpu_feature_level,
830            );
831        }
832
833        // Scale motion vectors to full res size
834        best.mv = best.mv << ssdec;
835
836        Some(best)
837    } else {
838        None
839    }
840}
841
842#[allow(clippy::too_many_arguments)]
843fn full_pixel_me<T: Pixel>(
844    fi: &FrameInvariants<T>,
845    ts: &TileStateMut<'_, T>,
846    org_region: &PlaneRegion<T>,
847    p_ref: &Plane<T>,
848    tile_bo: TileBlockOffset,
849    po: PlaneOffset,
850    lambda: u32,
851    pmv: [MotionVector; 2],
852    w: usize,
853    h: usize,
854    mvx_min: isize,
855    mvx_max: isize,
856    mvy_min: isize,
857    mvy_max: isize,
858    ref_frame: RefType,
859    corner: MVSamplingMode,
860    extensive_search: bool,
861    ssdec: u8,
862    bit_depth: usize,
863    cpu_feature_level: CpuFeatureLevel,
864) -> MotionSearchResult {
865    let ref_frame_id = ref_frame.to_index();
866    let tile_me_stats = &ts.me_stats[ref_frame_id].as_const();
867    let frame_ref = fi.rec_buffer.frames[fi.ref_frames[0] as usize]
868        .as_ref()
869        .map(|frame_ref| frame_ref.frame_me_stats.read().expect("poisoned lock"));
870    let subsets = get_subset_predictors(
871        tile_bo,
872        tile_me_stats,
873        frame_ref,
874        ref_frame_id,
875        w,
876        h,
877        mvx_min,
878        mvx_max,
879        mvy_min,
880        mvy_max,
881        corner,
882        ssdec,
883    );
884
885    let try_cands = |predictors: &[MotionVector], best: &mut MotionSearchResult| {
886        let mut results = get_best_predictor(
887            po,
888            org_region,
889            p_ref,
890            predictors,
891            bit_depth,
892            pmv,
893            lambda,
894            mvx_min,
895            mvx_max,
896            mvy_min,
897            mvy_max,
898            w,
899            h,
900            cpu_feature_level,
901        );
902        fullpel_diamond_search(
903            po,
904            org_region,
905            p_ref,
906            &mut results,
907            bit_depth,
908            pmv,
909            lambda,
910            mvx_min,
911            mvx_max,
912            mvy_min,
913            mvy_max,
914            w,
915            h,
916            cpu_feature_level,
917        );
918
919        if results.rd.cost < best.rd.cost {
920            *best = results;
921        }
922    };
923
924    let mut best: MotionSearchResult = MotionSearchResult::empty();
925    if !extensive_search {
926        try_cands(&subsets.all_mvs(), &mut best);
927        best
928    } else {
929        // Perform a more thorough search before resorting to full search.
930        // Search the median, the best mvs of neighboring blocks, and motion vectors
931        // from the previous frame. Stop once a candidate with a sad less than a
932        // threshold is found.
933
934        let thresh = (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (bit_depth - 8));
935
936        if let Some(median) = subsets.median {
937            try_cands(&[median], &mut best);
938
939            if best.rd.sad < thresh {
940                return best;
941            }
942        }
943
944        try_cands(&subsets.subset_b, &mut best);
945
946        if best.rd.sad < thresh {
947            return best;
948        }
949
950        try_cands(&subsets.subset_c, &mut best);
951
952        if best.rd.sad < thresh {
953            return best;
954        }
955
956        // Preform UMH search, either as the last possible search when full search
957        // is disabled, or as the last search before resorting to full search.
958        // Use 24 merange, since it is the largest range that x264 uses.
959        uneven_multi_hex_search(
960            po,
961            org_region,
962            p_ref,
963            &mut best,
964            bit_depth,
965            pmv,
966            lambda,
967            mvx_min,
968            mvx_max,
969            mvy_min,
970            mvy_max,
971            w,
972            h,
973            24,
974            cpu_feature_level,
975        );
976
977        best
978    }
979}
980
981#[allow(clippy::too_many_arguments)]
982fn sub_pixel_me<T: Pixel>(
983    fi: &FrameInvariants<T>,
984    po: PlaneOffset,
985    org_region: &PlaneRegion<T>,
986    p_ref: &Plane<T>,
987    lambda: u32,
988    pmv: [MotionVector; 2],
989    mvx_min: isize,
990    mvx_max: isize,
991    mvy_min: isize,
992    mvy_max: isize,
993    w: usize,
994    h: usize,
995    use_satd: bool,
996    best: &mut MotionSearchResult,
997    ref_frame: RefType,
998    bit_depth: usize,
999    cpu_feature_level: CpuFeatureLevel,
1000) {
1001    subpel_diamond_search(
1002        fi,
1003        po,
1004        org_region,
1005        p_ref,
1006        bit_depth,
1007        pmv,
1008        lambda,
1009        mvx_min,
1010        mvx_max,
1011        mvy_min,
1012        mvy_max,
1013        w,
1014        h,
1015        use_satd,
1016        best,
1017        ref_frame,
1018        cpu_feature_level,
1019    );
1020}
1021
1022/// Run a subpixel diamond search. The search is run on multiple step sizes.
1023///
1024/// For each step size, candidate motion vectors are examined for improvement
1025/// to the current search location. The search location is moved to the best
1026/// candidate (if any). This is repeated until the search location stops moving.
1027#[allow(clippy::too_many_arguments)]
1028fn subpel_diamond_search<T: Pixel>(
1029    fi: &FrameInvariants<T>,
1030    po: PlaneOffset,
1031    org_region: &PlaneRegion<T>,
1032    _p_ref: &Plane<T>,
1033    bit_depth: usize,
1034    pmv: [MotionVector; 2],
1035    lambda: u32,
1036    mvx_min: isize,
1037    mvx_max: isize,
1038    mvy_min: isize,
1039    mvy_max: isize,
1040    w: usize,
1041    h: usize,
1042    use_satd: bool,
1043    current: &mut MotionSearchResult,
1044    ref_frame: RefType,
1045    cpu_feature_level: CpuFeatureLevel,
1046) {
1047    // Motion compensation assembly has special requirements for edges
1048    let mc_w = w.next_power_of_two();
1049    let mc_h = (h + 1) & !1;
1050
1051    // Metadata for subpel scratch pad.
1052    let cfg = PlaneConfig::new(mc_w, mc_h, 0, 0, 0, 0, std::mem::size_of::<T>());
1053    // Stack allocation for subpel scratch pad.
1054    // SAFETY: We write to the array below before reading from it.
1055    let mut buf: Aligned<A64, [T; 128 * 128]> = Aligned([T::cast_from(0); 128 * 128]);
1056    let mut tmp_region = PlaneRegionMut::from_slice(buf.as_mut(), &cfg, Rect {
1057        x: 0,
1058        y: 0,
1059        width: cfg.width,
1060        height: cfg.height,
1061    });
1062
1063    // start at 1/2 pel and end at 1/4 or 1/8 pel
1064    let (mut diamond_radius_log2, diamond_radius_end_log2) = (2u8, 1u8);
1065
1066    loop {
1067        // Find the best candidate from the diamond pattern.
1068        let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
1069        for &offset in &DIAMOND_R1_PATTERN_SUBPEL {
1070            let cand_mv = current.mv + (offset << diamond_radius_log2);
1071
1072            let rd = get_subpel_mv_rd(
1073                fi,
1074                po,
1075                org_region,
1076                bit_depth,
1077                pmv,
1078                lambda,
1079                use_satd,
1080                mvx_min,
1081                mvx_max,
1082                mvy_min,
1083                mvy_max,
1084                w,
1085                h,
1086                cand_mv,
1087                &mut tmp_region,
1088                ref_frame,
1089                cpu_feature_level,
1090            );
1091
1092            if rd.cost < best_cand.rd.cost {
1093                best_cand.mv = cand_mv;
1094                best_cand.rd = rd;
1095            }
1096        }
1097
1098        // Continue the search at this scale until a better candidate isn't found.
1099        if current.rd.cost <= best_cand.rd.cost {
1100            if diamond_radius_log2 == diamond_radius_end_log2 {
1101                break;
1102            } else {
1103                diamond_radius_log2 -= 1;
1104            }
1105        } else {
1106            *current = best_cand;
1107        }
1108    }
1109
1110    assert!(!current.is_empty());
1111}
1112
1113#[allow(clippy::too_many_arguments)]
1114fn get_subpel_mv_rd<T: Pixel>(
1115    fi: &FrameInvariants<T>,
1116    po: PlaneOffset,
1117    org_region: &PlaneRegion<T>,
1118    bit_depth: usize,
1119    pmv: [MotionVector; 2],
1120    lambda: u32,
1121    use_satd: bool,
1122    mvx_min: isize,
1123    mvx_max: isize,
1124    mvy_min: isize,
1125    mvy_max: isize,
1126    w: usize,
1127    h: usize,
1128    cand_mv: MotionVector,
1129    tmp_region: &mut PlaneRegionMut<T>,
1130    ref_frame: RefType,
1131    cpu_feature_level: CpuFeatureLevel,
1132) -> MVCandidateRD {
1133    if (cand_mv.col as isize) < mvx_min
1134        || (cand_mv.col as isize) > mvx_max
1135        || (cand_mv.row as isize) < mvy_min
1136        || (cand_mv.row as isize) > mvy_max
1137    {
1138        return MVCandidateRD::empty();
1139    }
1140
1141    let tmp_width = tmp_region.rect().width;
1142    let tmp_height = tmp_region.rect().height;
1143    let tile_rect = TileRect {
1144        x: 0,
1145        y: 0,
1146        width: tmp_width,
1147        height: tmp_height,
1148    };
1149
1150    PredictionMode::NEWMV.predict_inter_single(
1151        fi,
1152        tile_rect,
1153        0,
1154        po,
1155        tmp_region,
1156        // motion comp's w & h on edges can be different from distortion's
1157        tmp_width,
1158        tmp_height,
1159        ref_frame,
1160        cand_mv,
1161        bit_depth,
1162        cpu_feature_level,
1163    );
1164    let plane_ref = tmp_region.as_const();
1165    compute_mv_rd(
1166        pmv,
1167        lambda,
1168        use_satd,
1169        bit_depth,
1170        w,
1171        h,
1172        cand_mv,
1173        org_region,
1174        &plane_ref,
1175        cpu_feature_level,
1176    )
1177}
1178
1179/// Perform an uneven multi-hexagon search. There are 4 stages:
1180/// 1. Unsymmetrical-cross search: Search the horizontal and vertical directions
1181///    for the general direction of the motion.
1182/// 2. A 5x5 full search is done to refine the current candidate.
1183/// 3. Uneven multi-hexagon search. See [`UMH_PATTERN`].
1184/// 4. Refinement using standard hexagon search.
1185///
1186/// `current` provides the initial search location and serves as
1187/// the output for the final search results.
1188///
1189/// `me_range` parameter determines how far these stages can search.
1190#[allow(clippy::too_many_arguments)]
1191fn uneven_multi_hex_search<T: Pixel>(
1192    po: PlaneOffset,
1193    org_region: &PlaneRegion<T>,
1194    p_ref: &Plane<T>,
1195    current: &mut MotionSearchResult,
1196    bit_depth: usize,
1197    pmv: [MotionVector; 2],
1198    lambda: u32,
1199    mvx_min: isize,
1200    mvx_max: isize,
1201    mvy_min: isize,
1202    mvy_max: isize,
1203    w: usize,
1204    h: usize,
1205    me_range: i16,
1206    cpu_feature_level: CpuFeatureLevel,
1207) {
1208    assert!(!current.is_empty());
1209
1210    // Search in a cross pattern to obtain a rough approximate of motion.
1211    // The cross is split into a horizontal and vertical component. Video content
1212    // tends to have more horizontal motion, so the horizontal part of the cross
1213    // is twice as large as the vertical half.
1214    //        X        -
1215    //                 | <- me_range/2
1216    //        X        |
1217    // X X X XoX X X X -
1218    //        X
1219    //
1220    //        X
1221    // |------|
1222    //     \
1223    //    me_range
1224    let center = current.mv;
1225
1226    // The larger, horizontal, part of the cross search.
1227    for i in (1..=me_range).step_by(2) {
1228        const HORIZONTAL_LINE: [MotionVector; 2] = search_pattern!(
1229          col: [ 0, 0],
1230          row: [-1, 1]
1231        );
1232
1233        for &offset in &HORIZONTAL_LINE {
1234            let cand_mv = center + offset * i;
1235            let rd = get_fullpel_mv_rd(
1236                po,
1237                org_region,
1238                p_ref,
1239                bit_depth,
1240                pmv,
1241                lambda,
1242                false,
1243                mvx_min,
1244                mvx_max,
1245                mvy_min,
1246                mvy_max,
1247                w,
1248                h,
1249                cand_mv,
1250                cpu_feature_level,
1251            );
1252
1253            if rd.cost < current.rd.cost {
1254                current.mv = cand_mv;
1255                current.rd = rd;
1256            }
1257        }
1258    }
1259
1260    // The smaller, vertical, part of the cross search
1261    for i in (1..=me_range >> 1).step_by(2) {
1262        const VERTICAL_LINE: [MotionVector; 2] = search_pattern!(
1263          col: [-1, 1],
1264          row: [ 0, 0]
1265        );
1266
1267        for &offset in &VERTICAL_LINE {
1268            let cand_mv = center + offset * i;
1269            let rd = get_fullpel_mv_rd(
1270                po,
1271                org_region,
1272                p_ref,
1273                bit_depth,
1274                pmv,
1275                lambda,
1276                false,
1277                mvx_min,
1278                mvx_max,
1279                mvy_min,
1280                mvy_max,
1281                w,
1282                h,
1283                cand_mv,
1284                cpu_feature_level,
1285            );
1286
1287            if rd.cost < current.rd.cost {
1288                current.mv = cand_mv;
1289                current.rd = rd;
1290            }
1291        }
1292    }
1293
1294    // 5x5 full search. Search a 5x5 square region around the current best mv.
1295    let center = current.mv;
1296    for row in -2..=2 {
1297        for col in -2..=2 {
1298            if row == 0 && col == 0 {
1299                continue;
1300            }
1301            let cand_mv = center + MotionVector { row, col };
1302            let rd = get_fullpel_mv_rd(
1303                po,
1304                org_region,
1305                p_ref,
1306                bit_depth,
1307                pmv,
1308                lambda,
1309                false,
1310                mvx_min,
1311                mvx_max,
1312                mvy_min,
1313                mvy_max,
1314                w,
1315                h,
1316                cand_mv,
1317                cpu_feature_level,
1318            );
1319
1320            if rd.cost < current.rd.cost {
1321                current.mv = cand_mv;
1322                current.rd = rd;
1323            }
1324        }
1325    }
1326
1327    // Run the hexagons in uneven multi-hexagon. The hexagonal pattern is tested
1328    // around the best vector at multiple scales.
1329    // Example of the UMH pattern run on a scale of 1 and 2:
1330    //         2         -
1331    //                   | <- me_range
1332    //     2       2     |
1333    //                   |
1334    // 2       1       2 |
1335    //       1   1       |
1336    // 2   1       1   2 |
1337    //     1       1     |
1338    // 2   1   o   1   2 |
1339    //     1       1     |
1340    // 2   1       1   2 |
1341    //       1   1       |
1342    // 2       1       2 |
1343    //                   |
1344    //     2       2     |
1345    //                   |
1346    //         2         -
1347    // |---------------|
1348    //        \
1349    //       me_range
1350    let center = current.mv;
1351
1352    // Divide by 4, the radius of the UMH's hexagon.
1353    let iterations = me_range >> 2;
1354    for i in 1..=iterations {
1355        for &offset in &UMH_PATTERN {
1356            let cand_mv = center + offset * i;
1357            let rd = get_fullpel_mv_rd(
1358                po,
1359                org_region,
1360                p_ref,
1361                bit_depth,
1362                pmv,
1363                lambda,
1364                false,
1365                mvx_min,
1366                mvx_max,
1367                mvy_min,
1368                mvy_max,
1369                w,
1370                h,
1371                cand_mv,
1372                cpu_feature_level,
1373            );
1374
1375            if rd.cost < current.rd.cost {
1376                current.mv = cand_mv;
1377                current.rd = rd;
1378            }
1379        }
1380    }
1381
1382    // Refine the search results using a 'normal' hexagon search.
1383    hexagon_search(
1384        po,
1385        org_region,
1386        p_ref,
1387        current,
1388        bit_depth,
1389        pmv,
1390        lambda,
1391        mvx_min,
1392        mvx_max,
1393        mvy_min,
1394        mvy_max,
1395        w,
1396        h,
1397        cpu_feature_level,
1398    );
1399}
1400
1401#[allow(clippy::too_many_arguments)]
1402fn get_subset_predictors(
1403    tile_bo: TileBlockOffset,
1404    tile_me_stats: &TileMEStats<'_>,
1405    frame_ref_opt: Option<ReadGuardMEStats<'_>>,
1406    ref_frame_id: usize,
1407    pix_w: usize,
1408    pix_h: usize,
1409    mvx_min: isize,
1410    mvx_max: isize,
1411    mvy_min: isize,
1412    mvy_max: isize,
1413    corner: MVSamplingMode,
1414    ssdec: u8,
1415) -> MotionEstimationSubsets {
1416    let mut min_sad: u32 = u32::MAX;
1417    let mut subset_b = ArrayVec::<MotionVector, 5>::new();
1418    let mut subset_c = ArrayVec::<MotionVector, 5>::new();
1419
1420    // rounded up width in blocks
1421    let w = ((pix_w << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2;
1422    let h = ((pix_h << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2;
1423
1424    // Get predictors from the same frame.
1425
1426    let clipped_half_w = (w >> 1).min(tile_me_stats.cols() - 1 - tile_bo.0.x);
1427    let clipped_half_h = (h >> 1).min(tile_me_stats.rows() - 1 - tile_bo.0.y);
1428
1429    let mut process_cand = |stats: MEStats| -> MotionVector {
1430        min_sad = min_sad.min(stats.normalized_sad);
1431        let mv = stats.mv.quantize_to_fullpel();
1432        MotionVector {
1433            col: clamp(mv.col as isize, mvx_min, mvx_max) as i16,
1434            row: clamp(mv.row as isize, mvy_min, mvy_max) as i16,
1435        }
1436    };
1437
1438    // Sample the middle of all block edges bordering this one.
1439    // Note: If motion vectors haven't been precomputed to a given blocksize, then
1440    // the right and bottom edges will be duplicates of the center predictor when
1441    // processing in raster order.
1442
1443    // left
1444    if tile_bo.0.x > 0 {
1445        subset_b.push(process_cand(
1446            tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x - 1],
1447        ));
1448    }
1449    // top
1450    if tile_bo.0.y > 0 {
1451        subset_b.push(process_cand(
1452            tile_me_stats[tile_bo.0.y - 1][tile_bo.0.x + clipped_half_w],
1453        ));
1454    }
1455
1456    // Sampling far right and far bottom edges was tested, but had worse results
1457    // without an extensive threshold test (with threshold being applied after
1458    // checking median and the best of each subset).
1459
1460    // right
1461    if let MVSamplingMode::CORNER {
1462        right: true,
1463        bottom: _,
1464    } = corner
1465    {
1466        if tile_bo.0.x + w < tile_me_stats.cols() {
1467            subset_b.push(process_cand(
1468                tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + w],
1469            ));
1470        }
1471    }
1472    // bottom
1473    if let MVSamplingMode::CORNER {
1474        right: _,
1475        bottom: true,
1476    } = corner
1477    {
1478        if tile_bo.0.y + h < tile_me_stats.rows() {
1479            subset_b.push(process_cand(
1480                tile_me_stats[tile_bo.0.y + h][tile_bo.0.x + clipped_half_w],
1481            ));
1482        }
1483    }
1484
1485    let median = if corner != MVSamplingMode::INIT {
1486        // Sample the center of the current block.
1487        Some(process_cand(
1488            tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + clipped_half_w],
1489        ))
1490    } else if subset_b.len() != 3 {
1491        None
1492    } else {
1493        let mut rows: ArrayVec<i16, 3> = subset_b.iter().map(|&a| a.row).collect();
1494        let mut cols: ArrayVec<i16, 3> = subset_b.iter().map(|&a| a.col).collect();
1495        rows.as_mut_slice().sort_unstable();
1496        cols.as_mut_slice().sort_unstable();
1497        Some(MotionVector {
1498            row: rows[1],
1499            col: cols[1],
1500        })
1501    };
1502
1503    // Zero motion vector, don't use add_cand since it skips zero vectors.
1504    subset_b.push(MotionVector::default());
1505
1506    // EPZS subset C predictors.
1507    // Sample the middle of bordering side of the left, right, top and bottom
1508    // blocks of the previous frame.
1509    // Sample the middle of this block in the previous frame.
1510
1511    if let Some(frame_me_stats) = frame_ref_opt {
1512        let prev_frame = &frame_me_stats[ref_frame_id];
1513
1514        let frame_bo = PlaneBlockOffset(BlockOffset {
1515            x: tile_me_stats.x() + tile_bo.0.x,
1516            y: tile_me_stats.y() + tile_bo.0.y,
1517        });
1518        let clipped_half_w = (w >> 1).min(prev_frame.cols - 1 - frame_bo.0.x);
1519        let clipped_half_h = (h >> 1).min(prev_frame.rows - 1 - frame_bo.0.y);
1520
1521        // left
1522        if frame_bo.0.x > 0 {
1523            subset_c.push(process_cand(
1524                prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x - 1],
1525            ));
1526        }
1527        // top
1528        if frame_bo.0.y > 0 {
1529            subset_c.push(process_cand(
1530                prev_frame[frame_bo.0.y - 1][frame_bo.0.x + clipped_half_w],
1531            ));
1532        }
1533        // right
1534        if frame_bo.0.x + w < prev_frame.cols {
1535            subset_c.push(process_cand(
1536                prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + w],
1537            ));
1538        }
1539        // bottom
1540        if frame_bo.0.y + h < prev_frame.rows {
1541            subset_c.push(process_cand(
1542                prev_frame[frame_bo.0.y + h][frame_bo.0.x + clipped_half_w],
1543            ));
1544        }
1545
1546        subset_c.push(process_cand(
1547            prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + clipped_half_w],
1548        ));
1549    }
1550
1551    // Undo normalization to 128x128 block size
1552    let min_sad = ((min_sad as u64 * (pix_w * pix_h) as u64) >> (MAX_SB_SIZE_LOG2 * 2)) as u32;
1553
1554    let dec_mv = |mv: MotionVector| MotionVector {
1555        col: mv.col >> ssdec,
1556        row: mv.row >> ssdec,
1557    };
1558    let median = median.map(dec_mv);
1559    for mv in subset_b.iter_mut() {
1560        *mv = dec_mv(*mv);
1561    }
1562    for mv in subset_c.iter_mut() {
1563        *mv = dec_mv(*mv);
1564    }
1565
1566    MotionEstimationSubsets {
1567        min_sad,
1568        median,
1569        subset_b,
1570        subset_c,
1571    }
1572}
1573
1574#[allow(clippy::too_many_arguments)]
1575fn get_best_predictor<T: Pixel>(
1576    po: PlaneOffset,
1577    org_region: &PlaneRegion<T>,
1578    p_ref: &Plane<T>,
1579    predictors: &[MotionVector],
1580    bit_depth: usize,
1581    pmv: [MotionVector; 2],
1582    lambda: u32,
1583    mvx_min: isize,
1584    mvx_max: isize,
1585    mvy_min: isize,
1586    mvy_max: isize,
1587    w: usize,
1588    h: usize,
1589    cpu_feature_level: CpuFeatureLevel,
1590) -> MotionSearchResult {
1591    let mut best: MotionSearchResult = MotionSearchResult::empty();
1592
1593    for &init_mv in predictors.iter() {
1594        let rd = get_fullpel_mv_rd(
1595            po,
1596            org_region,
1597            p_ref,
1598            bit_depth,
1599            pmv,
1600            lambda,
1601            false,
1602            mvx_min,
1603            mvx_max,
1604            mvy_min,
1605            mvy_max,
1606            w,
1607            h,
1608            init_mv,
1609            cpu_feature_level,
1610        );
1611
1612        if rd.cost < best.rd.cost {
1613            best.mv = init_mv;
1614            best.rd = rd;
1615        }
1616    }
1617
1618    best
1619}
1620
1621#[allow(clippy::too_many_arguments)]
1622fn get_fullpel_mv_rd<T: Pixel>(
1623    po: PlaneOffset,
1624    org_region: &PlaneRegion<T>,
1625    p_ref: &Plane<T>,
1626    bit_depth: usize,
1627    pmv: [MotionVector; 2],
1628    lambda: u32,
1629    use_satd: bool,
1630    mvx_min: isize,
1631    mvx_max: isize,
1632    mvy_min: isize,
1633    mvy_max: isize,
1634    w: usize,
1635    h: usize,
1636    cand_mv: MotionVector,
1637    cpu_feature_level: CpuFeatureLevel,
1638) -> MVCandidateRD {
1639    if (cand_mv.col as isize) < mvx_min
1640        || (cand_mv.col as isize) > mvx_max
1641        || (cand_mv.row as isize) < mvy_min
1642        || (cand_mv.row as isize) > mvy_max
1643    {
1644        return MVCandidateRD::empty();
1645    }
1646
1647    // Convert the motion vector into an full pixel offset.
1648    let plane_ref = p_ref.region(Area::StartingAt {
1649        x: po.x + (cand_mv.col / 8) as isize,
1650        y: po.y + (cand_mv.row / 8) as isize,
1651    });
1652    compute_mv_rd(
1653        pmv,
1654        lambda,
1655        use_satd,
1656        bit_depth,
1657        w,
1658        h,
1659        cand_mv,
1660        org_region,
1661        &plane_ref,
1662        cpu_feature_level,
1663    )
1664}
1665
1666/// Perform hexagon search and refine afterwards.
1667///
1668/// In the hexagon search stage, candidate motion vectors are examined for
1669/// improvement to the current search location. The search location is moved to
1670/// the best candidate (if any). This is repeated until the search location
1671/// stops moving.
1672///
1673/// Refinement uses a square pattern that fits between the hexagon candidates.
1674///
1675/// The hexagon pattern is defined by [`HEXAGON_PATTERN`] and the refinement
1676/// is defined by [`SQUARE_REFINE_PATTERN`].
1677///
1678/// `current` provides the initial search location and serves as
1679/// the output for the final search results.
1680#[allow(clippy::too_many_arguments)]
1681fn hexagon_search<T: Pixel>(
1682    po: PlaneOffset,
1683    org_region: &PlaneRegion<T>,
1684    p_ref: &Plane<T>,
1685    current: &mut MotionSearchResult,
1686    bit_depth: usize,
1687    pmv: [MotionVector; 2],
1688    lambda: u32,
1689    mvx_min: isize,
1690    mvx_max: isize,
1691    mvy_min: isize,
1692    mvy_max: isize,
1693    w: usize,
1694    h: usize,
1695    cpu_feature_level: CpuFeatureLevel,
1696) {
1697    // The first iteration of hexagon search is implemented separate from
1698    // subsequent iterations, which overlap with previous iterations.
1699
1700    // Holds what candidate is used (if any). This is used to determine which
1701    // candidates have already been tested in a previous iteration and can be
1702    // skipped.
1703    let mut best_cand_idx: usize = 0;
1704    let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
1705
1706    // First iteration of hexagon search. There are six candidates to consider.
1707    for (i, &pattern_mv) in HEXAGON_PATTERN.iter().enumerate() {
1708        let cand_mv = current.mv + pattern_mv;
1709        let rd = get_fullpel_mv_rd(
1710            po,
1711            org_region,
1712            p_ref,
1713            bit_depth,
1714            pmv,
1715            lambda,
1716            false,
1717            mvx_min,
1718            mvx_max,
1719            mvy_min,
1720            mvy_max,
1721            w,
1722            h,
1723            cand_mv,
1724            cpu_feature_level,
1725        );
1726
1727        if rd.cost < best_cand.rd.cost {
1728            best_cand_idx = i;
1729            best_cand.mv = cand_mv;
1730            best_cand.rd = rd;
1731        }
1732    }
1733
1734    // Run additional iterations of hexagon search until the search location
1735    // doesn't update.
1736    while best_cand.rd.cost < current.rd.cost {
1737        // Update the search location.
1738        *current = best_cand;
1739        best_cand = MotionSearchResult::empty();
1740
1741        // Save the index/direction taken in the previous iteration to the current
1742        // search location.
1743        let center_cand_idx = best_cand_idx;
1744
1745        // Look only at candidates that don't overlap with previous iterations. This
1746        // corresponds with the three offsets (2D) with the closest direction to
1747        // that traveled by the previous iteration. HEXAGON_PATTERN has clockwise
1748        // order, so the last direction -1, +0, and +1 (mod 6) give the indices for
1749        // these offsets.
1750        for idx_offset_mod6 in 5..=7 {
1751            let i = (center_cand_idx + idx_offset_mod6) % 6;
1752            let cand_mv = current.mv + HEXAGON_PATTERN[i];
1753
1754            let rd = get_fullpel_mv_rd(
1755                po,
1756                org_region,
1757                p_ref,
1758                bit_depth,
1759                pmv,
1760                lambda,
1761                false,
1762                mvx_min,
1763                mvx_max,
1764                mvy_min,
1765                mvy_max,
1766                w,
1767                h,
1768                cand_mv,
1769                cpu_feature_level,
1770            );
1771
1772            if rd.cost < best_cand.rd.cost {
1773                best_cand_idx = i;
1774                best_cand.mv = cand_mv;
1775                best_cand.rd = rd;
1776            }
1777        }
1778    }
1779
1780    // Refine the motion after completing hexagon search.
1781    let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
1782    for &offset in &SQUARE_REFINE_PATTERN {
1783        let cand_mv = current.mv + offset;
1784        let rd = get_fullpel_mv_rd(
1785            po,
1786            org_region,
1787            p_ref,
1788            bit_depth,
1789            pmv,
1790            lambda,
1791            false,
1792            mvx_min,
1793            mvx_max,
1794            mvy_min,
1795            mvy_max,
1796            w,
1797            h,
1798            cand_mv,
1799            cpu_feature_level,
1800        );
1801
1802        if rd.cost < best_cand.rd.cost {
1803            best_cand.mv = cand_mv;
1804            best_cand.rd = rd;
1805        }
1806    }
1807    if best_cand.rd.cost < current.rd.cost {
1808        *current = best_cand;
1809    }
1810
1811    assert!(!current.is_empty());
1812}
1813
1814/// Run a full pixel diamond search. The search is run on multiple step sizes.
1815///
1816/// For each step size, candidate motion vectors are examined for improvement
1817/// to the current search location. The search location is moved to the best
1818/// candidate (if any). This is repeated until the search location stops moving.
1819#[allow(clippy::too_many_arguments)]
1820fn fullpel_diamond_search<T: Pixel>(
1821    po: PlaneOffset,
1822    org_region: &PlaneRegion<T>,
1823    p_ref: &Plane<T>,
1824    current: &mut MotionSearchResult,
1825    bit_depth: usize,
1826    pmv: [MotionVector; 2],
1827    lambda: u32,
1828    mvx_min: isize,
1829    mvx_max: isize,
1830    mvy_min: isize,
1831    mvy_max: isize,
1832    w: usize,
1833    h: usize,
1834    cpu_feature_level: CpuFeatureLevel,
1835) {
1836    // Define the initial and the final scale (log2) of the diamond.
1837    let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8);
1838
1839    loop {
1840        // Find the best candidate from the diamond pattern.
1841        let mut best_cand: MotionSearchResult = MotionSearchResult::empty();
1842        for &offset in &DIAMOND_R1_PATTERN {
1843            let cand_mv = current.mv + (offset << diamond_radius_log2);
1844            let rd = get_fullpel_mv_rd(
1845                po,
1846                org_region,
1847                p_ref,
1848                bit_depth,
1849                pmv,
1850                lambda,
1851                false,
1852                mvx_min,
1853                mvx_max,
1854                mvy_min,
1855                mvy_max,
1856                w,
1857                h,
1858                cand_mv,
1859                cpu_feature_level,
1860            );
1861
1862            if rd.cost < best_cand.rd.cost {
1863                best_cand.mv = cand_mv;
1864                best_cand.rd = rd;
1865            }
1866        }
1867
1868        // Continue the search at this scale until the can't find a better candidate
1869        // to use.
1870        if current.rd.cost <= best_cand.rd.cost {
1871            if diamond_radius_log2 == diamond_radius_end_log2 {
1872                break;
1873            } else {
1874                diamond_radius_log2 -= 1;
1875            }
1876        } else {
1877            *current = best_cand;
1878        }
1879    }
1880
1881    assert!(!current.is_empty());
1882}