// rav1e/src/quantize/mod.rs

1// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10#![allow(non_upper_case_globals)]
11
12mod tables;
13
14cfg_if::cfg_if! {
15  if #[cfg(nasm_x86_64)] {
16    pub use crate::asm::x86::quantize::*;
17  } else {
18    pub use self::rust::*;
19  }
20}
21
22pub use tables::*;
23
24use crate::scan_order::av1_scan_orders;
25use crate::transform::{TxSize, TxType};
26use crate::util::*;
27use std::mem;
28use std::num::{NonZeroU16, NonZeroU32, NonZeroU64};
29
30pub fn get_log_tx_scale(tx_size: TxSize) -> usize {
31  let num_pixels = tx_size.area();
32
33  Into::<usize>::into(num_pixels > 256)
34    + Into::<usize>::into(num_pixels > 1024)
35}
36
37pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
38  let dc_q: [&[NonZeroU16; 256]; 3] =
39    [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3];
40  let bd = ((bit_depth ^ 8) >> 1).min(2);
41  dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
42}
43
44pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 {
45  let ac_q: [&[NonZeroU16; 256]; 3] =
46    [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3];
47  let bd = ((bit_depth ^ 8) >> 1).min(2);
48  ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)]
49}
50
51// TODO: Handle lossless properly.
52fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 {
53  if quantizer < qlookup[MINQ].get() as i64 {
54    MINQ as u8
55  } else if quantizer >= qlookup[MAXQ].get() as i64 {
56    MAXQ as u8
57  } else {
58    match qlookup
59      .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero"))
60    {
61      Ok(qi) => qi as u8,
62      Err(qi) => {
63        debug_assert!(qi > MINQ);
64        debug_assert!(qi <= MAXQ);
65        // Pick the closest quantizer in the log domain.
66        let qthresh =
67          (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32);
68        let q2_i32 = (quantizer as i32) * (quantizer as i32);
69        if q2_i32 < qthresh {
70          (qi - 1) as u8
71        } else {
72          qi as u8
73        }
74      }
75    }
76  }
77}
78
79pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 {
80  let qlookup = match bit_depth {
81    8 => &dc_qlookup_Q3,
82    10 => &dc_qlookup_10_Q3,
83    12 => &dc_qlookup_12_Q3,
84    _ => unimplemented!(),
85  };
86  select_qi(quantizer, qlookup)
87}
88
89pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 {
90  let qlookup = match bit_depth {
91    8 => &ac_qlookup_Q3,
92    10 => &ac_qlookup_10_Q3,
93    12 => &ac_qlookup_12_Q3,
94    _ => unimplemented!(),
95  };
96  select_qi(quantizer, qlookup)
97}
98
/// Precomputed per-block quantization state: quantizer step sizes, the
/// multiply-add-shift constants that emulate dividing by them, and the
/// rounding (deadzone) biases, split between the DC and AC coefficients.
#[derive(Debug, Clone, Copy)]
pub struct QuantizationContext {
  // log2 of the extra scaling applied to coefficients of large transforms.
  log_tx_scale: usize,
  // DC quantizer step size (Q3).
  dc_quant: NonZeroU16,
  // Rounding bias added before dividing the DC coefficient.
  dc_offset: u32,
  // Multiply-add-shift triple emulating division by `dc_quant`.
  dc_mul_add: (u32, u32, u32),

  // AC quantizer step size (Q3).
  ac_quant: NonZeroU16,
  // Rounding bias used when choosing the end-of-block position.
  ac_offset_eob: u32,
  // Rounding bias for AC coefficients in the small-level regime.
  ac_offset0: u32,
  // Rounding bias for AC coefficients in the large-level regime.
  ac_offset1: u32,
  // Multiply-add-shift triple emulating division by `ac_quant`.
  ac_mul_add: (u32, u32, u32),
}
112
113impl Default for QuantizationContext {
114  fn default() -> Self {
115    QuantizationContext {
116      dc_quant: NonZeroU16::new(1).expect("Not zero"),
117      ac_quant: NonZeroU16::new(1).expect("Not zero"),
118      log_tx_scale: Default::default(),
119      dc_offset: Default::default(),
120      dc_mul_add: Default::default(),
121      ac_offset_eob: Default::default(),
122      ac_offset0: Default::default(),
123      ac_offset1: Default::default(),
124      ac_mul_add: Default::default(),
125    }
126  }
127}
128
/// Precompute a multiply-add-shift triple `(a, b, shift)` such that
/// `divu_pair(x, (a, b, shift)) == x / d` for every `u32` value of `x`.
///
/// This is the Granlund–Montgomery trick of replacing division by an
/// invariant integer with a widening multiply followed by shifts.
fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) {
  let nbits = (mem::size_of::<u32>() * 8) as u64;
  // Floor of log2(d), which is also the final shift amount.
  let log = nbits - u64::from(d.leading_zeros()) - 1;
  if d.is_power_of_two() {
    // (0xFFFF_FFFF * x + 0xFFFF_FFFF) >> 32 == x for any u32 x, so a
    // power-of-two divisor reduces to a plain right shift by log2(d).
    return (0xFFFF_FFFF, 0xFFFF_FFFF, log as u32);
  }
  let divisor = u64::from(NonZeroU64::from(d).get());
  // Truncated fixed-point reciprocal of the divisor.
  let factor = (1u64 << (log + nbits)) / divisor;
  // Residual error decides between the rounded-up multiplier with no
  // addend and the truncated multiplier paired with itself as addend.
  let residual = (factor * divisor + divisor) & ((1u64 << nbits) - 1);
  if residual <= 1u64 << log {
    (factor as u32 + 1, 0, log as u32)
  } else {
    (factor as u32, factor as u32, log as u32)
  }
}
147
/// Divide `x` by the constant encoded in `d` (as produced by `divu_gen`)
/// using a widening multiply-add and shifts; no division instruction runs.
#[inline]
const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 {
  let wide = (d.0 as u64) * (x as u64) + (d.1 as u64);
  ((wide >> 32) >> d.2) as u32
}
158
/// Give the magnitude `value` the sign of `signed`, returned as `i32`.
#[inline]
const fn copysign(value: u32, signed: i32) -> i32 {
  match signed >= 0 {
    true => value as i32,
    false => -(value as i32),
  }
}
167
#[cfg(test)]
mod test {
  use super::*;
  use crate::transform::TxSize::*;

  // Exhaustively checks the multiply-add-shift pairs against real integer
  // division over a range of divisors and dividends.
  #[test]
  fn test_divu_pair() {
    for d in 1..1024 {
      for x in 0..1000 {
        let ab = divu_gen(NonZeroU32::new(d).unwrap());
        assert_eq!(x / d, divu_pair(x, ab));
      }
    }
  }
  // Not an assertion: prints the generated division-constant table for the
  // DC lookup values so it can be pasted elsewhere if needed.
  #[test]
  fn gen_divu_table() {
    let b: Vec<(u32, u32, u32)> =
      dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect();

    println!("{:?}", b);
  }
  // Pins the expected log scale for every supported transform size.
  #[test]
  fn test_tx_log_scale() {
    let tx_sizes = [
      (TX_4X4, 0),
      (TX_8X8, 0),
      (TX_16X16, 0),
      (TX_32X32, 1),
      (TX_64X64, 2),
      (TX_4X8, 0),
      (TX_8X4, 0),
      (TX_8X16, 0),
      (TX_16X8, 0),
      (TX_16X32, 1),
      (TX_32X16, 1),
      (TX_32X64, 2),
      (TX_64X32, 2),
      (TX_4X16, 0),
      (TX_16X4, 0),
      (TX_8X32, 0),
      (TX_32X8, 0),
      (TX_16X64, 1),
      (TX_64X16, 1),
    ];
    for &tx_size in tx_sizes.iter() {
      assert!(tx_size.1 == get_log_tx_scale(tx_size.0));
    }
  }
}
217
impl QuantizationContext {
  /// Populate the context for one block: select the DC/AC quantizers for
  /// `qindex` plus the per-plane deltas, precompute their division
  /// constants, and derive the rounding (deadzone) biases.
  pub fn update(
    &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize,
    dc_delta_q: i8, ac_delta_q: i8,
  ) {
    self.log_tx_scale = get_log_tx_scale(tx_size);

    // Precompute multiply-add-shift constants so `quantize` can divide by
    // the quantizer without an actual division instruction.
    self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth);
    self.dc_mul_add = divu_gen(self.dc_quant.into());

    self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth);
    self.ac_mul_add = divu_gen(self.ac_quant.into());

    // All of these biases were derived by measuring the cost of coding
    // a zero vs coding a one on any given coefficient position, or, in
    // the case of the EOB bias, the cost of coding the block with
    // the chosen EOB (rounding to one) vs rounding to zero and continuing
    // to choose a new EOB. This was done over several clips, with the
    // average of the bit costs taken over all blocks in the set, and a new
    // bias derived via the method outlined in Jean-Marc Valin's
    // Journal of Dubious Theoretical Results[1], aka:
    //
    // lambda = ln(2) / 6.0
    // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0
    // bias = 1 - threshold
    //
    // lambda is a constant since our offsets are already adjusted for the
    // quantizer.
    //
    // Biases were then updated, and cost collection was re-run, until
    // the calculated biases started to converge after 2-4 iterations.
    //
    // In theory, the rounding biases for inter should be somewhat smaller
    // than the biases for intra, but this turns out to only be the case
    // for EOB optimization, or at least, is covered by EOB optimization.
    // The RD-optimal rounding biases for the actual coefficients seem
    // to be quite close (+/- 1/256), for both inter and intra,
    // post-deadzoning.
    //
    // [1] https://jmvalin.ca/notes/theoretical_results.pdf
    self.dc_offset =
      self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset0 =
      self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256;
    self.ac_offset1 =
      self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256;
    self.ac_offset_eob =
      self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256;
  }

  /// Quantize `coeffs` into `qcoeffs` for the given transform size/type
  /// and return the end-of-block position: one past the last nonzero
  /// quantized coefficient in scan order, or 0 if the block is all zero.
  ///
  /// `qcoeffs` is assumed to be pre-zeroed; positions at or beyond the
  /// returned EOB are never written.
  #[inline]
  pub fn quantize<T: Coefficient>(
    &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType,
  ) -> u16 {
    let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan;
    let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan;

    // The DC coefficient is quantized with its own quantizer and offset.
    qcoeffs[0] = {
      let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();
      T::cast_from(copysign(
        divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add),
        coeff,
      ))
    };

    // Find the last non-zero coefficient using our smaller biases and
    // zero everything else.
    // This threshold is such that `abs(coeff) < deadzone` implies:
    // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0
    let deadzone = T::cast_from(
      (self.ac_quant.get() as usize - self.ac_offset_eob as usize)
        .align_power_of_two_and_shift(self.log_tx_scale),
    );
    let eob = {
      // Highest scan position whose coefficient clears the deadzone.
      let eob_minus_one = iscan
        .iter()
        .zip(coeffs)
        .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 })
        .max()
        .unwrap_or(0);
      // We skip the DC coefficient since it has its own quantizer index.
      if eob_minus_one > 0 {
        eob_minus_one + 1
      } else {
        u16::from(qcoeffs[0] != T::cast_from(0))
      }
    };

    // Here we use different rounding biases depending on whether we've
    // had recent coefficients that are larger than one, or less than
    // one. The reason for this is that a block usually has a chunk of
    // large coefficients and a tail of zeroes and ones, and the tradeoffs
    // for coding these two are different. In the tail of zeroes and ones,
    // you'll likely end up spending most bits just saying where that
    // coefficient is in the block, whereas in the chunk of larger
    // coefficients, most bits will be spent on coding its magnitude.
    // To that end, we want to bias more toward rounding to zero for
    // that tail of zeroes and ones than we do for the larger coefficients.
    let mut level_mode = 1;
    let ac_quant = self.ac_quant.get() as u32;
    for &pos in scan.iter().take(usize::from(eob)).skip(1) {
      let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale;
      let abs_coeff = coeff.unsigned_abs();

      // Quantized magnitude before any rounding bias is applied.
      let level0 = divu_pair(abs_coeff, self.ac_mul_add);
      // In the zeroes-and-ones tail (level_mode == 0) require level0 > 1
      // before switching to the larger offset, which biases more
      // coefficients toward rounding down to zero.
      let offset = if level0 > 1 - level_mode {
        self.ac_offset1
      } else {
        self.ac_offset0
      };

      // Round the level up by one if the biased coefficient reaches the
      // next quantizer step.
      let abs_qcoeff: u32 =
        level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32;
      if level_mode != 0 && abs_qcoeff == 0 {
        level_mode = 0;
      } else if abs_qcoeff > 1 {
        level_mode = 1;
      }

      qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff));
    }

    // Rather than zeroing the tail in scan order, assume that qcoeffs is
    // pre-filled with zeros.

    // Check the eob is correct
    debug_assert_eq!(
      usize::from(eob),
      scan
        .iter()
        .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0))
        .map(|n| n + 1)
        .unwrap_or(0)
    );

    eob
  }
}
357
358pub mod rust {
359  use super::*;
360  use crate::cpu_features::CpuFeatureLevel;
361  use std::mem::MaybeUninit;
362
363  pub fn dequantize<T: Coefficient>(
364    qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit<T>],
365    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
366    _cpu: CpuFeatureLevel,
367  ) {
368    let log_tx_scale = get_log_tx_scale(tx_size) as i32;
369    let offset = (1 << log_tx_scale) - 1;
370
371    let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32;
372    let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32;
373
374    for (i, (r, c)) in rcoeffs
375      .iter_mut()
376      .zip(coeffs.iter().map(|&c| i32::cast_from(c)))
377      .enumerate()
378    {
379      let quant = if i == 0 { dc_quant } else { ac_quant };
380      r.write(T::cast_from(
381        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
382      ));
383    }
384  }
385}