perf(playback): optimize audio conversion with 16-bit dithering and bit shifts

roderickvd · roderickvd · commit f59766af7e15 · 2025-08-14T00:31:59.000+02:00
Since Spotify audio is always 16-bit depth, optimize the conversion pipeline:
- Always dither at 16-bit level regardless of output format
- Preserve fractional precision until final rounding for better requantization
- Derive scale factors from bit shifts (powers of two) instead of hard-coded float constants
- Add comprehensive inlining to eliminate function call overhead
- Specialize 24-bit clamping to remove runtime branching

This maintains proper dithering of the original 16-bit quantization artifacts
while maximizing performance through bit-shift operations and eliminating
unnecessary runtime calculations.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [metadata] Replaced `AudioFileFormat` with own enum. (breaking)
 - [playback] Changed trait `Mixer::open` to return `Result<Self, Error>` instead of `Self` (breaking)
 - [playback] Changed type alias `MixerFn` to return `Result<Arc<dyn Mixer>, Error>` instead of `Arc<dyn Mixer>` (breaking)
+- [playback] Optimized audio conversion to always dither at 16-bit level and use bit shifts for scaling
 
 ### Added
 
diff --git a/playback/src/convert.rs b/playback/src/convert.rs
@@ -35,81 +35,100 @@ impl Converter {
         }
     }
 
-    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
-    /// to 32-bit signed integer, multiply by 2147483648 (0x80000000) and
-    /// saturate at the bounds of `i32`.
-    const SCALE_S32: f64 = 2147483648.;
-
-    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
-    /// to 24-bit signed integer, multiply by 8388608 (0x800000) and saturate
-    /// at the bounds of `i24`.
-    const SCALE_S24: f64 = 8388608.;
-
-    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
-    /// to 16-bit signed integer, multiply by 32768 (0x8000) and saturate at
-    /// the bounds of `i16`. When the samples were encoded using the same
-    /// scaling factor, like the reference Vorbis encoder does, this makes
-    /// conversions transparent.
-    const SCALE_S16: f64 = 32768.;
-
-    pub fn scale(&mut self, sample: f64, factor: f64) -> f64 {
-        // From the many float to int conversion methods available, match what
-        // the reference Vorbis implementation uses: sample * 32768 (for 16 bit)
-
-        // Casting float to integer rounds towards zero by default, i.e. it
-        // truncates, and that generates larger error than rounding to nearest.
+    /// Base bit positions for PCM format scaling. These represent the position
+    /// of the most significant bit in each format's full-scale representation.
+    /// For signed integers in two's complement, full scale is 2^(bits-1).
+    const SHIFT_S16: u8 = 15; // 16-bit: 2^15 = 32768
+    const SHIFT_S24: u8 = 23; // 24-bit: 2^23 = 8388608  
+    const SHIFT_S32: u8 = 31; // 32-bit: 2^31 = 2147483648
+
+
+    /// Additional bit shifts needed to scale from 16-bit to higher bit depths.
+    /// These are the differences between the base shift amounts above.
+    const SHIFT_16_TO_24: u8 = Self::SHIFT_S24 - Self::SHIFT_S16; // 23 - 15 = 8
+    const SHIFT_16_TO_32: u8 = Self::SHIFT_S32 - Self::SHIFT_S16; // 31 - 15 = 16
+
+    /// Pre-calculated scale factor for 24-bit clamping bounds
+    const SCALE_S24: f64 = (1_u64 << Self::SHIFT_S24) as f64;
+
+    /// Scale audio samples with optimal dithering strategy for Spotify's 16-bit source material.
+    /// 
+    /// Since Spotify audio is always 16-bit depth, this function:
+    /// 1. When dithering: applies noise at 16-bit level, preserves fractional precision,
+    ///    then scales to target format and rounds once at the end
+    /// 2. When not dithering: scales directly from normalized float to target format
+    /// 
+    /// The `shift` parameter specifies how many extra bits to shift beyond
+    /// the base 16-bit scaling (0 for 16-bit, 8 for 24-bit, 16 for 32-bit).
+    #[inline]
+    pub fn scale(&mut self, sample: f64, shift: u8) -> f64 {
         match self.ditherer.as_mut() {
-            Some(d) => (sample * factor + d.noise()).round(),
-            None => (sample * factor).round(),
+            Some(d) => {
+                // With dithering: Apply noise at 16-bit level to address original quantization,
+                // then scale up to target format while preserving sub-LSB information
+                let dithered_16bit = sample * (1_u64 << Self::SHIFT_S16) as f64 + d.noise();
+                let scaled = dithered_16bit * (1_u64 << shift) as f64;
+                scaled.round()
+            }
+            None => {
+                // No dithering: Scale directly from normalized float to target format
+                // using a single bit shift operation (base 16-bit shift + additional shift)
+                let total_shift = Self::SHIFT_S16 + shift;
+                (sample * (1_u64 << total_shift) as f64).round()
+            }
         }
     }
 
-    // Special case for samples packed in a word of greater bit depth (e.g.
-    // S24): clamp between min and max to ensure that the most significant
-    // byte is zero. Otherwise, dithering may cause an overflow. This is not
-    // necessary for other formats, because casting to integer will saturate
-    // to the bounds of the primitive.
-    pub fn clamping_scale(&mut self, sample: f64, factor: f64) -> f64 {
-        let int_value = self.scale(sample, factor);
-
+    /// Clamping scale specifically for 24-bit output to prevent overflow.
+    /// Used for both S24 (packed in 32-bit words) and S24_3 (3-byte samples).
+    /// Clamps to the signed 24-bit range so that dithering cannot overflow it.
+    #[inline]
+    pub fn clamping_scale_s24(&mut self, sample: f64) -> f64 {
+        let int_value = self.scale(sample, Self::SHIFT_16_TO_24);
+        
         // In two's complement, there are more negative than positive values.
-        let min = -factor;
-        let max = factor - 1.0;
-
+        let min = -Self::SCALE_S24;
+        let max = Self::SCALE_S24 - 1.0;
+        
         int_value.clamp(min, max)
     }
 
+    #[inline]
     pub fn f64_to_f32(&mut self, samples: &[f64]) -> Vec<f32> {
         samples.iter().map(|sample| *sample as f32).collect()
     }
 
+    #[inline]
     pub fn f64_to_s32(&mut self, samples: &[f64]) -> Vec<i32> {
         samples
             .iter()
-            .map(|sample| self.scale(*sample, Self::SCALE_S32) as i32)
+            .map(|sample| self.scale(*sample, Self::SHIFT_16_TO_32) as i32)
             .collect()
     }
 
-    // S24 is 24-bit PCM packed in an upper 32-bit word
+    /// S24 is 24-bit PCM packed in an upper 32-bit word
+    #[inline]
     pub fn f64_to_s24(&mut self, samples: &[f64]) -> Vec<i32> {
         samples
             .iter()
-            .map(|sample| self.clamping_scale(*sample, Self::SCALE_S24) as i32)
+            .map(|sample| self.clamping_scale_s24(*sample) as i32)
             .collect()
     }
 
-    // S24_3 is 24-bit PCM in a 3-byte array
+    /// S24_3 is 24-bit PCM in a 3-byte array
+    #[inline]
     pub fn f64_to_s24_3(&mut self, samples: &[f64]) -> Vec<i24> {
         samples
             .iter()
-            .map(|sample| i24::from_s24(self.clamping_scale(*sample, Self::SCALE_S24) as i32))
+            .map(|sample| i24::from_s24(self.clamping_scale_s24(*sample) as i32))
             .collect()
     }
 
+    #[inline]
     pub fn f64_to_s16(&mut self, samples: &[f64]) -> Vec<i16> {
         samples
             .iter()
-            .map(|sample| self.scale(*sample, Self::SCALE_S16) as i16)
+            .map(|sample| self.scale(*sample, 0) as i16)
             .collect()
     }
 }
diff --git a/playback/src/dither.rs b/playback/src/dither.rs
@@ -64,6 +64,7 @@ impl Ditherer for TriangularDitherer {
         Self::NAME
     }
 
+    #[inline]
     fn noise(&mut self) -> f64 {
         self.distribution.sample(&mut self.cached_rng)
     }
@@ -98,6 +99,7 @@ impl Ditherer for GaussianDitherer {
         Self::NAME
     }
 
+    #[inline]
     fn noise(&mut self) -> f64 {
         self.distribution.sample(&mut self.cached_rng)
     }
@@ -130,6 +132,7 @@ impl Ditherer for HighPassDitherer {
         Self::NAME
     }
 
+    #[inline]
     fn noise(&mut self) -> f64 {
         let new_noise = self.distribution.sample(&mut self.cached_rng);
         let high_passed_noise = new_noise - self.previous_noises[self.active_channel];

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ impl Ditherer for TriangularDitherer {`
`64`	`64`	`Self::NAME`
`65`	`65`	`}`
`66`	`66`
	`67`	`+ #[inline]`
`67`	`68`	`fn noise(&mut self) -> f64 {`
`68`	`69`	`self.distribution.sample(&mut self.cached_rng)`
`69`	`70`	`}`
`@@ -98,6 +99,7 @@ impl Ditherer for GaussianDitherer {`
`98`	`99`	`Self::NAME`
`99`	`100`	`}`
`100`	`101`
	`102`	`+ #[inline]`
`101`	`103`	`fn noise(&mut self) -> f64 {`
`102`	`104`	`self.distribution.sample(&mut self.cached_rng)`
`103`	`105`	`}`
`@@ -130,6 +132,7 @@ impl Ditherer for HighPassDitherer {`
`130`	`132`	`Self::NAME`
`131`	`133`	`}`
`132`	`134`
	`135`	`+ #[inline]`
`133`	`136`	`fn noise(&mut self) -> f64 {`
`134`	`137`	`let new_noise = self.distribution.sample(&mut self.cached_rng);`
`135`	`138`	`let high_passed_noise = new_noise - self.previous_noises[self.active_channel];`