Optimize query_non_zero_columns()

This reduces the time spent in the fourth phase from ~6% of encoding
time to ~1%, according to perf, and improves overall throughput by 3-4%
on large symbol counts.
This commit is contained in:
Christopher Berner 2020-11-29 09:39:50 -08:00
parent c4d227fba1
commit 50301e1b5b
2 changed files with 67 additions and 52 deletions

@@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz
```
Symbol size: 1280 bytes (without pre-built plan)
symbol count = 10, encoded 127 MB in 0.548secs, throughput: 1868.5Mbit/s
symbol count = 100, encoded 127 MB in 0.637secs, throughput: 1606.7Mbit/s
symbol count = 250, encoded 127 MB in 0.507secs, throughput: 2017.7Mbit/s
symbol count = 500, encoded 127 MB in 0.488secs, throughput: 2091.2Mbit/s
symbol count = 1000, encoded 126 MB in 0.523secs, throughput: 1941.9Mbit/s
symbol count = 2000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s
symbol count = 5000, encoded 122 MB in 0.636secs, throughput: 1535.5Mbit/s
symbol count = 10000, encoded 122 MB in 0.769secs, throughput: 1269.9Mbit/s
symbol count = 20000, encoded 122 MB in 1.122secs, throughput: 870.4Mbit/s
symbol count = 50000, encoded 122 MB in 1.597secs, throughput: 611.5Mbit/s
symbol count = 10, encoded 127 MB in 0.574secs, throughput: 1783.8Mbit/s
symbol count = 100, encoded 127 MB in 0.653secs, throughput: 1567.3Mbit/s
symbol count = 250, encoded 127 MB in 0.497secs, throughput: 2058.2Mbit/s
symbol count = 500, encoded 127 MB in 0.477secs, throughput: 2139.4Mbit/s
symbol count = 1000, encoded 126 MB in 0.513secs, throughput: 1979.8Mbit/s
symbol count = 2000, encoded 126 MB in 0.590secs, throughput: 1721.4Mbit/s
symbol count = 5000, encoded 122 MB in 0.629secs, throughput: 1552.6Mbit/s
symbol count = 10000, encoded 122 MB in 0.736secs, throughput: 1326.9Mbit/s
symbol count = 20000, encoded 122 MB in 1.094secs, throughput: 892.7Mbit/s
symbol count = 50000, encoded 122 MB in 1.548secs, throughput: 630.9Mbit/s
Symbol size: 1280 bytes (with pre-built plan)
symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s
symbol count = 100, encoded 127 MB in 0.154secs, throughput: 6645.7Mbit/s
symbol count = 250, encoded 127 MB in 0.160secs, throughput: 6393.4Mbit/s
symbol count = 500, encoded 127 MB in 0.163secs, throughput: 6260.8Mbit/s
symbol count = 1000, encoded 126 MB in 0.173secs, throughput: 5870.7Mbit/s
symbol count = 2000, encoded 126 MB in 0.199secs, throughput: 5103.6Mbit/s
symbol count = 5000, encoded 122 MB in 0.255secs, throughput: 3829.7Mbit/s
symbol count = 10000, encoded 122 MB in 0.339secs, throughput: 2880.7Mbit/s
symbol count = 20000, encoded 122 MB in 0.425secs, throughput: 2297.8Mbit/s
symbol count = 50000, encoded 122 MB in 0.536secs, throughput: 1821.9Mbit/s
symbol count = 10, encoded 127 MB in 0.226secs, throughput: 4530.6Mbit/s
symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s
symbol count = 250, encoded 127 MB in 0.162secs, throughput: 6314.5Mbit/s
symbol count = 500, encoded 127 MB in 0.164secs, throughput: 6222.6Mbit/s
symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s
symbol count = 2000, encoded 126 MB in 0.204secs, throughput: 4978.6Mbit/s
symbol count = 5000, encoded 122 MB in 0.269secs, throughput: 3630.3Mbit/s
symbol count = 10000, encoded 122 MB in 0.348secs, throughput: 2806.2Mbit/s
symbol count = 20000, encoded 122 MB in 0.437secs, throughput: 2234.7Mbit/s
symbol count = 50000, encoded 122 MB in 0.549secs, throughput: 1778.8Mbit/s
Symbol size: 1280 bytes
symbol count = 10, decoded 127 MB in 0.758secs using 0.0% overhead, throughput: 1350.8Mbit/s
symbol count = 100, decoded 127 MB in 0.740secs using 0.0% overhead, throughput: 1383.0Mbit/s
symbol count = 250, decoded 127 MB in 0.577secs using 0.0% overhead, throughput: 1772.9Mbit/s
symbol count = 500, decoded 127 MB in 0.582secs using 0.0% overhead, throughput: 1753.4Mbit/s
symbol count = 1000, decoded 126 MB in 0.628secs using 0.0% overhead, throughput: 1617.2Mbit/s
symbol count = 2000, decoded 126 MB in 0.684secs using 0.0% overhead, throughput: 1484.8Mbit/s
symbol count = 5000, decoded 122 MB in 0.785secs using 0.0% overhead, throughput: 1244.0Mbit/s
symbol count = 10000, decoded 122 MB in 0.965secs using 0.0% overhead, throughput: 1012.0Mbit/s
symbol count = 20000, decoded 122 MB in 1.345secs using 0.0% overhead, throughput: 726.1Mbit/s
symbol count = 50000, decoded 122 MB in 2.101secs using 0.0% overhead, throughput: 464.8Mbit/s
symbol count = 10, decoded 127 MB in 0.759secs using 0.0% overhead, throughput: 1349.0Mbit/s
symbol count = 100, decoded 127 MB in 0.746secs using 0.0% overhead, throughput: 1371.9Mbit/s
symbol count = 250, decoded 127 MB in 0.569secs using 0.0% overhead, throughput: 1797.8Mbit/s
symbol count = 500, decoded 127 MB in 0.556secs using 0.0% overhead, throughput: 1835.4Mbit/s
symbol count = 1000, decoded 126 MB in 0.591secs using 0.0% overhead, throughput: 1718.5Mbit/s
symbol count = 2000, decoded 126 MB in 0.660secs using 0.0% overhead, throughput: 1538.8Mbit/s
symbol count = 5000, decoded 122 MB in 0.738secs using 0.0% overhead, throughput: 1323.3Mbit/s
symbol count = 10000, decoded 122 MB in 0.931secs using 0.0% overhead, throughput: 1048.9Mbit/s
symbol count = 20000, decoded 122 MB in 1.192secs using 0.0% overhead, throughput: 819.3Mbit/s
symbol count = 50000, decoded 122 MB in 2.050secs using 0.0% overhead, throughput: 476.4Mbit/s
symbol count = 10, decoded 127 MB in 0.753secs using 5.0% overhead, throughput: 1359.8Mbit/s
symbol count = 100, decoded 127 MB in 0.731secs using 5.0% overhead, throughput: 1400.1Mbit/s
symbol count = 250, decoded 127 MB in 0.575secs using 5.0% overhead, throughput: 1779.0Mbit/s
symbol count = 500, decoded 127 MB in 0.552secs using 5.0% overhead, throughput: 1848.7Mbit/s
symbol count = 1000, decoded 126 MB in 0.568secs using 5.0% overhead, throughput: 1788.1Mbit/s
symbol count = 2000, decoded 126 MB in 0.626secs using 5.0% overhead, throughput: 1622.4Mbit/s
symbol count = 5000, decoded 122 MB in 0.713secs using 5.0% overhead, throughput: 1369.7Mbit/s
symbol count = 10000, decoded 122 MB in 0.893secs using 5.0% overhead, throughput: 1093.6Mbit/s
symbol count = 20000, decoded 122 MB in 1.147secs using 5.0% overhead, throughput: 851.4Mbit/s
symbol count = 50000, decoded 122 MB in 1.943secs using 5.0% overhead, throughput: 502.6Mbit/s
symbol count = 10, decoded 127 MB in 0.747secs using 5.0% overhead, throughput: 1370.7Mbit/s
symbol count = 100, decoded 127 MB in 0.745secs using 5.0% overhead, throughput: 1373.7Mbit/s
symbol count = 250, decoded 127 MB in 0.562secs using 5.0% overhead, throughput: 1820.2Mbit/s
symbol count = 500, decoded 127 MB in 0.540secs using 5.0% overhead, throughput: 1889.8Mbit/s
symbol count = 1000, decoded 126 MB in 0.564secs using 5.0% overhead, throughput: 1800.8Mbit/s
symbol count = 2000, decoded 126 MB in 0.612secs using 5.0% overhead, throughput: 1659.5Mbit/s
symbol count = 5000, decoded 122 MB in 0.702secs using 5.0% overhead, throughput: 1391.1Mbit/s
symbol count = 10000, decoded 122 MB in 0.879secs using 5.0% overhead, throughput: 1111.0Mbit/s
symbol count = 20000, decoded 122 MB in 1.112secs using 5.0% overhead, throughput: 878.2Mbit/s
symbol count = 50000, decoded 122 MB in 1.909secs using 5.0% overhead, throughput: 511.6Mbit/s
```
### Public API

@@ -203,20 +203,35 @@ impl BinaryMatrix for SparseBinaryMatrix {
// The following implementation is equivalent to .filter(|x| self.get(row, x) != Octet::zero())
// but this implementation optimizes for sequential access and avoids all the
// extra bit index math
assert_eq!(start_col, self.width - self.num_dense_columns);
let mut result = vec![];
let physical_row = self.logical_row_to_physical[row] as usize;
let (mut word, mut bit) =
let (mut word, bit) =
self.bit_position(physical_row, self.logical_col_to_dense_col(start_col));
(start_col..self.width)
.filter(|_| {
let result = self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) != 0;
bit += 1;
if bit == WORD_WIDTH {
word += 1;
bit = 0;
}
result
})
.collect()
let mut col = start_col;
// Process the first word, which may not be entirely filled, due to left zero padding
// Because of the assert that start_col is always the first dense column, the first one
// must be the column we're looking for, so there's no need to zero out columns left of it.
let mut block = self.dense_elements[word];
while block.trailing_zeros() < WORD_WIDTH as u32 {
result.push(col + block.trailing_zeros() as usize - bit);
block &= !(SparseBinaryMatrix::select_mask(block.trailing_zeros() as usize));
}
col += WORD_WIDTH - bit;
word += 1;
while col < self.width() {
let mut block = self.dense_elements[word];
// process the whole word in one shot to improve efficiency
while block.trailing_zeros() < WORD_WIDTH as u32 {
result.push(col + block.trailing_zeros() as usize);
block &= !(SparseBinaryMatrix::select_mask(block.trailing_zeros() as usize));
}
col += WORD_WIDTH;
word += 1;
}
result
}
fn get(&self, i: usize, j: usize) -> Octet {