Optimize processing of U matrix

Optimize Phases 2-5 to avoid writes to the first i columns. Additionally, use pre-computed ops from first phase to implement third & fifth phases This improves encoding performance by ~15%, especially on large symbol counts
2024-06-28 09:41:41 +00:00 · 2020-11-27 15:52:19 -08:00 · 2020-11-27 15:52:19 -08:00 · 8b462f5c83
commit 8b462f5c83
parent ebe5404d14
4 changed files with 188 additions and 238 deletions
--- a/README.md
+++ b/README.md
@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz

 ```
 Symbol size: 1280 bytes (without pre-built plan)
-symbol count = 10, encoded 127 MB in 0.532secs, throughput: 1924.7Mbit/s
-symbol count = 100, encoded 127 MB in 0.590secs, throughput: 1734.6Mbit/s
-symbol count = 250, encoded 127 MB in 0.572secs, throughput: 1788.4Mbit/s
-symbol count = 500, encoded 127 MB in 0.549secs, throughput: 1858.8Mbit/s
-symbol count = 1000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s
-symbol count = 2000, encoded 126 MB in 0.673secs, throughput: 1509.1Mbit/s
-symbol count = 5000, encoded 122 MB in 0.758secs, throughput: 1288.3Mbit/s
-symbol count = 10000, encoded 122 MB in 0.953secs, throughput: 1024.7Mbit/s
-symbol count = 20000, encoded 122 MB in 1.383secs, throughput: 706.1Mbit/s
-symbol count = 50000, encoded 122 MB in 2.041secs, throughput: 478.5Mbit/s
+symbol count = 10, encoded 127 MB in 0.545secs, throughput: 1878.8Mbit/s
+symbol count = 100, encoded 127 MB in 0.645secs, throughput: 1586.7Mbit/s
+symbol count = 250, encoded 127 MB in 0.509secs, throughput: 2009.7Mbit/s
+symbol count = 500, encoded 127 MB in 0.503secs, throughput: 2028.8Mbit/s
+symbol count = 1000, encoded 126 MB in 0.544secs, throughput: 1867.0Mbit/s
+symbol count = 2000, encoded 126 MB in 0.628secs, throughput: 1617.2Mbit/s
+symbol count = 5000, encoded 122 MB in 0.686secs, throughput: 1423.6Mbit/s
+symbol count = 10000, encoded 122 MB in 0.833secs, throughput: 1172.3Mbit/s
+symbol count = 20000, encoded 122 MB in 1.234secs, throughput: 791.4Mbit/s
+symbol count = 50000, encoded 122 MB in 1.786secs, throughput: 546.8Mbit/s

 Symbol size: 1280 bytes (with pre-built plan)
-symbol count = 10, encoded 127 MB in 0.241secs, throughput: 4248.7Mbit/s
-symbol count = 100, encoded 127 MB in 0.160secs, throughput: 6396.5Mbit/s
-symbol count = 250, encoded 127 MB in 0.173secs, throughput: 5913.0Mbit/s
-symbol count = 500, encoded 127 MB in 0.176secs, throughput: 5798.3Mbit/s
-symbol count = 1000, encoded 126 MB in 0.200secs, throughput: 5078.1Mbit/s
-symbol count = 2000, encoded 126 MB in 0.208secs, throughput: 4882.8Mbit/s
-symbol count = 5000, encoded 122 MB in 0.280secs, throughput: 3487.7Mbit/s
-symbol count = 10000, encoded 122 MB in 0.400secs, throughput: 2441.4Mbit/s
-symbol count = 20000, encoded 122 MB in 0.494secs, throughput: 1976.8Mbit/s
-symbol count = 50000, encoded 122 MB in 0.656secs, throughput: 1488.7Mbit/s
+symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s
+symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s
+symbol count = 250, encoded 127 MB in 0.164secs, throughput: 6237.5Mbit/s
+symbol count = 500, encoded 127 MB in 0.169secs, throughput: 6038.5Mbit/s
+symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s
+symbol count = 2000, encoded 126 MB in 0.214secs, throughput: 4745.9Mbit/s
+symbol count = 5000, encoded 122 MB in 0.262secs, throughput: 3727.3Mbit/s
+symbol count = 10000, encoded 122 MB in 0.344secs, throughput: 2838.8Mbit/s
+symbol count = 20000, encoded 122 MB in 0.427secs, throughput: 2287.0Mbit/s
+symbol count = 50000, encoded 122 MB in 0.541secs, throughput: 1805.1Mbit/s

 Symbol size: 1280 bytes
-symbol count = 10, decoded 127 MB in 0.723secs using 0.0% overhead, throughput: 1416.2Mbit/s
-symbol count = 100, decoded 127 MB in 0.701secs using 0.0% overhead, throughput: 1460.0Mbit/s
-symbol count = 250, decoded 127 MB in 0.650secs using 0.0% overhead, throughput: 1573.8Mbit/s
-symbol count = 500, decoded 127 MB in 0.638secs using 0.0% overhead, throughput: 1599.5Mbit/s
-symbol count = 1000, decoded 126 MB in 0.676secs using 0.0% overhead, throughput: 1502.4Mbit/s
-symbol count = 2000, decoded 126 MB in 0.764secs using 0.0% overhead, throughput: 1329.4Mbit/s
-symbol count = 5000, decoded 122 MB in 0.896secs using 0.0% overhead, throughput: 1089.9Mbit/s
-symbol count = 10000, decoded 122 MB in 1.176secs using 0.0% overhead, throughput: 830.4Mbit/s
-symbol count = 20000, decoded 122 MB in 1.489secs using 0.0% overhead, throughput: 655.9Mbit/s
-symbol count = 50000, decoded 122 MB in 2.633secs using 0.0% overhead, throughput: 370.9Mbit/s
+symbol count = 10, decoded 127 MB in 0.749secs using 0.0% overhead, throughput: 1367.1Mbit/s
+symbol count = 100, decoded 127 MB in 0.742secs using 0.0% overhead, throughput: 1379.3Mbit/s
+symbol count = 250, decoded 127 MB in 0.589secs using 0.0% overhead, throughput: 1736.8Mbit/s
+symbol count = 500, decoded 127 MB in 0.594secs using 0.0% overhead, throughput: 1718.0Mbit/s
+symbol count = 1000, decoded 126 MB in 0.638secs using 0.0% overhead, throughput: 1591.9Mbit/s
+symbol count = 2000, decoded 126 MB in 0.718secs using 0.0% overhead, throughput: 1414.5Mbit/s
+symbol count = 5000, decoded 122 MB in 0.829secs using 0.0% overhead, throughput: 1178.0Mbit/s
+symbol count = 10000, decoded 122 MB in 1.049secs using 0.0% overhead, throughput: 930.9Mbit/s
+symbol count = 20000, decoded 122 MB in 1.382secs using 0.0% overhead, throughput: 706.6Mbit/s
+symbol count = 50000, decoded 122 MB in 2.355secs using 0.0% overhead, throughput: 414.7Mbit/s

-symbol count = 10, decoded 127 MB in 0.713secs using 5.0% overhead, throughput: 1436.1Mbit/s
-symbol count = 100, decoded 127 MB in 0.702secs using 5.0% overhead, throughput: 1457.9Mbit/s
-symbol count = 250, decoded 127 MB in 0.637secs using 5.0% overhead, throughput: 1605.9Mbit/s
-symbol count = 500, decoded 127 MB in 0.613secs using 5.0% overhead, throughput: 1664.8Mbit/s
-symbol count = 1000, decoded 126 MB in 0.643secs using 5.0% overhead, throughput: 1579.5Mbit/s
-symbol count = 2000, decoded 126 MB in 0.701secs using 5.0% overhead, throughput: 1448.8Mbit/s
-symbol count = 5000, decoded 122 MB in 0.826secs using 5.0% overhead, throughput: 1182.3Mbit/s
-symbol count = 10000, decoded 122 MB in 1.061secs using 5.0% overhead, throughput: 920.4Mbit/s
-symbol count = 20000, decoded 122 MB in 1.380secs using 5.0% overhead, throughput: 707.7Mbit/s
-symbol count = 50000, decoded 122 MB in 2.341secs using 5.0% overhead, throughput: 417.2Mbit/s
+symbol count = 10, decoded 127 MB in 0.740secs using 5.0% overhead, throughput: 1383.7Mbit/s
+symbol count = 100, decoded 127 MB in 0.747secs using 5.0% overhead, throughput: 1370.1Mbit/s
+symbol count = 250, decoded 127 MB in 0.581secs using 5.0% overhead, throughput: 1760.7Mbit/s
+symbol count = 500, decoded 127 MB in 0.565secs using 5.0% overhead, throughput: 1806.2Mbit/s
+symbol count = 1000, decoded 126 MB in 0.605secs using 5.0% overhead, throughput: 1678.7Mbit/s
+symbol count = 2000, decoded 126 MB in 0.656secs using 5.0% overhead, throughput: 1548.2Mbit/s
+symbol count = 5000, decoded 122 MB in 0.763secs using 5.0% overhead, throughput: 1279.9Mbit/s
+symbol count = 10000, decoded 122 MB in 0.959secs using 5.0% overhead, throughput: 1018.3Mbit/s
+symbol count = 20000, decoded 122 MB in 1.242secs using 5.0% overhead, throughput: 786.3Mbit/s
+symbol count = 50000, decoded 122 MB in 2.146secs using 5.0% overhead, throughput: 455.1Mbit/s
 ```

 ### Public API
--- a/src/matrix.rs
+++ b/src/matrix.rs
@ -43,11 +43,8 @@ pub trait BinaryMatrix: Clone {
    // Hints that column i will not be swapped again, and is likely to become dense'ish
    fn hint_column_dense_and_frozen(&mut self, i: usize);

-    // other must be a rows x rows matrix
-    // sets self[0..rows][..] = X * self[0..rows][..]
-    fn mul_assign_submatrix(&mut self, other: &Self, rows: usize);
-
-    fn add_assign_rows(&mut self, dest: usize, src: usize);
+    // If start_col is non-zero, values left of start_col in dest row are undefined after this operation
+    fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize);

    fn resize(&mut self, new_height: usize, new_width: usize);
 }
@ -214,29 +211,7 @@ impl BinaryMatrix for DenseBinaryMatrix {
        // No-op
    }

-    // other must be a rows x rows matrix
-    // sets self[0..rows][..] = X * self[0..rows][..]
-    fn mul_assign_submatrix(&mut self, other: &DenseBinaryMatrix, rows: usize) {
-        assert_eq!(rows, other.height());
-        assert_eq!(rows, other.width());
-        assert!(rows <= self.height());
-        let mut temp = vec![vec![0; DenseBinaryMatrix::bit_position(self.width).0 + 1]; rows];
-        #[allow(clippy::needless_range_loop)]
-        for row in 0..rows {
-            for i in 0..rows {
-                let scalar = other.get(row, i);
-                if scalar == Octet::zero() {
-                    continue;
-                }
-                add_assign_binary(&mut temp[row], &self.elements[i]);
-            }
-        }
-        for row in (0..rows).rev() {
-            self.elements[row] = temp.pop().unwrap();
-        }
-    }
-
-    fn add_assign_rows(&mut self, dest: usize, src: usize) {
+    fn add_assign_rows(&mut self, dest: usize, src: usize, _start_col: usize) {
        assert_ne!(dest, src);
        let (dest_row, temp_row) = get_both_indices(&mut self.elements, dest, src);
        add_assign_binary(dest_row, temp_row);
@ -263,22 +238,6 @@ mod tests {
    use crate::octet::Octet;
    use crate::sparse_matrix::SparseBinaryMatrix;

-    fn dense_identity(size: usize) -> DenseBinaryMatrix {
-        let mut result = DenseBinaryMatrix::new(size, size, 0);
-        for i in 0..size {
-            result.set(i, i, Octet::one());
-        }
-        result
-    }
-
-    fn sparse_identity(size: usize) -> SparseBinaryMatrix {
-        let mut result = SparseBinaryMatrix::new(size, size, 0);
-        for i in 0..size {
-            result.set(i, i, Octet::one());
-        }
-        result
-    }
-
    fn rand_dense_and_sparse(size: usize) -> (DenseBinaryMatrix, SparseBinaryMatrix) {
        let mut dense = DenseBinaryMatrix::new(size, size, 0);
        let mut sparse = SparseBinaryMatrix::new(size, size, 1);
@ -345,39 +304,16 @@ mod tests {
        assert_eq!(dense.count_ones(3, 1, 2), sparse.count_ones(3, 1, 2));
    }

-    #[test]
-    fn mul_assign_submatrix() {
-        // rand_dense_and_sparse uses set(), so just check that it works
-        let (mut dense, mut sparse) = rand_dense_and_sparse(8);
-        let original = dense.clone();
-
-        let identity = dense_identity(5);
-        dense.mul_assign_submatrix(&identity, 5);
-        assert_matrices_eq(&dense, &original);
-
-        let identity = dense_identity(8);
-        dense.mul_assign_submatrix(&identity, 8);
-        assert_matrices_eq(&dense, &original);
-
-        let identity = sparse_identity(5);
-        sparse.mul_assign_submatrix(&identity, 5);
-        assert_matrices_eq(&sparse, &original);
-
-        let identity = sparse_identity(8);
-        sparse.mul_assign_submatrix(&identity, 8);
-        assert_matrices_eq(&sparse, &original);
-    }
-
    #[test]
    fn fma_rows() {
        // rand_dense_and_sparse uses set(), so just check that it works
        let (mut dense, mut sparse) = rand_dense_and_sparse(8);
-        dense.add_assign_rows(0, 1);
-        dense.add_assign_rows(0, 2);
-        dense.add_assign_rows(2, 1);
-        sparse.add_assign_rows(0, 1);
-        sparse.add_assign_rows(0, 2);
-        sparse.add_assign_rows(2, 1);
+        dense.add_assign_rows(0, 1, 0);
+        dense.add_assign_rows(0, 2, 0);
+        dense.add_assign_rows(2, 1, 0);
+        sparse.add_assign_rows(0, 1, 0);
+        sparse.add_assign_rows(0, 2, 0);
+        sparse.add_assign_rows(2, 1, 0);
        assert_matrices_eq(&dense, &sparse);
    }

--- a/src/pi_solver.rs
+++ b/src/pi_solver.rs
@ -12,6 +12,12 @@ use crate::systematic_constants::num_pi_symbols;
 use crate::util::get_both_indices;
 use std::mem::size_of;

+#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
+enum RowOp {
+    AddAssign { src: usize, dest: usize },
+    Swap { row1: usize, row2: usize },
+}
+
 #[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
 struct FirstPhaseRowSelectionStats {
    original_degree: U16ArrayMap,
@ -509,9 +515,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
    }

    // First phase (section 5.4.2.2)
+    //
+    // Returns the row operations required to convert the X matrix into the identity
    #[allow(non_snake_case)]
    #[inline(never)]
-    fn first_phase(&mut self) -> bool {
+    fn first_phase(&mut self) -> Option<Vec<RowOp>> {
        // First phase (section 5.4.2.2)

        //    ----------> i                 u <--------
@ -533,6 +541,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
        let mut selection_helper =
            FirstPhaseRowSelectionStats::new(&self.A, self.A.width() - self.u);

+        // Record of first phase row operations performed on non-HDPC rows
+        let mut row_ops = vec![];
+
        while self.i + self.u < self.L {
            // Calculate r
            // "Let r be the minimum integer such that at least one row of A has
@ -545,7 +556,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
            );

            if r == None {
-                return false;
+                return None;
            }
            let r = r.unwrap();
            let chosen_row = chosen_row.unwrap();
@ -556,6 +567,10 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
            let temp = self.i;
            self.swap_rows(temp, chosen_row);
            self.X.swap_rows(temp, chosen_row);
+            row_ops.push(RowOp::Swap {
+                row1: temp,
+                row2: chosen_row,
+            });
            selection_helper.swap_rows(temp, chosen_row);
            // Reorder columns
            self.first_phase_swap_columns_substep(r);
@ -586,7 +601,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
                let row = row as usize;
                assert_eq!(&temp_value, &Octet::one());
                // Addition is equivalent to subtraction.
-                self.fma_rows(temp, row, Octet::one());
+                self.fma_rows(temp, row, Octet::one(), 0);
+                row_ops.push(RowOp::AddAssign {
+                    src: temp,
+                    dest: row,
+                });
                if r == 1 {
                    // Hot path for r == 1, since it's very common due to maximum connected
                    // component selection, and recompute_row() is expensive
@ -613,6 +632,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
                        // since all the rest were column swapped into the PI submatrix.
                        Some(temp),
                        Some(&pi_octets),
+                        0,
                    );
                    // It's safe to skip updating the selection helper, since it will never
                    // select an HDPC row
@ -626,7 +646,34 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
        }

        self.record_symbol_ops(0);
-        return true;
+
+        let mut mapping: Vec<usize> = (0..self.A.height()).collect();
+        let mut row_ops: Vec<RowOp> = row_ops
+            .iter()
+            .rev()
+            .map(|x| match x {
+                RowOp::AddAssign { src, dest } => {
+                    assert!(mapping[*src] < self.i);
+                    RowOp::AddAssign {
+                        src: mapping[*src],
+                        dest: mapping[*dest],
+                    }
+                }
+                RowOp::Swap { row1, row2 } => {
+                    mapping.swap(*row1, *row2);
+                    RowOp::Swap {
+                        row1: *row1,
+                        row2: *row2,
+                    }
+                }
+            })
+            .filter(|x| match x {
+                RowOp::AddAssign { src, dest } => *src < self.i && *dest < self.i,
+                RowOp::Swap { .. } => false,
+            })
+            .collect();
+        row_ops.reverse();
+        return Some(row_ops);
    }

    // See section 5.4.2.2. Verifies the two all-zeros submatrices and the identity submatrix
@ -649,9 +696,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
    // Second phase (section 5.4.2.3)
    #[allow(non_snake_case)]
    #[inline(never)]
-    fn second_phase(&mut self) -> bool {
+    fn second_phase(&mut self, #[allow(unused_variables)] x_elimination_ops: &[RowOp]) -> bool {
        #[cfg(debug_assertions)]
-        self.second_phase_verify();
+        self.second_phase_verify(x_elimination_ops);

        self.X.resize(self.i, self.i);

@ -676,54 +723,52 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
    // Verifies that X is lower triangular. See section 5.4.2.3
    #[inline(never)]
    #[cfg(debug_assertions)]
-    fn second_phase_verify(&self) {
+    fn second_phase_verify(&self, x_elimination_ops: &[RowOp]) {
        for row in 0..self.i {
            for col in (row + 1)..self.i {
                assert_eq!(Octet::zero(), self.X.get(row, col));
            }
        }
+
+        // Also verify Errata 9
+        let mut tempX = self.X.clone();
+        for op in x_elimination_ops {
+            match op {
+                RowOp::AddAssign { src, dest } => {
+                    tempX.add_assign_rows(*dest, *src, 0);
+                }
+                RowOp::Swap { .. } => unreachable!(),
+            }
+        }
+        for row in 0..self.i {
+            for col in 0..self.i {
+                if row == col {
+                    assert_eq!(Octet::one(), tempX.get(row, col));
+                } else {
+                    assert_eq!(Octet::zero(), tempX.get(row, col));
+                }
+            }
+        }
    }

    // Third phase (section 5.4.2.4)
    #[allow(non_snake_case)]
    #[inline(never)]
-    fn third_phase(&mut self) {
+    fn third_phase(&mut self, x_elimination_ops: &[RowOp]) {
        #[cfg(debug_assertions)]
        self.third_phase_verify();

-        // A[0..i][..] = X * A[0..i][..]
-        self.A.mul_assign_submatrix(&self.X, self.i);
-
-        // Now apply the same operations to D.
-        // Note that X is lower triangular, so the row must be processed last to first
-        for row in (0..self.i).rev() {
-            if self.X.get(row, row) != Octet::one() {
-                self.debug_symbol_mul_ops += 1;
-                self.deferred_D_ops.push(SymbolOps::MulAssign {
-                    dest: self.d[row],
-                    scalar: self.X.get(row, row),
-                });
-            }
-
-            for (col, value) in self.X.get_row_iter(row, 0, row) {
-                if value == Octet::zero() {
-                    continue;
-                }
-                if value == Octet::one() {
-                    self.debug_symbol_add_ops += 1;
-                    self.deferred_D_ops.push(SymbolOps::AddAssign {
-                        dest: self.d[row],
-                        src: self.d[col],
-                    });
-                } else {
-                    self.debug_symbol_mul_ops += 1;
-                    self.debug_symbol_add_ops += 1;
-                    self.deferred_D_ops.push(SymbolOps::FMA {
-                        dest: self.d[row],
-                        src: self.d[col],
-                        scalar: self.X.get(row, col),
-                    });
+        // Perform A[0..i][..] = X * A[0..i][..] by applying Errata 10
+        for op in x_elimination_ops.iter().rev() {
+            match op {
+                RowOp::AddAssign { src, dest } => {
+                    #[cfg(debug_assertions)]
+                    self.fma_rows(*src, *dest, Octet::one(), 0);
+                    #[cfg(not(debug_assertions))]
+                    // Skip applying to cols before i due to Errata 11
+                    self.fma_rows(*src, *dest, Octet::one(), self.i);
                }
+                RowOp::Swap { .. } => unreachable!(),
            }
        }

@ -771,7 +816,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
                let b = self.A.get(i, j + self.i);
                if b != Octet::zero() {
                    let temp = self.i;
-                    self.fma_rows(temp + j, i, b);
+                    #[cfg(debug_assertions)]
+                    self.fma_rows(temp + j, i, b, 0);
+                    // Skip applying to cols before i due to Errata 11
+                    #[cfg(not(debug_assertions))]
+                    self.fma_rows(temp + j, i, b, self.i);
                }
            }
        }
@ -814,20 +863,18 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
    // Fifth phase (section 5.4.2.6)
    #[allow(non_snake_case)]
    #[inline(never)]
-    fn fifth_phase(&mut self) {
-        // "For j from 1 to i". Note that A is 1-indexed in the spec, and ranges are inclusive,
-        // this is means [1, i], which is equal to [0, i)
-        for j in 0..self.i as usize {
-            // Skip normalizing the diagonal, since there can't be non-binary values due to
-            // Errata 7
-
-            // "For l from 1 to j-1". This means the lower triangular columns, not including the
-            // diagonal, which is [0, j)
-            for (l, _) in self.A.get_row_iter(j, 0, j).clone() {
-                let temp = self.A.get(j, l);
-                if temp != Octet::zero() {
-                    self.fma_rows(l, j, temp);
+    fn fifth_phase(&mut self, x_elimination_ops: &[RowOp]) {
+        // Use the saved operations from first phase: Errata 9
+        for op in x_elimination_ops {
+            match op {
+                RowOp::AddAssign { src, dest } => {
+                    #[cfg(debug_assertions)]
+                    self.fma_rows(*src, *dest, Octet::one(), 0);
+                    // In release builds skip updating the A matrix, since it will never be read
+                    #[cfg(not(debug_assertions))]
+                    self.record_fma_rows(*src, *dest, Octet::one());
                }
+                RowOp::Swap { .. } => unreachable!(),
            }
        }

@ -1011,8 +1058,8 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
        assert!(self.A_hdpc_rows.is_none());
    }

-    fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
-        self.fma_rows_with_pi(i, iprime, beta, None, None);
+    fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet, start_col: usize) {
+        self.fma_rows_with_pi(i, iprime, beta, None, None, start_col);
    }

    fn record_fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
@ -1040,6 +1087,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
        beta: Octet,
        only_non_pi_nonzero_column: Option<usize>,
        pi_octets: Option<&Vec<u8>>,
+        start_col: usize,
    ) {
        self.record_fma_rows(i, iprime, beta.clone());

@ -1064,11 +1112,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
                );
            } else {
                assert_eq!(&beta, &Octet::one());
-                self.A.add_assign_rows(iprime, i);
+                self.A.add_assign_rows(iprime, i, start_col);
            }
        } else {
            assert_eq!(&beta, &Octet::one());
-            self.A.add_assign_rows(iprime, i);
+            self.A.add_assign_rows(iprime, i, start_col);
        }
    }

@ -1095,20 +1143,20 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
    pub fn execute(&mut self) -> (Option<Vec<Symbol>>, Option<Vec<SymbolOps>>) {
        self.X.disable_column_acccess_acceleration();

-        if !self.first_phase() {
+        if let Some(x_elimination_ops) = self.first_phase() {
+            self.A.disable_column_acccess_acceleration();
+
+            if !self.second_phase(&x_elimination_ops) {
+                return (None, None);
+            }
+
+            self.third_phase(&x_elimination_ops);
+            self.fourth_phase();
+            self.fifth_phase(&x_elimination_ops);
+        } else {
            return (None, None);
        }

-        self.A.disable_column_acccess_acceleration();
-
-        if !self.second_phase() {
-            return (None, None);
-        }
-
-        self.third_phase();
-        self.fourth_phase();
-        self.fifth_phase();
-
        self.apply_deferred_symbol_ops();

        // See end of section 5.4.2.1
--- a/src/sparse_matrix.rs
+++ b/src/sparse_matrix.rs
@ -290,48 +290,12 @@ impl BinaryMatrix for SparseBinaryMatrix {
        }
    }

-    // other must be a rows x rows matrix
-    // sets self[0..rows][..] = X * self[0..rows][..]
-    fn mul_assign_submatrix(&mut self, other: &SparseBinaryMatrix, rows: usize) {
-        assert_eq!(rows, other.height());
-        assert_eq!(rows, other.width());
-        assert!(rows <= self.height());
-        assert!(self.column_index_disabled);
-        if other.num_dense_columns != 0 {
-            unimplemented!();
-        }
-        // Note: rows are logically indexed
-        let mut temp_sparse = vec![SparseBinaryVec::with_capacity(10); rows];
-        let mut temp_dense = vec![0; rows * ((self.num_dense_columns - 1) / WORD_WIDTH + 1)];
-        for row in 0..rows {
-            for (i, scalar) in other.get_row_iter(row, 0, rows) {
-                let physical_i = self.logical_row_to_physical[i] as usize;
-                if scalar != Octet::zero() {
-                    temp_sparse[row].add_assign(&self.sparse_elements[physical_i]);
-                    let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
-                    for word in 0..words {
-                        let (src_word, _) = self.bit_position(physical_i, word * WORD_WIDTH);
-                        temp_dense[word * rows + row] ^= self.dense_elements[src_word];
-                    }
-                }
-            }
-        }
-        for row in (0..rows).rev() {
-            let physical_row = self.logical_row_to_physical[row] as usize;
-            self.sparse_elements[physical_row] = temp_sparse.pop().unwrap();
-            let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
-            for word in 0..words {
-                let (dest_word, _) = self.bit_position(physical_row, word * WORD_WIDTH);
-                self.dense_elements[dest_word] = temp_dense[word * rows + row];
-            }
-        }
-
-        #[cfg(debug_assertions)]
-        self.verify();
-    }
-
-    fn add_assign_rows(&mut self, dest: usize, src: usize) {
+    fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize) {
        assert_ne!(dest, src);
+        assert!(
+            start_col == 0 || start_col == self.width - self.num_dense_columns,
+            "start_col must be zero or at the beginning of the U matrix"
+        );
        let physical_dest = self.logical_row_to_physical[dest] as usize;
        let physical_src = self.logical_row_to_physical[src] as usize;
        // First handle the dense columns
@ -344,23 +308,25 @@ impl BinaryMatrix for SparseBinaryMatrix {
            }
        }

-        // Then the sparse columns
-        let (dest_row, temp_row) =
-            get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
-        // This shouldn't be needed, because while column indexing is enabled in first phase,
-        // columns are only eliminated one at a time in sparse section of matrix.
-        assert!(self.column_index_disabled || temp_row.len() == 1);
+        if start_col == 0 {
+            // Then the sparse columns
+            let (dest_row, temp_row) =
+                get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
+            // This shouldn't be needed, because while column indexing is enabled in first phase,
+            // columns are only eliminated one at a time in sparse section of matrix.
+            assert!(self.column_index_disabled || temp_row.len() == 1);

-        let column_added = dest_row.add_assign(temp_row);
-        // This shouldn't be needed, because while column indexing is enabled in first phase,
-        // columns are only removed.
-        assert!(self.column_index_disabled || !column_added);
+            let column_added = dest_row.add_assign(temp_row);
+            // This shouldn't be needed, because while column indexing is enabled in first phase,
+            // columns are only removed.
+            assert!(self.column_index_disabled || !column_added);

-        #[cfg(debug_assertions)]
-        {
-            if !self.column_index_disabled {
-                let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
-                self.debug_indexed_column_valid[col as usize] = false;
+            #[cfg(debug_assertions)]
+            {
+                if !self.column_index_disabled {
+                    let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
+                    self.debug_indexed_column_valid[col as usize] = false;
+                }
            }
        }