Optimize processing of U matrix

Optimize Phases 2-5 to avoid writes to the first i columns.
Additionally, use pre-computed ops from first phase to implement third &
fifth phases

This improves encoding performance by ~15%, especially on large symbol
counts
This commit is contained in:
Christopher Berner 2020-11-27 15:52:19 -08:00
parent ebe5404d14
commit 8b462f5c83
4 changed files with 188 additions and 238 deletions

@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz
```
Symbol size: 1280 bytes (without pre-built plan)
symbol count = 10, encoded 127 MB in 0.532secs, throughput: 1924.7Mbit/s
symbol count = 100, encoded 127 MB in 0.590secs, throughput: 1734.6Mbit/s
symbol count = 250, encoded 127 MB in 0.572secs, throughput: 1788.4Mbit/s
symbol count = 500, encoded 127 MB in 0.549secs, throughput: 1858.8Mbit/s
symbol count = 1000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s
symbol count = 2000, encoded 126 MB in 0.673secs, throughput: 1509.1Mbit/s
symbol count = 5000, encoded 122 MB in 0.758secs, throughput: 1288.3Mbit/s
symbol count = 10000, encoded 122 MB in 0.953secs, throughput: 1024.7Mbit/s
symbol count = 20000, encoded 122 MB in 1.383secs, throughput: 706.1Mbit/s
symbol count = 50000, encoded 122 MB in 2.041secs, throughput: 478.5Mbit/s
symbol count = 10, encoded 127 MB in 0.545secs, throughput: 1878.8Mbit/s
symbol count = 100, encoded 127 MB in 0.645secs, throughput: 1586.7Mbit/s
symbol count = 250, encoded 127 MB in 0.509secs, throughput: 2009.7Mbit/s
symbol count = 500, encoded 127 MB in 0.503secs, throughput: 2028.8Mbit/s
symbol count = 1000, encoded 126 MB in 0.544secs, throughput: 1867.0Mbit/s
symbol count = 2000, encoded 126 MB in 0.628secs, throughput: 1617.2Mbit/s
symbol count = 5000, encoded 122 MB in 0.686secs, throughput: 1423.6Mbit/s
symbol count = 10000, encoded 122 MB in 0.833secs, throughput: 1172.3Mbit/s
symbol count = 20000, encoded 122 MB in 1.234secs, throughput: 791.4Mbit/s
symbol count = 50000, encoded 122 MB in 1.786secs, throughput: 546.8Mbit/s
Symbol size: 1280 bytes (with pre-built plan)
symbol count = 10, encoded 127 MB in 0.241secs, throughput: 4248.7Mbit/s
symbol count = 100, encoded 127 MB in 0.160secs, throughput: 6396.5Mbit/s
symbol count = 250, encoded 127 MB in 0.173secs, throughput: 5913.0Mbit/s
symbol count = 500, encoded 127 MB in 0.176secs, throughput: 5798.3Mbit/s
symbol count = 1000, encoded 126 MB in 0.200secs, throughput: 5078.1Mbit/s
symbol count = 2000, encoded 126 MB in 0.208secs, throughput: 4882.8Mbit/s
symbol count = 5000, encoded 122 MB in 0.280secs, throughput: 3487.7Mbit/s
symbol count = 10000, encoded 122 MB in 0.400secs, throughput: 2441.4Mbit/s
symbol count = 20000, encoded 122 MB in 0.494secs, throughput: 1976.8Mbit/s
symbol count = 50000, encoded 122 MB in 0.656secs, throughput: 1488.7Mbit/s
symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s
symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s
symbol count = 250, encoded 127 MB in 0.164secs, throughput: 6237.5Mbit/s
symbol count = 500, encoded 127 MB in 0.169secs, throughput: 6038.5Mbit/s
symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s
symbol count = 2000, encoded 126 MB in 0.214secs, throughput: 4745.9Mbit/s
symbol count = 5000, encoded 122 MB in 0.262secs, throughput: 3727.3Mbit/s
symbol count = 10000, encoded 122 MB in 0.344secs, throughput: 2838.8Mbit/s
symbol count = 20000, encoded 122 MB in 0.427secs, throughput: 2287.0Mbit/s
symbol count = 50000, encoded 122 MB in 0.541secs, throughput: 1805.1Mbit/s
Symbol size: 1280 bytes
symbol count = 10, decoded 127 MB in 0.723secs using 0.0% overhead, throughput: 1416.2Mbit/s
symbol count = 100, decoded 127 MB in 0.701secs using 0.0% overhead, throughput: 1460.0Mbit/s
symbol count = 250, decoded 127 MB in 0.650secs using 0.0% overhead, throughput: 1573.8Mbit/s
symbol count = 500, decoded 127 MB in 0.638secs using 0.0% overhead, throughput: 1599.5Mbit/s
symbol count = 1000, decoded 126 MB in 0.676secs using 0.0% overhead, throughput: 1502.4Mbit/s
symbol count = 2000, decoded 126 MB in 0.764secs using 0.0% overhead, throughput: 1329.4Mbit/s
symbol count = 5000, decoded 122 MB in 0.896secs using 0.0% overhead, throughput: 1089.9Mbit/s
symbol count = 10000, decoded 122 MB in 1.176secs using 0.0% overhead, throughput: 830.4Mbit/s
symbol count = 20000, decoded 122 MB in 1.489secs using 0.0% overhead, throughput: 655.9Mbit/s
symbol count = 50000, decoded 122 MB in 2.633secs using 0.0% overhead, throughput: 370.9Mbit/s
symbol count = 10, decoded 127 MB in 0.749secs using 0.0% overhead, throughput: 1367.1Mbit/s
symbol count = 100, decoded 127 MB in 0.742secs using 0.0% overhead, throughput: 1379.3Mbit/s
symbol count = 250, decoded 127 MB in 0.589secs using 0.0% overhead, throughput: 1736.8Mbit/s
symbol count = 500, decoded 127 MB in 0.594secs using 0.0% overhead, throughput: 1718.0Mbit/s
symbol count = 1000, decoded 126 MB in 0.638secs using 0.0% overhead, throughput: 1591.9Mbit/s
symbol count = 2000, decoded 126 MB in 0.718secs using 0.0% overhead, throughput: 1414.5Mbit/s
symbol count = 5000, decoded 122 MB in 0.829secs using 0.0% overhead, throughput: 1178.0Mbit/s
symbol count = 10000, decoded 122 MB in 1.049secs using 0.0% overhead, throughput: 930.9Mbit/s
symbol count = 20000, decoded 122 MB in 1.382secs using 0.0% overhead, throughput: 706.6Mbit/s
symbol count = 50000, decoded 122 MB in 2.355secs using 0.0% overhead, throughput: 414.7Mbit/s
symbol count = 10, decoded 127 MB in 0.713secs using 5.0% overhead, throughput: 1436.1Mbit/s
symbol count = 100, decoded 127 MB in 0.702secs using 5.0% overhead, throughput: 1457.9Mbit/s
symbol count = 250, decoded 127 MB in 0.637secs using 5.0% overhead, throughput: 1605.9Mbit/s
symbol count = 500, decoded 127 MB in 0.613secs using 5.0% overhead, throughput: 1664.8Mbit/s
symbol count = 1000, decoded 126 MB in 0.643secs using 5.0% overhead, throughput: 1579.5Mbit/s
symbol count = 2000, decoded 126 MB in 0.701secs using 5.0% overhead, throughput: 1448.8Mbit/s
symbol count = 5000, decoded 122 MB in 0.826secs using 5.0% overhead, throughput: 1182.3Mbit/s
symbol count = 10000, decoded 122 MB in 1.061secs using 5.0% overhead, throughput: 920.4Mbit/s
symbol count = 20000, decoded 122 MB in 1.380secs using 5.0% overhead, throughput: 707.7Mbit/s
symbol count = 50000, decoded 122 MB in 2.341secs using 5.0% overhead, throughput: 417.2Mbit/s
symbol count = 10, decoded 127 MB in 0.740secs using 5.0% overhead, throughput: 1383.7Mbit/s
symbol count = 100, decoded 127 MB in 0.747secs using 5.0% overhead, throughput: 1370.1Mbit/s
symbol count = 250, decoded 127 MB in 0.581secs using 5.0% overhead, throughput: 1760.7Mbit/s
symbol count = 500, decoded 127 MB in 0.565secs using 5.0% overhead, throughput: 1806.2Mbit/s
symbol count = 1000, decoded 126 MB in 0.605secs using 5.0% overhead, throughput: 1678.7Mbit/s
symbol count = 2000, decoded 126 MB in 0.656secs using 5.0% overhead, throughput: 1548.2Mbit/s
symbol count = 5000, decoded 122 MB in 0.763secs using 5.0% overhead, throughput: 1279.9Mbit/s
symbol count = 10000, decoded 122 MB in 0.959secs using 5.0% overhead, throughput: 1018.3Mbit/s
symbol count = 20000, decoded 122 MB in 1.242secs using 5.0% overhead, throughput: 786.3Mbit/s
symbol count = 50000, decoded 122 MB in 2.146secs using 5.0% overhead, throughput: 455.1Mbit/s
```
### Public API

@ -43,11 +43,8 @@ pub trait BinaryMatrix: Clone {
// Hints that column i will not be swapped again, and is likely to become dense'ish
fn hint_column_dense_and_frozen(&mut self, i: usize);
// other must be a rows x rows matrix
// sets self[0..rows][..] = X * self[0..rows][..]
fn mul_assign_submatrix(&mut self, other: &Self, rows: usize);
fn add_assign_rows(&mut self, dest: usize, src: usize);
// If start_col is non-zero, values left of start_col in dest row are undefined after this operation
fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize);
fn resize(&mut self, new_height: usize, new_width: usize);
}
@ -214,29 +211,7 @@ impl BinaryMatrix for DenseBinaryMatrix {
// No-op
}
// other must be a rows x rows matrix
// sets self[0..rows][..] = X * self[0..rows][..]
fn mul_assign_submatrix(&mut self, other: &DenseBinaryMatrix, rows: usize) {
assert_eq!(rows, other.height());
assert_eq!(rows, other.width());
assert!(rows <= self.height());
let mut temp = vec![vec![0; DenseBinaryMatrix::bit_position(self.width).0 + 1]; rows];
#[allow(clippy::needless_range_loop)]
for row in 0..rows {
for i in 0..rows {
let scalar = other.get(row, i);
if scalar == Octet::zero() {
continue;
}
add_assign_binary(&mut temp[row], &self.elements[i]);
}
}
for row in (0..rows).rev() {
self.elements[row] = temp.pop().unwrap();
}
}
fn add_assign_rows(&mut self, dest: usize, src: usize) {
fn add_assign_rows(&mut self, dest: usize, src: usize, _start_col: usize) {
assert_ne!(dest, src);
let (dest_row, temp_row) = get_both_indices(&mut self.elements, dest, src);
add_assign_binary(dest_row, temp_row);
@ -263,22 +238,6 @@ mod tests {
use crate::octet::Octet;
use crate::sparse_matrix::SparseBinaryMatrix;
fn dense_identity(size: usize) -> DenseBinaryMatrix {
let mut result = DenseBinaryMatrix::new(size, size, 0);
for i in 0..size {
result.set(i, i, Octet::one());
}
result
}
fn sparse_identity(size: usize) -> SparseBinaryMatrix {
let mut result = SparseBinaryMatrix::new(size, size, 0);
for i in 0..size {
result.set(i, i, Octet::one());
}
result
}
fn rand_dense_and_sparse(size: usize) -> (DenseBinaryMatrix, SparseBinaryMatrix) {
let mut dense = DenseBinaryMatrix::new(size, size, 0);
let mut sparse = SparseBinaryMatrix::new(size, size, 1);
@ -345,39 +304,16 @@ mod tests {
assert_eq!(dense.count_ones(3, 1, 2), sparse.count_ones(3, 1, 2));
}
#[test]
fn mul_assign_submatrix() {
// rand_dense_and_sparse uses set(), so just check that it works
let (mut dense, mut sparse) = rand_dense_and_sparse(8);
let original = dense.clone();
let identity = dense_identity(5);
dense.mul_assign_submatrix(&identity, 5);
assert_matrices_eq(&dense, &original);
let identity = dense_identity(8);
dense.mul_assign_submatrix(&identity, 8);
assert_matrices_eq(&dense, &original);
let identity = sparse_identity(5);
sparse.mul_assign_submatrix(&identity, 5);
assert_matrices_eq(&sparse, &original);
let identity = sparse_identity(8);
sparse.mul_assign_submatrix(&identity, 8);
assert_matrices_eq(&sparse, &original);
}
#[test]
fn fma_rows() {
// rand_dense_and_sparse uses set(), so just check that it works
let (mut dense, mut sparse) = rand_dense_and_sparse(8);
dense.add_assign_rows(0, 1);
dense.add_assign_rows(0, 2);
dense.add_assign_rows(2, 1);
sparse.add_assign_rows(0, 1);
sparse.add_assign_rows(0, 2);
sparse.add_assign_rows(2, 1);
dense.add_assign_rows(0, 1, 0);
dense.add_assign_rows(0, 2, 0);
dense.add_assign_rows(2, 1, 0);
sparse.add_assign_rows(0, 1, 0);
sparse.add_assign_rows(0, 2, 0);
sparse.add_assign_rows(2, 1, 0);
assert_matrices_eq(&dense, &sparse);
}

@ -12,6 +12,12 @@ use crate::systematic_constants::num_pi_symbols;
use crate::util::get_both_indices;
use std::mem::size_of;
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
enum RowOp {
AddAssign { src: usize, dest: usize },
Swap { row1: usize, row2: usize },
}
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
struct FirstPhaseRowSelectionStats {
original_degree: U16ArrayMap,
@ -509,9 +515,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
}
// First phase (section 5.4.2.2)
//
// Returns the row operations required to convert the X matrix into the identity
#[allow(non_snake_case)]
#[inline(never)]
fn first_phase(&mut self) -> bool {
fn first_phase(&mut self) -> Option<Vec<RowOp>> {
// First phase (section 5.4.2.2)
// ----------> i u <--------
@ -533,6 +541,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
let mut selection_helper =
FirstPhaseRowSelectionStats::new(&self.A, self.A.width() - self.u);
// Record of first phase row operations performed on non-HDPC rows
let mut row_ops = vec![];
while self.i + self.u < self.L {
// Calculate r
// "Let r be the minimum integer such that at least one row of A has
@ -545,7 +556,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
);
if r == None {
return false;
return None;
}
let r = r.unwrap();
let chosen_row = chosen_row.unwrap();
@ -556,6 +567,10 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
let temp = self.i;
self.swap_rows(temp, chosen_row);
self.X.swap_rows(temp, chosen_row);
row_ops.push(RowOp::Swap {
row1: temp,
row2: chosen_row,
});
selection_helper.swap_rows(temp, chosen_row);
// Reorder columns
self.first_phase_swap_columns_substep(r);
@ -586,7 +601,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
let row = row as usize;
assert_eq!(&temp_value, &Octet::one());
// Addition is equivalent to subtraction.
self.fma_rows(temp, row, Octet::one());
self.fma_rows(temp, row, Octet::one(), 0);
row_ops.push(RowOp::AddAssign {
src: temp,
dest: row,
});
if r == 1 {
// Hot path for r == 1, since it's very common due to maximum connected
// component selection, and recompute_row() is expensive
@ -613,6 +632,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
// since all the rest were column swapped into the PI submatrix.
Some(temp),
Some(&pi_octets),
0,
);
// It's safe to skip updating the selection helper, since it will never
// select an HDPC row
@ -626,7 +646,34 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
}
self.record_symbol_ops(0);
return true;
let mut mapping: Vec<usize> = (0..self.A.height()).collect();
let mut row_ops: Vec<RowOp> = row_ops
.iter()
.rev()
.map(|x| match x {
RowOp::AddAssign { src, dest } => {
assert!(mapping[*src] < self.i);
RowOp::AddAssign {
src: mapping[*src],
dest: mapping[*dest],
}
}
RowOp::Swap { row1, row2 } => {
mapping.swap(*row1, *row2);
RowOp::Swap {
row1: *row1,
row2: *row2,
}
}
})
.filter(|x| match x {
RowOp::AddAssign { src, dest } => *src < self.i && *dest < self.i,
RowOp::Swap { .. } => false,
})
.collect();
row_ops.reverse();
return Some(row_ops);
}
// See section 5.4.2.2. Verifies the two all-zeros submatrices and the identity submatrix
@ -649,9 +696,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
// Second phase (section 5.4.2.3)
#[allow(non_snake_case)]
#[inline(never)]
fn second_phase(&mut self) -> bool {
fn second_phase(&mut self, #[allow(unused_variables)] x_elimination_ops: &[RowOp]) -> bool {
#[cfg(debug_assertions)]
self.second_phase_verify();
self.second_phase_verify(x_elimination_ops);
self.X.resize(self.i, self.i);
@ -676,54 +723,52 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
// Verifies that X is lower triangular. See section 5.4.2.3
#[inline(never)]
#[cfg(debug_assertions)]
fn second_phase_verify(&self) {
fn second_phase_verify(&self, x_elimination_ops: &[RowOp]) {
for row in 0..self.i {
for col in (row + 1)..self.i {
assert_eq!(Octet::zero(), self.X.get(row, col));
}
}
// Also verify Errata 9
let mut tempX = self.X.clone();
for op in x_elimination_ops {
match op {
RowOp::AddAssign { src, dest } => {
tempX.add_assign_rows(*dest, *src, 0);
}
RowOp::Swap { .. } => unreachable!(),
}
}
for row in 0..self.i {
for col in 0..self.i {
if row == col {
assert_eq!(Octet::one(), tempX.get(row, col));
} else {
assert_eq!(Octet::zero(), tempX.get(row, col));
}
}
}
}
// Third phase (section 5.4.2.4)
#[allow(non_snake_case)]
#[inline(never)]
fn third_phase(&mut self) {
fn third_phase(&mut self, x_elimination_ops: &[RowOp]) {
#[cfg(debug_assertions)]
self.third_phase_verify();
// A[0..i][..] = X * A[0..i][..]
self.A.mul_assign_submatrix(&self.X, self.i);
// Now apply the same operations to D.
// Note that X is lower triangular, so the row must be processed last to first
for row in (0..self.i).rev() {
if self.X.get(row, row) != Octet::one() {
self.debug_symbol_mul_ops += 1;
self.deferred_D_ops.push(SymbolOps::MulAssign {
dest: self.d[row],
scalar: self.X.get(row, row),
});
}
for (col, value) in self.X.get_row_iter(row, 0, row) {
if value == Octet::zero() {
continue;
}
if value == Octet::one() {
self.debug_symbol_add_ops += 1;
self.deferred_D_ops.push(SymbolOps::AddAssign {
dest: self.d[row],
src: self.d[col],
});
} else {
self.debug_symbol_mul_ops += 1;
self.debug_symbol_add_ops += 1;
self.deferred_D_ops.push(SymbolOps::FMA {
dest: self.d[row],
src: self.d[col],
scalar: self.X.get(row, col),
});
// Perform A[0..i][..] = X * A[0..i][..] by applying Errata 10
for op in x_elimination_ops.iter().rev() {
match op {
RowOp::AddAssign { src, dest } => {
#[cfg(debug_assertions)]
self.fma_rows(*src, *dest, Octet::one(), 0);
#[cfg(not(debug_assertions))]
// Skip applying to cols before i due to Errata 11
self.fma_rows(*src, *dest, Octet::one(), self.i);
}
RowOp::Swap { .. } => unreachable!(),
}
}
@ -771,7 +816,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
let b = self.A.get(i, j + self.i);
if b != Octet::zero() {
let temp = self.i;
self.fma_rows(temp + j, i, b);
#[cfg(debug_assertions)]
self.fma_rows(temp + j, i, b, 0);
// Skip applying to cols before i due to Errata 11
#[cfg(not(debug_assertions))]
self.fma_rows(temp + j, i, b, self.i);
}
}
}
@ -814,20 +863,18 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
// Fifth phase (section 5.4.2.6)
#[allow(non_snake_case)]
#[inline(never)]
fn fifth_phase(&mut self) {
// "For j from 1 to i". Note that A is 1-indexed in the spec, and ranges are inclusive,
// this is means [1, i], which is equal to [0, i)
for j in 0..self.i as usize {
// Skip normalizing the diagonal, since there can't be non-binary values due to
// Errata 7
// "For l from 1 to j-1". This means the lower triangular columns, not including the
// diagonal, which is [0, j)
for (l, _) in self.A.get_row_iter(j, 0, j).clone() {
let temp = self.A.get(j, l);
if temp != Octet::zero() {
self.fma_rows(l, j, temp);
fn fifth_phase(&mut self, x_elimination_ops: &[RowOp]) {
// Use the saved operations from first phase: Errata 9
for op in x_elimination_ops {
match op {
RowOp::AddAssign { src, dest } => {
#[cfg(debug_assertions)]
self.fma_rows(*src, *dest, Octet::one(), 0);
// In release builds skip updating the A matrix, since it will never be read
#[cfg(not(debug_assertions))]
self.record_fma_rows(*src, *dest, Octet::one());
}
RowOp::Swap { .. } => unreachable!(),
}
}
@ -1011,8 +1058,8 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
assert!(self.A_hdpc_rows.is_none());
}
fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
self.fma_rows_with_pi(i, iprime, beta, None, None);
fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet, start_col: usize) {
self.fma_rows_with_pi(i, iprime, beta, None, None, start_col);
}
fn record_fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
@ -1040,6 +1087,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
beta: Octet,
only_non_pi_nonzero_column: Option<usize>,
pi_octets: Option<&Vec<u8>>,
start_col: usize,
) {
self.record_fma_rows(i, iprime, beta.clone());
@ -1064,11 +1112,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
);
} else {
assert_eq!(&beta, &Octet::one());
self.A.add_assign_rows(iprime, i);
self.A.add_assign_rows(iprime, i, start_col);
}
} else {
assert_eq!(&beta, &Octet::one());
self.A.add_assign_rows(iprime, i);
self.A.add_assign_rows(iprime, i, start_col);
}
}
@ -1095,20 +1143,20 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
pub fn execute(&mut self) -> (Option<Vec<Symbol>>, Option<Vec<SymbolOps>>) {
self.X.disable_column_acccess_acceleration();
if !self.first_phase() {
if let Some(x_elimination_ops) = self.first_phase() {
self.A.disable_column_acccess_acceleration();
if !self.second_phase(&x_elimination_ops) {
return (None, None);
}
self.third_phase(&x_elimination_ops);
self.fourth_phase();
self.fifth_phase(&x_elimination_ops);
} else {
return (None, None);
}
self.A.disable_column_acccess_acceleration();
if !self.second_phase() {
return (None, None);
}
self.third_phase();
self.fourth_phase();
self.fifth_phase();
self.apply_deferred_symbol_ops();
// See end of section 5.4.2.1

@ -290,48 +290,12 @@ impl BinaryMatrix for SparseBinaryMatrix {
}
}
// other must be a rows x rows matrix
// sets self[0..rows][..] = X * self[0..rows][..]
fn mul_assign_submatrix(&mut self, other: &SparseBinaryMatrix, rows: usize) {
assert_eq!(rows, other.height());
assert_eq!(rows, other.width());
assert!(rows <= self.height());
assert!(self.column_index_disabled);
if other.num_dense_columns != 0 {
unimplemented!();
}
// Note: rows are logically indexed
let mut temp_sparse = vec![SparseBinaryVec::with_capacity(10); rows];
let mut temp_dense = vec![0; rows * ((self.num_dense_columns - 1) / WORD_WIDTH + 1)];
for row in 0..rows {
for (i, scalar) in other.get_row_iter(row, 0, rows) {
let physical_i = self.logical_row_to_physical[i] as usize;
if scalar != Octet::zero() {
temp_sparse[row].add_assign(&self.sparse_elements[physical_i]);
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
for word in 0..words {
let (src_word, _) = self.bit_position(physical_i, word * WORD_WIDTH);
temp_dense[word * rows + row] ^= self.dense_elements[src_word];
}
}
}
}
for row in (0..rows).rev() {
let physical_row = self.logical_row_to_physical[row] as usize;
self.sparse_elements[physical_row] = temp_sparse.pop().unwrap();
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
for word in 0..words {
let (dest_word, _) = self.bit_position(physical_row, word * WORD_WIDTH);
self.dense_elements[dest_word] = temp_dense[word * rows + row];
}
}
#[cfg(debug_assertions)]
self.verify();
}
fn add_assign_rows(&mut self, dest: usize, src: usize) {
fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize) {
assert_ne!(dest, src);
assert!(
start_col == 0 || start_col == self.width - self.num_dense_columns,
"start_col must be zero or at the beginning of the U matrix"
);
let physical_dest = self.logical_row_to_physical[dest] as usize;
let physical_src = self.logical_row_to_physical[src] as usize;
// First handle the dense columns
@ -344,23 +308,25 @@ impl BinaryMatrix for SparseBinaryMatrix {
}
}
// Then the sparse columns
let (dest_row, temp_row) =
get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
// This shouldn't be needed, because while column indexing is enabled in first phase,
// columns are only eliminated one at a time in sparse section of matrix.
assert!(self.column_index_disabled || temp_row.len() == 1);
if start_col == 0 {
// Then the sparse columns
let (dest_row, temp_row) =
get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
// This shouldn't be needed, because while column indexing is enabled in first phase,
// columns are only eliminated one at a time in sparse section of matrix.
assert!(self.column_index_disabled || temp_row.len() == 1);
let column_added = dest_row.add_assign(temp_row);
// This shouldn't be needed, because while column indexing is enabled in first phase,
// columns are only removed.
assert!(self.column_index_disabled || !column_added);
let column_added = dest_row.add_assign(temp_row);
// This shouldn't be needed, because while column indexing is enabled in first phase,
// columns are only removed.
assert!(self.column_index_disabled || !column_added);
#[cfg(debug_assertions)]
{
if !self.column_index_disabled {
let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
self.debug_indexed_column_valid[col as usize] = false;
#[cfg(debug_assertions)]
{
if !self.column_index_disabled {
let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
self.debug_indexed_column_valid[col as usize] = false;
}
}
}