mirror of
https://github.com/cberner/raptorq.git
synced 2024-06-28 09:41:41 +00:00
Optimize processing of U matrix
Optimize Phases 2-5 to avoid writes to the first i columns. Additionally, use pre-computed ops from first phase to implement third & fifth phases This improves encoding performance by ~15%, especially on large symbol counts
This commit is contained in:
parent
ebe5404d14
commit
8b462f5c83
80
README.md
80
README.md
@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz
|
||||
|
||||
```
|
||||
Symbol size: 1280 bytes (without pre-built plan)
|
||||
symbol count = 10, encoded 127 MB in 0.532secs, throughput: 1924.7Mbit/s
|
||||
symbol count = 100, encoded 127 MB in 0.590secs, throughput: 1734.6Mbit/s
|
||||
symbol count = 250, encoded 127 MB in 0.572secs, throughput: 1788.4Mbit/s
|
||||
symbol count = 500, encoded 127 MB in 0.549secs, throughput: 1858.8Mbit/s
|
||||
symbol count = 1000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s
|
||||
symbol count = 2000, encoded 126 MB in 0.673secs, throughput: 1509.1Mbit/s
|
||||
symbol count = 5000, encoded 122 MB in 0.758secs, throughput: 1288.3Mbit/s
|
||||
symbol count = 10000, encoded 122 MB in 0.953secs, throughput: 1024.7Mbit/s
|
||||
symbol count = 20000, encoded 122 MB in 1.383secs, throughput: 706.1Mbit/s
|
||||
symbol count = 50000, encoded 122 MB in 2.041secs, throughput: 478.5Mbit/s
|
||||
symbol count = 10, encoded 127 MB in 0.545secs, throughput: 1878.8Mbit/s
|
||||
symbol count = 100, encoded 127 MB in 0.645secs, throughput: 1586.7Mbit/s
|
||||
symbol count = 250, encoded 127 MB in 0.509secs, throughput: 2009.7Mbit/s
|
||||
symbol count = 500, encoded 127 MB in 0.503secs, throughput: 2028.8Mbit/s
|
||||
symbol count = 1000, encoded 126 MB in 0.544secs, throughput: 1867.0Mbit/s
|
||||
symbol count = 2000, encoded 126 MB in 0.628secs, throughput: 1617.2Mbit/s
|
||||
symbol count = 5000, encoded 122 MB in 0.686secs, throughput: 1423.6Mbit/s
|
||||
symbol count = 10000, encoded 122 MB in 0.833secs, throughput: 1172.3Mbit/s
|
||||
symbol count = 20000, encoded 122 MB in 1.234secs, throughput: 791.4Mbit/s
|
||||
symbol count = 50000, encoded 122 MB in 1.786secs, throughput: 546.8Mbit/s
|
||||
|
||||
Symbol size: 1280 bytes (with pre-built plan)
|
||||
symbol count = 10, encoded 127 MB in 0.241secs, throughput: 4248.7Mbit/s
|
||||
symbol count = 100, encoded 127 MB in 0.160secs, throughput: 6396.5Mbit/s
|
||||
symbol count = 250, encoded 127 MB in 0.173secs, throughput: 5913.0Mbit/s
|
||||
symbol count = 500, encoded 127 MB in 0.176secs, throughput: 5798.3Mbit/s
|
||||
symbol count = 1000, encoded 126 MB in 0.200secs, throughput: 5078.1Mbit/s
|
||||
symbol count = 2000, encoded 126 MB in 0.208secs, throughput: 4882.8Mbit/s
|
||||
symbol count = 5000, encoded 122 MB in 0.280secs, throughput: 3487.7Mbit/s
|
||||
symbol count = 10000, encoded 122 MB in 0.400secs, throughput: 2441.4Mbit/s
|
||||
symbol count = 20000, encoded 122 MB in 0.494secs, throughput: 1976.8Mbit/s
|
||||
symbol count = 50000, encoded 122 MB in 0.656secs, throughput: 1488.7Mbit/s
|
||||
symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s
|
||||
symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s
|
||||
symbol count = 250, encoded 127 MB in 0.164secs, throughput: 6237.5Mbit/s
|
||||
symbol count = 500, encoded 127 MB in 0.169secs, throughput: 6038.5Mbit/s
|
||||
symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s
|
||||
symbol count = 2000, encoded 126 MB in 0.214secs, throughput: 4745.9Mbit/s
|
||||
symbol count = 5000, encoded 122 MB in 0.262secs, throughput: 3727.3Mbit/s
|
||||
symbol count = 10000, encoded 122 MB in 0.344secs, throughput: 2838.8Mbit/s
|
||||
symbol count = 20000, encoded 122 MB in 0.427secs, throughput: 2287.0Mbit/s
|
||||
symbol count = 50000, encoded 122 MB in 0.541secs, throughput: 1805.1Mbit/s
|
||||
|
||||
Symbol size: 1280 bytes
|
||||
symbol count = 10, decoded 127 MB in 0.723secs using 0.0% overhead, throughput: 1416.2Mbit/s
|
||||
symbol count = 100, decoded 127 MB in 0.701secs using 0.0% overhead, throughput: 1460.0Mbit/s
|
||||
symbol count = 250, decoded 127 MB in 0.650secs using 0.0% overhead, throughput: 1573.8Mbit/s
|
||||
symbol count = 500, decoded 127 MB in 0.638secs using 0.0% overhead, throughput: 1599.5Mbit/s
|
||||
symbol count = 1000, decoded 126 MB in 0.676secs using 0.0% overhead, throughput: 1502.4Mbit/s
|
||||
symbol count = 2000, decoded 126 MB in 0.764secs using 0.0% overhead, throughput: 1329.4Mbit/s
|
||||
symbol count = 5000, decoded 122 MB in 0.896secs using 0.0% overhead, throughput: 1089.9Mbit/s
|
||||
symbol count = 10000, decoded 122 MB in 1.176secs using 0.0% overhead, throughput: 830.4Mbit/s
|
||||
symbol count = 20000, decoded 122 MB in 1.489secs using 0.0% overhead, throughput: 655.9Mbit/s
|
||||
symbol count = 50000, decoded 122 MB in 2.633secs using 0.0% overhead, throughput: 370.9Mbit/s
|
||||
symbol count = 10, decoded 127 MB in 0.749secs using 0.0% overhead, throughput: 1367.1Mbit/s
|
||||
symbol count = 100, decoded 127 MB in 0.742secs using 0.0% overhead, throughput: 1379.3Mbit/s
|
||||
symbol count = 250, decoded 127 MB in 0.589secs using 0.0% overhead, throughput: 1736.8Mbit/s
|
||||
symbol count = 500, decoded 127 MB in 0.594secs using 0.0% overhead, throughput: 1718.0Mbit/s
|
||||
symbol count = 1000, decoded 126 MB in 0.638secs using 0.0% overhead, throughput: 1591.9Mbit/s
|
||||
symbol count = 2000, decoded 126 MB in 0.718secs using 0.0% overhead, throughput: 1414.5Mbit/s
|
||||
symbol count = 5000, decoded 122 MB in 0.829secs using 0.0% overhead, throughput: 1178.0Mbit/s
|
||||
symbol count = 10000, decoded 122 MB in 1.049secs using 0.0% overhead, throughput: 930.9Mbit/s
|
||||
symbol count = 20000, decoded 122 MB in 1.382secs using 0.0% overhead, throughput: 706.6Mbit/s
|
||||
symbol count = 50000, decoded 122 MB in 2.355secs using 0.0% overhead, throughput: 414.7Mbit/s
|
||||
|
||||
symbol count = 10, decoded 127 MB in 0.713secs using 5.0% overhead, throughput: 1436.1Mbit/s
|
||||
symbol count = 100, decoded 127 MB in 0.702secs using 5.0% overhead, throughput: 1457.9Mbit/s
|
||||
symbol count = 250, decoded 127 MB in 0.637secs using 5.0% overhead, throughput: 1605.9Mbit/s
|
||||
symbol count = 500, decoded 127 MB in 0.613secs using 5.0% overhead, throughput: 1664.8Mbit/s
|
||||
symbol count = 1000, decoded 126 MB in 0.643secs using 5.0% overhead, throughput: 1579.5Mbit/s
|
||||
symbol count = 2000, decoded 126 MB in 0.701secs using 5.0% overhead, throughput: 1448.8Mbit/s
|
||||
symbol count = 5000, decoded 122 MB in 0.826secs using 5.0% overhead, throughput: 1182.3Mbit/s
|
||||
symbol count = 10000, decoded 122 MB in 1.061secs using 5.0% overhead, throughput: 920.4Mbit/s
|
||||
symbol count = 20000, decoded 122 MB in 1.380secs using 5.0% overhead, throughput: 707.7Mbit/s
|
||||
symbol count = 50000, decoded 122 MB in 2.341secs using 5.0% overhead, throughput: 417.2Mbit/s
|
||||
symbol count = 10, decoded 127 MB in 0.740secs using 5.0% overhead, throughput: 1383.7Mbit/s
|
||||
symbol count = 100, decoded 127 MB in 0.747secs using 5.0% overhead, throughput: 1370.1Mbit/s
|
||||
symbol count = 250, decoded 127 MB in 0.581secs using 5.0% overhead, throughput: 1760.7Mbit/s
|
||||
symbol count = 500, decoded 127 MB in 0.565secs using 5.0% overhead, throughput: 1806.2Mbit/s
|
||||
symbol count = 1000, decoded 126 MB in 0.605secs using 5.0% overhead, throughput: 1678.7Mbit/s
|
||||
symbol count = 2000, decoded 126 MB in 0.656secs using 5.0% overhead, throughput: 1548.2Mbit/s
|
||||
symbol count = 5000, decoded 122 MB in 0.763secs using 5.0% overhead, throughput: 1279.9Mbit/s
|
||||
symbol count = 10000, decoded 122 MB in 0.959secs using 5.0% overhead, throughput: 1018.3Mbit/s
|
||||
symbol count = 20000, decoded 122 MB in 1.242secs using 5.0% overhead, throughput: 786.3Mbit/s
|
||||
symbol count = 50000, decoded 122 MB in 2.146secs using 5.0% overhead, throughput: 455.1Mbit/s
|
||||
```
|
||||
|
||||
### Public API
|
||||
|
@ -43,11 +43,8 @@ pub trait BinaryMatrix: Clone {
|
||||
// Hints that column i will not be swapped again, and is likely to become dense'ish
|
||||
fn hint_column_dense_and_frozen(&mut self, i: usize);
|
||||
|
||||
// other must be a rows x rows matrix
|
||||
// sets self[0..rows][..] = X * self[0..rows][..]
|
||||
fn mul_assign_submatrix(&mut self, other: &Self, rows: usize);
|
||||
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize);
|
||||
// If start_col is non-zero, values left of start_col in dest row are undefined after this operation
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize);
|
||||
|
||||
fn resize(&mut self, new_height: usize, new_width: usize);
|
||||
}
|
||||
@ -214,29 +211,7 @@ impl BinaryMatrix for DenseBinaryMatrix {
|
||||
// No-op
|
||||
}
|
||||
|
||||
// other must be a rows x rows matrix
|
||||
// sets self[0..rows][..] = X * self[0..rows][..]
|
||||
fn mul_assign_submatrix(&mut self, other: &DenseBinaryMatrix, rows: usize) {
|
||||
assert_eq!(rows, other.height());
|
||||
assert_eq!(rows, other.width());
|
||||
assert!(rows <= self.height());
|
||||
let mut temp = vec![vec![0; DenseBinaryMatrix::bit_position(self.width).0 + 1]; rows];
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for row in 0..rows {
|
||||
for i in 0..rows {
|
||||
let scalar = other.get(row, i);
|
||||
if scalar == Octet::zero() {
|
||||
continue;
|
||||
}
|
||||
add_assign_binary(&mut temp[row], &self.elements[i]);
|
||||
}
|
||||
}
|
||||
for row in (0..rows).rev() {
|
||||
self.elements[row] = temp.pop().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize) {
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize, _start_col: usize) {
|
||||
assert_ne!(dest, src);
|
||||
let (dest_row, temp_row) = get_both_indices(&mut self.elements, dest, src);
|
||||
add_assign_binary(dest_row, temp_row);
|
||||
@ -263,22 +238,6 @@ mod tests {
|
||||
use crate::octet::Octet;
|
||||
use crate::sparse_matrix::SparseBinaryMatrix;
|
||||
|
||||
fn dense_identity(size: usize) -> DenseBinaryMatrix {
|
||||
let mut result = DenseBinaryMatrix::new(size, size, 0);
|
||||
for i in 0..size {
|
||||
result.set(i, i, Octet::one());
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn sparse_identity(size: usize) -> SparseBinaryMatrix {
|
||||
let mut result = SparseBinaryMatrix::new(size, size, 0);
|
||||
for i in 0..size {
|
||||
result.set(i, i, Octet::one());
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn rand_dense_and_sparse(size: usize) -> (DenseBinaryMatrix, SparseBinaryMatrix) {
|
||||
let mut dense = DenseBinaryMatrix::new(size, size, 0);
|
||||
let mut sparse = SparseBinaryMatrix::new(size, size, 1);
|
||||
@ -345,39 +304,16 @@ mod tests {
|
||||
assert_eq!(dense.count_ones(3, 1, 2), sparse.count_ones(3, 1, 2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mul_assign_submatrix() {
|
||||
// rand_dense_and_sparse uses set(), so just check that it works
|
||||
let (mut dense, mut sparse) = rand_dense_and_sparse(8);
|
||||
let original = dense.clone();
|
||||
|
||||
let identity = dense_identity(5);
|
||||
dense.mul_assign_submatrix(&identity, 5);
|
||||
assert_matrices_eq(&dense, &original);
|
||||
|
||||
let identity = dense_identity(8);
|
||||
dense.mul_assign_submatrix(&identity, 8);
|
||||
assert_matrices_eq(&dense, &original);
|
||||
|
||||
let identity = sparse_identity(5);
|
||||
sparse.mul_assign_submatrix(&identity, 5);
|
||||
assert_matrices_eq(&sparse, &original);
|
||||
|
||||
let identity = sparse_identity(8);
|
||||
sparse.mul_assign_submatrix(&identity, 8);
|
||||
assert_matrices_eq(&sparse, &original);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn fma_rows() {
|
||||
// rand_dense_and_sparse uses set(), so just check that it works
|
||||
let (mut dense, mut sparse) = rand_dense_and_sparse(8);
|
||||
dense.add_assign_rows(0, 1);
|
||||
dense.add_assign_rows(0, 2);
|
||||
dense.add_assign_rows(2, 1);
|
||||
sparse.add_assign_rows(0, 1);
|
||||
sparse.add_assign_rows(0, 2);
|
||||
sparse.add_assign_rows(2, 1);
|
||||
dense.add_assign_rows(0, 1, 0);
|
||||
dense.add_assign_rows(0, 2, 0);
|
||||
dense.add_assign_rows(2, 1, 0);
|
||||
sparse.add_assign_rows(0, 1, 0);
|
||||
sparse.add_assign_rows(0, 2, 0);
|
||||
sparse.add_assign_rows(2, 1, 0);
|
||||
assert_matrices_eq(&dense, &sparse);
|
||||
}
|
||||
|
||||
|
186
src/pi_solver.rs
186
src/pi_solver.rs
@ -12,6 +12,12 @@ use crate::systematic_constants::num_pi_symbols;
|
||||
use crate::util::get_both_indices;
|
||||
use std::mem::size_of;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
enum RowOp {
|
||||
AddAssign { src: usize, dest: usize },
|
||||
Swap { row1: usize, row2: usize },
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
struct FirstPhaseRowSelectionStats {
|
||||
original_degree: U16ArrayMap,
|
||||
@ -509,9 +515,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
}
|
||||
|
||||
// First phase (section 5.4.2.2)
|
||||
//
|
||||
// Returns the row operations required to convert the X matrix into the identity
|
||||
#[allow(non_snake_case)]
|
||||
#[inline(never)]
|
||||
fn first_phase(&mut self) -> bool {
|
||||
fn first_phase(&mut self) -> Option<Vec<RowOp>> {
|
||||
// First phase (section 5.4.2.2)
|
||||
|
||||
// ----------> i u <--------
|
||||
@ -533,6 +541,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
let mut selection_helper =
|
||||
FirstPhaseRowSelectionStats::new(&self.A, self.A.width() - self.u);
|
||||
|
||||
// Record of first phase row operations performed on non-HDPC rows
|
||||
let mut row_ops = vec![];
|
||||
|
||||
while self.i + self.u < self.L {
|
||||
// Calculate r
|
||||
// "Let r be the minimum integer such that at least one row of A has
|
||||
@ -545,7 +556,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
);
|
||||
|
||||
if r == None {
|
||||
return false;
|
||||
return None;
|
||||
}
|
||||
let r = r.unwrap();
|
||||
let chosen_row = chosen_row.unwrap();
|
||||
@ -556,6 +567,10 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
let temp = self.i;
|
||||
self.swap_rows(temp, chosen_row);
|
||||
self.X.swap_rows(temp, chosen_row);
|
||||
row_ops.push(RowOp::Swap {
|
||||
row1: temp,
|
||||
row2: chosen_row,
|
||||
});
|
||||
selection_helper.swap_rows(temp, chosen_row);
|
||||
// Reorder columns
|
||||
self.first_phase_swap_columns_substep(r);
|
||||
@ -586,7 +601,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
let row = row as usize;
|
||||
assert_eq!(&temp_value, &Octet::one());
|
||||
// Addition is equivalent to subtraction.
|
||||
self.fma_rows(temp, row, Octet::one());
|
||||
self.fma_rows(temp, row, Octet::one(), 0);
|
||||
row_ops.push(RowOp::AddAssign {
|
||||
src: temp,
|
||||
dest: row,
|
||||
});
|
||||
if r == 1 {
|
||||
// Hot path for r == 1, since it's very common due to maximum connected
|
||||
// component selection, and recompute_row() is expensive
|
||||
@ -613,6 +632,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
// since all the rest were column swapped into the PI submatrix.
|
||||
Some(temp),
|
||||
Some(&pi_octets),
|
||||
0,
|
||||
);
|
||||
// It's safe to skip updating the selection helper, since it will never
|
||||
// select an HDPC row
|
||||
@ -626,7 +646,34 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
}
|
||||
|
||||
self.record_symbol_ops(0);
|
||||
return true;
|
||||
|
||||
let mut mapping: Vec<usize> = (0..self.A.height()).collect();
|
||||
let mut row_ops: Vec<RowOp> = row_ops
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|x| match x {
|
||||
RowOp::AddAssign { src, dest } => {
|
||||
assert!(mapping[*src] < self.i);
|
||||
RowOp::AddAssign {
|
||||
src: mapping[*src],
|
||||
dest: mapping[*dest],
|
||||
}
|
||||
}
|
||||
RowOp::Swap { row1, row2 } => {
|
||||
mapping.swap(*row1, *row2);
|
||||
RowOp::Swap {
|
||||
row1: *row1,
|
||||
row2: *row2,
|
||||
}
|
||||
}
|
||||
})
|
||||
.filter(|x| match x {
|
||||
RowOp::AddAssign { src, dest } => *src < self.i && *dest < self.i,
|
||||
RowOp::Swap { .. } => false,
|
||||
})
|
||||
.collect();
|
||||
row_ops.reverse();
|
||||
return Some(row_ops);
|
||||
}
|
||||
|
||||
// See section 5.4.2.2. Verifies the two all-zeros submatrices and the identity submatrix
|
||||
@ -649,9 +696,9 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
// Second phase (section 5.4.2.3)
|
||||
#[allow(non_snake_case)]
|
||||
#[inline(never)]
|
||||
fn second_phase(&mut self) -> bool {
|
||||
fn second_phase(&mut self, #[allow(unused_variables)] x_elimination_ops: &[RowOp]) -> bool {
|
||||
#[cfg(debug_assertions)]
|
||||
self.second_phase_verify();
|
||||
self.second_phase_verify(x_elimination_ops);
|
||||
|
||||
self.X.resize(self.i, self.i);
|
||||
|
||||
@ -676,54 +723,52 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
// Verifies that X is lower triangular. See section 5.4.2.3
|
||||
#[inline(never)]
|
||||
#[cfg(debug_assertions)]
|
||||
fn second_phase_verify(&self) {
|
||||
fn second_phase_verify(&self, x_elimination_ops: &[RowOp]) {
|
||||
for row in 0..self.i {
|
||||
for col in (row + 1)..self.i {
|
||||
assert_eq!(Octet::zero(), self.X.get(row, col));
|
||||
}
|
||||
}
|
||||
|
||||
// Also verify Errata 9
|
||||
let mut tempX = self.X.clone();
|
||||
for op in x_elimination_ops {
|
||||
match op {
|
||||
RowOp::AddAssign { src, dest } => {
|
||||
tempX.add_assign_rows(*dest, *src, 0);
|
||||
}
|
||||
RowOp::Swap { .. } => unreachable!(),
|
||||
}
|
||||
}
|
||||
for row in 0..self.i {
|
||||
for col in 0..self.i {
|
||||
if row == col {
|
||||
assert_eq!(Octet::one(), tempX.get(row, col));
|
||||
} else {
|
||||
assert_eq!(Octet::zero(), tempX.get(row, col));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Third phase (section 5.4.2.4)
|
||||
#[allow(non_snake_case)]
|
||||
#[inline(never)]
|
||||
fn third_phase(&mut self) {
|
||||
fn third_phase(&mut self, x_elimination_ops: &[RowOp]) {
|
||||
#[cfg(debug_assertions)]
|
||||
self.third_phase_verify();
|
||||
|
||||
// A[0..i][..] = X * A[0..i][..]
|
||||
self.A.mul_assign_submatrix(&self.X, self.i);
|
||||
|
||||
// Now apply the same operations to D.
|
||||
// Note that X is lower triangular, so the row must be processed last to first
|
||||
for row in (0..self.i).rev() {
|
||||
if self.X.get(row, row) != Octet::one() {
|
||||
self.debug_symbol_mul_ops += 1;
|
||||
self.deferred_D_ops.push(SymbolOps::MulAssign {
|
||||
dest: self.d[row],
|
||||
scalar: self.X.get(row, row),
|
||||
});
|
||||
}
|
||||
|
||||
for (col, value) in self.X.get_row_iter(row, 0, row) {
|
||||
if value == Octet::zero() {
|
||||
continue;
|
||||
}
|
||||
if value == Octet::one() {
|
||||
self.debug_symbol_add_ops += 1;
|
||||
self.deferred_D_ops.push(SymbolOps::AddAssign {
|
||||
dest: self.d[row],
|
||||
src: self.d[col],
|
||||
});
|
||||
} else {
|
||||
self.debug_symbol_mul_ops += 1;
|
||||
self.debug_symbol_add_ops += 1;
|
||||
self.deferred_D_ops.push(SymbolOps::FMA {
|
||||
dest: self.d[row],
|
||||
src: self.d[col],
|
||||
scalar: self.X.get(row, col),
|
||||
});
|
||||
// Perform A[0..i][..] = X * A[0..i][..] by applying Errata 10
|
||||
for op in x_elimination_ops.iter().rev() {
|
||||
match op {
|
||||
RowOp::AddAssign { src, dest } => {
|
||||
#[cfg(debug_assertions)]
|
||||
self.fma_rows(*src, *dest, Octet::one(), 0);
|
||||
#[cfg(not(debug_assertions))]
|
||||
// Skip applying to cols before i due to Errata 11
|
||||
self.fma_rows(*src, *dest, Octet::one(), self.i);
|
||||
}
|
||||
RowOp::Swap { .. } => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -771,7 +816,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
let b = self.A.get(i, j + self.i);
|
||||
if b != Octet::zero() {
|
||||
let temp = self.i;
|
||||
self.fma_rows(temp + j, i, b);
|
||||
#[cfg(debug_assertions)]
|
||||
self.fma_rows(temp + j, i, b, 0);
|
||||
// Skip applying to cols before i due to Errata 11
|
||||
#[cfg(not(debug_assertions))]
|
||||
self.fma_rows(temp + j, i, b, self.i);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -814,20 +863,18 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
// Fifth phase (section 5.4.2.6)
|
||||
#[allow(non_snake_case)]
|
||||
#[inline(never)]
|
||||
fn fifth_phase(&mut self) {
|
||||
// "For j from 1 to i". Note that A is 1-indexed in the spec, and ranges are inclusive,
|
||||
// this is means [1, i], which is equal to [0, i)
|
||||
for j in 0..self.i as usize {
|
||||
// Skip normalizing the diagonal, since there can't be non-binary values due to
|
||||
// Errata 7
|
||||
|
||||
// "For l from 1 to j-1". This means the lower triangular columns, not including the
|
||||
// diagonal, which is [0, j)
|
||||
for (l, _) in self.A.get_row_iter(j, 0, j).clone() {
|
||||
let temp = self.A.get(j, l);
|
||||
if temp != Octet::zero() {
|
||||
self.fma_rows(l, j, temp);
|
||||
fn fifth_phase(&mut self, x_elimination_ops: &[RowOp]) {
|
||||
// Use the saved operations from first phase: Errata 9
|
||||
for op in x_elimination_ops {
|
||||
match op {
|
||||
RowOp::AddAssign { src, dest } => {
|
||||
#[cfg(debug_assertions)]
|
||||
self.fma_rows(*src, *dest, Octet::one(), 0);
|
||||
// In release builds skip updating the A matrix, since it will never be read
|
||||
#[cfg(not(debug_assertions))]
|
||||
self.record_fma_rows(*src, *dest, Octet::one());
|
||||
}
|
||||
RowOp::Swap { .. } => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -1011,8 +1058,8 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
assert!(self.A_hdpc_rows.is_none());
|
||||
}
|
||||
|
||||
fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
|
||||
self.fma_rows_with_pi(i, iprime, beta, None, None);
|
||||
fn fma_rows(&mut self, i: usize, iprime: usize, beta: Octet, start_col: usize) {
|
||||
self.fma_rows_with_pi(i, iprime, beta, None, None, start_col);
|
||||
}
|
||||
|
||||
fn record_fma_rows(&mut self, i: usize, iprime: usize, beta: Octet) {
|
||||
@ -1040,6 +1087,7 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
beta: Octet,
|
||||
only_non_pi_nonzero_column: Option<usize>,
|
||||
pi_octets: Option<&Vec<u8>>,
|
||||
start_col: usize,
|
||||
) {
|
||||
self.record_fma_rows(i, iprime, beta.clone());
|
||||
|
||||
@ -1064,11 +1112,11 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
);
|
||||
} else {
|
||||
assert_eq!(&beta, &Octet::one());
|
||||
self.A.add_assign_rows(iprime, i);
|
||||
self.A.add_assign_rows(iprime, i, start_col);
|
||||
}
|
||||
} else {
|
||||
assert_eq!(&beta, &Octet::one());
|
||||
self.A.add_assign_rows(iprime, i);
|
||||
self.A.add_assign_rows(iprime, i, start_col);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1095,20 +1143,20 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
|
||||
pub fn execute(&mut self) -> (Option<Vec<Symbol>>, Option<Vec<SymbolOps>>) {
|
||||
self.X.disable_column_acccess_acceleration();
|
||||
|
||||
if !self.first_phase() {
|
||||
if let Some(x_elimination_ops) = self.first_phase() {
|
||||
self.A.disable_column_acccess_acceleration();
|
||||
|
||||
if !self.second_phase(&x_elimination_ops) {
|
||||
return (None, None);
|
||||
}
|
||||
|
||||
self.third_phase(&x_elimination_ops);
|
||||
self.fourth_phase();
|
||||
self.fifth_phase(&x_elimination_ops);
|
||||
} else {
|
||||
return (None, None);
|
||||
}
|
||||
|
||||
self.A.disable_column_acccess_acceleration();
|
||||
|
||||
if !self.second_phase() {
|
||||
return (None, None);
|
||||
}
|
||||
|
||||
self.third_phase();
|
||||
self.fourth_phase();
|
||||
self.fifth_phase();
|
||||
|
||||
self.apply_deferred_symbol_ops();
|
||||
|
||||
// See end of section 5.4.2.1
|
||||
|
@ -290,48 +290,12 @@ impl BinaryMatrix for SparseBinaryMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
// other must be a rows x rows matrix
|
||||
// sets self[0..rows][..] = X * self[0..rows][..]
|
||||
fn mul_assign_submatrix(&mut self, other: &SparseBinaryMatrix, rows: usize) {
|
||||
assert_eq!(rows, other.height());
|
||||
assert_eq!(rows, other.width());
|
||||
assert!(rows <= self.height());
|
||||
assert!(self.column_index_disabled);
|
||||
if other.num_dense_columns != 0 {
|
||||
unimplemented!();
|
||||
}
|
||||
// Note: rows are logically indexed
|
||||
let mut temp_sparse = vec![SparseBinaryVec::with_capacity(10); rows];
|
||||
let mut temp_dense = vec![0; rows * ((self.num_dense_columns - 1) / WORD_WIDTH + 1)];
|
||||
for row in 0..rows {
|
||||
for (i, scalar) in other.get_row_iter(row, 0, rows) {
|
||||
let physical_i = self.logical_row_to_physical[i] as usize;
|
||||
if scalar != Octet::zero() {
|
||||
temp_sparse[row].add_assign(&self.sparse_elements[physical_i]);
|
||||
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
|
||||
for word in 0..words {
|
||||
let (src_word, _) = self.bit_position(physical_i, word * WORD_WIDTH);
|
||||
temp_dense[word * rows + row] ^= self.dense_elements[src_word];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for row in (0..rows).rev() {
|
||||
let physical_row = self.logical_row_to_physical[row] as usize;
|
||||
self.sparse_elements[physical_row] = temp_sparse.pop().unwrap();
|
||||
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
|
||||
for word in 0..words {
|
||||
let (dest_word, _) = self.bit_position(physical_row, word * WORD_WIDTH);
|
||||
self.dense_elements[dest_word] = temp_dense[word * rows + row];
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
self.verify();
|
||||
}
|
||||
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize) {
|
||||
fn add_assign_rows(&mut self, dest: usize, src: usize, start_col: usize) {
|
||||
assert_ne!(dest, src);
|
||||
assert!(
|
||||
start_col == 0 || start_col == self.width - self.num_dense_columns,
|
||||
"start_col must be zero or at the beginning of the U matrix"
|
||||
);
|
||||
let physical_dest = self.logical_row_to_physical[dest] as usize;
|
||||
let physical_src = self.logical_row_to_physical[src] as usize;
|
||||
// First handle the dense columns
|
||||
@ -344,23 +308,25 @@ impl BinaryMatrix for SparseBinaryMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
// Then the sparse columns
|
||||
let (dest_row, temp_row) =
|
||||
get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
|
||||
// This shouldn't be needed, because while column indexing is enabled in first phase,
|
||||
// columns are only eliminated one at a time in sparse section of matrix.
|
||||
assert!(self.column_index_disabled || temp_row.len() == 1);
|
||||
if start_col == 0 {
|
||||
// Then the sparse columns
|
||||
let (dest_row, temp_row) =
|
||||
get_both_indices(&mut self.sparse_elements, physical_dest, physical_src);
|
||||
// This shouldn't be needed, because while column indexing is enabled in first phase,
|
||||
// columns are only eliminated one at a time in sparse section of matrix.
|
||||
assert!(self.column_index_disabled || temp_row.len() == 1);
|
||||
|
||||
let column_added = dest_row.add_assign(temp_row);
|
||||
// This shouldn't be needed, because while column indexing is enabled in first phase,
|
||||
// columns are only removed.
|
||||
assert!(self.column_index_disabled || !column_added);
|
||||
let column_added = dest_row.add_assign(temp_row);
|
||||
// This shouldn't be needed, because while column indexing is enabled in first phase,
|
||||
// columns are only removed.
|
||||
assert!(self.column_index_disabled || !column_added);
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
if !self.column_index_disabled {
|
||||
let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
|
||||
self.debug_indexed_column_valid[col as usize] = false;
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
if !self.column_index_disabled {
|
||||
let col = self.physical_col_to_logical[temp_row.get_by_raw_index(0).0];
|
||||
self.debug_indexed_column_valid[col as usize] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user