From c4d227fba1df691b9504e3fb01cc171725af9607 Mon Sep 17 00:00:00 2001 From: Christopher Berner Date: Sat, 28 Nov 2020 17:05:44 -0800 Subject: [PATCH] Optimize memory layout of dense U matrix Previously we used column major ordering. Switch to row major to optimize sequential access of rows, which is much more common in the first phase, and can also be used in the fourth phase. This improves performance by ~10% on large symbol counts. --- README.md | 78 +++++++++++++-------------- src/matrix.rs | 32 +++++++++++ src/pi_solver.rs | 16 +++--- src/sparse_matrix.rs | 125 ++++++++++++++++++++++++++++++++----------- 4 files changed, 172 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index a812e2a..16f0e2d 100644 --- a/README.md +++ b/README.md @@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz ``` Symbol size: 1280 bytes (without pre-built plan) -symbol count = 10, encoded 127 MB in 0.545secs, throughput: 1878.8Mbit/s -symbol count = 100, encoded 127 MB in 0.645secs, throughput: 1586.7Mbit/s -symbol count = 250, encoded 127 MB in 0.509secs, throughput: 2009.7Mbit/s -symbol count = 500, encoded 127 MB in 0.503secs, throughput: 2028.8Mbit/s -symbol count = 1000, encoded 126 MB in 0.544secs, throughput: 1867.0Mbit/s -symbol count = 2000, encoded 126 MB in 0.628secs, throughput: 1617.2Mbit/s -symbol count = 5000, encoded 122 MB in 0.686secs, throughput: 1423.6Mbit/s -symbol count = 10000, encoded 122 MB in 0.833secs, throughput: 1172.3Mbit/s -symbol count = 20000, encoded 122 MB in 1.234secs, throughput: 791.4Mbit/s -symbol count = 50000, encoded 122 MB in 1.786secs, throughput: 546.8Mbit/s +symbol count = 10, encoded 127 MB in 0.548secs, throughput: 1868.5Mbit/s +symbol count = 100, encoded 127 MB in 0.637secs, throughput: 1606.7Mbit/s +symbol count = 250, encoded 127 MB in 0.507secs, throughput: 2017.7Mbit/s +symbol count = 500, encoded 127 MB in 0.488secs, throughput: 2091.2Mbit/s +symbol count = 1000, encoded 126 MB in 
0.523secs, throughput: 1941.9Mbit/s +symbol count = 2000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s +symbol count = 5000, encoded 122 MB in 0.636secs, throughput: 1535.5Mbit/s +symbol count = 10000, encoded 122 MB in 0.769secs, throughput: 1269.9Mbit/s +symbol count = 20000, encoded 122 MB in 1.122secs, throughput: 870.4Mbit/s +symbol count = 50000, encoded 122 MB in 1.597secs, throughput: 611.5Mbit/s Symbol size: 1280 bytes (with pre-built plan) symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s -symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s -symbol count = 250, encoded 127 MB in 0.164secs, throughput: 6237.5Mbit/s -symbol count = 500, encoded 127 MB in 0.169secs, throughput: 6038.5Mbit/s -symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s -symbol count = 2000, encoded 126 MB in 0.214secs, throughput: 4745.9Mbit/s -symbol count = 5000, encoded 122 MB in 0.262secs, throughput: 3727.3Mbit/s -symbol count = 10000, encoded 122 MB in 0.344secs, throughput: 2838.8Mbit/s -symbol count = 20000, encoded 122 MB in 0.427secs, throughput: 2287.0Mbit/s -symbol count = 50000, encoded 122 MB in 0.541secs, throughput: 1805.1Mbit/s +symbol count = 100, encoded 127 MB in 0.154secs, throughput: 6645.7Mbit/s +symbol count = 250, encoded 127 MB in 0.160secs, throughput: 6393.4Mbit/s +symbol count = 500, encoded 127 MB in 0.163secs, throughput: 6260.8Mbit/s +symbol count = 1000, encoded 126 MB in 0.173secs, throughput: 5870.7Mbit/s +symbol count = 2000, encoded 126 MB in 0.199secs, throughput: 5103.6Mbit/s +symbol count = 5000, encoded 122 MB in 0.255secs, throughput: 3829.7Mbit/s +symbol count = 10000, encoded 122 MB in 0.339secs, throughput: 2880.7Mbit/s +symbol count = 20000, encoded 122 MB in 0.425secs, throughput: 2297.8Mbit/s +symbol count = 50000, encoded 122 MB in 0.536secs, throughput: 1821.9Mbit/s Symbol size: 1280 bytes -symbol count = 10, decoded 127 MB in 0.749secs using 0.0% overhead, 
throughput: 1367.1Mbit/s -symbol count = 100, decoded 127 MB in 0.742secs using 0.0% overhead, throughput: 1379.3Mbit/s -symbol count = 250, decoded 127 MB in 0.589secs using 0.0% overhead, throughput: 1736.8Mbit/s -symbol count = 500, decoded 127 MB in 0.594secs using 0.0% overhead, throughput: 1718.0Mbit/s -symbol count = 1000, decoded 126 MB in 0.638secs using 0.0% overhead, throughput: 1591.9Mbit/s -symbol count = 2000, decoded 126 MB in 0.718secs using 0.0% overhead, throughput: 1414.5Mbit/s -symbol count = 5000, decoded 122 MB in 0.829secs using 0.0% overhead, throughput: 1178.0Mbit/s -symbol count = 10000, decoded 122 MB in 1.049secs using 0.0% overhead, throughput: 930.9Mbit/s -symbol count = 20000, decoded 122 MB in 1.382secs using 0.0% overhead, throughput: 706.6Mbit/s -symbol count = 50000, decoded 122 MB in 2.355secs using 0.0% overhead, throughput: 414.7Mbit/s +symbol count = 10, decoded 127 MB in 0.758secs using 0.0% overhead, throughput: 1350.8Mbit/s +symbol count = 100, decoded 127 MB in 0.740secs using 0.0% overhead, throughput: 1383.0Mbit/s +symbol count = 250, decoded 127 MB in 0.577secs using 0.0% overhead, throughput: 1772.9Mbit/s +symbol count = 500, decoded 127 MB in 0.582secs using 0.0% overhead, throughput: 1753.4Mbit/s +symbol count = 1000, decoded 126 MB in 0.628secs using 0.0% overhead, throughput: 1617.2Mbit/s +symbol count = 2000, decoded 126 MB in 0.684secs using 0.0% overhead, throughput: 1484.8Mbit/s +symbol count = 5000, decoded 122 MB in 0.785secs using 0.0% overhead, throughput: 1244.0Mbit/s +symbol count = 10000, decoded 122 MB in 0.965secs using 0.0% overhead, throughput: 1012.0Mbit/s +symbol count = 20000, decoded 122 MB in 1.345secs using 0.0% overhead, throughput: 726.1Mbit/s +symbol count = 50000, decoded 122 MB in 2.101secs using 0.0% overhead, throughput: 464.8Mbit/s -symbol count = 10, decoded 127 MB in 0.740secs using 5.0% overhead, throughput: 1383.7Mbit/s -symbol count = 100, decoded 127 MB in 0.747secs using 5.0% 
overhead, throughput: 1370.1Mbit/s -symbol count = 250, decoded 127 MB in 0.581secs using 5.0% overhead, throughput: 1760.7Mbit/s -symbol count = 500, decoded 127 MB in 0.565secs using 5.0% overhead, throughput: 1806.2Mbit/s -symbol count = 1000, decoded 126 MB in 0.605secs using 5.0% overhead, throughput: 1678.7Mbit/s -symbol count = 2000, decoded 126 MB in 0.656secs using 5.0% overhead, throughput: 1548.2Mbit/s -symbol count = 5000, decoded 122 MB in 0.763secs using 5.0% overhead, throughput: 1279.9Mbit/s -symbol count = 10000, decoded 122 MB in 0.959secs using 5.0% overhead, throughput: 1018.3Mbit/s -symbol count = 20000, decoded 122 MB in 1.242secs using 5.0% overhead, throughput: 786.3Mbit/s -symbol count = 50000, decoded 122 MB in 2.146secs using 5.0% overhead, throughput: 455.1Mbit/s +symbol count = 10, decoded 127 MB in 0.753secs using 5.0% overhead, throughput: 1359.8Mbit/s +symbol count = 100, decoded 127 MB in 0.731secs using 5.0% overhead, throughput: 1400.1Mbit/s +symbol count = 250, decoded 127 MB in 0.575secs using 5.0% overhead, throughput: 1779.0Mbit/s +symbol count = 500, decoded 127 MB in 0.552secs using 5.0% overhead, throughput: 1848.7Mbit/s +symbol count = 1000, decoded 126 MB in 0.568secs using 5.0% overhead, throughput: 1788.1Mbit/s +symbol count = 2000, decoded 126 MB in 0.626secs using 5.0% overhead, throughput: 1622.4Mbit/s +symbol count = 5000, decoded 122 MB in 0.713secs using 5.0% overhead, throughput: 1369.7Mbit/s +symbol count = 10000, decoded 122 MB in 0.893secs using 5.0% overhead, throughput: 1093.6Mbit/s +symbol count = 20000, decoded 122 MB in 1.147secs using 5.0% overhead, throughput: 851.4Mbit/s +symbol count = 50000, decoded 122 MB in 1.943secs using 5.0% overhead, throughput: 502.6Mbit/s ``` ### Public API diff --git a/src/matrix.rs b/src/matrix.rs index 7829e40..f485941 100644 --- a/src/matrix.rs +++ b/src/matrix.rs @@ -27,6 +27,9 @@ pub trait BinaryMatrix: Clone { // Get a slice of columns from a row as Octets fn 
get_sub_row_as_octets(&self, row: usize, start_col: usize) -> Vec; + // Returns a list of columns with non-zero values in the given row, starting with start_col + fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec; + fn get(&self, i: usize, j: usize) -> Octet; fn swap_rows(&mut self, i: usize, j: usize); @@ -177,6 +180,12 @@ impl BinaryMatrix for DenseBinaryMatrix { result } + fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec { + (start_col..self.width) + .filter(|col| self.get(row, *col) != Octet::zero()) + .collect() + } + fn get(&self, i: usize, j: usize) -> Octet { let (word, bit) = DenseBinaryMatrix::bit_position(j); if self.elements[i][word] & DenseBinaryMatrix::select_mask(bit) == 0 { @@ -337,4 +346,27 @@ mod tests { sparse.hint_column_dense_and_frozen(5); assert_matrices_eq(&dense, &sparse); } + + #[test] + fn dense_storage_math() { + let size = 128; + let (mut dense, mut sparse) = rand_dense_and_sparse(size); + sparse.enable_column_access_acceleration(); + for i in (0..(size - 1)).rev() { + sparse.hint_column_dense_and_frozen(i); + assert_matrices_eq(&dense, &sparse); + } + assert_matrices_eq(&dense, &sparse); + sparse.disable_column_access_acceleration(); + for _ in 0..1000 { + let i = rand::thread_rng().gen_range(0, size); + let mut j = rand::thread_rng().gen_range(0, size); + while j == i { + j = rand::thread_rng().gen_range(0, size); + } + dense.add_assign_rows(i, j, 0); + sparse.add_assign_rows(i, j, 0); + } + assert_matrices_eq(&dense, &sparse); + } } diff --git a/src/pi_solver.rs b/src/pi_solver.rs index bc61c90..41f5b82 100644 --- a/src/pi_solver.rs +++ b/src/pi_solver.rs @@ -812,16 +812,12 @@ impl IntermediateSymbolDecoder { #[inline(never)] fn fourth_phase(&mut self) { for i in 0..self.i { - for j in 0..self.u { - let b = self.A.get(i, j + self.i); - if b != Octet::zero() { - let temp = self.i; - #[cfg(debug_assertions)] - self.fma_rows(temp + j, i, b, 0); - // Skip applying to cols before i due to Errata 11 
- #[cfg(not(debug_assertions))] - self.fma_rows(temp + j, i, b, self.i); - } + for j in self.A.query_non_zero_columns(i, self.i) { + #[cfg(debug_assertions)] + self.fma_rows(j, i, Octet::one(), 0); + // Skip applying to cols before i due to Errata 11 + #[cfg(not(debug_assertions))] + self.fma_rows(j, i, Octet::one(), self.i); } } diff --git a/src/sparse_matrix.rs b/src/sparse_matrix.rs index c380178..df02ac0 100644 --- a/src/sparse_matrix.rs +++ b/src/sparse_matrix.rs @@ -18,11 +18,8 @@ pub struct SparseBinaryMatrix { height: usize, width: usize, sparse_elements: Vec, - // Note these are stored such that the right-most 64 elements of a row are in - // dense_elements[row], the second 64 elements are stored in dense_elements[height + row], then - // the next in dense_elements[height * 2 + row]. Elements are numbered right to left, - // so the right-most element is in dense_elements[row] & 0b1. The second right most is in - // dense_elements[row] & 0b2. + // Note these are stored right aligned, so that the right most element is always at + // dense_elements[x] & (1 << 63) dense_elements: Vec, // Columnar storage of values. 
Only stores rows that have a 1-valued entry in the given column sparse_columnar_values: Option, @@ -56,14 +53,33 @@ impl SparseBinaryMatrix { } } - // Returns (word in elements vec, and bit in word) for the given col - fn bit_position(&self, row: usize, col: usize) -> (usize, usize) { - return (self.height * (col / WORD_WIDTH) + row, col % WORD_WIDTH); + // Convert a logical col index to the bit index in the dense columns + fn logical_col_to_dense_col(&self, col: usize) -> usize { + assert!(col >= self.width - self.num_dense_columns); + col - (self.width - self.num_dense_columns) } - // Return the word in which bit lives - fn word_offset(bit: usize) -> usize { - bit / WORD_WIDTH + // Returns (word in elements vec, and bit in word) for the given col + fn bit_position(&self, row: usize, col: usize) -> (usize, usize) { + return ( + row * self.row_word_width() + self.word_offset(col), + (self.left_padding_bits() + col) % WORD_WIDTH, + ); + } + + // Number of words required per row + fn row_word_width(&self) -> usize { + (self.num_dense_columns + WORD_WIDTH - 1) / WORD_WIDTH + } + + // Returns the number of unused bits on the left of each row + fn left_padding_bits(&self) -> usize { + (WORD_WIDTH - (self.num_dense_columns % WORD_WIDTH)) % WORD_WIDTH + } + + // Return the word in which bit lives, offset from the first for a row + fn word_offset(&self, bit: usize) -> usize { + (self.left_padding_bits() + bit) / WORD_WIDTH } // Returns mask to select the given bit in a word @@ -122,7 +138,7 @@ impl BinaryMatrix for SparseBinaryMatrix { let physical_i = self.logical_row_to_physical[i] as usize; let physical_j = self.logical_col_to_physical[j] as usize; if self.width - j <= self.num_dense_columns { - let (word, bit) = self.bit_position(physical_i, self.width - j - 1); + let (word, bit) = self.bit_position(physical_i, self.logical_col_to_dense_col(j)); if value == Octet::zero() { SparseBinaryMatrix::clear_bit(&mut self.dense_elements[word], bit); } else { @@ -160,8 +176,46 @@ 
impl BinaryMatrix for SparseBinaryMatrix { fn get_sub_row_as_octets(&self, row: usize, start_col: usize) -> Vec { let first_dense_column = self.width - self.num_dense_columns; assert!(start_col >= first_dense_column); + // The following implementation is equivalent to .map(|x| self.get(row, x)) + // but this implementation optimizes for sequential access and avoids all the + // extra bit index math + let physical_row = self.logical_row_to_physical[row] as usize; + let (mut word, mut bit) = + self.bit_position(physical_row, self.logical_col_to_dense_col(start_col)); (start_col..self.width) - .map(|col| self.get(row, col).byte()) + .map(|_| { + let x = if self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) == 0 { + Octet::zero() + } else { + Octet::one() + }; + bit += 1; + if bit == WORD_WIDTH { + word += 1; + bit = 0; + } + x.byte() + }) + .collect() + } + + fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec { + // The following implementation is equivalent to .filter(|x| self.get(row, x) != Octet::zero()) + // but this implementation optimizes for sequential access and avoids all the + // extra bit index math + let physical_row = self.logical_row_to_physical[row] as usize; + let (mut word, mut bit) = + self.bit_position(physical_row, self.logical_col_to_dense_col(start_col)); + (start_col..self.width) + .filter(|_| { + let result = self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) != 0; + bit += 1; + if bit == WORD_WIDTH { + word += 1; + bit = 0; + } + result + }) .collect() } @@ -169,7 +223,7 @@ impl BinaryMatrix for SparseBinaryMatrix { let physical_i = self.logical_row_to_physical[i] as usize; let physical_j = self.logical_col_to_physical[j] as usize; if self.width - j <= self.num_dense_columns { - let (word, bit) = self.bit_position(physical_i, self.width - j - 1); + let (word, bit) = self.bit_position(physical_i, self.logical_col_to_dense_col(j)); if self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) 
== 0 { return Octet::zero(); } else { @@ -262,11 +316,25 @@ impl BinaryMatrix for SparseBinaryMatrix { ); assert_eq!(self.column_index_disabled, false); self.num_dense_columns += 1; - let (last_word, last_bit) = self.bit_position(self.height - 1, self.num_dense_columns - 1); + let (last_word, _) = self.bit_position(self.height - 1, self.num_dense_columns - 1); // If this is in a new word - if last_bit == 0 && last_word >= self.dense_elements.len() { + if last_word >= self.dense_elements.len() { // Append a new set of words + let mut src = self.dense_elements.len(); self.dense_elements.extend(vec![0; self.height]); + let mut dest = self.dense_elements.len(); + // Re-space the elements, so that each row has an empty word + while src > 0 { + src -= 1; + dest -= 1; + self.dense_elements[dest] = self.dense_elements[src]; + if dest % self.row_word_width() == 1 { + dest -= 1; + self.dense_elements[dest] = 0; + } + } + assert_eq!(src, 0); + assert_eq!(dest, 0); } let physical_i = self.logical_col_to_physical[i] as usize; for maybe_present_in_row in self @@ -277,7 +345,7 @@ impl BinaryMatrix for SparseBinaryMatrix { { let physical_row = *maybe_present_in_row as usize; if let Some(value) = self.sparse_elements[physical_row].remove(physical_i) { - let (word, bit) = self.bit_position(physical_row, self.num_dense_columns - 1); + let (word, bit) = self.bit_position(physical_row, 0); if value == Octet::zero() { SparseBinaryMatrix::clear_bit(&mut self.dense_elements[word], bit); } else { @@ -297,11 +365,10 @@ impl BinaryMatrix for SparseBinaryMatrix { let physical_src = self.logical_row_to_physical[src] as usize; // First handle the dense columns if self.num_dense_columns > 0 { - let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1; - for word in 0..words { - let (dest_word, _) = self.bit_position(physical_dest, word * WORD_WIDTH); - let (src_word, _) = self.bit_position(physical_src, word * WORD_WIDTH); - self.dense_elements[dest_word] ^= 
self.dense_elements[src_word]; + let (dest_word, _) = self.bit_position(physical_dest, 0); + let (src_word, _) = self.bit_position(physical_src, 0); + for word in 0..self.row_word_width() { + self.dense_elements[dest_word + word] ^= self.dense_elements[src_word + word]; } } @@ -352,14 +419,12 @@ impl BinaryMatrix for SparseBinaryMatrix { if columns_to_remove == 0 && self.num_dense_columns > 0 { // TODO: optimize to not allocate this extra vec - let mut new_dense = - vec![0; new_height * ((self.num_dense_columns - 1) / WORD_WIDTH + 1)]; - let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1; - for word in 0..words { - for logical_row in 0..new_height { - let physical_row = self.logical_row_to_physical[logical_row] as usize; - new_dense[word * new_height + logical_row] = - self.dense_elements[word * self.height + physical_row]; + let mut new_dense = vec![0; new_height * self.row_word_width()]; + for logical_row in 0..new_height { + let physical_row = self.logical_row_to_physical[logical_row] as usize; + for word in 0..self.row_word_width() { + new_dense[logical_row * self.row_word_width() + word] = + self.dense_elements[physical_row * self.row_word_width() + word]; } } self.dense_elements = new_dense;