Optimize memory layout of dense U matrix

Previously we used column-major ordering. Switch to row-major ordering to
optimize sequential access of rows, which is much more common in the
first phase and can also be used in the fourth phase.

This improves performance by ~10% on large symbol counts.
This commit is contained in:
Christopher Berner 2020-11-28 17:05:44 -08:00
parent 6245ab1c9a
commit c4d227fba1
4 changed files with 172 additions and 79 deletions

@ -23,51 +23,51 @@ The following were run on an Intel Core i5-6600K @ 3.50GHz
```
Symbol size: 1280 bytes (without pre-built plan)
symbol count = 10, encoded 127 MB in 0.545secs, throughput: 1878.8Mbit/s
symbol count = 100, encoded 127 MB in 0.645secs, throughput: 1586.7Mbit/s
symbol count = 250, encoded 127 MB in 0.509secs, throughput: 2009.7Mbit/s
symbol count = 500, encoded 127 MB in 0.503secs, throughput: 2028.8Mbit/s
symbol count = 1000, encoded 126 MB in 0.544secs, throughput: 1867.0Mbit/s
symbol count = 2000, encoded 126 MB in 0.628secs, throughput: 1617.2Mbit/s
symbol count = 5000, encoded 122 MB in 0.686secs, throughput: 1423.6Mbit/s
symbol count = 10000, encoded 122 MB in 0.833secs, throughput: 1172.3Mbit/s
symbol count = 20000, encoded 122 MB in 1.234secs, throughput: 791.4Mbit/s
symbol count = 50000, encoded 122 MB in 1.786secs, throughput: 546.8Mbit/s
symbol count = 10, encoded 127 MB in 0.548secs, throughput: 1868.5Mbit/s
symbol count = 100, encoded 127 MB in 0.637secs, throughput: 1606.7Mbit/s
symbol count = 250, encoded 127 MB in 0.507secs, throughput: 2017.7Mbit/s
symbol count = 500, encoded 127 MB in 0.488secs, throughput: 2091.2Mbit/s
symbol count = 1000, encoded 126 MB in 0.523secs, throughput: 1941.9Mbit/s
symbol count = 2000, encoded 126 MB in 0.599secs, throughput: 1695.5Mbit/s
symbol count = 5000, encoded 122 MB in 0.636secs, throughput: 1535.5Mbit/s
symbol count = 10000, encoded 122 MB in 0.769secs, throughput: 1269.9Mbit/s
symbol count = 20000, encoded 122 MB in 1.122secs, throughput: 870.4Mbit/s
symbol count = 50000, encoded 122 MB in 1.597secs, throughput: 611.5Mbit/s
Symbol size: 1280 bytes (with pre-built plan)
symbol count = 10, encoded 127 MB in 0.221secs, throughput: 4633.1Mbit/s
symbol count = 100, encoded 127 MB in 0.149secs, throughput: 6868.7Mbit/s
symbol count = 250, encoded 127 MB in 0.164secs, throughput: 6237.5Mbit/s
symbol count = 500, encoded 127 MB in 0.169secs, throughput: 6038.5Mbit/s
symbol count = 1000, encoded 126 MB in 0.178secs, throughput: 5705.8Mbit/s
symbol count = 2000, encoded 126 MB in 0.214secs, throughput: 4745.9Mbit/s
symbol count = 5000, encoded 122 MB in 0.262secs, throughput: 3727.3Mbit/s
symbol count = 10000, encoded 122 MB in 0.344secs, throughput: 2838.8Mbit/s
symbol count = 20000, encoded 122 MB in 0.427secs, throughput: 2287.0Mbit/s
symbol count = 50000, encoded 122 MB in 0.541secs, throughput: 1805.1Mbit/s
symbol count = 100, encoded 127 MB in 0.154secs, throughput: 6645.7Mbit/s
symbol count = 250, encoded 127 MB in 0.160secs, throughput: 6393.4Mbit/s
symbol count = 500, encoded 127 MB in 0.163secs, throughput: 6260.8Mbit/s
symbol count = 1000, encoded 126 MB in 0.173secs, throughput: 5870.7Mbit/s
symbol count = 2000, encoded 126 MB in 0.199secs, throughput: 5103.6Mbit/s
symbol count = 5000, encoded 122 MB in 0.255secs, throughput: 3829.7Mbit/s
symbol count = 10000, encoded 122 MB in 0.339secs, throughput: 2880.7Mbit/s
symbol count = 20000, encoded 122 MB in 0.425secs, throughput: 2297.8Mbit/s
symbol count = 50000, encoded 122 MB in 0.536secs, throughput: 1821.9Mbit/s
Symbol size: 1280 bytes
symbol count = 10, decoded 127 MB in 0.749secs using 0.0% overhead, throughput: 1367.1Mbit/s
symbol count = 100, decoded 127 MB in 0.742secs using 0.0% overhead, throughput: 1379.3Mbit/s
symbol count = 250, decoded 127 MB in 0.589secs using 0.0% overhead, throughput: 1736.8Mbit/s
symbol count = 500, decoded 127 MB in 0.594secs using 0.0% overhead, throughput: 1718.0Mbit/s
symbol count = 1000, decoded 126 MB in 0.638secs using 0.0% overhead, throughput: 1591.9Mbit/s
symbol count = 2000, decoded 126 MB in 0.718secs using 0.0% overhead, throughput: 1414.5Mbit/s
symbol count = 5000, decoded 122 MB in 0.829secs using 0.0% overhead, throughput: 1178.0Mbit/s
symbol count = 10000, decoded 122 MB in 1.049secs using 0.0% overhead, throughput: 930.9Mbit/s
symbol count = 20000, decoded 122 MB in 1.382secs using 0.0% overhead, throughput: 706.6Mbit/s
symbol count = 50000, decoded 122 MB in 2.355secs using 0.0% overhead, throughput: 414.7Mbit/s
symbol count = 10, decoded 127 MB in 0.758secs using 0.0% overhead, throughput: 1350.8Mbit/s
symbol count = 100, decoded 127 MB in 0.740secs using 0.0% overhead, throughput: 1383.0Mbit/s
symbol count = 250, decoded 127 MB in 0.577secs using 0.0% overhead, throughput: 1772.9Mbit/s
symbol count = 500, decoded 127 MB in 0.582secs using 0.0% overhead, throughput: 1753.4Mbit/s
symbol count = 1000, decoded 126 MB in 0.628secs using 0.0% overhead, throughput: 1617.2Mbit/s
symbol count = 2000, decoded 126 MB in 0.684secs using 0.0% overhead, throughput: 1484.8Mbit/s
symbol count = 5000, decoded 122 MB in 0.785secs using 0.0% overhead, throughput: 1244.0Mbit/s
symbol count = 10000, decoded 122 MB in 0.965secs using 0.0% overhead, throughput: 1012.0Mbit/s
symbol count = 20000, decoded 122 MB in 1.345secs using 0.0% overhead, throughput: 726.1Mbit/s
symbol count = 50000, decoded 122 MB in 2.101secs using 0.0% overhead, throughput: 464.8Mbit/s
symbol count = 10, decoded 127 MB in 0.740secs using 5.0% overhead, throughput: 1383.7Mbit/s
symbol count = 100, decoded 127 MB in 0.747secs using 5.0% overhead, throughput: 1370.1Mbit/s
symbol count = 250, decoded 127 MB in 0.581secs using 5.0% overhead, throughput: 1760.7Mbit/s
symbol count = 500, decoded 127 MB in 0.565secs using 5.0% overhead, throughput: 1806.2Mbit/s
symbol count = 1000, decoded 126 MB in 0.605secs using 5.0% overhead, throughput: 1678.7Mbit/s
symbol count = 2000, decoded 126 MB in 0.656secs using 5.0% overhead, throughput: 1548.2Mbit/s
symbol count = 5000, decoded 122 MB in 0.763secs using 5.0% overhead, throughput: 1279.9Mbit/s
symbol count = 10000, decoded 122 MB in 0.959secs using 5.0% overhead, throughput: 1018.3Mbit/s
symbol count = 20000, decoded 122 MB in 1.242secs using 5.0% overhead, throughput: 786.3Mbit/s
symbol count = 50000, decoded 122 MB in 2.146secs using 5.0% overhead, throughput: 455.1Mbit/s
symbol count = 10, decoded 127 MB in 0.753secs using 5.0% overhead, throughput: 1359.8Mbit/s
symbol count = 100, decoded 127 MB in 0.731secs using 5.0% overhead, throughput: 1400.1Mbit/s
symbol count = 250, decoded 127 MB in 0.575secs using 5.0% overhead, throughput: 1779.0Mbit/s
symbol count = 500, decoded 127 MB in 0.552secs using 5.0% overhead, throughput: 1848.7Mbit/s
symbol count = 1000, decoded 126 MB in 0.568secs using 5.0% overhead, throughput: 1788.1Mbit/s
symbol count = 2000, decoded 126 MB in 0.626secs using 5.0% overhead, throughput: 1622.4Mbit/s
symbol count = 5000, decoded 122 MB in 0.713secs using 5.0% overhead, throughput: 1369.7Mbit/s
symbol count = 10000, decoded 122 MB in 0.893secs using 5.0% overhead, throughput: 1093.6Mbit/s
symbol count = 20000, decoded 122 MB in 1.147secs using 5.0% overhead, throughput: 851.4Mbit/s
symbol count = 50000, decoded 122 MB in 1.943secs using 5.0% overhead, throughput: 502.6Mbit/s
```
### Public API

@ -27,6 +27,9 @@ pub trait BinaryMatrix: Clone {
// Get a slice of columns from a row as Octets
fn get_sub_row_as_octets(&self, row: usize, start_col: usize) -> Vec<u8>;
// Returns a list of columns with non-zero values in the given row, starting with start_col
fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec<usize>;
fn get(&self, i: usize, j: usize) -> Octet;
fn swap_rows(&mut self, i: usize, j: usize);
@ -177,6 +180,12 @@ impl BinaryMatrix for DenseBinaryMatrix {
result
}
// Collects, in ascending order, every column index in start_col..self.width whose entry
// in `row` is not Octet::zero()
fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec<usize> {
    let mut non_zero = Vec::new();
    for col in start_col..self.width {
        if self.get(row, col) != Octet::zero() {
            non_zero.push(col);
        }
    }
    non_zero
}
fn get(&self, i: usize, j: usize) -> Octet {
let (word, bit) = DenseBinaryMatrix::bit_position(j);
if self.elements[i][word] & DenseBinaryMatrix::select_mask(bit) == 0 {
@ -337,4 +346,27 @@ mod tests {
sparse.hint_column_dense_and_frozen(5);
assert_matrices_eq(&dense, &sparse);
}
// Hints every column except the last as dense and frozen (right to left), then checks that
// random row additions leave the dense and sparse matrices equal
#[test]
fn dense_storage_math() {
    let size = 128;
    let (mut dense, mut sparse) = rand_dense_and_sparse(size);
    sparse.enable_column_access_acceleration();
    // Visits columns size - 2 down to 0, comparing the matrices after every hint
    for frozen in 1..size {
        let col = size - 1 - frozen;
        sparse.hint_column_dense_and_frozen(col);
        assert_matrices_eq(&dense, &sparse);
    }

    assert_matrices_eq(&dense, &sparse);
    sparse.disable_column_access_acceleration();

    let mut rng = rand::thread_rng();
    for _ in 0..1000 {
        let i = rng.gen_range(0, size);
        // Keep drawing until the second row index differs from the first
        let j = loop {
            let candidate = rng.gen_range(0, size);
            if candidate != i {
                break candidate;
            }
        };
        dense.add_assign_rows(i, j, 0);
        sparse.add_assign_rows(i, j, 0);
    }
    assert_matrices_eq(&dense, &sparse);
}
}

@ -812,16 +812,12 @@ impl<T: BinaryMatrix> IntermediateSymbolDecoder<T> {
#[inline(never)]
fn fourth_phase(&mut self) {
for i in 0..self.i {
for j in 0..self.u {
let b = self.A.get(i, j + self.i);
if b != Octet::zero() {
let temp = self.i;
#[cfg(debug_assertions)]
self.fma_rows(temp + j, i, b, 0);
// Skip applying to cols before i due to Errata 11
#[cfg(not(debug_assertions))]
self.fma_rows(temp + j, i, b, self.i);
}
for j in self.A.query_non_zero_columns(i, self.i) {
#[cfg(debug_assertions)]
self.fma_rows(j, i, Octet::one(), 0);
// Skip applying to cols before i due to Errata 11
#[cfg(not(debug_assertions))]
self.fma_rows(j, i, Octet::one(), self.i);
}
}

@ -18,11 +18,8 @@ pub struct SparseBinaryMatrix {
height: usize,
width: usize,
sparse_elements: Vec<SparseBinaryVec>,
// Note these are stored such that the right-most 64 elements of a row are in
// dense_elements[row], the second 64 elements are stored in dense_elements[height + row], then
// the next in dense_elements[height * 2 + row]. Elements are numbered right to left,
// so the right-most element is in dense_elements[row] & 0b1. The second right most is in
// dense_elements[row] & 0b2.
// Note these are stored right aligned, so that the right most element is always at
// dense_elements[x] & (1 << 63)
dense_elements: Vec<u64>,
// Columnar storage of values. Only stores rows that have a 1-valued entry in the given column
sparse_columnar_values: Option<ImmutableListMap>,
@ -56,14 +53,33 @@ impl SparseBinaryMatrix {
}
}
// Returns (word in elements vec, and bit in word) for the given col
fn bit_position(&self, row: usize, col: usize) -> (usize, usize) {
return (self.height * (col / WORD_WIDTH) + row, col % WORD_WIDTH);
// Translates a logical column index into its offset within the dense columns, i.e. its
// distance from the first dense column (width - num_dense_columns).
// Panics if `col` lies before the first dense column.
fn logical_col_to_dense_col(&self, col: usize) -> usize {
    assert!(col >= self.width - self.num_dense_columns);
    let first_dense_col = self.width - self.num_dense_columns;
    col - first_dense_col
}
// Return the word in which bit lives
fn word_offset(bit: usize) -> usize {
bit / WORD_WIDTH
// Locates dense column `col` of `row`: yields (index of the word in the elements vec,
// index of the bit inside that word)
fn bit_position(&self, row: usize, col: usize) -> (usize, usize) {
    // Each row occupies row_word_width() consecutive words
    let row_start = row * self.row_word_width();
    let word = row_start + self.word_offset(col);
    // Columns are shifted by the unused padding bits at the left of the row
    let bit = (self.left_padding_bits() + col) % WORD_WIDTH;
    (word, bit)
}
// Words needed to hold one row: num_dense_columns / WORD_WIDTH, rounded up
fn row_word_width(&self) -> usize {
    let rounded_up = self.num_dense_columns + WORD_WIDTH - 1;
    rounded_up / WORD_WIDTH
}
// Count of unused bits at the left of every row: zero when num_dense_columns is a multiple
// of WORD_WIDTH, otherwise whatever is needed to fill the partially used word
fn left_padding_bits(&self) -> usize {
    match self.num_dense_columns % WORD_WIDTH {
        0 => 0,
        used => WORD_WIDTH - used,
    }
}
// Index of the word holding `bit`, counted from the first word of a row. The left padding
// is added first because the bits of a row start after it.
fn word_offset(&self, bit: usize) -> usize {
    let padded_bit = self.left_padding_bits() + bit;
    padded_bit / WORD_WIDTH
}
// Returns mask to select the given bit in a word
@ -122,7 +138,7 @@ impl BinaryMatrix for SparseBinaryMatrix {
let physical_i = self.logical_row_to_physical[i] as usize;
let physical_j = self.logical_col_to_physical[j] as usize;
if self.width - j <= self.num_dense_columns {
let (word, bit) = self.bit_position(physical_i, self.width - j - 1);
let (word, bit) = self.bit_position(physical_i, self.logical_col_to_dense_col(j));
if value == Octet::zero() {
SparseBinaryMatrix::clear_bit(&mut self.dense_elements[word], bit);
} else {
@ -160,8 +176,46 @@ impl BinaryMatrix for SparseBinaryMatrix {
// Returns the entries of `row` for columns start_col..self.width, one byte per column.
// Panics if `start_col` lies before the first dense column.
fn get_sub_row_as_octets(&self, row: usize, start_col: usize) -> Vec<u8> {
let first_dense_column = self.width - self.num_dense_columns;
assert!(start_col >= first_dense_column);
// The following implementation is equivalent to .map(|x| self.get(row, x))
// but this implementation optimizes for sequential access and avoids all the
// extra bit index math
let physical_row = self.logical_row_to_physical[row] as usize;
// (word, bit) is a cursor into dense_elements, starting at start_col
let (mut word, mut bit) =
self.bit_position(physical_row, self.logical_col_to_dense_col(start_col));
(start_col..self.width)
.map(|col| self.get(row, col).byte())
.map(|_| {
// Read the bit under the cursor as an Octet
let x = if self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) == 0 {
Octet::zero()
} else {
Octet::one()
};
// Advance the cursor by one column, moving on to the next word after the last bit
bit += 1;
if bit == WORD_WIDTH {
word += 1;
bit = 0;
}
x.byte()
})
.collect()
}
fn query_non_zero_columns(&self, row: usize, start_col: usize) -> Vec<usize> {
    // Produces the same result as filtering start_col..self.width with
    // self.get(row, col) != Octet::zero(), but walks the row's bits sequentially rather
    // than redoing the bit index math for every column
    let physical_row = self.logical_row_to_physical[row] as usize;
    let dense_col = self.logical_col_to_dense_col(start_col);
    let (mut word_index, mut bit_index) = self.bit_position(physical_row, dense_col);

    let mut non_zero = Vec::new();
    for col in start_col..self.width {
        let word = self.dense_elements[word_index];
        if word & SparseBinaryMatrix::select_mask(bit_index) != 0 {
            non_zero.push(col);
        }
        // Step to the next bit, rolling over into the following word once this one is used up
        bit_index += 1;
        if bit_index == WORD_WIDTH {
            word_index += 1;
            bit_index = 0;
        }
    }
    non_zero
}
@ -169,7 +223,7 @@ impl BinaryMatrix for SparseBinaryMatrix {
let physical_i = self.logical_row_to_physical[i] as usize;
let physical_j = self.logical_col_to_physical[j] as usize;
if self.width - j <= self.num_dense_columns {
let (word, bit) = self.bit_position(physical_i, self.width - j - 1);
let (word, bit) = self.bit_position(physical_i, self.logical_col_to_dense_col(j));
if self.dense_elements[word] & SparseBinaryMatrix::select_mask(bit) == 0 {
return Octet::zero();
} else {
@ -262,11 +316,25 @@ impl BinaryMatrix for SparseBinaryMatrix {
);
assert_eq!(self.column_index_disabled, false);
self.num_dense_columns += 1;
let (last_word, last_bit) = self.bit_position(self.height - 1, self.num_dense_columns - 1);
let (last_word, _) = self.bit_position(self.height - 1, self.num_dense_columns - 1);
// If this is in a new word
if last_bit == 0 && last_word >= self.dense_elements.len() {
if last_word >= self.dense_elements.len() {
// Append a new set of words
let mut src = self.dense_elements.len();
self.dense_elements.extend(vec![0; self.height]);
let mut dest = self.dense_elements.len();
// Re-space the elements, so that each row has an empty word
while src > 0 {
src -= 1;
dest -= 1;
self.dense_elements[dest] = self.dense_elements[src];
if dest % self.row_word_width() == 1 {
dest -= 1;
self.dense_elements[dest] = 0;
}
}
assert_eq!(src, 0);
assert_eq!(dest, 0);
}
let physical_i = self.logical_col_to_physical[i] as usize;
for maybe_present_in_row in self
@ -277,7 +345,7 @@ impl BinaryMatrix for SparseBinaryMatrix {
{
let physical_row = *maybe_present_in_row as usize;
if let Some(value) = self.sparse_elements[physical_row].remove(physical_i) {
let (word, bit) = self.bit_position(physical_row, self.num_dense_columns - 1);
let (word, bit) = self.bit_position(physical_row, 0);
if value == Octet::zero() {
SparseBinaryMatrix::clear_bit(&mut self.dense_elements[word], bit);
} else {
@ -297,11 +365,10 @@ impl BinaryMatrix for SparseBinaryMatrix {
let physical_src = self.logical_row_to_physical[src] as usize;
// First handle the dense columns
if self.num_dense_columns > 0 {
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
for word in 0..words {
let (dest_word, _) = self.bit_position(physical_dest, word * WORD_WIDTH);
let (src_word, _) = self.bit_position(physical_src, word * WORD_WIDTH);
self.dense_elements[dest_word] ^= self.dense_elements[src_word];
let (dest_word, _) = self.bit_position(physical_dest, 0);
let (src_word, _) = self.bit_position(physical_src, 0);
for word in 0..self.row_word_width() {
self.dense_elements[dest_word + word] ^= self.dense_elements[src_word + word];
}
}
@ -352,14 +419,12 @@ impl BinaryMatrix for SparseBinaryMatrix {
if columns_to_remove == 0 && self.num_dense_columns > 0 {
// TODO: optimize to not allocate this extra vec
let mut new_dense =
vec![0; new_height * ((self.num_dense_columns - 1) / WORD_WIDTH + 1)];
let words = SparseBinaryMatrix::word_offset(self.num_dense_columns - 1) + 1;
for word in 0..words {
for logical_row in 0..new_height {
let physical_row = self.logical_row_to_physical[logical_row] as usize;
new_dense[word * new_height + logical_row] =
self.dense_elements[word * self.height + physical_row];
let mut new_dense = vec![0; new_height * self.row_word_width()];
for logical_row in 0..new_height {
let physical_row = self.logical_row_to_physical[logical_row] as usize;
for word in 0..self.row_word_width() {
new_dense[logical_row * self.row_word_width() + word] =
self.dense_elements[physical_row * self.row_word_width() + word];
}
}
self.dense_elements = new_dense;