transpose/out_of_place.rs
// Block size used by the tiling algorithms
const BLOCK_SIZE: usize = 16;
// Number of segments used by the segmented block transpose function
const NBR_SEGMENTS: usize = 4;
// Recursively split data until the number of rows and the number of columns are both at or below this limit
const RECURSIVE_LIMIT: usize = 128;
// Largest size for using the direct approach
const SMALL_LEN: usize = 255;
// Largest size for using the tiled approach
const MEDIUM_LEN: usize = 1024*1024;
/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// Benchmarking shows that loop tiling isn't effective for small arrays.
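/// For example (matching the doc example on `transpose` below), with `width = 3` and `height = 2`,
/// the element at `(x, y) = (1, 0)` is read from input index `1 + 0 * 3 = 1` and written to
/// output index `0 + 1 * 2 = 2`.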
unsafe fn transpose_small<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize) {
for x in 0..width {
for y in 0..height {
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
// SAFETY: width * height must equal input.len() and output.len(), start_x + block_width must be <= width, and start_y + block_height must be <= height
unsafe fn transpose_block<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
for inner_x in 0..block_width {
for inner_y in 0..block_height {
let x = start_x + inner_x;
let y = start_y + inner_y;
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
// SAFETY: width * height must equal input.len() and output.len(), start_x + block_width must be <= width, and start_y + block_height must be <= height
// This function works like `transpose_block`, but also divides the loop into a number of segments. This makes it more cache friendly for large sizes.
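// Note: `block_height` is assumed to be divisible by NBR_SEGMENTS; otherwise the integer division
// below would leave the last `block_height % NBR_SEGMENTS` rows of the block untransposed. In this
// file the function is only called with block_height == BLOCK_SIZE (16), which splits evenly into
// 4 segments of 4 rows each.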
unsafe fn transpose_block_segmented<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
let height_per_div = block_height/NBR_SEGMENTS;
for subblock in 0..NBR_SEGMENTS {
for inner_x in 0..block_width {
for inner_y in 0..height_per_div {
let x = start_x + inner_x;
let y = start_y + inner_y + subblock*height_per_div;
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
}
/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// This algorithm divides the input into tiles of size BLOCK_SIZE*BLOCK_SIZE,
/// in order to reduce cache misses. This works well for medium sizes, when the
/// data for each tile fits in the caches.
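/// For example, with `input_width = 20` and BLOCK_SIZE = 16, each row of blocks consists of one
/// full 16-column block followed by a 4-column remainder block.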
fn transpose_tiled<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
let x_block_count = input_width / BLOCK_SIZE;
let y_block_count = input_height / BLOCK_SIZE;
let remainder_x = input_width - x_block_count * BLOCK_SIZE;
let remainder_y = input_height - y_block_count * BLOCK_SIZE;
for y_block in 0..y_block_count {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
input_width, input_height,
x_block * BLOCK_SIZE, y_block * BLOCK_SIZE,
BLOCK_SIZE, BLOCK_SIZE,
);
}
}
//if the input_width is not cleanly divisible by BLOCK_SIZE, there are still a few columns that haven't been transposed
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
input_width, input_height,
input_width - remainder_x, y_block * BLOCK_SIZE,
remainder_x, BLOCK_SIZE);
}
}
}
//if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
if remainder_y > 0 {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
input_width, input_height,
x_block * BLOCK_SIZE, input_height - remainder_y,
BLOCK_SIZE, remainder_y,
);
}
}
//if the input_width is also not cleanly divisible by BLOCK_SIZE, there is still a corner block of rows and columns that hasn't been transposed
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
input_width, input_height,
input_width - remainder_x, input_height - remainder_y,
remainder_x, remainder_y);
}
}
}
}
/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// This is a recursive algorithm that divides the array into smaller pieces, until they are small enough to
/// transpose directly without worrying about cache misses.
/// Once they are small enough, they are transposed using a tiling algorithm.
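/// The split always halves the longer dimension: for example, a 1000-row by 300-column region is
/// first divided into two 500-row halves, since `nbr_rows >= nbr_cols`.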
fn transpose_recursive<T: Copy>(input: &[T], output: &mut [T], row_start: usize, row_end: usize, col_start: usize, col_end: usize, total_columns: usize, total_rows: usize) {
let nbr_rows = row_end - row_start;
let nbr_cols = col_end - col_start;
if (nbr_rows <= RECURSIVE_LIMIT && nbr_cols <= RECURSIVE_LIMIT) || nbr_rows <= 2 || nbr_cols <= 2 {
let x_block_count = nbr_cols / BLOCK_SIZE;
let y_block_count = nbr_rows / BLOCK_SIZE;
let remainder_x = nbr_cols - x_block_count * BLOCK_SIZE;
let remainder_y = nbr_rows - y_block_count * BLOCK_SIZE;
for y_block in 0..y_block_count {
for x_block in 0..x_block_count {
unsafe {
transpose_block_segmented(
input, output,
total_columns, total_rows,
col_start + x_block * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
BLOCK_SIZE, BLOCK_SIZE,
);
}
}
//if nbr_cols is not cleanly divisible by BLOCK_SIZE, there are still a few columns that haven't been transposed
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block_count * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
remainder_x, BLOCK_SIZE);
}
}
}
//if nbr_rows is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
if remainder_y > 0 {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
BLOCK_SIZE, remainder_y,
);
}
}
//if nbr_cols is also not cleanly divisible by BLOCK_SIZE, there is still a corner block of rows and columns that hasn't been transposed
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block_count * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
remainder_x, remainder_y);
}
}
}
} else if nbr_rows >= nbr_cols {
transpose_recursive(input, output, row_start, row_start + (nbr_rows / 2), col_start, col_end, total_columns, total_rows);
transpose_recursive(input, output, row_start + (nbr_rows / 2), row_end, col_start, col_end, total_columns, total_rows);
} else {
transpose_recursive(input, output, row_start, row_end, col_start, col_start + (nbr_cols / 2), total_columns, total_rows);
transpose_recursive(input, output, row_start, row_end, col_start + (nbr_cols / 2), col_end, total_columns, total_rows);
}
}
/// Transpose the input array into the output array.
///
/// Given an input array of size input_width * input_height, representing flattened 2D data stored in row-major order,
/// transpose the rows and columns of that input array into the output array
/// ```
/// // row-major order: the rows of our 2D array are contiguous,
/// // and the columns are strided
/// let input_array = vec![ 1, 2, 3,
/// 4, 5, 6];
///
/// // Treat our 6-element array as a 2D 3x2 array, and transpose it to a 2x3 array
/// let mut output_array = vec![0; 6];
/// transpose::transpose(&input_array, &mut output_array, 3, 2);
///
/// // The rows have become the columns, and the columns have become the rows
/// let expected_array = vec![ 1, 4,
/// 2, 5,
/// 3, 6];
/// assert_eq!(output_array, expected_array);
///
/// // If we transpose it again, we should get our original data back.
/// let mut final_array = vec![0; 6];
/// transpose::transpose(&output_array, &mut final_array, 2, 3);
/// assert_eq!(final_array, input_array);
/// ```
///
/// # Panics
///
/// Panics if `input.len() != input_width * input_height` or if `output.len() != input_width * input_height`
pub fn transpose<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
assert_eq!(input_width.checked_mul(input_height), Some(input.len()));
assert_eq!(input.len(), output.len());
if input.len() <= SMALL_LEN {
unsafe { transpose_small(input, output, input_width, input_height) };
}
else if input.len() <= MEDIUM_LEN {
transpose_tiled(input, output, input_width, input_height);
}
else {
transpose_recursive(input, output, 0, input_height, 0, input_width, input_width, input_height);
}
}
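// A minimal round-trip sanity check, included as a sketch rather than an exhaustive test suite.
// It assumes this file is compiled as a module of the crate so that `transpose` above is in scope
// via `super`; the module name, test name, and dimensions below are illustrative.
#[cfg(test)]
mod roundtrip_sketch {
    use super::transpose;

    #[test]
    fn transposing_twice_returns_the_original() {
        let width = 300;
        let height = 7;
        // 300 * 7 = 2100 elements, which is larger than SMALL_LEN, so this also
        // exercises the tiled code path.
        let input: Vec<u32> = (0..(width * height) as u32).collect();
        let mut transposed = vec![0u32; width * height];
        let mut roundtrip = vec![0u32; width * height];

        // Transpose width x height into height x width, then transpose back.
        transpose(&input, &mut transposed, width, height);
        transpose(&transposed, &mut roundtrip, height, width);

        assert_eq!(input, roundtrip);
    }
}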