transpose/
out_of_place.rs

1// Block size used by the tiling algoritms
2const BLOCK_SIZE: usize = 16;
3// Number of segments used by the segmented block transpose function
4const NBR_SEGMENTS: usize = 4;
5// recursively split data until the number of rows and columns is below this number
6const RECURSIVE_LIMIT: usize = 128;
7
8// Largest size for for using the direct approach
9const SMALL_LEN: usize = 255;
10// Largest size for using the tiled approach
11const MEDIUM_LEN: usize = 1024*1024;
12
13
14/// Given an array of size width * height, representing a flattened 2D array,
15/// transpose the rows and columns of that 2D array into the output.
16/// Benchmarking shows that loop tiling isn't effective for small arrays.
17unsafe fn transpose_small<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize) {
18    for x in 0..width {
19        for y in 0..height {
20            let input_index = x + y * width;
21            let output_index = y + x * height;
22
23            *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
24        }
25    }
26}
27
28// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
29// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
30unsafe fn transpose_block<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
31    for inner_x in 0..block_width {
32        for inner_y in 0..block_height {
33            let x = start_x + inner_x;
34            let y = start_y + inner_y;
35
36            let input_index = x + y * width;
37            let output_index = y + x * height;
38
39            *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
40        }
41    }
42}
43
44// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
45// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
46// This function works as `transpose_block`, but also divides the loop into a number of segments. This makes it more cache fiendly for large sizes.
47unsafe fn transpose_block_segmented<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
48    let height_per_div = block_height/NBR_SEGMENTS;
49    for subblock in 0..NBR_SEGMENTS {
50        for inner_x in 0..block_width {
51            for inner_y in 0..height_per_div {
52                let x = start_x + inner_x;
53                let y = start_y + inner_y + subblock*height_per_div;
54
55                let input_index = x + y * width;
56                let output_index = y + x * height;
57
58                *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
59            }
60        }
61    }
62}
63
64/// Given an array of size width * height, representing a flattened 2D array,
65/// transpose the rows and columns of that 2D array into the output.
66/// This algorithm divides the input into tiles of size BLOCK_SIZE*BLOCK_SIZE, 
67/// in order to reduce cache misses. This works well for medium sizes, when the
68/// data for each tile fits in the caches.  
69fn transpose_tiled<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
70
71    let x_block_count = input_width / BLOCK_SIZE;
72    let y_block_count = input_height / BLOCK_SIZE;
73
74    let remainder_x = input_width - x_block_count * BLOCK_SIZE;
75    let remainder_y = input_height - y_block_count * BLOCK_SIZE;
76
77    for y_block in 0..y_block_count {
78        for x_block in 0..x_block_count {
79            unsafe {
80                transpose_block(
81                    input, output,
82                    input_width, input_height,
83                    x_block * BLOCK_SIZE, y_block * BLOCK_SIZE,
84                    BLOCK_SIZE, BLOCK_SIZE,
85                    );
86            }
87        }
88
89        //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
90        if remainder_x > 0 {
91            unsafe {
92                transpose_block(
93                    input, output, 
94                    input_width, input_height, 
95                    input_width - remainder_x, y_block * BLOCK_SIZE, 
96                    remainder_x, BLOCK_SIZE);
97            }
98        }
99    }
100
101    //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
102    if remainder_y > 0 {
103        for x_block in 0..x_block_count {
104            unsafe {
105                transpose_block(
106                    input, output,
107                    input_width, input_height,
108                    x_block * BLOCK_SIZE, input_height - remainder_y,
109                    BLOCK_SIZE, remainder_y,
110                    );
111            }
112        }
113
114        //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
115        if remainder_x > 0 {
116            unsafe {
117                transpose_block(
118                    input, output,
119                    input_width, input_height, 
120                    input_width - remainder_x, input_height - remainder_y, 
121                    remainder_x, remainder_y);
122            }
123        }
124    } 
125}
126
127/// Given an array of size width * height, representing a flattened 2D array,
128/// transpose the rows and columns of that 2D array into the output.
129/// This is a recursive algorithm that divides the array into smaller pieces, until they are small enough to
130/// transpose directly without worrying about cache misses.
131/// Once they are small enough, they are transposed using a tiling algorithm. 
132fn transpose_recursive<T: Copy>(input: &[T], output: &mut [T], row_start: usize, row_end: usize, col_start: usize, col_end:  usize, total_columns: usize, total_rows: usize) {
133    let nbr_rows = row_end - row_start; 
134    let nbr_cols = col_end - col_start;
135    if (nbr_rows <= RECURSIVE_LIMIT && nbr_cols <= RECURSIVE_LIMIT) || nbr_rows<=2 || nbr_cols<=2 {
136        let x_block_count = nbr_cols / BLOCK_SIZE;
137        let y_block_count = nbr_rows / BLOCK_SIZE;
138
139        let remainder_x = nbr_cols - x_block_count * BLOCK_SIZE;
140        let remainder_y = nbr_rows - y_block_count * BLOCK_SIZE;
141
142
143        for y_block in 0..y_block_count {
144            for x_block in 0..x_block_count {
145                unsafe {
146                    transpose_block_segmented(
147                        input, output,
148                        total_columns, total_rows,
149                        col_start + x_block * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
150                        BLOCK_SIZE, BLOCK_SIZE,
151                        );
152                }
153            }
154
155            //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
156            if remainder_x > 0 {
157                unsafe {
158                    transpose_block(
159                        input, output,
160                        total_columns, total_rows,
161                        col_start + x_block_count * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE, 
162                        remainder_x, BLOCK_SIZE);
163                }
164            }
165        }
166
167        //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
168        if remainder_y > 0 {
169            for x_block in 0..x_block_count {
170                unsafe {
171                    transpose_block(
172                        input, output,
173                        total_columns, total_rows,
174                        col_start + x_block * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
175                        BLOCK_SIZE, remainder_y,
176                        );
177                }
178            }
179        
180            //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
181            if remainder_x > 0 {
182                unsafe {
183                    transpose_block(
184                        input, output,
185                        total_columns, total_rows,
186                        col_start + x_block_count * BLOCK_SIZE,  row_start + y_block_count * BLOCK_SIZE, 
187                        remainder_x, remainder_y);
188                }
189            }
190        } 
191    } else if nbr_rows >= nbr_cols {
192        transpose_recursive(input, output, row_start, row_start + (nbr_rows / 2), col_start, col_end, total_columns, total_rows);
193        transpose_recursive(input, output, row_start + (nbr_rows / 2), row_end, col_start, col_end, total_columns, total_rows);
194    } else {
195        transpose_recursive(input, output, row_start, row_end, col_start, col_start + (nbr_cols / 2), total_columns, total_rows);
196        transpose_recursive(input, output, row_start, row_end, col_start + (nbr_cols / 2), col_end, total_columns, total_rows);
197    }
198}
199
200
201/// Transpose the input array into the output array. 
202///
203/// Given an input array of size input_width * input_height, representing flattened 2D data stored in row-major order,
204/// transpose the rows and columns of that input array into the output array
205/// ```
206/// // row-major order: the rows of our 2D array are contiguous,
207/// // and the columns are strided
208/// let input_array = vec![ 1, 2, 3,
209/// 						4, 5, 6];
210/// 
211/// // Treat our 6-element array as a 2D 3x2 array, and transpose it to a 2x3 array
212/// let mut output_array = vec![0; 6];
213/// transpose::transpose(&input_array, &mut output_array, 3, 2);
214///
215/// // The rows have become the columns, and the columns have become the rows
216/// let expected_array =  vec![ 1, 4,
217///								2, 5,
218///								3, 6];
219/// assert_eq!(output_array, expected_array);
220///
221/// // If we transpose it again, we should get our original data back.
222/// let mut final_array = vec![0; 6];
223/// transpose::transpose(&output_array, &mut final_array, 2, 3);
224/// assert_eq!(final_array, input_array);
225/// ```
226///
227/// # Panics
228/// 
229/// Panics if `input.len() != input_width * input_height` or if `output.len() != input_width * input_height`
230pub fn transpose<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
231    assert_eq!(input_width.checked_mul(input_height), Some(input.len()));
232    assert_eq!(input.len(), output.len());
233    if input.len() <= SMALL_LEN {
234        unsafe { transpose_small(input, output, input_width, input_height) };
235    }
236    else if input.len() <= MEDIUM_LEN {
237        transpose_tiled(input, output, input_width, input_height);
238    }
239    else {
240        transpose_recursive(input, output, 0, input_height, 0, input_width, input_width, input_height);
241    }
242}
243