transpose/out_of_place.rs
1// Block size used by the tiling algoritms
2const BLOCK_SIZE: usize = 16;
3// Number of segments used by the segmented block transpose function
4const NBR_SEGMENTS: usize = 4;
5// recursively split data until the number of rows and columns is below this number
6const RECURSIVE_LIMIT: usize = 128;
7
8// Largest size for for using the direct approach
9const SMALL_LEN: usize = 255;
10// Largest size for using the tiled approach
11const MEDIUM_LEN: usize = 1024*1024;
12
13
14/// Given an array of size width * height, representing a flattened 2D array,
15/// transpose the rows and columns of that 2D array into the output.
16/// Benchmarking shows that loop tiling isn't effective for small arrays.
17unsafe fn transpose_small<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize) {
18 for x in 0..width {
19 for y in 0..height {
20 let input_index = x + y * width;
21 let output_index = y + x * height;
22
23 *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
24 }
25 }
26}
27
28// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
29// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
30unsafe fn transpose_block<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
31 for inner_x in 0..block_width {
32 for inner_y in 0..block_height {
33 let x = start_x + inner_x;
34 let y = start_y + inner_y;
35
36 let input_index = x + y * width;
37 let output_index = y + x * height;
38
39 *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
40 }
41 }
42}
43
44// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
45// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
46// This function works as `transpose_block`, but also divides the loop into a number of segments. This makes it more cache fiendly for large sizes.
47unsafe fn transpose_block_segmented<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
48 let height_per_div = block_height/NBR_SEGMENTS;
49 for subblock in 0..NBR_SEGMENTS {
50 for inner_x in 0..block_width {
51 for inner_y in 0..height_per_div {
52 let x = start_x + inner_x;
53 let y = start_y + inner_y + subblock*height_per_div;
54
55 let input_index = x + y * width;
56 let output_index = y + x * height;
57
58 *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
59 }
60 }
61 }
62}
63
64/// Given an array of size width * height, representing a flattened 2D array,
65/// transpose the rows and columns of that 2D array into the output.
66/// This algorithm divides the input into tiles of size BLOCK_SIZE*BLOCK_SIZE,
67/// in order to reduce cache misses. This works well for medium sizes, when the
68/// data for each tile fits in the caches.
69fn transpose_tiled<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
70
71 let x_block_count = input_width / BLOCK_SIZE;
72 let y_block_count = input_height / BLOCK_SIZE;
73
74 let remainder_x = input_width - x_block_count * BLOCK_SIZE;
75 let remainder_y = input_height - y_block_count * BLOCK_SIZE;
76
77 for y_block in 0..y_block_count {
78 for x_block in 0..x_block_count {
79 unsafe {
80 transpose_block(
81 input, output,
82 input_width, input_height,
83 x_block * BLOCK_SIZE, y_block * BLOCK_SIZE,
84 BLOCK_SIZE, BLOCK_SIZE,
85 );
86 }
87 }
88
89 //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
90 if remainder_x > 0 {
91 unsafe {
92 transpose_block(
93 input, output,
94 input_width, input_height,
95 input_width - remainder_x, y_block * BLOCK_SIZE,
96 remainder_x, BLOCK_SIZE);
97 }
98 }
99 }
100
101 //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
102 if remainder_y > 0 {
103 for x_block in 0..x_block_count {
104 unsafe {
105 transpose_block(
106 input, output,
107 input_width, input_height,
108 x_block * BLOCK_SIZE, input_height - remainder_y,
109 BLOCK_SIZE, remainder_y,
110 );
111 }
112 }
113
114 //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
115 if remainder_x > 0 {
116 unsafe {
117 transpose_block(
118 input, output,
119 input_width, input_height,
120 input_width - remainder_x, input_height - remainder_y,
121 remainder_x, remainder_y);
122 }
123 }
124 }
125}
126
127/// Given an array of size width * height, representing a flattened 2D array,
128/// transpose the rows and columns of that 2D array into the output.
129/// This is a recursive algorithm that divides the array into smaller pieces, until they are small enough to
130/// transpose directly without worrying about cache misses.
131/// Once they are small enough, they are transposed using a tiling algorithm.
132fn transpose_recursive<T: Copy>(input: &[T], output: &mut [T], row_start: usize, row_end: usize, col_start: usize, col_end: usize, total_columns: usize, total_rows: usize) {
133 let nbr_rows = row_end - row_start;
134 let nbr_cols = col_end - col_start;
135 if (nbr_rows <= RECURSIVE_LIMIT && nbr_cols <= RECURSIVE_LIMIT) || nbr_rows<=2 || nbr_cols<=2 {
136 let x_block_count = nbr_cols / BLOCK_SIZE;
137 let y_block_count = nbr_rows / BLOCK_SIZE;
138
139 let remainder_x = nbr_cols - x_block_count * BLOCK_SIZE;
140 let remainder_y = nbr_rows - y_block_count * BLOCK_SIZE;
141
142
143 for y_block in 0..y_block_count {
144 for x_block in 0..x_block_count {
145 unsafe {
146 transpose_block_segmented(
147 input, output,
148 total_columns, total_rows,
149 col_start + x_block * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
150 BLOCK_SIZE, BLOCK_SIZE,
151 );
152 }
153 }
154
155 //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
156 if remainder_x > 0 {
157 unsafe {
158 transpose_block(
159 input, output,
160 total_columns, total_rows,
161 col_start + x_block_count * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
162 remainder_x, BLOCK_SIZE);
163 }
164 }
165 }
166
167 //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
168 if remainder_y > 0 {
169 for x_block in 0..x_block_count {
170 unsafe {
171 transpose_block(
172 input, output,
173 total_columns, total_rows,
174 col_start + x_block * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
175 BLOCK_SIZE, remainder_y,
176 );
177 }
178 }
179
180 //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
181 if remainder_x > 0 {
182 unsafe {
183 transpose_block(
184 input, output,
185 total_columns, total_rows,
186 col_start + x_block_count * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
187 remainder_x, remainder_y);
188 }
189 }
190 }
191 } else if nbr_rows >= nbr_cols {
192 transpose_recursive(input, output, row_start, row_start + (nbr_rows / 2), col_start, col_end, total_columns, total_rows);
193 transpose_recursive(input, output, row_start + (nbr_rows / 2), row_end, col_start, col_end, total_columns, total_rows);
194 } else {
195 transpose_recursive(input, output, row_start, row_end, col_start, col_start + (nbr_cols / 2), total_columns, total_rows);
196 transpose_recursive(input, output, row_start, row_end, col_start + (nbr_cols / 2), col_end, total_columns, total_rows);
197 }
198}
199
200
201/// Transpose the input array into the output array.
202///
203/// Given an input array of size input_width * input_height, representing flattened 2D data stored in row-major order,
204/// transpose the rows and columns of that input array into the output array
205/// ```
206/// // row-major order: the rows of our 2D array are contiguous,
207/// // and the columns are strided
208/// let input_array = vec![ 1, 2, 3,
209/// 4, 5, 6];
210///
211/// // Treat our 6-element array as a 2D 3x2 array, and transpose it to a 2x3 array
212/// let mut output_array = vec![0; 6];
213/// transpose::transpose(&input_array, &mut output_array, 3, 2);
214///
215/// // The rows have become the columns, and the columns have become the rows
216/// let expected_array = vec![ 1, 4,
217/// 2, 5,
218/// 3, 6];
219/// assert_eq!(output_array, expected_array);
220///
221/// // If we transpose it again, we should get our original data back.
222/// let mut final_array = vec![0; 6];
223/// transpose::transpose(&output_array, &mut final_array, 2, 3);
224/// assert_eq!(final_array, input_array);
225/// ```
226///
227/// # Panics
228///
229/// Panics if `input.len() != input_width * input_height` or if `output.len() != input_width * input_height`
230pub fn transpose<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
231 assert_eq!(input_width.checked_mul(input_height), Some(input.len()));
232 assert_eq!(input.len(), output.len());
233 if input.len() <= SMALL_LEN {
234 unsafe { transpose_small(input, output, input_width, input_height) };
235 }
236 else if input.len() <= MEDIUM_LEN {
237 transpose_tiled(input, output, input_width, input_height);
238 }
239 else {
240 transpose_recursive(input, output, 0, input_height, 0, input_width, input_width, input_height);
241 }
242}
243