transpose/
out_of_place.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// Block size used by the tiling algoritms
const BLOCK_SIZE: usize = 16;
// Number of segments used by the segmented block transpose function
const NBR_SEGMENTS: usize = 4;
// recursively split data until the number of rows and columns is below this number
const RECURSIVE_LIMIT: usize = 128;

// Largest size for for using the direct approach
const SMALL_LEN: usize = 255;
// Largest size for using the tiled approach
const MEDIUM_LEN: usize = 1024*1024;


/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// Benchmarking shows that loop tiling isn't effective for small arrays.
unsafe fn transpose_small<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize) {
    for x in 0..width {
        for y in 0..height {
            let input_index = x + y * width;
            let output_index = y + x * height;

            *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
        }
    }
}

// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
unsafe fn transpose_block<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
    for inner_x in 0..block_width {
        for inner_y in 0..block_height {
            let x = start_x + inner_x;
            let y = start_y + inner_y;

            let input_index = x + y * width;
            let output_index = y + x * height;

            *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
        }
    }
}

// Transpose a subset of the array, from the input into the output. The idea is that by transposing one block at a time, we can be more cache-friendly
// SAFETY: Width * height must equal input.len() and output.len(), start_x + block_width must be <= width, start_y + block height must be <= height
// This function works as `transpose_block`, but also divides the loop into a number of segments. This makes it more cache fiendly for large sizes.
unsafe fn transpose_block_segmented<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
    let height_per_div = block_height/NBR_SEGMENTS;
    for subblock in 0..NBR_SEGMENTS {
        for inner_x in 0..block_width {
            for inner_y in 0..height_per_div {
                let x = start_x + inner_x;
                let y = start_y + inner_y + subblock*height_per_div;

                let input_index = x + y * width;
                let output_index = y + x * height;

                *output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
            }
        }
    }
}

/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// This algorithm divides the input into tiles of size BLOCK_SIZE*BLOCK_SIZE, 
/// in order to reduce cache misses. This works well for medium sizes, when the
/// data for each tile fits in the caches.  
fn transpose_tiled<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {

    let x_block_count = input_width / BLOCK_SIZE;
    let y_block_count = input_height / BLOCK_SIZE;

    let remainder_x = input_width - x_block_count * BLOCK_SIZE;
    let remainder_y = input_height - y_block_count * BLOCK_SIZE;

    for y_block in 0..y_block_count {
        for x_block in 0..x_block_count {
            unsafe {
                transpose_block(
                    input, output,
                    input_width, input_height,
                    x_block * BLOCK_SIZE, y_block * BLOCK_SIZE,
                    BLOCK_SIZE, BLOCK_SIZE,
                    );
            }
        }

        //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
        if remainder_x > 0 {
            unsafe {
                transpose_block(
                    input, output, 
                    input_width, input_height, 
                    input_width - remainder_x, y_block * BLOCK_SIZE, 
                    remainder_x, BLOCK_SIZE);
            }
        }
    }

    //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
    if remainder_y > 0 {
        for x_block in 0..x_block_count {
            unsafe {
                transpose_block(
                    input, output,
                    input_width, input_height,
                    x_block * BLOCK_SIZE, input_height - remainder_y,
                    BLOCK_SIZE, remainder_y,
                    );
            }
        }

        //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
        if remainder_x > 0 {
            unsafe {
                transpose_block(
                    input, output,
                    input_width, input_height, 
                    input_width - remainder_x, input_height - remainder_y, 
                    remainder_x, remainder_y);
            }
        }
    } 
}

/// Given an array of size width * height, representing a flattened 2D array,
/// transpose the rows and columns of that 2D array into the output.
/// This is a recursive algorithm that divides the array into smaller pieces, until they are small enough to
/// transpose directly without worrying about cache misses.
/// Once they are small enough, they are transposed using a tiling algorithm. 
fn transpose_recursive<T: Copy>(input: &[T], output: &mut [T], row_start: usize, row_end: usize, col_start: usize, col_end:  usize, total_columns: usize, total_rows: usize) {
    let nbr_rows = row_end - row_start; 
    let nbr_cols = col_end - col_start;
    if (nbr_rows <= RECURSIVE_LIMIT && nbr_cols <= RECURSIVE_LIMIT) || nbr_rows<=2 || nbr_cols<=2 {
        let x_block_count = nbr_cols / BLOCK_SIZE;
        let y_block_count = nbr_rows / BLOCK_SIZE;

        let remainder_x = nbr_cols - x_block_count * BLOCK_SIZE;
        let remainder_y = nbr_rows - y_block_count * BLOCK_SIZE;


        for y_block in 0..y_block_count {
            for x_block in 0..x_block_count {
                unsafe {
                    transpose_block_segmented(
                        input, output,
                        total_columns, total_rows,
                        col_start + x_block * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
                        BLOCK_SIZE, BLOCK_SIZE,
                        );
                }
            }

            //if the input_width is not cleanly divisible by block_size, there are still a few columns that haven't been transposed
            if remainder_x > 0 {
                unsafe {
                    transpose_block(
                        input, output,
                        total_columns, total_rows,
                        col_start + x_block_count * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE, 
                        remainder_x, BLOCK_SIZE);
                }
            }
        }

        //if the input_height is not cleanly divisible by BLOCK_SIZE, there are still a few rows that haven't been transposed
        if remainder_y > 0 {
            for x_block in 0..x_block_count {
                unsafe {
                    transpose_block(
                        input, output,
                        total_columns, total_rows,
                        col_start + x_block * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
                        BLOCK_SIZE, remainder_y,
                        );
                }
            }
        
            //if the input_width is not cleanly divisible by block_size, there are still a few rows+columns that haven't been transposed
            if remainder_x > 0 {
                unsafe {
                    transpose_block(
                        input, output,
                        total_columns, total_rows,
                        col_start + x_block_count * BLOCK_SIZE,  row_start + y_block_count * BLOCK_SIZE, 
                        remainder_x, remainder_y);
                }
            }
        } 
    } else if nbr_rows >= nbr_cols {
        transpose_recursive(input, output, row_start, row_start + (nbr_rows / 2), col_start, col_end, total_columns, total_rows);
        transpose_recursive(input, output, row_start + (nbr_rows / 2), row_end, col_start, col_end, total_columns, total_rows);
    } else {
        transpose_recursive(input, output, row_start, row_end, col_start, col_start + (nbr_cols / 2), total_columns, total_rows);
        transpose_recursive(input, output, row_start, row_end, col_start + (nbr_cols / 2), col_end, total_columns, total_rows);
    }
}


/// Transpose the input array into the output array. 
///
/// Given an input array of size input_width * input_height, representing flattened 2D data stored in row-major order,
/// transpose the rows and columns of that input array into the output array
/// ```
/// // row-major order: the rows of our 2D array are contiguous,
/// // and the columns are strided
/// let input_array = vec![ 1, 2, 3,
/// 						4, 5, 6];
/// 
/// // Treat our 6-element array as a 2D 3x2 array, and transpose it to a 2x3 array
/// let mut output_array = vec![0; 6];
/// transpose::transpose(&input_array, &mut output_array, 3, 2);
///
/// // The rows have become the columns, and the columns have become the rows
/// let expected_array =  vec![ 1, 4,
///								2, 5,
///								3, 6];
/// assert_eq!(output_array, expected_array);
///
/// // If we transpose it again, we should get our original data back.
/// let mut final_array = vec![0; 6];
/// transpose::transpose(&output_array, &mut final_array, 2, 3);
/// assert_eq!(final_array, input_array);
/// ```
///
/// # Panics
/// 
/// Panics if `input.len() != input_width * input_height` or if `output.len() != input_width * input_height`
pub fn transpose<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
    assert_eq!(input_width.checked_mul(input_height), Some(input.len()));
    assert_eq!(input.len(), output.len());
    if input.len() <= SMALL_LEN {
        unsafe { transpose_small(input, output, input_width, input_height) };
    }
    else if input.len() <= MEDIUM_LEN {
        transpose_tiled(input, output, input_width, input_height);
    }
    else {
        transpose_recursive(input, output, 0, input_height, 0, input_width, input_width, input_height);
    }
}