textwrap/wrap_algorithms/
optimal_fit.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
use std::cell::RefCell;

use crate::core::Fragment;

/// Penalties for
/// [`WrapAlgorithm::OptimalFit`](crate::WrapAlgorithm::OptimalFit)
/// and [`wrap_optimal_fit`].
///
/// This wrapping algorithm in [`wrap_optimal_fit`] considers the
/// entire paragraph to find optimal line breaks. When wrapping text,
/// "penalties" are assigned to line breaks based on the gaps left at
/// the end of lines. The penalties are given by this struct, with
/// [`Penalties::default`] assigning penalties that work well for
/// monospace text.
///
/// If you are wrapping proportional text, you are advised to assign
/// your own penalties according to your font size. See the individual
/// penalties below for details.
///
/// **Note:** Only available when the `smawk` Cargo feature is
/// enabled.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Penalties {
    /// Per-line penalty. This is added for every line, which makes it
    /// expensive to output more lines than the minimum required.
    pub nline_penalty: usize,

    /// Per-character cost for lines that overflow the target line width.
    ///
    /// With a default value of 50², every single character costs as
    /// much as leaving a gap of 50 characters behind. This is because
    /// we assign as cost of `gap * gap` to a short line. When
    /// wrapping monospace text, we can overflow the line by 1
    /// character in extreme cases:
    ///
    /// ```
    /// use textwrap::core::Word;
    /// use textwrap::wrap_algorithms::{wrap_optimal_fit, Penalties};
    ///
    /// let short = "foo ";
    /// let long = "x".repeat(50);
    /// let length = (short.len() + long.len()) as f64;
    /// let fragments = vec![Word::from(short), Word::from(&long)];
    /// let penalties = Penalties::new();
    ///
    /// // Perfect fit, both words are on a single line with no overflow.
    /// let wrapped = wrap_optimal_fit(&fragments, &[length], &penalties).unwrap();
    /// assert_eq!(wrapped, vec![&[Word::from(short), Word::from(&long)]]);
    ///
    /// // The words no longer fit, yet we get a single line back. While
    /// // the cost of overflow (`1 * 2500`) is the same as the cost of the
    /// // gap (`50 * 50 = 2500`), the tie is broken by `nline_penalty`
    /// // which makes it cheaper to overflow than to use two lines.
    /// let wrapped = wrap_optimal_fit(&fragments, &[length - 1.0], &penalties).unwrap();
    /// assert_eq!(wrapped, vec![&[Word::from(short), Word::from(&long)]]);
    ///
    /// // The cost of overflow would be 2 * 2500, whereas the cost of
    /// // the gap is only `49 * 49 + nline_penalty = 2401 + 1000 =
    /// // 3401`. We therefore get two lines.
    /// let wrapped = wrap_optimal_fit(&fragments, &[length - 2.0], &penalties).unwrap();
    /// assert_eq!(wrapped, vec![&[Word::from(short)],
    ///                          &[Word::from(&long)]]);
    /// ```
    ///
    /// This only happens if the overflowing word is 50 characters
    /// long _and_ if the word overflows the line by exactly one
    /// character. If it overflows by more than one character, the
    /// overflow penalty will quickly outgrow the cost of the gap, as
    /// seen above.
    pub overflow_penalty: usize,

    /// When should the a single word on the last line be considered
    /// "too short"?
    ///
    /// If the last line of the text consist of a single word and if
    /// this word is shorter than `1 / short_last_line_fraction` of
    /// the line width, then the final line will be considered "short"
    /// and `short_last_line_penalty` is added as an extra penalty.
    ///
    /// The effect of this is to avoid a final line consisting of a
    /// single small word. For example, with a
    /// `short_last_line_penalty` of 25 (the default), a gap of up to
    /// 5 columns will be seen as more desirable than having a final
    /// short line.
    ///
    /// ## Examples
    ///
    /// ```
    /// use textwrap::{wrap, wrap_algorithms, Options, WrapAlgorithm};
    ///
    /// let text = "This is a demo of the short last line penalty.";
    ///
    /// // The first-fit algorithm leaves a single short word on the last line:
    /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::FirstFit)),
    ///            vec!["This is a demo of the short last line",
    ///                 "penalty."]);
    ///
    /// #[cfg(feature = "smawk")] {
    /// let mut penalties = wrap_algorithms::Penalties::new();
    ///
    /// // Since "penalty." is shorter than 25% of the line width, the
    /// // optimal-fit algorithm adds a penalty of 25. This is enough
    /// // to move "line " down:
    /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))),
    ///            vec!["This is a demo of the short last",
    ///                 "line penalty."]);
    ///
    /// // We can change the meaning of "short" lines. Here, only words
    /// // shorter than 1/10th of the line width will be considered short:
    /// penalties.short_last_line_fraction = 10;
    /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))),
    ///            vec!["This is a demo of the short last line",
    ///                 "penalty."]);
    ///
    /// // If desired, the penalty can also be disabled:
    /// penalties.short_last_line_fraction = 4;
    /// penalties.short_last_line_penalty = 0;
    /// assert_eq!(wrap(text, Options::new(37).wrap_algorithm(WrapAlgorithm::OptimalFit(penalties))),
    ///            vec!["This is a demo of the short last line",
    ///                 "penalty."]);
    /// }
    /// ```
    pub short_last_line_fraction: usize,

    /// Penalty for a last line with a single short word.
    ///
    /// Set this to zero if you do not want to penalize short last lines.
    pub short_last_line_penalty: usize,

    /// Penalty for lines ending with a hyphen.
    pub hyphen_penalty: usize,
}

impl Penalties {
    /// Default penalties for monospace text.
    ///
    /// The penalties here work well for monospace text. This is
    /// because they expect the gaps at the end of lines to be roughly
    /// in the range `0..100`. If the gaps are larger, the
    /// `overflow_penalty` and `hyphen_penalty` become insignificant.
    pub const fn new() -> Self {
        Penalties {
            nline_penalty: 1000,
            overflow_penalty: 50 * 50,
            short_last_line_fraction: 4,
            short_last_line_penalty: 25,
            hyphen_penalty: 25,
        }
    }
}

impl Default for Penalties {
    fn default() -> Self {
        Self::new()
    }
}

/// Cache for line numbers. This is necessary to avoid a O(n**2)
/// behavior when computing line numbers in [`wrap_optimal_fit`].
struct LineNumbers {
    line_numbers: RefCell<Vec<usize>>,
}

impl LineNumbers {
    fn new(size: usize) -> Self {
        let mut line_numbers = Vec::with_capacity(size);
        line_numbers.push(0);
        LineNumbers {
            line_numbers: RefCell::new(line_numbers),
        }
    }

    fn get<T>(&self, i: usize, minima: &[(usize, T)]) -> usize {
        while self.line_numbers.borrow_mut().len() < i + 1 {
            let pos = self.line_numbers.borrow().len();
            let line_number = 1 + self.get(minima[pos].0, minima);
            self.line_numbers.borrow_mut().push(line_number);
        }

        self.line_numbers.borrow()[i]
    }
}

/// Overflow error during the [`wrap_optimal_fit`] computation.
#[derive(Debug, PartialEq, Eq)]
pub struct OverflowError;

impl std::fmt::Display for OverflowError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "wrap_optimal_fit cost computation overflowed")
    }
}

impl std::error::Error for OverflowError {}

/// Wrap abstract fragments into lines with an optimal-fit algorithm.
///
/// The `line_widths` slice gives the target line width for each line
/// (the last slice element is repeated as necessary). This can be
/// used to implement hanging indentation.
///
/// The fragments must already have been split into the desired
/// widths, this function will not (and cannot) attempt to split them
/// further when arranging them into lines.
///
/// # Optimal-Fit Algorithm
///
/// The algorithm considers all possible break points and picks the
/// breaks which minimizes the gaps at the end of each line. More
/// precisely, the algorithm assigns a cost or penalty to each break
/// point, determined by `cost = gap * gap` where `gap = target_width -
/// line_width`. Shorter lines are thus penalized more heavily since
/// they leave behind a larger gap.
///
/// We can illustrate this with the text “To be, or not to be: that is
/// the question”. We will be wrapping it in a narrow column with room
/// for only 10 characters. The [greedy
/// algorithm](super::wrap_first_fit) will produce these lines, each
/// annotated with the corresponding penalty:
///
/// ```text
/// "To be, or"   1² =  1
/// "not to be:"  0² =  0
/// "that is"     3² =  9
/// "the"         7² = 49
/// "question"    2² =  4
/// ```
///
/// We see that line four with “the” leaves a gap of 7 columns, which
/// gives it a penalty of 49. The sum of the penalties is 63.
///
/// There are 10 words, which means that there are `2_u32.pow(9)` or
/// 512 different ways to typeset it. We can compute
/// the sum of the penalties for each possible line break and search
/// for the one with the lowest sum:
///
/// ```text
/// "To be,"     4² = 16
/// "or not to"  1² =  1
/// "be: that"   2² =  4
/// "is the"     4² = 16
/// "question"   2² =  4
/// ```
///
/// The sum of the penalties is 41, which is better than what the
/// greedy algorithm produced.
///
/// Searching through all possible combinations would normally be
/// prohibitively slow. However, it turns out that the problem can be
/// formulated as the task of finding column minima in a cost matrix.
/// This matrix has a special form (totally monotone) which lets us
/// use a [linear-time algorithm called
/// SMAWK](https://lib.rs/crates/smawk) to find the optimal break
/// points.
///
/// This means that the time complexity remains O(_n_) where _n_ is
/// the number of words. Compared to
/// [`wrap_first_fit()`](super::wrap_first_fit), this function is
/// about 4 times slower.
///
/// The optimization of per-line costs over the entire paragraph is
/// inspired by the line breaking algorithm used in TeX, as described
/// in the 1981 article [_Breaking Paragraphs into
/// Lines_](http://www.eprg.org/G53DOC/pdfs/knuth-plass-breaking.pdf)
/// by Knuth and Plass. The implementation here is based on [Python
/// code by David
/// Eppstein](https://github.com/jfinkels/PADS/blob/master/pads/wrap.py).
///
/// # Errors
///
/// In case of an overflow during the cost computation, an `Err` is
/// returned. Overflows happens when fragments or lines have infinite
/// widths (`f64::INFINITY`) or if the widths are so large that the
/// gaps at the end of lines have sizes larger than `f64::MAX.sqrt()`
/// (approximately 1e154):
///
/// ```
/// use textwrap::core::Fragment;
/// use textwrap::wrap_algorithms::{wrap_optimal_fit, OverflowError, Penalties};
///
/// #[derive(Debug, PartialEq)]
/// struct Word(f64);
///
/// impl Fragment for Word {
///     fn width(&self) -> f64 { self.0 }
///     fn whitespace_width(&self) -> f64 { 1.0 }
///     fn penalty_width(&self) -> f64 { 0.0 }
/// }
///
/// // Wrapping overflows because 1e155 * 1e155 = 1e310, which is
/// // larger than f64::MAX:
/// assert_eq!(wrap_optimal_fit(&[Word(0.0), Word(0.0)], &[1e155], &Penalties::default()),
///            Err(OverflowError));
/// ```
///
/// When using fragment widths and line widths which fit inside an
/// `u64`, overflows cannot happen. This means that fragments derived
/// from a `&str` cannot cause overflows.
///
/// **Note:** Only available when the `smawk` Cargo feature is
/// enabled.
pub fn wrap_optimal_fit<'a, 'b, T: Fragment>(
    fragments: &'a [T],
    line_widths: &'b [f64],
    penalties: &'b Penalties,
) -> Result<Vec<&'a [T]>, OverflowError> {
    // The final line width is used for all remaining lines.
    let default_line_width = line_widths.last().copied().unwrap_or(0.0);
    let mut widths = Vec::with_capacity(fragments.len() + 1);
    let mut width = 0.0;
    widths.push(width);
    for fragment in fragments {
        width += fragment.width() + fragment.whitespace_width();
        widths.push(width);
    }

    let line_numbers = LineNumbers::new(fragments.len());

    let minima = smawk::online_column_minima(0.0, widths.len(), |minima, i, j| {
        // Line number for fragment `i`.
        let line_number = line_numbers.get(i, minima);
        let line_width = line_widths
            .get(line_number)
            .copied()
            .unwrap_or(default_line_width);
        let target_width = line_width.max(1.0);

        // Compute the width of a line spanning fragments[i..j] in
        // constant time. We need to adjust widths[j] by subtracting
        // the whitespace of fragment[j-1] and then add the penalty.
        let line_width = widths[j] - widths[i] - fragments[j - 1].whitespace_width()
            + fragments[j - 1].penalty_width();

        // We compute cost of the line containing fragments[i..j]. We
        // start with values[i].1, which is the optimal cost for
        // breaking before fragments[i].
        //
        // First, every extra line cost NLINE_PENALTY.
        let mut cost = minima[i].1 + penalties.nline_penalty as f64;

        // Next, we add a penalty depending on the line length.
        if line_width > target_width {
            // Lines that overflow get a hefty penalty.
            let overflow = line_width - target_width;
            cost += overflow * penalties.overflow_penalty as f64;
        } else if j < fragments.len() {
            // Other lines (except for the last line) get a milder
            // penalty which depend on the size of the gap.
            let gap = target_width - line_width;
            cost += gap * gap;
        } else if i + 1 == j
            && line_width < target_width / penalties.short_last_line_fraction as f64
        {
            // The last line can have any size gap, but we do add a
            // penalty if the line is very short (typically because it
            // contains just a single word).
            cost += penalties.short_last_line_penalty as f64;
        }

        // Finally, we discourage hyphens.
        if fragments[j - 1].penalty_width() > 0.0 {
            // TODO: this should use a penalty value from the fragment
            // instead.
            cost += penalties.hyphen_penalty as f64;
        }

        cost
    });

    for (_, cost) in &minima {
        if cost.is_infinite() {
            return Err(OverflowError);
        }
    }

    let mut lines = Vec::with_capacity(line_numbers.get(fragments.len(), &minima));
    let mut pos = fragments.len();
    loop {
        let prev = minima[pos].0;
        lines.push(&fragments[prev..pos]);
        pos = prev;
        if pos == 0 {
            break;
        }
    }

    lines.reverse();
    Ok(lines)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[derive(Debug, PartialEq)]
    struct Word(f64);

    #[rustfmt::skip]
    impl Fragment for Word {
        fn width(&self) -> f64 { self.0 }
        fn whitespace_width(&self) -> f64 { 1.0 }
        fn penalty_width(&self) -> f64 { 0.0 }
    }

    #[test]
    fn wrap_fragments_with_infinite_widths() {
        let words = vec![Word(f64::INFINITY)];
        assert_eq!(
            wrap_optimal_fit(&words, &[0.0], &Penalties::default()),
            Err(OverflowError)
        );
    }

    #[test]
    fn wrap_fragments_with_huge_widths() {
        let words = vec![Word(1e200), Word(1e250), Word(1e300)];
        assert_eq!(
            wrap_optimal_fit(&words, &[1e300], &Penalties::default()),
            Err(OverflowError)
        );
    }

    #[test]
    fn wrap_fragments_with_large_widths() {
        // The gaps will be of the sizes between 1e25 and 1e75. This
        // makes the `gap * gap` cost fit comfortably in a f64.
        let words = vec![Word(1e25), Word(1e50), Word(1e75)];
        assert_eq!(
            wrap_optimal_fit(&words, &[1e100], &Penalties::default()),
            Ok(vec![&vec![Word(1e25), Word(1e50), Word(1e75)][..]])
        );
    }
}