regex_lite/hir/mod.rs
1use alloc::{boxed::Box, string::String, vec, vec::Vec};
2
3use crate::{error::Error, utf8};
4
5mod parse;
6
7/// Escapes all regular expression meta characters in `pattern`.
8///
9/// The string returned may be safely used as a literal in a regular
10/// expression.
11pub fn escape(pattern: &str) -> String {
12 let mut buf = String::new();
13 buf.reserve(pattern.len());
14 for ch in pattern.chars() {
15 if is_meta_character(ch) {
16 buf.push('\\');
17 }
18 buf.push(ch);
19 }
20 buf
21}
22
23/// Returns true if the given character has significance in a regex.
24///
25/// Generally speaking, these are the only characters which _must_ be escaped
26/// in order to match their literal meaning. For example, to match a literal
27/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
28/// example, `-` is treated as a meta character because of its significance
29/// for writing ranges inside of character classes, but the regex `-` will
30/// match a literal `-` because `-` has no special meaning outside of character
31/// classes.
32///
33/// In order to determine whether a character may be escaped at all, the
34/// [`is_escapeable_character`] routine should be used. The difference between
35/// `is_meta_character` and `is_escapeable_character` is that the latter will
36/// return true for some characters that are _not_ meta characters. For
37/// example, `%` and `\%` both match a literal `%` in all contexts. In other
38/// words, `is_escapeable_character` includes "superfluous" escapes.
39///
40/// Note that the set of characters for which this function returns `true` or
41/// `false` is fixed and won't change in a semver compatible release. (In this
42/// case, "semver compatible release" actually refers to the `regex` crate
43/// itself, since reducing or expanding the set of meta characters would be a
44/// breaking change for not just `regex-syntax` but also `regex` itself.)
45fn is_meta_character(c: char) -> bool {
46 match c {
47 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
48 | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
49 _ => false,
50 }
51}
52
53/// Returns true if the given character can be escaped in a regex.
54///
55/// This returns true in all cases that `is_meta_character` returns true, but
56/// also returns true in some cases where `is_meta_character` returns false.
57/// For example, `%` is not a meta character, but it is escapeable. That is,
58/// `%` and `\%` both match a literal `%` in all contexts.
59///
60/// The purpose of this routine is to provide knowledge about what characters
61/// may be escaped. Namely, most regex engines permit "superfluous" escapes
62/// where characters without any special significance may be escaped even
63/// though there is no actual _need_ to do so.
64///
65/// This will return false for some characters. For example, `e` is not
66/// escapeable. Therefore, `\e` will either result in a parse error (which is
67/// true today), or it could backwards compatibly evolve into a new construct
68/// with its own meaning. Indeed, that is the purpose of banning _some_
69/// superfluous escapes: it provides a way to evolve the syntax in a compatible
70/// manner.
71fn is_escapeable_character(c: char) -> bool {
72 // Certainly escapeable if it's a meta character.
73 if is_meta_character(c) {
74 return true;
75 }
76 // Any character that isn't ASCII is definitely not escapeable. There's
77 // no real need to allow things like \☃ right?
78 if !c.is_ascii() {
79 return false;
80 }
81 // Otherwise, we basically say that everything is escapeable unless it's a
82 // letter or digit. Things like \3 are either octal (when enabled) or an
83 // error, and we should keep it that way. Otherwise, letters are reserved
84 // for adding new syntax in a backwards compatible way.
85 match c {
86 '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
87 // While not currently supported, we keep these as not escapeable to
88 // give us some flexibility with respect to supporting the \< and
89 // \> word boundary assertions in the future. By rejecting them as
90 // escapeable, \< and \> will result in a parse error. Thus, we can
91 // turn them into something else in the future without it being a
92 // backwards incompatible change.
93 '<' | '>' => false,
94 _ => true,
95 }
96}
97
98/// The configuration for a regex parser.
99#[derive(Clone, Copy, Debug)]
100pub(crate) struct Config {
101 /// The maximum number of times we're allowed to recurse.
102 ///
103 /// Note that unlike the regex-syntax parser, we actually use recursion in
104 /// this parser for simplicity. My hope is that by setting a conservative
105 /// default call limit and providing a way to configure it, that we can
106 /// keep this simplification. But if we must, we can re-work the parser to
107 /// put the call stack on the heap like regex-syntax does.
108 pub(crate) nest_limit: u32,
109 /// Various flags that control how a pattern is interpreted.
110 pub(crate) flags: Flags,
111}
112
113impl Default for Config {
114 fn default() -> Config {
115 Config { nest_limit: 50, flags: Flags::default() }
116 }
117}
118
119/// Various flags that control the interpretation of the pattern.
120///
121/// These can be set via explicit configuration in code, or change dynamically
122/// during parsing via inline flags. For example, `foo(?i:bar)baz` will match
123/// `foo` and `baz` case sensitiviely and `bar` case insensitively (assuming a
124/// default configuration).
125#[derive(Clone, Copy, Debug, Default)]
126pub(crate) struct Flags {
127 /// Whether to match case insensitively.
128 ///
129 /// This is the `i` flag.
130 pub(crate) case_insensitive: bool,
131 /// Whether `^` and `$` should be treated as line anchors or not.
132 ///
133 /// This is the `m` flag.
134 pub(crate) multi_line: bool,
135 /// Whether `.` should match line terminators or not.
136 ///
137 /// This is the `s` flag.
138 pub(crate) dot_matches_new_line: bool,
139 /// Whether to swap the meaning of greedy and non-greedy operators.
140 ///
141 /// This is the `U` flag.
142 pub(crate) swap_greed: bool,
143 /// Whether to enable CRLF mode.
144 ///
145 /// This is the `R` flag.
146 pub(crate) crlf: bool,
147 /// Whether to ignore whitespace. i.e., verbose mode.
148 ///
149 /// This is the `x` flag.
150 pub(crate) ignore_whitespace: bool,
151}
152
153#[derive(Clone, Debug, Eq, PartialEq)]
154pub(crate) struct Hir {
155 kind: HirKind,
156 is_start_anchored: bool,
157 is_match_empty: bool,
158 static_explicit_captures_len: Option<usize>,
159}
160
161#[derive(Clone, Debug, Eq, PartialEq)]
162pub(crate) enum HirKind {
163 Empty,
164 Char(char),
165 Class(Class),
166 Look(Look),
167 Repetition(Repetition),
168 Capture(Capture),
169 Concat(Vec<Hir>),
170 Alternation(Vec<Hir>),
171}
172
173impl Hir {
174 /// Parses the given pattern string with the given configuration into a
175 /// structured representation. If the pattern is invalid, then an error
176 /// is returned.
177 pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> {
178 self::parse::Parser::new(config, pattern).parse()
179 }
180
181 /// Returns the underlying kind of this high-level intermediate
182 /// representation.
183 ///
184 /// Note that there is explicitly no way to build an `Hir` directly from
185 /// an `HirKind`. If you need to do that, then you must do case analysis
186 /// on the `HirKind` and call the appropriate smart constructor on `Hir`.
187 pub(crate) fn kind(&self) -> &HirKind {
188 &self.kind
189 }
190
191 /// Returns true if and only if this Hir expression can only match at the
192 /// beginning of a haystack.
193 pub(crate) fn is_start_anchored(&self) -> bool {
194 self.is_start_anchored
195 }
196
197 /// Returns true if and only if this Hir expression can match the empty
198 /// string.
199 pub(crate) fn is_match_empty(&self) -> bool {
200 self.is_match_empty
201 }
202
203 /// If the pattern always reports the same number of matching capture groups
204 /// for every match, then this returns the number of those groups. This
205 /// doesn't include the implicit group found in every pattern.
206 pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
207 self.static_explicit_captures_len
208 }
209
210 fn fail() -> Hir {
211 let kind = HirKind::Class(Class { ranges: vec![] });
212 Hir {
213 kind,
214 is_start_anchored: false,
215 is_match_empty: false,
216 static_explicit_captures_len: Some(0),
217 }
218 }
219
220 fn empty() -> Hir {
221 let kind = HirKind::Empty;
222 Hir {
223 kind,
224 is_start_anchored: false,
225 is_match_empty: true,
226 static_explicit_captures_len: Some(0),
227 }
228 }
229
230 fn char(ch: char) -> Hir {
231 let kind = HirKind::Char(ch);
232 Hir {
233 kind,
234 is_start_anchored: false,
235 is_match_empty: false,
236 static_explicit_captures_len: Some(0),
237 }
238 }
239
240 fn class(class: Class) -> Hir {
241 let kind = HirKind::Class(class);
242 Hir {
243 kind,
244 is_start_anchored: false,
245 is_match_empty: false,
246 static_explicit_captures_len: Some(0),
247 }
248 }
249
250 fn look(look: Look) -> Hir {
251 let kind = HirKind::Look(look);
252 Hir {
253 kind,
254 is_start_anchored: matches!(look, Look::Start),
255 is_match_empty: true,
256 static_explicit_captures_len: Some(0),
257 }
258 }
259
260 fn repetition(rep: Repetition) -> Hir {
261 if rep.min == 0 && rep.max == Some(0) {
262 return Hir::empty();
263 } else if rep.min == 1 && rep.max == Some(1) {
264 return *rep.sub;
265 }
266 let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored;
267 let is_match_empty = rep.min == 0 || rep.sub.is_match_empty;
268 let mut static_explicit_captures_len =
269 rep.sub.static_explicit_captures_len;
270 // If the static captures len of the sub-expression is not known or
271 // is greater than zero, then it automatically propagates to the
272 // repetition, regardless of the repetition. Otherwise, it might
273 // change, but only when the repetition can match 0 times.
274 if rep.min == 0
275 && static_explicit_captures_len.map_or(false, |len| len > 0)
276 {
277 // If we require a match 0 times, then our captures len is
278 // guaranteed to be zero. Otherwise, if we *can* match the empty
279 // string, then it's impossible to know how many captures will be
280 // in the resulting match.
281 if rep.max == Some(0) {
282 static_explicit_captures_len = Some(0);
283 } else {
284 static_explicit_captures_len = None;
285 }
286 }
287 Hir {
288 kind: HirKind::Repetition(rep),
289 is_start_anchored,
290 is_match_empty,
291 static_explicit_captures_len,
292 }
293 }
294
295 fn capture(cap: Capture) -> Hir {
296 let is_start_anchored = cap.sub.is_start_anchored;
297 let is_match_empty = cap.sub.is_match_empty;
298 let static_explicit_captures_len = cap
299 .sub
300 .static_explicit_captures_len
301 .map(|len| len.saturating_add(1));
302 let kind = HirKind::Capture(cap);
303 Hir {
304 kind,
305 is_start_anchored,
306 is_match_empty,
307 static_explicit_captures_len,
308 }
309 }
310
311 fn concat(mut subs: Vec<Hir>) -> Hir {
312 if subs.is_empty() {
313 Hir::empty()
314 } else if subs.len() == 1 {
315 subs.pop().unwrap()
316 } else {
317 let is_start_anchored = subs[0].is_start_anchored;
318 let mut is_match_empty = true;
319 let mut static_explicit_captures_len = Some(0usize);
320 for sub in subs.iter() {
321 is_match_empty = is_match_empty && sub.is_match_empty;
322 static_explicit_captures_len = static_explicit_captures_len
323 .and_then(|len1| {
324 Some((len1, sub.static_explicit_captures_len?))
325 })
326 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
327 }
328 Hir {
329 kind: HirKind::Concat(subs),
330 is_start_anchored,
331 is_match_empty,
332 static_explicit_captures_len,
333 }
334 }
335 }
336
337 fn alternation(mut subs: Vec<Hir>) -> Hir {
338 if subs.is_empty() {
339 Hir::fail()
340 } else if subs.len() == 1 {
341 subs.pop().unwrap()
342 } else {
343 let mut it = subs.iter().peekable();
344 let mut is_start_anchored =
345 it.peek().map_or(false, |sub| sub.is_start_anchored);
346 let mut is_match_empty =
347 it.peek().map_or(false, |sub| sub.is_match_empty);
348 let mut static_explicit_captures_len =
349 it.peek().and_then(|sub| sub.static_explicit_captures_len);
350 for sub in it {
351 is_start_anchored = is_start_anchored && sub.is_start_anchored;
352 is_match_empty = is_match_empty || sub.is_match_empty;
353 if static_explicit_captures_len
354 != sub.static_explicit_captures_len
355 {
356 static_explicit_captures_len = None;
357 }
358 }
359 Hir {
360 kind: HirKind::Alternation(subs),
361 is_start_anchored,
362 is_match_empty,
363 static_explicit_captures_len,
364 }
365 }
366 }
367}
368
369impl HirKind {
370 /// Returns a slice of this kind's sub-expressions, if any.
371 fn subs(&self) -> &[Hir] {
372 use core::slice::from_ref;
373
374 match *self {
375 HirKind::Empty
376 | HirKind::Char(_)
377 | HirKind::Class(_)
378 | HirKind::Look(_) => &[],
379 HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
380 HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
381 HirKind::Concat(ref subs) => subs,
382 HirKind::Alternation(ref subs) => subs,
383 }
384 }
385}
386
387#[derive(Clone, Debug, Eq, PartialEq)]
388pub(crate) struct Class {
389 pub(crate) ranges: Vec<ClassRange>,
390}
391
392impl Class {
393 /// Create a new class from the given ranges. The ranges may be provided
394 /// in any order or may even overlap. They will be automatically
395 /// canonicalized.
396 fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
397 let mut class = Class { ranges: ranges.into_iter().collect() };
398 class.canonicalize();
399 class
400 }
401
402 /// Expand this class such that it matches the ASCII codepoints in this set
403 /// case insensitively.
404 fn ascii_case_fold(&mut self) {
405 let len = self.ranges.len();
406 for i in 0..len {
407 if let Some(folded) = self.ranges[i].ascii_case_fold() {
408 self.ranges.push(folded);
409 }
410 }
411 self.canonicalize();
412 }
413
414 /// Negate this set.
415 ///
416 /// For all `x` where `x` is any element, if `x` was in this set, then it
417 /// will not be in this set after negation.
418 fn negate(&mut self) {
419 const MIN: char = '\x00';
420 const MAX: char = char::MAX;
421
422 if self.ranges.is_empty() {
423 self.ranges.push(ClassRange { start: MIN, end: MAX });
424 return;
425 }
426
427 // There should be a way to do this in-place with constant memory,
428 // but I couldn't figure out a simple way to do it. So just append
429 // the negation to the end of this range, and then drain it before
430 // we're done.
431 let drain_end = self.ranges.len();
432
433 // If our class doesn't start the minimum possible char, then negation
434 // needs to include all codepoints up to the minimum in this set.
435 if self.ranges[0].start > MIN {
436 self.ranges.push(ClassRange {
437 start: MIN,
438 // OK because we know it's bigger than MIN.
439 end: prev_char(self.ranges[0].start).unwrap(),
440 });
441 }
442 for i in 1..drain_end {
443 // let lower = self.ranges[i - 1].upper().increment();
444 // let upper = self.ranges[i].lower().decrement();
445 // self.ranges.push(I::create(lower, upper));
446 self.ranges.push(ClassRange {
447 // OK because we know i-1 is never the last range and therefore
448 // there must be a range greater than it. It therefore follows
449 // that 'end' can never be char::MAX, and thus there must be
450 // a next char.
451 start: next_char(self.ranges[i - 1].end).unwrap(),
452 // Since 'i' is guaranteed to never be the first range, it
453 // follows that there is always a range before this and thus
454 // 'start' can never be '\x00'. Thus, there must be a previous
455 // char.
456 end: prev_char(self.ranges[i].start).unwrap(),
457 });
458 }
459 if self.ranges[drain_end - 1].end < MAX {
460 // let lower = self.ranges[drain_end - 1].upper().increment();
461 // self.ranges.push(I::create(lower, I::Bound::max_value()));
462 self.ranges.push(ClassRange {
463 // OK because we know 'end' is less than char::MAX, and thus
464 // there is a next char.
465 start: next_char(self.ranges[drain_end - 1].end).unwrap(),
466 end: MAX,
467 });
468 }
469 self.ranges.drain(..drain_end);
470 // We don't need to canonicalize because we processed the ranges above
471 // in canonical order and the new ranges we added based on those are
472 // also necessarily in canonical order.
473 }
474
475 /// Converts this set into a canonical ordering.
476 fn canonicalize(&mut self) {
477 if self.is_canonical() {
478 return;
479 }
480 self.ranges.sort();
481 assert!(!self.ranges.is_empty());
482
483 // Is there a way to do this in-place with constant memory? I couldn't
484 // figure out a way to do it. So just append the canonicalization to
485 // the end of this range, and then drain it before we're done.
486 let drain_end = self.ranges.len();
487 for oldi in 0..drain_end {
488 // If we've added at least one new range, then check if we can
489 // merge this range in the previously added range.
490 if self.ranges.len() > drain_end {
491 let (last, rest) = self.ranges.split_last_mut().unwrap();
492 if let Some(union) = last.union(&rest[oldi]) {
493 *last = union;
494 continue;
495 }
496 }
497 self.ranges.push(self.ranges[oldi]);
498 }
499 self.ranges.drain(..drain_end);
500 }
501
502 /// Returns true if and only if this class is in a canonical ordering.
503 fn is_canonical(&self) -> bool {
504 for pair in self.ranges.windows(2) {
505 if pair[0] >= pair[1] {
506 return false;
507 }
508 if pair[0].is_contiguous(&pair[1]) {
509 return false;
510 }
511 }
512 true
513 }
514}
515
516#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
517pub(crate) struct ClassRange {
518 pub(crate) start: char,
519 pub(crate) end: char,
520}
521
522impl ClassRange {
523 /// Apply simple case folding to this byte range. Only ASCII case mappings
524 /// (for A-Za-z) are applied.
525 ///
526 /// Additional ranges are appended to the given vector. Canonical ordering
527 /// is *not* maintained in the given vector.
528 fn ascii_case_fold(&self) -> Option<ClassRange> {
529 if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) {
530 let start = core::cmp::max(self.start, 'a');
531 let end = core::cmp::min(self.end, 'z');
532 return Some(ClassRange {
533 start: char::try_from(u32::from(start) - 32).unwrap(),
534 end: char::try_from(u32::from(end) - 32).unwrap(),
535 });
536 }
537 if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) {
538 let start = core::cmp::max(self.start, 'A');
539 let end = core::cmp::min(self.end, 'Z');
540 return Some(ClassRange {
541 start: char::try_from(u32::from(start) + 32).unwrap(),
542 end: char::try_from(u32::from(end) + 32).unwrap(),
543 });
544 }
545 None
546 }
547
548 /// Union the given overlapping range into this range.
549 ///
550 /// If the two ranges aren't contiguous, then this returns `None`.
551 fn union(&self, other: &ClassRange) -> Option<ClassRange> {
552 if !self.is_contiguous(other) {
553 return None;
554 }
555 let start = core::cmp::min(self.start, other.start);
556 let end = core::cmp::max(self.end, other.end);
557 Some(ClassRange { start, end })
558 }
559
560 /// Returns true if and only if the two ranges are contiguous. Two ranges
561 /// are contiguous if and only if the ranges are either overlapping or
562 /// adjacent.
563 fn is_contiguous(&self, other: &ClassRange) -> bool {
564 let (s1, e1) = (u32::from(self.start), u32::from(self.end));
565 let (s2, e2) = (u32::from(other.start), u32::from(other.end));
566 core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1)
567 }
568
569 /// Returns true if and only if the intersection of this range and the
570 /// other range is empty.
571 fn is_intersection_empty(&self, other: &ClassRange) -> bool {
572 let (s1, e1) = (self.start, self.end);
573 let (s2, e2) = (other.start, other.end);
574 core::cmp::max(s1, s2) > core::cmp::min(e1, e2)
575 }
576}
577
578/// The high-level intermediate representation for a look-around assertion.
579///
580/// An assertion match is always zero-length. Also called an "empty match."
581#[derive(Clone, Copy, Debug, Eq, PartialEq)]
582pub(crate) enum Look {
583 /// Match the beginning of text. Specifically, this matches at the starting
584 /// position of the input.
585 Start = 1 << 0,
586 /// Match the end of text. Specifically, this matches at the ending
587 /// position of the input.
588 End = 1 << 1,
589 /// Match the beginning of a line or the beginning of text. Specifically,
590 /// this matches at the starting position of the input, or at the position
591 /// immediately following a `\n` character.
592 StartLF = 1 << 2,
593 /// Match the end of a line or the end of text. Specifically, this matches
594 /// at the end position of the input, or at the position immediately
595 /// preceding a `\n` character.
596 EndLF = 1 << 3,
597 /// Match the beginning of a line or the beginning of text. Specifically,
598 /// this matches at the starting position of the input, or at the position
599 /// immediately following either a `\r` or `\n` character, but never after
600 /// a `\r` when a `\n` follows.
601 StartCRLF = 1 << 4,
602 /// Match the end of a line or the end of text. Specifically, this matches
603 /// at the end position of the input, or at the position immediately
604 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
605 /// precedes it.
606 EndCRLF = 1 << 5,
607 /// Match an ASCII-only word boundary. That is, this matches a position
608 /// where the left adjacent character and right adjacent character
609 /// correspond to a word and non-word or a non-word and word character.
610 Word = 1 << 6,
611 /// Match an ASCII-only negation of a word boundary.
612 WordNegate = 1 << 7,
613 /// Match the start of an ASCII-only word boundary. That is, this matches a
614 /// position at either the beginning of the haystack or where the previous
615 /// character is not a word character and the following character is a word
616 /// character.
617 WordStart = 1 << 8,
618 /// Match the end of an ASCII-only word boundary. That is, this matches
619 /// a position at either the end of the haystack or where the previous
620 /// character is a word character and the following character is not a word
621 /// character.
622 WordEnd = 1 << 9,
623 /// Match the start half of an ASCII-only word boundary. That is, this
624 /// matches a position at either the beginning of the haystack or where the
625 /// previous character is not a word character.
626 WordStartHalf = 1 << 10,
627 /// Match the end half of an ASCII-only word boundary. That is, this
628 /// matches a position at either the end of the haystack or where the
629 /// following character is not a word character.
630 WordEndHalf = 1 << 11,
631}
632
633impl Look {
634 /// Returns true if the given position in the given haystack matches this
635 /// look-around assertion.
636 pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool {
637 use self::Look::*;
638
639 match *self {
640 Start => at == 0,
641 End => at == haystack.len(),
642 StartLF => at == 0 || haystack[at - 1] == b'\n',
643 EndLF => at == haystack.len() || haystack[at] == b'\n',
644 StartCRLF => {
645 at == 0
646 || haystack[at - 1] == b'\n'
647 || (haystack[at - 1] == b'\r'
648 && (at >= haystack.len() || haystack[at] != b'\n'))
649 }
650 EndCRLF => {
651 at == haystack.len()
652 || haystack[at] == b'\r'
653 || (haystack[at] == b'\n'
654 && (at == 0 || haystack[at - 1] != b'\r'))
655 }
656 Word => {
657 let word_before =
658 at > 0 && utf8::is_word_byte(haystack[at - 1]);
659 let word_after =
660 at < haystack.len() && utf8::is_word_byte(haystack[at]);
661 word_before != word_after
662 }
663 WordNegate => {
664 let word_before =
665 at > 0 && utf8::is_word_byte(haystack[at - 1]);
666 let word_after =
667 at < haystack.len() && utf8::is_word_byte(haystack[at]);
668 word_before == word_after
669 }
670 WordStart => {
671 let word_before =
672 at > 0 && utf8::is_word_byte(haystack[at - 1]);
673 let word_after =
674 at < haystack.len() && utf8::is_word_byte(haystack[at]);
675 !word_before && word_after
676 }
677 WordEnd => {
678 let word_before =
679 at > 0 && utf8::is_word_byte(haystack[at - 1]);
680 let word_after =
681 at < haystack.len() && utf8::is_word_byte(haystack[at]);
682 word_before && !word_after
683 }
684 WordStartHalf => {
685 let word_before =
686 at > 0 && utf8::is_word_byte(haystack[at - 1]);
687 !word_before
688 }
689 WordEndHalf => {
690 let word_after =
691 at < haystack.len() && utf8::is_word_byte(haystack[at]);
692 !word_after
693 }
694 }
695 }
696}
697
698/// The high-level intermediate representation of a repetition operator.
699///
700/// A repetition operator permits the repetition of an arbitrary
701/// sub-expression.
702#[derive(Clone, Debug, Eq, PartialEq)]
703pub(crate) struct Repetition {
704 /// The minimum range of the repetition.
705 ///
706 /// Note that special cases like `?`, `+` and `*` all get translated into
707 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
708 ///
709 /// When `min` is zero, this expression can match the empty string
710 /// regardless of what its sub-expression is.
711 pub(crate) min: u32,
712 /// The maximum range of the repetition.
713 ///
714 /// Note that when `max` is `None`, `min` acts as a lower bound but where
715 /// there is no upper bound. For something like `x{5}` where the min and
716 /// max are equivalent, `min` will be set to `5` and `max` will be set to
717 /// `Some(5)`.
718 pub(crate) max: Option<u32>,
719 /// Whether this repetition operator is greedy or not. A greedy operator
720 /// will match as much as it can. A non-greedy operator will match as
721 /// little as it can.
722 ///
723 /// Typically, operators are greedy by default and are only non-greedy when
724 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
725 /// not. However, this can be inverted via the `U` "ungreedy" flag.
726 pub(crate) greedy: bool,
727 /// The expression being repeated.
728 pub(crate) sub: Box<Hir>,
729}
730
731/// The high-level intermediate representation for a capturing group.
732///
733/// A capturing group always has an index and a child expression. It may
734/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
735/// necessary.
736///
737/// Note that there is no explicit representation of a non-capturing group
738/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
739/// the recursive structure of the `Hir` itself.
740#[derive(Clone, Debug, Eq, PartialEq)]
741pub(crate) struct Capture {
742 /// The capture index of the capture.
743 pub(crate) index: u32,
744 /// The name of the capture, if it exists.
745 pub(crate) name: Option<Box<str>>,
746 /// The expression inside the capturing group, which may be empty.
747 pub(crate) sub: Box<Hir>,
748}
749
750fn next_char(ch: char) -> Option<char> {
751 // Skip over the surrogate range.
752 if ch == '\u{D7FF}' {
753 return Some('\u{E000}');
754 }
755 // OK because char::MAX < u32::MAX and we handle U+D7FF above.
756 char::from_u32(u32::from(ch).checked_add(1).unwrap())
757}
758
759fn prev_char(ch: char) -> Option<char> {
760 // Skip over the surrogate range.
761 if ch == '\u{E000}' {
762 return Some('\u{D7FF}');
763 }
764 // OK because subtracting 1 from any valid scalar value other than 0
765 // and U+E000 yields a valid scalar value.
766 Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap())
767}
768
769impl Drop for Hir {
770 fn drop(&mut self) {
771 use core::mem;
772
773 match *self.kind() {
774 HirKind::Empty
775 | HirKind::Char(_)
776 | HirKind::Class(_)
777 | HirKind::Look(_) => return,
778 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
779 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
780 return
781 }
782 HirKind::Concat(ref x) if x.is_empty() => return,
783 HirKind::Alternation(ref x) if x.is_empty() => return,
784 _ => {}
785 }
786
787 let mut stack = vec![mem::replace(self, Hir::empty())];
788 while let Some(mut expr) = stack.pop() {
789 match expr.kind {
790 HirKind::Empty
791 | HirKind::Char(_)
792 | HirKind::Class(_)
793 | HirKind::Look(_) => {}
794 HirKind::Capture(ref mut x) => {
795 stack.push(mem::replace(&mut x.sub, Hir::empty()));
796 }
797 HirKind::Repetition(ref mut x) => {
798 stack.push(mem::replace(&mut x.sub, Hir::empty()));
799 }
800 HirKind::Concat(ref mut x) => {
801 stack.extend(x.drain(..));
802 }
803 HirKind::Alternation(ref mut x) => {
804 stack.extend(x.drain(..));
805 }
806 }
807 }
808 }
809}