@@ -25,6 +25,7 @@ pub(crate) struct State<'a> {
25
25
pub string_position : usize ,
26
26
popped_context : Option < MatchContext > ,
27
27
pub has_matched : Option < bool > ,
28
+ pub match_all : bool ,
28
29
}
29
30
30
31
impl < ' a > State < ' a > {
@@ -51,6 +52,7 @@ impl<'a> State<'a> {
51
52
string_position : start,
52
53
popped_context : None ,
53
54
has_matched : None ,
55
+ match_all : false ,
54
56
}
55
57
}
56
58
@@ -105,6 +107,7 @@ impl<'a> State<'a> {
105
107
string_offset : self . string . offset ( 0 , self . start ) ,
106
108
code_position : 0 ,
107
109
has_matched : None ,
110
+ toplevel : true ,
108
111
} ;
109
112
self . context_stack . push ( ctx) ;
110
113
@@ -132,6 +135,7 @@ impl<'a> State<'a> {
132
135
pub fn search ( mut self ) -> Self {
133
136
// TODO: optimize by op info and skip prefix
134
137
while self . start <= self . end {
138
+ self . match_all = false ;
135
139
self = self . pymatch ( ) ;
136
140
137
141
if self . has_matched == Some ( true ) {
@@ -232,12 +236,13 @@ impl<'a> StrDrive<'a> {
232
236
}
233
237
}
234
238
235
- #[ derive( Debug , Copy , Clone ) ]
239
+ #[ derive( Debug , Clone , Copy ) ]
236
240
struct MatchContext {
237
241
string_position : usize ,
238
242
string_offset : usize ,
239
243
code_position : usize ,
240
244
has_matched : Option < bool > ,
245
+ toplevel : bool ,
241
246
}
242
247
243
248
trait MatchContextDrive {
@@ -463,8 +468,12 @@ impl OpcodeDispatcher {
463
468
drive. ctx_mut ( ) . has_matched = Some ( false ) ;
464
469
} ) ,
465
470
SreOpcode :: SUCCESS => once ( |drive| {
466
- drive. state . string_position = drive. ctx ( ) . string_position ;
467
- drive. ctx_mut ( ) . has_matched = Some ( true ) ;
471
+ if drive. ctx ( ) . toplevel && drive. state . match_all && !drive. at_end ( ) {
472
+ drive. ctx_mut ( ) . has_matched = Some ( false ) ;
473
+ } else {
474
+ drive. state . string_position = drive. ctx ( ) . string_position ;
475
+ drive. ctx_mut ( ) . has_matched = Some ( true ) ;
476
+ }
468
477
} ) ,
469
478
SreOpcode :: ANY => once ( |drive| {
470
479
if drive. at_end ( ) || drive. at_linebreak ( ) {
@@ -482,15 +491,19 @@ impl OpcodeDispatcher {
482
491
drive. skip_char ( 1 ) ;
483
492
}
484
493
} ) ,
494
+ /* assert subpattern */
495
+ /* <ASSERT> <skip> <back> <pattern> */
485
496
SreOpcode :: ASSERT => twice (
486
497
|drive| {
487
498
let back = drive. peek_code ( 2 ) as usize ;
488
- if back > drive. ctx ( ) . string_position {
499
+ let passed = drive. ctx ( ) . string_position - drive. state . start ;
500
+ if passed < back {
489
501
drive. ctx_mut ( ) . has_matched = Some ( false ) ;
490
502
return ;
491
503
}
492
504
drive. state . string_position = drive. ctx ( ) . string_position - back;
493
505
drive. push_new_context ( 3 ) ;
506
+ drive. state . context_stack . last_mut ( ) . unwrap ( ) . toplevel = false ;
494
507
} ,
495
508
|drive| {
496
509
let child_ctx = drive. state . popped_context . unwrap ( ) ;
@@ -504,12 +517,14 @@ impl OpcodeDispatcher {
504
517
SreOpcode :: ASSERT_NOT => twice (
505
518
|drive| {
506
519
let back = drive. peek_code ( 2 ) as usize ;
507
- if back > drive. ctx ( ) . string_position {
520
+ let passed = drive. ctx ( ) . string_position - drive. state . start ;
521
+ if passed < back {
508
522
drive. skip_code ( drive. peek_code ( 1 ) as usize + 1 ) ;
509
523
return ;
510
524
}
511
525
drive. state . string_position = drive. ctx ( ) . string_position - back;
512
526
drive. push_new_context ( 3 ) ;
527
+ drive. state . context_stack . last_mut ( ) . unwrap ( ) . toplevel = false ;
513
528
} ,
514
529
|drive| {
515
530
let child_ctx = drive. state . popped_context . unwrap ( ) ;
@@ -770,17 +785,17 @@ fn charset(set: &[u32], ch: u32) -> bool {
770
785
}
771
786
SreOpcode :: CHARSET => {
772
787
/* <CHARSET> <bitmap> */
773
- let set = & set[ 1 ..] ;
788
+ let set = & set[ i + 1 ..] ;
774
789
if ch < 256 && ( ( set[ ( ch >> 5 ) as usize ] & ( 1u32 << ( ch & 31 ) ) ) != 0 ) {
775
790
return ok;
776
791
}
777
- i += 8 ;
792
+ i += 1 + 8 ;
778
793
}
779
794
SreOpcode :: BIGCHARSET => {
780
795
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
781
796
let count = set[ i + 1 ] as usize ;
782
797
if ch < 0x10000 {
783
- let set = & set[ 2 ..] ;
798
+ let set = & set[ i + 2 ..] ;
784
799
let block_index = ch >> 8 ;
785
800
let ( _, blockindices, _) = unsafe { set. align_to :: < u8 > ( ) } ;
786
801
let blocks = & set[ 64 ..] ;
@@ -1085,6 +1100,7 @@ struct OpMinRepeatOne {
1085
1100
count : usize ,
1086
1101
}
1087
1102
impl OpcodeExecutor for OpMinRepeatOne {
1103
+ /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1088
1104
fn next ( & mut self , drive : & mut StackDrive ) -> Option < ( ) > {
1089
1105
match self . jump_id {
1090
1106
0 => {
@@ -1110,7 +1126,11 @@ impl OpcodeExecutor for OpMinRepeatOne {
1110
1126
count
1111
1127
} ;
1112
1128
1113
- if drive. peek_code ( drive. peek_code ( 1 ) as usize + 1 ) == SreOpcode :: SUCCESS as u32 {
1129
+ let next_code = drive. peek_code ( drive. peek_code ( 1 ) as usize + 1 ) ;
1130
+ if next_code == SreOpcode :: SUCCESS as u32
1131
+ && !( drive. ctx ( ) . toplevel && drive. state . match_all && !drive. at_end ( ) )
1132
+ {
1133
+ // tail is empty. we're finished
1114
1134
drive. state . string_position = drive. ctx ( ) . string_position ;
1115
1135
drive. ctx_mut ( ) . has_matched = Some ( true ) ;
1116
1136
return None ;
@@ -1377,9 +1397,18 @@ struct OpRepeatOne {
1377
1397
jump_id : usize ,
1378
1398
mincount : usize ,
1379
1399
maxcount : usize ,
1380
- count : isize ,
1400
+ count : usize ,
1401
+ following_literal : Option < u32 > ,
1381
1402
}
1382
1403
impl OpcodeExecutor for OpRepeatOne {
1404
+ /* match repeated sequence (maximizing regexp) */
1405
+
1406
+ /* this operator only works if the repeated item is
1407
+ exactly one character wide, and we're not already
1408
+ collecting backtracking points. for other cases,
1409
+ use the MAX_REPEAT operator */
1410
+
1411
+ /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1383
1412
fn next ( & mut self , drive : & mut StackDrive ) -> Option < ( ) > {
1384
1413
match self . jump_id {
1385
1414
0 => {
@@ -1388,62 +1417,76 @@ impl OpcodeExecutor for OpRepeatOne {
1388
1417
1389
1418
if drive. remaining_chars ( ) < self . mincount {
1390
1419
drive. ctx_mut ( ) . has_matched = Some ( false ) ;
1420
+ return None ;
1391
1421
}
1422
+
1392
1423
drive. state . string_position = drive. ctx ( ) . string_position ;
1393
- self . count = count ( drive, self . maxcount ) as isize ;
1394
- drive. skip_char ( self . count as usize ) ;
1395
- if self . count < self . mincount as isize {
1424
+
1425
+ self . count = count ( drive, self . maxcount ) ;
1426
+ drive. skip_char ( self . count ) ;
1427
+ if self . count < self . mincount {
1396
1428
drive. ctx_mut ( ) . has_matched = Some ( false ) ;
1397
1429
return None ;
1398
1430
}
1399
1431
1400
1432
let next_code = drive. peek_code ( drive. peek_code ( 1 ) as usize + 1 ) ;
1401
- if next_code == SreOpcode :: SUCCESS as u32 {
1433
+ if next_code == SreOpcode :: SUCCESS as u32 && drive. at_end ( ) && !drive. ctx ( ) . toplevel
1434
+ {
1402
1435
// tail is empty. we're finished
1403
1436
drive. state . string_position = drive. ctx ( ) . string_position ;
1404
1437
drive. ctx_mut ( ) . has_matched = Some ( true ) ;
1405
1438
return None ;
1406
1439
}
1407
1440
1408
1441
drive. state . marks_push ( ) ;
1409
- // TODO:
1442
+
1410
1443
// Special case: Tail starts with a literal. Skip positions where
1411
1444
// the rest of the pattern cannot possibly match.
1445
+ if next_code == SreOpcode :: LITERAL as u32 {
1446
+ self . following_literal = Some ( drive. peek_code ( drive. peek_code ( 1 ) as usize + 2 ) )
1447
+ }
1448
+
1412
1449
self . jump_id = 1 ;
1413
1450
self . next ( drive)
1414
1451
}
1415
1452
1 => {
1416
- // General case: backtracking
1417
- if self . count >= self . mincount as isize {
1418
- drive. state . string_position = drive. ctx ( ) . string_position ;
1419
- drive. push_new_context ( drive. peek_code ( 1 ) as usize + 1 ) ;
1420
- self . jump_id = 2 ;
1421
- return Some ( ( ) ) ;
1453
+ if let Some ( c) = self . following_literal {
1454
+ while drive. at_end ( ) || drive. peek_char ( ) != c {
1455
+ if self . count <= self . mincount {
1456
+ drive. state . marks_pop_discard ( ) ;
1457
+ drive. ctx_mut ( ) . has_matched = Some ( false ) ;
1458
+ return None ;
1459
+ }
1460
+ drive. back_skip_char ( 1 ) ;
1461
+ self . count -= 1 ;
1462
+ }
1422
1463
}
1423
1464
1424
- drive. state . marks_pop_discard ( ) ;
1425
- drive. ctx_mut ( ) . has_matched = Some ( false ) ;
1426
- None
1465
+ // General case: backtracking
1466
+ drive. state . string_position = drive. ctx ( ) . string_position ;
1467
+ drive. push_new_context ( drive. peek_code ( 1 ) as usize + 1 ) ;
1468
+ self . jump_id = 2 ;
1469
+ Some ( ( ) )
1427
1470
}
1428
1471
2 => {
1429
1472
let child_ctx = drive. state . popped_context . unwrap ( ) ;
1430
1473
if child_ctx. has_matched == Some ( true ) {
1431
1474
drive. ctx_mut ( ) . has_matched = Some ( true ) ;
1432
1475
return None ;
1433
1476
}
1434
- if self . count <= self . mincount as isize {
1477
+ if self . count <= self . mincount {
1435
1478
drive. state . marks_pop_discard ( ) ;
1436
1479
drive. ctx_mut ( ) . has_matched = Some ( false ) ;
1437
1480
return None ;
1438
1481
}
1439
1482
1440
- // TODO: unnesscary double check
1441
1483
drive. back_skip_char ( 1 ) ;
1442
1484
self . count -= 1 ;
1485
+
1443
1486
drive. state . marks_pop_keep ( ) ;
1444
1487
1445
1488
self . jump_id = 1 ;
1446
- Some ( ( ) )
1489
+ self . next ( drive )
1447
1490
}
1448
1491
_ => unreachable ! ( ) ,
1449
1492
}
0 commit comments