@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
1036
1036
/*----------
1037
1037
* Prepare a general-purpose state skeleton.
1038
1038
*
1039
- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040
- * / /
1041
- * [lp] ----> [s2] ----bypass---------------------
1039
+ * In the no-backrefs case, we want this:
1042
1040
*
1043
- * where bypass is an empty, and prefix is some repetitions of atom
1041
+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042
+ *
1043
+ * where prefix is some repetitions of atom. In the general case we need
1044
+ *
1045
+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046
+ *
1047
+ * where the iterator wraps around [begin] ---atom---> [end]
1048
+ *
1049
+ * We make the s state here for both cases; s2 is made below if needed
1044
1050
*----------
1045
1051
*/
1046
1052
s = newstate (v -> nfa ); /* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
1051
1057
NOERR ();
1052
1058
atom -> begin = s ;
1053
1059
atom -> end = s2 ;
1054
- s = newstate (v -> nfa ); /* and spots for prefix and bypass */
1055
- s2 = newstate (v -> nfa );
1060
+ s = newstate (v -> nfa ); /* set up starting state */
1056
1061
NOERR ();
1057
1062
EMPTYARC (lp , s );
1058
- EMPTYARC (lp , s2 );
1059
1063
NOERR ();
1060
1064
1061
1065
/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
1089
1093
}
1090
1094
1091
1095
/*
1092
- * It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093
- * with quantifiers internally. Otherwise, the first step is to turn
1094
- * x{0,...} into x{1,...}|empty
1096
+ * It's quantifier time. If the atom is just a backref, we'll let it deal
1097
+ * with quantifiers internally.
1095
1098
*/
1096
- if (m == 0 && atomtype != BACKREF )
1097
- {
1098
- EMPTYARC (s2 , atom -> end ); /* the bypass */
1099
- assert (PREF (qprefer ) != 0 );
1100
- f = COMBINE (qprefer , atom -> flags );
1101
- t = subre (v , '|' , f , lp , atom -> end );
1102
- NOERR ();
1103
- t -> left = atom ;
1104
- t -> right = subre (v , '|' , PREF (f ), s2 , atom -> end );
1105
- NOERR ();
1106
- t -> right -> left = subre (v , '=' , 0 , s2 , atom -> end );
1107
- NOERR ();
1108
- * atomp = t ;
1109
- atomp = & t -> left ;
1110
- m = 1 ;
1111
- }
1112
-
1113
- /* deal with the rest of the quantifier */
1114
1099
if (atomtype == BACKREF )
1115
1100
{
1116
1101
/* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
1120
1105
atom -> min = (short ) m ;
1121
1106
atom -> max = (short ) n ;
1122
1107
atom -> flags |= COMBINE (qprefer , atom -> flags );
1108
+ /* rest of branch can be strung starting from atom->end */
1109
+ s2 = atom -> end ;
1123
1110
}
1124
1111
else if (m == 1 && n == 1 )
1125
1112
{
1126
1113
/* no/vacuous quantifier: done */
1127
1114
EMPTYARC (s , atom -> begin ); /* empty prefix */
1115
+ /* rest of branch can be strung starting from atom->end */
1116
+ s2 = atom -> end ;
1128
1117
}
1129
- else
1118
+ else if ( m > 0 && !( atom -> flags & BACKR ))
1130
1119
{
1131
1120
/*
1132
- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133
- * second x
1121
+ * If there's no backrefs involved, we can turn x{m,n} into
1122
+ * x{m-1,n-1}x, with capturing parens in only the second x. This
1123
+ * is valid because we only care about capturing matches from the
1124
+ * final iteration of the quantifier. It's a win because we can
1125
+ * implement the backref-free left side as a plain DFA node, since
1126
+ * we don't really care where its submatches are.
1134
1127
*/
1135
1128
dupnfa (v -> nfa , atom -> begin , atom -> end , s , atom -> begin );
1136
1129
assert (m >= 1 && m != INFINITY && n >= 1 );
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
1142
1135
NOERR ();
1143
1136
t -> right = atom ;
1144
1137
* atomp = t ;
1138
+ /* rest of branch can be strung starting from atom->end */
1139
+ s2 = atom -> end ;
1140
+ }
1141
+ else
1142
+ {
1143
+ /* general case: need an iteration node */
1144
+ s2 = newstate (v -> nfa );
1145
+ NOERR ();
1146
+ moveouts (v -> nfa , atom -> end , s2 );
1147
+ NOERR ();
1148
+ dupnfa (v -> nfa , atom -> begin , atom -> end , s , s2 );
1149
+ repeat (v , s , s2 , m , n );
1150
+ f = COMBINE (qprefer , atom -> flags );
1151
+ t = subre (v , '*' , f , s , s2 );
1152
+ NOERR ();
1153
+ t -> min = (short ) m ;
1154
+ t -> max = (short ) n ;
1155
+ t -> left = atom ;
1156
+ * atomp = t ;
1157
+ /* rest of branch is to be strung from iteration's end state */
1145
1158
}
1146
1159
1147
1160
/* and finally, look after that postponed recursion */
1148
1161
t = top -> right ;
1149
1162
if (!(SEE ('|' ) || SEE (stopper ) || SEE (EOS )))
1150
- t -> right = parsebranch (v , stopper , type , atom -> end , rp , 1 );
1163
+ t -> right = parsebranch (v , stopper , type , s2 , rp , 1 );
1151
1164
else
1152
1165
{
1153
- EMPTYARC (atom -> end , rp );
1154
- t -> right = subre (v , '=' , 0 , atom -> end , rp );
1166
+ EMPTYARC (s2 , rp );
1167
+ t -> right = subre (v , '=' , 0 , s2 , rp );
1155
1168
}
1156
1169
assert (SEE ('|' ) || SEE (stopper ) || SEE (EOS ));
1157
1170
t -> flags |= COMBINE (t -> flags , t -> right -> flags );
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
1214
1227
/*
1215
1228
* repeat - replicate subNFA for quantifiers
1216
1229
*
1230
+ * The sub-NFA strung from lp to rp is modified to represent m to n
1231
+ * repetitions of its initial contents.
1232
+ *
1217
1233
* The duplication sequences used here are chosen carefully so that any
1218
1234
* pointers starting out pointing into the subexpression end up pointing into
1219
1235
* the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
1229
1245
int n )
1230
1246
{
1231
1247
#define SOME 2
1232
- #define INF 3
1248
+ #define INF 3
1233
1249
#define PAIR (x , y ) ((x)*4 + (y))
1234
1250
#define REDUCE (x ) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1235
1251
const int rm = REDUCE (m );
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
1603
1619
v -> treechain = ret ;
1604
1620
}
1605
1621
1606
- assert (strchr ("|.b(= " , op ) != NULL );
1622
+ assert (strchr ("=b|.*( " , op ) != NULL );
1607
1623
1608
1624
ret -> op = op ;
1609
1625
ret -> flags = flags ;
0 commit comments