updated for version 7.3.1137

Problem:    New regexp engine: collections are slow.
Solution:   Handle all characters in one go.
author: Bram Moolenaar <Bram@vim.org> 2013-06-07 14:08:30 +0200
committer: Bram Moolenaar <Bram@vim.org> 2013-06-07 14:08:30 +0200
commit: 417bad227ae3c6c930dc088d598f65205cce13d9 (patch)
tree: 9dc33f76958229f64acbf77bc9666e458c8b68e7 /src
parent: 203d04d76413fb46ada3feb73144f3172b1a9e05 (diff)
download: vim-417bad227ae3c6c930dc088d598f65205cce13d9.zip
2 files changed, 254 insertions, 168 deletions
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index 42030ac0b..254b3d931 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -34,15 +34,23 @@ enum
     NFA_SPLIT = -1024,
     NFA_MATCH,
     NFA_SKIP_CHAR,		    /* matches a 0-length char */
-    NFA_END_NEG_RANGE,		    /* Used when expanding [^ab] */
 
-    NFA_CONCAT,
+    NFA_START_COLL,		    /* [abc] start */
+    NFA_END_COLL,		    /* [abc] end */
+    NFA_START_NEG_COLL,		    /* [^abc] start */
+    NFA_END_NEG_COLL,		    /* [^abc] end (only used in postfix) */
+    NFA_RANGE,			    /* range of the two previous items (only
+				     * used in postfix) */
+    NFA_RANGE_MIN,		    /* low end of a range  */
+    NFA_RANGE_MAX,		    /* high end of a range  */
+
+    NFA_CONCAT,			    /* concatenate two previous items (only
+				     * used in postfix) */
     NFA_OR,
     NFA_STAR,			    /* greedy * */
     NFA_STAR_NONGREEDY,		    /* non-greedy * */
     NFA_QUEST,			    /* greedy \? */
     NFA_QUEST_NONGREEDY,	    /* non-greedy \? */
-    NFA_NOT,			    /* used for [^ab] negated char ranges */
 
     NFA_BOL,			    /* ^    Begin line */
     NFA_EOL,			    /* $    End line */
@@ -260,7 +268,7 @@ static int nfa_regcomp_start __ARGS((char_u *expr, int re_flags));
 static int nfa_get_reganch __ARGS((nfa_state_T *start, int depth));
 static int nfa_get_regstart __ARGS((nfa_state_T *start, int depth));
 static int nfa_recognize_char_class __ARGS((char_u *start, char_u *end, int extra_newl));
-static int nfa_emit_equi_class __ARGS((int c, int neg));
+static int nfa_emit_equi_class __ARGS((int c));
 static int nfa_regatom __ARGS((void));
 static int nfa_regpiece __ARGS((void));
 static int nfa_regconcat __ARGS((void));
@@ -664,21 +672,10 @@ nfa_recognize_char_class(start, end, extra_newl)
  * NOTE! When changing this function, also update reg_equi_class()
  */
     static int
-nfa_emit_equi_class(c, neg)
+nfa_emit_equi_class(c)
     int	    c;
-    int	    neg;
 {
-    int	first = TRUE;
-    int	glue = neg == TRUE ? NFA_CONCAT : NFA_OR;
-#define EMIT2(c)		\
-	EMIT(c);		\
-	if (neg == TRUE) {	\
-	    EMIT(NFA_NOT);	\
-	}			\
-	if (first == FALSE)	\
-	    EMIT(glue);		\
-	else			\
-	    first = FALSE;	\
+#define EMIT2(c)   EMIT(c); EMIT(NFA_CONCAT);
 
 #ifdef FEAT_MBYTE
     if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
@@ -687,84 +684,84 @@ nfa_emit_equi_class(c, neg)
     {
 	switch (c)
 	{
-	    case 'A': case '\300': case '\301': case '\302':
-	    case '\303': case '\304': case '\305':
-		    EMIT2('A');	    EMIT2('\300');  EMIT2('\301');
-		    EMIT2('\302');  EMIT2('\303');  EMIT2('\304');
-		    EMIT2('\305');
+	    case 'A': case 0300: case 0301: case 0302:
+	    case 0303: case 0304: case 0305:
+		    EMIT2('A');	    EMIT2(0300);  EMIT2(0301);
+		    EMIT2(0302);  EMIT2(0303);  EMIT2(0304);
+		    EMIT2(0305);
 		    return OK;
 
-	    case 'C': case '\307':
-		    EMIT2('C');	    EMIT2('\307');
+	    case 'C': case 0307:
+		    EMIT2('C');	    EMIT2(0307);
 		    return OK;
 
-	    case 'E': case '\310': case '\311': case '\312': case '\313':
-		    EMIT2('E');	    EMIT2('\310');  EMIT2('\311');
-		    EMIT2('\312');  EMIT2('\313');
+	    case 'E': case 0310: case 0311: case 0312: case 0313:
+		    EMIT2('E');	    EMIT2(0310);  EMIT2(0311);
+		    EMIT2(0312);  EMIT2(0313);
 		    return OK;
 
-	    case 'I': case '\314': case '\315': case '\316': case '\317':
-		    EMIT2('I');	    EMIT2('\314');  EMIT2('\315');
-		    EMIT2('\316');  EMIT2('\317');
+	    case 'I': case 0314: case 0315: case 0316: case 0317:
+		    EMIT2('I');	    EMIT2(0314);  EMIT2(0315);
+		    EMIT2(0316);  EMIT2(0317);
 		    return OK;
 
-	    case 'N': case '\321':
-		    EMIT2('N');	    EMIT2('\321');
+	    case 'N': case 0321:
+		    EMIT2('N');	    EMIT2(0321);
 		    return OK;
 
-	    case 'O': case '\322': case '\323': case '\324': case '\325':
-	    case '\326':
-		    EMIT2('O');	    EMIT2('\322');  EMIT2('\323');
-		    EMIT2('\324');  EMIT2('\325');  EMIT2('\326');
+	    case 'O': case 0322: case 0323: case 0324: case 0325:
+	    case 0326:
+		    EMIT2('O');	    EMIT2(0322);  EMIT2(0323);
+		    EMIT2(0324);  EMIT2(0325);  EMIT2(0326);
 		    return OK;
 
-	    case 'U': case '\331': case '\332': case '\333': case '\334':
-		    EMIT2('U');	    EMIT2('\331');  EMIT2('\332');
-		    EMIT2('\333');  EMIT2('\334');
+	    case 'U': case 0331: case 0332: case 0333: case 0334:
+		    EMIT2('U');	    EMIT2(0331);  EMIT2(0332);
+		    EMIT2(0333);  EMIT2(0334);
 		    return OK;
 
-	    case 'Y': case '\335':
-		    EMIT2('Y');	    EMIT2('\335');
+	    case 'Y': case 0335:
+		    EMIT2('Y');	    EMIT2(0335);
 		    return OK;
 
-	    case 'a': case '\340': case '\341': case '\342':
-	    case '\343': case '\344': case '\345':
-		    EMIT2('a');	    EMIT2('\340');  EMIT2('\341');
-		    EMIT2('\342');  EMIT2('\343');  EMIT2('\344');
-		    EMIT2('\345');
+	    case 'a': case 0340: case 0341: case 0342:
+	    case 0343: case 0344: case 0345:
+		    EMIT2('a');	    EMIT2(0340);  EMIT2(0341);
+		    EMIT2(0342);  EMIT2(0343);  EMIT2(0344);
+		    EMIT2(0345);
 		    return OK;
 
-	    case 'c': case '\347':
-		    EMIT2('c');	    EMIT2('\347');
+	    case 'c': case 0347:
+		    EMIT2('c');	    EMIT2(0347);
 		    return OK;
 
-	    case 'e': case '\350': case '\351': case '\352': case '\353':
-		    EMIT2('e');	    EMIT2('\350');  EMIT2('\351');
-		    EMIT2('\352');  EMIT2('\353');
+	    case 'e': case 0350: case 0351: case 0352: case 0353:
+		    EMIT2('e');	    EMIT2(0350);  EMIT2(0351);
+		    EMIT2(0352);  EMIT2(0353);
 		    return OK;
 
-	    case 'i': case '\354': case '\355': case '\356': case '\357':
-		    EMIT2('i');	    EMIT2('\354');  EMIT2('\355');
-		    EMIT2('\356');  EMIT2('\357');
+	    case 'i': case 0354: case 0355: case 0356: case 0357:
+		    EMIT2('i');	    EMIT2(0354);  EMIT2(0355);
+		    EMIT2(0356);  EMIT2(0357);
 		    return OK;
 
-	    case 'n': case '\361':
-		    EMIT2('n');	    EMIT2('\361');
+	    case 'n': case 0361:
+		    EMIT2('n');	    EMIT2(0361);
 		    return OK;
 
-	    case 'o': case '\362': case '\363': case '\364': case '\365':
-	    case '\366':
-		    EMIT2('o');	    EMIT2('\362');  EMIT2('\363');
-		    EMIT2('\364');  EMIT2('\365');  EMIT2('\366');
+	    case 'o': case 0362: case 0363: case 0364: case 0365:
+	    case 0366:
+		    EMIT2('o');	    EMIT2(0362);  EMIT2(0363);
+		    EMIT2(0364);  EMIT2(0365);  EMIT2(0366);
 		    return OK;
 
-	    case 'u': case '\371': case '\372': case '\373': case '\374':
-		    EMIT2('u');	    EMIT2('\371');  EMIT2('\372');
-		    EMIT2('\373');  EMIT2('\374');
+	    case 'u': case 0371: case 0372: case 0373: case 0374:
+		    EMIT2('u');	    EMIT2(0371);  EMIT2(0372);
+		    EMIT2(0373);  EMIT2(0374);
 		    return OK;
 
-	    case 'y': case '\375': case '\377':
-		    EMIT2('y');	    EMIT2('\375');  EMIT2('\377');
+	    case 'y': case 0375: case 0377:
+		    EMIT2('y');	    EMIT2(0375);  EMIT2(0377);
 		    return OK;
 
 	    default:
@@ -811,14 +808,12 @@ nfa_regatom()
     char_u	*old_regparse = regparse;
 #endif
     int		extra = 0;
-    int		first;
     int		emit_range;
     int		negated;
     int		result;
     int		startc = -1;
     int		endc = -1;
     int		oldstartc = -1;
-    int		glue;		/* ID that will "glue" nodes together */
 
     c = getchr();
     switch (c)
@@ -927,8 +922,8 @@ nfa_regatom()
 
 	case Magic('n'):
 	    if (reg_string)
-	    /* In a string "\n" matches a newline character. */
-	    EMIT(NL);
+		/* In a string "\n" matches a newline character. */
+		EMIT(NL);
 	    else
 	    {
 		/* In buffer text "\n" matches the end of a line. */
@@ -1160,32 +1155,15 @@ nfa_regatom()
 	case Magic('['):
 collection:
 	    /*
-	     * Glue is emitted between several atoms from the [].
-	     * It is either NFA_OR, or NFA_CONCAT.
-	     *
-	     * [abc] expands to 'a b NFA_OR c NFA_OR' (in postfix notation)
-	     * [^abc] expands to 'a NFA_NOT b NFA_NOT NFA_CONCAT c NFA_NOT
-	     *		NFA_CONCAT NFA_END_NEG_RANGE NFA_CONCAT' (in postfix
-	     *		notation)
-	     *
+	     * [abc]  uses NFA_START_COLL - NFA_END_COLL
+	     * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
+	     * Each character is produced as a regular state, using
+	     * NFA_CONCAT to bind them together.
+	     * Besides normal characters there can be:
+	     * - character classes  NFA_CLASS_*
+	     * - ranges, two characters followed by NFA_RANGE.
 	     */
 
-
-/* Emit negation atoms, if needed.
- * The CONCAT below merges the NOT with the previous node. */
-#define TRY_NEG()		    \
-	    if (negated == TRUE)    \
-	    {			    \
-		EMIT(NFA_NOT);	    \
-	    }
-
-/* Emit glue between important nodes : CONCAT or OR. */
-#define EMIT_GLUE()		    \
-	    if (first == FALSE)	    \
-		EMIT(glue);	    \
-	    else		    \
-		first = FALSE;
-
 	    p = regparse;
 	    endp = skip_anyof(p);
 	    if (*endp == ']')
@@ -1216,21 +1194,20 @@ collection:
 		 * version that turns [abc] into 'a' OR 'b' OR 'c'
 		 */
 		startc = endc = oldstartc = -1;
-		first = TRUE;	    /* Emitting first atom in this sequence? */
 		negated = FALSE;
-		glue = NFA_OR;
 		if (*regparse == '^')			/* negated range */
 		{
 		    negated = TRUE;
-		    glue = NFA_CONCAT;
 		    mb_ptr_adv(regparse);
+		    EMIT(NFA_START_NEG_COLL);
 		}
+		else
+		    EMIT(NFA_START_COLL);
 		if (*regparse == '-')
 		{
 		    startc = '-';
 		    EMIT(startc);
-		    TRY_NEG();
-		    EMIT_GLUE();
+		    EMIT(NFA_CONCAT);
 		    mb_ptr_adv(regparse);
 		}
 		/* Emit the OR branches for each character in the [] */
@@ -1306,20 +1283,18 @@ collection:
 				    EMIT(NFA_CLASS_ESCAPE);
 				    break;
 			    }
-			    TRY_NEG();
-			    EMIT_GLUE();
+			    EMIT(NFA_CONCAT);
 			    continue;
 			}
 			/* Try equivalence class [=a=] and the like */
 			if (equiclass != 0)
 			{
-			    result = nfa_emit_equi_class(equiclass, negated);
+			    result = nfa_emit_equi_class(equiclass);
 			    if (result == FAIL)
 			    {
 				/* should never happen */
 				EMSG_RET_FAIL(_("E868: Error building NFA with equivalence class!"));
 			    }
-			    EMIT_GLUE();
 			    continue;
 			}
 			/* Try collating class like [. .]  */
@@ -1391,19 +1366,32 @@ collection:
 			startc = oldstartc;
 			if (startc > endc)
 			    EMSG_RET_FAIL(_(e_invrange));
+
+			if (endc > startc + 2)
+			{
+			    /* Emit a range instead of the sequence of
+			     * individual characters. */
+			    if (startc == 0)
+				/* \x00 is translated to \x0a, start at \x01. */
+				EMIT(1);
+			    else
+				--post_ptr; /* remove NFA_CONCAT */
+			    EMIT(endc);
+			    EMIT(NFA_RANGE);
+			    EMIT(NFA_CONCAT);
+			}
+			else
 #ifdef FEAT_MBYTE
-			if (has_mbyte && ((*mb_char2len)(startc) > 1
+			     if (has_mbyte && ((*mb_char2len)(startc) > 1
 				    || (*mb_char2len)(endc) > 1))
 			{
-			    if (endc > startc + 256)
-				EMSG_RET_FAIL(_(e_invrange));
-			    /* Emit the range. "startc" was already emitted, so
-			     * skip it. */
+			    /* Emit the characters in the range.
+			     * "startc" was already emitted, so skip it.
+			     * */
 			    for (c = startc + 1; c <= endc; c++)
 			    {
 				EMIT(c);
-				TRY_NEG();
-				EMIT_GLUE();
+				EMIT(NFA_CONCAT);
 			    }
 			}
 			else
@@ -1425,8 +1413,7 @@ collection:
 #endif
 				{
 				    EMIT(c);
-				    TRY_NEG();
-				    EMIT_GLUE();
+				    EMIT(NFA_CONCAT);
 				}
 			}
 			emit_range = FALSE;
@@ -1434,23 +1421,29 @@ collection:
 		    }
 		    else
 		    {
-			/*
-			 * This char (startc) is not part of a range. Just
+			/* This char (startc) is not part of a range. Just
 			 * emit it.
-			 *
 			 * Normally, simply emit startc. But if we get char
 			 * code=0 from a collating char, then replace it with
 			 * 0x0a.
-			 *
 			 * This is needed to completely mimic the behaviour of
-			 * the backtracking engine.
-			 */
-			if (got_coll_char == TRUE && startc == 0)
-			    EMIT(0x0a);
+			 * the backtracking engine. */
+			if (startc == NFA_NEWL)
+			{
+			    /* Line break can't be matched as part of the
+			     * collection, add an OR below. But not for negated
+			     * range. */
+			    if (!negated)
+				extra = ADD_NL;
+			}
 			else
-			    EMIT(startc);
-			TRY_NEG();
-			EMIT_GLUE();
+			{
+			    if (got_coll_char == TRUE && startc == 0)
+				EMIT(0x0a);
+			    else
+				EMIT(startc);
+			    EMIT(NFA_CONCAT);
+			}
 		    }
 
 		    mb_ptr_adv(regparse);
@@ -1460,20 +1453,19 @@ collection:
 		if (*regparse == '-')	    /* if last, '-' is just a char */
 		{
 		    EMIT('-');
-		    TRY_NEG();
-		    EMIT_GLUE();
+		    EMIT(NFA_CONCAT);
 		}
 		mb_ptr_adv(regparse);
 
 		/* skip the trailing ] */
 		regparse = endp;
 		mb_ptr_adv(regparse);
+
+		/* Mark end of the collection. */
 		if (negated == TRUE)
-		{
-		    /* Mark end of negated char range */
-		    EMIT(NFA_END_NEG_RANGE);
-		    EMIT(NFA_CONCAT);
-		}
+		    EMIT(NFA_END_NEG_COLL);
+		else
+		    EMIT(NFA_END_COLL);
 
 		/* \_[] also matches \n but it's not negated */
 		if (extra == ADD_NL)
@@ -1532,9 +1524,6 @@ nfa_do_multibyte:
 	    }
     }
 
-#undef TRY_NEG
-#undef EMIT_GLUE
-
     return OK;
 }
 
@@ -2091,10 +2080,17 @@ nfa_set_code(c)
 	case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
 	case NFA_QUEST:		STRCPY(code, "NFA_QUEST"); break;
 	case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
-	case NFA_NOT:		STRCPY(code, "NFA_NOT "); break;
 	case NFA_SKIP_CHAR:	STRCPY(code, "NFA_SKIP_CHAR"); break;
 	case NFA_OR:		STRCPY(code, "NFA_OR"); break;
-	case NFA_END_NEG_RANGE:	STRCPY(code, "NFA_END_NEG_RANGE"); break;
+
+	case NFA_START_COLL:	STRCPY(code, "NFA_START_COLL"); break;
+	case NFA_END_COLL:	STRCPY(code, "NFA_END_COLL"); break;
+	case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
+	case NFA_END_NEG_COLL:	STRCPY(code, "NFA_END_NEG_COLL"); break;
+	case NFA_RANGE:		STRCPY(code, "NFA_RANGE"); break;
+	case NFA_RANGE_MIN:	STRCPY(code, "NFA_RANGE_MIN"); break;
+	case NFA_RANGE_MAX:	STRCPY(code, "NFA_RANGE_MAX"); break;
+
 	case NFA_CLASS_ALNUM:	STRCPY(code, "NFA_CLASS_ALNUM"); break;
 	case NFA_CLASS_ALPHA:	STRCPY(code, "NFA_CLASS_ALPHA"); break;
 	case NFA_CLASS_BLANK:	STRCPY(code, "NFA_CLASS_BLANK"); break;
@@ -2231,8 +2227,12 @@ nfa_print_state2(debugf, state, indent)
 	fprintf(debugf, " %s", p);
 
     nfa_set_code(state->c);
-    fprintf(debugf, "%s%s (%d) (id=%d)\n",
-		 state->negated ? "NOT " : "", code, state->c, abs(state->id));
+    fprintf(debugf, "%s%s (%d) (id=%d) val=%d\n",
+		 state->negated ? "NOT " : "",
+		 code,
+		 state->c,
+		 abs(state->id),
+		 state->val);
     if (state->id < 0)
 	return;
 
@@ -2325,6 +2325,7 @@ alloc_state(c, out, out1)
     s->c    = c;
     s->out  = out;
     s->out1 = out1;
+    s->val  = 0;
 
     s->id   = istate;
     s->lastlist[0] = 0;
@@ -2565,13 +2566,10 @@ post2nfa(postfix, end, nfa_calc_size)
 	switch (*p)
 	{
 	case NFA_CONCAT:
-	    /* Catenation.
-	     * Pay attention: this operator does not exist
-	     * in the r.e. itself (it is implicit, really).
-	     * It is added when r.e. is translated to postfix
-	     * form in re2post().
-	     *
-	     * No new state added here. */
+	    /* Concatenation.
+	     * Pay attention: this operator does not exist in the r.e. itself
+	     * (it is implicit, really).  It is added when r.e. is translated
+	     * to postfix form in re2post(). */
 	    if (nfa_calc_size == TRUE)
 	    {
 		/* nstate += 0; */
@@ -2583,22 +2581,6 @@ post2nfa(postfix, end, nfa_calc_size)
 	    PUSH(frag(e1.start, e2.out));
 	    break;
 
-	case NFA_NOT:
-	    /* Negation of a character */
-	    if (nfa_calc_size == TRUE)
-	    {
-		/* nstate += 0; */
-		break;
-	    }
-	    e1 = POP();
-	    e1.start->negated = TRUE;
-#ifdef FEAT_MBYTE
-	    if (e1.start->c == NFA_COMPOSING)
-		e1.start->out1->negated = TRUE;
-#endif
-	    PUSH(e1);
-	    break;
-
 	case NFA_OR:
 	    /* Alternation */
 	    if (nfa_calc_size == TRUE)
@@ -2672,6 +2654,43 @@ post2nfa(postfix, end, nfa_calc_size)
 	    PUSH(frag(s, append(e.out, list1(&s->out))));
 	    break;
 
+	case NFA_END_COLL:
+	case NFA_END_NEG_COLL:
+	    /* On the stack is the sequence starting with NFA_START_COLL or
+	     * NFA_START_NEG_COLL and all possible characters. Patch it to
+	     * add the output to the start. */
+	    if (nfa_calc_size == TRUE)
+	    {
+		nstate++;
+		break;
+	    }
+	    e = POP();
+	    s = alloc_state(NFA_END_COLL, NULL, NULL);
+	    if (s == NULL)
+		goto theend;
+	    patch(e.out, s);
+	    e.start->out1 = s;
+	    PUSH(frag(e.start, list1(&s->out)));
+	    break;
+
+	case NFA_RANGE:
+	    /* Before this are two characters, the low and high end of a
+	     * range.  Turn them into two states with MIN and MAX. */
+	    if (nfa_calc_size == TRUE)
+	    {
+		/* nstate += 0; */
+		break;
+	    }
+	    e2 = POP();
+	    e1 = POP();
+	    e2.start->val = e2.start->c;
+	    e2.start->c = NFA_RANGE_MAX;
+	    e1.start->val = e1.start->c;
+	    e1.start->c = NFA_RANGE_MIN;
+	    patch(e1.out, e2.start);
+	    PUSH(frag(e1.start, e2.out));
+	    break;
+
 	case NFA_SKIP_CHAR:
 	    /* Symbol of 0-length, Used in a repetition
 	     * with max/min count of 0 */
@@ -2990,6 +3009,8 @@ post2nfa(postfix, end, nfa_calc_size)
     matchstate = &state_ptr[istate++]; /* the match state */
     matchstate->c = NFA_MATCH;
     matchstate->out = matchstate->out1 = NULL;
+    matchstate->negated = FALSE;
+    matchstate->id = 0;
 
     patch(e.out, matchstate);
     ret = e.start;
@@ -3308,7 +3329,6 @@ addstate(l, state, subs, off)
     switch (state->c)
     {
 	case NFA_SPLIT:
-	case NFA_NOT:
 	case NFA_NOPEN:
 	case NFA_SKIP_CHAR:
 	case NFA_NCLOSE:
@@ -3782,7 +3802,8 @@ check_char_class(class, c)
 
 	default:
 	    /* should not be here :P */
-	    EMSG_RET_FAIL(_("E877: (NFA regexp) Invalid character class "));
+	    EMSGN("E877: (NFA regexp) Invalid character class: %ld", class);
+	    return FAIL;
     }
     return FAIL;
 }
@@ -4320,8 +4341,8 @@ nfa_regmatch(prog, start, submatch, m)
     addstate(thislist, start, m, 0);
 
     /* There are two cases when the NFA advances: 1. input char matches the
-     * NFA node and 2. input char does not match the NFA node, but the next
-     * node is NFA_NOT. The following macro calls addstate() according to
+     * NFA node and 2. input char does not match the NFA node and the state
+     * has the negated flag. The following macro calls addstate() according to
      * these rules. It is used A LOT, so use the "listtbl" table for speed */
     listtbl[0][0] = NULL;
     listtbl[0][1] = neglist;
@@ -4845,16 +4866,79 @@ nfa_regmatch(prog, start, submatch, m)
 		ADD_POS_NEG_STATE(t->state);
 		break;
 
-	    case NFA_END_NEG_RANGE:
-		/* This follows a series of negated nodes, like:
-		 * NOT CHAR(x), NOT CHAR(y), etc. */
-		if (curc > 0)
+	    case NFA_START_COLL:
+	    case NFA_START_NEG_COLL:
+	      {
+		/* What follows is a list of characters, until NFA_END_COLL.
+		 * One of them must match or none of them must match. */
+		nfa_state_T	*state;
+		int		result_if_matched;
+		int		c1, c2;
+
+		/* Never match EOL. If it's part of the collection it is added
+		 * as a separate state with an OR. */
+		if (curc == NUL)
+		    break;
+
+		state = t->state->out;
+		result_if_matched = (t->state->c == NFA_START_COLL);
+		for (;;)
 		{
+		    if (state->c == NFA_END_COLL)
+		    {
+			result = !result_if_matched;
+			break;
+		    }
+		    if (state->c == NFA_RANGE_MIN)
+		    {
+			c1 = state->val;
+			state = state->out; /* advance to NFA_RANGE_MAX */
+			c2 = state->val;
+#ifdef ENABLE_LOG
+			fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
+				curc, c1, c2);
+#endif
+			if (curc >= c1 && curc <= c2)
+			{
+			    result = result_if_matched;
+			    break;
+			}
+			if (ireg_ic)
+			{
+			    int curc_low = MB_TOLOWER(curc);
+			    int done = FALSE;
+
+			    for ( ; c1 <= c2; ++c1)
+				if (MB_TOLOWER(c1) == curc_low)
+				{
+				    result = result_if_matched;
+				    done = TRUE;
+				    break;
+				}
+			    if (done)
+				break;
+			}
+		    }
+		    else if (state->c < 0 ? check_char_class(state->c, curc)
+			        : (curc == state->c
+				   || (ireg_ic && MB_TOLOWER(curc)
+						    == MB_TOLOWER(state->c))))
+		    {
+			result = result_if_matched;
+			break;
+		    }
+		    state = state->out;
+		}
+		if (result)
+		{
+		    /* next state is in out of the NFA_END_COLL, out1 of
+		     * START points to the END state */
 		    ll = nextlist;
-		    add_state = t->state->out;
+		    add_state = t->state->out1->out;
 		    add_off = clen;
 		}
 		break;
+	      }
 
 	    case NFA_ANY:
 		/* Any char except '\0', (end of input) does not match. */
diff --git a/src/version.c b/src/version.c
index 76bdd6cf8..51887f4ae 100644
--- a/src/version.c
+++ b/src/version.c
@@ -729,6 +729,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    1137,
+/**/
     1136,
 /**/
     1135,
author	Bram Moolenaar <Bram@vim.org>	2013-06-07 14:08:30 +0200
committer	Bram Moolenaar <Bram@vim.org>	2013-06-07 14:08:30 +0200
commit	417bad227ae3c6c930dc088d598f65205cce13d9 (patch)
tree	9dc33f76958229f64acbf77bc9666e458c8b68e7 /src
parent	203d04d76413fb46ada3feb73144f3172b1a9e05 (diff)
download	vim-417bad227ae3c6c930dc088d598f65205cce13d9.zip