updated for version 7.0016

author: Bram Moolenaar <Bram@vim.org> 2004-09-13 20:26:32 +0000
committer: Bram Moolenaar <Bram@vim.org> 2004-09-13 20:26:32 +0000
commit: c0197e2815208269fa9ba2fba95230138ec39ceb (patch)
tree: 21db1c3acd16fb095a8e34ce2e15ed87275cbd79 /src/regexp.c
parent: 15d0a8c77dad867b69822e2fd8f9f6bbcf765c48 (diff)
download: vim-c0197e2815208269fa9ba2fba95230138ec39ceb.zip
1 files changed, 183 insertions, 3 deletions
diff --git a/src/regexp.c b/src/regexp.c
index 9715a1ea5..dac34a2b3 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -33,8 +33,8 @@
  * precedence is structured in regular expressions.  Serious changes in
  * regular-expression syntax might require a total rethink.
  *
- * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb
- * and Bram Moolenaar.
+ * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
+ * Webb, Ciaran McCreesh and Bram Moolenaar.
  * Named character class support added by Walter Briscoe (1998 Jul 01)
  */
 
@@ -376,9 +376,14 @@ static char_u		*reg_prev_sub;
  *  \t	- Tab (TAB).
  *  \e	- Escape (ESC).
  *  \b	- Backspace (Ctrl_H).
+ *  \d	- Character code in decimal, eg \d123
+ *  \o	- Character code in octal, eg \o40
+ *  \x	- Character code in hex, eg \x4a
+ *  \u	- Multibyte character code, eg \u20ac
+ *  \U	- Long multibyte character code, eg \U12345678
  */
 static char_u REGEXP_INRANGE[] = "]^-n\\";
-static char_u REGEXP_ABBR[] = "nrteb";
+static char_u REGEXP_ABBR[] = "nrtebdoxuU";
 
 static int	backslash_trans __ARGS((int c));
 static int	skip_class_name __ARGS((char_u **pp));
@@ -681,6 +686,10 @@ static void	skipchr_keepstart __ARGS((void));
 static int	peekchr __ARGS((void));
 static void	skipchr __ARGS((void));
 static void	ungetchr __ARGS((void));
+static int	gethexchrs __ARGS((int maxinputlen));
+static int	getoctchrs __ARGS((void));
+static int	getdecchrs __ARGS((void));
+static int	coll_get_char __ARGS((void));
 static void	regcomp_start __ARGS((char_u *expr, int flags));
 static char_u	*reg __ARGS((int, int *));
 static char_u	*regbranch __ARGS((int *flagp));
@@ -1722,6 +1731,42 @@ regatom(flagp)
 			      break;
 			  }
 
+		case 'd':   /* %d123 decimal */
+		case 'o':   /* %o123 octal */
+		case 'x':   /* %xab hex 2 */
+		case 'u':   /* %uabcd hex 4 */
+		case 'U':   /* %U1234abcd hex 8 */
+			  {
+			      int i;
+
+			      switch (c)
+			      {
+				  case 'd': i = getdecchrs(); break;
+				  case 'o': i = getoctchrs(); break;
+				  case 'x': i = gethexchrs(2); break;
+				  case 'u': i = gethexchrs(4); break;
+				  case 'U': i = gethexchrs(8); break;
+				  default:  i = -1; break;
+			      }
+
+			      if (i < 0)
+			          EMSG_M_RET_NULL(
+					_("E678: Invalid character after %s%%[dxouU]"),
+					reg_magic == MAGIC_ALL);
+			      ret = regnode(EXACTLY);
+			      if (i == 0)
+				  regc(0x0a);
+			      else
+#ifdef FEAT_MBYTE
+				  regmbc(i);
+#else
+				  regc(i);
+#endif
+			      regc(NUL);
+			      *flagp |= HASWIDTH;
+			      break;
+			  }
+
 		default:
 			  if (VIM_ISDIGIT(c) || c == '<' || c == '>')
 			  {
@@ -1816,6 +1861,11 @@ collection:
 			    else
 #endif
 				endc = *regparse++;
+
+			    /* Handle \o40, \x20 and \u20AC style sequences */
+			    if (endc == '\\' && !cpo_lit)
+				endc = coll_get_char();
+
 			    if (startc > endc)
 				EMSG_RET_NULL(_(e_invrange));
 #ifdef FEAT_MBYTE
@@ -1875,6 +1925,22 @@ collection:
 			    regparse++;
 			    startc = -1;
 			}
+			else if (*regparse == 'd'
+				|| *regparse == 'o'
+				|| *regparse == 'x'
+				|| *regparse == 'u'
+				|| *regparse == 'U')
+			{
+			    startc = coll_get_char();
+			    if (startc == 0)
+				regc(0x0a);
+			    else
+#ifdef FEAT_MBYTE
+				regmbc(startc);
+#else
+				regc(startc);
+#endif
+			}
 			else
 			{
 			    startc = backslash_trans(*regparse++);
@@ -2517,6 +2583,120 @@ ungetchr()
 }
 
 /*
+ * Get and return the value of the hex string immediately after the current
+ * position.  Return -1 if there is no hex digit, else the value read (0-255
+ * when "maxinputlen" is 2).  Position is updated:
+ *     blahblah\%x20asdf
+ *         before-^ ^-after
+ * "maxinputlen" is the max number of digits: 2 for \%x, 4 for \%u, 8 for \%U.
+ */
+    static int
+gethexchrs(maxinputlen)
+    int		maxinputlen;
+{
+    int		nr = 0;
+    int		c;
+    int		i;
+
+    for (i = 0; i < maxinputlen; ++i)
+    {
+	c = regparse[0];
+	if (!vim_isxdigit(c))
+	    break;
+	nr <<= 4;
+	nr |= hex2nr(c);
+	++regparse;
+    }
+
+    if (i == 0)
+	return -1;
+    return nr;
+}
+
+/*
+ * get and return the value of the decimal string immediately after the
+ * current position. Return -1 for invalid.  Consumes all digits.
+ */
+    static int
+getdecchrs()
+{
+    int		nr = 0;
+    int		c;
+    int		i;
+
+    for (i = 0; ; ++i)
+    {
+	c = regparse[0];
+	if (c < '0' || c > '9')
+	    break;
+	nr *= 10;
+	nr += c - '0';
+	++regparse;
+    }
+
+    if (i == 0)
+	return -1;
+    return nr;
+}
+
+/*
+ * Get and return the value of the octal string immediately after the current
+ * position.  Return -1 for invalid, or 0-255 for valid.  Stops before a digit
+ * that would make the value exceed 377 (for 400 only "40" is consumed) and
+ * does not accept 8 or 9 as digits.  Position is updated:
+ *     blahblah\%o210asdf
+ *         before-^  ^-after
+ */
+    static int
+getoctchrs()
+{
+    int		nr = 0;
+    int		c;
+    int		i;
+
+    for (i = 0; i < 3 && nr < 040; ++i)
+    {
+	c = regparse[0];
+	if (c < '0' || c > '7')
+	    break;
+	nr <<= 3;
+	nr |= hex2nr(c);
+	++regparse;
+    }
+
+    if (i == 0)
+	return -1;
+    return nr;
+}
+
+/*
+ * Get a number after a backslash that is inside [].
+ * When nothing is recognized return a backslash.
+ */
+    static int
+coll_get_char()
+{
+    int	    nr = -1;
+
+    switch (*regparse++)
+    {
+	case 'd': nr = getdecchrs(); break;
+	case 'o': nr = getoctchrs(); break;
+	case 'x': nr = gethexchrs(2); break;
+	case 'u': nr = gethexchrs(4); break;
+	case 'U': nr = gethexchrs(8); break;
+    }
+    if (nr < 0)
+    {
+	/* If getting the number fails be backwards compatible: the character
+	 * is a backslash. */
+	--regparse;
+	nr = '\\';
+    }
+    return nr;
+}
+
+/*
  * read_limits - Read two integers to be taken as a minimum and maximum.
  * If the first character is '-', then the range is reversed.
  * Should end with 'end'.  If minval is missing, zero is default, if maxval is
author	Bram Moolenaar <Bram@vim.org>	2004-09-13 20:26:32 +0000
committer	Bram Moolenaar <Bram@vim.org>	2004-09-13 20:26:32 +0000
commit	c0197e2815208269fa9ba2fba95230138ec39ceb (patch)
tree	21db1c3acd16fb095a8e34ce2e15ed87275cbd79 /src/regexp.c
parent	15d0a8c77dad867b69822e2fd8f9f6bbcf765c48 (diff)
download	vim-c0197e2815208269fa9ba2fba95230138ec39ceb.zip