summaryrefslogtreecommitdiff
path: root/src/regexp.c
diff options
context:
space:
mode:
authorBram Moolenaar <Bram@vim.org>2004-09-13 20:26:32 +0000
committerBram Moolenaar <Bram@vim.org>2004-09-13 20:26:32 +0000
commitc0197e2815208269fa9ba2fba95230138ec39ceb (patch)
tree21db1c3acd16fb095a8e34ce2e15ed87275cbd79 /src/regexp.c
parent15d0a8c77dad867b69822e2fd8f9f6bbcf765c48 (diff)
downloadvim-c0197e2815208269fa9ba2fba95230138ec39ceb.zip
updated for version 7.0016
Diffstat (limited to 'src/regexp.c')
-rw-r--r--src/regexp.c186
1 files changed, 183 insertions, 3 deletions
diff --git a/src/regexp.c b/src/regexp.c
index 9715a1ea5..dac34a2b3 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -33,8 +33,8 @@
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*
- * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb
- * and Bram Moolenaar.
+ * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
+ * Webb, Ciaran McCreesh and Bram Moolenaar.
* Named character class support added by Walter Briscoe (1998 Jul 01)
*/
@@ -376,9 +376,14 @@ static char_u *reg_prev_sub;
* \t - Tab (TAB).
* \e - Escape (ESC).
* \b - Backspace (Ctrl_H).
+ * \d - Character code in decimal, eg \d123
+ * \o - Character code in octal, eg \o40
+ * \x - Character code in hex, eg \x4a
+ * \u - Multibyte character code, eg \u20ac
+ * \U - Long multibyte character code, eg \U12345678
*/
static char_u REGEXP_INRANGE[] = "]^-n\\";
-static char_u REGEXP_ABBR[] = "nrteb";
+static char_u REGEXP_ABBR[] = "nrtebdoxuU";
static int backslash_trans __ARGS((int c));
static int skip_class_name __ARGS((char_u **pp));
@@ -681,6 +686,10 @@ static void skipchr_keepstart __ARGS((void));
static int peekchr __ARGS((void));
static void skipchr __ARGS((void));
static void ungetchr __ARGS((void));
+static int gethexchrs __ARGS((int maxinputlen));
+static int getoctchrs __ARGS((void));
+static int getdecchrs __ARGS((void));
+static int coll_get_char __ARGS((void));
static void regcomp_start __ARGS((char_u *expr, int flags));
static char_u *reg __ARGS((int, int *));
static char_u *regbranch __ARGS((int *flagp));
@@ -1722,6 +1731,42 @@ regatom(flagp)
break;
}
+ case 'd': /* %d123 decimal */
+ case 'o': /* %o123 octal */
+ case 'x': /* %xab hex 2 */
+ case 'u': /* %uabcd hex 4 */
+ case 'U': /* %U1234abcd hex 8 */
+ {
+ int i;
+
+ switch (c)
+ {
+ case 'd': i = getdecchrs(); break;
+ case 'o': i = getoctchrs(); break;
+ case 'x': i = gethexchrs(2); break;
+ case 'u': i = gethexchrs(4); break;
+ case 'U': i = gethexchrs(8); break;
+ default: i = -1; break;
+ }
+
+ if (i < 0)
+ EMSG_M_RET_NULL(
+ _("E678: Invalid character after %s%%[dxouU]"),
+ reg_magic == MAGIC_ALL);
+ ret = regnode(EXACTLY);
+ if (i == 0)
+ regc(0x0a);
+ else
+#ifdef FEAT_MBYTE
+ regmbc(i);
+#else
+ regc(i);
+#endif
+ regc(NUL);
+ *flagp |= HASWIDTH;
+ break;
+ }
+
default:
if (VIM_ISDIGIT(c) || c == '<' || c == '>')
{
@@ -1816,6 +1861,11 @@ collection:
else
#endif
endc = *regparse++;
+
+ /* Handle \o40, \x20 and \u20AC style sequences */
+ if (endc == '\\' && !cpo_lit)
+ endc = coll_get_char();
+
if (startc > endc)
EMSG_RET_NULL(_(e_invrange));
#ifdef FEAT_MBYTE
@@ -1875,6 +1925,22 @@ collection:
regparse++;
startc = -1;
}
+ else if (*regparse == 'd'
+ || *regparse == 'o'
+ || *regparse == 'x'
+ || *regparse == 'u'
+ || *regparse == 'U')
+ {
+ startc = coll_get_char();
+ if (startc == 0)
+ regc(0x0a);
+ else
+#ifdef FEAT_MBYTE
+ regmbc(startc);
+#else
+ regc(startc);
+#endif
+ }
else
{
startc = backslash_trans(*regparse++);
@@ -2517,6 +2583,120 @@ ungetchr()
}
/*
+ * Get and return the value of the hex digits immediately after the current
+ * position. Return -1 when there is no hex digit. Position is updated:
+ * blahblah\%x20asdf
+ * before-^ ^-after
+ * "maxinputlen" is the maximum number of digits consumed: 2 for \%x20 (value
+ * 0-255), 4 for \%u20AC (value 0-0xffff) and 8 for \%U1234abcd.
+ */
 static int
+gethexchrs(maxinputlen)
+ int maxinputlen;
+{
+ int nr = 0;
+ int c;
+ int i;
+
+ for (i = 0; i < maxinputlen; ++i)
+ {
+ c = regparse[0];
+ if (!vim_isxdigit(c)) /* stop at the first character that is not a hex digit */
+ break;
+ nr <<= 4; /* NOTE(review): with 8 digits this can shift into the sign bit of an int - confirm this is acceptable */
+ nr |= hex2nr(c);
+ ++regparse; /* consume the digit */
+ }
+
+ if (i == 0) /* not a single digit was read */
+ return -1;
+ return nr;
+}
+
+/*
+ * Get and return the value of the decimal digits immediately after the
+ * current position. Return -1 when there is no digit. Consumes all digits.
+ */
 static int
+getdecchrs()
+{
+ int nr = 0;
+ int c;
+ int i;
+
+ for (i = 0; ; ++i) /* no length limit: only a non-digit ends the loop */
+ {
+ c = regparse[0];
+ if (c < '0' || c > '9')
+ break;
+ nr *= 10; /* NOTE(review): no overflow check, a long digit string can exceed an int - confirm */
+ nr += c - '0';
+ ++regparse; /* consume the digit */
+ }
+
+ if (i == 0) /* not a single digit was read */
+ return -1;
+ return nr;
+}
+
+/*
+ * Get and return the value of the octal digits immediately after the current
+ * position. Return -1 when there is no octal digit, otherwise 0-255. Reading
+ * stops when a third digit would exceed 0377 ("400" is read as 040, leaving
+ * the last "0" unread); 8 and 9 are not accepted. Position is updated:
+ * blahblah\%o210asdf
+ * before-^ ^-after
+ */
 static int
+getoctchrs()
+{
+ int nr = 0;
+ int c;
+ int i;
+
+ for (i = 0; i < 3 && nr < 040; ++i) /* once nr >= 040 another digit would exceed 0377 */
+ {
+ c = regparse[0];
+ if (c < '0' || c > '7')
+ break;
+ nr <<= 3; /* make room for the next octal digit */
+ nr |= hex2nr(c); /* presumably hex2nr() maps '0'-'7' to 0-7 - verify */
+ ++regparse; /* consume the digit */
+ }
+
+ if (i == 0) /* not a single digit was read */
+ return -1;
+ return nr;
+}
+
+/*
+ * Get the number after a backslash and 'd', 'o', 'x', 'u' or 'U' that is
+ * inside []. When nothing is recognized return a backslash.
+ */
 static int
+coll_get_char()
+{
+ int nr = -1;
+
+ switch (*regparse++) /* look at the character at regparse and step over it */
+ {
+ case 'd': nr = getdecchrs(); break;
+ case 'o': nr = getoctchrs(); break;
+ case 'x': nr = gethexchrs(2); break;
+ case 'u': nr = gethexchrs(4); break;
+ case 'U': nr = gethexchrs(8); break;
+ }
+ if (nr < 0) /* other character, or the helper found no number */
+ {
+ /* If getting the number fails be backwards compatible: the character
+ * is a backslash. */
+ --regparse; /* put back the character that the switch stepped over */
+ nr = '\\';
+ }
+ return nr;
+}
+
+/*
* read_limits - Read two integers to be taken as a minimum and maximum.
* If the first character is '-', then the range is reversed.
* Should end with 'end'. If minval is missing, zero is default, if maxval is