diff options
| author | Thomas Ulmer <thomasmulmer02@gmail.com> | 2026-02-23 16:54:28 -0800 |
|---|---|---|
| committer | Thomas Ulmer <thomasmulmer02@gmail.com> | 2026-02-23 16:54:28 -0800 |
| commit | 15bd7946cc838a3151c357e4b0bc1ab85eecda62 (patch) | |
| tree | 56977cb9bfc4349f46e2c608503a298df30ca957 /libuxre/bracket.c | |
add musl and vi
Diffstat (limited to 'libuxre/bracket.c')
| -rw-r--r-- | libuxre/bracket.c | 829 |
1 files changed, 829 insertions, 0 deletions
diff --git a/libuxre/bracket.c b/libuxre/bracket.c new file mode 100644 index 0000000..bc31b23 --- /dev/null +++ b/libuxre/bracket.c @@ -0,0 +1,829 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)bracket.c 1.14 (gritter) 10/18/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include "re.h" + +/* +* Build and match the [...] part of REs. +* +* In general, each compiled bracket construct holds a set of mapped +* wide character values and a set of character classifications. +* The mapping applied (when the current LC_COLLATE is not CHF_ENCODED) +* is the "basic" weight (cep->weight[0]); otherwise the actual wide +* character is used. +* +* To support simplified range handling, this code assumes that a w_type, +* a signed integer type, can hold all valid basic weight values (as well +* as all wide character values for CHF_ENCODED locales) and that these +* are all positive. Negative values indicate error conditions (BKT_*); +* zero (which must be the same as WGHT_IGNORE) indicates success, but +* that the item installed is not a range endpoint. +*/ + +static int +addwide(Bracket *bp, wchar_t ord) +{ + unsigned int nw; + + if ((nw = bp->nwide) < NWIDE) + bp->wide[nw] = ord; + else + { + if (nw % NWIDE == 0 && (bp->exwide = + realloc(bp->exwide, nw * sizeof(wchar_t))) == 0) + { + return BKT_ESPACE; + } + nw -= NWIDE; + bp->exwide[nw] = ord; + } + bp->nwide++; + return 0; +} + +#if USHRT_MAX == 65535 /* have 16 bits */ +#define PLIND(n) ((n) >> 4) +#define PLBIT(n) (1 << ((n) & 0xf)) +#else +#define PLIND(n) ((n) / CHAR_BIT) +#define PLBIT(n) (1 << ((n) % CHAR_BIT)) +#endif + +#define RANGE ((wchar_t)'-') /* separates wide chars in ranges */ + +static int +addrange(Bracket *bp, wchar_t ord, w_type prev) +{ + int ret; + + if (prev > 0 && prev != ord) /* try for range */ + { + if (prev > ord) + { + if (bp->flags & BKT_ODDRANGE) /* prev only - done */ + return 0; + else if ((bp->flags & BKT_BADRANGE) == 0) + return BKT_ERANGE; + } + else + { + if (++prev <= UCHAR_MAX) /* "prev" already there */ + { + do + { + bp->byte[PLIND(prev)] |= PLBIT(prev); + if (prev == ord) + return 0; + } while (++prev <= UCHAR_MAX); + } + if ((ret = addwide(bp, prev)) != 0) + return ret; + if (++prev > ord) + return 0; + if (prev < ord && (ret = addwide(bp, RANGE)) != 0) + return ret; + return addwide(bp, ord); + } + } + if (ord <= UCHAR_MAX) + { + bp->byte[PLIND(ord)] |= PLBIT(ord); + return 0; + } + if (prev == ord) /* don't bother */ + return 0; + return addwide(bp, ord); +} + +static w_type +place(Bracket *bp, wchar_t wc, w_type prev, int mb_cur_max) +{ + const CollElem *cep; + CollElem spare; + int ret; + + if ((cep = libuxre_collelem(bp->col, &spare, wc)) != ELEM_ENCODED) + { + if (cep == ELEM_BADCHAR) + return BKT_BADCHAR; + wc = cep->weight[0]; + } + if ((ret = addrange(bp, wc, prev)) != 0) + return ret; + return wc; +} + +#ifndef CHARCLASS_NAME_MAX +# define CHARCLASS_NAME_MAX 127 +#endif + +static w_type +chcls(Bracket *bp, const unsigned char *s, int n) +{ + char clsstr[CHARCLASS_NAME_MAX + 1]; + unsigned int nt; + wctype_t wct; + + if (n > CHARCLASS_NAME_MAX) + return BKT_ECTYPE; + (void)memcpy(clsstr, s, n); + clsstr[n] = '\0'; + if ((wct = wctype(clsstr)) == 0) + return BKT_ECTYPE; + if ((nt = bp->ntype) < NTYPE) + bp->type[nt] = wct; + else + { + if (nt % NTYPE == 0 && (bp->extype = + realloc(bp->extype, nt * sizeof(wctype_t))) == 0) + { + return BKT_ESPACE; + } + nt -= NTYPE; + bp->extype[nt] = wct; + } + bp->ntype++; + return 0; /* cannot be end point of a range */ +} + + /* + * The purpose of mcce() and its Mcce structure is to locate + * the next full collation element from "wc" and "s". It is + * called both at compile and execute time. These two differ + * primarily in that at compile time there is an exact number + * of bytes to be consumed, while at execute time the longest + * valid collation element is to be found. + * + * When BKT_ONECASE is set, MCCEs become particularly messy. + * There is no guarantee that all possible combinations of + * upper/lower case are defined as MCCEs. Thus, this code + * tries both lower- and uppercase (in that order) for each + * character than might be part of an MCCE. + */ + +typedef struct +{ + const unsigned char *max; /* restriction by caller */ + const unsigned char *aft; /* longest successful */ + Bracket *bp; /* readonly */ + struct lc_collate *col; /* readonly */ + const CollElem *cep; /* entry matching longest */ + wchar_t ch; /* initial character (if any) */ + w_type wc; /* character matching "aft" */ +} Mcce; + +static int +mcce(Mcce *mcp, const CollElem *cep, const unsigned char *s, int mb_cur_max, + int compile_time) +{ + const CollElem *nxt; + CollElem spare; + w_type ch, wc; + int i; + + /* + * Get next character. + */ + if ((wc = mcp->ch) != '\0') + { + mcp->ch = '\0'; + } + else if (ISONEBYTE(wc = *s++)) + { + if (wc == '\0') + return 0; + } + else if ((i = libuxre_mb2wc(&wc, s)) > 0) + { + s += i; + if (mcp->max != 0 && s > mcp->max) + return 0; + } + else if (i < 0) + return BKT_ILLSEQ; + /* + * Try out the this character as part of an MCCE. + * If BKT_ONECASE is set, this code tries both the lower- and + * uppercase version, continuing if it matches so far. + */ + ch = wc; + if (mcp->bp->flags & BKT_ONECASE) + { + if ((wc = to_lower(wc)) == ch) + ch = to_upper(wc); + } + for (;;) /* at most twice */ + { + if (cep == ELEM_BADCHAR) /* first character */ + { + if ((nxt = libuxre_collelem(mcp->col, &spare, wc)) + == ELEM_ENCODED + || (mcp->col->flags & CHF_MULTICH) == 0 + || s == mcp->max) + { + mcp->aft = s; + mcp->cep = nxt; + mcp->wc = wc; + break; + } + } + else + { + nxt = libuxre_collmult(mcp->col, cep, wc); + } + if (nxt != ELEM_BADCHAR) + { + /* + * Okay so far. Record this collating element + * if it's really one (not WGHT_IGNORE) and + * we've reached a new high point or it's the + * first match. + * + * If there's a possibility for more, call mcce() + * recursively for the subsequent characters. + */ + if (nxt->weight[0] != WGHT_IGNORE + && (mcp->aft < s || mcp->cep == ELEM_BADCHAR)) + { + mcp->aft = s; + mcp->cep = nxt; + mcp->wc = wc; + } + if (nxt->multbeg != 0 + && (mcp->max == 0 || s < mcp->max)) + { + if ((i = mcce(mcp, nxt, s, mb_cur_max, + compile_time)) != 0) + return i; + } + } + if (wc == ch) + break; + wc = ch; + } + return 0; +} + +static w_type +eqcls(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max) +{ + w_type last; + Mcce mcbuf; + int err; + + mcbuf.max = &s[n]; + mcbuf.aft = &s[0]; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcbuf.ch = '\0'; + if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0) + return err; + if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max) + return BKT_EEQUIV; + last = mcbuf.wc; + if (mcbuf.cep != ELEM_ENCODED && mcbuf.col->nweight > 1) + { + const CollElem *cep; + + /* + * The first and last weight[0] values for equivalence + * classes are stuffed into the terminator for the + * multiple character lists. If these values are + * scattered (elements that are not part of this + * equivalence class have weight[0] values between the + * two end points), then SUBN_SPECIAL is placed in + * this terminator. Note that weight[1] of the + * terminator must be other than WGHT_IGNORE, too. + */ + last = mcbuf.cep->weight[0]; + if ((cep = libuxre_collmult(bp->col, mcbuf.cep, 0)) + != ELEM_BADCHAR + && cep->weight[1] != WGHT_IGNORE) + { + last = cep->weight[1]; + if (cep->subnbeg == SUBN_SPECIAL) + { + unsigned int nq; + + /* + * Permit ranges up to the first and + * after the last. + */ + if (prev > 0 && prev != cep->weight[0] + && (prev = addrange(bp, + cep->weight[0], prev)) != 0) + { + return prev; + } + /* + * Record the equivalence class by storing + * the primary weight. + */ + if ((nq = bp->nquiv) < NQUIV) + bp->quiv[nq] = mcbuf.cep->weight[1]; + else + { + if (nq % NQUIV == 0 && (bp->exquiv = + realloc(bp->exquiv, + nq * sizeof(wuchar_type))) + == 0) + { + return REG_ESPACE; + } + nq -= NQUIV; + bp->exquiv[nq] = mcbuf.cep->weight[1]; + } + bp->nquiv++; + return last; + } + mcbuf.cep = cep; + } + mcbuf.wc = mcbuf.cep->weight[0]; + } + /* + * Determine range, if any, to install. + * + * If there's a pending low (prev > 0), then try to use it. + * + * Otherwise, try to use mcbuf.wc as the low end of the range. + * Since addrange() assumes that the low point has already been + * placed, we try to fool it by using a prev of one less than + * mcbuf.wc. But, if that value would not look like a valid + * low point of a range, we have to explicitly place mcbuf.wc. + */ + if (prev <= 0 && (prev = mcbuf.wc - 1) <= 0) + { + if ((prev = addrange(bp, mcbuf.wc, 0)) != 0) + return prev; + } + if ((mcbuf.wc = addrange(bp, last, prev)) != 0) + return mcbuf.wc; + return last; +} + +static w_type +clsym(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max) +{ + Mcce mcbuf; + int err; + + mcbuf.max = &s[n]; + mcbuf.aft = &s[0]; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcbuf.ch = '\0'; + if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0) + return err; + if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max) + return BKT_ECOLLATE; + if (mcbuf.cep != ELEM_ENCODED) + mcbuf.wc = mcbuf.cep->weight[0]; + if ((err = addrange(bp, mcbuf.wc, prev)) != 0) + return err; + return mcbuf.wc; +} + + /* + * Scans the rest of a bracket construction within a regular + * expression and fills in a description for it. + * The leading [ and the optional set complement indicator + * were handled already by the caller. + * Returns: + * <0 error (a BKT_* value) + * >0 success; equals how many bytes were scanned. + */ +LIBUXRE_STATIC int +libuxre_bktmbcomp(Bracket *bp, const unsigned char *pat0, + int flags, int mb_cur_max) +{ + static const Bracket zero = {0}; + const unsigned char *pat = pat0; + struct lc_collate *savecol; + w_type n, wc, prev = 0; + + /* + * Set represented set to empty. Easiest to copy an empty + * version over the caller's, (re)setting col and flags. + */ + savecol = bp->col; + *bp = zero; + bp->col = savecol; + bp->flags = flags + & (BKT_NEGATED | BKT_ONECASE | BKT_NOTNL | BKT_BADRANGE | + BKT_ODDRANGE); + /* + * Handle optional "empty" brackets; typically only used + * in combination with BKT_QUOTE or BKT_ESCAPE. + */ + if ((wc = *pat) == ']' && (flags & BKT_EMPTY) != 0) + return 1; + /* + * Populate *bp. + */ + for (;; prev = n) + { + switch (wc) + { + case '\0': + ebrack:; + n = BKT_EBRACK; + goto err; + case '\n': + if (flags & BKT_NLBAD) + goto ebrack; + goto regular; + case '/': + if (flags & BKT_SLASHBAD) + goto ebrack; + goto regular; + case '\\': + if ((flags & (BKT_ESCAPE | BKT_QUOTE + | BKT_ESCNL | BKT_ESCSEQ)) == 0) + { + goto regular; + } + switch (wc = *++pat) + { + default: + noesc:; + if ((flags & BKT_ESCAPE) == 0) + { + wc = '\\'; + pat--; + } + break; + case '\\': + case ']': + case '-': + case '^': + if ((flags & BKT_QUOTE) == 0) + goto noesc; + break; + case 'a': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + wc = '\a'; + break; + case 'b': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\b'; + break; + case 'f': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\f'; + break; + case 'n': + if ((flags & (BKT_ESCSEQ | BKT_ESCNL)) == 0) + goto noesc; + wc = '\n'; + break; + case 'r': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\r'; + break; + case 't': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\t'; + break; + case 'v': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + wc = '\v'; + break; + case 'x': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + if (!isxdigit(wc = *++pat)) + { + pat--; + goto noesc; + } + /* + * Take as many hex digits as possible, + * ignoring overflows. + * Any positive result is okay. + */ + n = 0; + do + { + if (isdigit(wc)) + wc -= '0'; + else if (isupper(wc)) + wc -= 'A' + 10; + else + wc -= 'a' + 10; + n <<= 4; + n |= wc; + } while (isxdigit(wc = *++pat)); + pat--; + if ((wc = n) <= 0) + { + n = BKT_BADESC; + goto err; + } + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + /* + * For compatibility (w/awk), + * permit "octal" 8 and 9. + */ + n = wc - '0'; + if ((wc = *++pat) >= '0' && wc <= '9') + { + n <<= 3; + n += wc - '0'; + if ((wc = *++pat) >= '0' && wc <= '9') + { + n <<= 3; + n += wc - '0'; + } + } + pat--; + if ((wc = n) <= 0) + { + n = BKT_BADESC; + goto err; + } + break; + } + goto regular; + case '[': + if (((wc = *++pat) == ':' || wc == '=' || wc == '.') && + (flags & BKT_NOI18N) == 0) + { + n = 0; + while (*++pat != wc || pat[1] != ']') + { + if (*pat == '\0') + { + badpat:; + n = BKT_BADPAT; + goto err; + } + else if (*pat == '/') + { + if (flags & BKT_SLASHBAD) + goto badpat; + } + else if (*pat == '\n') + { + if (flags & BKT_NLBAD) + goto badpat; + } + n++; + } + if (n == 0) + { + n = BKT_EMPTYSUBBKT; + goto err; + } + if (wc == ':') + n = chcls(bp, &pat[-n], n); + else if (wc == '=') + n = eqcls(bp, &pat[-n], n, prev, + mb_cur_max); + else /* wc == '.' */ + n = clsym(bp, &pat[-n], n, prev, + mb_cur_max); + pat++; + break; + } + wc = '['; + pat--; + goto regular; + default: + if (!ISONEBYTE(wc) && + (n = libuxre_mb2wc(&wc, pat + 1)) > 0) + pat += n; + regular:; + n = place(bp, wc, prev, mb_cur_max); + break; + } + if (n < 0) { + n = BKT_ILLSEQ; + goto err; + } + if ((wc = *++pat) == ']') + break; + if (wc == '-' && n != 0) + { + if (prev == 0 || (flags & BKT_SEPRANGE) == 0) + { + if ((wc = *++pat) != ']') + continue; /* valid range */ + wc = '-'; + pat--; + } + } + n = 0; /* no range this time */ + } + return pat - pat0 + 1; +err:; + libuxre_bktfree(bp); + return n; +} + +LIBUXRE_STATIC void +libuxre_bktfree(Bracket *bp) +{ + if (bp->extype != 0) + free(bp->extype); + if (bp->exquiv != 0) + free(bp->exquiv); + if (bp->exwide != 0) + free(bp->exwide); +} + +LIBUXRE_STATIC int +libuxre_bktmbexec(Bracket *bp, wchar_t wc, + const unsigned char *str, int mb_cur_max) +{ + unsigned int i; + wchar_t lc, uc; + Mcce mcbuf; + + mcbuf.aft = str; /* in case of match in character classes */ + mcbuf.ch = wc; + /* + * First: check the single wc against any character classes. + * Since multiple character collating elements are not part + * of this world, they don't apply here. + */ + if ((i = bp->ntype) != 0) + { + wctype_t *wctp = &bp->type[0]; + + if (bp->flags & BKT_ONECASE) + { + if ((wc = to_lower(wc)) == mcbuf.ch) + mcbuf.ch = to_upper(wc); + } + for (;;) + { + if (iswctype(mb_cur_max==1?btowc(wc):wc, *wctp)) + goto match; + if (wc != mcbuf.ch && + iswctype(mb_cur_max==1?btowc(mcbuf.ch):mcbuf.ch, + *wctp)) + goto match; + if (--i == 0) + break; + if (++wctp == &bp->type[NTYPE]) + wctp = &bp->extype[0]; + } + } + /* + * The main match is determined by the weight[0] value + * of the character (or characters, if the input can be + * taken as a multiple character collating element). + */ + mcbuf.max = 0; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcce(&mcbuf, ELEM_BADCHAR, str, mb_cur_max, 0); + if (mcbuf.cep == ELEM_BADCHAR) + return -1; /* never matches */ + if (mcbuf.cep != ELEM_ENCODED) + mcbuf.wc = mcbuf.cep->weight[0]; + /* + * POSIX.2 demands that both a character and its case counterpart + * can match if REG_ICASE is set. This means that [B-z] matches + * 'A', 'a', and '['. + */ + if (bp->flags & BKT_ONECASE) + { + lc = to_lower(mcbuf.wc); + uc = to_upper(mcbuf.wc); + } + else + lc = uc = mcbuf.wc; + /* + * See if it's in the set. Note that the list of true wide + * character values has explicit ranges. + */ + if (mcbuf.wc <= UCHAR_MAX) + { + if (bp->byte[PLIND(lc)] & PLBIT(lc)) + goto match; + if (lc != uc && (bp->byte[PLIND(uc)] & PLBIT(uc))) + goto match; + } + else if ((i = bp->nwide) != 0) + { + wchar_t *wcp = &bp->wide[0]; + long lcmp, ucmp; + + for (;;) + { + if ((lcmp = lc - *wcp) == 0) + goto match; + ucmp = uc - *wcp; + if (lc != uc && ucmp == 0) + goto match; + if (--i == 0) + break; + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + if (*wcp == RANGE) + { + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + if (lcmp > 0 && lc <= *wcp) + goto match; + if (lc != uc && ucmp > 0 && uc < *wcp) + goto match; + if ((i -= 2) == 0) + break; + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + } + } + } + /* + * The last chance for a match is if an equivalence class + * was specified for which the primary weights are scattered + * through the weight[0]s. + */ + if ((i = bp->nquiv) != 0 && mcbuf.cep != ELEM_ENCODED) + { + wuchar_type *wucp = &bp->quiv[0]; + + mcbuf.wc = mcbuf.cep->weight[1]; + for (;;) + { + if (mcbuf.wc == *wucp) + goto match; + if (--i == 0) + break; + if (++wucp == &bp->quiv[NQUIV]) + wucp = &bp->exquiv[0]; + } + } + /* + * Only here when no match against the set was found. + * One final special case w/r/t newline. + */ + if (bp->flags & BKT_NEGATED) + { + if (wc != '\n' || (bp->flags & BKT_NOTNL) == 0) + return mcbuf.aft - str; + } + return -1; +match:; + /* + * Only here when a match against the described set is found. + */ + if (bp->flags & BKT_NEGATED) + return -1; + return mcbuf.aft - str; +} |
