summaryrefslogtreecommitdiff
path: root/libuxre/re.h
blob: 9e0241eefd1007ffb92ceae2c32a506b8e7eb667 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
 * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
 *
 * Sccsid @(#)re.h	1.15 (gritter) 2/6/05
 */
/*  UNIX(R) Regular Expresssion Library
 *
 *  Note: Code is released under the GNU LGPL
 *
 *  Copyright (C) 2001 Caldera International, Inc.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to:
 *        Free Software Foundation, Inc.
 *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef	LIBUXRE_RE_H
#define	LIBUXRE_RE_H

	/*
	* Maps safe external tag to internal one
	*/
#define re_coll_	lc_collate	/* <regex.h> */
/*	#define __fnm_collate	lc_collate	*/	/* <fnmatch.h> */

#include <limits.h>
#include "uxre_regex.h"
/*	#include <fnmatch.h>	*/
#include <colldata.h>

#define NBSHT	(sizeof(unsigned short) * CHAR_BIT)
#define NBYTE	(((1 << CHAR_BIT) + NBSHT - 1) / NBSHT)
#define NTYPE	4
#define NWIDE	32
#define NQUIV	4

typedef struct
{
	struct lc_collate	*col;	/* only member set by caller */
	wctype_t		*extype;
	wuchar_type		*exquiv;
	wchar_t			*exwide;
	wctype_t		type[NTYPE];
	wuchar_type		quiv[NQUIV];
	wchar_t			wide[NWIDE];
	unsigned short		byte[NBYTE];
	unsigned short		ntype;
	unsigned short		nquiv;
	unsigned short		nwide;
	unsigned int		flags;
} Bracket;

#define BKT_NEGATED	0x001	/* complemented set */
#define BKT_ONECASE	0x002	/* uppercase same as lowercase */
#define BKT_NOTNL	0x004	/* do not match newline when BKT_NEGATED */
#define BKT_BADRANGE	0x008	/* accept [m-a] ranges as [ma] */
#define BKT_SEPRANGE	0x010	/* disallow [a-m-z] style ranges */
#define BKT_NLBAD	0x020	/* newline disallowed */
#define BKT_SLASHBAD	0x040	/* slash disallowed (for pathnames) */
#define BKT_EMPTY	0x080	/* take leading ] is end (empty set) */
#define BKT_ESCAPE	0x100	/* allow \ as quote for next anything */
#define BKT_QUOTE	0x200	/* allow \ as quote for \\, \^, \- or \] */
#define BKT_ESCNL	0x400	/* take \n as the newline character */
#define BKT_ESCSEQ	0x800	/* otherwise, take \ as in C escapes */
#define	BKT_ODDRANGE	0x1000	/* oawk oddity: [m-a] means [m] */
#define	BKT_NOI18N	0x2000	/* disable [::] [==] [..] */
#define	BKT_OLDESC	0x4000	/* enable \b \f \n \r \t only */

	/*
	* These error returns for libuxre_bktmbcomp() are directly tied to
	* the error returns for regcomp() for convenience.
	*/
#define BKT_BADPAT	(-REG_BADPAT)
#define BKT_ECOLLATE	(-REG_ECOLLATE)
#define BKT_ECTYPE	(-REG_ECTYPE)
#define BKT_EEQUIV	(-REG_EEQUIV)
#define BKT_BADCHAR	(-REG_EBKTCHAR)
#define BKT_EBRACK	(-REG_EBRACK)
#define BKT_EMPTYSUBBKT	(-REG_EMPTYSUBBKT)
#define BKT_ERANGE	(-REG_ERANGE)
#define BKT_ESPACE	(-REG_ESPACE)
#define BKT_BADESC	(-REG_BADESC)
#define	BKT_ILLSEQ	(-REG_ILLSEQ)

	/*
	* These must be distinct from the flags in <fnmatch.h>.
	*/
#define FNM_COLLATE	0x2000	/* have collation information */
#define FNM_CURRENT	0x4000	/* have full-sized fnm_t structure */

	/*
	* These must be distinct from the flags in <regex.h>.
	*/
#define REG_NFA		0x20000000
#define REG_DFA		0x40000000
#define REG_GOTBKT	0x80000000

#define BRACE_INF	USHRT_MAX
#define BRACE_MAX	5100	/* arbitrary number < SHRT_MAX */
#define BRACE_DFAMAX	255	/* max amount for r.e. duplication */

typedef union	/* extra info always kept for some tokens/nodes */
{
	Bracket		*bkt;	/* ROP_BKT */
	size_t		sub;	/* ROP_LP (ROP_RP), ROP_REF */
	unsigned short	num[2];	/* ROP_BRACE: num[0]=low, num[1]=high */
} Info;

typedef struct	/* lexical context while parsing */
{
	Info			info;
	const unsigned char	*pat;
	unsigned char		*clist;
	struct lc_collate	*col;
	unsigned long		flags;
	w_type			tok;
	size_t			maxref;
	size_t			nleft;
	size_t			nright;
	size_t			nclist;
	int			bktflags;
	int			err;
	int			mb_cur_max;
} Lex;

typedef struct t_tree	Tree;	/* RE parse tree node */
struct t_tree
{
	union
	{
		Tree	*ptr;	/* unary & binary nodes */
		size_t	pos;	/* position for DFA leaves */
	} left;
	union
	{
		Tree	*ptr;	/* binary nodes */
		Info	info;
	} right;
	Tree		*parent;
	w_type		op;	/* positive => char. to match */
};

typedef struct re_dfa_	Dfa;	/* DFA engine description */
typedef struct re_nfa_	Nfa;	/* NFA engine description */

typedef struct
{
	const unsigned char	*str;
	regmatch_t		*match;
	size_t			nmatch;
	unsigned long		flags;
	int			mb_cur_max;
} Exec;

	/*
	* Regular expression operators.  Some only used internally.
	* All are negative, to distinguish them from the regular
	* "match this particular wide character" operation.
	*/
#define BINARY_ROP	0x02
#define UNARY_ROP	0x01
#define LEAF_ROP	0x00

#define MAKE_ROP(k, v)	(-((v) | ((k) << 4)))
#define KIND_ROP(v)	((-(v)) >> 4)

#define ROP_OR		MAKE_ROP(BINARY_ROP, 1)
#define ROP_CAT		MAKE_ROP(BINARY_ROP, 2)

#define ROP_STAR	MAKE_ROP(UNARY_ROP, 1)
#define ROP_PLUS	MAKE_ROP(UNARY_ROP, 2)
#define ROP_QUEST	MAKE_ROP(UNARY_ROP, 3)
#define ROP_BRACE	MAKE_ROP(UNARY_ROP, 4)
#define ROP_LP		MAKE_ROP(UNARY_ROP, 5)
#define ROP_RP		MAKE_ROP(UNARY_ROP, 6)

#define ROP_NOP		MAKE_ROP(LEAF_ROP, 1)	/* temporary */
#define ROP_BOL		MAKE_ROP(LEAF_ROP, 2)	/* ^ anchor */
#define ROP_EOL		MAKE_ROP(LEAF_ROP, 3)	/* $ anchor */
#define ROP_ALL		MAKE_ROP(LEAF_ROP, 4)	/* anything (added) */
#define ROP_ANYCH	MAKE_ROP(LEAF_ROP, 5)	/* . w/\n */
#define ROP_NOTNL	MAKE_ROP(LEAF_ROP, 6)	/* . w/out \n */
#define ROP_EMPTY	MAKE_ROP(LEAF_ROP, 7)	/* empty string */
#define ROP_NONE	MAKE_ROP(LEAF_ROP, 8)	/* match failure */
#define ROP_BKT		MAKE_ROP(LEAF_ROP, 9)	/* [...] */
#define ROP_BKTCOPY	MAKE_ROP(LEAF_ROP, 10)	/* [...] (duplicated) */
#define ROP_LT		MAKE_ROP(LEAF_ROP, 11)	/* \< word begin */
#define ROP_GT		MAKE_ROP(LEAF_ROP, 12)	/* \> word end */
#define ROP_REF		MAKE_ROP(LEAF_ROP, 13)	/* \digit */
#define ROP_END		MAKE_ROP(LEAF_ROP, 14)	/* final (added) */

	/*
	* Return values:
	*  libuxre_bktmbcomp()
	*	<0 error (see BKT_* above); >0 #bytes scanned
	*  libuxre_bktmbexec()
	*	<0 doesn't match; >=0 matches, #extra bytes scanned
	*/
LIBUXRE_STATIC void	libuxre_bktfree(Bracket *);
LIBUXRE_STATIC int	libuxre_bktmbcomp(Bracket *, const unsigned char *,
				int, int);
LIBUXRE_STATIC int	libuxre_bktmbexec(Bracket *, wchar_t,
				const unsigned char *, int);

LIBUXRE_STATIC void	libuxre_regdeltree(Tree *, int);
LIBUXRE_STATIC Tree	*libuxre_reg1tree(w_type, Tree *);
LIBUXRE_STATIC Tree	*libuxre_reg2tree(w_type, Tree *, Tree *);
LIBUXRE_STATIC Tree	*libuxre_regparse(Lex *, const unsigned char *, int);

extern void		libuxre_regdeldfa(Dfa *);
LIBUXRE_STATIC int	libuxre_regdfacomp(regex_t *, Tree *, Lex *);
LIBUXRE_STATIC int	libuxre_regdfaexec(Dfa *, Exec *);

extern void		libuxre_regdelnfa(Nfa *);
LIBUXRE_STATIC int	libuxre_regnfacomp(regex_t *, Tree *, Lex *);
LIBUXRE_STATIC int	libuxre_regnfaexec(Nfa *, Exec *);
#endif	/* !LIBUXRE_RE_H */