diff options
| author | Thomas Ulmer <thomasmulmer02@gmail.com> | 2026-02-23 16:54:28 -0800 |
|---|---|---|
| committer | Thomas Ulmer <thomasmulmer02@gmail.com> | 2026-02-23 16:54:28 -0800 |
| commit | 15bd7946cc838a3151c357e4b0bc1ab85eecda62 (patch) | |
| tree | 56977cb9bfc4349f46e2c608503a298df30ca957 /libuxre | |
add musl and vi
Diffstat (limited to 'libuxre')
| -rw-r--r-- | libuxre/COPYING.LGPL | 504 | ||||
| -rw-r--r-- | libuxre/Makefile | 12 | ||||
| -rw-r--r-- | libuxre/NOTES | 14 | ||||
| -rw-r--r-- | libuxre/_collelem.c | 119 | ||||
| -rw-r--r-- | libuxre/_collmult.c | 55 | ||||
| -rw-r--r-- | libuxre/bracket.c | 829 | ||||
| -rw-r--r-- | libuxre/colldata.h | 226 | ||||
| -rw-r--r-- | libuxre/onefile.c | 38 | ||||
| -rw-r--r-- | libuxre/re.h | 228 | ||||
| -rw-r--r-- | libuxre/regcomp.c | 77 | ||||
| -rw-r--r-- | libuxre/regdfa.c | 877 | ||||
| -rw-r--r-- | libuxre/regdfa.h | 75 | ||||
| -rw-r--r-- | libuxre/regerror.c | 95 | ||||
| -rw-r--r-- | libuxre/regex.h | 153 | ||||
| -rw-r--r-- | libuxre/regexec.c | 68 | ||||
| -rw-r--r-- | libuxre/regfree.c | 42 | ||||
| -rw-r--r-- | libuxre/regnfa.c | 1070 | ||||
| -rw-r--r-- | libuxre/regparse.c | 1091 | ||||
| -rw-r--r-- | libuxre/stubs.c | 97 | ||||
| -rw-r--r-- | libuxre/wcharm.h | 63 |
20 files changed, 5733 insertions, 0 deletions
diff --git a/libuxre/COPYING.LGPL b/libuxre/COPYING.LGPL new file mode 100644 index 0000000..b1e3f5a --- /dev/null +++ b/libuxre/COPYING.LGPL @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/libuxre/Makefile b/libuxre/Makefile new file mode 100644 index 0000000..46d7320 --- /dev/null +++ b/libuxre/Makefile @@ -0,0 +1,12 @@ +CFLAGS = $(COPT) $(RPMCFLAGS) -I. +OBJS = bracket.o _collelem.o _collmult.o regcomp.o regdfa.o regerror.o regexec.o regfree.o regnfa.o regparse.o stubs.o + +.c.o: ; $(CC) $(CFLAGS) -c $< + +all: libuxre.a + +libuxre.a: $(OBJS) + ar cr libuxre.a $(OBJS) + +clean: + rm -f libuxre.a $(OBJS) core diff --git a/libuxre/NOTES b/libuxre/NOTES new file mode 100644 index 0000000..19aedf1 --- /dev/null +++ b/libuxre/NOTES @@ -0,0 +1,14 @@ +Notes for the modified 'UNIX(R) Regular Expression Library' +============================================================ + +The code this is based on was released by Caldera as 'osutils-0.1a' +and is available at <http://unixtools.sourceforge.net/>. Notable +changes include: + +- Support for multibyte characters was enabled again. +- Support for traditional extended regular expression syntax was added. +- Fix: With REG_ICASE, [B-z] matches 'A', 'a', and '[' according to + POSIX.2. +- Some speed improvements. + + Gunnar Ritter 9/22/03 diff --git a/libuxre/_collelem.c b/libuxre/_collelem.c new file mode 100644 index 0000000..c5dbb05 --- /dev/null +++ b/libuxre/_collelem.c @@ -0,0 +1,119 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)_collelem.c 1.4 (gritter) 10/18/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include "colldata.h" +#include <stddef.h> + +#define CCE(p) ((const CollElem *)(p)) +#define CCM(p) ((const CollMult *)(p)) + +LIBUXRE_STATIC const CollElem * +libuxre_collelem(struct lc_collate *col, CollElem *spare, wchar_t wc) +{ + const char *tbl; + size_t hi, lo, cur; + const CollMult *cmp; + const CollElem *cep; + long diff; + int sz; + + /* + * ELEM_ENCODED is returned when the collation is entirely + * based on the encoded value of the character. + */ + if (col == 0 || col->flags & CHF_ENCODED + || (tbl = (const char *)col->maintbl) == 0) + { + return ELEM_ENCODED; + } + if ((wuchar_type)wc <= UCHAR_MAX) + { + indexed:; + cep = CCE(&tbl[(wuchar_type)wc * col->elemsize]); + if (cep->weight[0] == WGHT_SPECIAL) + return ELEM_BADCHAR; + return cep; + } + if (col->flags & CHF_INDEXED) + { + if ((wuchar_type)wc >= col->nmain) + return ELEM_BADCHAR; + goto indexed; + } + /* + * Binary search for a match. Could speed up the search if + * some interpolation was used, but keep it simple for now. + * Note that this is actually a table of CollMult's. + * + * To save space in the file, sequences of similar elements + * are sometimes compressed into a single CollMult that + * describes many entries. This is denoted by a subnbeg + * with the SUBN_SPECIAL bit set. The rest of the bits give + * the range covered by this entry. + */ + sz = col->elemsize + (sizeof(CollMult) - sizeof(CollElem)); + tbl += (1 + UCHAR_MAX) * col->elemsize; + lo = 0; + hi = col->nmain - UCHAR_MAX; + while (lo < hi) + { + if ((cur = (hi + lo) >> 1) < lo) /* hi+lo overflowed */ + cur |= ~(~(size_t)0 >> 1); /* lost high order bit */ + cmp = CCM(&tbl[cur * sz]); + if ((diff = wc - cmp->ch) < 0) + hi = cur; + else if (cmp->elem.subnbeg & SUBN_SPECIAL) + { + if (diff > (long)(cmp->elem.subnbeg & ~SUBN_SPECIAL)) + lo = cur + 1; + else /* create an entry from the sequence in spare */ + { + spare->multbeg = cmp->elem.multbeg; + spare->subnbeg = 0; + spare->weight[0] = cmp->elem.weight[0] + diff; + for (lo = 1; lo < col->nweight; lo++) + { + wuchar_type w; + + if ((w = cmp->elem.weight[lo]) + == WGHT_SPECIAL) + { + w = spare->weight[0]; + } + spare->weight[lo] = w; + } + return spare; + } + } + else if (diff == 0) + return &cmp->elem; + else + lo = cur + 1; + } + return ELEM_BADCHAR; +} diff --git a/libuxre/_collmult.c b/libuxre/_collmult.c new file mode 100644 index 0000000..7a199b3 --- /dev/null +++ b/libuxre/_collmult.c @@ -0,0 +1,55 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)_collmult.c 1.4 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include "colldata.h" +#include <stddef.h> + +#define CCM(p) ((const CollMult *)(p)) + +LIBUXRE_STATIC const CollElem * +libuxre_collmult(struct lc_collate *col, const CollElem *cep, wchar_t wc) +{ + const char *tbl; + size_t sz; + w_type ch; + + if (col == 0 || cep->multbeg == 0 + || (tbl = (const char *)col->multtbl) == 0) + { + return ELEM_BADCHAR; + } + sz = col->elemsize + (sizeof(CollMult) - sizeof(CollElem)); + tbl += sz * cep->multbeg; + while ((ch = CCM(tbl)->ch) != wc) + { + if (ch == 0) + return ELEM_BADCHAR; /* end of list */ + tbl += sz; + } + return &CCM(tbl)->elem; +} diff --git a/libuxre/bracket.c b/libuxre/bracket.c new file mode 100644 index 0000000..bc31b23 --- /dev/null +++ b/libuxre/bracket.c @@ -0,0 +1,829 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)bracket.c 1.14 (gritter) 10/18/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include "re.h" + +/* +* Build and match the [...] part of REs. +* +* In general, each compiled bracket construct holds a set of mapped +* wide character values and a set of character classifications. +* The mapping applied (when the current LC_COLLATE is not CHF_ENCODED) +* is the "basic" weight (cep->weight[0]); otherwise the actual wide +* character is used. +* +* To support simplified range handling, this code assumes that a w_type, +* a signed integer type, can hold all valid basic weight values (as well +* as all wide character values for CHF_ENCODED locales) and that these +* are all positive. Negative values indicate error conditions (BKT_*); +* zero (which must be the same as WGHT_IGNORE) indicates success, but +* that the item installed is not a range endpoint. +*/ + +static int +addwide(Bracket *bp, wchar_t ord) +{ + unsigned int nw; + + if ((nw = bp->nwide) < NWIDE) + bp->wide[nw] = ord; + else + { + if (nw % NWIDE == 0 && (bp->exwide = + realloc(bp->exwide, nw * sizeof(wchar_t))) == 0) + { + return BKT_ESPACE; + } + nw -= NWIDE; + bp->exwide[nw] = ord; + } + bp->nwide++; + return 0; +} + +#if USHRT_MAX == 65535 /* have 16 bits */ +#define PLIND(n) ((n) >> 4) +#define PLBIT(n) (1 << ((n) & 0xf)) +#else +#define PLIND(n) ((n) / CHAR_BIT) +#define PLBIT(n) (1 << ((n) % CHAR_BIT)) +#endif + +#define RANGE ((wchar_t)'-') /* separates wide chars in ranges */ + +static int +addrange(Bracket *bp, wchar_t ord, w_type prev) +{ + int ret; + + if (prev > 0 && prev != ord) /* try for range */ + { + if (prev > ord) + { + if (bp->flags & BKT_ODDRANGE) /* prev only - done */ + return 0; + else if ((bp->flags & BKT_BADRANGE) == 0) + return BKT_ERANGE; + } + else + { + if (++prev <= UCHAR_MAX) /* "prev" already there */ + { + do + { + bp->byte[PLIND(prev)] |= PLBIT(prev); + if (prev == ord) + return 0; + } while (++prev <= UCHAR_MAX); + } + if ((ret = addwide(bp, prev)) != 0) + return ret; + if (++prev > ord) + return 0; + if (prev < ord && (ret = addwide(bp, RANGE)) != 0) + return ret; + return addwide(bp, ord); + } + } + if (ord <= UCHAR_MAX) + { + bp->byte[PLIND(ord)] |= PLBIT(ord); + return 0; + } + if (prev == ord) /* don't bother */ + return 0; + return addwide(bp, ord); +} + +static w_type +place(Bracket *bp, wchar_t wc, w_type prev, int mb_cur_max) +{ + const CollElem *cep; + CollElem spare; + int ret; + + if ((cep = libuxre_collelem(bp->col, &spare, wc)) != ELEM_ENCODED) + { + if (cep == ELEM_BADCHAR) + return BKT_BADCHAR; + wc = cep->weight[0]; + } + if ((ret = addrange(bp, wc, prev)) != 0) + return ret; + return wc; +} + +#ifndef CHARCLASS_NAME_MAX +# define CHARCLASS_NAME_MAX 127 +#endif + +static w_type +chcls(Bracket *bp, const unsigned char *s, int n) +{ + char clsstr[CHARCLASS_NAME_MAX + 1]; + unsigned int nt; + wctype_t wct; + + if (n > CHARCLASS_NAME_MAX) + return BKT_ECTYPE; + (void)memcpy(clsstr, s, n); + clsstr[n] = '\0'; + if ((wct = wctype(clsstr)) == 0) + return BKT_ECTYPE; + if ((nt = bp->ntype) < NTYPE) + bp->type[nt] = wct; + else + { + if (nt % NTYPE == 0 && (bp->extype = + realloc(bp->extype, nt * sizeof(wctype_t))) == 0) + { + return BKT_ESPACE; + } + nt -= NTYPE; + bp->extype[nt] = wct; + } + bp->ntype++; + return 0; /* cannot be end point of a range */ +} + + /* + * The purpose of mcce() and its Mcce structure is to locate + * the next full collation element from "wc" and "s". It is + * called both at compile and execute time. These two differ + * primarily in that at compile time there is an exact number + * of bytes to be consumed, while at execute time the longest + * valid collation element is to be found. + * + * When BKT_ONECASE is set, MCCEs become particularly messy. + * There is no guarantee that all possible combinations of + * upper/lower case are defined as MCCEs. Thus, this code + * tries both lower- and uppercase (in that order) for each + * character than might be part of an MCCE. + */ + +typedef struct +{ + const unsigned char *max; /* restriction by caller */ + const unsigned char *aft; /* longest successful */ + Bracket *bp; /* readonly */ + struct lc_collate *col; /* readonly */ + const CollElem *cep; /* entry matching longest */ + wchar_t ch; /* initial character (if any) */ + w_type wc; /* character matching "aft" */ +} Mcce; + +static int +mcce(Mcce *mcp, const CollElem *cep, const unsigned char *s, int mb_cur_max, + int compile_time) +{ + const CollElem *nxt; + CollElem spare; + w_type ch, wc; + int i; + + /* + * Get next character. + */ + if ((wc = mcp->ch) != '\0') + { + mcp->ch = '\0'; + } + else if (ISONEBYTE(wc = *s++)) + { + if (wc == '\0') + return 0; + } + else if ((i = libuxre_mb2wc(&wc, s)) > 0) + { + s += i; + if (mcp->max != 0 && s > mcp->max) + return 0; + } + else if (i < 0) + return BKT_ILLSEQ; + /* + * Try out the this character as part of an MCCE. + * If BKT_ONECASE is set, this code tries both the lower- and + * uppercase version, continuing if it matches so far. + */ + ch = wc; + if (mcp->bp->flags & BKT_ONECASE) + { + if ((wc = to_lower(wc)) == ch) + ch = to_upper(wc); + } + for (;;) /* at most twice */ + { + if (cep == ELEM_BADCHAR) /* first character */ + { + if ((nxt = libuxre_collelem(mcp->col, &spare, wc)) + == ELEM_ENCODED + || (mcp->col->flags & CHF_MULTICH) == 0 + || s == mcp->max) + { + mcp->aft = s; + mcp->cep = nxt; + mcp->wc = wc; + break; + } + } + else + { + nxt = libuxre_collmult(mcp->col, cep, wc); + } + if (nxt != ELEM_BADCHAR) + { + /* + * Okay so far. Record this collating element + * if it's really one (not WGHT_IGNORE) and + * we've reached a new high point or it's the + * first match. + * + * If there's a possibility for more, call mcce() + * recursively for the subsequent characters. + */ + if (nxt->weight[0] != WGHT_IGNORE + && (mcp->aft < s || mcp->cep == ELEM_BADCHAR)) + { + mcp->aft = s; + mcp->cep = nxt; + mcp->wc = wc; + } + if (nxt->multbeg != 0 + && (mcp->max == 0 || s < mcp->max)) + { + if ((i = mcce(mcp, nxt, s, mb_cur_max, + compile_time)) != 0) + return i; + } + } + if (wc == ch) + break; + wc = ch; + } + return 0; +} + +static w_type +eqcls(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max) +{ + w_type last; + Mcce mcbuf; + int err; + + mcbuf.max = &s[n]; + mcbuf.aft = &s[0]; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcbuf.ch = '\0'; + if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0) + return err; + if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max) + return BKT_EEQUIV; + last = mcbuf.wc; + if (mcbuf.cep != ELEM_ENCODED && mcbuf.col->nweight > 1) + { + const CollElem *cep; + + /* + * The first and last weight[0] values for equivalence + * classes are stuffed into the terminator for the + * multiple character lists. If these values are + * scattered (elements that are not part of this + * equivalence class have weight[0] values between the + * two end points), then SUBN_SPECIAL is placed in + * this terminator. Note that weight[1] of the + * terminator must be other than WGHT_IGNORE, too. + */ + last = mcbuf.cep->weight[0]; + if ((cep = libuxre_collmult(bp->col, mcbuf.cep, 0)) + != ELEM_BADCHAR + && cep->weight[1] != WGHT_IGNORE) + { + last = cep->weight[1]; + if (cep->subnbeg == SUBN_SPECIAL) + { + unsigned int nq; + + /* + * Permit ranges up to the first and + * after the last. + */ + if (prev > 0 && prev != cep->weight[0] + && (prev = addrange(bp, + cep->weight[0], prev)) != 0) + { + return prev; + } + /* + * Record the equivalence class by storing + * the primary weight. + */ + if ((nq = bp->nquiv) < NQUIV) + bp->quiv[nq] = mcbuf.cep->weight[1]; + else + { + if (nq % NQUIV == 0 && (bp->exquiv = + realloc(bp->exquiv, + nq * sizeof(wuchar_type))) + == 0) + { + return REG_ESPACE; + } + nq -= NQUIV; + bp->exquiv[nq] = mcbuf.cep->weight[1]; + } + bp->nquiv++; + return last; + } + mcbuf.cep = cep; + } + mcbuf.wc = mcbuf.cep->weight[0]; + } + /* + * Determine range, if any, to install. + * + * If there's a pending low (prev > 0), then try to use it. + * + * Otherwise, try to use mcbuf.wc as the low end of the range. + * Since addrange() assumes that the low point has already been + * placed, we try to fool it by using a prev of one less than + * mcbuf.wc. But, if that value would not look like a valid + * low point of a range, we have to explicitly place mcbuf.wc. + */ + if (prev <= 0 && (prev = mcbuf.wc - 1) <= 0) + { + if ((prev = addrange(bp, mcbuf.wc, 0)) != 0) + return prev; + } + if ((mcbuf.wc = addrange(bp, last, prev)) != 0) + return mcbuf.wc; + return last; +} + +static w_type +clsym(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max) +{ + Mcce mcbuf; + int err; + + mcbuf.max = &s[n]; + mcbuf.aft = &s[0]; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcbuf.ch = '\0'; + if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0) + return err; + if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max) + return BKT_ECOLLATE; + if (mcbuf.cep != ELEM_ENCODED) + mcbuf.wc = mcbuf.cep->weight[0]; + if ((err = addrange(bp, mcbuf.wc, prev)) != 0) + return err; + return mcbuf.wc; +} + + /* + * Scans the rest of a bracket construction within a regular + * expression and fills in a description for it. + * The leading [ and the optional set complement indicator + * were handled already by the caller. + * Returns: + * <0 error (a BKT_* value) + * >0 success; equals how many bytes were scanned. + */ +LIBUXRE_STATIC int +libuxre_bktmbcomp(Bracket *bp, const unsigned char *pat0, + int flags, int mb_cur_max) +{ + static const Bracket zero = {0}; + const unsigned char *pat = pat0; + struct lc_collate *savecol; + w_type n, wc, prev = 0; + + /* + * Set represented set to empty. Easiest to copy an empty + * version over the caller's, (re)setting col and flags. + */ + savecol = bp->col; + *bp = zero; + bp->col = savecol; + bp->flags = flags + & (BKT_NEGATED | BKT_ONECASE | BKT_NOTNL | BKT_BADRANGE | + BKT_ODDRANGE); + /* + * Handle optional "empty" brackets; typically only used + * in combination with BKT_QUOTE or BKT_ESCAPE. + */ + if ((wc = *pat) == ']' && (flags & BKT_EMPTY) != 0) + return 1; + /* + * Populate *bp. + */ + for (;; prev = n) + { + switch (wc) + { + case '\0': + ebrack:; + n = BKT_EBRACK; + goto err; + case '\n': + if (flags & BKT_NLBAD) + goto ebrack; + goto regular; + case '/': + if (flags & BKT_SLASHBAD) + goto ebrack; + goto regular; + case '\\': + if ((flags & (BKT_ESCAPE | BKT_QUOTE + | BKT_ESCNL | BKT_ESCSEQ)) == 0) + { + goto regular; + } + switch (wc = *++pat) + { + default: + noesc:; + if ((flags & BKT_ESCAPE) == 0) + { + wc = '\\'; + pat--; + } + break; + case '\\': + case ']': + case '-': + case '^': + if ((flags & BKT_QUOTE) == 0) + goto noesc; + break; + case 'a': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + wc = '\a'; + break; + case 'b': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\b'; + break; + case 'f': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\f'; + break; + case 'n': + if ((flags & (BKT_ESCSEQ | BKT_ESCNL)) == 0) + goto noesc; + wc = '\n'; + break; + case 'r': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\r'; + break; + case 't': + if ((flags & BKT_ESCSEQ) == 0) + goto noesc; + wc = '\t'; + break; + case 'v': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + wc = '\v'; + break; + case 'x': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + if (!isxdigit(wc = *++pat)) + { + pat--; + goto noesc; + } + /* + * Take as many hex digits as possible, + * ignoring overflows. + * Any positive result is okay. + */ + n = 0; + do + { + if (isdigit(wc)) + wc -= '0'; + else if (isupper(wc)) + wc -= 'A' + 10; + else + wc -= 'a' + 10; + n <<= 4; + n |= wc; + } while (isxdigit(wc = *++pat)); + pat--; + if ((wc = n) <= 0) + { + n = BKT_BADESC; + goto err; + } + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if ((flags & BKT_ESCSEQ) == 0 || + (flags & BKT_OLDESC)) + goto noesc; + /* + * For compatibility (w/awk), + * permit "octal" 8 and 9. + */ + n = wc - '0'; + if ((wc = *++pat) >= '0' && wc <= '9') + { + n <<= 3; + n += wc - '0'; + if ((wc = *++pat) >= '0' && wc <= '9') + { + n <<= 3; + n += wc - '0'; + } + } + pat--; + if ((wc = n) <= 0) + { + n = BKT_BADESC; + goto err; + } + break; + } + goto regular; + case '[': + if (((wc = *++pat) == ':' || wc == '=' || wc == '.') && + (flags & BKT_NOI18N) == 0) + { + n = 0; + while (*++pat != wc || pat[1] != ']') + { + if (*pat == '\0') + { + badpat:; + n = BKT_BADPAT; + goto err; + } + else if (*pat == '/') + { + if (flags & BKT_SLASHBAD) + goto badpat; + } + else if (*pat == '\n') + { + if (flags & BKT_NLBAD) + goto badpat; + } + n++; + } + if (n == 0) + { + n = BKT_EMPTYSUBBKT; + goto err; + } + if (wc == ':') + n = chcls(bp, &pat[-n], n); + else if (wc == '=') + n = eqcls(bp, &pat[-n], n, prev, + mb_cur_max); + else /* wc == '.' */ + n = clsym(bp, &pat[-n], n, prev, + mb_cur_max); + pat++; + break; + } + wc = '['; + pat--; + goto regular; + default: + if (!ISONEBYTE(wc) && + (n = libuxre_mb2wc(&wc, pat + 1)) > 0) + pat += n; + regular:; + n = place(bp, wc, prev, mb_cur_max); + break; + } + if (n < 0) { + n = BKT_ILLSEQ; + goto err; + } + if ((wc = *++pat) == ']') + break; + if (wc == '-' && n != 0) + { + if (prev == 0 || (flags & BKT_SEPRANGE) == 0) + { + if ((wc = *++pat) != ']') + continue; /* valid range */ + wc = '-'; + pat--; + } + } + n = 0; /* no range this time */ + } + return pat - pat0 + 1; +err:; + libuxre_bktfree(bp); + return n; +} + +LIBUXRE_STATIC void +libuxre_bktfree(Bracket *bp) +{ + if (bp->extype != 0) + free(bp->extype); + if (bp->exquiv != 0) + free(bp->exquiv); + if (bp->exwide != 0) + free(bp->exwide); +} + +LIBUXRE_STATIC int +libuxre_bktmbexec(Bracket *bp, wchar_t wc, + const unsigned char *str, int mb_cur_max) +{ + unsigned int i; + wchar_t lc, uc; + Mcce mcbuf; + + mcbuf.aft = str; /* in case of match in character classes */ + mcbuf.ch = wc; + /* + * First: check the single wc against any character classes. + * Since multiple character collating elements are not part + * of this world, they don't apply here. + */ + if ((i = bp->ntype) != 0) + { + wctype_t *wctp = &bp->type[0]; + + if (bp->flags & BKT_ONECASE) + { + if ((wc = to_lower(wc)) == mcbuf.ch) + mcbuf.ch = to_upper(wc); + } + for (;;) + { + if (iswctype(mb_cur_max==1?btowc(wc):wc, *wctp)) + goto match; + if (wc != mcbuf.ch && + iswctype(mb_cur_max==1?btowc(mcbuf.ch):mcbuf.ch, + *wctp)) + goto match; + if (--i == 0) + break; + if (++wctp == &bp->type[NTYPE]) + wctp = &bp->extype[0]; + } + } + /* + * The main match is determined by the weight[0] value + * of the character (or characters, if the input can be + * taken as a multiple character collating element). + */ + mcbuf.max = 0; + mcbuf.bp = bp; + mcbuf.col = bp->col; + mcbuf.cep = ELEM_BADCHAR; + mcce(&mcbuf, ELEM_BADCHAR, str, mb_cur_max, 0); + if (mcbuf.cep == ELEM_BADCHAR) + return -1; /* never matches */ + if (mcbuf.cep != ELEM_ENCODED) + mcbuf.wc = mcbuf.cep->weight[0]; + /* + * POSIX.2 demands that both a character and its case counterpart + * can match if REG_ICASE is set. This means that [B-z] matches + * 'A', 'a', and '['. + */ + if (bp->flags & BKT_ONECASE) + { + lc = to_lower(mcbuf.wc); + uc = to_upper(mcbuf.wc); + } + else + lc = uc = mcbuf.wc; + /* + * See if it's in the set. Note that the list of true wide + * character values has explicit ranges. + */ + if (mcbuf.wc <= UCHAR_MAX) + { + if (bp->byte[PLIND(lc)] & PLBIT(lc)) + goto match; + if (lc != uc && (bp->byte[PLIND(uc)] & PLBIT(uc))) + goto match; + } + else if ((i = bp->nwide) != 0) + { + wchar_t *wcp = &bp->wide[0]; + long lcmp, ucmp; + + for (;;) + { + if ((lcmp = lc - *wcp) == 0) + goto match; + ucmp = uc - *wcp; + if (lc != uc && ucmp == 0) + goto match; + if (--i == 0) + break; + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + if (*wcp == RANGE) + { + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + if (lcmp > 0 && lc <= *wcp) + goto match; + if (lc != uc && ucmp > 0 && uc < *wcp) + goto match; + if ((i -= 2) == 0) + break; + if (++wcp == &bp->wide[NWIDE]) + wcp = &bp->exwide[0]; + } + } + } + /* + * The last chance for a match is if an equivalence class + * was specified for which the primary weights are scattered + * through the weight[0]s. + */ + if ((i = bp->nquiv) != 0 && mcbuf.cep != ELEM_ENCODED) + { + wuchar_type *wucp = &bp->quiv[0]; + + mcbuf.wc = mcbuf.cep->weight[1]; + for (;;) + { + if (mcbuf.wc == *wucp) + goto match; + if (--i == 0) + break; + if (++wucp == &bp->quiv[NQUIV]) + wucp = &bp->exquiv[0]; + } + } + /* + * Only here when no match against the set was found. + * One final special case w/r/t newline. + */ + if (bp->flags & BKT_NEGATED) + { + if (wc != '\n' || (bp->flags & BKT_NOTNL) == 0) + return mcbuf.aft - str; + } + return -1; +match:; + /* + * Only here when a match against the described set is found. + */ + if (bp->flags & BKT_NEGATED) + return -1; + return mcbuf.aft - str; +} diff --git a/libuxre/colldata.h b/libuxre/colldata.h new file mode 100644 index 0000000..e3a3784 --- /dev/null +++ b/libuxre/colldata.h @@ -0,0 +1,226 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)colldata.h 1.5 (gritter) 5/1/04 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LIBUXRE_COLLDATA_H +#define LIBUXRE_COLLDATA_H + +typedef struct +{ + long coll_offst; /* offset to xnd table */ + long sub_cnt; /* length of subnd table */ + long sub_offst; /* offset to subnd table */ + long str_offst; /* offset to strings for subnd table */ + long flags; /* nonzero if reg.exp. used */ +} hd; + +typedef struct +{ + unsigned char ch; /* character or number of followers */ + unsigned char pwt; /* primary weight */ + unsigned char swt; /* secondary weight */ + unsigned char ns; /* index of follower state list */ +} xnd; + +typedef struct +{ + char *exp; /* expression to be replaced */ + long explen; /* length of expression */ + char *repl; /* replacement string */ +} subnd; + +/*----------------------------------*/ + +#include <wcharm.h> +#include <limits.h> +/* #include <stdlock.h> */ + +/* +* Structure of a collation file: +* 1. CollHead (maintbl is 0 if CHF_ENCODED) +* if !CHF_ENCODED then +* 2. CollElem[bytes] (256 for 8 bit bytes) +* 3. if CHF_INDEXED then +* CollElem[wides] (nmain-256 for 8 bit bytes) +* else +* CollMult[wides] +* 4. CollMult[*] (none if multtbl is 0) +* 5. wuchar_type[*] (none if repltbl is 0) +* 6. CollSubn[*] (none if subntbl is 0) +* 7. strings (first is pathname for .so if CHF_DYNAMIC) +* +* The actual location of parts 2 through 7 is not important. +* +* The main table is in encoded value order. +* +* All indeces/offsets must be nonzero to be effective; zero is reserved +* to indicate no-such-entry. This implies either that an unused initial +* entry is placed in each of (4) through (7), or that the "start offset" +* given by the header is artificially pushed back by an entry size. +* +* Note that if CHF_ENCODED is not set, then nweight must be positive. +* +* If an element can begin a multiple character element, it contains a +* nonzero multbeg which is the initial index into (4) for its list; +* the list is terminated by a CollMult with a ch of zero. +* +* If there are elements with the same primary weight (weight[1]), then +* for each such element, it must have a CollMult list. The CollMult +* that terminates the list (ch==0) notes the lowest and highest basic +* weights for those elements with that same primary weight value +* respectively in weight[0] and weight[1]. If there are some basic +* weights between these values that do not have the same primary +* weight--are not in the equivalence class--then the terminator also +* has a SUBN_SPECIAL mark. Note that this list terminator should be +* shared when the elements are not multiple character collating +* elements because they wouldn't otherwise have a CollMult list. +* +* WGHT_IGNORE is used to denote ignored collating elements for a +* particular collation ordering pass. All main table entries other +* than for '\0' will have a non-WGHT_IGNORE weight[0]. However, it is +* possible for a CollMult entries from (4) to have a WGHT_IGNORE +* weight[0]: If, for example, "xyz" is a multiple character collating +* element, but "xy" is not, then the CollMult for "y" will have a +* WGHT_IGNORE weight[0]. Also, WGHT_IGNORE is used to terminate each +* list of replacement weights. +* +* Within (3), it is possible to describe a sequence of unremarkable +* collating elements with a single CollMult entry. If the SUBN_SPECIAL +* bit is set, the rest of subnbeg represents the number of collating +* elements covered by this entry. The weight[0] values are determined +* by adding the difference between the encoded value and the entry's ch +* value to the entry's weight[0]. This value is then substituted for +* any weight[n], n>0 that has only the WGHT_SPECIAL bit set. libuxre_collelem() +* hides any match to such an entry by filling in a "spare" CollElem. +* +* If there are substitution strings, then for each character that begins +* a string, it has a nonzero subnbeg which is similarly the initial +* index into (6). The indeces in (6) refer to offsets within (7). +*/ + +#define TOPBIT(t) (((t)1) << (sizeof(t) * CHAR_BIT - 1)) + +#define CHF_ENCODED 0x1 /* collation by encoded values only */ +#define CHF_INDEXED 0x2 /* main table indexed by encoded values */ +#define CHF_MULTICH 0x4 /* a multiple char. coll. elem. exists */ +#define CHF_DYNAMIC 0x8 /* shared object has collation functions */ + +#define CWF_BACKWARD 0x1 /* reversed ordering for this weight */ +#define CWF_POSITION 0x2 /* weight takes position into account */ + +#define CLVERS 1 /* most recent version */ + +#define WGHT_IGNORE 0 /* ignore this collating element */ +#define WGHT_SPECIAL TOPBIT(wuchar_type) +#define SUBN_SPECIAL TOPBIT(unsigned short) + +#ifndef COLL_WEIGHTS_MAX +#define COLL_WEIGHTS_MAX 1 +#endif + +typedef struct +{ + unsigned long maintbl; /* start of main table */ + unsigned long multtbl; /* start of multi-char table */ + unsigned long repltbl; /* start of replacement weights */ + unsigned long subntbl; /* start of substitutions */ + unsigned long strstbl; /* start of sub. strings */ + unsigned long nmain; /* # entries in main table */ + unsigned short flags; /* CHF_* bits */ + unsigned short version; /* handle future changes */ + unsigned char elemsize; /* # bytes/element (w/padding) */ + unsigned char nweight; /* # weights/element */ + unsigned char order[COLL_WEIGHTS_MAX]; /* CWF_* bits/weight */ +} CollHead; + +typedef struct +{ + unsigned short multbeg; /* start of multi-chars */ + unsigned short subnbeg; /* start of substitutions */ + wuchar_type weight[COLL_WEIGHTS_MAX]; +} CollElem; + +typedef struct +{ + wchar_t ch; /* "this" character (of sequence) */ + CollElem elem; /* its full information */ +} CollMult; + +typedef struct +{ + unsigned short strbeg; /* start of match string */ + unsigned short length; /* length of match string */ + unsigned short repbeg; /* start of replacement */ +} CollSubn; + +struct lc_collate +{ + const unsigned char *strstbl; + const wuchar_type *repltbl; + const CollElem *maintbl; + const CollMult *multtbl; + const CollSubn *subntbl; +#ifdef DSHLIB + void *handle; + void (*done)(struct lc_collate *); + int (*strc)(struct lc_collate *, const char *, const char *); + int (*wcsc)(struct lc_collate *, const wchar_t *, const wchar_t *); + size_t (*strx)(struct lc_collate *, char *, const char *, size_t); + size_t (*wcsx)(struct lc_collate *, wchar_t *, const wchar_t *, size_t); +#endif + const char *mapobj; + size_t mapsize; + unsigned long nmain; + short nuse; + unsigned short flags; + unsigned char elemsize; + unsigned char nweight; + unsigned char order[COLL_WEIGHTS_MAX]; +}; + +#define ELEM_BADCHAR ((CollElem *)0) +#define ELEM_ENCODED ((CollElem *)-1) + +/* +LIBUXRE_STATIC int libuxre_old_collate(struct lc_collate *); +LIBUXRE_STATIC int libuxre_strqcoll(struct lc_collate *, const char *, + const char *); +LIBUXRE_STATIC int libuxre_wcsqcoll(struct lc_collate *, const wchar_t *, + const wchar_t *); +*/ +extern struct lc_collate *libuxre_lc_collate(struct lc_collate *); +LIBUXRE_STATIC const CollElem *libuxre_collelem(struct lc_collate *, + CollElem *, wchar_t); +LIBUXRE_STATIC const CollElem *libuxre_collmult(struct lc_collate *, + const CollElem *, wchar_t); +/* +LIBUXRE_STATIC const CollElem *libuxre_collmbs(struct lc_collate *, + CollElem *, const unsigned char **); +LIBUXRE_STATIC const CollElem *libuxre_collwcs(struct lc_collate *, + CollElem *, const wchar_t **); +*/ + +#endif /* !LIBUXRE_COLLDATA_H */ diff --git a/libuxre/onefile.c b/libuxre/onefile.c new file mode 100644 index 0000000..78f22a0 --- /dev/null +++ b/libuxre/onefile.c @@ -0,0 +1,38 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)onefile.c 1.1 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define LIBUXRE_STATIC static + +#include "_collelem.c" +#include "_collmult.c" +#include "stubs.c" +#include "bracket.c" +#include "regdfa.c" +#include "regnfa.c" +#include "regparse.c" +#include "regcomp.c" +#include "regexec.c" diff --git a/libuxre/re.h b/libuxre/re.h new file mode 100644 index 0000000..2738a05 --- /dev/null +++ b/libuxre/re.h @@ -0,0 +1,228 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)re.h 1.15 (gritter) 2/6/05 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LIBUXRE_RE_H +#define LIBUXRE_RE_H + + /* + * Maps safe external tag to internal one + */ +#define re_coll_ lc_collate /* <regex.h> */ +/* #define __fnm_collate lc_collate */ /* <fnmatch.h> */ + +#include <limits.h> +#include <regex.h> +/* #include <fnmatch.h> */ +#include <colldata.h> + +#define NBSHT (sizeof(unsigned short) * CHAR_BIT) +#define NBYTE (((1 << CHAR_BIT) + NBSHT - 1) / NBSHT) +#define NTYPE 4 +#define NWIDE 32 +#define NQUIV 4 + +typedef struct +{ + struct lc_collate *col; /* only member set by caller */ + wctype_t *extype; + wuchar_type *exquiv; + wchar_t *exwide; + wctype_t type[NTYPE]; + wuchar_type quiv[NQUIV]; + wchar_t wide[NWIDE]; + unsigned short byte[NBYTE]; + unsigned short ntype; + unsigned short nquiv; + unsigned short nwide; + unsigned int flags; +} Bracket; + +#define BKT_NEGATED 0x001 /* complemented set */ +#define BKT_ONECASE 0x002 /* uppercase same as lowercase */ +#define BKT_NOTNL 0x004 /* do not match newline when BKT_NEGATED */ +#define BKT_BADRANGE 0x008 /* accept [m-a] ranges as [ma] */ +#define BKT_SEPRANGE 0x010 /* disallow [a-m-z] style ranges */ +#define BKT_NLBAD 0x020 /* newline disallowed */ +#define BKT_SLASHBAD 0x040 /* slash disallowed (for pathnames) */ +#define BKT_EMPTY 0x080 /* take leading ] is end (empty set) */ +#define BKT_ESCAPE 0x100 /* allow \ as quote for next anything */ +#define BKT_QUOTE 0x200 /* allow \ as quote for \\, \^, \- or \] */ +#define BKT_ESCNL 0x400 /* take \n as the newline character */ +#define BKT_ESCSEQ 0x800 /* otherwise, take \ as in C escapes */ +#define BKT_ODDRANGE 0x1000 /* oawk oddity: [m-a] means [m] */ +#define BKT_NOI18N 0x2000 /* disable [::] [==] [..] */ +#define BKT_OLDESC 0x4000 /* enable \b \f \n \r \t only */ + + /* + * These error returns for libuxre_bktmbcomp() are directly tied to + * the error returns for regcomp() for convenience. + */ +#define BKT_BADPAT (-REG_BADPAT) +#define BKT_ECOLLATE (-REG_ECOLLATE) +#define BKT_ECTYPE (-REG_ECTYPE) +#define BKT_EEQUIV (-REG_EEQUIV) +#define BKT_BADCHAR (-REG_EBKTCHAR) +#define BKT_EBRACK (-REG_EBRACK) +#define BKT_EMPTYSUBBKT (-REG_EMPTYSUBBKT) +#define BKT_ERANGE (-REG_ERANGE) +#define BKT_ESPACE (-REG_ESPACE) +#define BKT_BADESC (-REG_BADESC) +#define BKT_ILLSEQ (-REG_ILLSEQ) + + /* + * These must be distinct from the flags in <fnmatch.h>. + */ +#define FNM_COLLATE 0x2000 /* have collation information */ +#define FNM_CURRENT 0x4000 /* have full-sized fnm_t structure */ + + /* + * These must be distinct from the flags in <regex.h>. + */ +#define REG_NFA 0x20000000 +#define REG_DFA 0x40000000 +#define REG_GOTBKT 0x80000000 + +#define BRACE_INF USHRT_MAX +#define BRACE_MAX 5100 /* arbitrary number < SHRT_MAX */ +#define BRACE_DFAMAX 255 /* max amount for r.e. duplication */ + +typedef union /* extra info always kept for some tokens/nodes */ +{ + Bracket *bkt; /* ROP_BKT */ + size_t sub; /* ROP_LP (ROP_RP), ROP_REF */ + unsigned short num[2]; /* ROP_BRACE: num[0]=low, num[1]=high */ +} Info; + +typedef struct /* lexical context while parsing */ +{ + Info info; + const unsigned char *pat; + unsigned char *clist; + struct lc_collate *col; + unsigned long flags; + w_type tok; + size_t maxref; + size_t nleft; + size_t nright; + size_t nclist; + int bktflags; + int err; + int mb_cur_max; +} Lex; + +typedef struct t_tree Tree; /* RE parse tree node */ +struct t_tree +{ + union + { + Tree *ptr; /* unary & binary nodes */ + size_t pos; /* position for DFA leaves */ + } left; + union + { + Tree *ptr; /* binary nodes */ + Info info; + } right; + Tree *parent; + w_type op; /* positive => char. to match */ +}; + +typedef struct re_dfa_ Dfa; /* DFA engine description */ +typedef struct re_nfa_ Nfa; /* NFA engine description */ + +typedef struct +{ + const unsigned char *str; + regmatch_t *match; + size_t nmatch; + unsigned long flags; + int mb_cur_max; +} Exec; + + /* + * Regular expression operators. Some only used internally. + * All are negative, to distinguish them from the regular + * "match this particular wide character" operation. + */ +#define BINARY_ROP 0x02 +#define UNARY_ROP 0x01 +#define LEAF_ROP 0x00 + +#define MAKE_ROP(k, v) (-((v) | ((k) << 4))) +#define KIND_ROP(v) ((-(v)) >> 4) + +#define ROP_OR MAKE_ROP(BINARY_ROP, 1) +#define ROP_CAT MAKE_ROP(BINARY_ROP, 2) + +#define ROP_STAR MAKE_ROP(UNARY_ROP, 1) +#define ROP_PLUS MAKE_ROP(UNARY_ROP, 2) +#define ROP_QUEST MAKE_ROP(UNARY_ROP, 3) +#define ROP_BRACE MAKE_ROP(UNARY_ROP, 4) +#define ROP_LP MAKE_ROP(UNARY_ROP, 5) +#define ROP_RP MAKE_ROP(UNARY_ROP, 6) + +#define ROP_NOP MAKE_ROP(LEAF_ROP, 1) /* temporary */ +#define ROP_BOL MAKE_ROP(LEAF_ROP, 2) /* ^ anchor */ +#define ROP_EOL MAKE_ROP(LEAF_ROP, 3) /* $ anchor */ +#define ROP_ALL MAKE_ROP(LEAF_ROP, 4) /* anything (added) */ +#define ROP_ANYCH MAKE_ROP(LEAF_ROP, 5) /* . w/\n */ +#define ROP_NOTNL MAKE_ROP(LEAF_ROP, 6) /* . w/out \n */ +#define ROP_EMPTY MAKE_ROP(LEAF_ROP, 7) /* empty string */ +#define ROP_NONE MAKE_ROP(LEAF_ROP, 8) /* match failure */ +#define ROP_BKT MAKE_ROP(LEAF_ROP, 9) /* [...] */ +#define ROP_BKTCOPY MAKE_ROP(LEAF_ROP, 10) /* [...] (duplicated) */ +#define ROP_LT MAKE_ROP(LEAF_ROP, 11) /* \< word begin */ +#define ROP_GT MAKE_ROP(LEAF_ROP, 12) /* \> word end */ +#define ROP_REF MAKE_ROP(LEAF_ROP, 13) /* \digit */ +#define ROP_END MAKE_ROP(LEAF_ROP, 14) /* final (added) */ + + /* + * Return values: + * libuxre_bktmbcomp() + * <0 error (see BKT_* above); >0 #bytes scanned + * libuxre_bktmbexec() + * <0 doesn't match; >=0 matches, #extra bytes scanned + */ +LIBUXRE_STATIC void libuxre_bktfree(Bracket *); +LIBUXRE_STATIC int libuxre_bktmbcomp(Bracket *, const unsigned char *, + int, int); +LIBUXRE_STATIC int libuxre_bktmbexec(Bracket *, wchar_t, + const unsigned char *, int); + +LIBUXRE_STATIC void libuxre_regdeltree(Tree *, int); +LIBUXRE_STATIC Tree *libuxre_reg1tree(w_type, Tree *); +LIBUXRE_STATIC Tree *libuxre_reg2tree(w_type, Tree *, Tree *); +LIBUXRE_STATIC Tree *libuxre_regparse(Lex *, const unsigned char *, int); + +extern void libuxre_regdeldfa(Dfa *); +LIBUXRE_STATIC int libuxre_regdfacomp(regex_t *, Tree *, Lex *); +LIBUXRE_STATIC int libuxre_regdfaexec(Dfa *, Exec *); + +extern void libuxre_regdelnfa(Nfa *); +LIBUXRE_STATIC int libuxre_regnfacomp(regex_t *, Tree *, Lex *); +LIBUXRE_STATIC int libuxre_regnfaexec(Nfa *, Exec *); +#endif /* !LIBUXRE_RE_H */ diff --git a/libuxre/regcomp.c b/libuxre/regcomp.c new file mode 100644 index 0000000..20a197d --- /dev/null +++ b/libuxre/regcomp.c @@ -0,0 +1,77 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regcomp.c 1.6 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include "re.h" + +/* #pragma weak regcomp = _regcomp */ + +int +regcomp(regex_t *ep, const char *pat, int flags) +{ + Tree *tp; + Lex lex; + + if ((tp=libuxre_regparse(&lex, (const unsigned char *)pat, flags)) == 0) + goto out; + ep->re_nsub = lex.nleft; + ep->re_flags = lex.flags & ~(REG_NOTBOL | REG_NOTEOL | REG_NONEMPTY); + ep->re_col = lex.col; + ep->re_mb_cur_max = lex.mb_cur_max; + /* + * Build the engine(s). The factors determining which are built: + * 1. If the pattern built insists on an NFA, then only build NFA. + * 2. If flags include REG_NOSUB or REG_ONESUB and not (1), + * then only build DFA. + * 3. Otherwise, build both. + * Since libuxre_regdfacomp() modifies the tree and libuxre_regnfacomp() + * doesn't, libuxre_regnfacomp() must be called first, if both are to + * be called. + */ + if (ep->re_nsub != 0 && (flags & (REG_NOSUB | REG_ONESUB)) == 0 + || lex.flags & REG_NFA) + { + ep->re_flags |= REG_NFA; + if ((lex.err = libuxre_regnfacomp(ep, tp, &lex)) != 0) + goto out; + } + if ((lex.flags & REG_NFA) == 0) + { + ep->re_flags |= REG_DFA; + if ((lex.err = libuxre_regdfacomp(ep, tp, &lex)) != 0) + { + if (ep->re_flags & REG_NFA) + libuxre_regdelnfa(ep->re_nfa); + } + } +out:; + if (lex.err != 0 && lex.col != 0) + (void)libuxre_lc_collate(lex.col); + if (tp != 0) + libuxre_regdeltree(tp, lex.err); + return lex.err; +} diff --git a/libuxre/regdfa.c b/libuxre/regdfa.c new file mode 100644 index 0000000..8142e8d --- /dev/null +++ b/libuxre/regdfa.c @@ -0,0 +1,877 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regdfa.c 1.9 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "regdfa.h" + +/* +* Deterministic Finite Automata. +*/ + + /* + * Postorder traversal that returns a copy of the subtree, + * except that ROP_BKT becomes ROP_BKTCOPY (since they + * share the same pointed to Bracket object). + */ +static Tree * +copy(regex_t *ep, Tree *tp) +{ + Tree *np; + + if ((np = malloc(sizeof(Tree))) == 0) + return 0; + switch (np->op = tp->op) /* almost always correct */ + { + case ROP_BKT: + np->op = ROP_BKTCOPY; + /*FALLTHROUGH*/ + case ROP_BKTCOPY: + np->right.info.bkt = tp->right.info.bkt; + /*FALLTHROUGH*/ + default: + np->left.pos = ep->re_dfa->nposn++; + /*FALLTHROUGH*/ + case ROP_EMPTY: + return np; + case ROP_CAT: + case ROP_OR: + if ((np->right.ptr = copy(ep, tp->right.ptr)) == 0) + { + free(np); + return 0; + } + np->right.ptr->parent = np; + /*FALLTHROUGH*/ + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + case ROP_LP: + if ((np->left.ptr = copy(ep, tp->left.ptr)) == 0) + break; + np->left.ptr->parent = np; + return np; + } + libuxre_regdeltree(np, 1); + return 0; +} + + /* + * Postorder traversal. + * Assign unique ascending integer values to the leaves. + * Since the right child is traversed before the left, + * the position for ROP_END is guaranteed to be zero. + * The parse tree is rewritten in two cases: + * - Each ROP_BRACE is replaced by an equivalent--sometimes + * large--subtree using only ROP_CAT, ROP_QUEST, and + * ROP_PLUS. + * - If REG_ICASE, replace each simple character that has + * an uppercase equivalent with a ROP_OR subtree over the + * two versions. + * Since these rewrites occur bottom up, they have already + * been applied before any subtrees passed to copy(). + */ +static Tree * +findposn(regex_t *ep, Tree *tp, int mb_cur_max) +{ + unsigned int lo, hi; + Tree *ptr, *par; + w_type wc; + + switch (tp->op) + { + default: + if (ep->re_flags & REG_ICASE + && (wc = to_upper(tp->op)) != tp->op) + { + if ((ptr = libuxre_reg1tree(tp->op, 0)) == 0) + return 0; + ptr->parent = tp; + ptr->left.pos = ep->re_dfa->nposn++; + tp->op = ROP_OR; + tp->left.ptr = ptr; + ptr = libuxre_reg1tree(wc, 0); + if ((tp->right.ptr = ptr) == 0) + return 0; + ptr->parent = tp; + ptr->left.pos = ep->re_dfa->nposn++; + return tp; + } + /*FALLTHROUGH*/ + case ROP_BOL: + case ROP_EOL: + case ROP_ALL: + case ROP_ANYCH: + case ROP_NOTNL: + case ROP_NONE: + case ROP_BKT: + case ROP_BKTCOPY: + case ROP_END: + tp->left.pos = ep->re_dfa->nposn++; + return tp; + case ROP_EMPTY: + return tp; + case ROP_OR: + case ROP_CAT: + if ((tp->right.ptr = findposn(ep, tp->right.ptr, + mb_cur_max)) == 0) + return 0; + /*FALLTHROUGH*/ + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + case ROP_LP: + if ((tp->left.ptr = findposn(ep, tp->left.ptr, + mb_cur_max)) == 0) + return 0; + return tp; + case ROP_BRACE: + if ((tp->left.ptr = findposn(ep, tp->left.ptr, + mb_cur_max)) == 0) + return 0; + break; + } + /* + * ROP_BRACE as is cannot be handled in a DFA. This code + * duplicates the ROP_BRACE subtree as a left-towering + * series of ROP_CAT nodes, the first "lo" of which are + * direct copies of the original subtree. The tail of + * the series are either some number of ROP_QUESTs over + * copies of the original subtree, or a single ROP_PLUS + * over a copy (when "hi" is infinity). + * + * All interesting cases {lo,hi}: + * {0,0} -> ROP_EMPTY, parsing, temporary + * {0,1} -> ROP_QUEST, parsing + * {0,2} -> CAT(QUEST(left), QUEST(copy)) + * {0,n} -> CAT({0,n-1}, QUEST(copy)) + * {0,} -> ROP_STAR, parsing + * + * {1,1} -> ROP_NOP, parsing, temporary + * {1,2} -> CAT(left, QUEST(copy)) + * {1,n} -> CAT({1,n-1}, QUEST(copy)) + * {1,} -> ROP_PLUS, parsing + * + * {2,2} -> CAT(left, copy) + * {2,n} -> CAT({2,n-1}, QUEST(copy)) + * {2,} -> CAT(left, PLUS(copy)) + * + * {3,3} -> CAT({2,2}, copy) + * {3,n} -> CAT({3,n-1}, QUEST(copy)) + * {3,} -> CAT({2,2}, PLUS(copy)) + * + * {n,} -> CAT({n-1,n-1}, PLUS(copy)) + * + * In all cases, the ROP_BRACE node is turned into the + * left-most ROP_CAT, and a copy of its original subtree + * is connected as the right child. Note that the bottom- + * up nature of this duplication guarantees that copy() + * never sees a ROP_BRACE node. + */ + par = tp->parent; + lo = tp->right.info.num[0]; + hi = tp->right.info.num[1]; + if ((ptr = copy(ep, tp->left.ptr)) == 0) + return 0; + ptr->parent = tp; + tp->op = ROP_CAT; + tp->right.ptr = ptr; + if (lo == 0) + { + if ((tp->left.ptr = libuxre_reg1tree(ROP_QUEST, tp->left.ptr)) + == 0) + return 0; + tp->left.ptr->parent = tp; + } + else + { + if (hi == BRACE_INF || (hi -= lo) == 0) + lo--; /* lo > 1; no extra needed */ + while (--lo != 0) + { + if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) + == 0) + return 0; + } + } + if (hi == BRACE_INF) + { + if ((tp->right.ptr = libuxre_reg1tree(ROP_PLUS, tp->right.ptr)) + == 0) + return 0; + tp->right.ptr->parent = tp; + } + else if (hi != 0) + { + if ((tp->right.ptr = libuxre_reg1tree(ROP_QUEST, tp->right.ptr)) + == 0) + return 0; + ptr = tp->right.ptr; + ptr->parent = tp; + while (--hi != 0) + { + if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) + == 0) + return 0; + } + } + tp->parent = par; + return tp; +} + + /* + * Postorder traversal, but not always entire subtree. + * For each leaf reachable by the empty string, add it + * to the set. Return 0 if the subtree can match empty. + */ +static int +first(Dfa *dp, Tree *tp) +{ + switch (tp->op) + { + case ROP_BOL: + if (dp->flags & REG_NOTBOL) + return 0; + break; + case ROP_EOL: + if (dp->flags & REG_NOTEOL) + return 0; + break; + case ROP_EMPTY: + return 0; + case ROP_OR: + return first(dp, tp->left.ptr) & first(dp, tp->right.ptr); + case ROP_CAT: + if (first(dp, tp->left.ptr) != 0) + return 1; + return first(dp, tp->right.ptr); + case ROP_BRACE: + if (tp->right.info.num[0] != 0 && first(dp, tp->left.ptr) != 0) + return 1; + /*FALLTHROUGH*/ + case ROP_STAR: + case ROP_QUEST: + first(dp, tp->left.ptr); + return 0; + case ROP_LP: + case ROP_PLUS: + return first(dp, tp->left.ptr); + } + if (dp->posset[tp->left.pos] == 0) + { + dp->posset[tp->left.pos] = 1; + dp->nset++; + } + return 1; +} + + /* + * Walk from leaf up (most likely not to root). + * Determine follow set for the leaf by filling + * set[] with the positions reachable. + */ +static void +follow(Dfa *dp, Tree *tp) +{ + Tree *pp; + + switch ((pp = tp->parent)->op) + { + case ROP_CAT: + if (pp->left.ptr == tp && first(dp, pp->right.ptr) != 0) + break; + /*FALLTHROUGH*/ + case ROP_OR: + case ROP_QUEST: + case ROP_LP: + follow(dp, pp); + break; + case ROP_STAR: + case ROP_PLUS: + case ROP_BRACE: + first(dp, tp); + follow(dp, pp); + break; + } +} + + /* + * Postorder traversal. + * At each leaf, copy it into posn[] and assign its follow set. + * Because the left-most subtree is ROP_ALL under ROP_STAR, the + * follow set for its leaf (position dp->nposn-1) is the same + * as the initial state's signature (prior to any ROP_BOL). + */ +static int +posnfoll(Dfa *dp, Tree *tp) +{ + unsigned char *s; + size_t i, n; + size_t *fp; + Posn *p; + int ret; + + switch (tp->op) + { + case ROP_OR: + case ROP_CAT: + if ((ret = posnfoll(dp, tp->right.ptr)) != 0) + return ret; + /*FALLTHROUGH*/ + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + case ROP_LP: + if ((ret = posnfoll(dp, tp->left.ptr)) != 0) + return ret; + return 0; + case ROP_END: /* keeps follow() from walking above the root */ + p = &dp->posn[tp->left.pos]; + p->op = tp->op; + p->seti = 0; + p->nset = 0; + return 0; + case ROP_BKT: + case ROP_BKTCOPY: + p = &dp->posn[tp->left.pos]; + p->bkt = tp->right.info.bkt; + goto skip; + case ROP_BOL: + dp->flags |= REG_NOTBOL; /* adjacent ROP_BOLs match empty */ + break; + case ROP_EOL: + dp->flags |= REG_NOTEOL; /* adjacent ROP_EOLs match empty */ + break; + } + p = &dp->posn[tp->left.pos]; +skip:; + p->op = tp->op; + memset(dp->posset, 0, dp->nposn); + dp->nset = 0; + follow(dp, tp); + dp->flags &= ~(REG_NOTBOL | REG_NOTEOL); + fp = dp->posfoll; + if ((p->nset = dp->nset) > dp->avail) /* need more */ + { + if ((n = p->nset << 1) < dp->nposn) + n = dp->nposn; + dp->avail += n; + if ((fp = realloc(dp->posfoll, + sizeof(size_t) * (dp->avail + dp->used))) == 0) + { + return REG_ESPACE; + } + dp->posfoll = fp; + } + p->seti = dp->used; + if ((i = dp->nset) != 0) + { + dp->used += i; + dp->avail -= i; + fp += p->seti; + s = dp->posset; + n = 0; + do + { + if (*s++ != 0) + { + *fp++ = n; + if (--i == 0) + break; + } + } while (++n != dp->nposn); + } + return 0; +} + +static int +addstate(Dfa *dp) /* install state if unique; return its index */ +{ + size_t *sp, *fp; + size_t t, n, i; + int flushed; + + /* + * Compare dp->nset/dp->cursig[] against remembered states. + */ + t = dp->top; + do + { + if (dp->nsig[--t] != dp->nset) + continue; + if ((n = dp->nset) != 0) + { + fp = &dp->sigfoll[dp->sigi[t]]; + sp = &dp->cursig[0]; + loop:; + if (*fp++ != *sp++) + continue; /* to the do-while */ + if (--n != 0) + goto loop; + } + return t + 1; + } while (t != 0); + /* + * Not in currently cached states; add it. + */ + flushed = 0; + if ((t = dp->top) >= CACHESZ) /* need to flush the cache */ + { + flushed = 1; + n = dp->anybol; + n = dp->sigi[n] + dp->nsig[n]; /* past invariant states */ + dp->avail += dp->used - n; + dp->used = n; + dp->top = n = dp->nfix; + memset((void *)&dp->trans, 0, sizeof(dp->trans)); + memset((void *)&dp->acc[n], 0, CACHESZ - n); + t = n; + } + dp->top++; + fp = dp->sigfoll; + if ((n = dp->nset) > dp->avail) /* grow strip */ + { + i = dp->avail + n << 1; + if ((fp = realloc(fp, sizeof(size_t) * (i + dp->used))) == 0) + return 0; + dp->avail = i; + dp->sigfoll = fp; + } + dp->acc[t] = 0; + if ((dp->nsig[t] = n) != 0) + { + sp = dp->cursig; + if (sp[0] == 0) + dp->acc[t] = 1; + dp->sigi[t] = i = dp->used; + dp->used += n; + dp->avail -= n; + fp += i; + do + *fp++ = *sp++; + while (--n != 0); + } + t++; + if (flushed) + return -t; + return t; +} + +void +libuxre_regdeldfa(Dfa *dp) +{ + Posn *pp; + size_t np; + + if (dp->posfoll != 0) + free(dp->posfoll); + if (dp->sigfoll != 0) + free(dp->sigfoll); + if (dp->cursig != 0) + free(dp->cursig); + if ((pp = dp->posn) != 0) + { + /* + * Need to walk the positions list to free any + * space used for ROP_BKTs. + */ + np = dp->nposn; + do + { + if (pp->op == ROP_BKT) + { + libuxre_bktfree(pp->bkt); + free(pp->bkt); + } + } while (++pp, --np != 0); + free(dp->posn); + } + free(dp); +} + +int +regtrans(Dfa *dp, int st, w_type wc, int mb_cur_max) +{ + const unsigned char *s; + size_t *fp, *sp; + size_t i, n; + Posn *pp; + int nst; + + if ((n = dp->nsig[st]) == 0) /* dead state */ + return st + 1; /* stay here */ + memset(dp->posset, 0, dp->nposn); + dp->nset = 0; + fp = &dp->sigfoll[dp->sigi[st]]; + do + { + pp = &dp->posn[*fp]; + switch (pp->op) + { + case ROP_EOL: + if (wc == '\0' && (dp->flags & REG_NOTEOL) == 0) + break; + /*FALLTHROUGH*/ + case ROP_BOL: + default: + if (pp->op == wc) + break; + /*FALLTHROUGH*/ + case ROP_END: + case ROP_NONE: + continue; + case ROP_NOTNL: + if (wc == '\n') + continue; + /*FALLTHROUGH*/ + case ROP_ANYCH: + if (wc <= '\0') + continue; + break; + case ROP_ALL: + if (wc == '\0') + continue; + break; + case ROP_BKT: + case ROP_BKTCOPY: + /* + * Note that multiple character bracket matches + * are precluded from DFAs. (See regparse.c and + * regcomp.c.) Thus, the continuation string + * argument is not used in libuxre_bktmbexec(). + */ + if (wc > '\0' && + libuxre_bktmbexec(pp->bkt, wc, 0, mb_cur_max) == 0) + break; + continue; + } + /* + * Current character matches this position. + * For each position in its follow list, + * add that position to the new state's signature. + */ + i = pp->nset; + sp = &dp->posfoll[pp->seti]; + do + { + if (dp->posset[*sp] == 0) + { + dp->posset[*sp] = 1; + dp->nset++; + } + } while (++sp, --i != 0); + } while (++fp, --n != 0); + /* + * Move the signature (if any) into cursig[] and install it. + */ + if ((i = dp->nset) != 0) + { + fp = dp->cursig; + s = dp->posset; + for (n = 0;; n++) + { + if (*s++ != 0) + { + *fp++ = n; + if (--i == 0) + break; + } + } + } + if ((nst = addstate(dp)) < 0) /* flushed cache */ + nst = -nst; + else if (nst > 0 && (wc & ~(long)(NCHAR - 1)) == 0) + dp->trans[st][wc] = nst; + return nst; +} + +LIBUXRE_STATIC int +libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp) +{ + Tree *lp; + Dfa *dp; + Posn *p; + int st; + + /* + * It's convenient to insert an STAR(ALL) subtree to the + * immediate left of the current tree. This makes the + * "any match" libuxre_regdfaexec() not a special case, + * and the initial state signature will fall out when + * building the follow sets for all the leaves. + */ + if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0 + || (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0 + || (tp->left.ptr = lp + = libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0) + { + return REG_ESPACE; + } + lp->parent = tp; + if ((dp = calloc(1, sizeof(Dfa))) == 0) + return REG_ESPACE; + ep->re_dfa = dp; + /* + * Just in case null pointers aren't just all bits zero... + */ + dp->posfoll = 0; + dp->sigfoll = 0; + dp->cursig = 0; + dp->posn = 0; + /* + * Assign position values to each of the tree's leaves + * (the important parts), meanwhile potentially rewriting + * the parse tree so that it fits within the restrictions + * of our DFA. + */ + if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0) + goto err; + /* + * Get space for the array of positions and current set, + * now that the number of positions is known. + */ + if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0) + goto err; + dp->posset = (unsigned char *)&dp->posn[dp->nposn]; + /* + * Get follow sets for each position. + */ + if (posnfoll(dp, tp) != 0) + goto err; + /* + * Set up the special invariant states: + * - dead state (no valid transitions); index 0. + * - initial state for any match [STAR(ALL) follow set]; index 1. + * - initial state for any match after ROP_BOL. + * - initial state for left-most longest if REG_NOTBOL. + * - initial state for left-most longest after ROP_BOL. + * The final two are not allocated if leftmost() cannot be called. + * The pairs of initial states are the same if there is no + * explicit ROP_BOL transition. + */ + dp->avail += dp->used; + dp->used = 0; + if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0) + goto err; + p = &dp->posn[dp->nposn - 1]; /* same as first(root) */ + dp->cursig = &dp->posfoll[p->seti]; + dp->nset = p->nset; + dp->top = 1; /* index 0 is dead state */ + addstate(dp); /* must be state index 1 (returns 2) */ + if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0) + goto err; + dp->nfix = 2; + if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0) + goto err; + if ((dp->anybol = st - 1) == 2) /* new state */ + dp->nfix = 3; + if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */ + { + /* + * leftmost() initial states are the same as the + * "any match" ones without the STAR(ALL) position. + */ + dp->sigi[dp->nfix] = 0; + dp->nsig[dp->nfix] = dp->nsig[1] - 1; + dp->acc[dp->nfix] = dp->acc[1]; + dp->leftbol = dp->leftmost = dp->nfix; + dp->nfix++; + if (dp->anybol != 1) /* distinct state w/BOL */ + { + dp->sigi[dp->nfix] = dp->sigi[2]; + dp->nsig[dp->nfix] = dp->nsig[2] - 1; + dp->acc[dp->nfix] = dp->acc[2]; + dp->leftbol = dp->nfix; + dp->nfix++; + } + dp->top = dp->nfix; + } + return 0; +err:; + libuxre_regdeldfa(dp); + return REG_ESPACE; +} + +static int +leftmost(Dfa *dp, Exec *xp) +{ + const unsigned char *s, *beg, *end; + int i, nst, st, mb_cur_max; + w_type wc; + + mb_cur_max = xp->mb_cur_max; + beg = s = xp->str; + end = 0; + st = dp->leftbol; + if (xp->flags & REG_NOTBOL) + st = dp->leftmost; + if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) + end = s; /* initial empty match allowed */ + for (;;) + { + if ((wc = *s++) == '\n') + { + if (xp->flags & REG_NEWLINE) + wc = ROP_EOL; + } + else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) + s += i; + if ((wc & ~(long)(NCHAR - 1)) != 0 + || (nst = dp->trans[st][wc]) == 0) + { + if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) + return REG_ESPACE; + if (wc == ROP_EOL) /* REG_NEWLINE only */ + { + if (dp->acc[nst - 1]) + { + if (end == 0 || end < s) + end = s; + break; + } + beg = s; + st = dp->leftbol; + goto newst; + } + } + if ((st = nst - 1) == 0) /* dead state */ + { + if (end != 0) + break; + if ((wc = *beg++) == '\0') + return REG_NOMATCH; + else if (!ISONEBYTE(wc) && + (i = libuxre_mb2wc(&wc, beg)) > 0) + beg += i; + s = beg; + st = dp->leftmost; + goto newst; + } + if (wc == '\0') + { + if (dp->acc[st]) + { + s--; /* don't include \0 */ + if (end == 0 || end < s) + end = s; + break; + } + if (end != 0) + break; + return REG_NOMATCH; + } + newst:; + if (dp->acc[st]) + { + if (end == 0 || end < s) + end = s; + } + } + xp->match[0].rm_so = beg - xp->str; + xp->match[0].rm_eo = end - xp->str; + return 0; +} + +/* +* Optimization by simplification: singlebyte locale and REG_NEWLINE not set. +* Performance gain for grep is 25% so it's worth the hack. +*/ +static int +regdfaexec_opt(Dfa *dp, Exec *xp) +{ + const unsigned char *s; + int nst, st; + + s = xp->str; + st = dp->anybol; + if (xp->flags & REG_NOTBOL) + st = 1; + if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) + return 0; /* initial empty match allowed */ + do + { + if ((nst = dp->trans[st][*s]) == 0) + { + if ((nst = regtrans(dp, st, *s, 1)) == 0) + return REG_ESPACE; + } + if (dp->acc[st = nst - 1]) + return 0; + } while (*s++ != '\0'); /* st != 0 */ + return REG_NOMATCH; +} + +LIBUXRE_STATIC int +libuxre_regdfaexec(Dfa *dp, Exec *xp) +{ + const unsigned char *s; + int i, nst, st, mb_cur_max; + w_type wc; + + dp->flags = xp->flags & REG_NOTEOL; /* for regtrans() */ + mb_cur_max = xp->mb_cur_max; + if (xp->nmatch != 0) + return leftmost(dp, xp); + if (mb_cur_max == 1 && (xp->flags & REG_NEWLINE) == 0) + return regdfaexec_opt(dp, xp); + s = xp->str; + st = dp->anybol; + if (xp->flags & REG_NOTBOL) + st = 1; + if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) + return 0; /* initial empty match allowed */ + for (;;) + { + if ((wc = *s++) == '\n') + { + if (xp->flags & REG_NEWLINE) + wc = ROP_EOL; + } + else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) + s += i; + if ((wc & ~(long)(NCHAR - 1)) != 0 + || (nst = dp->trans[st][wc]) == 0) + { + if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) + return REG_ESPACE; + if (wc == ROP_EOL) /* REG_NEWLINE only */ + { + if (dp->acc[nst - 1]) + return 0; + if (dp->acc[st = dp->anybol]) + return 0; + continue; + } + } + if (dp->acc[st = nst - 1]) + return 0; + if (wc == '\0') /* st == 0 */ + return REG_NOMATCH; + } +} diff --git a/libuxre/regdfa.h b/libuxre/regdfa.h new file mode 100644 index 0000000..8cb0d48 --- /dev/null +++ b/libuxre/regdfa.h @@ -0,0 +1,75 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regdfa.h 1.3 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ + +/* +* Deterministic Finite Automata. +*/ + +#ifndef LIBUXRE_REGDFA_H +#define LIBUXRE_REGDFA_H + +#include <re.h> + +typedef struct +{ + Bracket *bkt; /* extra info for ROP_BKT */ + size_t nset; /* number of items in the follow set */ + size_t seti; /* index into the follow set strip */ + w_type op; /* the leaf match operation */ +} Posn; + +#define CACHESZ 32 /* max. states to remember (must fit in uchar) */ +#define NCHAR (1 << CHAR_BIT) + +struct re_dfa_ /*Dfa*/ +{ + unsigned char *posset; /* signatures built here */ + size_t *posfoll; /* follow strip for posn[] */ + size_t *sigfoll; /* follow strip for sigi[] */ + size_t *cursig; /* current state's signature */ + Posn *posn; /* important positions */ + size_t nposn; /* length of posn,cursig,posset */ + size_t used; /* used portion of follow strip */ + size_t avail; /* unused part of follow strip */ + size_t nset; /* # items nonzero in posset[] */ + size_t nsig[CACHESZ]; /* number of items in signature */ + size_t sigi[CACHESZ]; /* index into sigfoll[] */ + unsigned char acc[CACHESZ]; /* nonzero for accepting states */ + unsigned char leftmost; /* leftmost() start, not BOL */ + unsigned char leftbol; /* leftmost() start, w/BOL */ + unsigned char anybol; /* any match start, w/BOL */ + unsigned char nfix; /* number of invariant states */ + unsigned char top; /* next state index available */ + unsigned char flags; /* interesting flags */ + unsigned char trans[CACHESZ][NCHAR]; /* goto table */ +}; + +extern int regtrans(Dfa *, int, w_type, int); + +#endif /* !LIBUXRE_REGDFA_H */ diff --git a/libuxre/regerror.c b/libuxre/regerror.c new file mode 100644 index 0000000..397e3e5 --- /dev/null +++ b/libuxre/regerror.c @@ -0,0 +1,95 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regerror.c 1.4 (gritter) 3/29/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <string.h> +#include "re.h" +/* include "_locale.h" */ + +/* #pragma weak regerror = _regerror */ + +size_t +regerror(int err, const regex_t *ep, char *str, size_t max) +{ + const struct + { + int index; + const char *str; + } unk = + { + 88, "unknown regular expression error" + }, msgs[] = + { + /*ENOSYS*/ { 89, "feature not implemented" }, + /*0*/ { 0, "" }, + /*NOMATCH*/ { 90, "regular expression failed to match" }, + /*BADPAT*/ { 91, "invalid regular expression" }, + /*ECOLLATE*/ { 92, "invalid collating element construct" }, + /*ECTYPE*/ { 93, "invalid character class construct" }, + /*EEQUIV*/ { 94, "invalid equivalence class construct" }, + /*EBKTCHAR*/ { 95, "invalid character in '[ ]' construct" }, + /*EESCAPE*/ { 96, "trailing \\ in pattern" }, + /*ESUBREG*/ { 97, "'\\digit' out of range" }, + /*EBRACK*/ { 98, "'[ ]' imbalance" }, + /*EMPTYSUBBKT*/ { 99, "empty nested '[ ]' construct" }, + /*EMPTYPAREN*/ { 100, "empty '\\( \\)' or '( )'" }, + /*NOPAT*/ { 101, "empty pattern" }, + /*EPAREN*/ { 102, "'\\( \\)' or '( )' imbalance" }, + /*EBRACE*/ { 103, "'\\{ \\} or '{ }' imbalance" }, + /*BADBR*/ { 104, "invalid '\\{ \\}' or '{ }'" }, + /*ERANGE*/ { 105, "invalid endpoint in range" }, + /*ESPACE*/ { 106, "out of regular expression memory" }, + /*BADRPT*/ { 107, "invalid *, +, ?, \\{\\} or {} operator" }, + /*BADESC*/ { 108, "invalid escape sequence (e.g. \\0)" }, + /*ILLSEQ*/ { 109, "illegal byte sequence"} + }; + const char *p; + size_t len; + int i; + + if (err < REG_ENOSYS || REG_ILLSEQ < err) + { + i = unk.index; + p = unk.str; + } + else + { + i = msgs[err - REG_ENOSYS].index; + p = msgs[err - REG_ENOSYS].str; + } +/* p = __gtxt(_str_uxlibc, i, p); */ + len = strlen(p) + 1; + if (max != 0) + { + if (max > len) + max = len; + else if (max < len) + str[--max] = '\0'; + memcpy(str, p, max); + } + return len; +} diff --git a/libuxre/regex.h b/libuxre/regex.h new file mode 100644 index 0000000..8dbd028 --- /dev/null +++ b/libuxre/regex.h @@ -0,0 +1,153 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regex.h 1.13 (gritter) 2/6/05 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LIBUXRE_REGEX_H +#define LIBUXRE_REGEX_H +/* from unixsrc:usr/src/common/head/regex.h /main/uw7_nj/1 */ + +#include <sys/types.h> /* really only want [s]size_t */ + + /* + * Official regexec() flags. + */ +#define REG_NOTBOL 0x000001 /* start of string does not match ^ */ +#define REG_NOTEOL 0x000002 /* end of string does not match $ */ + + /* + * Additional regexec() flags. + */ +#define REG_NONEMPTY 0x000004 /* do not match empty at start of string */ + + /* + * Extensions to provide individual control over each + * of the differences between basic and extended REs. + */ +#define REG_OR 0x0000001 /* enable | operator */ +#define REG_PLUS 0x0000002 /* enable + operator */ +#define REG_QUEST 0x0000004 /* enable ? operator */ +#define REG_BRACES 0x0000008 /* use {m,n} (instead of \{m,n\}) */ +#define REG_PARENS 0x0000010 /* use (...) [instead of \(...\)] */ +#define REG_ANCHORS 0x0000020 /* ^ and $ are anchors anywhere */ +#define REG_NOBACKREF 0x0000040 /* disable \digit */ +#define REG_NOAUTOQUOTE 0x0000080 /* no automatic quoting of REG_BADRPTs */ + + /* + * Official regcomp() flags. + */ +#define REG_EXTENDED (REG_OR | REG_PLUS | REG_QUEST | REG_BRACES | \ + REG_PARENS | REG_ANCHORS | \ + REG_NOBACKREF | REG_NOAUTOQUOTE) +#define REG_ICASE 0x0000100 /* ignore case */ +#define REG_NOSUB 0x0000200 /* only success/fail for regexec() */ +#define REG_NEWLINE 0x0000400 /* take \n as line separator for ^ and $ */ + + /* + * Additional regcomp() flags. + * Some of these assume that int is >16 bits! + * Beware: 0x20000000 and above are used in re.h. + */ +#define REG_ONESUB 0x0000800 /* regexec() only needs pmatch[0] */ +#define REG_MTPARENFAIL 0x0001000 /* take empty \(\) or () as match failure */ +#define REG_MTPARENBAD 0x0002000 /* disallow empty \(\) or () */ +#define REG_BADRANGE 0x0004000 /* accept [m-a] ranges as [ma] */ +#define REG_ODDRANGE 0x0008000 /* oawk oddity: [m-a] means [m] */ +#define REG_SEPRANGE 0x0010000 /* disallow [a-m-z] style ranges */ +#define REG_BKTQUOTE 0x0020000 /* allow \ in []s to quote \, -, ^ or ] */ +#define REG_BKTEMPTY 0x0040000 /* allow empty []s (w/BKTQUOTE, BKTESCAPE) */ +#define REG_ANGLES 0x0080000 /* enable \<, \> operators */ +#define REG_ESCNL 0x0100000 /* take \n as newline character */ +#define REG_NLALT 0x0200000 /* take newline as alternation */ +#define REG_ESCSEQ 0x0400000 /* otherwise, take \ as start of C escapes */ +#define REG_BKTESCAPE 0x0800000 /* allow \ in []s to quote next anything */ +#define REG_NOBRACES 0x1000000 /* disable {n,m} */ +#define REG_ADDITIVE 0x2000000 /* a+*b means + and * additive, ^+ is valid */ +#define REG_NOI18N 0x4000000 /* disable I18N features ([::] etc.) */ +#define REG_OLDESC 0x8000000 /* recognize \b \f \n \r \t \123 only */ +#define REG_AVOIDNULL 0x10000000/* avoid null subexpression matches */ +#define REG_OLDBRE (REG_BADRANGE | REG_ANGLES | REG_ESCNL) +#define REG_OLDERE (REG_OR | REG_PLUS | REG_QUEST | REG_NOBRACES | \ + REG_PARENS | REG_ANCHORS | REG_ODDRANGE | \ + REG_NOBACKREF | REG_ADDITIVE | REG_NOAUTOQUOTE) + + /* + * Error return values. + */ +#define REG_ENOSYS (-1) /* unsupported */ +#define REG_NOMATCH 1 /* regexec() failed to match */ +#define REG_BADPAT 2 /* invalid regular expression */ +#define REG_ECOLLATE 3 /* invalid collating element construct */ +#define REG_ECTYPE 4 /* invalid character class construct */ +#define REG_EEQUIV 5 /* invalid equivalence class construct */ +#define REG_EBKTCHAR 6 /* invalid character in [] construct */ +#define REG_EESCAPE 7 /* trailing \ in pattern */ +#define REG_ESUBREG 8 /* number in \digit invalid or in error */ +#define REG_EBRACK 9 /* [] imbalance */ +#define REG_EMPTYSUBBKT 10 /* empty sub-bracket construct */ +#define REG_EMPTYPAREN 11 /* empty \(\) or () [REG_MTPARENBAD] */ +#define REG_NOPAT 12 /* no (empty) pattern */ +#define REG_EPAREN 13 /* \(\) or () imbalance */ +#define REG_EBRACE 14 /* \{\} or {} imbalance */ +#define REG_BADBR 15 /* contents of \{\} or {} invalid */ +#define REG_ERANGE 16 /* invalid endpoint in expression */ +#define REG_ESPACE 17 /* out of memory */ +#define REG_BADRPT 18 /* *,+,?,\{\} or {} not after r.e. */ +#define REG_BADESC 19 /* invalid escape sequence (e.g. \0) */ +#define REG_ILLSEQ 20 /* illegal byte sequence */ + +typedef struct +{ + size_t re_nsub; /* only advertised member */ + unsigned long re_flags; /* augmented regcomp() flags */ + struct re_dfa_ *re_dfa; /* DFA engine */ + struct re_nfa_ *re_nfa; /* NFA engine */ + struct re_coll_ *re_col; /* current collation info */ + int re_mb_cur_max; /* MB_CUR_MAX acceleration */ + void *re_more; /* just in case... */ +} regex_t; + +typedef ssize_t regoff_t; + +typedef struct +{ + regoff_t rm_so; + regoff_t rm_eo; +} regmatch_t; + +#ifdef __cplusplus +extern "C" { +#endif + +int regcomp(regex_t *, const char *, int); +int regexec(const regex_t *, const char *, size_t, regmatch_t *, int); +size_t regerror(int, const regex_t *, char *, size_t); +void regfree(regex_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* !LIBUXRE_REGEX_H */ diff --git a/libuxre/regexec.c b/libuxre/regexec.c new file mode 100644 index 0000000..667868f --- /dev/null +++ b/libuxre/regexec.c @@ -0,0 +1,68 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regexec.c 1.7 (gritter) 2/6/05 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include "re.h" + +/* #pragma weak regexec = _regexec */ + +int +regexec(const regex_t *ep, const char *s, size_t n, regmatch_t *mp, int flg) +{ + Exec ex; + int ret; + + ex.flags = flg | (ep->re_flags & (REG_NEWLINE|REG_ICASE|REG_AVOIDNULL)); + ex.str = (const unsigned char *)s; + ex.match = mp; + ex.mb_cur_max = ep->re_mb_cur_max; + if ((ex.nmatch = n) != 0) /* impose limits from compile flags */ + { + if (ep->re_flags & REG_NOSUB) + n = ex.nmatch = 0; + else if (ep->re_flags & REG_ONESUB) + ex.nmatch = 1; + else if (n > ep->re_nsub + 1) + ex.nmatch = ep->re_nsub + 1; + } + if (ep->re_flags & REG_DFA && ex.nmatch <= 1) + ret = libuxre_regdfaexec(ep->re_dfa, &ex); + else + ret = libuxre_regnfaexec(ep->re_nfa, &ex); + /* + * Fill unused part of mp[]. + */ + if (ret != 0) + ex.nmatch = 0; + while (n > ex.nmatch) + { + n--; + mp[n].rm_so = -1; + mp[n].rm_eo = -1; + } + return ret; +} diff --git a/libuxre/regfree.c b/libuxre/regfree.c new file mode 100644 index 0000000..31180d7 --- /dev/null +++ b/libuxre/regfree.c @@ -0,0 +1,42 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regfree.c 1.3 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include "re.h" + +/* #pragma weak regfree = _regfree */ + +void +regfree(regex_t *ep) +{ + if (ep->re_flags & REG_DFA) + libuxre_regdeldfa(ep->re_dfa); + if (ep->re_flags & REG_NFA) + libuxre_regdelnfa(ep->re_nfa); + if (ep->re_col != 0) + (void)libuxre_lc_collate(ep->re_col); +} diff --git a/libuxre/regnfa.c b/libuxre/regnfa.c new file mode 100644 index 0000000..6953f1f --- /dev/null +++ b/libuxre/regnfa.c @@ -0,0 +1,1070 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regnfa.c 1.8 (gritter) 2/6/05 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <string.h> +#include <stdlib.h> +#include "re.h" +#include <stddef.h> +#include <ctype.h> + +typedef unsigned char Uchar; +typedef unsigned short Ushort; + +/* +* Nondeterministic Finite Automata. +*/ +typedef struct t_graph Graph; +struct t_graph +{ + union + { + Graph *ptr; + Info info; + } alt; + Graph *next; + w_type op; +}; + +typedef struct t_stack Stack; +struct t_stack +{ + Stack *link; /* simplifies cleanup */ + Stack *prev; /* covered states */ + Graph *wasgp; /* node associated with this state */ + const Uchar *str; /* saved position in the string */ + Ushort cnt; /* ROP_BRACE: traversal count */ +}; + + /* + * A Context holds all the information needed for each + * potential path through the NFA graph. + */ +typedef struct t_ctxt Context; +struct t_ctxt +{ + Context *link; /* simplifies cleanup */ + Context *next; /* singly linked */ + Stack *sp; /* nested counts */ + Graph *gp; /* starting node */ + Graph *wasgp; /* node associated with this state */ + const Uchar *str; /* saved position in the string */ + Ushort cnt; /* ROP_BRACE: traversal count */ + size_t nset; /* length of rm[] that is currently set */ + regmatch_t rm[1]; /* enough to cover re_nsub+1 (np->rmlen) */ +}; + +struct re_nfa_ /*Nfa*/ +{ + Graph *gp; /* entire NFA */ + Stack *sp; /* unused Stacks */ + Stack *allsp; /* linked Stacks (for cleanup) */ + Context *allcp; /* linked Contexts (for cleanup) */ + Context *cur; /* Contexts to be continued now */ + Context *step; /* Contexts waiting for a step of the NFA */ + Context *avail; /* unused Contexts */ + Context **ecur; /* ends cur list of Contexts */ + Context **estp; /* ends step list of Contexts */ + size_t rmlen; /* length of rm[] in each Context */ + size_t rmmin; /* minimum length needed */ + size_t used; /* length used for this libuxre_regnfaexec() */ + w_type beg; /* nonzero for fixed char initial node NFAs */ +}; + +#define ROP_MTOR ROP_CAT /* ROP_OR, except might be empty loop */ + + /* + * Depth first traversal. + * Make a singly linked list (in alt.ptr) of the graph's nodes. + * Must toss any ROP_BKTs, too, since "alt" is overwritten. + */ +static void +deltolist(Graph *gp, Graph **list) +{ + Graph *ptr; + + if ((ptr = gp->next) != 0) /* first time */ + { + gp->next = 0; + if (gp->op == ROP_OR || gp->op == ROP_MTOR) + deltolist(gp->alt.ptr, list); + deltolist(ptr, list); + if (gp->op == ROP_BKT) + { + libuxre_bktfree(gp->alt.info.bkt); + free(gp->alt.info.bkt); + } + } + else if (gp->op == ROP_END) + gp->op = ROP_NOP; + else + return; + gp->alt.ptr = *list; + *list = gp; +} + + /* + * After the list is turned into a linked list, + * walk that list freeing the nodes. + */ +static void +delgraph(Graph *gp) +{ + Graph *gp2, end; + + gp2 = &end; + deltolist(gp, &gp2); + while ((gp = gp2) != &end) + { + gp2 = gp->alt.ptr; + free(gp); + } +} + + /* + * Depth first traversal. + * Look for ROP_NOPs and prune them from the graph. + * Chain them all together on *nop's list. + */ +static Graph * +nopskip(Graph *gp, Graph **nop) +{ + Graph *ptr; + + if ((ptr = gp->next) != 0) /* might have yet to do this subgraph */ + { + if (gp->op == ROP_NOP) + { + if (gp->alt.ptr != 0) /* touched */ + return gp->next; /* already did it */ + gp->alt.ptr = *nop; + *nop = gp; + } + gp->next = 0; /* this subgraph's pending */ + if (gp->op == ROP_OR || gp->op == ROP_MTOR) + gp->alt.ptr = nopskip(gp->alt.ptr, nop); + gp->next = nopskip(ptr, nop); + if (gp->op == ROP_NOP) + return gp->next; + } + return gp; +} + + /* + * Postorder traversal of the parse tree. + * Build a graph using "Thompson's" algorithm. + * The only significant modification is the + * ROP_BRACE->ROP_MTOR construction. + * Returns 1 => graph might match empty + * 0 => graph cannot match empty + * -1 => error (in allocation) + */ +static int +mkgraph(Tree *tp, Graph **first, Graph **last) +{ + Graph *new = 0, *nop, *lf, *ll, *rf, *rl; + int lmt, rmt = 0; + + if (tp->op != ROP_CAT) + { + if ((new = malloc(sizeof(Graph))) == 0) + return 0; + new->op = tp->op; /* usually */ + } + switch (tp->op) + { + case ROP_REF: + new->alt.info.sub = tp->right.info.sub; + *first = new; + *last = new; + return 1; /* safe--can't really tell */ + case ROP_BKT: + tp->op = ROP_BKTCOPY; /* now graph owns clean up */ + /*FALLTHROUGH*/ + case ROP_BKTCOPY: + new->alt.info.bkt = tp->right.info.bkt; + /*FALLTHROUGH*/ + default: + *first = new; + *last = new; + return 0; + case ROP_EMPTY: + new->op = ROP_NOP; + new->alt.ptr = 0; /* untouched */ + *first = new; + *last = new; + return 1; + case ROP_OR: + case ROP_CAT: + lf = 0; /* in case of error */ + if ((rmt = mkgraph(tp->right.ptr, &rf, &rl)) < 0) + goto err; + /*FALLTHROUGH*/ + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + case ROP_BRACE: + case ROP_LP: + if ((lmt = mkgraph(tp->left.ptr, &lf, &ll)) < 0) + goto err; + break; + } + /* + * Note that ROP_NOP only serves as the node that reconnects + * the two choices of an incoming ROP_OR or ROP_QUEST. To + * prevent rewalking portions of the graph in nopskip(), + * this code marks all ROP_NOP nodes as currently untouched. + */ + switch (tp->op) + { + case ROP_OR: + if ((nop = malloc(sizeof(Graph))) == 0) + goto err; + nop->op = ROP_NOP; + nop->alt.ptr = 0; /* untouched */ + ll->next = nop; + rl->next = nop; + new->next = lf; + new->alt.ptr = rf; + *first = new; + *last = nop; + return lmt | rmt; + case ROP_CAT: /* no "new" */ + ll->next = rf; + *first = lf; + *last = rl; + return lmt & rmt; + case ROP_QUEST: + if ((nop = malloc(sizeof(Graph))) == 0) + goto err; + nop->op = ROP_NOP; + nop->alt.ptr = 0; /* untouched */ + new->op = ROP_OR; + new->next = lf; + new->alt.ptr = nop; + ll->next = nop; + *first = new; + *last = nop; + return 1; + case ROP_STAR: + *first = new; + rmt = 1; + star:; + new->op = lmt ? ROP_MTOR : ROP_OR; + new->alt.ptr = lf; + ll->next = new; + *last = new; + return rmt; + case ROP_PLUS: + *first = lf; + rmt = lmt; + goto star; + case ROP_BRACE: + if ((nop = malloc(sizeof(Graph))) == 0) + goto err; + nop->op = ROP_MTOR; /* going to save state anyway... */ + nop->alt.ptr = lf; + ll->next = new; + new->next = nop; + new->alt.info.num[1] = tp->right.info.num[1]; + if ((new->alt.info.num[0] = tp->right.info.num[0]) == 0) + { + lmt = 1; + *first = new; + } + else + { + new->alt.info.num[0]--; /* already done 1 */ + if (new->alt.info.num[1] != BRACE_INF) + new->alt.info.num[1]--; /* likewise */ + *first = lf; + } + *last = nop; + return lmt; + case ROP_LP: + if ((nop = malloc(sizeof(Graph))) == 0) + goto err; + nop->op = ROP_RP; + nop->alt.info.sub = tp->right.info.sub; + new->alt.info.sub = tp->right.info.sub; + new->next = lf; + ll->next = nop; + *first = new; + *last = nop; + return lmt; + } +err:; + if (KIND_ROP(tp->op) == BINARY_ROP && rf != 0) + delgraph(rf); + if (lf != 0) + delgraph(lf); + if (tp->op != ROP_CAT) + free(new); + return -1; +} + + /* + * Semi-preorder traversal. + * Return zero if there's no simple first character + * (including the operation ROP_BOL) that must always + * be at the start of a matching string. + * This code doesn't attempt to get an answer if the + * first of the tree many be empty. + */ +static w_type +firstop(Tree *tp) +{ + w_type op; + + switch (tp->op) + { + case ROP_OR: + if ((op = firstop(tp->left.ptr)) == 0 + || op != firstop(tp->right.ptr)) + { + return 0; + } + return op; + case ROP_BRACE: + if (tp->right.info.num[0] == 0) + return 0; + /*FALLTHROUGH*/ + case ROP_CAT: + case ROP_PLUS: + case ROP_LP: + return firstop(tp->left.ptr); + default: + if (tp->op < 0) + return 0; + /*FALLTHROUGH*/ + case ROP_BOL: + return tp->op; + } +} + +void +libuxre_regdelnfa(Nfa *np) +{ + Context *cp, *cpn; + Stack *sp, *spn; + + if (np->gp != 0) + delgraph(np->gp); + for (cp = np->allcp; cp != 0; cp = cpn) + { + cpn = cp->link; + free(cp); + } + for (sp = np->allsp; sp != 0; sp = spn) + { + spn = sp->link; + free(sp); + } + free(np); +} + +LIBUXRE_STATIC int +libuxre_regnfacomp(regex_t *ep, Tree *tp, Lex *lxp) +{ + Graph *gp, end; + Nfa *np; + + if ((np = malloc(sizeof(Nfa))) == 0) + goto err; + np->gp = 0; /* in case of error */ + if (mkgraph(tp, &np->gp, &gp) < 0) + goto err; + gp->next = 0; /* nothing follows ROP_END */ + np->rmlen = 0; + if ((ep->re_flags & REG_NOSUB) == 0) + np->rmlen = ep->re_nsub + 1; + np->rmmin = 0; + if (lxp->maxref != 0 && (np->rmmin = lxp->maxref + 1) > np->rmlen) + np->rmlen = np->rmmin; + /* + * Delete all ROP_NOPs from the graph. + * nopskip() disconnects them from the graph and + * links them together through their alt.ptr's. + */ + gp = &end; + np->gp = nopskip(np->gp, &gp); + while (gp != &end) + { + Graph *gp2 = gp; + + gp = gp->alt.ptr; + free(gp2); + } + np->sp = 0; + np->allsp = 0; + np->avail = 0; + np->allcp = 0; + ep->re_nfa = np; + np->beg = firstop(tp); + return 0; +err:; + if (np != 0) + { + if (np->gp != 0) + delgraph(np->gp); + free(np); + } + return REG_ESPACE; +} + +static Stack * +newstck(Nfa *np) +{ + Stack *sp, **spp; + int i; + + if ((sp = np->sp) == 0) /* get more */ + { + spp = &np->sp; + i = 4; + while ((sp = malloc(sizeof(Stack))) != 0) + { + sp->link = np->allsp; + np->allsp = sp; + *spp = sp; + spp = &sp->prev; + if (--i == 0) + break; + } + *spp = 0; + if ((sp = np->sp) == 0) /* first malloc failed */ + return 0; + } + np->sp = sp->prev; + return sp; +} + +static int +mkstck(Nfa *np, Context *cp, Graph *gp) +{ + Stack *new, *sp; + + if (gp == 0) /* copy existing stack tail */ + { + /* + * Hoist up top of stack. + */ + new = cp->sp; + cp->wasgp = new->wasgp; + cp->str = new->str; + cp->cnt = new->cnt; + cp->sp = new->prev; + if ((sp = new->prev) == 0) /* only one below */ + { + new->prev = np->sp; + np->sp = new; + cp->sp = 0; + return 0; + } + for (;;) /* copy the rest; reusing the old top */ + { + new->wasgp = sp->wasgp; + new->str = sp->str; + new->cnt = sp->cnt; + if ((new->prev = sp->prev) == 0) + break; + if ((new->prev = newstck(np)) == 0) + return REG_ESPACE; + new = new->prev; + sp = sp->prev; + } + return 0; + } + if (cp->wasgp != 0) /* push current down */ + { + if ((new = newstck(np)) == 0) + return REG_ESPACE; + new->prev = cp->sp; + cp->sp = new; + new->wasgp = cp->wasgp; + new->str = cp->str; + new->cnt = cp->cnt; + } + cp->wasgp = gp; + cp->str = 0; + cp->cnt = 0; + return 0; +} + + /* + * Allocate a new Context (from np->avail) + * and add it to the end of the current list. + */ +static int +newctxt(Nfa *np, Context *cp, Graph *gp) +{ + Context *new; + size_t n; + + if ((new = np->avail) == 0) /* need more */ + { + Context *ncp, **cpp; + int i; + + /* + * Can't easily allocate Contexts in one call because + * the alignments (given the varying length of rm[]) + * are potentially nontrivial. + */ + n = offsetof(Context, rm) + np->rmlen * sizeof(regmatch_t); + i = 4; + cpp = &np->avail; + while ((ncp = malloc(n)) != 0) + { + ncp->link = np->allcp; + np->allcp = ncp; + *cpp = ncp; + cpp = &ncp->next; + if (--i == 0) + break; + } + *cpp = 0; + if ((new = np->avail) == 0) /* first malloc failed */ + return REG_ESPACE; + } + np->avail = new->next; + new->next = 0; + new->gp = gp; + new->sp = 0; + new->wasgp = 0; + new->nset = 0; + if (cp != 0) /* copy existing context information */ + { + if (cp->sp != 0) /* copy tail of stack */ + { + new->sp = cp->sp; + if (mkstck(np, new, 0) != 0) + return REG_ESPACE; + } + new->wasgp = cp->wasgp; + new->str = cp->str; + new->cnt = cp->cnt; + /* + * Copy any valid subexpression match information + * from the existing context. + */ + if (np->used != 0 && (n = cp->nset) != 0) + { + regmatch_t *rmn = new->rm, *rmo = cp->rm; + + new->nset = n; + for (;; ++rmn, ++rmo) + { + rmn->rm_so = rmo->rm_so; + rmn->rm_eo = rmo->rm_eo; + if (--n == 0) + break; + } + } + } + /* + * Append it to the end of the current Context list. + */ + *np->ecur = new; + np->ecur = &new->next; + return 0; +} + + /* + * Compare two byte string sequences for equality. + * If REG_ICASE, walk through the strings doing + * caseless comparisons of the wide characters. + */ +static int +casecmp(const Uchar *s, Exec *xp, ssize_t i, ssize_t n, int mb_cur_max) +{ + const Uchar *p = &xp->str[i]; + const Uchar *end; + w_type wc1, wc2; + int k; + + if (strncmp((char *)s, (char *)p, n) == 0) /* try for exact match */ + return 1; + if ((xp->flags & REG_ICASE) == 0) + return 0; + /* + * Walk through each testing for a match, ignoring case, + * of the resulting wide characters. + * Note that only "s" can run out of characters. + */ + end = &p[n]; + do + { + if ((wc1 = *s++) == '\0') + return 0; + if (!ISONEBYTE(wc1) && (k = libuxre_mb2wc(&wc1, s)) > 0) + s += k; + if (!ISONEBYTE(wc2 = *p++) && (k = libuxre_mb2wc(&wc2, p)) > 0) + p += k; + if (wc1 != wc2) + { + wc1 = to_lower(wc1); + wc2 = to_lower(wc2); + if (wc1 != wc2) + return 0; + } + } while (p < end); + return 1; +} + +LIBUXRE_STATIC int +libuxre_regnfaexec(Nfa *np, Exec *xp) +{ + const Uchar *s, *s1, *s2; + Context *cp, *cpn; + Graph *gp, *brace; + Stack *sp, *spn; + ssize_t rmso, len; + int i, ret, mb_cur_max; + w_type wc; + size_t n; + + ret = 0; /* assume it matches */ + rmso = -1; /* but no match yet */ + np->cur = 0; + np->step = 0; + np->ecur = &np->cur; + np->estp = &np->step; + if ((np->used = xp->nmatch) < np->rmmin) + np->used = np->rmmin; + s1 = 0; /* one char back */ + s = xp->str; /* current high water in string */ + mb_cur_max = xp->mb_cur_max; + for (;;) + { + /* + * Get next character from string. + * If the engine proper hasn't started and the engine + * requires a particular character to start and this + * character isn't it, try the next one. + */ + for (;;) + { + s2 = s1; + s1 = s; + if (!ISONEBYTE(wc = *s++) && + (i = libuxre_mb2wc(&wc, s)) > 0) + s += i; + if (np->cur != 0 || np->beg == wc || np->beg == 0) + break; + if (np->beg == ROP_BOL) + { + if (s2 == 0 && (xp->flags & REG_NOTBOL) == 0) + break; + if ((xp->flags & REG_NEWLINE) == 0) + goto nomatch; + if (s2 != 0 && *s2 == '\n') + break; + } + if (wc == '\0') + goto nomatch; + } + /* + * Start the engine by inserting a fresh initial context + * if there's no known match as yet. (Once some match + * has been found, the end is near.) + */ + if (rmso < 0 && newctxt(np, 0, np->gp) != 0) + goto err; + /* + * Walk the current Contexts list, trying each. + * "loop" is when a new Context is to be tried, + * "again" is when the same Context continues, + * but wc was not yet matched. + */ + cp = np->cur; + loop:; + gp = cp->gp; + again:; + switch (gp->op) + { + case ROP_BRACE: /* gp->next->op == ROP_MTOR */ + brace = gp; + gp = gp->next; + goto mtor; + case ROP_MTOR: + brace = 0; + mtor:; + if (cp->wasgp != gp) /* first time */ + { + if (mkstck(np, cp, gp) != 0) + goto err; + } + else if (cp->str == s) /* spinning */ + goto poptonext; + cp->str = s; + if (brace != 0) + { + if (cp->cnt >= brace->alt.info.num[1]) + goto poptonext; + if (++cp->cnt <= brace->alt.info.num[0]) + { + gp = gp->alt.ptr; + goto again; + } + if (cp->cnt > BRACE_MAX) + cp->cnt = BRACE_MAX; + } + if (newctxt(np, cp, gp->alt.ptr) != 0) + goto err; + poptonext:; + cp->wasgp = 0; + if ((sp = cp->sp) != 0) /* pop stack */ + { + cp->sp = sp->prev; + cp->wasgp = sp->wasgp; + cp->str = sp->str; + cp->cnt = sp->cnt; + sp->prev = np->sp; + np->sp = sp; + } + /*FALLTHROUGH*/ + case ROP_EMPTY: + tonext:; + gp = gp->next; + goto again; + case ROP_OR: + if (newctxt(np, cp, gp->alt.ptr) != 0) + goto err; + goto tonext; + case ROP_LP: + if ((n = gp->alt.info.sub) < np->used) + { + size_t k; + + cp->rm[n].rm_so = s1 - xp->str; + cp->rm[n].rm_eo = -1; + /* + * Mark any skipped subexpressions as + * failing to participate in the match. + */ + if ((k = cp->nset) < n) + { + regmatch_t *rmp = &cp->rm[k]; + + for (;; rmp++) + { + rmp->rm_so = -1; + rmp->rm_eo = -1; + if (++k >= n) + break; + } + } + cp->nset = n + 1; + } + goto tonext; + case ROP_RP: + if ((n = gp->alt.info.sub) < np->used) + cp->rm[n].rm_eo = s1 - xp->str; + goto tonext; + case ROP_BOL: + if (s2 == 0) + { + if (xp->flags & REG_NOTBOL) + goto failed; + } + else if ((xp->flags & REG_NEWLINE) == 0 || *s2 != '\n') + goto failed; + goto tonext; + case ROP_EOL: + if (wc == '\0') + { + if (xp->flags & REG_NOTEOL) + goto failed; + } + else if ((xp->flags & REG_NEWLINE) == 0 || wc != '\n') + goto failed; + goto tonext; + default: /* character match */ + if (gp->op != wc) + { + if ((xp->flags & REG_ICASE) == 0 + || gp->op != to_lower(wc)) + { + goto failed; + } + } + nextwc:; + cp->gp = gp->next; + tostep:; + cpn = cp->next; + cp->next = 0; + *np->estp = cp; + np->estp = &cp->next; + if ((cp = cpn) == 0) + break; + goto loop; + case ROP_NOTNL: + if (wc == '\n') + goto failed; + /*FALLTHROUGH*/ + case ROP_ANYCH: + if (wc > '\0') + goto nextwc; + /*FALLTHROUGH*/ + case ROP_NONE: + failed:; + cpn = cp->next; + cp->next = np->avail; + np->avail = cp; + if ((cp = cpn) == 0) + break; + goto loop; + case ROP_LT: + if (s2 == 0) + { + if (xp->flags & REG_NOTBOL) + goto failed; + } + else + { + w_type pwc; + + if (wc != '_' && + !iswalnum(mb_cur_max == 1 ? btowc(wc) : wc)) + goto failed; + if (!ISONEBYTE(pwc = *s2)) + libuxre_mb2wc(&pwc, &s2[1]); + if (pwc == '_' || + iswalnum(mb_cur_max== 1 ? btowc(pwc) : pwc)) + goto failed; + } + goto tonext; + case ROP_GT: + if (wc == '_' || + iswalnum(mb_cur_max == 1 ? btowc(wc) : wc)) + goto failed; + goto tonext; + case ROP_BKT: + case ROP_BKTCOPY: + if (cp->wasgp == gp) /* rest of MCCE */ + { + checkspin:; + if (s1 >= cp->str) /* got it all */ + goto poptonext; + goto tostep; + } + if ((i = libuxre_bktmbexec(gp->alt.info.bkt, wc, s, + mb_cur_max)) < 0) + goto failed; + if ((n = i) == 0) /* only matched wc */ + goto nextwc; + spin:; + if (mkstck(np, cp, gp) != 0) + goto err; + cp->gp = gp; /* stay here until reach past s+n */ + cp->str = s + n; + goto tostep; + case ROP_REF: + if (cp->wasgp == gp) /* rest of matched string */ + goto checkspin; + if ((n = gp->alt.info.sub) >= cp->nset) + goto failed; + if ((len = cp->rm[n].rm_eo) < 0) + goto failed; + if ((len -= n = cp->rm[n].rm_so) == 0) + goto tonext; + if (casecmp(s1, xp, n, len, mb_cur_max) == 0) + goto failed; + if ((n = s - s1) >= len) + goto nextwc; + n = len - n; + goto spin; + case ROP_END: /* success! */ + if (xp->flags & REG_NONEMPTY) + { + if (s2 == 0) + goto failed; + } + if (xp->nmatch == 0) + goto match; + /* + * Mark any skipped subexpressions as failing to match. + */ + if ((n = cp->nset) < xp->nmatch) + { + do + { + cp->rm[n].rm_so = -1; + cp->rm[n].rm_eo = -1; + } while (++n < xp->nmatch); + } + /* + * Note the left-most match that's longest. + */ + n = cp->rm[0].rm_so; + if (rmso < 0 || n < rmso) + { + rmso = n; + record:; + memcpy(xp->match, cp->rm, + xp->nmatch * sizeof(regmatch_t)); + goto failed; + } + if (rmso < n || xp->match[0].rm_eo > cp->rm[0].rm_eo) + goto failed; + if (xp->match[0].rm_eo < cp->rm[0].rm_eo) + goto record; +#if 0 /* maximize the lengths of earlier LP...RPs */ + /* + * If both are of the same length and start + * at the same point, choose the one with + * a "longest submatch from left to right" + * where an empty string wins over a nonmatch. + */ + for (n = 1; n < xp->nmatch; n++) + { + ssize_t nlen; + + /* + * First, go with the choice that has any + * match for subexpr n. + */ + len = xp->match[n].rm_eo; + nlen = cp->rm[n].rm_eo; + if (nlen < 0) + { + if (len >= 0) + break; + } + else if (len < 0) + goto record; + /* + * Both have a match; go with the longer. + */ + len -= xp->match[n].rm_so; + nlen -= cp->rm[n].rm_so; + if (nlen < len) + break; + if (nlen > len) + goto record; + } +#else /* take LP and RP as "fence posts" and maximize earlier gaps */ + /* + * If both are of the same length and start + * at the same point, choose the one with + * the larger earlier subpatterns, in which + * each rm_so and rm_eo serves as a separator. + */ + for (n = 1; n < xp->nmatch; n++) + { + ssize_t nlen; + int use; + + if (xp->flags & REG_AVOIDNULL) { + /* + * This is to to satisfy POSIX.1-2001 + * XBD pp. 172-173 ll. 6127-6129, whose + * translation is "do not match null + * expressions if there is a choice". + * See also POSIX.2 interpretation #43 + * in which the question was raised. + * + * The first subexpression of "\(x*\)*" + * must thus match the string "xxx". + */ + use = cp->rm[n].rm_eo - + cp->rm[n].rm_so >= + xp->match[n].rm_eo - + xp->match[n].rm_so || + xp->match[n].rm_so < 0; + } else + use = 1; + /* + * Choose the rightmost ROP_LP as that + * maximizes the gap from before. + */ + len = xp->match[n].rm_so; + nlen = cp->rm[n].rm_so; + if (len < nlen && use) + goto record; + if (len > nlen) + break; + /* + * The ROP_LPs are at the same point: + * Choose the rightmost ROP_RP. + */ + len = xp->match[n].rm_eo; + nlen = cp->rm[n].rm_eo; + if (len < nlen && use) + goto record; + if (len > nlen) + break; + } +#endif + goto failed; + } + /* + * Finished the current Context list. If the input string + * has been entirely scanned, we're done. Otherwise, make + * the next step list current for the next character. + * If the next step list was empty and there's an existing + * match, that's the left-most longest. + */ + if (wc == '\0') + { + if (rmso >= 0) + goto match; + goto nomatch; + } + np->ecur = np->estp; + if ((np->cur = np->step) == 0) + { + if (rmso >= 0) + goto match; + np->ecur = &np->cur; /* was pointing at step */ + } + np->step = 0; + np->estp = &np->step; + } +nomatch:; + ret = REG_NOMATCH; +match:; + np->avail = 0; + for (cp = np->allcp; cp != 0; cp = cpn) + { + cpn = cp->link; + cp->next = np->avail; + np->avail = cp; + } + np->sp = 0; + for (sp = np->allsp; sp != 0; sp = spn) + { + spn = sp->link; + sp->prev = np->sp; + np->sp = sp; + } + return ret; +err:; + ret = REG_ESPACE; + goto match; +} diff --git a/libuxre/regparse.c b/libuxre/regparse.c new file mode 100644 index 0000000..0a5c6b2 --- /dev/null +++ b/libuxre/regparse.c @@ -0,0 +1,1091 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)regparse.c 1.12 (gritter) 9/22/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* #include "synonyms.h" */ +#include <stdlib.h> +#include <ctype.h> +#include "re.h" + +LIBUXRE_STATIC void +libuxre_regdeltree(Tree *tp, int all) +{ + if (tp == 0) + return; + if (tp->op < 0) + { + switch (KIND_ROP(tp->op)) + { + case BINARY_ROP: + libuxre_regdeltree(tp->right.ptr, all); + /*FALLTHROUGH*/ + case UNARY_ROP: + libuxre_regdeltree(tp->left.ptr, all); + break; + default: + if (tp->op == ROP_BKT && all) + { + libuxre_bktfree(tp->right.info.bkt); + free(tp->right.info.bkt); + } + break; + } + } + free(tp); +} + +LIBUXRE_STATIC Tree * +libuxre_reg1tree(w_type op, Tree *lp) +{ + Tree *tp; + + if ((tp = malloc(sizeof(Tree))) == 0) + { + if (lp != 0) + libuxre_regdeltree(lp, 1); + return 0; + } + tp->op = op; + tp->left.ptr = lp; + if (lp != 0) + lp->parent = tp; + return tp; +} + +LIBUXRE_STATIC Tree * +libuxre_reg2tree(w_type op, Tree *lp, Tree *rp) +{ + Tree *tp; + + if ((tp = malloc(sizeof(Tree))) == 0) + { + libuxre_regdeltree(lp, 1); + libuxre_regdeltree(rp, 1); + return 0; + } + tp->op = op; + tp->left.ptr = lp; + lp->parent = tp; + tp->right.ptr = rp; + rp->parent = tp; + return tp; +} + +static int +lex(Lex *lxp) +{ + size_t num; + w_type wc; + int n, mb_cur_max; + + mb_cur_max = lxp->mb_cur_max; +nextc: switch (wc = *lxp->pat++) /* interesting ones are single bytes */ + { + case '\0': + lxp->pat--; /* continue to report ROP_END */ + wc = ROP_END; + break; + case '(': + if (lxp->flags & REG_PARENS) + { + leftparen:; + /* + * Must keep track of the closed and + * yet-to-be closed groups as a list. + * Consider (()a(()b(()c(()d... in which + * at each letter another even-numbered + * group is made available, but no + * odd-numbered ones are. + */ + if ((lxp->flags & REG_NOBACKREF) == 0) + { + if (lxp->nleft >= lxp->nclist) /* grow it */ + { + unsigned char *p; + + lxp->nclist += 8; /* arbitrary */ + if ((p = realloc(lxp->clist, + lxp->nclist)) == 0) + { + lxp->err = REG_ESPACE; + return -1; + } + lxp->clist = p; + } + lxp->clist[lxp->nleft] = 0; /* unavailable */ + } + lxp->nleft++; + wc = ROP_LP; + } + break; + case ')': + /* + * For REG_PARENS, only take a right paren as a close + * if there is a matching left paren. + */ + if (lxp->flags & REG_PARENS && lxp->nright < lxp->nleft) + { + lxp->nright++; + rightparen:; + /* + * The group that is being closed is the highest + * numbered as-yet-unclosed group. + */ + if ((lxp->flags & REG_NOBACKREF) == 0) + { + num = lxp->nleft; + while (lxp->clist[--num] != 0) + ; + lxp->clist[num] = 1; + } + wc = ROP_RP; + } + break; + case '.': + wc = ROP_ANYCH; + if (lxp->flags & REG_NEWLINE) + wc = ROP_NOTNL; + break; + case '*': + if (lxp->flags & REG_ADDITIVE) + { + nxtstar: switch (*lxp->pat) + { + case '+': + if ((lxp->flags & REG_PLUS) == 0) + break; + lxp->pat++; + goto nxtstar; + case '?': + if ((lxp->flags & REG_QUEST) == 0) + break; + /*FALLTHRU*/ + case '*': + lxp->pat++; + goto nxtstar; + } + } + wc = ROP_STAR; + break; + case '^': + /* + * Look "behind" to see if this is an anchor. + * Take it as an anchor if it follows an alternation + * operator. (lxp->tok is initially set to ROP_OR.) + */ + if (lxp->flags & REG_ANCHORS || lxp->tok == ROP_OR) { + if (lxp->flags & REG_ADDITIVE) + { + int optional = 0; + + nxtcar: switch (*lxp->pat) + { + case '+': + if ((lxp->flags & REG_PLUS) == 0) + break; + lxp->pat++; + goto nxtcar; + case '?': + if ((lxp->flags & REG_QUEST) == 0) + break; + /*FALLTHRU*/ + case '*': + optional = 1; + lxp->pat++; + goto nxtcar; + } + if (optional) + goto nextc; + } + wc = ROP_BOL; + } + break; + case '$': + /* + * Look ahead to see if this is an anchor, + * unless any '$' is an anchor. + * Take it as an anchor if it occurs just before + * the pattern end or an alternation operator. + */ + if (lxp->flags & REG_ANCHORS || *lxp->pat == '\0' + || (lxp->flags & REG_OR && *lxp->pat == '|') + || (lxp->flags & REG_NLALT && *lxp->pat == '\n')) + { + if (lxp->flags & REG_ADDITIVE) + { + int optional = 0; + + nxtdol: switch (*lxp->pat) + { + case '+': + if ((lxp->flags & REG_PLUS) == 0) + break; + lxp->pat++; + goto nxtdol; + case '?': + if ((lxp->flags & REG_QUEST) == 0) + break; + /*FALLTHRU*/ + case '*': + optional = 1; + lxp->pat++; + goto nxtdol; + } + if (optional) + goto nextc; + } + wc = ROP_EOL; + } + break; + case '+': + if (lxp->flags & REG_PLUS) + { + wc = ROP_PLUS; + if (lxp->flags & REG_ADDITIVE) + { + nxtplus: switch (*lxp->pat) + { + case '?': + if ((lxp->flags & REG_QUEST) == 0) + break; + case '*': + wc = ROP_STAR; + /*FALLTHRU*/ + case '+': + lxp->pat++; + goto nxtplus; + } + } + } + break; + case '?': + if (lxp->flags & REG_QUEST) + { + wc = ROP_QUEST; + if (lxp->flags & REG_ADDITIVE) + { + nxtquest: switch (*lxp->pat) + { + case '+': + if ((lxp->flags & REG_PLUS) == 0) + break; + case '*': + wc = ROP_STAR; + /*FALLTHRU*/ + case '?': + lxp->pat++; + goto nxtquest; + } + } + } + break; + case '\n': + if (lxp->flags & REG_NLALT) + { + /* + * Even when newline is an alternative separator, + * it doesn't permit parenthesized subexpressions + * to include it. + */ + if (lxp->nleft != lxp->nright) + { + lxp->err = REG_EPAREN; + return -1; + } + wc = ROP_OR; + } + else if (lxp->flags & REG_NEWLINE) + lxp->flags |= REG_NFA; + break; + case '|': + if (lxp->flags & REG_OR) + wc = ROP_OR; + break; + case '[': + if ((lxp->info.bkt = malloc(sizeof(Bracket))) == 0) + { + lxp->err = REG_ESPACE; + return -1; + } + if ((lxp->flags & REG_GOTBKT) == 0) /* first time */ + { + struct lc_collate *col; + + lxp->flags |= REG_GOTBKT; + lxp->bktflags = 0; + if (lxp->flags & REG_ICASE) + lxp->bktflags |= BKT_ONECASE; + if (lxp->flags & REG_NEWLINE) + lxp->bktflags |= BKT_NOTNL; + if (lxp->flags & REG_BADRANGE) + lxp->bktflags |= BKT_BADRANGE; + if (lxp->flags & REG_ODDRANGE) + lxp->bktflags |= BKT_ODDRANGE; + if (lxp->flags & REG_SEPRANGE) + lxp->bktflags |= BKT_SEPRANGE; + if (lxp->flags & REG_BKTQUOTE) + lxp->bktflags |= BKT_QUOTE; + if (lxp->flags & REG_BKTEMPTY) + lxp->bktflags |= BKT_EMPTY; + if (lxp->flags & REG_ESCNL) + lxp->bktflags |= BKT_ESCNL; + if (lxp->flags & REG_NLALT) + lxp->bktflags |= BKT_NLBAD; + if (lxp->flags & REG_ESCSEQ) + lxp->bktflags |= BKT_ESCSEQ; + if (lxp->flags & REG_BKTESCAPE) + lxp->bktflags |= BKT_ESCAPE; + if (lxp->flags & REG_NOI18N) + lxp->bktflags |= BKT_NOI18N; + if (lxp->flags & REG_OLDESC) + lxp->bktflags |= BKT_OLDESC; + if ((col = libuxre_lc_collate(0)) != 0) + { + if (col->maintbl == 0 + || col->flags & CHF_ENCODED) + { + (void)libuxre_lc_collate(col); + col = 0; + } + else if (col->flags & CHF_MULTICH) + lxp->flags |= REG_NFA; + } + lxp->col = col; + } + n = lxp->bktflags; + if (*lxp->pat == '^') + { + n |= BKT_NEGATED; + lxp->pat++; + } + lxp->info.bkt->col = lxp->col; + if ((n = libuxre_bktmbcomp(lxp->info.bkt, lxp->pat, + n, mb_cur_max)) < 0) + { + free(lxp->info.bkt); + lxp->err = -n; /* convert to REG_* errors */ + return -1; + } + /* + * NFA forced if newline can be a match and REG_NEWLINE is set. + */ + if ((lxp->flags & (REG_NFA | REG_NEWLINE)) == REG_NEWLINE + && lxp->pat[-1] == '[' /* i.e., not BKT_NEGATED */ + && libuxre_bktmbexec(lxp->info.bkt, '\n', 0, 1) == 0) + { + lxp->flags |= REG_NFA; + } + lxp->pat += n; + wc = ROP_BKT; + break; + case '{': + if (lxp->flags & REG_NOBRACES || (lxp->flags & REG_BRACES) == 0) + break; + interval:; + if (!isdigit(num = *lxp->pat)) + { + badbr:; + lxp->err = REG_BADBR; + if (*lxp->pat == '\0') + lxp->err = REG_EBRACE; /* more accurate */ + return -1; + } + num -= '0'; + while (isdigit(wc = *++lxp->pat)) + { + num *= 10; + if ((num += wc - '0') > BRACE_MAX) + goto badbr; + } + lxp->info.num[0] = num; + lxp->info.num[1] = num; + if (wc == ',') + { + lxp->info.num[1] = BRACE_INF; + if (isdigit(wc = *++lxp->pat)) + { + num = wc - '0'; + while (isdigit(wc = *++lxp->pat)) + { + num *= 10; + if ((num += wc - '0') > BRACE_MAX) + goto badbr; + } + if (num < lxp->info.num[0]) + goto badbr; + lxp->info.num[1] = num; + } + } + if ((lxp->flags & REG_BRACES) == 0) + { + if (wc != '\\') + goto badbr; + wc = *++lxp->pat; + } + if (wc != '}') + goto badbr; + lxp->pat++; + wc = ROP_BRACE; + /* + * Replace interval with simpler equivalents where possible, + * even when the operators are not otherwise available. + */ + if (lxp->info.num[1] <= 1) + { + if (lxp->info.num[0] == 1) + wc = ROP_NOP; /* {1,1} is noise */ + else if (lxp->info.num[1] == 0) + wc = ROP_EMPTY; /* {0,0} is empty string */ + else + wc = ROP_QUEST; /* {0,1} is ? */ + } + else if (lxp->info.num[1] == BRACE_INF) + { + if (lxp->info.num[0] == 0) + wc = ROP_STAR; + else if (lxp->info.num[0] == 1) + wc = ROP_PLUS; + else if (lxp->info.num[0] > BRACE_DFAMAX) + lxp->flags |= REG_NFA; + } + else if (lxp->info.num[1] > BRACE_DFAMAX) + { + lxp->flags |= REG_NFA; + } + break; + case '\\': + switch (wc = *lxp->pat++) + { + case '\0': + lxp->err = REG_EESCAPE; + return -1; + case '<': + if (lxp->flags & REG_ANGLES) + { + lxp->flags |= REG_NFA; + wc = ROP_LT; + } + goto out; + case '>': + if (lxp->flags & REG_ANGLES) + { + lxp->flags |= REG_NFA; + wc = ROP_GT; + } + goto out; + case '(': + if ((lxp->flags & REG_PARENS) == 0) + goto leftparen; + goto out; + case ')': + if ((lxp->flags & REG_PARENS) == 0) + { + if (++lxp->nright > lxp->nleft) + { + lxp->err = REG_EPAREN; + return -1; + } + goto rightparen; + } + goto out; + case '{': + if (lxp->flags & (REG_BRACES|REG_NOBRACES)) + goto out; + goto interval; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + num = wc - '0'; + if ((lxp->flags & REG_NOBACKREF) == 0) + { + backref:; + if (num > lxp->nleft + || lxp->clist[num - 1] == 0) + { + lxp->err = REG_ESUBREG; + return -1; + } + lxp->info.sub = num; + if (lxp->maxref < num) + lxp->maxref = num; + lxp->flags |= REG_NFA; + wc = ROP_REF; + goto out; + } + /* + * For compatibility (w/awk), permit "octal" 8 and 9. + * Already have the value of the first digit in num. + * + * If REG_OLDESC, exactly three digits must be present. + */ + tryoctal:; + if ((lxp->flags & REG_ESCSEQ) == 0) + goto out; + if ((wc = *lxp->pat) >= '0' && wc <= '9') + { + num <<= 3; + num += wc - '0'; + if ((wc = *++lxp->pat) >= '0' && wc <= '9') + { + num <<= 3; + num += wc - '0'; + lxp->pat++; + } + else if (lxp->flags & REG_OLDESC) + { + lxp->pat--; + wc = lxp->pat[-1]; + goto out; + } + } + else if (lxp->flags & REG_OLDESC) + { + wc = lxp->pat[-1]; + goto out; + } + if ((wc = num) <= 0) + { + lxp->err = REG_BADESC; + return -1; + } + goto out; + case '0': + if ((lxp->flags & REG_NOBACKREF) == 0 + && (num = *lxp->pat) >= '0' && num <= '9') + { + num -= '0'; + /* + * This loop ignores wraparounds. + * Keep track of number of digits in n. + */ + n = 1; + while ((wc = *++lxp->pat) >= '0' && wc <= '9') + { + num *= 10; + num += wc - '0'; + n++; + } + if (num != 0) + goto backref; + lxp->pat -= n; + } + num = 0; + goto tryoctal; + case 'a': + if ((lxp->flags&(REG_ESCSEQ|REG_OLDESC)) == REG_ESCSEQ) + wc = '\a'; + goto out; + case 'b': + if (lxp->flags & REG_ESCSEQ) + wc = '\b'; + goto out; + case 'f': + if (lxp->flags & REG_ESCSEQ) + wc = '\f'; + goto out; + case 'n': + if (lxp->flags & (REG_ESCSEQ | REG_ESCNL)) + { + wc = '\n'; + if (lxp->flags & REG_NEWLINE) + lxp->flags |= REG_NFA; + } + goto out; + case 'r': + if (lxp->flags & REG_ESCSEQ) + wc = '\r'; + goto out; + case 't': + if (lxp->flags & REG_ESCSEQ) + wc = '\t'; + goto out; + case 'v': + if ((lxp->flags&(REG_ESCSEQ|REG_OLDESC)) == REG_ESCSEQ) + wc = '\v'; + goto out; + case 'x': + if ((lxp->flags&(REG_ESCSEQ|REG_OLDESC)) == REG_ESCSEQ + && isxdigit(num = *lxp->pat)) + { + wc = num; + num = 0; + /* + * Take as many hex digits as possible, + * ignoring overflows. + * If the result (squeezed into a w_type) + * is positive, it's okay. + */ + do + { + if (isdigit(wc)) + wc -= '0'; + else if (isupper(wc)) + wc -= 'A' + 10; + else + wc -= 'a' + 10; + num <<= 4; + num |= wc; + } while (isxdigit(wc = *++lxp->pat)); + if ((wc = num) <= 0) + { + lxp->err = REG_BADESC; + return -1; + } + } + goto out; + } + /*FALLTHROUGH*/ + default: + if (!ISONEBYTE(wc)) + { + if ((n = libuxre_mb2wc(&wc, lxp->pat)) > 0) + lxp->pat += n; + else if (n < 0) + { + lxp->err = REG_ILLSEQ; + return -1; + } + } + if (lxp->flags & REG_ICASE) + wc = to_lower(wc); + break; + } +out:; + lxp->tok = wc; + return 0; +} + +static Tree *alt(Lex *); + +static Tree * +leaf(Lex *lxp) +{ + Tree *tp; + + if ((tp = malloc(sizeof(Tree))) == 0) + { + lxp->err = REG_ESPACE; + return 0; + } + switch (tp->op = lxp->tok) /* covers most cases */ + { + default: + if (tp->op < 0) + { + lxp->err = REG_BADPAT; + tp->right.ptr = 0; + goto badunary; + } + break; + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + if ((lxp->flags & REG_NOAUTOQUOTE) == 0 + && lxp->pat[-1] != '}') + { + tp->op = lxp->pat[-1]; + break; + } + /*FALLTHROUGH*/ + case ROP_BRACE: + case ROP_EMPTY: /* was {0,0} ROP_BRACE */ + case ROP_NOP: /* was {1,1} ROP_BRACE */ + lxp->err = REG_BADRPT; + badunary:; + tp->left.ptr = 0; + goto err; + case ROP_ANYCH: + case ROP_NOTNL: + break; + case ROP_BOL: + case ROP_EOL: + case ROP_LT: + case ROP_GT: + /* + * Look ahead for what would have been taken to be + * postfix operators. + */ + if (lex(lxp) != 0) + goto err; + switch (lxp->tok) + { + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + if ((lxp->flags & REG_NOAUTOQUOTE) == 0 + && lxp->pat[-1] != '}') + { + lxp->tok = lxp->pat[-1]; + break; + } + /*FALLTHROUGH*/ + case ROP_BRACE: + case ROP_EMPTY: /* was {0,0} ROP_BRACE */ + case ROP_NOP: /* was {1,1} ROP_BRACE */ + lxp->err = REG_BADRPT; + goto err; + } + return tp; + case ROP_BKT: + tp->right.info.bkt = lxp->info.bkt; + break; + case ROP_REF: + tp->right.info.sub = lxp->info.sub; + break; + case ROP_LP: + tp->right.info.sub = lxp->nleft; + if (lex(lxp) != 0) + goto badunary; + if (lxp->tok == ROP_RP) /* empty parens; choice of meaning */ + { + if (lxp->flags & REG_MTPARENBAD) + { + lxp->err = REG_EMPTYPAREN; + goto badunary; + } + lxp->tok = ROP_EMPTY; + if (lxp->flags & REG_MTPARENFAIL) + lxp->tok = ROP_NONE; + if ((tp->left.ptr = libuxre_reg1tree(lxp->tok, 0)) == 0) + goto badunary; + } + else if ((tp->left.ptr = alt(lxp)) == 0) + { + if (lxp->err == REG_BADPAT) + goto parenerr; + goto badunary; + } + else if (lxp->tok != ROP_RP) + { + lxp->err = REG_BADPAT; + parenerr:; + if (lxp->nleft != lxp->nright) + lxp->err = REG_EPAREN; /* better choice */ + goto badunary; + } + tp->left.ptr->parent = tp; + break; + } + if (lex(lxp) != 0) + { + err:; + libuxre_regdeltree(tp, 1); + tp = 0; + } + return tp; +} + +static Tree * +post(Lex *lxp) +{ + Tree *lp; + + if ((lp = leaf(lxp)) == 0) + return 0; + switch (lxp->tok) + { + case ROP_EMPTY: /* this was {0,0} ROP_BRACE */ + libuxre_regdeltree(lp, 1); + lp = 0; + /*FALLTHROUGH*/ + case ROP_BRACE: + case ROP_STAR: + case ROP_PLUS: + case ROP_QUEST: + if ((lp = libuxre_reg1tree(lxp->tok, lp)) == 0) + { + lxp->err = REG_ESPACE; + return 0; + } + if (lxp->tok == ROP_BRACE) + lp->right.info = lxp->info; + /*FALLTHROUGH*/ + case ROP_NOP: /* this was {1,1} ROP_BRACE */ + if (lex(lxp) != 0) + { + libuxre_regdeltree(lp, 1); + return 0; + } + break; + } + return lp; +} + +static Tree * +cat(Lex *lxp) +{ + Tree *lp, *rp; + + if ((lp = post(lxp)) == 0) + return 0; + for (;;) + { + if (lxp->tok == ROP_OR || lxp->tok == ROP_RP + || lxp->tok == ROP_END) + { + return lp; + } + if ((rp = post(lxp)) == 0) + break; + if ((lp = libuxre_reg2tree(ROP_CAT, lp, rp)) == 0) + { + lxp->err = REG_ESPACE; + return 0; + } + } + libuxre_regdeltree(lp, 1); + return 0; +} + +static Tree * +alt(Lex *lxp) +{ + Tree *lp, *rp; + + if ((lp = cat(lxp)) == 0) + return 0; + for (;;) + { + if (lxp->tok != ROP_OR) + return lp; + if (lex(lxp) != 0) + break; + if (lxp->tok == ROP_END) + return lp; /* ignore trailing '|' */ + if ((rp = cat(lxp)) == 0) + break; + if ((lp = libuxre_reg2tree(ROP_OR, lp, rp)) == 0) + { + lxp->err = REG_ESPACE; + return 0; + } + } + libuxre_regdeltree(lp, 1); + return 0; +} + +LIBUXRE_STATIC Tree * +libuxre_regparse(Lex *lxp, const unsigned char *pat, int flags) +{ + Tree *lp, *rp; + + lp = 0; /* in case of error */ + lxp->clist = 0; + lxp->col = 0; + lxp->err = 0; + lxp->maxref = 0; + lxp->nleft = 0; + lxp->nright = 0; + lxp->nclist = 0; + lxp->mb_cur_max = MB_CUR_MAX; + if (flags & REG_OR && *pat == '|') + pat++; /* skip initial OR like egrep did */ + lxp->pat = pat; + lxp->flags = flags; + lxp->tok = ROP_OR; /* enables ^ as anchor */ + /* + * Get initial token. + */ + if (lex(lxp) != 0) + { + err:; + if (lp != 0) + { + libuxre_regdeltree(lp, 1); + lp = 0; + } + if (lxp->err == 0) + lxp->err = REG_ESPACE; + goto ret; + } + if (lxp->tok == ROP_END) + { + lxp->err = REG_NOPAT; + goto err; + } + if ((lp = alt(lxp)) == 0) /* parse entire RE */ + goto err; + if (lxp->maxref != 0 || (flags & REG_NOSUB) == 0) + { + if ((lp = libuxre_reg1tree(ROP_LP, lp)) == 0) + goto err; + lp->right.info.sub = 0; + } + if ((rp = libuxre_reg1tree(ROP_END, 0)) == 0) + goto err; + if ((lp = libuxre_reg2tree(ROP_CAT, lp, rp)) == 0) + goto err; + lp->parent = 0; +ret:; + if (lxp->clist != 0) + free(lxp->clist); + return lp; +} + +#ifdef REGDEBUG + +LIBUXRE_STATIC void +libuxre_regtree(Tree *tp, int n) +{ + const char *opstr; + char buf[32]; + int kind, next; + + if (n < 0) + next = -n + 2; + else + next = n + 2; + switch (tp->op) + { + case ROP_OR: + opstr = "|"; + kind = BINARY_ROP; + break; + case ROP_CAT: + opstr = "&"; + kind = BINARY_ROP; + break; + case ROP_STAR: + opstr = "*"; + kind = UNARY_ROP; + break; + case ROP_PLUS: + opstr = "+"; + kind = UNARY_ROP; + break; + case ROP_QUEST: + opstr = "?"; + kind = UNARY_ROP; + break; + case ROP_BRACE: + opstr = buf; + if (tp->right.info.num[1] == BRACE_INF) + { + sprintf(buf, "{%u,inf}", + (unsigned)tp->right.info.num[0]); + } + else + { + sprintf(buf, "{%u,%u}", + (unsigned)tp->right.info.num[0], + (unsigned)tp->right.info.num[1]); + } + kind = UNARY_ROP; + break; + case ROP_LP: + opstr = buf; + sprintf(buf, "%lu(", (unsigned long)tp->right.info.sub); + kind = UNARY_ROP; + break; + case ROP_RP: + opstr = buf; + sprintf(buf, ")%lu", (unsigned long)tp->right.info.sub); + kind = UNARY_ROP; + break; + case ROP_NOP: + opstr = "<NOP>"; + kind = LEAF_ROP; + break; + case ROP_BOL: + opstr = "<BOL>"; + kind = LEAF_ROP; + break; + case ROP_EOL: + opstr = "<EOL>"; + kind = LEAF_ROP; + break; + case ROP_ALL: + opstr = "<ALL>"; + kind = LEAF_ROP; + break; + case ROP_ANYCH: + opstr = "<ANYCH>"; + kind = LEAF_ROP; + break; + case ROP_NOTNL: + opstr = "<NOTNL>"; + kind = LEAF_ROP; + break; + case ROP_EMPTY: + opstr = "<MT>"; + kind = LEAF_ROP; + break; + case ROP_NONE: + opstr = "<NONE>"; + kind = LEAF_ROP; + break; + case ROP_BKT: + opstr = buf; + sprintf(buf, "[%#lx]", (unsigned long)tp->right.info.bkt); + kind = LEAF_ROP; + break; + case ROP_BKTCOPY: + opstr = buf; + sprintf(buf, "[%#lx]CPY", (unsigned long)tp->right.info.bkt); + kind = LEAF_ROP; + break; + case ROP_LT: + opstr = "\\<"; + kind = LEAF_ROP; + break; + case ROP_GT: + opstr = "\\>"; + kind = LEAF_ROP; + break; + case ROP_REF: + opstr = buf; + sprintf(buf, "\\%lu", (unsigned long)tp->right.info.sub); + kind = LEAF_ROP; + break; + case ROP_END: + opstr = "<END>"; + kind = LEAF_ROP; + break; + default: + opstr = buf; + if (tp->op > UCHAR_MAX) + sprintf(buf, "W%#x", tp->op); + else if (tp->op <= 0) + sprintf(buf, "UNK=%u", tp->op); + else + sprintf(buf, "%c", tp->op); + kind = LEAF_ROP; + break; + } + if (kind == BINARY_ROP) + libuxre_regtree(tp->right.ptr, -next); + printf("%*c:%s\n", next - 1, n < 0 ? 'R' : n > 0 ? 'L' : 'T', opstr); + if (kind != LEAF_ROP) + libuxre_regtree(tp->left.ptr, next); +} + +#endif /*REGDEBUG*/ diff --git a/libuxre/stubs.c b/libuxre/stubs.c new file mode 100644 index 0000000..bd670db --- /dev/null +++ b/libuxre/stubs.c @@ -0,0 +1,97 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)stubs.c 1.24 (gritter) 10/12/04 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* stubbed-out routines needed to complete the RE libc code */ + +#include "colldata.h" + +struct lc_collate * +libuxre_lc_collate(struct lc_collate *cp) +{ + static struct lc_collate curinfo = {0}; /* means CHF_ENCODED */ + + return &curinfo; +} + +#include "wcharm.h" + +LIBUXRE_STATIC int +libuxre_mb2wc(w_type *wt, const unsigned char *s) +{ + wchar_t wc; + int len; + + if ((len = mbtowc(&wc, (const char *)&s[-1], MB_LEN_MAX)) > 0) + *wt = wc; + else if (len == 0) + *wt = '\0'; + else /*if (len < 0)*/ + *wt = (w_type)WEOF; + return len > 0 ? len - 1 : len; +} + +#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 4 +#define USED __attribute__ ((used)) +#elif defined __GNUC__ +#define USED __attribute__ ((unused)) +#else +#define USED +#endif +static const char sccsid[] USED = "@(#)libuxre.sl 1.24 (gritter) 10/12/04"; +/* +_collelem.c: + _collelem.c 1.4 (gritter) 10/18/03 +_collmult.c: + _collmult.c 1.4 (gritter) 9/22/03 +bracket.c: + bracket.c 1.14 (gritter) 10/18/03 +colldata.h: + colldata.h 1.4 (gritter) 10/18/03 +onefile.c: + onefile.c 1.1 (gritter) 9/22/03 +re.h: + re.h 1.14 (gritter) 10/18/03 +regcomp.c: + regcomp.c 1.6 (gritter) 9/22/03 +regdfa.c: + regdfa.c 1.9 (gritter) 9/22/03 +regdfa.h: + regdfa.h 1.3 (gritter) 9/22/03 +regerror.c: + regerror.c 1.4 (gritter) 3/29/03 +regex.h: + regex.h 1.12 (gritter) 9/22/03 +regexec.c: + regexec.c 1.6 (gritter) 9/22/03 +regfree.c: + regfree.c 1.3 (gritter) 9/22/03 +regnfa.c: + regnfa.c 1.7 (gritter) 9/22/03 +regparse.c: + regparse.c 1.12 (gritter) 9/22/03 +wcharm.h: + wcharm.h 1.12 (gritter) 10/18/03 +*/ diff --git a/libuxre/wcharm.h b/libuxre/wcharm.h new file mode 100644 index 0000000..8985d6b --- /dev/null +++ b/libuxre/wcharm.h @@ -0,0 +1,63 @@ +/* + * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. + * + * Sccsid @(#)wcharm.h 1.12 (gritter) 10/18/03 + */ +/* UNIX(R) Regular Expresssion Library + * + * Note: Code is released under the GNU LGPL + * + * Copyright (C) 2001 Caldera International, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* Stubbed-out wide character locale information */ + +#ifndef LIBUXRE_WCHARM_H +#define LIBUXRE_WCHARM_H + +#ifndef LIBUXRE_STATIC +#define LIBUXRE_STATIC +#endif + +#ifndef LIBUXRE_WUCHAR_T +#define LIBUXRE_WUCHAR_T +typedef unsigned int wuchar_type; +#endif + +#ifndef LIBUXRE_W_TYPE +#define LIBUXRE_W_TYPE +typedef int w_type; +#endif + +#include <wchar.h> +#include <wctype.h> +#include <stdlib.h> + +#ifdef notdef +#define ISONEBYTE(ch) ((ch), 1) + +#define libuxre_mb2wc(wp, cp) ((wp), (cp), 0) +#endif /* notdef */ + +#define ISONEBYTE(ch) (((ch) & 0200) == 0 || mb_cur_max == 1) + +#define to_lower(ch) (mb_cur_max > 1 ? towlower(ch) : tolower(ch)) +#define to_upper(ch) (mb_cur_max > 1 ? towupper(ch) : toupper(ch)) + +LIBUXRE_STATIC int libuxre_mb2wc(w_type *, const unsigned char *); + +#endif /* !LIBUXRE_WCHARM_H */ |
