diff options
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/COPYRIGHT | 84 | ||||
-rw-r--r-- | src/backend/regex/Makefile | 29 | ||||
-rw-r--r-- | src/backend/regex/README | 409 | ||||
-rw-r--r-- | src/backend/regex/re_syntax.n | 979 | ||||
-rw-r--r-- | src/backend/regex/regc_color.c | 1140 | ||||
-rw-r--r-- | src/backend/regex/regc_cvec.c | 138 | ||||
-rw-r--r-- | src/backend/regex/regc_lex.c | 1173 | ||||
-rw-r--r-- | src/backend/regex/regc_locale.c | 766 | ||||
-rw-r--r-- | src/backend/regex/regc_nfa.c | 3181 | ||||
-rw-r--r-- | src/backend/regex/regc_pg_locale.c | 935 | ||||
-rw-r--r-- | src/backend/regex/regcomp.c | 2194 | ||||
-rw-r--r-- | src/backend/regex/rege_dfa.c | 929 | ||||
-rw-r--r-- | src/backend/regex/regerror.c | 120 | ||||
-rw-r--r-- | src/backend/regex/regexec.c | 1425 | ||||
-rw-r--r-- | src/backend/regex/regexport.c | 292 | ||||
-rw-r--r-- | src/backend/regex/regfree.c | 54 | ||||
-rw-r--r-- | src/backend/regex/regprefix.c | 260 |
17 files changed, 14108 insertions, 0 deletions
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT new file mode 100644 index 0000000..e50cfb1 --- /dev/null +++ b/src/backend/regex/COPYRIGHT @@ -0,0 +1,84 @@ +This regular expression package was originally developed by Henry Spencer. +It bears the following copyright notice: + +********************************************************************** + +Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + +Development of this software was funded, in part, by Cray Research Inc., +UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics +Corporation, none of whom are responsible for the results. The author +thanks all of them. + +Redistribution and use in source and binary forms -- with or without +modification -- are permitted for any purpose, provided that +redistributions in source form retain this entire copyright notice and +indicate the origin and nature of any modifications. + +I'd appreciate being given credit for this package in the documentation +of software which uses it, but that is not a requirement. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +********************************************************************** + +PostgreSQL adopted the code out of Tcl 8.4.1. 
Portions of regc_locale.c +and re_syntax.n were developed by Tcl developers other than Henry; these +files bear the Tcl copyright and license notice: + +********************************************************************** + +This software is copyrighted by the Regents of the University of +California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState +Corporation and other parties. The following terms apply to all files +associated with the software unless explicitly disclaimed in +individual files. + +The authors hereby grant permission to use, copy, modify, distribute, +and license this software and its documentation for any purpose, provided +that existing copyright notices are retained in all copies and that this +notice is included verbatim in any distributions. No written agreement, +license, or royalty fee is required for any of the authorized uses. +Modifications to this software may be copyrighted by their authors +and need not follow the licensing terms described here, provided that +the new terms are clearly indicated on the first page of each file where +they apply. + +IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY +FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES +ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY +DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE +IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE +NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR +MODIFICATIONS. + +GOVERNMENT USE: If you are acquiring this software on behalf of the +U.S. 
government, the Government shall have only "Restricted Rights" +in the software and related documentation as defined in the Federal +Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you +are acquiring the software on behalf of the Department of Defense, the +software shall be classified as "Commercial Computer Software" and the +Government shall have only "Restricted Rights" as defined in Clause +252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the +authors grant the U.S. Government and others acting in its behalf +permission to use and distribute the software in accordance with the +terms specified in this license. + +********************************************************************** + +Subsequent modifications to the code by the PostgreSQL project follow +the same license terms as the rest of PostgreSQL. diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile new file mode 100644 index 0000000..5210c16 --- /dev/null +++ b/src/backend/regex/Makefile @@ -0,0 +1,29 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for backend/regex +# +# IDENTIFICATION +# src/backend/regex/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/regex +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + regcomp.o \ + regerror.o \ + regexec.o \ + regexport.o \ + regfree.o \ + regprefix.o + +include $(top_srcdir)/src/backend/common.mk + +# mark inclusion dependencies between .c files explicitly +regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \ + regc_locale.c regc_pg_locale.c + +regexec.o: regexec.c rege_dfa.c diff --git a/src/backend/regex/README b/src/backend/regex/README new file mode 100644 index 0000000..f08aab6 --- /dev/null +++ b/src/backend/regex/README @@ -0,0 +1,409 @@ +Implementation notes about Henry Spencer's regex library +======================================================== + +If Henry ever had any internals documentation, he didn't publish it. +So this file is an attempt to reverse-engineer some docs. + +General source-file layout +-------------------------- + +There are six separately-compilable source files, five of which expose +exactly one exported function apiece: + regcomp.c: pg_regcomp + regexec.c: pg_regexec + regerror.c: pg_regerror + regfree.c: pg_regfree + regprefix.c: pg_regprefix +(The pg_ prefixes were added by the Postgres project to distinguish this +library version from any similar one that might be present on a particular +system. They'd need to be removed or replaced in any standalone version +of the library.) + +The sixth file, regexport.c, exposes multiple functions that allow extraction +of info about a compiled regex (see regexport.h). + +There are additional source files regc_*.c that are #include'd in regcomp, +and similarly additional source files rege_*.c that are #include'd in +regexec. This was done to avoid exposing internal symbols globally; +all functions not meant to be part of the library API are static. + +(Actually the above is a lie in one respect: there are two more global +symbols, pg_set_regex_collation and pg_reg_getcolor in regcomp. 
These are +not meant to be part of the API, but they have to be global because both +regcomp and regexec call them. It'd be better to get rid of +pg_set_regex_collation, as well as the static variables it sets, in favor of +keeping the needed locale state in the regex structs. We have not done this +yet for lack of a design for how to add application-specific state to the +structs.) + +What's where in src/backend/regex/: + +regcomp.c Top-level regex compilation code +regc_color.c Color map management +regc_cvec.c Character vector (cvec) management +regc_lex.c Lexer +regc_nfa.c NFA handling +regc_locale.c Application-specific locale code from Tcl project +regc_pg_locale.c Postgres-added application-specific locale code +regexec.c Top-level regex execution code +rege_dfa.c DFA creation and execution +regerror.c pg_regerror: generate text for a regex error code +regfree.c pg_regfree: API to free a no-longer-needed regex_t +regexport.c Functions for extracting info from a regex_t +regprefix.c Code for extracting a common prefix from a regex_t + +The locale-specific code is concerned primarily with case-folding and with +expanding locale-specific character classes, such as [[:alnum:]]. It +really needs refactoring if this is ever to become a standalone library. + +The header files for the library are in src/include/regex/: + +regcustom.h Customizes library for particular application +regerrs.h Error message list +regex.h Exported API +regexport.h Exported API for regexport.c +regguts.h Internals declarations + + +DFAs, NFAs, and all that +------------------------ + +This library is a hybrid DFA/NFA regex implementation. (If you've never +heard either of those terms, get thee to a first-year comp sci textbook.) +It might not be clear at first glance what that really means and how it +relates to what you'll see in the code. 
Here's what really happens: + +* Initial parsing of a regex generates an NFA representation, with a number +of states approximately proportional to the length of the regexp. + +* The NFA is then optimized into a "compact NFA" representation, which is +basically the same idea but without fields that are not going to be needed +at runtime. It is simplified too: the compact format only allows "plain" +and "LACON" arc types. The cNFA representation is what is passed from +regcomp to regexec. + +* Unlike traditional NFA-based regex engines, we do not execute directly +from the NFA representation, as that would require backtracking and so be +very slow in some cases. Rather, we execute a DFA, which ideally can +process an input string in linear time (O(M) for M characters of input) +without backtracking. Each state of the DFA corresponds to a set of +states of the NFA, that is all the states that the NFA might have been in +upon reaching the current point in the input string. Therefore, an NFA +with N states might require as many as 2^N states in the corresponding +DFA, which could easily require unreasonable amounts of memory. We deal +with this by materializing states of the DFA lazily (only when needed) and +keeping them in a limited-size cache. The possible need to build the same +state of the DFA repeatedly means this approach is not truly O(M) time; +in the worst case it can take as much as O(M*N). That's still far better than the +worst case for a backtracking NFA engine. + +If that were the end of it, we'd just say this is a DFA engine, with the +use of NFAs being merely an implementation detail. However, a DFA engine +cannot handle some important regex features such as capturing parens and +back-references. If the parser finds that a regex uses these features +(collectively called "messy cases" in the code), then we have to use +NFA-style backtracking search after all. 
+ +When using the NFA mode, the representation constructed by the parser +consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are +either plain regular expressions (which are executed as DFAs in the manner +described above) or back-references (which try to match the input to some +previous substring). Non-leaf nodes are capture nodes (which save the +location of the substring currently matching their child node), +concatenation, alternation, or iteration nodes. At execution time, the +executor recursively scans the tree. At concatenation, alternation, or +iteration nodes, it considers each possible alternative way of matching the +input string, that is each place where the string could be split for a +concatenation or iteration, or each child node for an alternation. It +tries the next alternative if the match fails according to the child nodes. +This is exactly the sort of backtracking search done by a traditional NFA +regex engine. If there are many tree levels it can get very slow. + +But all is not lost: we can still be smarter than the average pure NFA +engine. To do this, each subre node has an associated DFA, which +represents what the node could possibly match insofar as a mathematically +pure regex can describe that, which basically means "no backrefs". +Before we perform any search of possible alternative sub-matches, we run +the DFA to see if it thinks the proposed substring could possibly match. +If not, we can reject the match immediately without iterating through many +possibilities. + +As an example, consider the regex "(a[bc]+)\1". The compiled +representation will have a top-level concatenation subre node. Its left +child is a capture node, and the child of that is a plain DFA node for +"a[bc]+". The concatenation's right child is a backref node for \1. +The DFA associated with the concatenation node will be "a[bc]+a[bc]+", +where the backref has been replaced by a copy of the DFA for its referent +expression. 
When executed, the concatenation node will have to search for +a possible division of the input string that allows its two child nodes to +each match their part of the string (and although this specific case can +only succeed when the division is at the middle, the code does not know +that, nor would it be true in general). However, we can first run the DFA +and quickly reject any input that doesn't start with an "a" and contain +one more "a" plus some number of b's and c's. If the DFA doesn't match, +there is no need to recurse to the two child nodes for each possible +string division point. In many cases, this prefiltering makes the search +run much faster than a pure NFA engine could do. It is this behavior that +justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's +library. + + +Colors and colormapping +----------------------- + +In many common regex patterns, there are large numbers of characters that +can be treated alike by the execution engine. A simple example is the +pattern "[[:alpha:]][[:alnum:]]*" for an identifier. Basically the engine +only needs to care whether an input symbol is a letter, a digit, or other. +We could build the NFA or DFA with a separate arc for each possible letter +and digit, but that's very wasteful of space and not so cheap to execute +either, especially when dealing with Unicode which can have thousands of +letters. Instead, the parser builds a "color map" that maps each possible +input symbol to a "color", or equivalence class. The NFA or DFA +representation then has arcs labeled with colors, not specific input +symbols. At execution, the first thing the executor does with each input +symbol is to look up its color in the color map, and then everything else +works from the color only. 
+ +To build the colormap, we start by assigning every possible input symbol +the color WHITE, which means "other" (that is, at the end of parsing, the +symbols that are still WHITE are those not explicitly referenced anywhere +in the regex). When we see a simple literal character or a bracket +expression in the regex, we want to assign that character, or all the +characters represented by the bracket expression, a unique new color that +can be used to label the NFA arc corresponding to the state transition for +matching this character or bracket expression. The basic idea is: +first, change the color assigned to a character to some new value; +second, run through all the existing arcs in the partially-built NFA, +and for each one referencing the character's old color, add a parallel +arc referencing its new color (this keeps the reassignment from changing +the semantics of what we already built); and third, add a new arc with +the character's new color to the current pair of NFA states, denoting +that seeing this character allows the state transition to be made. + +This is complicated a bit by not wanting to create more colors +(equivalence classes) than absolutely necessary. In particular, if a +bracket expression mentions two characters that had the same color before, +they should still share the same color after we process the bracket, since +there is still not a need to distinguish them. But we do need to +distinguish them from other characters that previously had the same color +yet are not listed in the bracket expression. To mechanize this, the code +has a concept of "parent colors" and "subcolors", where a color's subcolor +is the new color that we are giving to any characters of that color while +parsing the current atom. (The word "parent" is a bit unfortunate here, +because it suggests a long-lived relationship, but a subcolor link really +only lasts for the duration of parsing a single atom.) 
In other words, +a subcolor link means that we are in the process of splitting the parent color +into two colors (equivalence classes), depending on whether or not each +member character should be included by the current regex atom. + +As an example, suppose we have the regex "a\d\wx". Initially all possible +character codes are labeled WHITE (color 0). To parse the atom "a", we +create a new color (1), update "a"'s color map entry to 1, and create an +arc labeled 1 between the first two states of the NFA. Now we see \d, +which is really a bracket expression containing the digits "0"-"9". +First we process "0", which is currently WHITE, so we create a new color +(2), update "0"'s color map entry to 2, and create an arc labeled 2 +between the second and third states of the NFA. We also mark color WHITE +as having the subcolor 2, which means that future relabelings of WHITE +characters should also select 2 as the new color. Thus, when we process +"1", we won't create a new color but re-use 2. We update "1"'s color map +entry to 2, and then find that we don't need a new arc because there is +already one labeled 2 between the second and third states of the NFA. +Similarly for the other 8 digits, so there will be only one arc labeled 2 +between NFA states 2 and 3 for all members of this bracket expression. +At completion of processing of the bracket expression, we call okcolors() +which breaks all the existing parent/subcolor links; there is no longer a +marker saying that WHITE characters should be relabeled 2. (Note: +actually, we did the same creation and clearing of a subcolor link for the +primitive atom "a", but it didn't do anything very interesting.) Now we +come to the "\w" bracket expression, which for simplicity we assume expands +to just "[a-z0-9]". We process "a", but observe that it is already the +sole member of its color 1. This means there is no need to subdivide that +equivalence class more finely, so we do not create any new color. 
We just +make an arc labeled 1 between the third and fourth NFA states. Next we +process "b", which is WHITE and far from the only WHITE character, so we +create a new color (3), link that as WHITE's subcolor, relabel "b" as +color 3, and make an arc labeled 3. As we process "c" through "z", each +is relabeled from WHITE to 3, but no new arc is needed. Now we come to +"0", which is not the only member of its color 2, so we suppose that a new +color is needed and create color 4. We link 4 as subcolor of 2, relabel +"0" as color 4 in the map, and add an arc for color 4. Next "1" through +"9" are similarly relabeled as color 4, with no additional arcs needed. +Having finished the bracket expression, we call okcolors(), which breaks +the subcolor links. okcolors() further observes that we have removed +every member of color 2 (the previous color of the digit characters). +Therefore, it runs through the partial NFA built so far and relabels arcs +labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is +relabeled color 4. Then it frees up color 2, since we have no more use +for that color. We now have an NFA in which transitions for digits are +consistently labeled with color 4. Last, we come to the atom "x". +"x" is currently labeled with color 3, and it's not the only member of +that color, so we realize that we now need to distinguish "x" from other +letters when we did not before. We create a new color, which might have +been 5 but instead we recycle the unused color 2. "x" is relabeled 2 in +the color map and 2 is linked as the subcolor of 3, and we add an arc for +2 between states 4 and 5 of the NFA. Now we call okcolors(), which breaks +the subcolor link between colors 3 and 2 and notices that both colors are +nonempty. 
Therefore, it also runs through the existing NFA arcs and adds +an additional arc labeled 2 wherever there is an arc labeled 3; this +action ensures that characters of color 2 (i.e., "x") will still be +considered as allowing any transitions they did before. We are now done +parsing the regex, and we have these final color assignments: + color 1: "a" + color 2: "x" + color 3: other letters + color 4: digits +and the NFA has these arcs: + states 1 -> 2 on color 1 (hence, "a" only) + states 2 -> 3 on color 4 (digits) + states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters) + states 4 -> 5 on color 2 ("x" only) +which can be seen to be a correct representation of the regex. + +Given this summary, we can see we need the following operations for +colors: + +* A fast way to look up the current color assignment for any character + code. (This is needed during both parsing and execution, while the + remaining operations are needed only during parsing.) +* A way to alter the color assignment for any given character code. +* We must track the number of characters currently assigned to each + color, so that we can detect empty and singleton colors. +* We must track all existing NFA arcs of a given color, so that we + can relabel them at need, or add parallel arcs of a new color when + an existing color has to be subdivided. + +The last two of these are handled with the "struct colordesc" array and +the "colorchain" links in NFA arc structs. + +Ideally, we'd do the first two operations using a simple linear array +storing the current color assignment for each character code. +Unfortunately, that's not terribly workable for large charsets such as +Unicode. Our solution is to divide the color map into two parts. A simple +linear array is used for character codes up to MAX_SIMPLE_CHR, which can be +chosen large enough to include all popular characters (so that the +significantly-slower code paths about to be described are seldom invoked). 
+Characters above that need be considered at compile time only if they +appear explicitly in the regex pattern. We store each such mentioned +character or character range as an entry in the "colormaprange" array in +the colormap. (Overlapping ranges are split into unique subranges, so that +each range in the finished list needs only a single color that describes +all its characters.) When mapping a character above MAX_SIMPLE_CHR to a +color at runtime, we search this list of ranges explicitly. + +That's still not quite enough, though, because of locale-dependent +character classes such as [[:alpha:]]. In Unicode locales these classes +may have thousands of entries that are above MAX_SIMPLE_CHR, and we +certainly don't want to be searching large colormaprange arrays at runtime. +Nor do we even want to spend the time to initialize cvec structures that +exhaustively describe all of those characters. Our solution is to compute +exact per-character colors at regex compile time only up to MAX_SIMPLE_CHR. +For characters above that, we apply the <ctype.h> or <wctype.h> lookup +functions at runtime for each locale-dependent character class used in the +regex pattern, constructing a bitmap that describes which classes the +runtime character belongs to. The per-character-range data structure +mentioned above actually holds, for each range, a separate color entry +for each possible combination of character class properties. That is, +the color map for characters above MAX_SIMPLE_CHR is really a 2-D array, +whose rows correspond to high characters or character ranges that are +explicitly mentioned in the regex pattern, and whose columns correspond +to sets of the locale-dependent character classes that are used in the +regex. + +As an example, given the pattern '\w\u1234[\U0001D100-\U0001D1FF]' +(and supposing that MAX_SIMPLE_CHR is less than 0x1234), we will need +a high color map with three rows. 
One row is for the single character +U+1234 (represented as a single-element range), one is for the range +U+1D100..U+1D1FF, and the other row represents all remaining high +characters. The color map has two columns, one for characters that +satisfy iswalnum() and one for those that don't. + +We build this color map in parallel with scanning the regex. Each time +we detect a new explicit high character (or range) or a locale-dependent +character class, we split existing entry(s) in the high color map so that +characters we need to be able to distinguish will have distinct entries +that can be given separate colors. Often, though, single entries in the +high color map will represent very large sets of characters. + +If there are both explicit high characters/ranges and locale-dependent +character classes, we may have entries in the high color map array that +have non-WHITE colors but don't actually represent any real characters. +(For example, in a row representing a singleton range, only one of the +columns could possibly be a live entry; it's the one matching the actual +locale properties for that single character.) We don't currently make +any effort to reclaim such colors. In principle it could be done, but +it's not clear that it's worth the trouble. + + +Detailed semantics of an NFA +---------------------------- + +When trying to read dumped-out NFAs, it's helpful to know these facts: + +State 0 (additionally marked with "@" in dumpnfa's output) is always the +goal state, and state 1 (additionally marked with ">") is the start state. +(The code refers to these as the post state and pre state respectively.) + +The possible arc types are: + + PLAIN arcs, which specify matching of any character of a given "color" + (see above). These are dumped as "[color_number]->to_state". + + EMPTY arcs, which specify a no-op transition to another state. These + are dumped as "->to_state". 
+ + AHEAD constraints, which represent a "next character must be of this + color" constraint. AHEAD differs from a PLAIN arc in that the input + character is not consumed when crossing the arc. These are dumped as + ">color_number>->to_state". + + BEHIND constraints, which represent a "previous character must be of + this color" constraint, which likewise consumes no input. These are + dumped as "<color_number<->to_state". + + '^' arcs, which specify a beginning-of-input constraint. These are + dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and + beginning-of-line constraints respectively. + + '$' arcs, which specify an end-of-input constraint. These are dumped + as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line + constraints respectively. + + LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and + "(?<!re)" constraints, i.e. the input starting/ending at this point must + match (or not match) a given sub-RE, but the matching input is not + consumed. These are dumped as ":subtree_number:->to_state". + +If you see anything else (especially any question marks) in the display of +an arc, it's dumpnfa() trying to tell you that there's something fishy +about the arc; see the source code. + +The regex executor can only handle PLAIN and LACON transitions. The regex +optimize() function is responsible for transforming the parser's output +to get rid of all the other arc types. In particular, ^ and $ arcs that +are not dropped as impossible will always end up adjacent to the pre or +post state respectively, and then will be converted into PLAIN arcs that +mention the special "colors" for BOS, BOL, EOS, or EOL. + +To decide whether a thus-transformed NFA matches a given substring of the +input string, the executor essentially follows these rules: +1. Start the NFA "looking at" the character *before* the given substring, +or if the substring is at the start of the input, prepend an imaginary BOS +character instead. +2. 
Run the NFA until it has consumed the character *after* the given +substring, or an imaginary following EOS character if the substring is at +the end of the input. +3. If the NFA is (or can be) in the goal state at this point, it matches. + +So one can mentally execute an untransformed NFA by taking ^ and $ as +ordinary constraints that match at start and end of input; but plain +arcs out of the start state should be taken as matches for the character +before the target substring, and similarly, plain arcs leading to the +post state are matches for the character after the target substring. +This definition is necessary to support regexes that begin or end with +constraints such as \m and \M, which imply requirements on the adjacent +character if any. NFAs for simple unanchored patterns will usually have +pre-state outarcs for all possible character colors as well as BOS and +BOL, and post-state inarcs for all possible character colors as well as +EOS and EOL, so that the executor's behavior will work. diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n new file mode 100644 index 0000000..4621bfc --- /dev/null +++ b/src/backend/regex/re_syntax.n @@ -0,0 +1,979 @@ +'\" +'\" Copyright (c) 1998 Sun Microsystems, Inc. +'\" Copyright (c) 1999 Scriptics Corporation +'\" +'\" This software is copyrighted by the Regents of the University of +'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState +'\" Corporation and other parties. The following terms apply to all files +'\" associated with the software unless explicitly disclaimed in +'\" individual files. +'\" +'\" The authors hereby grant permission to use, copy, modify, distribute, +'\" and license this software and its documentation for any purpose, provided +'\" that existing copyright notices are retained in all copies and that this +'\" notice is included verbatim in any distributions. 
No written agreement, +'\" license, or royalty fee is required for any of the authorized uses. +'\" Modifications to this software may be copyrighted by their authors +'\" and need not follow the licensing terms described here, provided that +'\" the new terms are clearly indicated on the first page of each file where +'\" they apply. +'\" +'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY +'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES +'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY +'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE +'\" POSSIBILITY OF SUCH DAMAGE. +'\" +'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, +'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, +'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE +'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE +'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR +'\" MODIFICATIONS. +'\" +'\" GOVERNMENT USE: If you are acquiring this software on behalf of the +'\" U.S. government, the Government shall have only "Restricted Rights" +'\" in the software and related documentation as defined in the Federal +'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you +'\" are acquiring the software on behalf of the Department of Defense, the +'\" software shall be classified as "Commercial Computer Software" and the +'\" Government shall have only "Restricted Rights" as defined in Clause +'\" 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the +'\" authors grant the U.S. Government and others acting in its behalf +'\" permission to use and distribute the software in accordance with the +'\" terms specified in this license. 
+'\" +'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp +'\" +.so man.macros +.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands" +.BS +.SH NAME +re_syntax \- Syntax of Tcl regular expressions. +.BE + +.SH DESCRIPTION +.PP +A \fIregular expression\fR describes strings of characters. +It's a pattern that matches certain strings and doesn't match others. + +.SH "DIFFERENT FLAVORS OF REs" +Regular expressions (``RE''s), as defined by POSIX, come in two +flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs''). +EREs are roughly those of the traditional \fIegrep\fR, while BREs are +roughly those of the traditional \fIed\fR. This implementation adds +a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with +some significant extensions. +.PP +This manual page primarily describes AREs. BREs mostly exist for +backward compatibility in some old programs; they will be discussed at +the end. POSIX EREs are almost an exact subset of AREs. Features of +AREs that are not present in EREs will be indicated. + +.SH "REGULAR EXPRESSION SYNTAX" +.PP +Tcl regular expressions are implemented using the package written by +Henry Spencer, based on the 1003.2 spec and some (not quite all) of +the Perl5 extensions (thanks, Henry!). Much of the description of +regular expressions below is copied verbatim from his manual entry. +.PP +An ARE is one or more \fIbranches\fR, +separated by `\fB|\fR', +matching anything that matches any of the branches. +.PP +A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR, +concatenated. +It matches a match for the first, followed by a match for the second, etc; +an empty branch matches the empty string. +.PP +A quantified atom is an \fIatom\fR possibly followed +by a single \fIquantifier\fR. +Without a quantifier, it matches a match for the atom. 
+The quantifiers, +and what a so-quantified atom matches, are: +.RS 2 +.TP 6 +\fB*\fR +a sequence of 0 or more matches of the atom +.TP +\fB+\fR +a sequence of 1 or more matches of the atom +.TP +\fB?\fR +a sequence of 0 or 1 matches of the atom +.TP +\fB{\fIm\fB}\fR +a sequence of exactly \fIm\fR matches of the atom +.TP +\fB{\fIm\fB,}\fR +a sequence of \fIm\fR or more matches of the atom +.TP +\fB{\fIm\fB,\fIn\fB}\fR +a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom; +\fIm\fR may not exceed \fIn\fR +.TP +\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR +\fInon-greedy\fR quantifiers, +which match the same possibilities, +but prefer the smallest number rather than the largest number +of matches (see MATCHING) +.RE +.PP +The forms using +\fB{\fR and \fB}\fR +are known as \fIbound\fRs. +The numbers +\fIm\fR and \fIn\fR are unsigned decimal integers +with permissible values from 0 to 255 inclusive. +.PP +An atom is one of: +.RS 2 +.TP 6 +\fB(\fIre\fB)\fR +(where \fIre\fR is any regular expression) +matches a match for +\fIre\fR, with the match noted for possible reporting +.TP +\fB(?:\fIre\fB)\fR +as previous, +but does no reporting +(a ``non-capturing'' set of parentheses) +.TP +\fB()\fR +matches an empty string, +noted for possible reporting +.TP +\fB(?:)\fR +matches an empty string, +without reporting +.TP +\fB[\fIchars\fB]\fR +a \fIbracket expression\fR, +matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail) +.TP + \fB.\fR +matches any single character +.TP +\fB\e\fIk\fR +(where \fIk\fR is a non-alphanumeric character) +matches that character taken as an ordinary character, +e.g. 
\e\e matches a backslash character +.TP +\fB\e\fIc\fR +where \fIc\fR is alphanumeric +(possibly followed by other characters), +an \fIescape\fR (AREs only), +see ESCAPES below +.TP +\fB{\fR +when followed by a character other than a digit, +matches the left-brace character `\fB{\fR'; +when followed by a digit, it is the beginning of a +\fIbound\fR (see above) +.TP +\fIx\fR +where \fIx\fR is +a single character with no other significance, matches that character. +.RE +.PP +A \fIconstraint\fR matches an empty string when specific conditions +are met. +A constraint may not be followed by a quantifier. +The simple constraints are as follows; some more constraints are +described later, under ESCAPES. +.RS 2 +.TP 8 +\fB^\fR +matches at the beginning of a line +.TP +\fB$\fR +matches at the end of a line +.TP +\fB(?=\fIre\fB)\fR +\fIpositive lookahead\fR (AREs only), matches at any point +where a substring matching \fIre\fR begins +.TP +\fB(?!\fIre\fB)\fR +\fInegative lookahead\fR (AREs only), matches at any point +where no substring matching \fIre\fR begins +.TP +\fB(?<=\fIre\fB)\fR +\fIpositive lookbehind\fR (AREs only), matches at any point +where a substring matching \fIre\fR ends +.TP +\fB(?<!\fIre\fB)\fR +\fInegative lookbehind\fR (AREs only), matches at any point +where no substring matching \fIre\fR ends +.RE +.PP +Lookahead and lookbehind constraints may not contain back references +(see later), and all parentheses within them are considered non-capturing. +.PP +An RE may not end with `\fB\e\fR'. + +.SH "BRACKET EXPRESSIONS" +A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'. +It normally matches any single character from the list (but see below). +If the list begins with `\fB^\fR', +it matches any single character +(but see below) \fInot\fR from the rest of the list. 
+.PP +If two characters in the list are separated by `\fB\-\fR', +this is shorthand +for the full \fIrange\fR of characters between those two (inclusive) in the +collating sequence, +e.g. +\fB[0\-9]\fR +in ASCII matches any decimal digit. +Two ranges may not share an +endpoint, so e.g. +\fBa\-c\-e\fR +is illegal. +Ranges are very collating-sequence-dependent, +and portable programs should avoid relying on them. +.PP +To include a literal +\fB]\fR +or +\fB\-\fR +in the list, +the simplest method is to +enclose it in +\fB[.\fR and \fB.]\fR +to make it a collating element (see below). +Alternatively, +make it the first character +(following a possible `\fB^\fR'), +or (AREs only) precede it with `\fB\e\fR'. +Alternatively, for `\fB\-\fR', +make it the last character, +or the second endpoint of a range. +To use a literal +\fB\-\fR +as the first endpoint of a range, +make it a collating element +or (AREs only) precede it with `\fB\e\fR'. +With the exception of these, some combinations using +\fB[\fR +(see next +paragraphs), and escapes, +all other special characters lose their +special significance within a bracket expression. +.PP +Within a bracket expression, a collating element (a character, +a multi-character sequence that collates as if it were a single character, +or a collating-sequence name for either) +enclosed in +\fB[.\fR and \fB.]\fR +stands for the +sequence of characters of that collating element. +The sequence is a single element of the bracket expression's list. +A bracket expression in a locale that has +multi-character collating elements +can thus match more than one character. +.VS 8.2 +So (insidiously), a bracket expression that starts with \fB^\fR +can match multi-character collating elements even if none of them +appear in the bracket expression! +(\fINote:\fR Tcl currently has no multi-character collating elements. +This information is only for illustration.) 
+.PP +For example, assume the collating sequence includes a \fBch\fR +multi-character collating element. +Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP) +matches the first five characters of `\fBchchcc\fR'. +Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR' +(because \fB[^c]\fR matches the multi-character \fBch\fR). +.VE 8.2 +.PP +Within a bracket expression, a collating element enclosed in +\fB[=\fR +and +\fB=]\fR +is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. +(If there are no other equivalent collating elements, +the treatment is as if the enclosing delimiters were `\fB[.\fR'\& +and `\fB.]\fR'.) +For example, if +\fBo\fR +and +\fB\o'o^'\fR +are the members of an equivalence class, +then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR', +and `\fB[o\o'o^']\fR'\& +are all synonymous. +An equivalence class may not be an endpoint +of a range. +.VS 8.2 +(\fINote:\fR +Tcl currently implements only the Unicode locale. +It doesn't define any equivalence classes. +The examples above are just illustrations.) +.VE 8.2 +.PP +Within a bracket expression, the name of a \fIcharacter class\fR enclosed +in +\fB[:\fR +and +\fB:]\fR +stands for the list of all characters +(not all collating elements!) +belonging to that +class. +Standard character classes are: +.PP +.RS +.ne 5 +.nf +.ta 3c +\fBalpha\fR A letter. +\fBupper\fR An upper-case letter. +\fBlower\fR A lower-case letter. +\fBdigit\fR A decimal digit. +\fBxdigit\fR A hexadecimal digit. +\fBalnum\fR An alphanumeric (letter or digit). +\fBprint\fR An alphanumeric (same as alnum). +\fBblank\fR A space or tab character. +\fBspace\fR A character producing white space in displayed text. +\fBpunct\fR A punctuation character. +\fBgraph\fR A character with a visible representation. +\fBcntrl\fR A control character. +.fi +.RE +.PP +A locale may provide others. 
+.VS 8.2 +(Note that the current Tcl implementation has only one locale: +the Unicode locale.) +.VE 8.2 +A character class may not be used as an endpoint of a range. +.PP +There are two special cases of bracket expressions: +the bracket expressions +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +are constraints, matching empty strings at +the beginning and end of a word respectively. +'\" note, discussion of escapes below references this definition of word +A word is defined as a sequence of +word characters +that is neither preceded nor followed by +word characters. +A word character is an +\fIalnum\fR +character +or an underscore +(\fB_\fR). +These special bracket expressions are deprecated; +users of AREs should use constraint escapes instead (see below). +.SH ESCAPES +Escapes (AREs only), which begin with a +\fB\e\fR +followed by an alphanumeric character, +come in several varieties: +character entry, class shorthands, constraint escapes, and back references. +A +\fB\e\fR +followed by an alphanumeric character but not constituting +a valid escape is illegal in AREs. +In EREs, there are no escapes: +outside a bracket expression, +a +\fB\e\fR +followed by an alphanumeric character merely stands for that +character as an ordinary character, +and inside a bracket expression, +\fB\e\fR +is an ordinary character. +(The latter is the one actual incompatibility between EREs and AREs.) 
+.PP +Character-entry escapes (AREs only) exist to make it easier to specify +non-printing and otherwise inconvenient characters in REs: +.RS 2 +.TP 5 +\fB\ea\fR +alert (bell) character, as in C +.TP +\fB\eb\fR +backspace, as in C +.TP +\fB\eB\fR +synonym for +\fB\e\fR +to help reduce backslash doubling in some +applications where there are multiple levels of backslash processing +.TP +\fB\ec\fIX\fR +(where X is any character) the character whose +low-order 5 bits are the same as those of +\fIX\fR, +and whose other bits are all zero +.TP +\fB\ee\fR +the character whose collating-sequence name +is `\fBESC\fR', +or failing that, the character with octal value 033 +.TP +\fB\ef\fR +formfeed, as in C +.TP +\fB\en\fR +newline, as in C +.TP +\fB\er\fR +carriage return, as in C +.TP +\fB\et\fR +horizontal tab, as in C +.TP +\fB\eu\fIwxyz\fR +(where +\fIwxyz\fR +is exactly four hexadecimal digits) +the Unicode character +\fBU+\fIwxyz\fR +in the local byte ordering +.TP +\fB\eU\fIstuvwxyz\fR +(where +\fIstuvwxyz\fR +is exactly eight hexadecimal digits) +reserved for a somewhat-hypothetical Unicode extension to 32 bits +.TP +\fB\ev\fR +vertical tab, as in C +.TP +\fB\ex\fIhhh\fR +(where +\fIhhh\fR +is any sequence of hexadecimal digits) +the character whose hexadecimal value is +\fB0x\fIhhh\fR +(a single character no matter how many hexadecimal digits are used). +.TP +\fB\e0\fR +the character whose value is +\fB0\fR +.TP +\fB\e\fIxy\fR +(where +\fIxy\fR +is exactly two octal digits, +and is not a +\fIback reference\fR (see below)) +the character whose octal value is +\fB0\fIxy\fR +.TP +\fB\e\fIxyz\fR +(where +\fIxyz\fR +is exactly three octal digits, +and is not a +back reference (see below)) +the character whose octal value is +\fB0\fIxyz\fR +.RE +.PP +Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR', +and `\fBA\fR'-`\fBF\fR'. +Octal digits are `\fB0\fR'-`\fB7\fR'. +.PP +The character-entry escapes are always taken as ordinary characters. 
+For example, +\fB\e135\fR +is +\fB]\fR +in ASCII, +but +\fB\e135\fR +does not terminate a bracket expression. +Beware, however, that some applications (e.g., C compilers) interpret +such sequences themselves before the regular-expression package +gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'. +.PP +Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used +character classes: +.RS 2 +.TP 10 +\fB\ed\fR +\fB[[:digit:]]\fR +.TP +\fB\es\fR +\fB[[:space:]]\fR +.TP +\fB\ew\fR +\fB[[:alnum:]_]\fR +(note underscore) +.TP +\fB\eD\fR +\fB[^[:digit:]]\fR +.TP +\fB\eS\fR +\fB[^[:space:]]\fR +.TP +\fB\eW\fR +\fB[^[:alnum:]_]\fR +(note underscore) +.RE +.PP +Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', +and `\fB\ew\fR'\& +lose their outer brackets, +and `\fB\eD\fR', `\fB\eS\fR', +and `\fB\eW\fR'\& +are illegal. +.VS 8.2 +(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. +Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.) 
+.VE 8.2 +.PP +A constraint escape (AREs only) is a constraint, +matching the empty string if specific conditions are met, +written as an escape: +.RS 2 +.TP 6 +\fB\eA\fR +matches only at the beginning of the string +(see MATCHING, below, for how this differs from `\fB^\fR') +.TP +\fB\em\fR +matches only at the beginning of a word +.TP +\fB\eM\fR +matches only at the end of a word +.TP +\fB\ey\fR +matches only at the beginning or end of a word +.TP +\fB\eY\fR +matches only at a point that is not the beginning or end of a word +.TP +\fB\eZ\fR +matches only at the end of the string +(see MATCHING, below, for how this differs from `\fB$\fR') +.TP +\fB\e\fIm\fR +(where +\fIm\fR +is a nonzero digit) a \fIback reference\fR, see below +.TP +\fB\e\fImnn\fR +(where +\fIm\fR +is a nonzero digit, and +\fInn\fR +is some more digits, +and the decimal value +\fImnn\fR +is not greater than the number of closing capturing parentheses seen so far) +a \fIback reference\fR, see below +.RE +.PP +A word is defined as in the specification of +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +above. +Constraint escapes are illegal within bracket expressions. +.PP +A back reference (AREs only) matches the same string matched by the parenthesized +subexpression specified by the number, +so that (e.g.) +\fB([bc])\e1\fR +matches +\fBbb\fR +or +\fBcc\fR +but not `\fBbc\fR'. +The subexpression must entirely precede the back reference in the RE. +Subexpressions are numbered in the order of their leading parentheses. +Non-capturing parentheses do not define subexpressions. +.PP +There is an inherent historical ambiguity between octal character-entry +escapes and back references, which is resolved by heuristics, +as hinted at above. +A leading zero always indicates an octal escape. +A single non-zero digit, not followed by another digit, +is always taken as a back reference. +A multi-digit sequence not starting with a zero is taken as a back +reference if it comes after a suitable subexpression +(i.e. 
the number is in the legal range for a back reference), +and otherwise is taken as octal. +.SH "METASYNTAX" +In addition to the main syntax described above, there are some special +forms and miscellaneous syntactic facilities available. +.PP +Normally the flavor of RE being used is specified by +application-dependent means. +However, this can be overridden by a \fIdirector\fR. +If an RE of any flavor begins with `\fB***:\fR', +the rest of the RE is an ARE. +If an RE of any flavor begins with `\fB***=\fR', +the rest of the RE is taken to be a literal string, +with all characters considered ordinary characters. +.PP +An ARE may begin with \fIembedded options\fR: +a sequence +\fB(?\fIxyz\fB)\fR +(where +\fIxyz\fR +is one or more alphabetic characters) +specifies options affecting the rest of the RE. +These supplement, and can override, +any options specified by the application. +The available option letters are: +.RS 2 +.TP 3 +\fBb\fR +rest of RE is a BRE +.TP 3 +\fBc\fR +case-sensitive matching (usual default) +.TP 3 +\fBe\fR +rest of RE is an ERE +.TP 3 +\fBi\fR +case-insensitive matching (see MATCHING, below) +.TP 3 +\fBm\fR +historical synonym for +\fBn\fR +.TP 3 +\fBn\fR +newline-sensitive matching (see MATCHING, below) +.TP 3 +\fBp\fR +partial newline-sensitive matching (see MATCHING, below) +.TP 3 +\fBq\fR +rest of RE is a literal (``quoted'') string, all ordinary characters +.TP 3 +\fBs\fR +non-newline-sensitive matching (usual default) +.TP 3 +\fBt\fR +tight syntax (usual default; see below) +.TP 3 +\fBw\fR +inverse partial newline-sensitive (``weird'') matching (see MATCHING, below) +.TP 3 +\fBx\fR +expanded syntax (see below) +.RE +.PP +Embedded options take effect at the +\fB)\fR +terminating the sequence. +They are available only at the start of an ARE, +and may not be used later within it. 
+.PP +In addition to the usual (\fItight\fR) RE syntax, in which all characters are +significant, there is an \fIexpanded\fR syntax, +available in all flavors of RE +with the \fB-expanded\fR switch, or in AREs with the embedded x option. +In the expanded syntax, +white-space characters are ignored +and all characters between a +\fB#\fR +and the following newline (or the end of the RE) are ignored, +permitting paragraphing and commenting a complex RE. +There are three exceptions to that basic rule: +.RS 2 +.PP +a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained +.PP +white space or `\fB#\fR' within a bracket expression is retained +.PP +white space and comments are illegal within multi-character symbols +like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR' +.RE +.PP +Expanded-syntax white-space characters are blank, tab, newline, and +.VS 8.2 +any character that belongs to the \fIspace\fR character class. +.VE 8.2 +.PP +Finally, in an ARE, +outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR' +(where +\fIttt\fR +is any text not containing a `\fB)\fR') +is a comment, +completely ignored. +Again, this is not allowed between the characters of +multi-character symbols like `\fB(?:\fR'. +Such comments are more a historical artifact than a useful facility, +and their use is deprecated; +use the expanded syntax instead. +.PP +\fINone\fR of these metasyntax extensions is available if the application +(or an initial +\fB***=\fR +director) +has specified that the user's input be treated as a literal string +rather than as an RE. +.SH MATCHING +In the event that an RE could match more than one substring of a given +string, +the RE matches the one starting earliest in the string. +If the RE could match more than one substring starting at that point, +its choice is determined by its \fIpreference\fR: +either the longest substring, or the shortest. +.PP +Most atoms, and all constraints, have no preference. 
+A parenthesized RE has the same preference (possibly none) as the RE. +A quantified atom with quantifier +\fB{\fIm\fB}\fR +or +\fB{\fIm\fB}?\fR +has the same preference (possibly none) as the atom itself. +A quantified atom with other normal quantifiers (including +\fB{\fIm\fB,\fIn\fB}\fR +with +\fIm\fR +equal to +\fIn\fR) +prefers longest match. +A quantified atom with other non-greedy quantifiers (including +\fB{\fIm\fB,\fIn\fB}?\fR +with +\fIm\fR +equal to +\fIn\fR) +prefers shortest match. +A branch has the same preference as the first quantified atom in it +which has a preference. +An RE consisting of two or more branches connected by the +\fB|\fR +operator prefers longest match. +.PP +Subject to the constraints imposed by the rules for matching the whole RE, +subexpressions also match the longest or shortest possible substrings, +based on their preferences, +with subexpressions starting earlier in the RE taking priority over +ones starting later. +Note that outer subexpressions thus take priority over +their component subexpressions. +.PP +Note that the quantifiers +\fB{1,1}\fR +and +\fB{1,1}?\fR +can be used to force longest and shortest preference, respectively, +on a subexpression or a whole RE. +.PP +Match lengths are measured in characters, not collating elements. +An empty string is considered longer than no match at all. +For example, +\fBbb*\fR +matches the three middle characters of `\fBabbbc\fR', +\fB(week|wee)(night|knights)\fR +matches all ten characters of `\fBweeknights\fR', +when +\fB(.*).*\fR +is matched against +\fBabc\fR +the parenthesized subexpression +matches all three characters, and +when +\fB(a*)*\fR +is matched against +\fBbc\fR +both the whole RE and the parenthesized +subexpression match an empty string. +.PP +If case-independent matching is specified, +the effect is much as if all case distinctions had vanished from the +alphabet. 
+When an alphabetic that exists in multiple cases appears as an +ordinary character outside a bracket expression, it is effectively +transformed into a bracket expression containing both cases, +so that +\fBx\fR +becomes `\fB[xX]\fR'. +When it appears inside a bracket expression, all case counterparts +of it are added to the bracket expression, so that +\fB[x]\fR +becomes +\fB[xX]\fR +and +\fB[^x]\fR +becomes `\fB[^xX]\fR'. +.PP +If newline-sensitive matching is specified, \fB.\fR +and bracket expressions using +\fB^\fR +will never match the newline character +(so that matches will never cross newlines unless the RE +explicitly arranges it) +and +\fB^\fR +and +\fB$\fR +will match the empty string after and before a newline +respectively, in addition to matching at beginning and end of string +respectively. +ARE +\fB\eA\fR +and +\fB\eZ\fR +continue to match beginning or end of string \fIonly\fR. +.PP +If partial newline-sensitive matching is specified, +this affects \fB.\fR +and bracket expressions +as with newline-sensitive matching, but not +\fB^\fR +and `\fB$\fR'. +.PP +If inverse partial newline-sensitive matching is specified, +this affects +\fB^\fR +and +\fB$\fR +as with +newline-sensitive matching, +but not \fB.\fR +and bracket expressions. +This isn't very useful but is provided for symmetry. +.SH "LIMITS AND COMPATIBILITY" +No particular limit is imposed on the length of REs. +Programs intended to be highly portable should not employ REs longer +than 256 bytes, +as a POSIX-compliant implementation can refuse to accept such REs. +.PP +The only feature of AREs that is actually incompatible with +POSIX EREs is that +\fB\e\fR +does not lose its special +significance inside bracket expressions. +All other ARE features use syntax which is illegal or has +undefined or unspecified effects in POSIX EREs; +the +\fB***\fR +syntax of directors likewise is outside the POSIX +syntax for both BREs and EREs. 
+.PP +Many of the ARE extensions are borrowed from Perl, but some have +been changed to clean them up, and a few Perl extensions are not present. +Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', +the lack of special treatment for a trailing newline, +the addition of complemented bracket expressions to the things +affected by newline-sensitive matching, +the restrictions on parentheses and back references in lookahead/lookbehind +constraints, +and the longest/shortest-match (rather than first-match) matching semantics. +.PP +The matching rules for REs containing both normal and non-greedy quantifiers +have changed since early beta-test versions of this package. +(The new rules are much simpler and cleaner, +but don't work as hard at guessing the user's real intentions.) +.PP +Henry Spencer's original 1986 \fIregexp\fR package, +still in widespread use (e.g., in pre-8.1 releases of Tcl), +implemented an early version of today's EREs. +There are four incompatibilities between \fIregexp\fR's near-EREs +(`RREs' for short) and AREs. +In roughly increasing order of significance: +.PP +.RS +In AREs, +\fB\e\fR +followed by an alphanumeric character is either an +escape or an error, +while in RREs, it was just another way of writing the +alphanumeric. +This should not be a problem because there was no reason to write +such a sequence in RREs. +.PP +\fB{\fR +followed by a digit in an ARE is the beginning of a bound, +while in RREs, +\fB{\fR +was always an ordinary character. +Such sequences should be rare, +and will often result in an error because following characters +will not look like a valid bound. +.PP +In AREs, +\fB\e\fR +remains a special character within `\fB[\|]\fR', +so a literal +\fB\e\fR +within +\fB[\|]\fR +must be written `\fB\e\e\fR'. +\fB\e\e\fR +also gives a literal +\fB\e\fR +within +\fB[\|]\fR +in RREs, +but only truly paranoid programmers routinely doubled the backslash. 
+.PP +AREs report the longest/shortest match for the RE, +rather than the first found in a specified search order. +This may affect some RREs which were written in the expectation that +the first match would be reported. +(The careful crafting of RREs to optimize the search order for fast +matching is obsolete (AREs examine all possible matches +in parallel, and their performance is largely insensitive to their +complexity) but cases where the search order was exploited to deliberately +find a match which was \fInot\fR the longest/shortest will need rewriting.) +.RE + +.SH "BASIC REGULAR EXPRESSIONS" +BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR', +and +\fB?\fR +are ordinary characters and there is no equivalent +for their functionality. +The delimiters for bounds are +\fB\e{\fR +and `\fB\e}\fR', +with +\fB{\fR +and +\fB}\fR +by themselves ordinary characters. +The parentheses for nested subexpressions are +\fB\e(\fR +and `\fB\e)\fR', +with +\fB(\fR +and +\fB)\fR +by themselves ordinary characters. +\fB^\fR +is an ordinary character except at the beginning of the +RE or the beginning of a parenthesized subexpression, +\fB$\fR +is an ordinary character except at the end of the +RE or the end of a parenthesized subexpression, +and +\fB*\fR +is an ordinary character if it appears at the beginning of the +RE or the beginning of a parenthesized subexpression +(after a possible leading `\fB^\fR'). +Finally, +single-digit back references are available, +and +\fB\e<\fR +and +\fB\e>\fR +are synonyms for +\fB[[:<:]]\fR +and +\fB[[:>:]]\fR +respectively; +no other escapes are available. 
+ +.SH "SEE ALSO" +RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n) + +.SH KEYWORDS +match, regular expression, string diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c new file mode 100644 index 0000000..f5a4151 --- /dev/null +++ b/src/backend/regex/regc_color.c @@ -0,0 +1,1140 @@ +/* + * colorings of characters + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ * src/backend/regex/regc_color.c
+ *
+ *
+ * Note that there are some incestuous relationships between this code and
+ * NFA arc maintenance, which perhaps ought to be cleaned up sometime.
+ */
+
+
+/*
+ * Error shorthands for the colormap code.  Both simply forward to
+ * VISERR()/VERR() on cm->v, so they can be used only where a variable
+ * named "cm" (a struct colormap pointer) is in scope.  In this file
+ * CISERR() is used as an "is an error already pending?" test and CERR(e)
+ * to report error code e.
+ */
+#define CISERR()	VISERR(cm->v)
+#define CERR(e)		VERR(cm->v, (e))
+
+
+
+/*
+ * initcm - set up new colormap
+ *
+ * Initializes *cm, recording v as its owner, so that WHITE is the only
+ * color: every entry of the low colormap (one per chr from CHR_MIN through
+ * MAX_SIMPLE_CHR) is WHITE, and the high colormap consists of a single
+ * WHITE entry.
+ *
+ * If either allocation fails, REG_ESPACE is reported through CERR() and the
+ * function returns early.  The pointers that freecm() tests are NULL in
+ * that case wherever nothing was allocated.
+ */
+static void
+initcm(struct vars *v,
+	   struct colormap *cm)
+{
+	struct colordesc *cd;
+
+	cm->magic = CMMAGIC;
+	cm->v = v;
+
+	cm->ncds = NINLINECDS;
+	cm->cd = cm->cdspace;		/* start out in cdspace, not a malloc'd array */
+	cm->max = 0;				/* no color beyond the first entry is in use */
+	cm->free = 0;				/* zero means the freelist is empty */
+
+	cd = cm->cd;				/* cm->cd[WHITE] */
+	cd->nschrs = MAX_SIMPLE_CHR - CHR_MIN + 1;	/* all locolormap entries */
+	cd->nuchrs = 1;				/* plus the one hicolormap entry set below */
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;
+	cd->flags = 0;
+
+	cm->locolormap = (color *)
+		MALLOC((MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color));
+	if (cm->locolormap == NULL)
+	{
+		CERR(REG_ESPACE);
+		cm->cmranges = NULL;	/* prevent failure during freecm */
+		cm->hicolormap = NULL;
+		return;
+	}
+	/* this memset relies on WHITE being zero: */
+	memset(cm->locolormap, WHITE,
+		   (MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color));
+
+	memset(cm->classbits, 0, sizeof(cm->classbits));
+	cm->numcmranges = 0;
+	cm->cmranges = NULL;
+	cm->maxarrayrows = 4;		/* arbitrary initial allocation */
+	cm->hiarrayrows = 1;		/* but we have only one row/col initially */
+	cm->hiarraycols = 1;
+	cm->hicolormap = (color *) MALLOC(cm->maxarrayrows * sizeof(color));
+	if (cm->hicolormap == NULL)
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	/* initialize the "all other characters" row to WHITE */
+	cm->hicolormap[0] = WHITE;
+}
+
+/*
+ * freecm - free dynamically-allocated things in a colormap
+ *
+ * Releases the arrays the colormap points to, but not *cm itself.  The
+ * colordesc array is freed only when cm->cd no longer points at
+ * cm->cdspace; the other arrays are freed only when non-NULL, so this is
+ * usable after a partially failed initcm().
+ */
+static void
+freecm(struct colormap *cm)
+{
+	cm->magic = 0;				/* clear the CMMAGIC value stored by initcm */
+	if (cm->cd != cm->cdspace)
+		FREE(cm->cd);
+	if (cm->locolormap != NULL)
+		FREE(cm->locolormap);
+	if (cm->cmranges != NULL)
+		FREE(cm->cmranges);
+	if (cm->hicolormap != NULL)
+		FREE(cm->hicolormap);
+}
+
+/*
+ * pg_reg_getcolor - slow case of GETCOLOR()
+ *
+ * Returns the color of a chr above MAX_SIMPLE_CHR (asserted below) by
+ * indexing the high colormap.  The row is that of the colormaprange
+ * containing c, or row zero if no range contains it; the column, when the
+ * map has more than one, is obtained from cclass_column_index().
+ */
+color
+pg_reg_getcolor(struct colormap *cm, chr c)
+{
+	int			rownum,
+				colnum,
+				low,
+				high;
+
+	/* Should not be used for chrs in the locolormap */
+	assert(c > MAX_SIMPLE_CHR);
+
+	/*
+	 * Find which row it's in.  The colormapranges are in order, so we can use
+	 * binary search.  The search covers the half-open index interval
+	 * [low, high) of cm->cmranges.
+	 */
+	rownum = 0;					/* if no match, use array row zero */
+	low = 0;
+	high = cm->numcmranges;
+	while (low < high)
+	{
+		int			middle = low + (high - low) / 2;
+		const colormaprange *cmr = &cm->cmranges[middle];
+
+		if (c < cmr->cmin)
+			high = middle;		/* c lies below this range */
+		else if (c > cmr->cmax)
+			low = middle + 1;	/* c lies above this range */
+		else
+		{
+			rownum = cmr->rownum;	/* found a match */
+			break;
+		}
+	}
+
+	/*
+	 * Find which column it's in --- this is all locale-dependent.  The
+	 * array is indexed as row * hiarraycols + column.
+	 */
+	if (cm->hiarraycols > 1)
+	{
+		colnum = cclass_column_index(cm, c);
+		return cm->hicolormap[rownum * cm->hiarraycols + colnum];
+	}
+	else
+	{
+		/* fast path if no relevant cclasses */
+		return cm->hicolormap[rownum];
+	}
+}
+
+/*
+ * maxcolor - report largest color number in use
+ *
+ * Returns cm->max, or COLORLESS if CISERR() is true.
+ */
+static color
+maxcolor(struct colormap *cm)
+{
+	if (CISERR())
+		return COLORLESS;
+
+	return (color) cm->max;
+}
+
+/*
+ * newcolor - find a new color (must be assigned at once)
+ * Beware: may relocate the colordescs.
+ *
+ * The colordesc is chosen in this order: the head of the freelist
+ * (cm->free) if that is nonempty; else the entry just above cm->max if the
+ * array still has room; else the array is first enlarged (doubled, but to
+ * at most MAX_COLOR + 1 entries).  The chosen colordesc is reset to an
+ * empty state and its index is returned.
+ *
+ * Reports REG_ECOLORS if cm->max has already reached MAX_COLOR, or
+ * REG_ESPACE if the array cannot be enlarged.  Also returns COLORLESS at
+ * once if CISERR() is already true on entry.
+ */
+static color				/* COLORLESS for error */
+newcolor(struct colormap *cm)
+{
+	struct colordesc *cd;
+	size_t		n;
+
+	if (CISERR())
+		return COLORLESS;
+
+	if (cm->free != 0)
+	{
+		assert(cm->free > 0);
+		assert((size_t) cm->free < cm->ncds);
+		cd = &cm->cd[cm->free];
+		assert(UNUSEDCOLOR(cd));
+		assert(cd->arcs == NULL);
+		cm->free = cd->sub;		/* pop: a free entry's sub links to the next */
+	}
+	else if (cm->max < cm->ncds - 1)
+	{
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	}
+	else
+	{
+		/* oops, must allocate more */
+		struct colordesc *newCd;
+
+		if (cm->max == MAX_COLOR)
+		{
+			CERR(REG_ECOLORS);
+			return COLORLESS;	/* too many colors */
+		}
+
+		n = cm->ncds * 2;
+		if (n > MAX_COLOR + 1)
+			n = MAX_COLOR + 1;
+		if (cm->cd == cm->cdspace)
+		{
+			/* still in cdspace: get a separate array and copy into it */
+			newCd = (struct colordesc *) MALLOC(n * sizeof(struct colordesc));
+			if (newCd != NULL)
+				memcpy(VS(newCd), VS(cm->cdspace), cm->ncds *
+					   sizeof(struct colordesc));
+		}
+		else
+			newCd = (struct colordesc *)
+				REALLOC(cm->cd, n * sizeof(struct colordesc));
+		if (newCd == NULL)
+		{
+			CERR(REG_ESPACE);
+			return COLORLESS;
+		}
+		cm->cd = newCd;
+		cm->ncds = n;
+		assert(cm->max < cm->ncds - 1);
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	}
+
+	cd->nschrs = 0;
+	cd->nuchrs = 0;
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;		/* in case never set otherwise */
+	cd->flags = 0;
+
+	return (color) (cd - cm->cd);
+}
+
+/*
+ * freecolor - free a color (must have no arcs or subcolor)
+ *
+ * Freeing WHITE is a no-op.  Otherwise the colordesc is flagged FREECOL and
+ * then one of two things happens:
+ *
+ * If co is the highest color in use, cm->max is lowered for as long as the
+ * colordesc at cm->max satisfies UNUSEDCOLOR() (stopping at WHITE), and any
+ * freelist entries that now lie above cm->max are unlinked.
+ *
+ * Otherwise co is pushed onto the front of the freelist, which is chained
+ * through the colordescs' sub fields with cm->free as its head.
+ */
+static void
+freecolor(struct colormap *cm,
+		  color co)
+{
+	struct colordesc *cd = &cm->cd[co];
+	color		pco,
+				nco;			/* for freelist scan */
+
+	assert(co >= 0);
+	if (co == WHITE)
+		return;
+
+	assert(cd->arcs == NULL);
+	assert(cd->sub == NOSUB);
+	assert(cd->nschrs == 0);
+	assert(cd->nuchrs == 0);
+	cd->flags = FREECOL;
+
+	if ((size_t) co == cm->max)
+	{
+		while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
+			cm->max--;
+		assert(cm->free >= 0);
+		while ((size_t) cm->free > cm->max)		/* drop list heads above max */
+			cm->free = cm->cd[cm->free].sub;
+		if (cm->free > 0)
+		{
+			assert(cm->free < cm->max);
+			pco = cm->free;		/* pco: last entry kept; nco: next candidate */
+			nco = cm->cd[pco].sub;
+			while (nco > 0)
+				if ((size_t) nco > cm->max)
+				{
+					/* take this one out of freelist */
+					nco = cm->cd[nco].sub;
+					cm->cd[pco].sub = nco;
+				}
+				else
+				{
+					assert(nco < cm->max);
+					pco = nco;
+					nco = cm->cd[pco].sub;
+				}
+		}
+	}
+	else
+	{
+		cd->sub = cm->free;		/* link to the old freelist head */
+		cm->free = (color) (cd - cm->cd);
+	}
+}
+
+/*
+ * pseudocolor - allocate a false color, to be managed by other means
+ *
+ * Obtains a color from newcolor() and marks its colordesc PSEUDO, with
+ * nuchrs set to 1.  Returns the color, or COLORLESS if CISERR() is true
+ * after the allocation attempt.
+ */
+static color
+pseudocolor(struct colormap *cm)
+{
+	color		co;
+	struct colordesc *cd;
+
+	co = newcolor(cm);
+	if (CISERR())
+		return COLORLESS;
+	cd = &cm->cd[co];
+	cd->nschrs = 0;
+	cd->nuchrs = 1;				/* pretend it is in the upper map */
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;
+	cd->flags = PSEUDO;
+	return co;
+}
+
+/*
+ * subcolor - allocate a new subcolor (if necessary) to this chr
+ *
+ * This works only for chrs that map into the low color map.
+ *
+ * Returns the color c has on exit.  If newsub() hands back c's current
+ * color, nothing is changed.  Otherwise c is moved to the subcolor: the
+ * nschrs counts of both colors and c's locolormap entry are updated, and c
+ * becomes the subcolor's firstchr if it is the first simple chr placed
+ * there.  Returns COLORLESS if CISERR() is true after newsub().
+ */
+static color
+subcolor(struct colormap *cm, chr c)
+{
+	color		co;				/* current color of c */
+	color		sco;			/* new subcolor */
+
+	assert(c <= MAX_SIMPLE_CHR);
+
+	co = cm->locolormap[c - CHR_MIN];
+	sco = newsub(cm, co);
+	if (CISERR())
+		return COLORLESS;
+	assert(sco != COLORLESS);
+
+	if (co == sco)				/* already in an open subcolor */
+		return co;				/* rest is redundant */
+	cm->cd[co].nschrs--;
+	if (cm->cd[sco].nschrs == 0)
+		cm->cd[sco].firstchr = c;
+	cm->cd[sco].nschrs++;
+	cm->locolormap[c - CHR_MIN] = sco;
+	return sco;
+}
+
+/*
+ * subcolorhi - allocate a new subcolor (if necessary) to this colormap entry
+ *
+ * This is the same processing as subcolor(), but for entries in the high
+ * colormap, which do not necessarily correspond to exactly one chr code.
+ */
+static color
+subcolorhi(struct colormap *cm, color *pco)
+{
+    color       oldco = *pco;   /* color the entry has now */
+    color       subco;          /* subcolor it should move to */
+
+    subco = newsub(cm, oldco);
+    if (CISERR())
+        return COLORLESS;
+    assert(subco != COLORLESS);
+
+    /* already in an open subcolor, so there is nothing to move */
+    if (subco == oldco)
+        return oldco;
+
+    /* shift one high-map reference from the old color to the subcolor */
+    cm->cd[oldco].nuchrs--;
+    cm->cd[subco].nuchrs++;
+    *pco = subco;
+    return subco;
+}
+
+/*
+ * newsub - allocate a new subcolor (if necessary) for a color
+ *
+ * Returns the color's open subcolor, creating one when it has none.  A color
+ * with exactly one reference is returned as-is.  Returns COLORLESS if a new
+ * color could not be allocated.
+ */
+static color
+newsub(struct colormap *cm,
+       color co)
+{
+    color       subco = cm->cd[co].sub;
+
+    /* an open subcolor already exists: reuse it */
+    if (subco != NOSUB)
+        return subco;
+
+    /* optimization: singly-referenced color need not be subcolored */
+    if ((cm->cd[co].nschrs + cm->cd[co].nuchrs) == 1)
+        return co;
+
+    subco = newcolor(cm);       /* must create subcolor */
+    if (subco == COLORLESS)
+    {
+        assert(CISERR());
+        return COLORLESS;
+    }
+    cm->cd[co].sub = subco;
+    cm->cd[subco].sub = subco;  /* open subcolor points to self */
+    assert(subco != NOSUB);
+
+    return subco;
+}
+
+/*
+ * newhicolorrow - get a new row in the hicolormap, cloning it from oldrow
+ *
+ * Returns array index of new row. Note the array might move.
+ */
+static int
+newhicolorrow(struct colormap *cm,
+              int oldrow)
+{
+    const int   newrow = cm->hiarrayrows;   /* index the new row will get */
+    color      *dst;
+    int         col;
+
+    /* Double the row capacity first if every allocated row is in use */
+    if (newrow >= cm->maxarrayrows)
+    {
+        color      *bigger;
+
+        /* guard the doubled size computation against int overflow */
+        if (cm->maxarrayrows >= INT_MAX / (cm->hiarraycols * 2))
+        {
+            CERR(REG_ESPACE);
+            return 0;
+        }
+        bigger = (color *) REALLOC(cm->hicolormap,
+                                   cm->maxarrayrows * 2 *
+                                   cm->hiarraycols * sizeof(color));
+        if (bigger == NULL)
+        {
+            CERR(REG_ESPACE);
+            return 0;
+        }
+        cm->hicolormap = bigger;
+        cm->maxarrayrows *= 2;
+    }
+    cm->hiarrayrows++;
+
+    /* Clone the old row's entries into the new row */
+    dst = &cm->hicolormap[newrow * cm->hiarraycols];
+    memcpy(dst,
+           &cm->hicolormap[oldrow * cm->hiarraycols],
+           cm->hiarraycols * sizeof(color));
+
+    /* Every copied entry is one more high-map reference to its color */
+    for (col = 0; col < cm->hiarraycols; col++)
+        cm->cd[dst[col]].nuchrs++;
+
+    return newrow;
+}
+
+/*
+ * newhicolorcols - create a new set of columns in the high colormap
+ *
+ * Essentially, extends the 2-D array to the right with a copy of itself.
+ */
+static void
+newhicolorcols(struct colormap *cm)
+{
+    color      *bigger;
+    int         row;
+
+    /* guard the doubled size computation against int overflow */
+    if (cm->hiarraycols >= INT_MAX / (cm->maxarrayrows * 2))
+    {
+        CERR(REG_ESPACE);
+        return;
+    }
+    bigger = (color *) REALLOC(cm->hicolormap,
+                               cm->maxarrayrows *
+                               cm->hiarraycols * 2 * sizeof(color));
+    if (bigger == NULL)
+    {
+        CERR(REG_ESPACE);
+        return;
+    }
+    cm->hicolormap = bigger;
+
+    /*
+     * Re-lay each row at double width, copying its old contents into both
+     * halves and counting the extra references.  The data is rearranged
+     * within the one realloc'd array, so rows are handled last-to-first.
+     */
+    for (row = cm->hiarrayrows - 1; row >= 0; row--)
+    {
+        color      *src = &bigger[row * cm->hiarraycols];
+        color      *left = &bigger[row * cm->hiarraycols * 2];
+        color      *right = left + cm->hiarraycols;
+        int         col;
+
+        for (col = 0; col < cm->hiarraycols; col++)
+        {
+            color       co = src[col];
+
+            right[col] = co;
+            left[col] = co;
+            cm->cd[co].nuchrs++;
+        }
+    }
+
+    cm->hiarraycols *= 2;
+}
+
+/*
+ * subcolorcvec - allocate new subcolors to cvec members, fill in arcs
+ *
+ * For each chr "c" represented by the cvec, do the equivalent of
+ * newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
+ *
+ * Note that in typical cases, many of the subcolors are the same.
+ * While newarc() would discard duplicate arc requests, we can save
+ * some cycles by not calling it repetitively to begin with. This is
+ * mechanized with the "lastsubcolor" state variable.
+ */
+static void
+subcolorcvec(struct vars *v,
+             struct cvec *cv,
+             struct state *lp,
+             struct state *rp)
+{
+    struct colormap *cm = v->cm;
+    color       lastsubcolor = COLORLESS;
+    int         k;
+    int         classbit;
+    color      *pco;
+    int         r,
+                c;
+
+    /* individual chrs */
+    for (k = 0; k < cv->nchrs; k++)
+    {
+        subcoloronechr(v, cv->chrs[k], lp, rp, &lastsubcolor);
+        NOERR();
+    }
+
+    /* ranges, stored as consecutive (first, last) pairs */
+    for (k = 0; k < cv->nranges; k++)
+    {
+        chr         from = cv->ranges[2 * k];
+        chr         to = cv->ranges[2 * k + 1];
+
+        if (from <= MAX_SIMPLE_CHR)
+        {
+            /* deal with simple chars one at a time */
+            chr         lim = (to <= MAX_SIMPLE_CHR) ? to : MAX_SIMPLE_CHR;
+
+            for (; from <= lim; from++)
+            {
+                color       sco = subcolor(cm, from);
+
+                NOERR();
+                if (sco == lastsubcolor)
+                    continue;
+                newarc(v->nfa, PLAIN, sco, lp, rp);
+                NOERR();
+                lastsubcolor = sco;
+            }
+        }
+        /* deal with any part of the range that's above MAX_SIMPLE_CHR */
+        if (from < to)
+            subcoloronerange(v, from, to, lp, rp, &lastsubcolor);
+        else if (from == to)
+            subcoloronechr(v, from, lp, rp, &lastsubcolor);
+        NOERR();
+    }
+
+    /* the rest applies only when the cvec carries a character class */
+    if (cv->cclasscode < 0)
+        return;
+
+    /* Enlarge array if we don't have a column bit assignment for cclass */
+    if (cm->classbits[cv->cclasscode] == 0)
+    {
+        cm->classbits[cv->cclasscode] = cm->hiarraycols;
+        newhicolorcols(cm);
+        NOERR();
+    }
+
+    /* Apply subcolorhi() and make arc for each entry in relevant cols */
+    classbit = cm->classbits[cv->cclasscode];
+    pco = cm->hicolormap;
+    for (r = 0; r < cm->hiarrayrows; r++)
+    {
+        for (c = 0; c < cm->hiarraycols; c++, pco++)
+        {
+            color       sco;
+
+            /* only columns whose index has the class's bit set are touched */
+            if (!(c & classbit))
+                continue;
+
+            sco = subcolorhi(cm, pco);
+            NOERR();
+            /* add the arc if needed */
+            if (sco != lastsubcolor)
+            {
+                newarc(v->nfa, PLAIN, sco, lp, rp);
+                NOERR();
+                lastsubcolor = sco;
+            }
+        }
+    }
+}
+
+/*
+ * subcoloronechr - do subcolorcvec's work for a singleton chr
+ *
+ * We could just let subcoloronerange do this, but it's a bit more
efficient
+ * if we exploit the single-chr case. Also, callers find it useful for this
+ * to be able to handle both low and high chr codes.
+ */
+static void
+subcoloronechr(struct vars *v,
+               chr ch,
+               struct state *lp,
+               struct state *rp,
+               color *lastsubcolor)
+{
+    struct colormap *cm = v->cm;
+    colormaprange *fresh;       /* replacement range list being built */
+    int         nfresh = 0;     /* number of entries stored in fresh[] */
+    colormaprange *old;         /* next existing range to look at */
+    int         oldidx;         /* position of *old within cm->cmranges */
+    int         row;            /* hicolormap row that ends up covering ch */
+
+    /* Easy case for low chr codes */
+    if (ch <= MAX_SIMPLE_CHR)
+    {
+        color       sco = subcolor(cm, ch);
+
+        NOERR();
+        if (sco != *lastsubcolor)
+        {
+            newarc(v->nfa, PLAIN, sco, lp, rp);
+            *lastsubcolor = sco;
+        }
+        return;
+    }
+
+    /*
+     * At worst ch lies in the middle of an existing range, which then becomes
+     * three ranges: two more entries than we have now.
+     */
+    fresh = (colormaprange *)
+        MALLOC((cm->numcmranges + 2) * sizeof(colormaprange));
+    if (fresh == NULL)
+    {
+        CERR(REG_ESPACE);
+        return;
+    }
+
+    /* Ranges lying wholly below ch carry over unchanged */
+    old = cm->cmranges;
+    oldidx = 0;
+    while (oldidx < cm->numcmranges && old->cmax < ch)
+    {
+        fresh[nfresh++] = *old;
+        old++, oldidx++;
+    }
+
+    /* Now see how ch relates to the first range that ends at or above it */
+    if (oldidx >= cm->numcmranges || old->cmin > ch)
+    {
+        /* no existing range holds ch: make a singleton range for it */
+        fresh[nfresh].cmin = ch;
+        fresh[nfresh].cmax = ch;
+        /* row state should be cloned from the "all others" row */
+        row = newhicolorrow(cm, 0);
+        fresh[nfresh].rownum = row;
+        nfresh++;
+    }
+    else if (old->cmin == old->cmax)
+    {
+        /* an existing singleton range already matches ch: keep it */
+        row = old->rownum;
+        fresh[nfresh++] = *old;
+        /* that old range is now fully dealt with */
+        old++, oldidx++;
+    }
+    else
+    {
+        /* ch lies inside a wider existing range, which must be split */
+        if (ch > old->cmin)
+        {
+            /* part of the old range below ch keeps the old row */
+            fresh[nfresh].cmin = old->cmin;
+            fresh[nfresh].cmax = ch - 1;
+            fresh[nfresh].rownum = old->rownum;
+            nfresh++;
+        }
+
+        /* ch itself becomes a singleton whose row starts as a clone */
+        fresh[nfresh].cmin = ch;
+        fresh[nfresh].cmax = ch;
+        row = newhicolorrow(cm, old->rownum);
+        fresh[nfresh].rownum = row;
+        nfresh++;
+
+        if (ch < old->cmax)
+        {
+            /* part of the old range above ch */
+            fresh[nfresh].cmin = ch + 1;
+            fresh[nfresh].cmax = old->cmax;
+            /* must clone the row if we are making two new ranges from old */
+            if (ch > old->cmin)
+                fresh[nfresh].rownum = newhicolorrow(cm, old->rownum);
+            else
+                fresh[nfresh].rownum = old->rownum;
+            nfresh++;
+        }
+        /* that old range is now fully dealt with */
+        old++, oldidx++;
+    }
+
+    /* Update colors in ch's row and create arcs as needed */
+    subcoloronerow(v, row, lp, rp, lastsubcolor);
+
+    /* Ranges above ch carry over unchanged */
+    while (oldidx < cm->numcmranges)
+    {
+        fresh[nfresh++] = *old;
+        old++, oldidx++;
+    }
+
+    /* Assert our original space estimate was adequate */
+    assert(nfresh <= (cm->numcmranges + 2));
+
+    /* Install the rebuilt list in place of the old one */
+    if (cm->cmranges != NULL)
+        FREE(cm->cmranges);
+    cm->cmranges = fresh;
+    cm->numcmranges = nfresh;
+}
+
+/*
+ * subcoloronerange - do subcolorcvec's work for a high range
+ *
+ * The caller must pass from < to with from above MAX_SIMPLE_CHR.
+ */
+static void
+subcoloronerange(struct vars *v,
+                 chr from,
+                 chr to,
+                 struct state *lp,
+                 struct state *rp,
+                 color *lastsubcolor)
+{
+    struct colormap *cm = v->cm;
+    colormaprange *fresh;       /* replacement range list being built */
+    int         nfresh = 0;     /* number of entries stored in fresh[] */
+    colormaprange *old;         /* next existing range to look at */
+    int         oldidx;         /* position of *old within cm->cmranges */
+    int         row;            /* row whose colors need updating next */
+
+    /* Caller should take care of non-high-range cases */
+    assert(from > MAX_SIMPLE_CHR);
+    assert(from < to);
+
+    /*
+     * Potentially, if we have N non-adjacent ranges, we could need as many as
+     * 2N+1 result ranges (consider case where new range spans 'em all).
+     */
+    fresh = (colormaprange *)
+        MALLOC((cm->numcmranges * 2 + 1) * sizeof(colormaprange));
+    if (fresh == NULL)
+    {
+        CERR(REG_ESPACE);
+        return;
+    }
+
+    /* Ranges lying wholly below the target carry over unchanged */
+    old = cm->cmranges;
+    oldidx = 0;
+    while (oldidx < cm->numcmranges && old->cmax < from)
+    {
+        fresh[nfresh++] = *old;
+        old++, oldidx++;
+    }
+
+    /*
+     * Work through the old ranges that overlap the target.  After each one,
+     * "from" is advanced past the characters already handled, so it always
+     * marks the start of the part of the target still to be processed.
+     */
+    while (oldidx < cm->numcmranges && old->cmin <= to)
+    {
+        if (from < old->cmin)
+        {
+            /* gap in front of this old range, covered by no old range */
+            fresh[nfresh].cmin = from;
+            fresh[nfresh].cmax = old->cmin - 1;
+            /* row state should be cloned from the "all others" row */
+            row = newhicolorrow(cm, 0);
+            fresh[nfresh].rownum = row;
+            nfresh++;
+            /* Update colors in that row and create arcs as needed */
+            subcoloronerow(v, row, lp, rp, lastsubcolor);
+            /* the gap is done; continue from the start of the old range */
+            from = old->cmin;
+        }
+
+        if (from <= old->cmin && to >= old->cmax)
+        {
+            /* old range is fully contained in new, process it in-place */
+            fresh[nfresh++] = *old;
+            row = old->rownum;
+            from = old->cmax + 1;
+        }
+        else
+        {
+            /* some part of old range does not overlap new range */
+            if (from > old->cmin)
+            {
+                /* part of the old range below the target keeps its row */
+                fresh[nfresh].cmin = old->cmin;
+                fresh[nfresh].cmax = from - 1;
+                fresh[nfresh].rownum = old->rownum;
+                nfresh++;
+            }
+
+            /* the common subrange gets a row cloned from the old range */
+            fresh[nfresh].cmin = from;
+            fresh[nfresh].cmax = (to < old->cmax) ? to : old->cmax;
+            row = newhicolorrow(cm, old->rownum);
+            fresh[nfresh].rownum = row;
+            nfresh++;
+
+            if (to < old->cmax)
+            {
+                /* part of the old range above the target */
+                fresh[nfresh].cmin = to + 1;
+                fresh[nfresh].cmax = old->cmax;
+                /* must clone the row if we are making two new ranges from old */
+                if (from > old->cmin)
+                    fresh[nfresh].rownum = newhicolorrow(cm, old->rownum);
+                else
+                    fresh[nfresh].rownum = old->rownum;
+                nfresh++;
+            }
+            from = old->cmax + 1;
+        }
+
+        /* Update colors in the overlapping row and create arcs as needed */
+        subcoloronerow(v, row, lp, rp, lastsubcolor);
+        /* that old range is now fully dealt with */
+        old++, oldidx++;
+    }
+
+    if (from <= to)
+    {
+        /* tail of the target that no old range covers */
+        fresh[nfresh].cmin = from;
+        fresh[nfresh].cmax = to;
+        /* row state should be cloned from the "all others" row */
+        row = newhicolorrow(cm, 0);
+        fresh[nfresh].rownum = row;
+        nfresh++;
+        /* Update colors in that row and create arcs as needed */
+        subcoloronerow(v, row, lp, rp, lastsubcolor);
+    }
+
+    /* Ranges above the target carry over unchanged */
+    while (oldidx < cm->numcmranges)
+    {
+        fresh[nfresh++] = *old;
+        old++, oldidx++;
+    }
+
+    /* Assert our original space estimate was adequate */
+    assert(nfresh <= (cm->numcmranges * 2 + 1));
+
+    /* Install the rebuilt list in place of the old one */
+    if (cm->cmranges != NULL)
+        FREE(cm->cmranges);
+    cm->cmranges = fresh;
+    cm->numcmranges = nfresh;
+}
+
+/*
+ * subcoloronerow - do subcolorcvec's work for one new row in the high colormap
+ *
+ * Applies subcolorhi() to every entry of the row and adds a PLAIN arc from
+ * lp to rp for each resulting subcolor that differs from *lastsubcolor.
+ */
+static void
+subcoloronerow(struct vars *v,
+               int rownum,
+               struct state *lp,
+               struct state *rp,
+               color *lastsubcolor)
+{
+    struct colormap *cm = v->cm;
+    color      *rowstart = &cm->hicolormap[rownum * cm->hiarraycols];
+    int         col;
+
+    for (col = 0; col < cm->hiarraycols; col++)
+    {
+        color       sco = subcolorhi(cm, &rowstart[col]);
+
+        NOERR();
+        /* same subcolor as last time: its arc has been requested already */
+        if (sco == *lastsubcolor)
+            continue;
+        newarc(v->nfa, PLAIN, sco, lp, rp);
+        NOERR();
+        *lastsubcolor = sco;
+    }
+}
+
+/*
+ * okcolors - promote subcolors to full colors
+ */
+static void
+okcolors(struct nfa *nfa,
+         struct colormap *cm)
+{
+    struct colordesc *const end = CDEND(cm);    /* taken once, up front */
+    struct colordesc *parent;
+    color       co;
+
+    for (parent = cm->cd, co = 0; parent < end; parent++, co++)
+    {
+        color       sco = parent->sub;
+        struct colordesc *child;
+        struct arc *a;
+        int         parentempty;
+
+        /*
+         * Nothing to do for an unused color or one without a subcolor; a
+         * color that is its own subcolor is handled via its parent.
+         */
+        if (UNUSEDCOLOR(parent) || sco == NOSUB || sco == co)
+            continue;
+
+        parentempty = (parent->nschrs == 0 && parent->nuchrs == 0);
+
+        /* in both remaining cases the parent/subcolor link is dissolved */
+        parent->sub = NOSUB;
+        child = &cm->cd[sco];
+        assert(child->nschrs > 0 || child->nuchrs > 0);
+        assert(child->sub == sco);
+        child->sub = NOSUB;
+
+        if (parentempty)
+        {
+            /* parent empty, its arcs change color to subcolor */
+            while ((a = parent->arcs) != NULL)
+            {
+                assert(a->co == co);
+                uncolorchain(cm, a);
+                a->co = sco;
+                colorchain(cm, a);
+            }
+            freecolor(cm, co);
+        }
+        else
+        {
+            /* parent's arcs must gain parallel subcolor arcs */
+            for (a = parent->arcs; a != NULL; a = a->colorchain)
+            {
+                assert(a->co == co);
+                newarc(nfa, a->type, sco, a->from, a->to);
+            }
+        }
+    }
+}
+
+/*
+ * colorchain - add this arc to the color chain of its color
+ *
+ * The arc becomes the new head of the doubly-linked chain for a->co.
+ */
+static void
+colorchain(struct colormap *cm,
+           struct arc *a)
+{
+    struct colordesc *owner = &cm->cd[a->co];
+    struct arc *oldhead = owner->arcs;
+
+    if (oldhead != NULL)
+        oldhead->colorchainRev = a;
+    a->colorchainRev = NULL;
+    a->colorchain = oldhead;
+    owner->arcs = a;
+}
+
+/*
+ * uncolorchain - delete this arc from the color chain of its color
+ */
+static void
+uncolorchain(struct colormap *cm,
+             struct arc *a)
+{
+    struct colordesc *owner = &cm->cd[a->co];
+    struct arc *before = a->colorchainRev;
+    struct arc *after = a->colorchain;
+
+    if (before != NULL)
+    {
+        assert(before->colorchain == a);
+        before->colorchain = after;
+    }
+    else
+    {
+        /* no predecessor, so the arc must be the head of the chain */
+        assert(owner->arcs == a);
+        owner->arcs = after;
+    }
+    if (after != NULL)
+        after->colorchainRev = before;
+
+    a->colorchain = NULL;       /* paranoia */
+    a->colorchainRev = NULL;
+}
+
+/*
+ * rainbow - add arcs of all full colors (but one) between specified states
+ *
+ * Stops early if an error becomes pending.
+ */
+static void
+rainbow(struct nfa *nfa,
+        struct colormap *cm,
+        int type,
+        color but,              /* COLORLESS if no exceptions */
+        struct state *from,
+        struct state *to)
+{
+    struct colordesc *const end = CDEND(cm);
+    struct colordesc *cd;
+    color       co;
+
+    for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
+    {
+        /* skip unused and pseudocolor descriptors */
+        if (UNUSEDCOLOR(cd) || (cd->flags & PSEUDO))
+            continue;
+        /* skip colors that are their own subcolor, and the excluded one */
+        if (cd->sub == co || co == but)
+            continue;
+        newarc(nfa, type, co, from, to);
+    }
+}
+
+/*
+ * colorcomplement - add arcs of complementary colors
+ *
+ * The calling sequence ought to be reconciled with cloneouts().
+ */
+static void
+colorcomplement(struct nfa *nfa,
+                struct colormap *cm,
+                int type,
+                struct state *of,   /* complements of this guy's PLAIN outarcs */
+                struct state *from,
+                struct state *to)
+{
+    struct colordesc *const end = CDEND(cm);
+    struct colordesc *cd;
+    color       co;
+
+    assert(of != from);
+    for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
+    {
+        /* skip unused and pseudocolor descriptors */
+        if (UNUSEDCOLOR(cd) || (cd->flags & PSEUDO))
+            continue;
+        /* "of" has no PLAIN arc of this color, so it is in the complement */
+        if (findarc(of, PLAIN, co) == NULL)
+            newarc(nfa, type, co, from, to);
+    }
+}
+
+
+#ifdef REG_DEBUG
+
+/*
+ * dumpcolors - debugging output
+ */
+static void
+dumpcolors(struct colormap *cm,
+           FILE *f)
+{
+    struct colordesc *const end = CDEND(cm);
+    struct colordesc *cd;
+    color       co;
+    int         r,
+                col;
+
+    fprintf(f, "max %ld\n", (long) cm->max);
+
+    /* one line per color in use, skipping color 0 */
+    for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++)
+    {
+        chr         ch;
+
+        if (UNUSEDCOLOR(cd))
+            continue;
+
+        assert(cd->nschrs > 0 || cd->nuchrs > 0);
+        if (cd->flags & PSEUDO)
+            fprintf(f, "#%2ld(ps): ", (long) co);
+        else
+            fprintf(f, "#%2ld(%2d): ", (long) co, cd->nschrs + cd->nuchrs);
+
+        /*
+         * Unfortunately, it's hard to do this next bit more efficiently.
+         */
+        for (ch = CHR_MIN; ch <= MAX_SIMPLE_CHR; ch++)
+        {
+            if (GETCOLOR(cm, ch) == co)
+                dumpchr(ch, f);
+        }
+        fprintf(f, "\n");
+    }
+
+    /* dump the high colormap only if it contains anything interesting */
+    if (!(cm->hiarrayrows > 1 || cm->hiarraycols > 1))
+        return;
+
+    /* row 0 first, labelled "other" */
+    fprintf(f, "other:\t");
+    for (col = 0; col < cm->hiarraycols; col++)
+        fprintf(f, "\t%ld", (long) cm->hicolormap[col]);
+    fprintf(f, "\n");
+
+    /* then each range together with the row it points at */
+    for (r = 0; r < cm->numcmranges; r++)
+    {
+        const color *rowptr;
+
+        dumpchr(cm->cmranges[r].cmin, f);
+        fprintf(f, "..");
+        dumpchr(cm->cmranges[r].cmax, f);
+        fprintf(f, ":");
+        rowptr = &cm->hicolormap[cm->cmranges[r].rownum * cm->hiarraycols];
+        for (col = 0; col < cm->hiarraycols; col++)
+            fprintf(f, "\t%ld", (long) rowptr[col]);
+        fprintf(f, "\n");
+    }
+}
+
+/*
+ * dumpchr - print a chr
+ *
+ * Kind of char-centric but works well enough for debug use.
+ */
+static void
+dumpchr(chr c,
+        FILE *f)
+{
+    if (c == '\\')
+    {
+        fprintf(f, "\\\\");
+        return;
+    }
+    if (c > ' ' && c <= '~')
+    {
+        putc((char) c, f);
+        return;
+    }
+    /* anything else is shown as a hex escape */
+    fprintf(f, "\\u%04lx", (long) c);
+}
+
+#endif   /* REG_DEBUG */
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
new file mode 100644
index 0000000..1030621
--- /dev/null
+++ b/src/backend/regex/regc_cvec.c
@@ -0,0 +1,138 @@
+/*
+ * Utility functions for handling cvecs
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_cvec.c
+ *
+ */
+
+/*
+ * Notes:
+ * Only (selected) functions in _this_ file should treat the chr arrays
+ * of a cvec as non-constant.
+ */
+
+/*
+ * newcvec - allocate a new cvec
+ *
+ * The header and both chr arrays live in one allocation: the chrs array
+ * starts right after the struct and the ranges array follows it.  Returns
+ * NULL if the allocation fails.
+ */
+static struct cvec *
+newcvec(int nchrs,              /* to hold this many chrs... */
+        int nranges)            /* ... and this many ranges */
+{
+    /* each range takes two chrs (first and last) */
+    size_t      nslots = (size_t) nchrs + (size_t) nranges * 2;
+    size_t      nbytes = sizeof(struct cvec) + nslots * sizeof(chr);
+    struct cvec *cv;
+
+    cv = (struct cvec *) MALLOC(nbytes);
+    if (cv == NULL)
+        return NULL;
+
+    cv->chrs = (chr *) (((char *) cv) + sizeof(struct cvec));
+    cv->chrspace = nchrs;
+    cv->ranges = cv->chrs + nchrs;
+    cv->rangespace = nranges;
+    return clearcvec(cv);
+}
+
+/*
+ * clearcvec - clear a possibly-new cvec
+ * Returns pointer as convenience.
+ */
+static struct cvec *
+clearcvec(struct cvec *cv)
+{
+    assert(cv != NULL);
+    cv->cclasscode = -1;
+    cv->nranges = 0;
+    cv->nchrs = 0;
+    return cv;
+}
+
+/*
+ * addchr - add a chr to a cvec
+ *
+ * The cvec must still have room (checked only by assertion).
+ */
+static void
+addchr(struct cvec *cv,         /* character vector */
+       chr c)                   /* character to add */
+{
+    assert(cv->nchrs < cv->chrspace);
+    cv->chrs[cv->nchrs] = c;
+    cv->nchrs++;
+}
+
+/*
+ * addrange - add a range to a cvec
+ *
+ * The cvec must still have room (checked only by assertion).
+ */
+static void
+addrange(struct cvec *cv,       /* character vector */
+         chr from,              /* first character of range */
+         chr to)                /* last character of range */
+{
+    chr        *pair;
+
+    assert(cv->nranges < cv->rangespace);
+
+    /* each range occupies two consecutive slots: first, then last */
+    pair = &cv->ranges[cv->nranges * 2];
+    pair[0] = from;
+    pair[1] = to;
+    cv->nranges++;
+}
+
+/*
+ * getcvec - get a transient cvec, initialized to empty
+ *
+ * The returned cvec is valid only until the next call of getcvec, which
+ * typically will recycle the space. Callers should *not* free the cvec
+ * explicitly; it will be cleaned up when the struct vars is destroyed.
+ *
+ * This is typically used while interpreting bracket expressions. In that
+ * usage the cvec is only needed momentarily until we build arcs from it,
+ * so transientness is a convenient behavior.
+ */
+static struct cvec *
+getcvec(struct vars *v,         /* context */
+        int nchrs,              /* to hold this many chrs... */
+        int nranges)            /* ... and this many ranges */
+{
+    struct cvec *cur = v->cv;
+
+    if (cur != NULL)
+    {
+        /* recycle existing transient cvec if large enough */
+        if (nchrs <= cur->chrspace && nranges <= cur->rangespace)
+            return clearcvec(cur);
+
+        /* too small: discard it before making a bigger one */
+        freecvec(cur);
+    }
+
+    v->cv = newcvec(nchrs, nranges);
+    if (v->cv == NULL)
+        ERR(REG_ESPACE);
+
+    return v->cv;
+}
+
+/*
+ * freecvec - free a cvec
+ */
+static void
+freecvec(struct cvec *cv)
+{
+    FREE(cv);
+}
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
new file mode 100644
index 0000000..1666453
--- /dev/null
+++ b/src/backend/regex/regc_lex.c
@@ -0,0 +1,1173 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results. The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_lex.c
+ *
+ */
+
+/* scanning macros (know about v) */
+#define ATEOS()     (v->now >= v->stop)
+#define HAVE(n)     (v->stop - v->now >= (n))
+#define NEXT1(c)    (!ATEOS() && *v->now == CHR(c))
+#define NEXT2(a,b)  (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define NEXT3(a,b,c)    (HAVE(3) && *v->now == CHR(a) && \
+                        *(v->now+1) == CHR(b) && \
+                        *(v->now+2) == CHR(c))
+#define SET(c)      (v->nexttype = (c))
+#define SETV(c, n)  (v->nexttype = (c), v->nextvalue = (n))
+#define RET(c)      return (SET(c), 1)
+#define RETV(c, n)  return (SETV(c, n), 1)
+#define FAILW(e)    return (ERR(e), 0)  /* ERR does SET(EOS) */
+#define LASTTYPE(t) (v->lasttype == (t))
+
+/* lexical contexts */
+#define L_ERE   1               /* mainline ERE/ARE */
+#define L_BRE   2               /* mainline BRE */
+#define L_Q     3               /* REG_QUOTE */
+#define L_EBND  4               /* ERE/ARE bound */
+#define L_BBND  5               /* BRE bound */
+#define L_BRACK 6               /* brackets */
+#define L_CEL   7               /* collating element */
+#define L_ECL   8               /* equivalence class */
+#define L_CCL   9               /* character class */
+#define INTOCON(c)  (v->lexcon = (c))
+#define INCON(con)  (v->lexcon == (con))
+
+/* construct pointer past end of chr array */
+#define ENDOF(array)    ((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - set up lexical stuff, scan leading options
+ *
+ * Runs prefixes() first, then picks the initial lexical context from the
+ * resulting flags and fetches the first token.
+ */
+static void
+lexstart(struct vars *v)
+{
+    prefixes(v);                /* may turn on new type bits etc. */
+    NOERR();
+
+    if (!(v->cflags & (REG_QUOTE | REG_EXTENDED)))
+    {
+        /* neither quoted nor extended: plain BRE */
+        assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
+        INTOCON(L_BRE);
+    }
+    else if (v->cflags & REG_QUOTE)
+    {
+        /* literal string; quoting takes precedence over REG_EXTENDED */
+        assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
+        INTOCON(L_Q);
+    }
+    else
+    {
+        /* REG_EXTENDED without REG_QUOTE: ERE/ARE */
+        assert(!(v->cflags & REG_QUOTE));
+        INTOCON(L_ERE);
+    }
+
+    v->nexttype = EMPTY;        /* remember we were at the start */
+    next(v);                    /* set up the first token */
+}
+
+/*
+ * prefixes - implement various special prefixes
+ *
+ * Handles a leading "***" prefix and, for AREs only, a leading "(?letters)"
+ * group of embedded options, adjusting v->cflags and advancing v->now past
+ * whatever was consumed.
+ */
+static void
+prefixes(struct vars *v)
+{
+    /* literal string doesn't get any of this stuff */
+    if (v->cflags & REG_QUOTE)
+        return;
+
+    /* initial "***" gets special things, selected by the fourth chr */
+    if (HAVE(4) && NEXT3('*', '*', '*'))
+    {
+        switch (*(v->now + 3))
+        {
+            case CHR('?'):      /* "***?" error, msg shows version */
+                ERR(REG_BADPAT);
+                return;         /* proceed no further */
+            case CHR('='):      /* "***=" shifts to literal string */
+                NOTE(REG_UNONPOSIX);
+                v->cflags |= REG_QUOTE;
+                v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
+                v->now += 4;
+                return;         /* and there can be no more prefixes */
+            case CHR(':'):      /* "***:" shifts to AREs */
+                NOTE(REG_UNONPOSIX);
+                v->cflags |= REG_ADVANCED;
+                v->now += 4;
+                break;
+            default:            /* otherwise *** is just an error */
+                ERR(REG_BADRPT);
+                return;
+        }
+    }
+
+    /* BREs and EREs don't get embedded options */
+    if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
+        return;
+
+    /* embedded options (AREs only) must start with "(?" and a letter */
+    if (!(HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))))
+        return;
+
+    NOTE(REG_UNONPOSIX);
+    v->now += 2;
+
+    /* apply option letters one at a time until a non-letter or the end */
+    while (!ATEOS() && iscalpha(*v->now))
+    {
+        switch (*v->now)
+        {
+            case CHR('b'):      /* BREs (but why???) */
+                v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+                break;
+            case CHR('c'):      /* case sensitive */
+                v->cflags &= ~REG_ICASE;
+                break;
+            case CHR('e'):      /* plain EREs */
+                v->cflags |= REG_EXTENDED;
+                v->cflags &= ~(REG_ADVF | REG_QUOTE);
+                break;
+            case CHR('i'):      /* case insensitive */
+                v->cflags |= REG_ICASE;
+                break;
+            case CHR('m'):      /* Perloid synonym for n */
+            case CHR('n'):      /* \n affects ^ $ . [^ */
+                v->cflags |= REG_NEWLINE;
+                break;
+            case CHR('p'):      /* ~Perl, \n affects . [^ */
+                v->cflags |= REG_NLSTOP;
+                v->cflags &= ~REG_NLANCH;
+                break;
+            case CHR('q'):      /* literal string */
+                v->cflags |= REG_QUOTE;
+                v->cflags &= ~REG_ADVANCED;
+                break;
+            case CHR('s'):      /* single line, \n ordinary */
+                v->cflags &= ~REG_NEWLINE;
+                break;
+            case CHR('t'):      /* tight syntax */
+                v->cflags &= ~REG_EXPANDED;
+                break;
+            case CHR('w'):      /* weird, \n affects ^ $ only */
+                v->cflags &= ~REG_NLSTOP;
+                v->cflags |= REG_NLANCH;
+                break;
+            case CHR('x'):      /* expanded syntax */
+                v->cflags |= REG_EXPANDED;
+                break;
+            default:
+                ERR(REG_BADOPT);
+                return;
+        }
+        v->now++;
+    }
+
+    /* the option group must be closed by ")" */
+    if (!NEXT1(')'))
+    {
+        ERR(REG_BADOPT);
+        return;
+    }
+    v->now++;
+
+    /* once quoting is selected, these two flags are cleared */
+    if (v->cflags & REG_QUOTE)
+        v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
+}
+
+/*
+ * lexnest - "call a subroutine", interpolating string at the lexical level
+ *
+ * Note, this is not a very general facility. There are a number of
+ * implicit assumptions about what sorts of strings can be subroutines.
+ */
+static void
+lexnest(struct vars *v,
+        const chr *beginp,      /* start of interpolation */
+        const chr *endp)        /* one past end of interpolation */
+{
+    assert(v->savenow == NULL); /* only one level of nesting */
+
+    /* remember where the outer scan was ... */
+    v->savestop = v->stop;
+    v->savenow = v->now;
+
+    /* ... and redirect scanning to the interpolated string */
+    v->stop = endp;
+    v->now = beginp;
+}
+
+/*
+ * string constants to interpolate as expansions of things like \d
+ */
+static const chr backd[] = {    /* \d */
+    CHR('['),
+    CHR('['), CHR(':'),
+    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+    CHR(':'), CHR(']'),
+    CHR(']')
+};
+static const chr backD[] = {    /* \D */
+    CHR('['), CHR('^'),
+    CHR('['), CHR(':'),
+    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+    CHR(':'), CHR(']'),
+    CHR(']')
+};
+static const chr brbackd[] = {  /* \d within brackets */
+    CHR('['), CHR(':'),
+    CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+    CHR(':'), CHR(']')
+};
+static const chr backs[] = {    /* \s */
+    CHR('['),
+    CHR('['), CHR(':'),
+    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+    CHR(':'), CHR(']'),
+    CHR(']')
+};
+static const chr backS[] = {    /* \S */
+    CHR('['), CHR('^'),
+    CHR('['), CHR(':'),
+    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+    CHR(':'), CHR(']'),
+    CHR(']')
+};
+static const chr brbacks[] = {  /* \s within brackets */
+    CHR('['), CHR(':'),
+    CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+    CHR(':'), CHR(']')
+};
+static const chr backw[] = {    /* \w */
+    CHR('['),
+    CHR('['), CHR(':'),
+    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+    CHR(':'), CHR(']'),
+    CHR('_'),
+    CHR(']')
+};
+static const chr backW[] = {    /* \W */
+    CHR('['), CHR('^'),
+    CHR('['), CHR(':'),
+    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+    CHR(':'), CHR(']'),
+    CHR('_'),
+    CHR(']')
+};
+static const chr brbackw[] = {  /* \w within brackets */
+    CHR('['), CHR(':'),
+    CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+    CHR(':'), CHR(']'),
+    CHR('_')
+};
+
+/*
+ * lexword - interpolate a bracket expression for word characters
+ * Possibly ought to inquire whether there is a "word" character
class. + */ +static void +lexword(struct vars *v) +{ + lexnest(v, backw, ENDOF(backw)); +} + +/* + * next - get next token + */ +static int /* 1 normal, 0 failure */ +next(struct vars *v) +{ + chr c; + + /* errors yield an infinite sequence of failures */ + if (ISERR()) + return 0; /* the error has set nexttype to EOS */ + + /* remember flavor of last token */ + v->lasttype = v->nexttype; + + /* REG_BOSONLY */ + if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) + { + /* at start of a REG_BOSONLY RE */ + RETV(SBEGIN, 0); /* same as \A */ + } + + /* if we're nested and we've hit end, return to outer level */ + if (v->savenow != NULL && ATEOS()) + { + v->now = v->savenow; + v->stop = v->savestop; + v->savenow = v->savestop = NULL; + } + + /* skip white space etc. if appropriate (not in literal or []) */ + if (v->cflags & REG_EXPANDED) + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_EBND: + case L_BBND: + skip(v); + break; + } + + /* handle EOS, depending on context */ + if (ATEOS()) + { + switch (v->lexcon) + { + case L_ERE: + case L_BRE: + case L_Q: + RET(EOS); + break; + case L_EBND: + case L_BBND: + FAILW(REG_EBRACE); + break; + case L_BRACK: + case L_CEL: + case L_ECL: + case L_CCL: + FAILW(REG_EBRACK); + break; + } + assert(NOTREACHED); + } + + /* okay, time to actually get a character */ + c = *v->now++; + + /* deal with the easy contexts, punt EREs to code below */ + switch (v->lexcon) + { + case L_BRE: /* punt BREs to separate function */ + return brenext(v, c); + break; + case L_ERE: /* see below */ + break; + case L_Q: /* literal strings are easy */ + RETV(PLAIN, c); + break; + case L_BBND: /* bounds are fairly simple */ + case L_EBND: + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + RETV(DIGIT, (chr) DIGITVAL(c)); + break; + case CHR(','): + RET(','); + break; + case CHR('}'): /* ERE bound ends with } 
*/ + if (INCON(L_EBND)) + { + INTOCON(L_ERE); + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('}', 0); + } + RETV('}', 1); + } + else + FAILW(REG_BADBR); + break; + case CHR('\\'): /* BRE bound ends with \} */ + if (INCON(L_BBND) && NEXT1('}')) + { + v->now++; + INTOCON(L_BRE); + RETV('}', 1); + } + else + FAILW(REG_BADBR); + break; + default: + FAILW(REG_BADBR); + break; + } + assert(NOTREACHED); + break; + case L_BRACK: /* brackets are not too hard */ + switch (c) + { + case CHR(']'): + if (LASTTYPE('[')) + RETV(PLAIN, c); + else + { + INTOCON((v->cflags & REG_EXTENDED) ? + L_ERE : L_BRE); + RET(']'); + } + break; + case CHR('\\'): + NOTE(REG_UBBS); + if (!(v->cflags & REG_ADVF)) + RETV(PLAIN, c); + NOTE(REG_UNONPOSIX); + if (ATEOS()) + FAILW(REG_EESCAPE); + (DISCARD) lexescape(v); + switch (v->nexttype) + { /* not all escapes okay here */ + case PLAIN: + return 1; + break; + case CCLASS: + switch (v->nextvalue) + { + case 'd': + lexnest(v, brbackd, ENDOF(brbackd)); + break; + case 's': + lexnest(v, brbacks, ENDOF(brbacks)); + break; + case 'w': + lexnest(v, brbackw, ENDOF(brbackw)); + break; + default: + FAILW(REG_EESCAPE); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + break; + } + /* not one of the acceptable escapes */ + FAILW(REG_EESCAPE); + break; + case CHR('-'): + if (LASTTYPE('[') || NEXT1(']')) + RETV(PLAIN, c); + else + RETV(RANGE, c); + break; + case CHR('['): + if (ATEOS()) + FAILW(REG_EBRACK); + switch (*v->now++) + { + case CHR('.'): + INTOCON(L_CEL); + /* might or might not be locale-specific */ + RET(COLLEL); + break; + case CHR('='): + INTOCON(L_ECL); + NOTE(REG_ULOCALE); + RET(ECLASS); + break; + case CHR(':'): + INTOCON(L_CCL); + NOTE(REG_ULOCALE); + RET(CCLASS); + break; + default: /* oops */ + v->now--; + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + default: + RETV(PLAIN, c); + break; + } + assert(NOTREACHED); + break; + case 
L_CEL: /* collating elements are easy */ + if (c == CHR('.') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '.'); + } + else + RETV(PLAIN, c); + break; + case L_ECL: /* ditto equivalence classes */ + if (c == CHR('=') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, '='); + } + else + RETV(PLAIN, c); + break; + case L_CCL: /* ditto character classes */ + if (c == CHR(':') && NEXT1(']')) + { + v->now++; + INTOCON(L_BRACK); + RETV(END, ':'); + } + else + RETV(PLAIN, c); + break; + default: + assert(NOTREACHED); + break; + } + + /* that got rid of everything except EREs and AREs */ + assert(INCON(L_ERE)); + + /* deal with EREs and AREs, except for backslashes */ + switch (c) + { + case CHR('|'): + RET('|'); + break; + case CHR('*'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('*', 0); + } + RETV('*', 1); + break; + case CHR('+'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('+', 0); + } + RETV('+', 1); + break; + case CHR('?'): + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + v->now++; + NOTE(REG_UNONPOSIX); + RETV('?', 0); + } + RETV('?', 1); + break; + case CHR('{'): /* bounds start or plain character */ + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS() || !iscdigit(*v->now)) + { + NOTE(REG_UBRACES); + NOTE(REG_UUNSPEC); + RETV(PLAIN, c); + } + else + { + NOTE(REG_UBOUNDS); + INTOCON(L_EBND); + RET('{'); + } + assert(NOTREACHED); + break; + case CHR('('): /* parenthesis, or advanced extension */ + if ((v->cflags & REG_ADVF) && NEXT1('?')) + { + NOTE(REG_UNONPOSIX); + v->now++; + if (ATEOS()) + FAILW(REG_BADRPT); + switch (*v->now++) + { + case CHR(':'): /* non-capturing paren */ + RETV('(', 0); + break; + case CHR('#'): /* comment */ + while (!ATEOS() && *v->now != CHR(')')) + v->now++; + if (!ATEOS()) + v->now++; + assert(v->nexttype == v->lasttype); + return next(v); + break; + case CHR('='): /* positive lookahead */ + 
NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_POS); + break; + case CHR('!'): /* negative lookahead */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_NEG); + break; + case CHR('<'): + if (ATEOS()) + FAILW(REG_BADRPT); + switch (*v->now++) + { + case CHR('='): /* positive lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_POS); + break; + case CHR('!'): /* negative lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_NEG); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); + } + if (v->cflags & REG_NOSUB) + RETV('(', 0); /* all parens non-capturing */ + else + RETV('(', 1); + break; + case CHR(')'): + if (LASTTYPE('(')) + NOTE(REG_UUNSPEC); + RETV(')', c); + break; + case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) == CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? 
'<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + RET('^'); + break; + case CHR('$'): + RET('$'); + break; + case CHR('\\'): /* mostly punt backslashes to code below */ + if (ATEOS()) + FAILW(REG_EESCAPE); + break; + default: /* ordinary character */ + RETV(PLAIN, c); + break; + } + + /* ERE/ARE backslash handling; backslash already eaten */ + assert(!ATEOS()); + if (!(v->cflags & REG_ADVF)) + { /* only AREs have non-trivial escapes */ + if (iscalnum(*v->now)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, *v->now++); + } + (DISCARD) lexescape(v); + if (ISERR()) + FAILW(REG_EESCAPE); + if (v->nexttype == CCLASS) + { /* fudge at lexical level */ + switch (v->nextvalue) + { + case 'd': + lexnest(v, backd, ENDOF(backd)); + break; + case 'D': + lexnest(v, backD, ENDOF(backD)); + break; + case 's': + lexnest(v, backs, ENDOF(backs)); + break; + case 'S': + lexnest(v, backS, ENDOF(backS)); + break; + case 'w': + lexnest(v, backw, ENDOF(backw)); + break; + case 'W': + lexnest(v, backW, ENDOF(backW)); + break; + default: + assert(NOTREACHED); + FAILW(REG_ASSERT); + break; + } + /* lexnest done, back up and try again */ + v->nexttype = v->lasttype; + return next(v); + } + /* otherwise, lexescape has already done the work */ + return !ISERR(); +} + +/* + * lexescape - parse an ARE backslash escape (backslash already eaten) + * Note slightly nonstandard use of the CCLASS type code. 
+ */ +static int /* not actually used, but convenient for RETV */ +lexescape(struct vars *v) +{ + chr c; + static const chr alert[] = { + CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') + }; + static const chr esc[] = { + CHR('E'), CHR('S'), CHR('C') + }; + const chr *save; + + assert(v->cflags & REG_ADVF); + + assert(!ATEOS()); + c = *v->now++; + if (!iscalnum(c)) + RETV(PLAIN, c); + + NOTE(REG_UNONPOSIX); + switch (c) + { + case CHR('a'): + RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); + break; + case CHR('A'): + RETV(SBEGIN, 0); + break; + case CHR('b'): + RETV(PLAIN, CHR('\b')); + break; + case CHR('B'): + RETV(PLAIN, CHR('\\')); + break; + case CHR('c'): + NOTE(REG_UUNPORT); + if (ATEOS()) + FAILW(REG_EESCAPE); + RETV(PLAIN, (chr) (*v->now++ & 037)); + break; + case CHR('d'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'd'); + break; + case CHR('D'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'D'); + break; + case CHR('e'): + NOTE(REG_UUNPORT); + RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); + break; + case CHR('f'): + RETV(PLAIN, CHR('\f')); + break; + case CHR('m'): + RET('<'); + break; + case CHR('M'): + RET('>'); + break; + case CHR('n'): + RETV(PLAIN, CHR('\n')); + break; + case CHR('r'): + RETV(PLAIN, CHR('\r')); + break; + case CHR('s'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 's'); + break; + case CHR('S'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'S'); + break; + case CHR('t'): + RETV(PLAIN, CHR('\t')); + break; + case CHR('u'): + c = lexdigits(v, 16, 4, 4); + if (ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('U'): + c = lexdigits(v, 16, 8, 8); + if (ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('v'): + RETV(PLAIN, CHR('\v')); + break; + case CHR('w'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'w'); + break; + case CHR('W'): + NOTE(REG_ULOCALE); + RETV(CCLASS, 'W'); + break; + case CHR('x'): + NOTE(REG_UUNPORT); + c = lexdigits(v, 16, 1, 255); /* REs >255 long 
outside spec */ + if (ISERR() || !CHR_IS_IN_RANGE(c)) + FAILW(REG_EESCAPE); + RETV(PLAIN, c); + break; + case CHR('y'): + NOTE(REG_ULOCALE); + RETV(WBDRY, 0); + break; + case CHR('Y'): + NOTE(REG_ULOCALE); + RETV(NWBDRY, 0); + break; + case CHR('Z'): + RETV(SEND, 0); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + save = v->now; + v->now--; /* put first digit back */ + c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + if (ISERR()) + FAILW(REG_EESCAPE); + /* ugly heuristic (first test is "exactly 1 digit?") */ + if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp)) + { + NOTE(REG_UBACKREF); + RETV(BACKREF, c); + } + /* oops, doesn't look like it's a backref after all... */ + v->now = save; + /* and fall through into octal number */ + /* FALLTHROUGH */ + case CHR('0'): + NOTE(REG_UUNPORT); + v->now--; /* put first digit back */ + c = lexdigits(v, 8, 1, 3); + if (ISERR()) + FAILW(REG_EESCAPE); + if (c > 0xff) + { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; + } + RETV(PLAIN, c); + break; + default: + assert(iscalpha(c)); + FAILW(REG_EESCAPE); /* unknown alphabetic escape */ + break; + } + assert(NOTREACHED); +} + +/* + * lexdigits - slurp up digits and return chr value + * + * This does not account for overflow; callers should range-check the result + * if maxlen is large enough to make that possible. 
+ */ +static chr /* chr value; errors signalled via ERR */ +lexdigits(struct vars *v, + int base, + int minlen, + int maxlen) +{ + uchr n; /* unsigned to avoid overflow misbehavior */ + int len; + chr c; + int d; + const uchr ub = (uchr) base; + + n = 0; + for (len = 0; len < maxlen && !ATEOS(); len++) + { + c = *v->now++; + switch (c) + { + case CHR('0'): + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + d = DIGITVAL(c); + break; + case CHR('a'): + case CHR('A'): + d = 10; + break; + case CHR('b'): + case CHR('B'): + d = 11; + break; + case CHR('c'): + case CHR('C'): + d = 12; + break; + case CHR('d'): + case CHR('D'): + d = 13; + break; + case CHR('e'): + case CHR('E'): + d = 14; + break; + case CHR('f'): + case CHR('F'): + d = 15; + break; + default: + v->now--; /* oops, not a digit at all */ + d = -1; + break; + } + + if (d >= base) + { /* not a plausible digit */ + v->now--; + d = -1; + } + if (d < 0) + break; /* NOTE BREAK OUT */ + n = n * ub + (uchr) d; + } + if (len < minlen) + ERR(REG_EESCAPE); + + return (chr) n; +} + +/* + * brenext - get next BRE token + * + * This is much like EREs except for all the stupid backslashes and the + * context-dependency of some things. + */ +static int /* 1 normal, 0 failure */ +brenext(struct vars *v, + chr c) +{ + switch (c) + { + case CHR('*'): + if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) + RETV(PLAIN, c); + RETV('*', 1); + break; + case CHR('['): + if (HAVE(6) && *(v->now + 0) == CHR('[') && + *(v->now + 1) == CHR(':') && + (*(v->now + 2) == CHR('<') || + *(v->now + 2) == CHR('>')) && + *(v->now + 3) == CHR(':') && + *(v->now + 4) == CHR(']') && + *(v->now + 5) == CHR(']')) + { + c = *(v->now + 2); + v->now += 6; + NOTE(REG_UNONPOSIX); + RET((c == CHR('<')) ? 
'<' : '>'); + } + INTOCON(L_BRACK); + if (NEXT1('^')) + { + v->now++; + RETV('[', 0); + } + RETV('[', 1); + break; + case CHR('.'): + RET('.'); + break; + case CHR('^'): + if (LASTTYPE(EMPTY)) + RET('^'); + if (LASTTYPE('(')) + { + NOTE(REG_UUNSPEC); + RET('^'); + } + RETV(PLAIN, c); + break; + case CHR('$'): + if (v->cflags & REG_EXPANDED) + skip(v); + if (ATEOS()) + RET('$'); + if (NEXT2('\\', ')')) + { + NOTE(REG_UUNSPEC); + RET('$'); + } + RETV(PLAIN, c); + break; + case CHR('\\'): + break; /* see below */ + default: + RETV(PLAIN, c); + break; + } + + assert(c == CHR('\\')); + + if (ATEOS()) + FAILW(REG_EESCAPE); + + c = *v->now++; + switch (c) + { + case CHR('{'): + INTOCON(L_BBND); + NOTE(REG_UBOUNDS); + RET('{'); + break; + case CHR('('): + RETV('(', 1); + break; + case CHR(')'): + RETV(')', c); + break; + case CHR('<'): + NOTE(REG_UNONPOSIX); + RET('<'); + break; + case CHR('>'): + NOTE(REG_UNONPOSIX); + RET('>'); + break; + case CHR('1'): + case CHR('2'): + case CHR('3'): + case CHR('4'): + case CHR('5'): + case CHR('6'): + case CHR('7'): + case CHR('8'): + case CHR('9'): + NOTE(REG_UBACKREF); + RETV(BACKREF, (chr) DIGITVAL(c)); + break; + default: + if (iscalnum(c)) + { + NOTE(REG_UBSALNUM); + NOTE(REG_UUNSPEC); + } + RETV(PLAIN, c); + break; + } + + assert(NOTREACHED); + return 0; +} + +/* + * skip - skip white space and comments in expanded form + */ +static void +skip(struct vars *v) +{ + const chr *start = v->now; + + assert(v->cflags & REG_EXPANDED); + + for (;;) + { + while (!ATEOS() && iscspace(*v->now)) + v->now++; + if (ATEOS() || *v->now != CHR('#')) + break; /* NOTE BREAK OUT */ + assert(NEXT1('#')); + while (!ATEOS() && *v->now != CHR('\n')) + v->now++; + /* leave the newline to be picked up by the iscspace loop */ + } + + if (v->now != start) + NOTE(REG_UNONPOSIX); +} + +/* + * newline - return the chr for a newline + * + * This helps confine use of CHR to this source file. 
+ */ +static chr +newline(void) +{ + return CHR('\n'); +} + +/* + * chrnamed - return the chr known by a given (chr string) name + * + * The code is a bit clumsy, but this routine gets only such specialized + * use that it hardly matters. + */ +static chr +chrnamed(struct vars *v, + const chr *startp, /* start of name */ + const chr *endp, /* just past end of name */ + chr lastresort) /* what to return if name lookup fails */ +{ + chr c; + int errsave; + int e; + struct cvec *cv; + + errsave = v->err; + v->err = 0; + c = element(v, startp, endp); + e = v->err; + v->err = errsave; + + if (e != 0) + return lastresort; + + cv = range(v, c, c, 0); + if (cv->nchrs == 0) + return lastresort; + return cv->chrs[0]; +} diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c new file mode 100644 index 0000000..047abc3 --- /dev/null +++ b/src/backend/regex/regc_locale.c @@ -0,0 +1,766 @@ +/* + * regc_locale.c -- + * + * This file contains locale-specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * This software is copyrighted by the Regents of the University of + * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + * Corporation and other parties. The following terms apply to all files + * associated with the software unless explicitly disclaimed in + * individual files. + * + * The authors hereby grant permission to use, copy, modify, distribute, + * and license this software and its documentation for any purpose, provided + * that existing copyright notices are retained in all copies and that this + * notice is included verbatim in any distributions. No written agreement, + * license, or royalty fee is required for any of the authorized uses. 
+ * Modifications to this software may be copyrighted by their authors + * and need not follow the licensing terms described here, provided that + * the new terms are clearly indicated on the first page of each file where + * they apply. + * + * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + * MODIFICATIONS. + * + * GOVERNMENT USE: If you are acquiring this software on behalf of the + * U.S. government, the Government shall have only "Restricted Rights" + * in the software and related documentation as defined in the Federal + * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + * are acquiring the software on behalf of the Department of Defense, the + * software shall be classified as "Commercial Computer Software" and the + * Government shall have only "Restricted Rights" as defined in Clause + * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + * authors grant the U.S. Government and others acting in its behalf + * permission to use and distribute the software in accordance with the + * terms specified in this license. 
+ * + * src/backend/regex/regc_locale.c + */ + +/* ASCII character-name table */ + +static const struct cname +{ + const char *name; + const char code; +} cnames[] = + +{ + { + "NUL", '\0' + }, + { + "SOH", '\001' + }, + { + "STX", '\002' + }, + { + "ETX", '\003' + }, + { + "EOT", '\004' + }, + { + "ENQ", '\005' + }, + { + "ACK", '\006' + }, + { + "BEL", '\007' + }, + { + "alert", '\007' + }, + { + "BS", '\010' + }, + { + "backspace", '\b' + }, + { + "HT", '\011' + }, + { + "tab", '\t' + }, + { + "LF", '\012' + }, + { + "newline", '\n' + }, + { + "VT", '\013' + }, + { + "vertical-tab", '\v' + }, + { + "FF", '\014' + }, + { + "form-feed", '\f' + }, + { + "CR", '\015' + }, + { + "carriage-return", '\r' + }, + { + "SO", '\016' + }, + { + "SI", '\017' + }, + { + "DLE", '\020' + }, + { + "DC1", '\021' + }, + { + "DC2", '\022' + }, + { + "DC3", '\023' + }, + { + "DC4", '\024' + }, + { + "NAK", '\025' + }, + { + "SYN", '\026' + }, + { + "ETB", '\027' + }, + { + "CAN", '\030' + }, + { + "EM", '\031' + }, + { + "SUB", '\032' + }, + { + "ESC", '\033' + }, + { + "IS4", '\034' + }, + { + "FS", '\034' + }, + { + "IS3", '\035' + }, + { + "GS", '\035' + }, + { + "IS2", '\036' + }, + { + "RS", '\036' + }, + { + "IS1", '\037' + }, + { + "US", '\037' + }, + { + "space", ' ' + }, + { + "exclamation-mark", '!' + }, + { + "quotation-mark", '"' + }, + { + "number-sign", '#' + }, + { + "dollar-sign", '$' + }, + { + "percent-sign", '%' + }, + { + "ampersand", '&' + }, + { + "apostrophe", '\'' + }, + { + "left-parenthesis", '(' + }, + { + "right-parenthesis", ')' + }, + { + "asterisk", '*' + }, + { + "plus-sign", '+' + }, + { + "comma", ',' + }, + { + "hyphen", '-' + }, + { + "hyphen-minus", '-' + }, + { + "period", '.' + }, + { + "full-stop", '.' 
+ }, + { + "slash", '/' + }, + { + "solidus", '/' + }, + { + "zero", '0' + }, + { + "one", '1' + }, + { + "two", '2' + }, + { + "three", '3' + }, + { + "four", '4' + }, + { + "five", '5' + }, + { + "six", '6' + }, + { + "seven", '7' + }, + { + "eight", '8' + }, + { + "nine", '9' + }, + { + "colon", ':' + }, + { + "semicolon", ';' + }, + { + "less-than-sign", '<' + }, + { + "equals-sign", '=' + }, + { + "greater-than-sign", '>' + }, + { + "question-mark", '?' + }, + { + "commercial-at", '@' + }, + { + "left-square-bracket", '[' + }, + { + "backslash", '\\' + }, + { + "reverse-solidus", '\\' + }, + { + "right-square-bracket", ']' + }, + { + "circumflex", '^' + }, + { + "circumflex-accent", '^' + }, + { + "underscore", '_' + }, + { + "low-line", '_' + }, + { + "grave-accent", '`' + }, + { + "left-brace", '{' + }, + { + "left-curly-bracket", '{' + }, + { + "vertical-line", '|' + }, + { + "right-brace", '}' + }, + { + "right-curly-bracket", '}' + }, + { + "tilde", '~' + }, + { + "DEL", '\177' + }, + { + NULL, 0 + } +}; + +/* + * The following arrays define the valid character class names. + */ +static const char *const classNames[NUM_CCLASSES + 1] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", NULL +}; + +enum classes +{ + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT +}; + +/* + * We do not use the hard-wired Unicode classification tables that Tcl does. + * This is because (a) we need to deal with other encodings besides Unicode, + * and (b) we want to track the behavior of the libc locale routines as + * closely as possible. For example, it wouldn't be unreasonable for a + * locale to not consider every Unicode letter as a letter. So we build + * character classification cvecs by asking libc, even for Unicode. 
+ */ + + +/* + * element - map collating-element name to chr + */ +static chr +element(struct vars *v, /* context */ + const chr *startp, /* points to start of name */ + const chr *endp) /* points just past end of name */ +{ + const struct cname *cn; + size_t len; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) + return *startp; + + NOTE(REG_ULOCALE); + + /* search table */ + for (cn = cnames; cn->name != NULL; cn++) + { + if (strlen(cn->name) == len && + pg_char_and_wchar_strncmp(cn->name, startp, len) == 0) + { + break; /* NOTE BREAK OUT */ + } + } + if (cn->name != NULL) + return CHR(cn->code); + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + * range - supply cvec for a range, including legality check + */ +static struct cvec * +range(struct vars *v, /* context */ + chr a, /* range start */ + chr b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + chr c, + cc; + + if (a != b && !before(a, b)) + { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) + { /* easy version */ + cv = getcvec(v, 0, 1); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are usable, + * so for now at least, we won't try. We use a range for the originally + * specified chrs and then add on any case-equivalents that are outside + * that range as individual chrs. + * + * To ensure sane behavior if someone specifies a very large range, limit + * the allocation size to 100000 chrs (arbitrary) and check for overrun + * inside the loop below. 
+ */ + nchrs = b - a + 1; + if (nchrs <= 0 || nchrs > 100000) + nchrs = 100000; + + cv = getcvec(v, nchrs, 1); + NOERRN(); + addrange(cv, a, b); + + for (c = a; c <= b; c++) + { + cc = pg_wc_tolower(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + cc = pg_wc_toupper(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + } + + return cv; +} + +/* + * before - is chr x before chr y, for purposes of range legality? + */ +static int /* predicate */ +before(chr x, chr y) +{ + if (x < y) + return 1; + return 0; +} + +/* + * eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + */ +static struct cvec * +eclass(struct vars *v, /* context */ + chr c, /* Collating element representing the + * equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags & REG_FAKE) && c == 'x') + { + cv = getcvec(v, 4, 0); + addchr(cv, CHR('x')); + addchr(cv, CHR('y')); + if (cases) + { + addchr(cv, CHR('X')); + addchr(cv, CHR('Y')); + } + return cv; + } + + /* otherwise, none */ + if (cases) + return allcases(v, c); + cv = getcvec(v, 1, 0); + assert(cv != NULL); + addchr(cv, c); + return cv; +} + +/* + * cclass - supply cvec for a character class + * + * Must include case counterparts if "cases" is true. + * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. 
+ */ +static struct cvec * +cclass(struct vars *v, /* context */ + const chr *startp, /* where the name starts */ + const chr *endp, /* just past the end of the name */ + int cases) /* case-independent? */ +{ + size_t len; + struct cvec *cv = NULL; + const char *const *namePtr; + int i, + index; + + /* + * Map the name to the corresponding enumerated value. + */ + len = endp - startp; + index = -1; + for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) + { + if (strlen(*namePtr) == len && + pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) + { + index = i; + break; + } + } + if (index == -1) + { + ERR(REG_ECTYPE); + return NULL; + } + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && + ((enum classes) index == CC_LOWER || + (enum classes) index == CC_UPPER)) + index = (int) CC_ALPHA; + + /* + * Now compute the character class contents. For classes that are based + * on the behavior of a <wctype.h> or <ctype.h> function, we use + * pg_ctype_get_cache so that we can cache the results. Other classes + * have definitions that are hard-wired here, and for those we just + * construct a transient cvec on the fly. + * + * NB: keep this code in sync with cclass_column_index(), below. 
+ */ + + switch ((enum classes) index) + { + case CC_PRINT: + cv = pg_ctype_get_cache(pg_wc_isprint, index); + break; + case CC_ALNUM: + cv = pg_ctype_get_cache(pg_wc_isalnum, index); + break; + case CC_ALPHA: + cv = pg_ctype_get_cache(pg_wc_isalpha, index); + break; + case CC_ASCII: + /* hard-wired meaning */ + cv = getcvec(v, 0, 1); + if (cv) + addrange(cv, 0, 0x7f); + break; + case CC_BLANK: + /* hard-wired meaning */ + cv = getcvec(v, 2, 0); + addchr(cv, '\t'); + addchr(cv, ' '); + break; + case CC_CNTRL: + /* hard-wired meaning */ + cv = getcvec(v, 0, 2); + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + break; + case CC_DIGIT: + cv = pg_ctype_get_cache(pg_wc_isdigit, index); + break; + case CC_PUNCT: + cv = pg_ctype_get_cache(pg_wc_ispunct, index); + break; + case CC_XDIGIT: + + /* + * It's not clear how to define this in non-western locales, and + * even less clear that there's any particular use in trying. So + * just hard-wire the meaning. + */ + cv = getcvec(v, 0, 3); + if (cv) + { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = pg_ctype_get_cache(pg_wc_isspace, index); + break; + case CC_LOWER: + cv = pg_ctype_get_cache(pg_wc_islower, index); + break; + case CC_UPPER: + cv = pg_ctype_get_cache(pg_wc_isupper, index); + break; + case CC_GRAPH: + cv = pg_ctype_get_cache(pg_wc_isgraph, index); + break; + } + + /* If cv is NULL now, the reason must be "out of memory" */ + if (cv == NULL) + ERR(REG_ESPACE); + return cv; +} + +/* + * cclass_column_index - get appropriate high colormap column index for chr + */ +static int +cclass_column_index(struct colormap *cm, chr c) +{ + int colnum = 0; + + /* Shouldn't go through all these pushups for simple chrs */ + assert(c > MAX_SIMPLE_CHR); + + /* + * Note: we should not see requests to consider cclasses that are not + * treated as locale-specific by cclass(), above. 
+ */ + if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) + colnum |= cm->classbits[CC_PRINT]; + if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) + colnum |= cm->classbits[CC_ALNUM]; + if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) + colnum |= cm->classbits[CC_ALPHA]; + assert(cm->classbits[CC_ASCII] == 0); + assert(cm->classbits[CC_BLANK] == 0); + assert(cm->classbits[CC_CNTRL] == 0); + if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) + colnum |= cm->classbits[CC_DIGIT]; + if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) + colnum |= cm->classbits[CC_PUNCT]; + assert(cm->classbits[CC_XDIGIT] == 0); + if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) + colnum |= cm->classbits[CC_SPACE]; + if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) + colnum |= cm->classbits[CC_LOWER]; + if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) + colnum |= cm->classbits[CC_UPPER]; + if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) + colnum |= cm->classbits[CC_GRAPH]; + + return colnum; +} + +/* + * allcases - supply cvec for all case counterparts of a chr (including itself) + * + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + */ +static struct cvec * +allcases(struct vars *v, /* context */ + chr c) /* character to get case equivs of */ +{ + struct cvec *cv; + chr lc, + uc; + + lc = pg_wc_tolower(c); + uc = pg_wc_toupper(c); + + cv = getcvec(v, 2, 0); + addchr(cv, lc); + if (lc != uc) + addchr(cv, uc); + return cv; +} + +/* + * cmp - chr-substring compare + * + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! 
+ */ +static int /* 0 for equal, nonzero for unequal */ +cmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len * sizeof(chr)); +} + +/* + * casecmp - case-independent chr-substring compare + * + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) + { + if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) + return 1; + } + return 0; +} diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c new file mode 100644 index 0000000..92c9c4d --- /dev/null +++ b/src/backend/regex/regc_nfa.c @@ -0,0 +1,3181 @@ +/* + * NFA utilities. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regc_nfa.c + * + * + * One or two things that technically ought to be in here + * are actually in color.c, thanks to some incestuous relationships in + * the color chains. + */ + +#define NISERR() VISERR(nfa->v) +#define NERR(e) VERR(nfa->v, (e)) + + +/* + * newnfa - set up an NFA + */ +static struct nfa * /* the NFA, or NULL */ +newnfa(struct vars *v, + struct colormap *cm, + struct nfa *parent) /* NULL if primary NFA */ +{ + struct nfa *nfa; + + nfa = (struct nfa *) MALLOC(sizeof(struct nfa)); + if (nfa == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + + nfa->states = NULL; + nfa->slast = NULL; + nfa->free = NULL; + nfa->nstates = 0; + nfa->cm = cm; + nfa->v = v; + nfa->bos[0] = nfa->bos[1] = COLORLESS; + nfa->eos[0] = nfa->eos[1] = COLORLESS; + nfa->parent = parent; /* Precedes newfstate so parent is valid. 
*/ + nfa->post = newfstate(nfa, '@'); /* number 0 */ + nfa->pre = newfstate(nfa, '>'); /* number 1 */ + + nfa->init = newstate(nfa); /* may become invalid later */ + nfa->final = newstate(nfa); + if (ISERR()) + { + freenfa(nfa); + return NULL; + } + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init); + newarc(nfa, '^', 1, nfa->pre, nfa->init); + newarc(nfa, '^', 0, nfa->pre, nfa->init); + rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post); + newarc(nfa, '$', 1, nfa->final, nfa->post); + newarc(nfa, '$', 0, nfa->final, nfa->post); + + if (ISERR()) + { + freenfa(nfa); + return NULL; + } + return nfa; +} + +/* + * freenfa - free an entire NFA + */ +static void +freenfa(struct nfa *nfa) +{ + struct state *s; + + while ((s = nfa->states) != NULL) + { + s->nins = s->nouts = 0; /* don't worry about arcs */ + freestate(nfa, s); + } + while ((s = nfa->free) != NULL) + { + nfa->free = s->next; + destroystate(nfa, s); + } + + nfa->slast = NULL; + nfa->nstates = -1; + nfa->pre = NULL; + nfa->post = NULL; + FREE(nfa); +} + +/* + * newstate - allocate an NFA state, with zero flag value + */ +static struct state * /* NULL on error */ +newstate(struct nfa *nfa) +{ + struct state *s; + + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state or arc. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return NULL; + } + + if (nfa->free != NULL) + { + s = nfa->free; + nfa->free = s->next; + } + else + { + if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) + { + NERR(REG_ETOOBIG); + return NULL; + } + s = (struct state *) MALLOC(sizeof(struct state)); + if (s == NULL) + { + NERR(REG_ESPACE); + return NULL; + } + nfa->v->spaceused += sizeof(struct state); + s->oas.next = NULL; + s->free = NULL; + s->noas = 0; + } + + assert(nfa->nstates >= 0); + s->no = nfa->nstates++; + s->flag = 0; + if (nfa->states == NULL) + nfa->states = s; + s->nins = 0; + s->ins = NULL; + s->nouts = 0; + s->outs = NULL; + s->tmp = NULL; + s->next = NULL; + if (nfa->slast != NULL) + { + assert(nfa->slast->next == NULL); + nfa->slast->next = s; + } + s->prev = nfa->slast; + nfa->slast = s; + return s; +} + +/* + * newfstate - allocate an NFA state with a specified flag value + */ +static struct state * /* NULL on error */ +newfstate(struct nfa *nfa, int flag) +{ + struct state *s; + + s = newstate(nfa); + if (s != NULL) + s->flag = (char) flag; + return s; +} + +/* + * dropstate - delete a state's inarcs and outarcs and free it + */ +static void +dropstate(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + while ((a = s->ins) != NULL) + freearc(nfa, a); + while ((a = s->outs) != NULL) + freearc(nfa, a); + freestate(nfa, s); +} + +/* + * freestate - free a state, which has no in-arcs or out-arcs + */ +static void +freestate(struct nfa *nfa, + struct state *s) +{ + assert(s != NULL); + assert(s->nins == 0 && s->nouts == 0); + + s->no = FREESTATE; + s->flag = 0; + if (s->next != NULL) + s->next->prev = s->prev; + else + { + assert(s == nfa->slast); + nfa->slast = s->prev; + } + if (s->prev != NULL) + s->prev->next = s->next; + else + { + assert(s == nfa->states); + nfa->states = s->next; + } + s->prev = NULL; + s->next = nfa->free; /* don't delete it, put it on the free list */ + nfa->free = s; +} + +/* + * destroystate - 
really get rid of an already-freed state + */ +static void +destroystate(struct nfa *nfa, + struct state *s) +{ + struct arcbatch *ab; + struct arcbatch *abnext; + + assert(s->no == FREESTATE); + for (ab = s->oas.next; ab != NULL; ab = abnext) + { + abnext = ab->next; + FREE(ab); + nfa->v->spaceused -= sizeof(struct arcbatch); + } + s->ins = NULL; + s->outs = NULL; + s->next = NULL; + FREE(s); + nfa->v->spaceused -= sizeof(struct state); +} + +/* + * newarc - set up a new arc within an NFA + * + * This function checks to make sure that no duplicate arcs are created. + * In general we never want duplicates. + */ +static void +newarc(struct nfa *nfa, + int t, + color co, + struct state *from, + struct state *to) +{ + struct arc *a; + + assert(from != NULL && to != NULL); + + /* + * This is a handy place to check for operation cancel during regex + * compilation, since no code path will go very long without making a new + * state or arc. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + /* check for duplicate arc, using whichever chain is shorter */ + if (from->nouts <= to->nins) + { + for (a = from->outs; a != NULL; a = a->outchain) + if (a->to == to && a->co == co && a->type == t) + return; + } + else + { + for (a = to->ins; a != NULL; a = a->inchain) + if (a->from == from && a->co == co && a->type == t) + return; + } + + /* no dup, so create the arc */ + createarc(nfa, t, co, from, to); +} + +/* + * createarc - create a new arc within an NFA + * + * This function must *only* be used after verifying that there is no existing + * identical arc (same type/color/from/to). 
+ */ +static void +createarc(struct nfa *nfa, + int t, + color co, + struct state *from, + struct state *to) +{ + struct arc *a; + + /* the arc is physically allocated within its from-state */ + a = allocarc(nfa, from); + if (NISERR()) + return; + assert(a != NULL); + + a->type = t; + a->co = co; + a->to = to; + a->from = from; + + /* + * Put the new arc on the beginning, not the end, of the chains; it's + * simpler here, and freearc() is the same cost either way. See also the + * logic in moveins() and its cohorts, as well as fixempties(). + */ + a->inchain = to->ins; + a->inchainRev = NULL; + if (to->ins) + to->ins->inchainRev = a; + to->ins = a; + a->outchain = from->outs; + a->outchainRev = NULL; + if (from->outs) + from->outs->outchainRev = a; + from->outs = a; + + from->nouts++; + to->nins++; + + if (COLORED(a) && nfa->parent == NULL) + colorchain(nfa->cm, a); +} + +/* + * allocarc - allocate a new out-arc within a state + */ +static struct arc * /* NULL for failure */ +allocarc(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + /* shortcut */ + if (s->free == NULL && s->noas < ABSIZE) + { + a = &s->oas.a[s->noas]; + s->noas++; + return a; + } + + /* if none at hand, get more */ + if (s->free == NULL) + { + struct arcbatch *newAb; + int i; + + if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE) + { + NERR(REG_ETOOBIG); + return NULL; + } + newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch)); + if (newAb == NULL) + { + NERR(REG_ESPACE); + return NULL; + } + nfa->v->spaceused += sizeof(struct arcbatch); + newAb->next = s->oas.next; + s->oas.next = newAb; + + for (i = 0; i < ABSIZE; i++) + { + newAb->a[i].type = 0; + newAb->a[i].freechain = &newAb->a[i + 1]; + } + newAb->a[ABSIZE - 1].freechain = NULL; + s->free = &newAb->a[0]; + } + assert(s->free != NULL); + + a = s->free; + s->free = a->freechain; + return a; +} + +/* + * freearc - free an arc + */ +static void +freearc(struct nfa *nfa, + struct arc *victim) +{ + struct state *from = 
victim->from; + struct state *to = victim->to; + struct arc *predecessor; + + assert(victim->type != 0); + + /* take it off color chain if necessary */ + if (COLORED(victim) && nfa->parent == NULL) + uncolorchain(nfa->cm, victim); + + /* take it off source's out-chain */ + assert(from != NULL); + predecessor = victim->outchainRev; + if (predecessor == NULL) + { + assert(from->outs == victim); + from->outs = victim->outchain; + } + else + { + assert(predecessor->outchain == victim); + predecessor->outchain = victim->outchain; + } + if (victim->outchain != NULL) + { + assert(victim->outchain->outchainRev == victim); + victim->outchain->outchainRev = predecessor; + } + from->nouts--; + + /* take it off target's in-chain */ + assert(to != NULL); + predecessor = victim->inchainRev; + if (predecessor == NULL) + { + assert(to->ins == victim); + to->ins = victim->inchain; + } + else + { + assert(predecessor->inchain == victim); + predecessor->inchain = victim->inchain; + } + if (victim->inchain != NULL) + { + assert(victim->inchain->inchainRev == victim); + victim->inchain->inchainRev = predecessor; + } + to->nins--; + + /* clean up and place on from-state's free list */ + victim->type = 0; + victim->from = NULL; /* precautions... */ + victim->to = NULL; + victim->inchain = NULL; + victim->inchainRev = NULL; + victim->outchain = NULL; + victim->outchainRev = NULL; + victim->freechain = from->free; + from->free = victim; +} + +/* + * changearctarget - flip an arc to have a different to state + * + * Caller must have verified that there is no pre-existing duplicate arc. + * + * Note that because we store arcs in their from state, we can't easily have + * a similar changearcsource function. 
+ */ +static void +changearctarget(struct arc *a, struct state *newto) +{ + struct state *oldto = a->to; + struct arc *predecessor; + + assert(oldto != newto); + + /* take it off old target's in-chain */ + assert(oldto != NULL); + predecessor = a->inchainRev; + if (predecessor == NULL) + { + assert(oldto->ins == a); + oldto->ins = a->inchain; + } + else + { + assert(predecessor->inchain == a); + predecessor->inchain = a->inchain; + } + if (a->inchain != NULL) + { + assert(a->inchain->inchainRev == a); + a->inchain->inchainRev = predecessor; + } + oldto->nins--; + + a->to = newto; + + /* prepend it to new target's in-chain */ + a->inchain = newto->ins; + a->inchainRev = NULL; + if (newto->ins) + newto->ins->inchainRev = a; + newto->ins = a; + newto->nins++; +} + +/* + * hasnonemptyout - Does state have a non-EMPTY out arc? + */ +static int +hasnonemptyout(struct state *s) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (a->type != EMPTY) + return 1; + } + return 0; +} + +/* + * findarc - find arc, if any, from given source with given type and color + * If there is more than one such arc, the result is random. + */ +static struct arc * +findarc(struct state *s, + int type, + color co) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + if (a->type == type && a->co == co) + return a; + return NULL; +} + +/* + * cparc - allocate a new arc within an NFA, copying details from old one + */ +static void +cparc(struct nfa *nfa, + struct arc *oa, + struct state *from, + struct state *to) +{ + newarc(nfa, oa->type, oa->co, from, to); +} + +/* + * sortins - sort the in arcs of a state by from/color/type + */ +static void +sortins(struct nfa *nfa, + struct state *s) +{ + struct arc **sortarray; + struct arc *a; + int n = s->nins; + int i; + + if (n <= 1) + return; /* nothing to do */ + /* make an array of arc pointers ... 
*/ + sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *)); + if (sortarray == NULL) + { + NERR(REG_ESPACE); + return; + } + i = 0; + for (a = s->ins; a != NULL; a = a->inchain) + sortarray[i++] = a; + assert(i == n); + /* ... sort the array */ + qsort(sortarray, n, sizeof(struct arc *), sortins_cmp); + /* ... and rebuild arc list in order */ + /* it seems worth special-casing first and last items to simplify loop */ + a = sortarray[0]; + s->ins = a; + a->inchain = sortarray[1]; + a->inchainRev = NULL; + for (i = 1; i < n - 1; i++) + { + a = sortarray[i]; + a->inchain = sortarray[i + 1]; + a->inchainRev = sortarray[i - 1]; + } + a = sortarray[i]; + a->inchain = NULL; + a->inchainRev = sortarray[i - 1]; + FREE(sortarray); +} + +static int +sortins_cmp(const void *a, const void *b) +{ + const struct arc *aa = *((const struct arc *const *) a); + const struct arc *bb = *((const struct arc *const *) b); + + /* we check the fields in the order they are most likely to be different */ + if (aa->from->no < bb->from->no) + return -1; + if (aa->from->no > bb->from->no) + return 1; + if (aa->co < bb->co) + return -1; + if (aa->co > bb->co) + return 1; + if (aa->type < bb->type) + return -1; + if (aa->type > bb->type) + return 1; + return 0; +} + +/* + * sortouts - sort the out arcs of a state by to/color/type + */ +static void +sortouts(struct nfa *nfa, + struct state *s) +{ + struct arc **sortarray; + struct arc *a; + int n = s->nouts; + int i; + + if (n <= 1) + return; /* nothing to do */ + /* make an array of arc pointers ... */ + sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *)); + if (sortarray == NULL) + { + NERR(REG_ESPACE); + return; + } + i = 0; + for (a = s->outs; a != NULL; a = a->outchain) + sortarray[i++] = a; + assert(i == n); + /* ... sort the array */ + qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp); + /* ... 
and rebuild arc list in order */ + /* it seems worth special-casing first and last items to simplify loop */ + a = sortarray[0]; + s->outs = a; + a->outchain = sortarray[1]; + a->outchainRev = NULL; + for (i = 1; i < n - 1; i++) + { + a = sortarray[i]; + a->outchain = sortarray[i + 1]; + a->outchainRev = sortarray[i - 1]; + } + a = sortarray[i]; + a->outchain = NULL; + a->outchainRev = sortarray[i - 1]; + FREE(sortarray); +} + +static int +sortouts_cmp(const void *a, const void *b) +{ + const struct arc *aa = *((const struct arc *const *) a); + const struct arc *bb = *((const struct arc *const *) b); + + /* we check the fields in the order they are most likely to be different */ + if (aa->to->no < bb->to->no) + return -1; + if (aa->to->no > bb->to->no) + return 1; + if (aa->co < bb->co) + return -1; + if (aa->co > bb->co) + return 1; + if (aa->type < bb->type) + return -1; + if (aa->type > bb->type) + return 1; + return 0; +} + +/* + * Common decision logic about whether to use arc-by-arc operations or + * sort/merge. If there's just a few source arcs we cannot recoup the + * cost of sorting the destination arc list, no matter how large it is. + * Otherwise, limit the number of arc-by-arc comparisons to about 1000 + * (a somewhat arbitrary choice, but the breakeven point would probably + * be machine dependent anyway). + */ +#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \ + ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32)) + +/* + * moveins - move all in arcs of a state to another state + * + * You might think this could be done better by just updating the + * existing arcs, and you would be right if it weren't for the need + * for duplicate suppression, which makes it easier to just make new + * ones to exploit the suppression built into newarc. + * + * However, if we have a whole lot of arcs to deal with, retail duplicate + * checks become too slow. 
In that case we proceed by sorting and merging + * the arc lists, and then we can indeed just update the arcs in-place. + */ +static void +moveins(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + while ((a = oldState->ins) != NULL) + { + cparc(nfa, a, a->from, newState); + freearc(nfa, a); + } + } + else + { + /* + * With many arcs, use a sort-merge approach. Note changearctarget() + * will put the arc onto the front of newState's chain, so it does not + * break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortins(nfa, oldState); + sortins(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->ins; + na = newState->ins; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortins_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->inchain; + + /* + * Rather than doing createarc+freearc, we can just unlink + * and relink the existing arc struct. + */ + changearctarget(a, newState); + break; + case 0: + /* match, advance in both lists */ + oa = oa->inchain; + na = na->inchain; + /* ... 
and drop duplicate arc from oldState */ + freearc(nfa, a); + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->inchain; + changearctarget(a, newState); + } + } + + assert(oldState->nins == 0); + assert(oldState->ins == NULL); +} + +/* + * copyins - copy in arcs of a state to another state + */ +static void +copyins(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + for (a = oldState->ins; a != NULL; a = a->inchain) + cparc(nfa, a, a->from, newState); + } + else + { + /* + * With many arcs, use a sort-merge approach. Note that createarc() + * will put new arcs onto the front of newState's chain, so it does + * not break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortins(nfa, oldState); + sortins(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->ins; + na = newState->ins; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortins_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->inchain; + createarc(nfa, a->type, a->co, a->from, newState); + break; + case 0: + /* match, advance in both lists */ + oa = oa->inchain; + na = na->inchain; + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->inchain; + createarc(nfa, a->type, a->co, a->from, newState); + } + } +} + +/* + * mergeins - merge a list of inarcs into a state + * + * This is much like copyins, but the source arcs are listed in an array, + * and are not guaranteed unique. It's okay to clobber the array contents. + */ +static void +mergeins(struct nfa *nfa, + struct state *s, + struct arc **arcarray, + int arccount) +{ + struct arc *na; + int i; + int j; + + if (arccount <= 0) + return; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. + */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + /* Sort existing inarcs as well as proposed new ones */ + sortins(nfa, s); + if (NISERR()) + return; /* might have failed to sort */ + + qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp); + + /* + * arcarray very likely includes dups, so we must eliminate them. (This + * could be folded into the next loop, but it's not worth the trouble.) 
+ */ + j = 0; + for (i = 1; i < arccount; i++) + { + switch (sortins_cmp(&arcarray[j], &arcarray[i])) + { + case -1: + /* non-dup */ + arcarray[++j] = arcarray[i]; + break; + case 0: + /* dup */ + break; + default: + /* trouble */ + assert(NOTREACHED); + } + } + arccount = j + 1; + + /* + * Now merge into s' inchain. Note that createarc() will put new arcs + * onto the front of s's chain, so it does not break our walk through the + * sorted part of the chain. + */ + i = 0; + na = s->ins; + while (i < arccount && na != NULL) + { + struct arc *a = arcarray[i]; + + switch (sortins_cmp(&a, &na)) + { + case -1: + /* s does not have anything matching a */ + createarc(nfa, a->type, a->co, a->from, s); + i++; + break; + case 0: + /* match, advance in both lists */ + i++; + na = na->inchain; + break; + case +1: + /* advance only na; array might have a match later */ + na = na->inchain; + break; + default: + assert(NOTREACHED); + } + } + while (i < arccount) + { + /* s does not have anything matching a */ + struct arc *a = arcarray[i]; + + createarc(nfa, a->type, a->co, a->from, s); + i++; + } +} + +/* + * moveouts - move all out arcs of a state to another state + */ +static void +moveouts(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + while ((a = oldState->outs) != NULL) + { + cparc(nfa, a, newState, a->to); + freearc(nfa, a); + } + } + else + { + /* + * With many arcs, use a sort-merge approach. Note that createarc() + * will put new arcs onto the front of newState's chain, so it does + * not break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortouts(nfa, oldState); + sortouts(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->outs; + na = newState->outs; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortouts_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + freearc(nfa, a); + break; + case 0: + /* match, advance in both lists */ + oa = oa->outchain; + na = na->outchain; + /* ... and drop duplicate arc from oldState */ + freearc(nfa, a); + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->outchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + freearc(nfa, a); + } + } + + assert(oldState->nouts == 0); + assert(oldState->outs == NULL); +} + +/* + * copyouts - copy out arcs of a state to another state + */ +static void +copyouts(struct nfa *nfa, + struct state *oldState, + struct state *newState) +{ + assert(oldState != newState); + + if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + { + /* With not too many arcs, just do them one at a time */ + struct arc *a; + + for (a = oldState->outs; a != NULL; a = a->outchain) + cparc(nfa, a, newState, a->to); + } + else + { + /* + * With many arcs, use a sort-merge approach. Note that createarc() + * will put new arcs onto the front of newState's chain, so it does + * not break our walk through the sorted part of the chain. + */ + struct arc *oa; + struct arc *na; + + /* + * Because we bypass newarc() in this code path, we'd better include a + * cancel check. 
+ */ + if (CANCEL_REQUESTED(nfa->v->re)) + { + NERR(REG_CANCEL); + return; + } + + sortouts(nfa, oldState); + sortouts(nfa, newState); + if (NISERR()) + return; /* might have failed to sort */ + oa = oldState->outs; + na = newState->outs; + while (oa != NULL && na != NULL) + { + struct arc *a = oa; + + switch (sortouts_cmp(&oa, &na)) + { + case -1: + /* newState does not have anything matching oa */ + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + break; + case 0: + /* match, advance in both lists */ + oa = oa->outchain; + na = na->outchain; + break; + case +1: + /* advance only na; oa might have a match later */ + na = na->outchain; + break; + default: + assert(NOTREACHED); + } + } + while (oa != NULL) + { + /* newState does not have anything matching oa */ + struct arc *a = oa; + + oa = oa->outchain; + createarc(nfa, a->type, a->co, newState, a->to); + } + } +} + +/* + * cloneouts - copy out arcs of a state to another state pair, modifying type + */ +static void +cloneouts(struct nfa *nfa, + struct state *old, + struct state *from, + struct state *to, + int type) +{ + struct arc *a; + + assert(old != from); + + for (a = old->outs; a != NULL; a = a->outchain) + newarc(nfa, type, a->co, from, to); +} + +/* + * delsub - delete a sub-NFA, updating subre pointers if necessary + * + * This uses a recursive traversal of the sub-NFA, marking already-seen + * states using their tmp pointer. + */ +static void +delsub(struct nfa *nfa, + struct state *lp, /* the sub-NFA goes from here... 
*/ + struct state *rp) /* ...to here, *not* inclusive */ +{ + assert(lp != rp); + + rp->tmp = rp; /* mark end */ + + deltraverse(nfa, lp, lp); + if (NISERR()) + return; /* asserts might not hold after failure */ + assert(lp->nouts == 0 && rp->nins == 0); /* did the job */ + assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */ + + rp->tmp = NULL; /* unmark end */ + lp->tmp = NULL; /* and begin, marked by deltraverse */ +} + +/* + * deltraverse - the recursive heart of delsub + * This routine's basic job is to destroy all out-arcs of the state. + */ +static void +deltraverse(struct nfa *nfa, + struct state *leftend, + struct state *s) +{ + struct arc *a; + struct state *to; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->nouts == 0) + return; /* nothing to do */ + if (s->tmp != NULL) + return; /* already in progress */ + + s->tmp = s; /* mark as in progress */ + + while ((a = s->outs) != NULL) + { + to = a->to; + deltraverse(nfa, leftend, to); + if (NISERR()) + return; /* asserts might not hold after failure */ + assert(to->nouts == 0 || to->tmp != NULL); + freearc(nfa, a); + if (to->nins == 0 && to->tmp == NULL) + { + assert(to->nouts == 0); + freestate(nfa, to); + } + } + + assert(s->no != FREESTATE); /* we're still here */ + assert(s == leftend || s->nins != 0); /* and still reachable */ + assert(s->nouts == 0); /* but have no outarcs */ + + s->tmp = NULL; /* we're done here */ +} + +/* + * dupnfa - duplicate sub-NFA + * + * Another recursive traversal, this time using tmp to point to duplicates + * as well as mark already-seen states. (You knew there was a reason why + * it's a state pointer, didn't you? 
:-)) + */ +static void +dupnfa(struct nfa *nfa, + struct state *start, /* duplicate of subNFA starting here */ + struct state *stop, /* and stopping here */ + struct state *from, /* stringing duplicate from here */ + struct state *to) /* to here */ +{ + if (start == stop) + { + newarc(nfa, EMPTY, 0, from, to); + return; + } + + stop->tmp = to; + duptraverse(nfa, start, from); + /* done, except for clearing out the tmp pointers */ + + stop->tmp = NULL; + cleartraverse(nfa, start); +} + +/* + * duptraverse - recursive heart of dupnfa + */ +static void +duptraverse(struct nfa *nfa, + struct state *s, + struct state *stmp) /* s's duplicate, or NULL */ +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != NULL) + return; /* already done */ + + s->tmp = (stmp == NULL) ? newstate(nfa) : stmp; + if (s->tmp == NULL) + { + assert(NISERR()); + return; + } + + for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) + { + duptraverse(nfa, a->to, (struct state *) NULL); + if (NISERR()) + break; + assert(a->to->tmp != NULL); + cparc(nfa, a, s->tmp, a->to->tmp); + } +} + +/* + * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set + */ +static void +cleartraverse(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp == NULL) + return; + s->tmp = NULL; + + for (a = s->outs; a != NULL; a = a->outchain) + cleartraverse(nfa, a->to); +} + +/* + * single_color_transition - does getting from s1 to s2 cross one PLAIN arc? + * + * If traversing from s1 to s2 requires a single PLAIN match (possibly of any + * of a set of colors), return a state whose outarc list contains only PLAIN + * arcs of those color(s). Otherwise return NULL. 
+ *
+ * This is used before optimizing the NFA, so there may be EMPTY arcs, which
+ * we should ignore; the possibility of an EMPTY is why the result state could
+ * be different from s1.
+ *
+ * It's worth troubling to handle multiple parallel PLAIN arcs here because a
+ * bracket construct such as [abc] might yield either one or several parallel
+ * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
+ * that implementation detail not create user-visible performance differences.
+ *
+ * The result is NULL when the shape does not match: s1 and s2 coincide once
+ * the EMPTY arcs are skipped, s1 has no outarcs, or some outarc of s1 is not
+ * a PLAIN arc leading to s2.
+ */
+static struct state *
+single_color_transition(struct state *s1, struct state *s2)
+{
+	struct arc *a;
+
+	/* Ignore leading EMPTY arc, if any */
+	if (s1->nouts == 1 && s1->outs->type == EMPTY)
+		s1 = s1->outs->to;
+	/* Likewise for any trailing EMPTY arc */
+	if (s2->nins == 1 && s2->ins->type == EMPTY)
+		s2 = s2->ins->from;
+	/* Perhaps we could have a single-state loop in between, if so reject */
+	if (s1 == s2)
+		return NULL;
+	/* s1 must have at least one outarc... */
+	if (s1->outs == NULL)
+		return NULL;
+	/* ... and they must all be PLAIN arcs to s2 */
+	for (a = s1->outs; a != NULL; a = a->outchain)
+	{
+		if (a->type != PLAIN || a->to != s2)
+			return NULL;
+	}
+	/* OK, return s1 as the possessor of the relevant outarcs */
+	return s1;
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ *
+ * An NFA without a parent obtains four fresh pseudocolors from its colormap
+ * (two for bos[], two for eos[]). An NFA with a parent copies the parent's
+ * four values instead, each of which is asserted not to be COLORLESS.
+ */
+static void
+specialcolors(struct nfa *nfa)
+{
+	/* false colors for BOS, BOL, EOS, EOL */
+	if (nfa->parent == NULL)
+	{
+		nfa->bos[0] = pseudocolor(nfa->cm);
+		nfa->bos[1] = pseudocolor(nfa->cm);
+		nfa->eos[0] = pseudocolor(nfa->cm);
+		nfa->eos[1] = pseudocolor(nfa->cm);
+	}
+	else
+	{
+		assert(nfa->parent->bos[0] != COLORLESS);
+		nfa->bos[0] = nfa->parent->bos[0];
+		assert(nfa->parent->bos[1] != COLORLESS);
+		nfa->bos[1] = nfa->parent->bos[1];
+		assert(nfa->parent->eos[0] != COLORLESS);
+		nfa->eos[0] = nfa->parent->eos[0];
+		assert(nfa->parent->eos[1] != COLORLESS);
+		nfa->eos[1] = nfa->parent->eos[1];
+	}
+}
+
+/*
+ * optimize - optimize an NFA
+ *
+ * The main goal of this function is not so much "optimization" (though it
+ * does try to get rid of useless NFA states) as reducing the NFA to a form
+ * the regex executor can handle. The executor, and indeed the cNFA format
+ * that is its input, can only handle PLAIN and LACON arcs. The output of
+ * the regex parser also includes EMPTY (do-nothing) arcs, as well as
+ * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
+ * We first get rid of EMPTY arcs and then deal with the constraint arcs.
+ * The hardest part of either job is to get rid of circular loops of the
+ * target arc type. We would have to do that in any case, though, as such a
+ * loop would otherwise allow the executor to cycle through the loop endlessly
+ * without making any progress in the input string.
+ *
+ * The steps run in a fixed order: cleanup, fixempties, fixconstraintloops,
+ * pullback, pushfwd, cleanup again. The return value is whatever analyze()
+ * reports for the resulting NFA. The phase headings and NFA dumps printed
+ * directly by this function are compiled in only under REG_DEBUG.
+ */
+static long						/* re_info bits */
+optimize(struct nfa *nfa,
+		 FILE *f)				/* for debug output; NULL none */
+{
+#ifdef REG_DEBUG
+	int			verbose = (f != NULL) ? 1 : 0;
+
+	if (verbose)
+		fprintf(f, "\ninitial cleanup:\n");
+#endif
+	cleanup(nfa);				/* may simplify situation */
+#ifdef REG_DEBUG
+	if (verbose)
+		dumpnfa(nfa, f);
+	if (verbose)
+		fprintf(f, "\nempties:\n");
+#endif
+	fixempties(nfa, f);			/* get rid of EMPTY arcs */
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nconstraints:\n");
+#endif
+	fixconstraintloops(nfa, f); /* get rid of constraint loops */
+	pullback(nfa, f);			/* pull back constraints backward */
+	pushfwd(nfa, f);			/* push fwd constraints forward */
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nfinal cleanup:\n");
+#endif
+	cleanup(nfa);				/* final tidying */
+#ifdef REG_DEBUG
+	if (verbose)
+		dumpnfa(nfa, f);
+#endif
+	return analyze(nfa);		/* and analysis */
+}
+
+/*
+ * pullback - pull back constraints backward to eliminate them
+ *
+ * Every ^ or BEHIND outarc of every state is handed to pull(), and the whole
+ * sweep is repeated until a sweep makes no progress or an error is pending.
+ * States that a sweep leaves without inarcs or without outarcs are dropped
+ * on the spot, unless their flag is set.
+ */
+static void
+pullback(struct nfa *nfa,
+		 FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *intermediates;
+	int			progress;
+
+	/* find and pull until there are no more */
+	do
+	{
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+		{
+			nexts = s->next;	/* fetch now: s may be dropped below */
+			intermediates = NULL;
+			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+			{
+				nexta = a->outchain;	/* fetch now: pull() may free a */
+				if (a->type == '^' || a->type == BEHIND)
+					if (pull(nfa, a, &intermediates))
+						progress = 1;
+			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	/*
+	 * Any ^ constraints we were able to pull to the start state can now be
+	 * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
+	 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
+	 * that here (compact() will fail if so).
+	 */
+	for (a = nfa->pre->outs; a != NULL; a = nexta)
+	{
+		nexta = a->outchain;	/* fetch now: a is freed below */
+		if (a->type == '^')
+		{
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ *
+ * Returns 1 if successful (which it always is unless the source is the
+ * start state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no outarcs of the constraint's from state other than the given
+ * constraint arc. This makes the loops in pullback() safe, at the cost that
+ * we may leave useless states behind. Therefore, we leave it to pullback()
+ * to delete such states.
+ *
+ * If the from state has multiple back-constraint outarcs, and/or multiple
+ * compatible constraint inarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states. *intermediates
+ * points to a list of such intermediate states for this from state (chained
+ * through their tmp fields).
+ *
+ * Each inarc of the from state is classified by combine(): INCOMPATIBLE
+ * inarcs are freed, SATISFIED ones are left alone, and COMPATIBLE ones are
+ * rerouted through an intermediate state. Whatever inarcs then remain on the
+ * from state are moved to the to state, and the constraint arc is freed.
+ */
+static int
+pull(struct nfa *nfa,
+	 struct arc *con,
+	 struct state **intermediates)
+{
+	struct state *from = con->from;
+	struct state *to = con->to;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *s;
+
+	assert(from != to);			/* should have gotten rid of this earlier */
+	if (from->flag)				/* can't pull back beyond start */
+		return 0;
+	if (from->nins == 0)
+	{							/* unreachable */
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/*
+	 * First, clone from state if necessary to avoid other outarcs. This may
+	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
+	 * clone state again at the bottom.
+	 */
+	if (from->nouts > 1)
+	{
+		s = newstate(nfa);
+		if (NISERR())
+			return 0;
+		copyins(nfa, from, s);	/* duplicate inarcs */
+		cparc(nfa, con, s, to); /* move constraint arc */
+		freearc(nfa, con);
+		if (NISERR())
+			return 0;
+		from = s;
+		con = from->outs;		/* the copy just made; asserted sole outarc */
+	}
+	assert(from->nouts == 1);
+
+	/* propagate the constraint into the from state's inarcs */
+	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
+	{
+		nexta = a->inchain;		/* fetch now: a may be freed below */
+		switch (combine(con, a))
+		{
+			case INCOMPATIBLE:	/* destroy the arc */
+				freearc(nfa, a);
+				break;
+			case SATISFIED:		/* no action needed */
+				break;
+			case COMPATIBLE:	/* swap the two arcs, more or less */
+				/* need an intermediate state, but might have one already */
+				for (s = *intermediates; s != NULL; s = s->tmp)
+				{
+					assert(s->nins > 0 && s->nouts > 0);
+					if (s->ins->from == a->from && s->outs->to == to)
+						break;
+				}
+				if (s == NULL)
+				{
+					s = newstate(nfa);
+					if (NISERR())
+						return 0;
+					s->tmp = *intermediates;	/* push onto caller's list */
+					*intermediates = s;
+				}
+				cparc(nfa, con, a->from, s);
+				cparc(nfa, a, s, to);
+				freearc(nfa, a);
+				break;
+			default:
+				assert(NOTREACHED);
+				break;
+		}
+	}
+
+	/* remaining inarcs, if any, incorporate the constraint */
+	moveins(nfa, from, to);
+	freearc(nfa, con);
+	/* from state is now useless, but we leave it to pullback() to clean up */
+	return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to eliminate them
+ *
+ * Every $ or AHEAD inarc of every state is handed to push(), and the whole
+ * sweep is repeated until a sweep makes no progress or an error is pending.
+ * States that a sweep leaves without inarcs or without outarcs are dropped
+ * on the spot, unless their flag is set.
+ */
+static void
+pushfwd(struct nfa *nfa,
+		FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *intermediates;
+	int			progress;
+
+	/* find and push until there are no more */
+	do
+	{
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+		{
+			nexts = s->next;	/* fetch now: s may be dropped below */
+			intermediates = NULL;
+			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
+			{
+				nexta = a->inchain;		/* fetch now: push() may free a */
+				if (a->type == '$' || a->type == AHEAD)
+					if (push(nfa, a, &intermediates))
+						progress = 1;
+			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	/*
+	 * Any $ constraints we were able to push to the post state can now be
+	 * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
+	 * be no other $ or AHEAD arcs left in the NFA, though we do not check
+	 * that here (compact() will fail if so).
+	 */
+	for (a = nfa->post->ins; a != NULL; a = nexta)
+	{
+		nexta = a->inchain;		/* fetch now: a is freed below */
+		if (a->type == '$')
+		{
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ *
+ * Returns 1 if successful (which it always is unless the destination is the
+ * post state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no inarcs of the constraint's to state other than the given
+ * constraint arc. This makes the loops in pushfwd() safe, at the cost that
+ * we may leave useless states behind. Therefore, we leave it to pushfwd()
+ * to delete such states.
+ *
+ * If the to state has multiple forward-constraint inarcs, and/or multiple
+ * compatible constraint outarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states. *intermediates
+ * points to a list of such intermediate states for this to state (chained
+ * through their tmp fields).
+ */
+static int
+push(struct nfa *nfa,
+	 struct arc *con,
+	 struct state **intermediates)
+{
+	struct state *src = con->from;
+	struct state *dst = con->to;
+	struct state *mid;
+	struct arc *out;
+	struct arc *nextout;
+
+	/* a constraint arc looping on one state should be long gone by now */
+	assert(dst != src);
+
+	/* the constraint cannot travel past the end of the NFA */
+	if (dst->flag)
+		return 0;
+
+	/* nothing lies beyond a dead end, so the constraint is simply moot */
+	if (dst->nouts == 0)
+	{
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/*
+	 * When the destination has other inarcs too, work on a private copy of
+	 * it that has the constraint as its only inarc. The copy costs a state,
+	 * but keeps the logic below simple, and it ends up useless (hence
+	 * removable by pushfwd()) once we are done.
+	 */
+	if (dst->nins > 1)
+	{
+		struct state *clone = newstate(nfa);
+
+		if (NISERR())
+			return 0;
+		copyouts(nfa, dst, clone);		/* same outarcs as the original */
+		cparc(nfa, con, src, clone);	/* re-aim the constraint at the copy */
+		freearc(nfa, con);
+		if (NISERR())
+			return 0;
+		dst = clone;
+		con = dst->ins;
+	}
+	assert(dst->nins == 1);
+
+	/* let the constraint meet each outarc of the destination in turn */
+	out = dst->outs;
+	while (out != NULL && !NISERR())
+	{
+		int			verdict;
+
+		nextout = out->outchain;	/* out may be freed below */
+		verdict = combine(con, out);
+
+		if (verdict == INCOMPATIBLE)
+		{
+			/* the arc can never be taken once the constraint holds */
+			freearc(nfa, out);
+		}
+		else if (verdict == SATISFIED)
+		{
+			/* the arc already implies the constraint; leave it alone */
+		}
+		else if (verdict == COMPATIBLE)
+		{
+			/*
+			 * The two arcs trade places around an intermediate state.
+			 * Reuse one from the caller's list if it already joins the
+			 * same predecessor and successor.
+			 */
+			for (mid = *intermediates; mid != NULL; mid = mid->tmp)
+			{
+				assert(mid->nins > 0 && mid->nouts > 0);
+				if (mid->ins->from == src && mid->outs->to == out->to)
+					break;
+			}
+			if (mid == NULL)
+			{
+				mid = newstate(nfa);
+				if (NISERR())
+					return 0;
+				mid->tmp = *intermediates;
+				*intermediates = mid;
+			}
+			cparc(nfa, con, mid, out->to);
+			cparc(nfa, out, src, mid);
+			freearc(nfa, out);
+		}
+		else
+		{
+			assert(NOTREACHED);
+		}
+
+		out = nextout;
+	}
+
+	/* whatever outarcs survived now stand in for the constraint */
+	moveouts(nfa, dst, src);
+	freearc(nfa, con);
+	/* dst is useless from here on; removing it is pushfwd()'s job */
+	return 1;
+}
+
+/*
+ * combine - constraint lands on an arc, what happens?
+ *
+ * #def INCOMPATIBLE	1	// destroys arc
+ * #def SATISFIED		2	// constraint satisfied
+ * #def COMPATIBLE		3	// compatible but not satisfied yet
+ */
+static int
+combine(struct arc *con,
+		struct arc *a)
+{
+#define  CA(ct,at)	 (((ct)<<CHAR_BIT) | (at))
+
+	switch (CA(con->type, a->type))		/* one key built from both arc types */
+	{
+
+		case CA('^', PLAIN):	/* newlines are handled separately */
+		case CA('$', PLAIN):
+			return INCOMPATIBLE;
+			break;
+		case CA(AHEAD, PLAIN):	/* color constraints meet colors */
+		case CA(BEHIND, PLAIN):
+			if (con->co == a->co)
+				return SATISFIED;
+			return INCOMPATIBLE;
+			break;
+		case CA('^', '^'):		/* collision, similar constraints */
+		case CA('$', '$'):
+		case CA(AHEAD, AHEAD):
+		case CA(BEHIND, BEHIND):
+			if (con->co == a->co)		/* true duplication */
+				return SATISFIED;
+			return INCOMPATIBLE;
+			break;
+		case CA('^', BEHIND):	/* collision, dissimilar constraints */
+		case CA(BEHIND, '^'):
+		case CA('$', AHEAD):
+		case CA(AHEAD, '$'):
+			return INCOMPATIBLE;
+			break;
+		case CA('^', '$'):		/* constraints passing each other */
+		case CA('^', AHEAD):
+		case CA(BEHIND, '$'):
+		case CA(BEHIND, AHEAD):
+		case CA('$', '^'):
+		case CA('$', BEHIND):
+		case CA(AHEAD, '^'):
+		case CA(AHEAD, BEHIND):
+		case CA('^', LACON):
+		case CA(BEHIND, LACON):
+		case CA('$', LACON):
+		case CA(AHEAD, LACON):
+			return COMPATIBLE;
+			break;
+	}
+	assert(NOTREACHED);			/* any other pairing is unexpected */
+	return INCOMPATIBLE;		/* for benefit of blind compilers */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ *
+ * The work is done in phases: fold away states whose only outarc, or only
+ * inarc, is an EMPTY; then, for each remaining state, merge in copies of the
+ * non-EMPTY inarcs of every state that can reach it through EMPTY arcs;
+ * then free all EMPTY arcs; finally drop states left without inarcs or
+ * without outarcs. REG_ESPACE is reported if either work array cannot be
+ * allocated. If f is not NULL the resulting NFA is dumped to it.
+ */
+static void
+fixempties(struct nfa *nfa,
+		   FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *s2;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	int			totalinarcs;
+	struct arc **inarcsorig;
+	struct arc **arcarray;
+	int			arccount;
+	int			prevnins;
+	int			nskip;
+
+	/*
+	 * First, get rid of any states whose sole out-arc is an EMPTY, since
+	 * they're basically just aliases for their successor. The parsing
+	 * algorithm creates enough of these that it's worth special-casing this.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		nexts = s->next;		/* fetch now: s may be dropped below */
+		if (s->flag || s->nouts != 1)
+			continue;
+		a = s->outs;
+		assert(a != NULL && a->outchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->to)
+			moveins(nfa, s, a->to);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+	 * it into its predecessor.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		nexts = s->next;		/* fetch now: s may be dropped below */
+		/* while we're at it, ensure tmp fields are clear for next step */
+		assert(s->tmp == NULL);
+		if (s->flag || s->nins != 1)
+			continue;
+		a = s->ins;
+		assert(a != NULL && a->inchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		if (s != a->from)
+			moveouts(nfa, s, a->from);
+		dropstate(nfa, s);
+	}
+
+	if (NISERR())
+		return;
+
+	/*
+	 * For each remaining NFA state, find all other states from which it is
+	 * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
+	 * that eliminate the need for each such chain.
+	 *
+	 * We could replace a chain of EMPTY arcs that leads from a "from" state
+	 * to a "to" state either by pushing non-EMPTY arcs forward (linking
+	 * directly from "from"'s predecessors to "to") or by pulling them back
+	 * (linking directly from "from" to "to"'s successors). We choose to
+	 * always do the former; this choice is somewhat arbitrary, but the
+	 * approach below requires that we uniformly do one or the other.
+	 *
+	 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
+	 * approach the size of the NFA). All of the intermediate states must
+	 * have additional inarcs and outarcs, else they'd have been removed by
+	 * the steps above. Assuming their inarcs are mostly not empties, we will
+	 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
+	 * state in the chain must be duplicated to lead to all its successor
+	 * states as well. So there is no hope of doing less than O(N^2) work;
+	 * however, we should endeavor to keep the big-O cost from being even
+	 * worse than that, which it can easily become without care. In
+	 * particular, suppose we were to copy all S1's inarcs forward to S2, and
+	 * then also to S3, and then later we consider pushing S2's inarcs forward
+	 * to S3. If we include the arcs already copied from S1 in that, we'd be
+	 * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
+	 * and its cohorts would get rid of the extra arcs, but not without cost.)
+	 *
+	 * We can avoid this cost by treating only arcs that existed at the start
+	 * of this phase as candidates to be pushed forward. To identify those,
+	 * we remember the first inarc each state had to start with. We rely on
+	 * the fact that newarc() and friends put new arcs on the front of their
+	 * to-states' inchains, and that this phase never deletes arcs, so that
+	 * the original arcs must be the last arcs in their to-states' inchains.
+	 *
+	 * So the process here is that, for each state in the NFA, we gather up
+	 * all non-EMPTY inarcs of states that can reach the target state via
+	 * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
+	 * target state's inchain. (We can safely use sort-merge for this as long
+	 * as we update each state's original-arcs pointer after we add arcs to
+	 * it; the sort step of mergeins probably changed the order of the old
+	 * arcs.)
+	 *
+	 * Another refinement worth making is that, because we only add non-EMPTY
+	 * arcs during this phase, and all added arcs have the same from-state as
+	 * the non-EMPTY arc they were cloned from, we know ahead of time that any
+	 * states having only EMPTY outarcs will be useless for lack of outarcs
+	 * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
+	 * they had none to start with.) So we need not bother to update the
+	 * inchains of such states at all.
+	 */
+
+	/* Remember the states' first original inarcs */
+	/* ... and while at it, count how many old inarcs there are altogether */
+	inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
+	if (inarcsorig == NULL)
+	{
+		NERR(REG_ESPACE);
+		return;
+	}
+	totalinarcs = 0;
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		inarcsorig[s->no] = s->ins;		/* array is indexed by state number */
+		totalinarcs += s->nins;
+	}
+
+	/*
+	 * Create a workspace for accumulating the inarcs to be added to the
+	 * current target state. totalinarcs is probably a considerable
+	 * overestimate of the space needed, but the NFA is unlikely to be large
+	 * enough at this point to make it worth being smarter.
+	 */
+	arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
+	if (arcarray == NULL)
+	{
+		NERR(REG_ESPACE);
+		FREE(inarcsorig);		/* don't leak the first array */
+		return;
+	}
+
+	/* And iterate over the target states */
+	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+	{
+		/* Ignore target states without non-EMPTY outarcs, per note above */
+		if (!s->flag && !hasnonemptyout(s))
+			continue;
+
+		/* Find predecessor states and accumulate their original inarcs */
+		arccount = 0;
+		for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
+		{
+			/* Add s2's original inarcs to arcarray[], but ignore empties */
+			for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
+			{
+				if (a->type != EMPTY)
+					arcarray[arccount++] = a;
+			}
+
+			/* Reset the tmp fields as we walk back */
+			nexts = s2->tmp;
+			s2->tmp = NULL;
+		}
+		s->tmp = NULL;			/* s itself ends the chain; clear it too */
+		assert(arccount <= totalinarcs);
+
+		/* Remember how many original inarcs this state has */
+		prevnins = s->nins;
+
+		/* Add non-duplicate inarcs to target state */
+		mergeins(nfa, s, arcarray, arccount);
+
+		/* Now we must update the state's inarcsorig pointer */
+		nskip = s->nins - prevnins;		/* growth in inarc count */
+		a = s->ins;
+		while (nskip-- > 0)
+			a = a->inchain;
+		inarcsorig[s->no] = a;
+	}
+
+	FREE(arcarray);
+	FREE(inarcsorig);
+
+	if (NISERR())
+		return;
+
+	/*
+	 * Now remove all the EMPTY arcs, since we don't need them anymore.
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		for (a = s->outs; a != NULL; a = nexta)
+		{
+			nexta = a->outchain;	/* fetch now: a may be freed below */
+			if (a->type == EMPTY)
+				freearc(nfa, a);
+		}
+	}
+
+	/*
+	 * And remove any states that have become useless. (This cleanup is not
+	 * very thorough, and would be even less so if we tried to combine it with
+	 * the previous step; but cleanup() will take care of anything we miss.)
+	 */
+	for (s = nfa->states; s != NULL; s = nexts)
+	{
+		nexts = s->next;		/* fetch now: s may be dropped below */
+		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+			dropstate(nfa, s);
+	}
+
+	if (f != NULL)
+		dumpnfa(nfa, f);
+}
+
+/*
+ * emptyreachable - recursively find all states that can reach s by EMPTY arcs
+ *
+ * The return value is the last such state found. Its tmp field links back
+ * to the next-to-last such state, and so on back to s, so that all these
+ * states can be located without searching the whole NFA.
+ *
+ * Since this is only used in fixempties(), we pass in the inarcsorig[] array
+ * maintained by that function. This lets us skip over all new inarcs, which
+ * are certainly not EMPTY arcs.
+ *
+ * The maximum recursion depth here is equal to the length of the longest
+ * loop-free chain of EMPTY arcs, which is surely no more than the size of
+ * the NFA ... but that could still be enough to cause trouble.
+ */ +static struct state * +emptyreachable(struct nfa *nfa, + struct state *s, + struct state *lastfound, + struct arc **inarcsorig) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return lastfound; + } + + s->tmp = lastfound; + lastfound = s; + for (a = inarcsorig[s->no]; a != NULL; a = a->inchain) + { + if (a->type == EMPTY && a->from->tmp == NULL) + lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig); + } + return lastfound; +} + +/* + * isconstraintarc - detect whether an arc is of a constraint type + */ +static inline int +isconstraintarc(struct arc *a) +{ + switch (a->type) + { + case '^': + case '$': + case BEHIND: + case AHEAD: + case LACON: + return 1; + } + return 0; +} + +/* + * hasconstraintout - does state have a constraint out arc? + */ +static int +hasconstraintout(struct state *s) +{ + struct arc *a; + + for (a = s->outs; a != NULL; a = a->outchain) + { + if (isconstraintarc(a)) + return 1; + } + return 0; +} + +/* + * fixconstraintloops - get rid of loops containing only constraint arcs + * + * A loop of states that contains only constraint arcs is useless, since + * passing around the loop represents no forward progress. Moreover, it + * would cause infinite looping in pullback/pushfwd, so we need to get rid + * of such loops before doing that. + */ +static void +fixconstraintloops(struct nfa *nfa, + FILE *f) /* for debug output; NULL none */ +{ + struct state *s; + struct state *nexts; + struct arc *a; + struct arc *nexta; + int hasconstraints; + + /* + * In the trivial case of a state that loops to itself, we can just drop + * the constraint arc altogether. This is worth special-casing because + * such loops are far more common than loops containing multiple states. + * While we're at it, note whether any constraint arcs survive. 
+ */ + hasconstraints = 0; + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; + /* while we're at it, ensure tmp fields are clear for next step */ + assert(s->tmp == NULL); + for (a = s->outs; a != NULL && !NISERR(); a = nexta) + { + nexta = a->outchain; + if (isconstraintarc(a)) + { + if (a->to == s) + freearc(nfa, a); + else + hasconstraints = 1; + } + } + /* If we removed all the outarcs, the state is useless. */ + if (s->nouts == 0 && !s->flag) + dropstate(nfa, s); + } + + /* Nothing to do if no remaining constraint arcs */ + if (NISERR() || !hasconstraints) + return; + + /* + * Starting from each remaining NFA state, search outwards for a + * constraint loop. If we find a loop, break the loop, then start the + * search over. (We could possibly retain some state from the first scan, + * but it would complicate things greatly, and multi-state constraint + * loops are rare enough that it's not worth optimizing the case.) + */ +restart: + for (s = nfa->states; s != NULL && !NISERR(); s = s->next) + { + if (findconstraintloop(nfa, s)) + goto restart; + } + + if (NISERR()) + return; + + /* + * Now remove any states that have become useless. (This cleanup is not + * very thorough, and would be even less so if we tried to combine it with + * the previous step; but cleanup() will take care of anything we miss.) + * + * Because findconstraintloop intentionally doesn't reset all tmp fields, + * we have to clear them after it's done. This is a convenient place to + * do that, too. + */ + for (s = nfa->states; s != NULL; s = nexts) + { + nexts = s->next; + s->tmp = NULL; + if ((s->nins == 0 || s->nouts == 0) && !s->flag) + dropstate(nfa, s); + } + + if (f != NULL) + dumpnfa(nfa, f); +} + +/* + * findconstraintloop - recursively find a loop of constraint arcs + * + * If we find a loop, break it by calling breakconstraintloop(), then + * return 1; otherwise return 0. 
+ * + * State tmp fields are guaranteed all NULL on a success return, because + * breakconstraintloop does that. After a failure return, any state that + * is known not to be part of a loop is marked with s->tmp == s; this allows + * us not to have to re-prove that fact on later calls. (This convention is + * workable because we already eliminated single-state loops.) + * + * Note that the found loop doesn't necessarily include the first state we + * are called on. Any loop reachable from that state will do. + * + * The maximum recursion depth here is one more than the length of the longest + * loop-free chain of constraint arcs, which is surely no more than the size + * of the NFA ... but that could still be enough to cause trouble. + */ +static int +findconstraintloop(struct nfa *nfa, struct state *s) +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return 1; /* to exit as quickly as possible */ + } + + if (s->tmp != NULL) + { + /* Already proven uninteresting? */ + if (s->tmp == s) + return 0; + /* Found a loop involving s */ + breakconstraintloop(nfa, s); + /* The tmp fields have been cleaned up by breakconstraintloop */ + return 1; + } + for (a = s->outs; a != NULL; a = a->outchain) + { + if (isconstraintarc(a)) + { + struct state *sto = a->to; + + assert(sto != s); + s->tmp = sto; + if (findconstraintloop(nfa, sto)) + return 1; + } + } + + /* + * If we get here, no constraint loop exists leading out from s. Mark it + * with s->tmp == s so we need not rediscover that fact again later. + */ + s->tmp = s; + return 0; +} + +/* + * breakconstraintloop - break a loop of constraint arcs + * + * sinitial is any one member state of the loop. Each loop member's tmp + * field links to its successor within the loop. (Note that this function + * will reset all the tmp fields to NULL.) 
+ *
+ * We can break the loop by, for any one state S1 in the loop, cloning its
+ * loop successor state S2 (and possibly following states), and then moving
+ * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
+ * copy any non-constraint outarcs of S2. Constraint outarcs should be
+ * dropped if they point back to S1, else they need to be copied as arcs to
+ * similarly cloned states S3, S4, etc. In general, each cloned state copies
+ * non-constraint outarcs, drops constraint outarcs that would lead to itself
+ * or any earlier cloned state, and sends other constraint outarcs to newly
+ * cloned states. No cloned state will have any inarcs that aren't constraint
+ * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
+ * constraint back-arcs since they would not take us to any state we've not
+ * already been in; therefore, no new constraint loop is created. In this way
+ * we generate a modified NFA that can still represent every useful state
+ * sequence, but not sequences that represent state loops with no consumption
+ * of input data. Note that the set of cloned states will certainly include
+ * all of the loop member states other than S1, and it may also include
+ * non-loop states that are reachable from S2 via constraint arcs. This is
+ * important because there is no guarantee that findconstraintloop found a
+ * maximal loop (and searching for one would be NP-hard, so don't try).
+ * Frequently the "non-loop states" are actually part of a larger loop that
+ * we didn't notice, and indeed there may be several overlapping loops.
+ * This technique ensures convergence in such cases, while considering only
+ * the originally-found loop does not.
+ *
+ * If there is only one S1->S2 constraint arc, then that constraint is
+ * certainly satisfied when we enter any of the clone states. This means that
+ * in the common case where many of the constraint arcs are identically
+ * labeled, we can merge together clone states linked by a similarly-labeled
+ * constraint: if we can get to the first one we can certainly get to the
+ * second, so there's no need to distinguish. This greatly reduces the number
+ * of new states needed, so we preferentially break the given loop at a state
+ * pair where this is true.
+ *
+ * Furthermore, it's fairly common to find that a cloned successor state has
+ * no outarcs, especially if we're a bit aggressive about removing unnecessary
+ * outarcs. If that happens, then there is simply not any interesting state
+ * that can be reached through the predecessor's loop arcs, which means we can
+ * break the loop just by removing those loop arcs, with no new states added.
+ *
+ * In the code below, S1 is called shead and S2 is called stail.
+ */
+static void
+breakconstraintloop(struct nfa *nfa, struct state *sinitial)
+{
+	struct state *s;
+	struct state *shead;
+	struct state *stail;
+	struct state *sclone;
+	struct state *nexts;
+	struct arc *refarc;
+	struct arc *a;
+	struct arc *nexta;
+
+	/*
+	 * Start by identifying which loop step we want to break at.
+	 * Preferentially this is one with only one constraint arc. (XXX are
+	 * there any other secondary heuristics we want to use here?) Set refarc
+	 * to point to the selected lone constraint arc, if there is one.
+	 */
+	refarc = NULL;
+	s = sinitial;
+	do
+	{
+		nexts = s->tmp;			/* s's successor within the loop */
+		assert(nexts != s);		/* should not see any one-element loops */
+		if (refarc == NULL)
+		{
+			int			narcs = 0;
+
+			for (a = s->outs; a != NULL; a = a->outchain)
+			{
+				if (a->to == nexts && isconstraintarc(a))
+				{
+					refarc = a;
+					narcs++;
+				}
+			}
+			assert(narcs > 0);
+			if (narcs > 1)
+				refarc = NULL;	/* multiple constraint arcs here, no good */
+		}
+		s = nexts;
+	} while (s != sinitial);
+
+	if (refarc)
+	{
+		/* break at the refarc */
+		shead = refarc->from;
+		stail = refarc->to;
+		assert(stail == shead->tmp);
+	}
+	else
+	{
+		/* for lack of a better idea, break after sinitial */
+		shead = sinitial;
+		stail = sinitial->tmp;
+	}
+
+	/*
+	 * Reset the tmp fields so that we can use them for local storage in
+	 * clonesuccessorstates. (findconstraintloop won't mind, since it's just
+	 * going to abandon its search anyway.)
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+		s->tmp = NULL;
+
+	/*
+	 * Recursively build clone state(s) as needed.
+	 */
+	sclone = newstate(nfa);
+	if (sclone == NULL)
+	{
+		assert(NISERR());
+		return;
+	}
+
+	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
+						 NULL, NULL, nfa->nstates);		/* no donemaps yet */
+
+	if (NISERR())
+		return;
+
+	/*
+	 * It's possible that sclone has no outarcs at all, in which case it's
+	 * useless. (We don't try extremely hard to get rid of useless states
+	 * here, but this is an easy and fairly common case.)
+	 */
+	if (sclone->nouts == 0)
+	{
+		freestate(nfa, sclone);
+		sclone = NULL;			/* tells the loop below to just drop arcs */
+	}
+
+	/*
+	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
+	 * if we discovered we don't need sclone.
+	 */
+	for (a = shead->outs; a != NULL; a = nexta)
+	{
+		nexta = a->outchain;	/* fetch now: a may be freed below */
+		if (a->to == stail && isconstraintarc(a))
+		{
+			if (sclone)
+				cparc(nfa, a, shead, sclone);
+			freearc(nfa, a);
+			if (NISERR())
+				break;
+		}
+	}
+}
+
+/*
+ * clonesuccessorstates - create a tree of constraint-arc successor states
+ *
+ * ssource is the state to be cloned, and sclone is the state to copy its
+ * outarcs into. sclone's inarcs, if any, should already be set up.
+ *
+ * spredecessor is the original predecessor state that we are trying to build
+ * successors for (it may not be the immediate predecessor of ssource).
+ * refarc, if not NULL, is the original constraint arc that is known to have
+ * been traversed out of spredecessor to reach the successor(s).
+ *
+ * For each cloned successor state, we transiently create a "donemap" that is
+ * a boolean array showing which source states we've already visited for this
+ * clone state. This prevents infinite recursion as well as useless repeat
+ * visits to the same state subtree (which can add up fast, since typical NFAs
+ * have multiple redundant arc pathways). Each donemap is a char array
+ * indexed by state number. The donemaps are all of the same size "nstates",
+ * which is nfa->nstates as of the start of the recursion. This is enough to
+ * have entries for all pre-existing states, but *not* entries for clone
+ * states created during the recursion. That's okay since we have no need to
+ * mark those.
+ *
+ * curdonemap is NULL when recursing to a new sclone state, or sclone's
+ * donemap when we are recursing without having created a new state (which we
+ * do when we decide we can merge a successor state into the current clone
+ * state). outerdonemap is NULL at the top level and otherwise the parent
+ * clone state's donemap.
+ * + * The successor states we create and fill here form a strict tree structure, + * with each state having exactly one predecessor, except that the toplevel + * state has no inarcs as yet (breakconstraintloop will add its inarcs from + * spredecessor after we're done). Thus, we can examine sclone's inarcs back + * to the root, plus refarc if any, to identify the set of constraints already + * known valid at the current point. This allows us to avoid generating extra + * successor states. + */ +static void +clonesuccessorstates(struct nfa *nfa, + struct state *ssource, + struct state *sclone, + struct state *spredecessor, + struct arc *refarc, + char *curdonemap, + char *outerdonemap, + int nstates) +{ + char *donemap; /* visited-state map in use for sclone */ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + /* If this state hasn't already got a donemap, create one */ + donemap = curdonemap; + if (donemap == NULL) + { + donemap = (char *) MALLOC(nstates * sizeof(char)); + if (donemap == NULL) + { + NERR(REG_ESPACE); + return; + } + + if (outerdonemap != NULL) + { + /* + * Not at outermost recursion level, so copy the outer level's + * donemap; this ensures that we see states in process of being + * visited at outer levels, or already merged into predecessor + * states, as ones we shouldn't traverse back to. + */ + memcpy(donemap, outerdonemap, nstates * sizeof(char)); + } + else + { + /* At outermost level, only spredecessor is off-limits */ + memset(donemap, 0, nstates * sizeof(char)); + assert(spredecessor->no < nstates); + donemap[spredecessor->no] = 1; + } + } + + /* Mark ssource as visited in the donemap; it must not have been visited yet */ + assert(ssource->no < nstates); + assert(donemap[ssource->no] == 0); + donemap[ssource->no] = 1; + + /* + * We proceed by first cloning all of ssource's outarcs, creating new + * clone states as needed but not doing more with them than that.
Then in + * a second pass, recurse to process the child clone states. This allows + * us to have only one child clone state per reachable source state, even + * when there are multiple outarcs leading to the same state. Also, when + * we do visit a child state, its set of inarcs is known exactly, which + * makes it safe to apply the constraint-is-already-checked optimization. + * Also, this ensures that we've merged all the states we can into the + * current clone before we recurse to any children, thus possibly saving + * them from making extra images of those states. + * + * While this function runs, child clone states of the current state are + * marked by setting their tmp fields to point to the original state they + * were cloned from. This makes it possible to detect multiple outarcs + * leading to the same state, and also makes it easy to distinguish clone + * states from original states (which will have tmp == NULL). + */ + for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain) + { + struct state *sto = a->to; + + /* + * We do not consider cloning successor states that have no constraint + * outarcs; just link to them as-is. They cannot be part of a + * constraint loop so there is no need to make copies. In particular, + * this rule keeps us from trying to clone the post state, which would + * be a bad idea. + */ + if (isconstraintarc(a) && hasconstraintout(sto)) + { + struct state *prevclone; /* child clone already made for sto, if any */ + int canmerge; /* nonzero if sto can be merged into sclone */ + struct arc *a2; + + /* + * Back-link constraint arcs must not be followed. Nor is there a + * need to revisit states previously merged into this clone. + */ + assert(sto->no < nstates); + if (donemap[sto->no] != 0) + continue; + + /* + * Check whether we already have a child clone state for this + * source state.
+ */ + prevclone = NULL; + for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain) + { + if (a2->to->tmp == sto) + { + prevclone = a2->to; + break; + } + } + + /* + * If this arc is labeled the same as refarc, or the same as any + * arc we must have traversed to get to sclone, then no additional + * constraints need to be met to get to sto, so we should just + * merge its outarcs into sclone. + */ + if (refarc && a->type == refarc->type && a->co == refarc->co) + canmerge = 1; + else + { + struct state *s; /* steps back from sclone via the first inarc of each state */ + + canmerge = 0; + for (s = sclone; s->ins; s = s->ins->from) + { + if (s->nins == 1 && + a->type == s->ins->type && a->co == s->ins->co) + { + canmerge = 1; + break; + } + } + } + + if (canmerge) + { + /* + * We can merge into sclone. If we previously made a child + * clone state, drop it; there's no need to visit it. (This + * can happen if ssource has multiple pathways to sto, and we + * only just now found one that is provably a no-op.) + */ + if (prevclone) + dropstate(nfa, prevclone); /* kills our outarc, too */ + + /* Recurse to merge sto's outarcs into sclone */ + clonesuccessorstates(nfa, + sto, + sclone, + spredecessor, + refarc, + donemap, + outerdonemap, + nstates); + /* sto should now be marked as previously visited */ + assert(NISERR() || donemap[sto->no] == 1); + } + else if (prevclone) + { + /* + * We already have a clone state for this successor, so just + * make another arc to it. + */ + cparc(nfa, a, sclone, prevclone); + } + else + { + /* + * We need to create a new successor clone state. + */ + struct state *stoclone; + + stoclone = newstate(nfa); + if (stoclone == NULL) + { + assert(NISERR()); + break; + } + /* Mark it as to what it's a clone of */ + stoclone->tmp = sto; + /* ... and add the outarc leading to it */ + cparc(nfa, a, sclone, stoclone); + } + } + else + { + /* + * Non-constraint outarcs just get copied to sclone, as do outarcs + * leading to states with no constraint outarc.
+ */ + cparc(nfa, a, sclone, sto); + } + } + + /* + * If we are at outer level for this clone state, recurse to all its child + * clone states, clearing their tmp fields as we go. (If we're not + * outermost for sclone, leave this to be done by the outer call level.) + * Note that if we have multiple outarcs leading to the same clone state, + * it will only be recursed-to once. + */ + if (curdonemap == NULL) + { + for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain) + { + struct state *stoclone = a->to; + struct state *sto = stoclone->tmp; + + if (sto != NULL) + { + stoclone->tmp = NULL; + clonesuccessorstates(nfa, + sto, + stoclone, + spredecessor, + refarc, + NULL, + donemap, + nstates); + } + } + + /* Don't forget to free sclone's donemap when done with it */ + FREE(donemap); + } +} + +/* + * cleanup - clean up NFA after optimizations + * + * Keeps a state only if both marking passes reached it (tmp == nfa->post) or + * its flag is nonzero, then renumbers the survivors and resets nfa->nstates. + * Does nothing if an error is already pending. + */ +static void +cleanup(struct nfa *nfa) +{ + struct state *s; + struct state *nexts; + int n; + + if (NISERR()) + return; + + /* clear out unreachable or dead-end states */ + /* use pre to mark reachable, then post to mark can-reach-post */ + markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre); + markcanreach(nfa, nfa->post, nfa->pre, nfa->post); + for (s = nfa->states; s != NULL && !NISERR(); s = nexts) + { + nexts = s->next; /* fetched first because s may be dropped just below */ + if (s->tmp != nfa->post && !s->flag) + dropstate(nfa, s); + } + assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post); + cleartraverse(nfa, nfa->pre); + assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL); + /* the nins==0 (final unreachable) case will be caught later */ + + /* renumber surviving states */ + n = 0; + for (s = nfa->states; s != NULL; s = s->next) + s->no = n++; + nfa->nstates = n; +} + +/* + * markreachable - recursive marking of reachable states + * + * Sets tmp to mark on s and, recursively, on every state reachable from it + * through outarcs; a state is visited only while its tmp still equals okay. + * Reports REG_ETOOBIG instead of recursing further when the stack is too deep. + */ +static void +markreachable(struct nfa *nfa, + struct state *s, + struct state *okay, /* consider only states with this mark */ + struct state *mark) /* the value to mark with */ +{ +
struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != okay) + return; + s->tmp = mark; + + for (a = s->outs; a != NULL; a = a->outchain) + markreachable(nfa, a->to, okay, mark); +} + +/* + * markcanreach - recursive marking of states which can reach here + * + * Follows inarcs backward from s, setting tmp to mark on each state whose + * tmp currently equals okay. Reports REG_ETOOBIG instead of recursing + * further when the stack is too deep. + */ +static void +markcanreach(struct nfa *nfa, + struct state *s, + struct state *okay, /* consider only states with this mark */ + struct state *mark) /* the value to mark with */ +{ + struct arc *a; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != okay) + return; + s->tmp = mark; + + for (a = s->ins; a != NULL; a = a->inchain) + markcanreach(nfa, a->from, okay, mark); +} + +/* + * analyze - ascertain potentially-useful facts about an optimized NFA + * + * Returns 0 if an error is already pending, REG_UIMPOSSIBLE if the pre state + * has no outarcs, REG_UEMPTYMATCH if some two-arc path leads from pre to + * post, and 0 otherwise. + */ +static long /* re_info bits to be ORed in */ +analyze(struct nfa *nfa) +{ + struct arc *a; + struct arc *aa; + + if (NISERR()) + return 0; + + if (nfa->pre->outs == NULL) + return REG_UIMPOSSIBLE; + for (a = nfa->pre->outs; a != NULL; a = a->outchain) + for (aa = a->to->outs; aa != NULL; aa = aa->outchain) + if (aa->to == nfa->post) + return REG_UEMPTYMATCH; + return 0; +} + +/* + * compact - construct the compact representation of an NFA + * + * Fills *cnfa with three arrays sized from a first pass over the state list. + * On allocation failure, frees whatever was allocated and reports REG_ESPACE. + */ +static void +compact(struct nfa *nfa, + struct cnfa *cnfa) +{ + struct state *s; + struct arc *a; + size_t nstates; + size_t narcs; /* total carc slots, counting one endmarker per state */ + struct carc *ca; + struct carc *first; + + assert(!NISERR()); + + nstates = 0; + narcs = 0; + for (s = nfa->states; s != NULL; s = s->next) + { + nstates++; + narcs += s->nouts + 1; /* need one extra for endmarker */ + } + + cnfa->stflags = (char *) MALLOC(nstates * sizeof(char)); + cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *)); + cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc)); + if (cnfa->stflags == NULL ||
cnfa->states == NULL || cnfa->arcs == NULL) + { + if (cnfa->stflags != NULL) + FREE(cnfa->stflags); + if (cnfa->states != NULL) + FREE(cnfa->states); + if (cnfa->arcs != NULL) + FREE(cnfa->arcs); + NERR(REG_ESPACE); + return; + } + cnfa->nstates = nstates; + cnfa->pre = nfa->pre->no; + cnfa->post = nfa->post->no; + cnfa->bos[0] = nfa->bos[0]; + cnfa->bos[1] = nfa->bos[1]; + cnfa->eos[0] = nfa->eos[0]; + cnfa->eos[1] = nfa->eos[1]; + cnfa->ncolors = maxcolor(nfa->cm) + 1; + cnfa->flags = 0; + + ca = cnfa->arcs; + for (s = nfa->states; s != NULL; s = s->next) + { + assert((size_t) s->no < nstates); + cnfa->stflags[s->no] = 0; + cnfa->states[s->no] = ca; /* arc list of this state starts here */ + first = ca; + for (a = s->outs; a != NULL; a = a->outchain) + switch (a->type) + { + case PLAIN: + ca->co = a->co; + ca->to = a->to->no; + ca++; + break; + case LACON: + assert(s->no != cnfa->pre); + ca->co = (color) (cnfa->ncolors + a->co); /* stored above the plain color range */ + ca->to = a->to->no; + ca++; + cnfa->flags |= HASLACONS; + break; + default: + NERR(REG_ASSERT); + break; + } + carcsort(first, ca - first); + ca->co = COLORLESS; /* endmarker for the arc list of this state */ + ca->to = 0; + ca++; + } + assert(ca == &cnfa->arcs[narcs]); + assert(cnfa->nstates != 0); + + /* mark no-progress states */ + for (a = nfa->pre->outs; a != NULL; a = a->outchain) + cnfa->stflags[a->to->no] = CNFA_NOPROGRESS; + cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS; +} + +/* + * carcsort - sort compacted-NFA arcs by color + * + * Arcs of equal color are further ordered by target state number. + * Nothing is done for fewer than two arcs. + */ +static void +carcsort(struct carc *first, size_t n) +{ + if (n > 1) + qsort(first, n, sizeof(struct carc), carc_cmp); +} + +static int /* qsort comparison: order by co, then by to */ +carc_cmp(const void *a, const void *b) +{ + const struct carc *aa = (const struct carc *) a; + const struct carc *bb = (const struct carc *) b; + + if (aa->co < bb->co) + return -1; + if (aa->co > bb->co) + return +1; + if (aa->to < bb->to) + return -1; + if (aa->to > bb->to) + return +1; + return 0; +} + +/* + * freecnfa - free a compacted NFA + * + * Releases the three arrays allocated by compact and zeroes nstates. + */ +static void +freecnfa(struct cnfa *cnfa) +{ + assert(cnfa->nstates != 0); /* not empty already */ +
cnfa->nstates = 0; /* nstates == 0 marks the cnfa as empty */ + FREE(cnfa->stflags); + FREE(cnfa->states); + FREE(cnfa->arcs); +} + +/* + * dumpnfa - dump an NFA in human-readable form + * + * The body is compiled only when REG_DEBUG is defined; otherwise this is a + * no-op. The color map is dumped only for an NFA with no parent. + */ +static void +dumpnfa(struct nfa *nfa, + FILE *f) +{ +#ifdef REG_DEBUG + struct state *s; + int nstates = 0; + int narcs = 0; + + fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no); + if (nfa->bos[0] != COLORLESS) + fprintf(f, ", bos [%ld]", (long) nfa->bos[0]); + if (nfa->bos[1] != COLORLESS) + fprintf(f, ", bol [%ld]", (long) nfa->bos[1]); + if (nfa->eos[0] != COLORLESS) + fprintf(f, ", eos [%ld]", (long) nfa->eos[0]); + if (nfa->eos[1] != COLORLESS) + fprintf(f, ", eol [%ld]", (long) nfa->eos[1]); + fprintf(f, "\n"); + for (s = nfa->states; s != NULL; s = s->next) + { + dumpstate(s, f); + nstates++; + narcs += s->nouts; + } + fprintf(f, "total of %d states, %d arcs\n", nstates, narcs); + if (nfa->parent == NULL) + dumpcolors(nfa->cm, f); + fflush(f); +#endif +} + +#ifdef REG_DEBUG /* subordinates of dumpnfa */ + +/* + * dumpstate - dump an NFA state in human-readable form + * + * Prints the state number, a T if tmp is set, the flag character (or a dot), + * then the outarcs; also reports inconsistencies found in the state chain + * and in the in-chain. + */ +static void +dumpstate(struct state *s, + FILE *f) +{ + struct arc *a; + + fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "", + (s->flag) ?
s->flag : '.'); + if (s->prev != NULL && s->prev->next != s) + fprintf(f, "\tstate chain bad\n"); + if (s->nouts == 0) + fprintf(f, "\tno out arcs\n"); + else + dumparcs(s, f); + fflush(f); /* then check that every arc on the in-chain leads to s */ + for (a = s->ins; a != NULL; a = a->inchain) + { + if (a->to != s) + fprintf(f, "\tlink from %d to %d on %d's in-chain\n", + a->from->no, a->to->no, s->no); + } +} + +/* + * dumparcs - dump out-arcs in human-readable form + * + * Starts from the last arc on the out-chain and follows outchainRev, ending + * the output line after each fifth arc. The state must have an outarc. + */ +static void +dumparcs(struct state *s, + FILE *f) +{ + int pos; + struct arc *a; + + /* printing oldest arcs first is usually clearer */ + a = s->outs; + assert(a != NULL); + while (a->outchain != NULL) + a = a->outchain; + pos = 1; + do + { + dumparc(a, s, f); + if (pos == 5) + { + fprintf(f, "\n"); + pos = 1; + } + else + pos++; + a = a->outchainRev; + } while (a != NULL); + if (pos != 1) + fprintf(f, "\n"); +} + +/* + * dumparc - dump one outarc in readable form, including prefixing tab + * + * Appends ?n? if the arc does not start at s, and ?!? if the arc is not found + * in the arc batches of its from state or on the in-chain of its to state. + */ +static void +dumparc(struct arc *a, + struct state *s, + FILE *f) +{ + struct arc *aa; + struct arcbatch *ab; + + fprintf(f, "\t"); + switch (a->type) + { + case PLAIN: + fprintf(f, "[%ld]", (long) a->co); + break; + case AHEAD: + fprintf(f, ">%ld>", (long) a->co); + break; + case BEHIND: + fprintf(f, "<%ld<", (long) a->co); + break; + case LACON: + fprintf(f, ":%ld:", (long) a->co); + break; + case '^': + case '$': + fprintf(f, "%c%d", a->type, (int) a->co); + break; + case EMPTY: + break; + default: + fprintf(f, "0x%x/0%lo", a->type, (long) a->co); + break; + } + if (a->from != s) + fprintf(f, "?%d?", a->from->no); + for (ab = &a->from->oas; ab != NULL; ab = ab->next) + { + for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++) + if (aa == a) + break; /* NOTE BREAK OUT */ + if (aa < &ab->a[ABSIZE]) /* propagate break */ + break; /* NOTE BREAK OUT */ + } + if (ab == NULL) + fprintf(f, "?!?"); /* not in allocated space */ + fprintf(f, "->"); + if (a->to == NULL) + { + fprintf(f, "NULL"); + return; + } + fprintf(f, "%d", a->to->no); + for (aa = a->to->ins; aa != NULL; aa
= aa->inchain) + if (aa == a) + break; /* NOTE BREAK OUT */ + if (aa == NULL) + fprintf(f, "?!?"); /* missing from in-chain */ +} +#endif /* REG_DEBUG */ + +/* + * dumpcnfa - dump a compacted NFA in human-readable form + * + * Prints the header fields (pre, post, any bos/bol/eos/eol colors that are + * set, and the haslacons flag), then one dumpcstate entry per state. + */ +#ifdef REG_DEBUG +static void +dumpcnfa(struct cnfa *cnfa, + FILE *f) +{ + int st; + + fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post); + if (cnfa->bos[0] != COLORLESS) + fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]); + if (cnfa->bos[1] != COLORLESS) + fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]); + if (cnfa->eos[0] != COLORLESS) + fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]); + if (cnfa->eos[1] != COLORLESS) + fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]); + if (cnfa->flags & HASLACONS) + fprintf(f, ", haslacons"); + fprintf(f, "\n"); + for (st = 0; st < cnfa->nstates; st++) + dumpcstate(st, cnfa, f); + fflush(f); +} +#endif + +#ifdef REG_DEBUG /* subordinates of dumpcnfa */ + +/* + * dumpcstate - dump a compacted-NFA state in human-readable form + * + * Arcs whose color is at or above ncolors are printed as :n: with the + * ncolors offset removed; the output line is ended after each fifth arc. + * The arc list is read up to its COLORLESS endmarker. + */ +static void +dumpcstate(int st, + struct cnfa *cnfa, + FILE *f) +{ + struct carc *ca; + int pos; + + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ?
":" : "."); + pos = 1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); + else + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); + if (pos == 5) + { + fprintf(f, "\n"); + pos = 1; + } + else + pos++; + } + if (ca == cnfa->states[st] || pos != 1) + fprintf(f, "\n"); + fflush(f); +} + +#endif /* REG_DEBUG */ diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c new file mode 100644 index 0000000..3cc2d4d --- /dev/null +++ b/src/backend/regex/regc_pg_locale.c @@ -0,0 +1,935 @@ +/*------------------------------------------------------------------------- + * + * regc_pg_locale.c + * ctype functions adapted to work on pg_wchar (a/k/a chr), + * and functions to cache the results of wholesale ctype probing. + * + * This file is #included by regcomp.c; it's not meant to compile standalone. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/regex/regc_pg_locale.c + * + *------------------------------------------------------------------------- + */ + +#include "catalog/pg_collation.h" +#include "utils/pg_locale.h" + +/* + * To provide as much functionality as possible on a variety of platforms, + * without going so far as to implement everything from scratch, we use + * several implementation strategies depending on the situation: + * + * 1. In C/POSIX collations, we use hard-wired code. We can't depend on + * the <ctype.h> functions since those will obey LC_CTYPE. Note that these + * collations don't give a fig about multibyte characters. + * + * 2. In the "default" collation (which is supposed to obey LC_CTYPE): + * + * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * This assumes that every platform uses Unicode codepoints directly + * as the wchar_t representation of Unicode. On some platforms + * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. + * + * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar + * values up to 255, and punt for values above that. This is 100% correct + * only in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't very relevant for higher code values + * anyway. The difficulty with using the <wctype.h> functions with + * non-Unicode multibyte encodings is that we can have no certainty that + * the platform's wchar_t representation matches what we do in pg_wchar + * conversions. + * + * 3. Other collations are only supported on platforms that HAVE_LOCALE_T. + * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h> + * functions, under exactly the same cases as #2. + * + * There is one notable difference between cases 2 and 3: in the "default" + * collation we force ASCII letters to follow ASCII upcase/downcase rules, + * while in a non-default collation we just let the library functions do what + * they will. The case where this matters is treatment of I/i in Turkish, + * and the behavior is meant to match the upper()/lower() SQL functions. + * + * We store the active collation setting in static variables. In principle + * it could be passed down to here via the regex library's "struct vars" data + * structure; but that would require somewhat invasive changes in the regex + * library, and right now there's no real benefit to be gained from that. + * + * NB: the coding here assumes pg_wchar is an unsigned type. 
+ */ + +typedef enum +{ + PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */ + PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */ + PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */ + PG_REGEX_LOCALE_1BYTE_L, /* Use locale_t <ctype.h> functions */ + PG_REGEX_LOCALE_ICU /* Use ICU uchar.h functions */ +} PG_Locale_Strategy; + +static PG_Locale_Strategy pg_regex_strategy; /* chosen by pg_set_regex_collation */ +static pg_locale_t pg_regex_locale; /* 0 for the C and default collations */ +static Oid pg_regex_collation; /* collation OID recorded by pg_set_regex_collation */ + +/* + * Hard-wired character properties for C locale + * + * Bit flags stored per character in pg_char_properties[]; the table has + * entries only for codes 0..127. + */ +#define PG_ISDIGIT 0x01 +#define PG_ISALPHA 0x02 +#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA) +#define PG_ISUPPER 0x04 +#define PG_ISLOWER 0x08 +#define PG_ISGRAPH 0x10 +#define PG_ISPRINT 0x20 +#define PG_ISPUNCT 0x40 +#define PG_ISSPACE 0x80 + +static const unsigned char pg_char_properties[128] = { + /* NUL */ 0, + /* ^A */ 0, + /* ^B */ 0, + /* ^C */ 0, + /* ^D */ 0, + /* ^E */ 0, + /* ^F */ 0, + /* ^G */ 0, + /* ^H */ 0, + /* ^I */ PG_ISSPACE, + /* ^J */ PG_ISSPACE, + /* ^K */ PG_ISSPACE, + /* ^L */ PG_ISSPACE, + /* ^M */ PG_ISSPACE, + /* ^N */ 0, + /* ^O */ 0, + /* ^P */ 0, + /* ^Q */ 0, + /* ^R */ 0, + /* ^S */ 0, + /* ^T */ 0, + /* ^U */ 0, + /* ^V */ 0, + /* ^W */ 0, + /* ^X */ 0, + /* ^Y */ 0, + /* ^Z */ 0, + /* ^[ */ 0, + /* ^\ */ 0, + /* ^] */ 0, + /* ^^ */ 0, + /* ^_ */ 0, + /* */ PG_ISPRINT | PG_ISSPACE, + /* !
*/ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* " */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* # */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* $ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* % */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* & */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ' */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ( */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ) */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* * */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* + */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* , */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* - */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* . */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* / */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* 0 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 1 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 2 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 3 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 4 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 5 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 6 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 7 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 8 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* 9 */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT, + /* : */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ; */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* < */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* = */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* > */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ? 
*/ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* @ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* A */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* B */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* C */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* D */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* E */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* F */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* G */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* H */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* I */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* J */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* K */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* L */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* M */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* N */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* O */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* P */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Q */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* R */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* S */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* T */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* U */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* V */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* W */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* X */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Y */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* Z */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT, + /* [ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* \ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ] */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ^ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* _ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ` */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* a */ PG_ISALPHA | PG_ISLOWER | 
PG_ISGRAPH | PG_ISPRINT, + /* b */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* c */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* d */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* e */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* f */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* g */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* h */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* i */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* j */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* k */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* l */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* m */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* n */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* o */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* p */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* q */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* r */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* s */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* t */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* u */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* v */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* w */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* x */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* y */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* z */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT, + /* { */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* | */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* } */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* ~ */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT, + /* DEL */ 0 +}; + + +/* + * pg_set_regex_collation: set collation for these functions to obey + * + * This is called when beginning compilation or execution of a regexp. 
+ * Since there's no need for reentrancy of regexp operations, it's okay + * to store the results in static variables. + */ +void +pg_set_regex_collation(Oid collation) +{ + if (lc_ctype_is_c(collation)) + { + /* C/POSIX collations use this path regardless of database encoding */ + pg_regex_strategy = PG_REGEX_LOCALE_C; + pg_regex_locale = 0; + pg_regex_collation = C_COLLATION_OID; + } + else + { + if (collation == DEFAULT_COLLATION_OID) + pg_regex_locale = 0; /* default collation: the non-locale_t strategies are chosen below */ + else if (OidIsValid(collation)) + { + /* + * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; + * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not + * have to be considered below. + */ + pg_regex_locale = pg_newlocale_from_collation(collation); + } + else + { + /* + * This typically means that the parser could not resolve a + * conflict of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for regular expression"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (pg_regex_locale && !pg_regex_locale->deterministic) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("nondeterministic collations are not supported for regular expressions"))); + +#ifdef USE_ICU + if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU) + pg_regex_strategy = PG_REGEX_LOCALE_ICU; /* an ICU locale is checked before the encoding-based choice */ + else +#endif + if (GetDatabaseEncoding() == PG_UTF8) /* UTF8: wide-character functions */ + { + if (pg_regex_locale) + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE; + } + else /* any other encoding: single-byte functions */ + { + if (pg_regex_locale) + pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L; + else + pg_regex_strategy = PG_REGEX_LOCALE_1BYTE; + } + + pg_regex_collation = collation; + } +} + +static int /* nonzero if c is a digit under the current strategy */ +pg_wc_isdigit(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + case
PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswdigit((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isdigit((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isdigit(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is alphabetic under the current strategy */ +pg_wc_isalpha(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalpha((wint_t) c); + /* FALL THRU: wchar_t is narrower than 4 bytes and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isalpha((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU: no locale_t support, or wchar_t is narrow and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isalpha(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is alphanumeric under the current strategy */ +pg_wc_isalnum(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalnum((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: +
return (c <= (pg_wchar) UCHAR_MAX && + isalnum((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isalnum(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is upper case under the current strategy */ +pg_wc_isupper(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswupper((wint_t) c); + /* FALL THRU: wchar_t is narrower than 4 bytes and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isupper((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswupper_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU: no locale_t support, or wchar_t is narrow and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isupper_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isupper(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is lower case under the current strategy */ +pg_wc_islower(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswlower((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + islower((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <=
(pg_wchar) 0xFFFF) + return iswlower_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + islower_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_islower(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is a graphic character under the current strategy */ +pg_wc_isgraph(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswgraph((wint_t) c); + /* FALL THRU: wchar_t is narrower than 4 bytes and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isgraph((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU: no locale_t support, or wchar_t is narrow and c > 0xFFFF */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isgraph(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int /* nonzero if c is printable under the current strategy */ +pg_wc_isprint(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswprint((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isprint((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswprint_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T +
return (c <= (pg_wchar) UCHAR_MAX && + isprint_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_isprint(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_ispunct(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswpunct((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + ispunct((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_ispunct(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static int +pg_wc_isspace(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_LOCALE_WIDE: + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswspace((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + return (c <= (pg_wchar) UCHAR_MAX && + isspace((unsigned char) c)); + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswspace_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + return (c <= (pg_wchar) UCHAR_MAX && + isspace_l((unsigned char) c, pg_regex_locale->info.lt)); +#endif + break; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + 
return u_isspace(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static pg_wchar +pg_wc_toupper(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towupper((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + if (c <= (pg_wchar) UCHAR_MAX) + return toupper((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + if (c <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) c, pg_regex_locale->info.lt); +#endif + return c; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_toupper(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + +static pg_wchar +pg_wc_tolower(pg_wchar c) +{ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towlower((wint_t) c); + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE: + /* force C behavior for ASCII characters, per comments above */ + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + if (c <= (pg_wchar) UCHAR_MAX) + 
return tolower((unsigned char) c); + return c; + case PG_REGEX_LOCALE_WIDE_L: +#ifdef HAVE_LOCALE_T + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) c, pg_regex_locale->info.lt); +#endif + /* FALL THRU */ + case PG_REGEX_LOCALE_1BYTE_L: +#ifdef HAVE_LOCALE_T + if (c <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) c, pg_regex_locale->info.lt); +#endif + return c; + case PG_REGEX_LOCALE_ICU: +#ifdef USE_ICU + return u_tolower(c); +#endif + break; + } + return 0; /* can't get here, but keep compiler quiet */ +} + + +/* + * These functions cache the results of probing libc's ctype behavior for + * all character codes of interest in a given encoding/collation. The + * result is provided as a "struct cvec", but notice that the representation + * is a touch different from a cvec created by regc_cvec.c: we allocate the + * chrs[] and ranges[] arrays separately from the struct so that we can + * realloc them larger at need. This is okay since the cvecs made here + * should never be freed by freecvec(). + * + * We use malloc not palloc since we mustn't lose control on out-of-memory; + * the main regex code expects us to return a failure indication instead. 
+ */ + +typedef int (*pg_wc_probefunc) (pg_wchar c); + +typedef struct pg_ctype_cache +{ + pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */ + Oid collation; /* collation this entry is for */ + struct cvec cv; /* cache entry contents */ + struct pg_ctype_cache *next; /* chain link */ +} pg_ctype_cache; + +static pg_ctype_cache *pg_ctype_cache_list = NULL; + +/* + * Add a chr or range to pcc->cv; return false if run out of memory + */ +static bool +store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) +{ + chr *newchrs; + + if (nchrs > 1) + { + if (pcc->cv.nranges >= pcc->cv.rangespace) + { + pcc->cv.rangespace *= 2; + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.rangespace * sizeof(chr) * 2); + if (newchrs == NULL) + return false; + pcc->cv.ranges = newchrs; + } + pcc->cv.ranges[pcc->cv.nranges * 2] = chr1; + pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1; + pcc->cv.nranges++; + } + else + { + assert(nchrs == 1); + if (pcc->cv.nchrs >= pcc->cv.chrspace) + { + pcc->cv.chrspace *= 2; + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.chrspace * sizeof(chr)); + if (newchrs == NULL) + return false; + pcc->cv.chrs = newchrs; + } + pcc->cv.chrs[pcc->cv.nchrs++] = chr1; + } + return true; +} + +/* + * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all + * chrs satisfying the probe function. The active collation is the one + * previously set by pg_set_regex_collation. Return NULL if out of memory. + * + * Note that the result must not be freed or modified by caller. + */ +static struct cvec * +pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) +{ + pg_ctype_cache *pcc; + pg_wchar max_chr; + pg_wchar cur_chr; + int nmatches; + chr *newchrs; + + /* + * Do we already have the answer cached? + */ + for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next) + { + if (pcc->probefunc == probefunc && + pcc->collation == pg_regex_collation) + return &pcc->cv; + } + + /* + * Nope, so initialize some workspace ... 
+ */ + pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache)); + if (pcc == NULL) + return NULL; + pcc->probefunc = probefunc; + pcc->collation = pg_regex_collation; + pcc->cv.nchrs = 0; + pcc->cv.chrspace = 128; + pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr)); + pcc->cv.nranges = 0; + pcc->cv.rangespace = 64; + pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); + if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) + goto out_of_memory; + pcc->cv.cclasscode = cclasscode; + + /* + * Decide how many character codes we ought to look through. In general + * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at + * runtime using the "high colormap" mechanism. However, in C locale + * there's no need to go further than 127, and if we only have a 1-byte + * <ctype.h> API there's no need to go further than that can handle. + * + * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the + * output cvec as not having any locale-dependent behavior, since there + * will be no need to do any run-time locale checks. (The #if's here + * would always be true for production values of MAX_SIMPLE_CHR, but it's + * useful to allow it to be small for testing purposes.) + */ + switch (pg_regex_strategy) + { + case PG_REGEX_LOCALE_C: +#if MAX_SIMPLE_CHR >= 127 + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; +#else + max_chr = (pg_wchar) MAX_SIMPLE_CHR; +#endif + break; + case PG_REGEX_LOCALE_WIDE: + case PG_REGEX_LOCALE_WIDE_L: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; + case PG_REGEX_LOCALE_1BYTE: + case PG_REGEX_LOCALE_1BYTE_L: +#if MAX_SIMPLE_CHR >= UCHAR_MAX + max_chr = (pg_wchar) UCHAR_MAX; + pcc->cv.cclasscode = -1; +#else + max_chr = (pg_wchar) MAX_SIMPLE_CHR; +#endif + break; + case PG_REGEX_LOCALE_ICU: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; + default: + max_chr = 0; /* can't get here, but keep compiler quiet */ + break; + } + + /* + * And scan 'em ... 
+ */ + nmatches = 0; /* number of consecutive matches */ + + for (cur_chr = 0; cur_chr <= max_chr; cur_chr++) + { + if ((*probefunc) (cur_chr)) + nmatches++; + else if (nmatches > 0) + { + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + nmatches = 0; + } + } + + if (nmatches > 0) + if (!store_match(pcc, cur_chr - nmatches, nmatches)) + goto out_of_memory; + + /* + * We might have allocated more memory than needed, if so free it + */ + if (pcc->cv.nchrs == 0) + { + free(pcc->cv.chrs); + pcc->cv.chrs = NULL; + pcc->cv.chrspace = 0; + } + else if (pcc->cv.nchrs < pcc->cv.chrspace) + { + newchrs = (chr *) realloc(pcc->cv.chrs, + pcc->cv.nchrs * sizeof(chr)); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.chrs = newchrs; + pcc->cv.chrspace = pcc->cv.nchrs; + } + if (pcc->cv.nranges == 0) + { + free(pcc->cv.ranges); + pcc->cv.ranges = NULL; + pcc->cv.rangespace = 0; + } + else if (pcc->cv.nranges < pcc->cv.rangespace) + { + newchrs = (chr *) realloc(pcc->cv.ranges, + pcc->cv.nranges * sizeof(chr) * 2); + if (newchrs == NULL) + goto out_of_memory; + pcc->cv.ranges = newchrs; + pcc->cv.rangespace = pcc->cv.nranges; + } + + /* + * Success, link it into cache chain + */ + pcc->next = pg_ctype_cache_list; + pg_ctype_cache_list = pcc; + + return &pcc->cv; + + /* + * Failure, clean up + */ +out_of_memory: + if (pcc->cv.chrs) + free(pcc->cv.chrs); + if (pcc->cv.ranges) + free(pcc->cv.ranges); + free(pcc); + + return NULL; +} diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c new file mode 100644 index 0000000..91078dc --- /dev/null +++ b/src/backend/regex/regcomp.c @@ -0,0 +1,2194 @@ +/* + * re_*comp and friends - compile REs + * This file #includes several others (see the bottom). + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regcomp.c + * + */ + +#include "regex/regguts.h" + +/* + * forward declarations, up here so forward datatypes etc. 
are defined early + */ +/* === regcomp.c === */ +static void moresubs(struct vars *, int); +static int freev(struct vars *, int); +static void makesearch(struct vars *, struct nfa *); +static struct subre *parse(struct vars *, int, int, struct state *, struct state *); +static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int); +static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); +static void nonword(struct vars *, int, struct state *, struct state *); +static void word(struct vars *, int, struct state *, struct state *); +static int scannum(struct vars *); +static void repeat(struct vars *, struct state *, struct state *, int, int); +static void bracket(struct vars *, struct state *, struct state *); +static void cbracket(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *); +static const chr *scanplain(struct vars *); +static void onechr(struct vars *, chr, struct state *, struct state *); +static void wordchrs(struct vars *); +static void processlacon(struct vars *, struct state *, struct state *, int, + struct state *, struct state *); +static struct subre *subre(struct vars *, int, int, struct state *, struct state *); +static void freesubre(struct vars *, struct subre *); +static void freesrnode(struct vars *, struct subre *); +static void optst(struct vars *, struct subre *); +static int numst(struct subre *, int); +static void markst(struct subre *); +static void cleanst(struct vars *); +static long nfatree(struct vars *, struct subre *, FILE *); +static long nfanode(struct vars *, struct subre *, int, FILE *); +static int newlacon(struct vars *, struct state *, struct state *, int); +static void freelacons(struct subre *, int); +static void rfree(regex_t *); +static int rcancelrequested(void); +static int rstacktoodeep(void); + +#ifdef REG_DEBUG +static void dump(regex_t *, FILE *); +static void dumpst(struct 
subre *, FILE *, int); +static void stdump(struct subre *, FILE *, int); +static const char *stid(struct subre *, char *, size_t); +#endif +/* === regc_lex.c === */ +static void lexstart(struct vars *); +static void prefixes(struct vars *); +static void lexnest(struct vars *, const chr *, const chr *); +static void lexword(struct vars *); +static int next(struct vars *); +static int lexescape(struct vars *); +static chr lexdigits(struct vars *, int, int, int); +static int brenext(struct vars *, chr); +static void skip(struct vars *); +static chr newline(void); +static chr chrnamed(struct vars *, const chr *, const chr *, chr); + +/* === regc_color.c === */ +static void initcm(struct vars *, struct colormap *); +static void freecm(struct colormap *); +static color maxcolor(struct colormap *); +static color newcolor(struct colormap *); +static void freecolor(struct colormap *, color); +static color pseudocolor(struct colormap *); +static color subcolor(struct colormap *, chr); +static color subcolorhi(struct colormap *, color *); +static color newsub(struct colormap *, color); +static int newhicolorrow(struct colormap *, int); +static void newhicolorcols(struct colormap *); +static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *); +static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *); +static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *); +static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *); +static void okcolors(struct nfa *, struct colormap *); +static void colorchain(struct colormap *, struct arc *); +static void uncolorchain(struct colormap *, struct arc *); +static void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *); +static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *); + +#ifdef REG_DEBUG +static void dumpcolors(struct 
colormap *, FILE *); +static void dumpchr(chr, FILE *); +#endif +/* === regc_nfa.c === */ +static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *); +static void freenfa(struct nfa *); +static struct state *newstate(struct nfa *); +static struct state *newfstate(struct nfa *, int flag); +static void dropstate(struct nfa *, struct state *); +static void freestate(struct nfa *, struct state *); +static void destroystate(struct nfa *, struct state *); +static void newarc(struct nfa *, int, color, struct state *, struct state *); +static void createarc(struct nfa *, int, color, struct state *, struct state *); +static struct arc *allocarc(struct nfa *, struct state *); +static void freearc(struct nfa *, struct arc *); +static void changearctarget(struct arc *, struct state *); +static int hasnonemptyout(struct state *); +static struct arc *findarc(struct state *, int, color); +static void cparc(struct nfa *, struct arc *, struct state *, struct state *); +static void sortins(struct nfa *, struct state *); +static int sortins_cmp(const void *, const void *); +static void sortouts(struct nfa *, struct state *); +static int sortouts_cmp(const void *, const void *); +static void moveins(struct nfa *, struct state *, struct state *); +static void copyins(struct nfa *, struct state *, struct state *); +static void mergeins(struct nfa *, struct state *, struct arc **, int); +static void moveouts(struct nfa *, struct state *, struct state *); +static void copyouts(struct nfa *, struct state *, struct state *); +static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int); +static void delsub(struct nfa *, struct state *, struct state *); +static void deltraverse(struct nfa *, struct state *, struct state *); +static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); +static void duptraverse(struct nfa *, struct state *, struct state *); +static void cleartraverse(struct nfa *, struct state *); 
+static struct state *single_color_transition(struct state *, struct state *); +static void specialcolors(struct nfa *); +static long optimize(struct nfa *, FILE *); +static void pullback(struct nfa *, FILE *); +static int pull(struct nfa *, struct arc *, struct state **); +static void pushfwd(struct nfa *, FILE *); +static int push(struct nfa *, struct arc *, struct state **); + +#define INCOMPATIBLE 1 /* destroys arc */ +#define SATISFIED 2 /* constraint satisfied */ +#define COMPATIBLE 3 /* compatible but not satisfied yet */ +static int combine(struct arc *, struct arc *); +static void fixempties(struct nfa *, FILE *); +static struct state *emptyreachable(struct nfa *, struct state *, + struct state *, struct arc **); +static int isconstraintarc(struct arc *); +static int hasconstraintout(struct state *); +static void fixconstraintloops(struct nfa *, FILE *); +static int findconstraintloop(struct nfa *, struct state *); +static void breakconstraintloop(struct nfa *, struct state *); +static void clonesuccessorstates(struct nfa *, struct state *, struct state *, + struct state *, struct arc *, + char *, char *, int); +static void cleanup(struct nfa *); +static void markreachable(struct nfa *, struct state *, struct state *, struct state *); +static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); +static long analyze(struct nfa *); +static void compact(struct nfa *, struct cnfa *); +static void carcsort(struct carc *, size_t); +static int carc_cmp(const void *, const void *); +static void freecnfa(struct cnfa *); +static void dumpnfa(struct nfa *, FILE *); + +#ifdef REG_DEBUG +static void dumpstate(struct state *, FILE *); +static void dumparcs(struct state *, FILE *); +static void dumparc(struct arc *, struct state *, FILE *); +static void dumpcnfa(struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); +#endif +/* === regc_cvec.c === */ +static struct cvec *newcvec(int, int); +static struct cvec 
*clearcvec(struct cvec *); +static void addchr(struct cvec *, chr); +static void addrange(struct cvec *, chr, chr); +static struct cvec *getcvec(struct vars *, int, int); +static void freecvec(struct cvec *); + +/* === regc_pg_locale.c === */ +static int pg_wc_isdigit(pg_wchar c); +static int pg_wc_isalpha(pg_wchar c); +static int pg_wc_isalnum(pg_wchar c); +static int pg_wc_isupper(pg_wchar c); +static int pg_wc_islower(pg_wchar c); +static int pg_wc_isgraph(pg_wchar c); +static int pg_wc_isprint(pg_wchar c); +static int pg_wc_ispunct(pg_wchar c); +static int pg_wc_isspace(pg_wchar c); +static pg_wchar pg_wc_toupper(pg_wchar c); +static pg_wchar pg_wc_tolower(pg_wchar c); + +/* === regc_locale.c === */ +static chr element(struct vars *, const chr *, const chr *); +static struct cvec *range(struct vars *, chr, chr, int); +static int before(chr, chr); +static struct cvec *eclass(struct vars *, chr, int); +static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static int cclass_column_index(struct colormap *, chr); +static struct cvec *allcases(struct vars *, chr); +static int cmp(const chr *, const chr *, size_t); +static int casecmp(const chr *, const chr *, size_t); + + +/* internal variables, bundled for easy passing around */ +struct vars +{ + regex_t *re; + const chr *now; /* scan pointer into string */ + const chr *stop; /* end of string */ + const chr *savenow; /* saved now and stop for "subroutine call" */ + const chr *savestop; + int err; /* error code (0 if none) */ + int cflags; /* copy of compile flags */ + int lasttype; /* type of previous token */ + int nexttype; /* type of next token */ + chr nextvalue; /* value (if any) of next token */ + int lexcon; /* lexical context type (see lex.c) */ + int nsubexp; /* subexpression count */ + struct subre **subs; /* subRE pointer vector */ + size_t nsubs; /* length of vector */ + struct subre *sub10[10]; /* initial vector, enough for most */ + struct nfa *nfa; /* the NFA */ + struct colormap 
*cm; /* character color map */ + color nlcolor; /* color of newline */ + struct state *wordchrs; /* state in nfa holding word-char outarcs */ + struct subre *tree; /* subexpression tree */ + struct subre *treechain; /* all tree nodes allocated */ + struct subre *treefree; /* any free tree nodes */ + int ntree; /* number of tree nodes, plus one */ + struct cvec *cv; /* interface cvec */ + struct cvec *cv2; /* utility cvec */ + struct subre *lacons; /* lookaround-constraint vector */ + int nlacons; /* size of lacons[]; note that only slots + * numbered 1 .. nlacons-1 are used */ + size_t spaceused; /* approx. space used for compilation */ +}; + +/* parsing macros; most know that `v' is the struct vars pointer */ +#define NEXT() (next(v)) /* advance by one token */ +#define SEE(t) (v->nexttype == (t)) /* is next token this? */ +#define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */ +#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ +#define ISERR() VISERR(v) +#define VERR(vv,e) ((vv)->nexttype = EOS, \ + (vv)->err = ((vv)->err ? (vv)->err : (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return;} /* if error seen, return */ +#define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */ +#define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */ +#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */ +#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */ +#define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y) + +/* token type codes, some also used as NFA arc types */ +#define EMPTY 'n' /* no token present */ +#define EOS 'e' /* end of string */ +#define PLAIN 'p' /* ordinary character */ +#define DIGIT 'd' /* digit (in bound) */ +#define BACKREF 'b' /* back reference */ +#define COLLEL 'I' /* start of [. */ +#define ECLASS 'E' /* start of [= */ +#define CCLASS 'C' /* start of [: */ +#define END 'X' /* end of [. 
[= [: */ +#define RANGE 'R' /* - within [] which might be range delim. */ +#define LACON 'L' /* lookaround constraint subRE */ +#define AHEAD 'a' /* color-lookahead arc */ +#define BEHIND 'r' /* color-lookbehind arc */ +#define WBDRY 'w' /* word boundary constraint */ +#define NWBDRY 'W' /* non-word-boundary constraint */ +#define SBEGIN 'A' /* beginning of string (even if not BOL) */ +#define SEND 'Z' /* end of string (even if not EOL) */ + +/* is an arc colored, and hence on a color chain? */ +#define COLORED(a) \ + ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND) + + +/* static function list */ +static const struct fns functions = { + rfree, /* regfree insides */ + rcancelrequested, /* check for cancel request */ + rstacktoodeep /* check for stack getting dangerously deep */ +}; + + + +/* + * pg_regcomp - compile regular expression + * + * Note: on failure, no resources remain allocated, so pg_regfree() + * need not be applied to re. + */ +int +pg_regcomp(regex_t *re, + const chr *string, + size_t len, + int flags, + Oid collation) +{ + struct vars var; + struct vars *v = &var; + struct guts *g; + int i; + size_t j; + +#ifdef REG_DEBUG + FILE *debug = (flags & REG_PROGRESS) ? 
stdout : (FILE *) NULL; +#else + FILE *debug = (FILE *) NULL; +#endif + +#define CNOERR() { if (ISERR()) return freev(v, v->err); } + + /* sanity checks */ + + if (re == NULL || string == NULL) + return REG_INVARG; + if ((flags & REG_QUOTE) && + (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE))) + return REG_INVARG; + if (!(flags & REG_EXTENDED) && (flags & REG_ADVF)) + return REG_INVARG; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(collation); + + /* initial setup (after which freev() is callable) */ + v->re = re; + v->now = string; + v->stop = v->now + len; + v->savenow = v->savestop = NULL; + v->err = 0; + v->cflags = flags; + v->nsubexp = 0; + v->subs = v->sub10; + v->nsubs = 10; + for (j = 0; j < v->nsubs; j++) + v->subs[j] = NULL; + v->nfa = NULL; + v->cm = NULL; + v->nlcolor = COLORLESS; + v->wordchrs = NULL; + v->tree = NULL; + v->treechain = NULL; + v->treefree = NULL; + v->cv = NULL; + v->cv2 = NULL; + v->lacons = NULL; + v->nlacons = 0; + v->spaceused = 0; + re->re_magic = REMAGIC; + re->re_info = 0; /* bits get set during parse */ + re->re_csize = sizeof(chr); + re->re_collation = collation; + re->re_guts = NULL; + re->re_fns = VS(&functions); + + /* more complex setup, malloced things */ + re->re_guts = VS(MALLOC(sizeof(struct guts))); + if (re->re_guts == NULL) + return freev(v, REG_ESPACE); + g = (struct guts *) re->re_guts; + g->tree = NULL; + initcm(v, &g->cmap); + v->cm = &g->cmap; + g->lacons = NULL; + g->nlacons = 0; + ZAPCNFA(g->search); + v->nfa = newnfa(v, v->cm, (struct nfa *) NULL); + CNOERR(); + /* set up a reasonably-sized transient cvec for getcvec usage */ + v->cv = newcvec(100, 20); + if (v->cv == NULL) + return freev(v, REG_ESPACE); + + /* parsing */ + lexstart(v); /* also handles prefixes */ + if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH)) + { + /* assign newline a unique color */ + v->nlcolor = subcolor(v->cm, newline()); + okcolors(v->nfa, v->cm); + } + CNOERR(); + v->tree = parse(v, EOS, 
PLAIN, v->nfa->init, v->nfa->final); + assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */ + CNOERR(); + assert(v->tree != NULL); + + /* finish setup of nfa and its subre tree */ + specialcolors(v->nfa); + CNOERR(); +#ifdef REG_DEBUG + if (debug != NULL) + { + fprintf(debug, "\n\n\n========= RAW ==========\n"); + dumpnfa(v->nfa, debug); + dumpst(v->tree, debug, 1); + } +#endif + optst(v, v->tree); + v->ntree = numst(v->tree, 1); + markst(v->tree); + cleanst(v); +#ifdef REG_DEBUG + if (debug != NULL) + { + fprintf(debug, "\n\n\n========= TREE FIXED ==========\n"); + dumpst(v->tree, debug, 1); + } +#endif + + /* build compacted NFAs for tree and lacons */ + re->re_info |= nfatree(v, v->tree, debug); + CNOERR(); + assert(v->nlacons == 0 || v->lacons != NULL); + for (i = 1; i < v->nlacons; i++) + { + struct subre *lasub = &v->lacons[i]; + +#ifdef REG_DEBUG + if (debug != NULL) + fprintf(debug, "\n\n\n========= LA%d ==========\n", i); +#endif + + /* Prepend .* to pattern if it's a lookbehind LACON */ + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); + } + CNOERR(); + if (v->tree->flags & SHORTER) + NOTE(REG_USHORTEST); + + /* build compacted NFAs for tree, lacons, fast search */ +#ifdef REG_DEBUG + if (debug != NULL) + fprintf(debug, "\n\n\n========= SEARCH ==========\n"); +#endif + /* can sacrifice main NFA now, so use it as work area */ + (DISCARD) optimize(v->nfa, debug); + CNOERR(); + makesearch(v, v->nfa); + CNOERR(); + compact(v->nfa, &g->search); + CNOERR(); + + /* looks okay, package it up */ + re->re_nsub = v->nsubexp; + v->re = NULL; /* freev no longer frees re */ + g->magic = GUTSMAGIC; + g->cflags = v->cflags; + g->info = re->re_info; + g->nsub = re->re_nsub; + g->tree = v->tree; + v->tree = NULL; + g->ntree = v->ntree; + g->compare = (v->cflags & REG_ICASE) ? 
casecmp : cmp; + g->lacons = v->lacons; + v->lacons = NULL; + g->nlacons = v->nlacons; + +#ifdef REG_DEBUG + if (flags & REG_DUMP) + dump(re, stdout); +#endif + + assert(v->err == 0); + return freev(v, 0); +} + +/* + * moresubs - enlarge subRE vector + */ +static void +moresubs(struct vars *v, + int wanted) /* want enough room for this one */ +{ + struct subre **p; + size_t n; + + assert(wanted > 0 && (size_t) wanted >= v->nsubs); + n = (size_t) wanted * 3 / 2 + 1; + + if (v->subs == v->sub10) + { + p = (struct subre **) MALLOC(n * sizeof(struct subre *)); + if (p != NULL) + memcpy(VS(p), VS(v->subs), + v->nsubs * sizeof(struct subre *)); + } + else + p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *)); + if (p == NULL) + { + ERR(REG_ESPACE); + return; + } + v->subs = p; + for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) + *p = NULL; + assert(v->nsubs == n); + assert((size_t) wanted < v->nsubs); +} + +/* + * freev - free vars struct's substructures where necessary + * + * Optionally does error-number setting, and always returns error code + * (if any), to make error-handling code terser. + */ +static int +freev(struct vars *v, + int err) +{ + if (v->re != NULL) + rfree(v->re); + if (v->subs != v->sub10) + FREE(v->subs); + if (v->nfa != NULL) + freenfa(v->nfa); + if (v->tree != NULL) + freesubre(v, v->tree); + if (v->treechain != NULL) + cleanst(v); + if (v->cv != NULL) + freecvec(v->cv); + if (v->cv2 != NULL) + freecvec(v->cv2); + if (v->lacons != NULL) + freelacons(v->lacons, v->nlacons); + ERR(err); /* nop if err==0 */ + + return v->err; +} + +/* + * makesearch - turn an NFA into a search NFA (implicit prepend of .*?) + * NFA must have been optimize()d already. 
+ */ +static void +makesearch(struct vars *v, + struct nfa *nfa) +{ + struct arc *a; + struct arc *b; + struct state *pre = nfa->pre; + struct state *s; + struct state *s2; + struct state *slist; + + /* no loops are needed if it's anchored */ + for (a = pre->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) + break; + } + if (a != NULL) + { + /* add implicit .* in front */ + rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre); + + /* and ^* and \A* too -- not always necessary, but harmless */ + newarc(nfa, PLAIN, nfa->bos[0], pre, pre); + newarc(nfa, PLAIN, nfa->bos[1], pre, pre); + } + + /* + * Now here's the subtle part. Because many REs have no lookback + * constraints, often knowing when you were in the pre state tells you + * little; it's the next state(s) that are informative. But some of them + * may have other inarcs, i.e. it may be possible to make actual progress + * and then return to one of them. We must de-optimize such cases, + * splitting each such state into progress and no-progress states. + */ + + /* first, make a list of the states reachable from pre and elsewhere */ + slist = NULL; + for (a = pre->outs; a != NULL; a = a->outchain) + { + s = a->to; + for (b = s->ins; b != NULL; b = b->inchain) + { + if (b->from != pre) + break; + } + + /* + * We want to mark states as being in the list already by having non + * NULL tmp fields, but we can't just store the old slist value in tmp + * because that doesn't work for the first such state. Instead, the + * first list entry gets its own address in tmp. + */ + if (b != NULL && s->tmp == NULL) + { + s->tmp = (slist != NULL) ? 
slist : s; + slist = s; + } + } + + /* + * do the splits: each listed state s gets a new twin s2 that receives + * copies of the out-arcs of s and takes over every in-arc of s that does + * not come from pre + */ + for (s = slist; s != NULL; s = s2) + { + s2 = newstate(nfa); + NOERR(); + copyouts(nfa, s, s2); + NOERR(); + for (a = s->ins; a != NULL; a = b) + { + b = a->inchain; + if (a->from != pre) + { + cparc(nfa, a, a->from, s2); + freearc(nfa, a); + } + } + s2 = (s->tmp != s) ? s->tmp : NULL; /* next list entry; the last one points at itself */ + s->tmp = NULL; /* clean up while we're at it */ + } +} + +/* + * parse - parse an RE + * + * This is actually just the top level, which parses a bunch of branches + * tied together with '|'. They appear in the tree as the left children + * of a chain of '|' subres. + */ +static struct subre * +parse(struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookaround subRE) or PLAIN */ + struct state *init, /* initial state */ + struct state *final) /* final state */ +{ + struct state *left; /* scaffolding for branch */ + struct state *right; + struct subre *branches; /* top level */ + struct subre *branch; /* current branch */ + struct subre *t; /* temporary */ + int firstbranch; /* is this the first branch? 
*/ + + assert(stopper == ')' || stopper == EOS); + + branches = subre(v, '|', LONGER, init, final); + NOERRN(); + branch = branches; + firstbranch = 1; + do + { /* a branch */ + if (!firstbranch) + { + /* need a place to hang it */ + branch->right = subre(v, '|', LONGER, init, final); + NOERRN(); + branch = branch->right; + } + firstbranch = 0; + left = newstate(v->nfa); + right = newstate(v->nfa); + NOERRN(); + EMPTYARC(init, left); + EMPTYARC(right, final); + NOERRN(); + branch->left = parsebranch(v, stopper, type, left, right, 0); + NOERRN(); + branch->flags |= UP(branch->flags | branch->left->flags); + if ((branch->flags & ~branches->flags) != 0) /* new flags */ + for (t = branches; t != branch; t = t->right) + t->flags |= branch->flags; + } while (EAT('|')); + assert(SEE(stopper) || SEE(EOS)); + + if (!SEE(stopper)) + { /* input ended before the closing paren */ + assert(stopper == ')' && SEE(EOS)); + ERR(REG_EPAREN); + } + + /* + * optimize out simple cases: a lone branch replaces the alternation node, + * and an alternation whose flags are not MESSY has its children freed and + * its op reset + */ + if (branch == branches) + { /* only one branch */ + assert(branch->right == NULL); + t = branch->left; + branch->left = NULL; + freesubre(v, branches); + branches = t; + } + else if (!MESSY(branches->flags)) + { /* no interesting innards */ + freesubre(v, branches->left); + branches->left = NULL; + freesubre(v, branches->right); + branches->right = NULL; + branches->op = '='; + } + + return branches; +} + +/* + * parsebranch - parse one branch of an RE + * + * This mostly manages concatenation, working closely with parseqatom(). + * Concatenated things are bundled up as much as possible, with separate + * ',' nodes introduced only when necessary due to substructure. + */ +static struct subre * +parsebranch(struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookaround subRE) or PLAIN */ + struct state *left, /* leftmost state */ + struct state *right, /* rightmost state */ + int partial) /* is this only part of a branch? 
*/ +{ + struct state *lp; /* left end of current construct */ + int seencontent; /* is there anything in this branch yet? */ + struct subre *t; + + lp = left; + seencontent = 0; + t = subre(v, '=', 0, left, right); /* op '=' is tentative */ + NOERRN(); + while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) + { + if (seencontent) + { /* implicit concat operator */ + lp = newstate(v->nfa); + NOERRN(); + moveins(v->nfa, right, lp); + } + seencontent = 1; + + /* NB, recursion in parseqatom() may swallow rest of branch */ + parseqatom(v, stopper, type, lp, right, t); + NOERRN(); + } + + if (!seencontent) + { /* empty branch: noted as REG_UUNSPEC unless partial */ + if (!partial) + NOTE(REG_UUNSPEC); + assert(lp == left); + EMPTYARC(left, right); + } + + return t; +} + +/* + * parseqatom - parse one quantified atom or constraint of an RE + * + * The bookkeeping near the end cooperates very closely with parsebranch(); + * in particular, it contains a recursion that can involve parsing the rest + * of the branch, making this function's name somewhat inaccurate. + */ +static void +parseqatom(struct vars *v, + int stopper, /* EOS or ')' */ + int type, /* LACON (lookaround subRE) or PLAIN */ + struct state *lp, /* left state to hang it on */ + struct state *rp, /* right state to hang it on */ + struct subre *top) /* subtree top */ +{ + struct state *s; /* temporaries for new states */ + struct state *s2; + +#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp) + int m, + n; + struct subre *atom; /* atom's subtree */ + struct subre *t; + int cap; /* capturing parens? */ + int latype; /* lookaround constraint type */ + int subno; /* capturing-parens or backref number */ + int atomtype; + int qprefer; /* quantifier short/long preference */ + int f; + struct subre **atomp; /* where the pointer to atom is */ + + /* initial bookkeeping */ + atom = NULL; + assert(lp->nouts == 0); /* must string new code */ + assert(rp->nins == 0); /* between lp and rp */ + subno = 0; /* just to shut lint up */ + + /* an atom or constraint... 
*/ + atomtype = v->nexttype; + switch (atomtype) + { + /* + * first, constraints, which end by returning; their arcs run from lp + * to rp, some of them through newly made intermediate states + */ + case '^': + ARCV('^', 1); + if (v->cflags & REG_NLANCH) + ARCV(BEHIND, v->nlcolor); + NEXT(); + return; + break; + case '$': + ARCV('$', 1); + if (v->cflags & REG_NLANCH) + ARCV(AHEAD, v->nlcolor); + NEXT(); + return; + break; + case SBEGIN: + ARCV('^', 1); /* BOL */ + ARCV('^', 0); /* or BOS */ + NEXT(); + return; + break; + case SEND: + ARCV('$', 1); /* EOL */ + ARCV('$', 0); /* or EOS */ + NEXT(); + return; + break; + case '<': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + return; + break; + case '>': + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case WBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case NWBDRY: + wordchrs(v); /* does NEXT() */ + s = newstate(v->nfa); + NOERR(); + word(v, BEHIND, lp, s); + word(v, AHEAD, s, rp); + s = newstate(v->nfa); + NOERR(); + nonword(v, BEHIND, lp, s); + nonword(v, AHEAD, s, rp); + return; + break; + case LACON: /* lookaround constraint */ + latype = v->nextvalue; + NEXT(); + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + t = parse(v, ')', LACON, s, s2); + freesubre(v, t); /* internal structure irrelevant */ + NOERR(); + assert(SEE(')')); + NEXT(); + processlacon(v, s, s2, latype, lp, rp); + return; + break; + /* then errors, to get them out of the way */ + case '*': + case '+': + case '?': + case '{': + ERR(REG_BADRPT); + return; + break; + default: + ERR(REG_ASSERT); + return; + break; + /* then plain characters, and minor variants on that theme */ + case ')': /* unbalanced paren */ + if ((v->cflags & REG_ADVANCED) != REG_EXTENDED) + { + 
ERR(REG_EPAREN); + return; + } + /* legal in EREs due to specification botch */ + NOTE(REG_UPBOTCH); + /* fall through into case PLAIN */ + /* FALLTHROUGH */ + case PLAIN: + onechr(v, v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NOERR(); + NEXT(); + break; + case '[': + if (v->nextvalue == 1) + bracket(v, lp, rp); + else + cbracket(v, lp, rp); + assert(SEE(']') || ISERR()); + NEXT(); + break; + case '.': + rainbow(v->nfa, v->cm, PLAIN, + (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, + lp, rp); + NEXT(); + break; + /* and finally the ugly stuff */ + case '(': /* value flags as capturing or non; within a LACON parens never capture */ + cap = (type == LACON) ? 0 : v->nextvalue; + if (cap) + { + v->nsubexp++; + subno = v->nsubexp; + if ((size_t) subno >= v->nsubs) + moresubs(v, subno); + assert((size_t) subno < v->nsubs); + } + else + atomtype = PLAIN; /* something that's not '(' */ + NEXT(); + /* need new endpoints because tree will contain pointers */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + NOERR(); + atom = parse(v, ')', type, s, s2); + assert(SEE(')') || ISERR()); + NEXT(); + NOERR(); + if (cap) + { + v->subs[subno] = atom; + t = subre(v, '(', atom->flags | CAP, lp, rp); + NOERR(); + t->subno = subno; + t->left = atom; + atom = t; + } + /* postpone everything else pending possible {0} */ + break; + case BACKREF: /* the Feature From The Black Lagoon */ + INSIST(type != LACON, REG_ESUBREG); + INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); + INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); + NOERR(); + assert(v->nextvalue > 0); + atom = subre(v, 'b', BACKR, lp, rp); + NOERR(); + subno = v->nextvalue; + atom->subno = subno; + EMPTYARC(lp, rp); /* temporarily, so there's something */ + NEXT(); + break; + } + + /* ...and an atom may be followed by a quantifier */ + switch (v->nexttype) + { + case '*': + m = 0; + n = DUPINF; + qprefer = (v->nextvalue) ? 
LONGER : SHORTER; + NEXT(); + break; + case '+': + m = 1; + n = DUPINF; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '?': + m = 0; + n = 1; + qprefer = (v->nextvalue) ? LONGER : SHORTER; + NEXT(); + break; + case '{': + NEXT(); + m = scannum(v); + if (EAT(',')) + { + if (SEE(DIGIT)) + n = scannum(v); + else + n = DUPINF; + if (m > n) + { + ERR(REG_BADBR); + return; + } + /* {m,n} exercises preference, even if it's {m,m} */ + qprefer = (v->nextvalue) ? LONGER : SHORTER; + } + else + { + n = m; + /* {m} passes operand's preference through */ + qprefer = 0; + } + if (!SEE('}')) + { /* catches errors too */ + ERR(REG_BADBR); + return; + } + NEXT(); + break; + default: /* no quantifier: exactly one repetition, no preference */ + m = n = 1; + qprefer = 0; + break; + } + + /* annoying special case: {0} or {0,0} cancels everything */ + if (m == 0 && n == 0) + { + if (atom != NULL) + freesubre(v, atom); + if (atomtype == '(') + v->subs[subno] = NULL; + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + return; + } + + /* if not a messy case, avoid hard part */ + assert(!MESSY(top->flags)); + f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0); + if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) + { + if (!(m == 1 && n == 1)) + repeat(v, lp, rp, m, n); + if (atom != NULL) + freesubre(v, atom); + top->flags = f; + return; + } + + /* + * hard part: something messy + * + * That is, capturing parens, back reference, short/long clash, or an atom + * with substructure containing one of those. + */ + + /* now we'll need a subre for the contents even if they're boring */ + if (atom == NULL) + { + atom = subre(v, '=', 0, lp, rp); + NOERR(); + } + + /*---------- + * Prepare a general-purpose state skeleton. + * + * In the no-backrefs case, we want this: + * + * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] + * + * where prefix is some repetitions of atom. 
In the general case we need + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around [begin] ---atom---> [end] + * + * We make the s state here for both cases; s2 is made below if needed + *---------- + */ + s = newstate(v->nfa); /* first, new endpoints for the atom */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + NOERR(); + atom->begin = s; + atom->end = s2; + s = newstate(v->nfa); /* set up starting state */ + NOERR(); + EMPTYARC(lp, s); + NOERR(); + + /* break remaining subRE into x{...} and what follows */ + t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); + NOERR(); + t->left = atom; + atomp = &t->left; + + /* here we should recurse... but we must postpone that to the end */ + + /* + * split top into prefix and remaining: top becomes a two-child node whose + * left is a new node spanning top->begin to lp and whose right is t + */ + assert(top->op == '=' && top->left == NULL && top->right == NULL); + top->left = subre(v, '=', top->flags, top->begin, lp); + NOERR(); + top->op = '.'; + top->right = t; + + /* if it's a backref, now is the time to replicate the subNFA */ + if (atomtype == BACKREF) + { + assert(atom->begin->nouts == 1); /* just the EMPTY */ + delsub(v->nfa, atom->begin, atom->end); + assert(v->subs[subno] != NULL); + + /* + * And here's why the recursion got postponed: it must wait until the + * skeleton is filled in, because it may hit a backref that wants to + * copy the filled-in skeleton. + */ + dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, + atom->begin, atom->end); + NOERR(); + } + + /* + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. 
+ */ + if (atomtype == BACKREF) + { + /* special case: backrefs have internal quantifiers */ + EMPTYARC(s, atom->begin); /* empty prefix */ + /* just stuff everything into atom: the bounds go into atom->min and atom->max */ + repeat(v, atom->begin, atom->end, m, n); + atom->min = (short) m; + atom->max = (short) n; + atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else if (m == 1 && n == 1 && + (qprefer == 0 || + (atom->flags & (LONGER | SHORTER | MIXED)) == 0 || + qprefer == (atom->flags & (LONGER | SHORTER | MIXED)))) + { + /* no/vacuous quantifier: done */ + EMPTYARC(s, atom->begin); /* empty prefix */ + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else if (m > 0 && !(atom->flags & BACKR)) + { + /* + * If there's no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This is + * valid because we only care about capturing matches from the final + * iteration of the quantifier. It's a win because we can implement + * the backref-free left side as a plain DFA node, since we don't + * really care where its submatches are. + */ + dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); + assert(m >= 1 && m != DUPINF && n >= 1); + repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? 
n : n - 1); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '.', f, s, atom->end); /* prefix and atom */ + NOERR(); + t->left = subre(v, '=', PREF(f), s, atom->begin); + NOERR(); + t->right = atom; + *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else + { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, atom->end, s2); + NOERR(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERR(); + t->min = (short) m; + t->max = (short) n; + t->left = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ + } + + /* + * and finally, look after that postponed recursion: parse the rest of the + * branch from s2 to rp with partial set, or if nothing is left just hang + * an empty node there + */ + t = top->right; + if (!(SEE('|') || SEE(stopper) || SEE(EOS))) + t->right = parsebranch(v, stopper, type, s2, rp, 1); + else + { + EMPTYARC(s2, rp); + t->right = subre(v, '=', 0, s2, rp); + } + NOERR(); + assert(SEE('|') || SEE(stopper) || SEE(EOS)); + t->flags |= COMBINE(t->flags, t->right->flags); + top->flags |= COMBINE(top->flags, t->flags); +} + +/* + * nonword - generate arcs for non-word-character ahead or behind + */ +static void +nonword(struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + int anchor = (dir == AHEAD) ? 
'$' : '^'; + + assert(dir == AHEAD || dir == BEHIND); + newarc(v->nfa, anchor, 1, lp, rp); + newarc(v->nfa, anchor, 0, lp, rp); + colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp); + /* (no need for special attention to \n) */ +} + +/* + * word - generate arcs for word character ahead or behind + */ +static void +word(struct vars *v, + int dir, /* AHEAD or BEHIND */ + struct state *lp, + struct state *rp) +{ + assert(dir == AHEAD || dir == BEHIND); + cloneouts(v->nfa, v->wordchrs, lp, rp, dir); + /* (no need for special attention to \n) */ +} + +/* + * scannum - scan a number + * + * Reports REG_BADBR and returns 0 when digits remain after the loop stops or + * the accumulated value exceeds DUPMAX. + */ +static int /* value, <= DUPMAX */ +scannum(struct vars *v) +{ + int n = 0; + + while (SEE(DIGIT) && n < DUPMAX) + { + n = n * 10 + v->nextvalue; + NEXT(); + } + if (SEE(DIGIT) || n > DUPMAX) + { + ERR(REG_BADBR); + return 0; + } + return n; +} + +/* + * repeat - replicate subNFA for quantifiers + * + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. + * + * The duplication sequences used here are chosen carefully so that any + * pointers starting out pointing into the subexpression end up pointing into + * the last occurrence. (Note that it may not be strung between the same + * left and right end states, however!) This used to be important for the + * subRE tree, although the important bits are now handled by the in-line + * code in parse(), and when this is called, it doesn't matter any more. + */ +static void +repeat(struct vars *v, + struct state *lp, + struct state *rp, + int m, + int n) +{ +#define SOME 2 +#define INF 3 +#define PAIR(x, y) ((x)*4 + (y)) +#define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? 
SOME : (x)) ) + const int rm = REDUCE(m); + const int rn = REDUCE(n); + struct state *s; + struct state *s2; + + switch (PAIR(rm, rn)) + { + case PAIR(0, 0): /* empty string: drop the sub-NFA, then EMPTYARC(lp, rp) */ + delsub(v->nfa, lp, rp); + EMPTYARC(lp, rp); + break; + case PAIR(0, 1): /* do as x| */ + EMPTYARC(lp, rp); + break; + case PAIR(0, SOME): /* do as x{1,n}| */ + repeat(v, lp, rp, 1, n); + NOERR(); + EMPTYARC(lp, rp); + break; + case PAIR(0, INF): /* loop x around: both ends of x are moved onto one new state s */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s); + EMPTYARC(lp, s); + EMPTYARC(s, rp); + break; + case PAIR(1, 1): /* no action required */ + break; + case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, 1, n - 1); + NOERR(); + EMPTYARC(lp, s); + break; + case PAIR(1, INF): /* add loopback arc */ + s = newstate(v->nfa); + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + moveins(v->nfa, rp, s2); + EMPTYARC(lp, s); + EMPTYARC(s2, rp); + EMPTYARC(s2, s); + break; + case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m - 1, n - 1); + break; + case PAIR(SOME, INF): /* do as x{m-1,}x */ + s = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, lp, s); + dupnfa(v->nfa, s, rp, lp, s); + NOERR(); + repeat(v, lp, s, m - 1, n); + break; + default: + ERR(REG_ASSERT); + break; + } +} + +/* + * bracket - handle non-complemented bracket expression + * Also called from cbracket for complemented bracket expressions. 
+ */ +static void +bracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + assert(SEE('[')); + NEXT(); + while (!SEE(']') && !SEE(EOS)) + brackpart(v, lp, rp); + assert(SEE(']') || ISERR()); + okcolors(v->nfa, v->cm); +} + +/* + * cbracket - handle complemented bracket expression + * We do it by calling bracket() with dummy endpoints, and then complementing + * the result. The alternative would be to invoke rainbow(), and then delete + * arcs as the b.e. is seen... but that gets messy. + * + * With REG_NLSTOP set, an arc of color v->nlcolor is added between the dummy + * states before complementing. Both dummy states are disposed of before + * returning (dropstate on left, freestate on right). + */ +static void +cbracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct state *left = newstate(v->nfa); + struct state *right = newstate(v->nfa); + + NOERR(); + bracket(v, left, right); + if (v->cflags & REG_NLSTOP) + newarc(v->nfa, PLAIN, v->nlcolor, left, right); + NOERR(); + + assert(lp->nouts == 0); /* all outarcs will be ours */ + + /* + * Easy part of complementing, and all there is to do since the MCCE code + * was removed. + */ + colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); + NOERR(); + dropstate(v->nfa, left); + assert(right->nins == 0); + freestate(v->nfa, right); +} + +/* + * brackpart - handle one item (or range) within a bracket expression + */ +static void +brackpart(struct vars *v, + struct state *lp, + struct state *rp) +{ + chr startc; + chr endc; + struct cvec *cv; + const chr *startp; + const chr *endp; + chr c[1]; + + /* parse something, get rid of special cases, take shortcuts */ + switch (v->nexttype) + { + case RANGE: /* a-b-c or other botch */ + ERR(REG_ERANGE); + return; + break; + case PLAIN: + c[0] = v->nextvalue; + NEXT(); + /* shortcut for ordinary chr (not range) */ + if (!SEE(RANGE)) + { + onechr(v, c[0], lp, rp); + return; + } + startc = element(v, c, c + 1); + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + break; + case ECLASS: + startp = v->now; + endp = scanplain(v); + 
INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + startc = element(v, startp, endp); + NOERR(); + cv = eclass(v, startc, (v->cflags & REG_ICASE)); + NOERR(); + subcolorcvec(v, cv, lp, rp); + return; + break; + case CCLASS: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECTYPE); + NOERR(); + cv = cclass(v, startp, endp, (v->cflags & REG_ICASE)); + NOERR(); + subcolorcvec(v, cv, lp, rp); + return; + break; + default: + ERR(REG_ASSERT); + return; + break; + } + + if (SEE(RANGE)) + { + NEXT(); + switch (v->nexttype) + { + case PLAIN: + case RANGE: + c[0] = v->nextvalue; + NEXT(); + endc = element(v, c, c + 1); + NOERR(); + break; + case COLLEL: + startp = v->now; + endp = scanplain(v); + INSIST(startp < endp, REG_ECOLLATE); + NOERR(); + endc = element(v, startp, endp); + NOERR(); + break; + default: + ERR(REG_ERANGE); + return; + break; + } + } + else + endc = startc; /* single element, handled as a one-element range */ + + /* + * Ranges are unportable. Actually, standard C does guarantee that digits + * are contiguous, but making that an exception is just too complicated. + */ + if (startc != endc) + NOTE(REG_UUNPORT); + cv = range(v, startc, endc, (v->cflags & REG_ICASE)); + NOERR(); + subcolorcvec(v, cv, lp, rp); +} + +/* + * scanplain - scan PLAIN contents of [. etc. + * + * Certain bits of trickery in lex.c know that this code does not try + * to look past the final bracket of the [. etc. + * + * The opening token and the closing END token are both consumed here. + */ +static const chr * /* just after end of sequence */ +scanplain(struct vars *v) +{ + const chr *endp; + + assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS)); + NEXT(); + + endp = v->now; + while (SEE(PLAIN)) + { + endp = v->now; + NEXT(); + } + + assert(SEE(END) || ISERR()); + NEXT(); + + return endp; +} + +/* + * onechr - fill in arcs for a plain character, and possible case complements + * This is mostly a shortcut for efficient handling of the common case. 
+ */ +static void +onechr(struct vars *v, + chr c, + struct state *lp, + struct state *rp) +{ + if (!(v->cflags & REG_ICASE)) + { + color lastsubcolor = COLORLESS; + + subcoloronechr(v, c, lp, rp, &lastsubcolor); + return; + } + + /* rats, need general case anyway: REG_ICASE is set, so go through allcases() */ + subcolorcvec(v, allcases(v, c), lp, rp); +} + +/* + * wordchrs - set up word-chr list for word-boundary stuff, if needed + * + * The list is kept as a bunch of arcs between two dummy states; it's + * disposed of by the unreachable-states sweep in NFA optimization. + * Does NEXT(). Must not be called from any unusual lexical context. + * This should be reconciled with the \w etc. handling in lex.c, and + * should be cleaned up to reduce dependencies on input scanning. + */ +static void +wordchrs(struct vars *v) +{ + struct state *left; + struct state *right; + + if (v->wordchrs != NULL) + { + NEXT(); /* for consistency: the list already exists, but a NEXT() is still done */ + return; + } + + left = newstate(v->nfa); + right = newstate(v->nfa); + NOERR(); + /* fine point: implemented with [::], and lexer will set REG_ULOCALE */ + lexword(v); + NEXT(); + assert(v->savenow != NULL && SEE('[')); + bracket(v, left, right); + assert((v->savenow != NULL && SEE(']')) || ISERR()); + NEXT(); + NOERR(); + v->wordchrs = left; +} + +/* + * processlacon - generate the NFA representation of a LACON + * + * In the general case this is just newlacon() + newarc(), but some cases + * can be optimized. + */ +static void +processlacon(struct vars *v, + struct state *begin, /* start of parsed LACON sub-re */ + struct state *end, /* end of parsed LACON sub-re */ + int latype, + struct state *lp, /* left state to hang it on */ + struct state *rp) /* right state to hang it on */ +{ + struct state *s1; + int n; + + /* + * Check for lookaround RE consisting of a single plain color arc (or set + * of arcs); this would typically be a simple chr or a bracket expression. 
+ */ + s1 = single_color_transition(begin, end); + switch (latype) + { + case LATYPE_AHEAD_POS: + /* If lookahead RE is just colorset C, convert to AHEAD(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, AHEAD); + return; + } + break; + case LATYPE_AHEAD_NEG: + /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp); + newarc(v->nfa, '$', 1, lp, rp); + newarc(v->nfa, '$', 0, lp, rp); + return; + } + break; + case LATYPE_BEHIND_POS: + /* If lookbehind RE is just colorset C, convert to BEHIND(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, BEHIND); + return; + } + break; + case LATYPE_BEHIND_NEG: + /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp); + newarc(v->nfa, '^', 1, lp, rp); + newarc(v->nfa, '^', 0, lp, rp); + return; + } + break; + default: + assert(NOTREACHED); + } + + /* General case: we need a LACON subre and arc; the arc carries the index returned by newlacon() */ + n = newlacon(v, begin, end, latype); + newarc(v->nfa, LACON, n, lp, rp); +} + +/* + * subre - allocate a subre + * + * A node is taken from v->treefree when one is available; otherwise it is + * MALLOCed and linked into v->treechain. Returns NULL after reporting + * REG_ETOOBIG (stack too deep) or REG_ESPACE (allocation failure). + */ +static struct subre * +subre(struct vars *v, + int op, + int flags, + struct state *begin, + struct state *end) +{ + struct subre *ret = v->treefree; + + /* + * Checking for stack overflow here is sufficient to protect parse() and + * its recursive subroutines. 
+ */ + if (STACK_TOO_DEEP(v->re)) + { + ERR(REG_ETOOBIG); + return NULL; + } + + if (ret != NULL) + v->treefree = ret->left; + else + { + ret = (struct subre *) MALLOC(sizeof(struct subre)); + if (ret == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + ret->chain = v->treechain; + v->treechain = ret; + } + + assert(strchr("=b|.*(", op) != NULL); + + ret->op = op; + ret->flags = flags; + ret->id = 0; /* will be assigned later */ + ret->subno = 0; + ret->min = ret->max = 1; + ret->left = NULL; + ret->right = NULL; + ret->begin = begin; + ret->end = end; + ZAPCNFA(ret->cnfa); + + return ret; +} + +/* + * freesubre - free a subRE subtree + * + * Both children are released before the node itself. + */ +static void +freesubre(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) + return; + + if (sr->left != NULL) + freesubre(v, sr->left); + if (sr->right != NULL) + freesubre(v, sr->right); + + freesrnode(v, sr); +} + +/* + * freesrnode - free one node in a subRE subtree + * + * While v is non-NULL and v->treechain is non-NULL the node is pushed onto + * v->treefree (linked through its left field) instead of being passed to + * FREE(). Its cnfa, if any, is freed and its flags cleared either way. + */ +static void +freesrnode(struct vars *v, /* might be NULL */ + struct subre *sr) +{ + if (sr == NULL) + return; + + if (!NULLCNFA(sr->cnfa)) + freecnfa(&sr->cnfa); + sr->flags = 0; + + if (v != NULL && v->treechain != NULL) + { + /* we're still parsing, maybe we can reuse the subre */ + sr->left = v->treefree; + v->treefree = sr; + } + else + FREE(sr); +} + +/* + * optst - optimize a subRE subtree + */ +static void +optst(struct vars *v, + struct subre *t) +{ + /* + * DGP (2007-11-13): I assume it was the programmer's intent to eventually + * come back and add code to optimize subRE trees, but the routine coded + * just spends effort traversing the tree and doing nothing. We can do + * nothing with less effort. 
+ */ + return; +} + +/* + * numst - number tree nodes (assigning "id" indexes) + * + * Ids are handed out in pre-order: the node itself, then its left subtree, + * then its right subtree. The return value is the next unused number. + */ +static int /* next number */ +numst(struct subre *t, + int start) /* starting point for subtree numbers */ +{ + int i; + + assert(t != NULL); + + i = start; + t->id = (short) i++; + if (t->left != NULL) + i = numst(t->left, i); + if (t->right != NULL) + i = numst(t->right, i); + return i; +} + +/* + * markst - mark tree nodes as INUSE + * + * Note: this is a great deal more subtle than it looks. During initial + * parsing of a regex, all subres are linked into the treechain list; + * discarded ones are also linked into the treefree list for possible reuse. + * After we are done creating all subres required for a regex, we run markst() + * then cleanst(), which results in discarding all subres not reachable from + * v->tree. We then clear v->treechain, indicating that subres must be found + * by descending from v->tree. This changes the behavior of freesubre(): it + * will henceforth FREE() unwanted subres rather than sticking them into the + * treefree list. (Doing that any earlier would result in dangling links in + * the treechain list.) This all means that freev() will clean up correctly + * if invoked before or after markst()+cleanst(); but it would not work if + * called partway through this state conversion, so we mustn't error out + * in or between these two functions. 
+ */ +static void +markst(struct subre *t) +{ + assert(t != NULL); + + t->flags |= INUSE; + if (t->left != NULL) + markst(t->left); + if (t->right != NULL) + markst(t->right); +} + +/* + * cleanst - free any tree nodes not marked INUSE + * + * Walks the v->treechain list, then resets both v->treechain and + * v->treefree to NULL. + */ +static void +cleanst(struct vars *v) +{ + struct subre *t; + struct subre *next; + + for (t = v->treechain; t != NULL; t = next) + { + next = t->chain; + if (!(t->flags & INUSE)) + FREE(t); + } + v->treechain = NULL; + v->treefree = NULL; /* just on general principles */ +} + +/* + * nfatree - turn a subRE subtree into a tree of compacted NFAs + * + * Children are processed before their parent, and their results are + * discarded; only the result for t itself is returned. + */ +static long /* optimize results from top node */ +nfatree(struct vars *v, + struct subre *t, + FILE *f) /* for debug output */ +{ + assert(t != NULL && t->begin != NULL); + + if (t->left != NULL) + (DISCARD) nfatree(v, t->left, f); + if (t->right != NULL) + (DISCARD) nfatree(v, t->right, f); + + return nfanode(v, t, 0, f); +} + +/* + * nfanode - do one NFA for nfatree or lacons + * + * If converttosearch is true, apply makesearch() to the NFA. 
+ */ +static long /* optimize results */ +nfanode(struct vars *v, + struct subre *t, + int converttosearch, + FILE *f) /* for debug output */ +{ + struct nfa *nfa; + long ret = 0; + + assert(t->begin != NULL); + +#ifdef REG_DEBUG + if (f != NULL) + { + char idbuf[50]; + + fprintf(f, "\n\n\n========= TREE NODE %s ==========\n", + stid(t, idbuf, sizeof(idbuf))); + } +#endif + nfa = newnfa(v, v->cm, v->nfa); + NOERRZ(); + dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); + if (!ISERR()) + specialcolors(nfa); + if (!ISERR()) + ret = optimize(nfa, f); + if (converttosearch && !ISERR()) + makesearch(v, nfa); + if (!ISERR()) + compact(nfa, &t->cnfa); + + freenfa(nfa); + return ret; +} + +/* + * newlacon - allocate a lookaround-constraint subRE + * + * Entry 0 of v->lacons is never used, so the first call allocates two + * entries and returns 1. The lookaround type is kept in the subno field. + * Returns 0 after reporting REG_ESPACE if allocation fails. + */ +static int /* lacon number */ +newlacon(struct vars *v, + struct state *begin, + struct state *end, + int latype) +{ + int n; + struct subre *newlacons; + struct subre *sub; + + if (v->nlacons == 0) + { + n = 1; /* skip 0th */ + newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre)); + } + else + { + n = v->nlacons; + newlacons = (struct subre *) REALLOC(v->lacons, + (n + 1) * sizeof(struct subre)); + } + if (newlacons == NULL) + { + ERR(REG_ESPACE); + return 0; + } + v->lacons = newlacons; + v->nlacons = n + 1; + sub = &v->lacons[n]; + sub->begin = begin; + sub->end = end; + sub->subno = latype; + ZAPCNFA(sub->cnfa); + return n; +} + +/* + * freelacons - free lookaround-constraint subRE vector + * + * n is the entry count including the unused 0th entry; the cnfa of each + * entry from index 1 on is freed if present, then the vector itself. + */ +static void +freelacons(struct subre *subs, + int n) +{ + struct subre *sub; + int i; + + assert(n > 0); + for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */ + if (!NULLCNFA(sub->cnfa)) + freecnfa(&sub->cnfa); + FREE(subs); +} + +/* + * rfree - free a whole RE (insides of regfree) + */ +static void +rfree(regex_t *re) +{ + struct guts *g; + + if (re == NULL || re->re_magic != REMAGIC) + return; + + re->re_magic = 0; /* invalidate RE */ + g = (struct guts *) re->re_guts; + re->re_guts = NULL; + re->re_fns = 
NULL; + if (g != NULL) + { + g->magic = 0; + freecm(&g->cmap); + if (g->tree != NULL) + freesubre((struct vars *) NULL, g->tree); /* NULL vars: nodes go straight to FREE() */ + if (g->lacons != NULL) + freelacons(g->lacons, g->nlacons); + if (!NULLCNFA(g->search)) + freecnfa(&g->search); + FREE(g); + } +} + +/* + * rcancelrequested - check for external request to cancel regex operation + * + * Return nonzero to fail the operation with error code REG_CANCEL, + * zero to keep going + * + * The current implementation is Postgres-specific. If we ever get around + * to splitting the regex code out as a standalone library, there will need + * to be some API to let applications define a callback function for this. + * + * Nonzero is returned only when InterruptPending is set together with + * QueryCancelPending or ProcDiePending. + */ +static int +rcancelrequested(void) +{ + return InterruptPending && (QueryCancelPending || ProcDiePending); +} + +/* + * rstacktoodeep - check for stack getting dangerously deep + * + * Return nonzero to fail the operation with error code REG_ETOOBIG, + * zero to keep going + * + * The current implementation is Postgres-specific. If we ever get around + * to splitting the regex code out as a standalone library, there will need + * to be some API to let applications define a callback function for this. 
+ */ +static int +rstacktoodeep(void) +{ + return stack_is_too_deep(); +} + +#ifdef REG_DEBUG + +/* + * dump - dump an RE in human-readable form + */ +static void +dump(regex_t *re, + FILE *f) +{ + struct guts *g; + int i; + + if (re->re_magic != REMAGIC) + fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic, + REMAGIC); + if (re->re_guts == NULL) + { + fprintf(f, "NULL guts!!!\n"); + return; + } + g = (struct guts *) re->re_guts; + if (g->magic != GUTSMAGIC) + fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic, + GUTSMAGIC); + + fprintf(f, "\n\n\n========= DUMP ==========\n"); + fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n", + (int) re->re_nsub, re->re_info, re->re_csize, g->ntree); + + dumpcolors(&g->cmap, f); + if (!NULLCNFA(g->search)) + { + fprintf(f, "\nsearch:\n"); + dumpcnfa(&g->search, f); + } + for (i = 1; i < g->nlacons; i++) + { + struct subre *lasub = &g->lacons[i]; + const char *latype; + + switch (lasub->subno) + { + case LATYPE_AHEAD_POS: + latype = "positive lookahead"; + break; + case LATYPE_AHEAD_NEG: + latype = "negative lookahead"; + break; + case LATYPE_BEHIND_POS: + latype = "positive lookbehind"; + break; + case LATYPE_BEHIND_NEG: + latype = "negative lookbehind"; + break; + default: + latype = "???"; + break; + } + fprintf(f, "\nla%d (%s):\n", i, latype); + dumpcnfa(&lasub->cnfa, f); + } + fprintf(f, "\n"); + dumpst(g->tree, f, 0); +} + +/* + * dumpst - dump a subRE tree + */ +static void +dumpst(struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? */ +{ + if (t == NULL) + fprintf(f, "null tree\n"); + else + stdump(t, f, nfapresent); + fflush(f); +} + +/* + * stdump - recursive guts of dumpst + */ +static void +stdump(struct subre *t, + FILE *f, + int nfapresent) /* is the original NFA still around? */ +{ + char idbuf[50]; + + fprintf(f, "%s. 
`%c'", stid(t, idbuf, sizeof(idbuf)), t->op); + if (t->flags & LONGER) + fprintf(f, " longest"); + if (t->flags & SHORTER) + fprintf(f, " shortest"); + if (t->flags & MIXED) + fprintf(f, " hasmixed"); + if (t->flags & CAP) + fprintf(f, " hascapture"); + if (t->flags & BACKR) + fprintf(f, " hasbackref"); + if (!(t->flags & INUSE)) + fprintf(f, " UNUSED"); + if (t->subno != 0) + fprintf(f, " (#%d)", t->subno); + if (t->min != 1 || t->max != 1) + { + fprintf(f, " {%d,", t->min); + if (t->max != DUPINF) + fprintf(f, "%d", t->max); + fprintf(f, "}"); + } + if (nfapresent) + fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no); + if (t->left != NULL) + fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); + if (t->right != NULL) + fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); + if (!NULLCNFA(t->cnfa)) + { + fprintf(f, "\n"); + dumpcnfa(&t->cnfa, f); + } + fprintf(f, "\n"); + if (t->left != NULL) + stdump(t->left, f, nfapresent); + if (t->right != NULL) + stdump(t->right, f, nfapresent); +} + +/* + * stid - identify a subtree node for dumping + */ +static const char * /* points to buf or constant string */ +stid(struct subre *t, + char *buf, + size_t bufsize) +{ + /* big enough for hex int or decimal t->id? */ + if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1) + return "unable"; + if (t->id != 0) + sprintf(buf, "%d", t->id); + else + sprintf(buf, "%p", t); + return buf; +} +#endif /* REG_DEBUG */ + + +#include "regc_lex.c" +#include "regc_color.c" +#include "regc_nfa.c" +#include "regc_cvec.c" +#include "regc_pg_locale.c" +#include "regc_locale.c" diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c new file mode 100644 index 0000000..5695e15 --- /dev/null +++ b/src/backend/regex/rege_dfa.c @@ -0,0 +1,929 @@ +/* + * DFA routines + * This file is #included by regexec.c. + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 
+ * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/rege_dfa.c + * + */ + +/* + * longest - longest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. + */ +static chr * +longest(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *stop, /* match must end at or before here */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realstop = (stop == v->stop) ? 
stop : stop + 1; + color co; + struct sset *css; + struct sset *ss; + chr *post; + int i; + struct colormap *cm = d->cm; + + /* prevent "uninitialized variable" warnings */ + if (hitstopp != NULL) + *hitstopp = 0; + + /* initialize */ + css = initialize(v, d, start); + if (css == NULL) + return NULL; + cp = start; + + /* startup */ + FDEBUG(("+++ startup +++\n")); + if (cp == v->start) + { + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + } + else + { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < realstop) + { + FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + else +#endif + { + while (cp < realstop) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + if (ISERR()) + return NULL; + + /* shutdown */ + FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets))); + if (cp == v->stop && stop == v->stop) + { + if (hitstopp != NULL) + *hitstopp = 1; + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, start); + if (ISERR()) + return NULL; + /* special case: match ended at eol? 
*/ + if (ss != NULL && (ss->flags & POSTSTATE)) + return cp; + else if (ss != NULL) + ss->lastseen = cp; /* to be tidy */ + } + + /* find last match, if any */ + post = d->lastpost; + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) + if ((ss->flags & POSTSTATE) && post != ss->lastseen && + (post == NULL || post < ss->lastseen)) + post = ss->lastseen; + if (post != NULL) /* found one */ + return post - 1; + + return NULL; +} + +/* + * shortest - shortest-preferred matching engine + * + * On success, returns match endpoint address. Returns NULL on no match. + * Internal errors also return NULL, with v->err set. + */ +static chr * +shortest(struct vars *v, + struct dfa *d, + chr *start, /* where the match should start */ + chr *min, /* match must end at or after here */ + chr *max, /* match must end at or before here */ + chr **coldp, /* store coldstart pointer here, if non-NULL */ + int *hitstopp) /* record whether hit v->stop, if non-NULL */ +{ + chr *cp; + chr *realmin = (min == v->stop) ? min : min + 1; + chr *realmax = (max == v->stop) ? max : max + 1; + color co; + struct sset *css; + struct sset *ss; + struct colormap *cm = d->cm; + + /* prevent "uninitialized variable" warnings */ + if (coldp != NULL) + *coldp = NULL; + if (hitstopp != NULL) + *hitstopp = 0; + + /* initialize */ + css = initialize(v, d, start); + if (css == NULL) + return NULL; + cp = start; + + /* startup */ + FDEBUG(("--- startup ---\n")); + if (cp == v->start) + { + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + } + else + { + co = GETCOLOR(cm, *(cp - 1)); + FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co)); + } + css = miss(v, d, css, co, cp, start); + if (css == NULL) + return NULL; + css->lastseen = cp; + ss = css; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. 
+ */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < realmax) + { + FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags & POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + } + else +#endif + { + while (cp < realmax) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + if ((ss->flags & POSTSTATE) && cp >= realmin) + break; /* NOTE BREAK OUT */ + } + } + + if (ss == NULL) + return NULL; + + if (coldp != NULL) /* report last no-progress state set, if any */ + *coldp = lastcold(v, d); + + if ((ss->flags & POSTSTATE) && cp > min) + { + assert(cp >= realmin); + cp--; + } + else if (cp == v->stop && max == v->stop) + { + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, start); + /* match might have ended at eol */ + if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL) + *hitstopp = 1; + } + + if (ss == NULL || !(ss->flags & POSTSTATE)) + return NULL; + + return cp; +} + +/* + * matchuntil - incremental matching engine + * + * This is meant for use with a search-style NFA (that is, the pattern is + * known to act as though it had a leading .*). We determine whether a + * match exists starting at v->start and ending at probe. Multiple calls + * require only O(N) time not O(N^2) so long as the probe values are + * nondecreasing. *lastcss and *lastcp must be initialized to NULL before + * starting a series of calls. + * + * Returns 1 if a match exists, 0 if not. + * Internal errors also return 0, with v->err set. 
+ */ +static int +matchuntil(struct vars *v, + struct dfa *d, + chr *probe, /* we want to know if a match ends here */ + struct sset **lastcss, /* state storage across calls */ + chr **lastcp) /* state storage across calls */ +{ + chr *cp = *lastcp; + color co; + struct sset *css = *lastcss; + struct sset *ss; + struct colormap *cm = d->cm; + + /* initialize and startup, or restart, if necessary */ + if (cp == NULL || cp > probe) + { + cp = v->start; + css = initialize(v, d, cp); + if (css == NULL) + return 0; + + FDEBUG((">>> startup >>>\n")); + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + + css = miss(v, d, css, co, cp, v->start); + if (css == NULL) + return 0; + css->lastseen = cp; + } + else if (css == NULL) + { + /* we previously found that no match is possible beyond *lastcp */ + return 0; + } + ss = css; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. 
+ */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < probe) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + else +#endif + { + while (cp < probe) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + *lastcss = ss; + *lastcp = cp; + + if (ss == NULL) + return 0; /* impossible match, or internal error */ + + /* We need to process one more chr, or the EOS symbol, to check match */ + if (cp < v->stop) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + ss = miss(v, d, css, co, cp + 1, v->start); + } + else + { + assert(cp == v->stop); + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 
0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, v->start); + } + + if (ss == NULL || !(ss->flags & POSTSTATE)) + return 0; + + return 1; +} + +/* + * lastcold - determine last point at which no progress had been made + */ +static chr * /* endpoint, or NULL */ +lastcold(struct vars *v, + struct dfa *d) +{ + struct sset *ss; + chr *nopr; + int i; + + nopr = d->lastnopr; + if (nopr == NULL) + nopr = v->start; + for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--) + if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen) + nopr = ss->lastseen; + return nopr; +} + +/* + * newdfa - set up a fresh DFA + */ +static struct dfa * +newdfa(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct smalldfa *sml) /* preallocated space, may be NULL */ +{ + struct dfa *d; + size_t nss = cnfa->nstates * 2; + int wordsper = (cnfa->nstates + UBITS - 1) / UBITS; + struct smalldfa *smallwas = sml; + + assert(cnfa != NULL && cnfa->nstates != 0); + + if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) + { + assert(wordsper == 1); + if (sml == NULL) + { + sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa)); + if (sml == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + } + d = &sml->dfa; + d->ssets = sml->ssets; + d->statesarea = sml->statesarea; + d->work = &d->statesarea[nss]; + d->outsarea = sml->outsarea; + d->incarea = sml->incarea; + d->cptsmalloced = 0; + d->mallocarea = (smallwas == NULL) ? 
(char *) sml : NULL; + } + else + { + d = (struct dfa *) MALLOC(sizeof(struct dfa)); + if (d == NULL) + { + ERR(REG_ESPACE); + return NULL; + } + d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset)); + d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper * + sizeof(unsigned)); + d->work = &d->statesarea[nss * wordsper]; + d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors * + sizeof(struct sset *)); + d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors * + sizeof(struct arcp)); + d->cptsmalloced = 1; + d->mallocarea = (char *) d; + if (d->ssets == NULL || d->statesarea == NULL || + d->outsarea == NULL || d->incarea == NULL) + { + freedfa(d); + ERR(REG_ESPACE); + return NULL; + } + } + + d->nssets = (v->eflags & REG_SMALL) ? 7 : nss; + d->nssused = 0; + d->nstates = cnfa->nstates; + d->ncolors = cnfa->ncolors; + d->wordsper = wordsper; + d->cnfa = cnfa; + d->cm = cm; + d->lastpost = NULL; + d->lastnopr = NULL; + d->search = d->ssets; + + /* initialization of sset fields is done as needed */ + + return d; +} + +/* + * freedfa - free a DFA + */ +static void +freedfa(struct dfa *d) +{ + if (d->cptsmalloced) + { + if (d->ssets != NULL) + FREE(d->ssets); + if (d->statesarea != NULL) + FREE(d->statesarea); + if (d->outsarea != NULL) + FREE(d->outsarea); + if (d->incarea != NULL) + FREE(d->incarea); + } + + if (d->mallocarea != NULL) + FREE(d->mallocarea); +} + +/* + * hash - construct a hash code for a bitvector + * + * There are probably better ways, but they're more expensive. + */ +static unsigned +hash(unsigned *uv, + int n) +{ + int i; + unsigned h; + + h = 0; + for (i = 0; i < n; i++) + h ^= uv[i]; + return h; +} + +/* + * initialize - hand-craft a cache entry for startup, otherwise get ready + */ +static struct sset * +initialize(struct vars *v, + struct dfa *d, + chr *start) +{ + struct sset *ss; + int i; + + /* is previous one still there? 
*/ + if (d->nssused > 0 && (d->ssets[0].flags & STARTER)) + ss = &d->ssets[0]; + else + { /* no, must (re)build it */ + ss = getvacant(v, d, start, start); + if (ss == NULL) + return NULL; + for (i = 0; i < d->wordsper; i++) + ss->states[i] = 0; + BSET(ss->states, d->cnfa->pre); + ss->hash = HASH(ss->states, d->wordsper); + assert(d->cnfa->pre != d->cnfa->post); + ss->flags = STARTER | LOCKED | NOPROGRESS; + /* lastseen dealt with below */ + } + + for (i = 0; i < d->nssused; i++) + d->ssets[i].lastseen = NULL; + ss->lastseen = start; /* maybe untrue, but harmless */ + d->lastpost = NULL; + d->lastnopr = NULL; + return ss; +} + +/* + * miss - handle a stateset cache miss + * + * css is the current stateset, co is the color of the current input character, + * cp points to the character after that (which is where we may need to test + * LACONs). start does not affect matching behavior but is needed for pickss' + * heuristics about which stateset cache entry to replace. + * + * Ordinarily, returns the address of the next stateset (the one that is + * valid after consuming the input character). Returns NULL if no valid + * NFA states remain, ie we have a certain match failure. + * Internal errors also return NULL, with v->err set. + */ +static struct sset * +miss(struct vars *v, + struct dfa *d, + struct sset *css, + color co, + chr *cp, /* next chr */ + chr *start) /* where the attempt got started */ +{ + struct cnfa *cnfa = d->cnfa; + int i; + unsigned h; + struct carc *ca; + struct sset *p; + int ispost; + int noprogress; + int gotstate; + int dolacons; + int sawlacons; + + /* for convenience, we can be called even if it might not be a miss */ + if (css->outs[co] != NULL) + { + FDEBUG(("hit\n")); + return css->outs[co]; + } + FDEBUG(("miss\n")); + + /* + * Checking for operation cancel in the inner text search loop seems + * unduly expensive. As a compromise, check during cache misses. 
+ */ + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + + /* + * What set of states would we end up in after consuming the co character? + * We first consider PLAIN arcs that consume the character, and then look + * to see what LACON arcs could be traversed after consuming it. + */ + for (i = 0; i < d->wordsper; i++) + d->work[i] = 0; /* build new stateset bitmap in d->work */ + ispost = 0; + noprogress = 1; + gotstate = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(css->states, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + if (ca->co == co) + { + BSET(d->work, ca->to); + gotstate = 1; + if (ca->to == cnfa->post) + ispost = 1; + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) + noprogress = 0; + FDEBUG(("%d -> %d\n", i, ca->to)); + } + if (!gotstate) + return NULL; /* character cannot reach any new state */ + dolacons = (cnfa->flags & HASLACONS); + sawlacons = 0; + /* outer loop handles transitive closure of reachable-by-LACON states */ + while (dolacons) + { + dolacons = 0; + for (i = 0; i < d->nstates; i++) + if (ISBSET(d->work, i)) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + continue; /* not a LACON arc */ + if (ISBSET(d->work, ca->to)) + continue; /* arc would be a no-op anyway */ + sawlacons = 1; /* this LACON affects our result */ + if (!lacon(v, cnfa, cp, ca->co)) + { + if (ISERR()) + return NULL; + continue; /* LACON arc cannot be traversed */ + } + if (ISERR()) + return NULL; + BSET(d->work, ca->to); + dolacons = 1; + if (ca->to == cnfa->post) + ispost = 1; + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) + noprogress = 0; + FDEBUG(("%d :> %d\n", i, ca->to)); + } + } + h = HASH(d->work, d->wordsper); + + /* Is this stateset already in the cache? 
*/ + for (p = d->ssets, i = d->nssused; i > 0; p++, i--) + if (HIT(h, d->work, p, d->wordsper)) + { + FDEBUG(("cached c%d\n", (int) (p - d->ssets))); + break; /* NOTE BREAK OUT */ + } + if (i == 0) + { /* nope, need a new cache entry */ + p = getvacant(v, d, cp, start); + if (p == NULL) + return NULL; + assert(p != css); + for (i = 0; i < d->wordsper; i++) + p->states[i] = d->work[i]; + p->hash = h; + p->flags = (ispost) ? POSTSTATE : 0; + if (noprogress) + p->flags |= NOPROGRESS; + /* lastseen to be dealt with by caller */ + } + + /* + * Link new stateset to old, unless a LACON affected the result, in which + * case we don't create the link. That forces future transitions across + * this same arc (same prior stateset and character color) to come through + * miss() again, so that we can recheck the LACON(s), which might or might + * not pass since context will be different. + */ + if (!sawlacons) + { + FDEBUG(("c%d[%d]->c%d\n", + (int) (css - d->ssets), co, (int) (p - d->ssets))); + css->outs[co] = p; + css->inchain[co] = p->ins; + p->ins.ss = css; + p->ins.co = co; + } + return p; +} + +/* + * lacon - lookaround-constraint checker for miss() + */ +static int /* predicate: constraint satisfied? 
*/ +lacon(struct vars *v, + struct cnfa *pcnfa, /* parent cnfa */ + chr *cp, + color co) /* "color" of the lookaround constraint */ +{ + int n; + struct subre *sub; + struct dfa *d; + chr *end; + int satisfied; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(v->re)) + { + ERR(REG_ETOOBIG); + return 0; + } + + n = co - pcnfa->ncolors; + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); + FDEBUG(("=== testing lacon %d\n", n)); + sub = &v->g->lacons[n]; + d = getladfa(v, n); + if (d == NULL) + return 0; + if (LATYPE_IS_AHEAD(sub->subno)) + { + /* used to use longest() here, but shortest() could be much cheaper */ + end = shortest(v, d, cp, cp, v->stop, + (chr **) NULL, (int *) NULL); + satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); + } + else + { + /* + * To avoid doing O(N^2) work when repeatedly testing a lookbehind + * constraint in an N-character string, we use matchuntil() which can + * cache the DFA state across calls. We only need to restart if the + * probe point decreases, which is not common. The NFA we're using is + * a search NFA, so it doesn't mind scanning over stuff before the + * nominal match. + */ + satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); + if (!LATYPE_IS_POS(sub->subno)) + satisfied = !satisfied; + } + FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); + return satisfied; +} + +/* + * getvacant - get a vacant state set + * + * This routine clears out the inarcs and outarcs, but does not otherwise + * clear the innards of the state set -- that's up to the caller. 
+ */ +static struct sset * +getvacant(struct vars *v, + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *p; + struct arcp ap; + color co; + + ss = pickss(v, d, cp, start); + if (ss == NULL) + return NULL; + assert(!(ss->flags & LOCKED)); + + /* clear out its inarcs, including self-referential ones */ + ap = ss->ins; + while ((p = ap.ss) != NULL) + { + co = ap.co; + FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co)); + p->outs[co] = NULL; + ap = p->inchain[co]; + p->inchain[co].ss = NULL; /* paranoia */ + } + ss->ins.ss = NULL; + + /* take it off the inarc chains of the ssets reached by its outarcs */ + for (i = 0; i < d->ncolors; i++) + { + p = ss->outs[i]; + assert(p != ss); /* not self-referential */ + if (p == NULL) + continue; /* NOTE CONTINUE */ + FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets))); + if (p->ins.ss == ss && p->ins.co == i) + p->ins = ss->inchain[i]; + else + { + struct arcp lastap = {NULL, 0}; + + assert(p->ins.ss != NULL); + for (ap = p->ins; ap.ss != NULL && + !(ap.ss == ss && ap.co == i); + ap = ap.ss->inchain[ap.co]) + lastap = ap; + assert(ap.ss != NULL); + lastap.ss->inchain[lastap.co] = ss->inchain[i]; + } + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + + /* if ss was a success state, may need to remember location */ + if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost && + (d->lastpost == NULL || d->lastpost < ss->lastseen)) + d->lastpost = ss->lastseen; + + /* likewise for a no-progress state */ + if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr && + (d->lastnopr == NULL || d->lastnopr < ss->lastseen)) + d->lastnopr = ss->lastseen; + + return ss; +} + +/* + * pickss - pick the next stateset to be used + */ +static struct sset * +pickss(struct vars *v, + struct dfa *d, + chr *cp, + chr *start) +{ + int i; + struct sset *ss; + struct sset *end; + chr *ancient; + + /* shortcut for cases where cache isn't full */ + if (d->nssused < 
d->nssets) + { + i = d->nssused; + d->nssused++; + ss = &d->ssets[i]; + FDEBUG(("new c%d\n", i)); + /* set up innards */ + ss->states = &d->statesarea[i * d->wordsper]; + ss->flags = 0; + ss->ins.ss = NULL; + ss->ins.co = WHITE; /* give it some value */ + ss->outs = &d->outsarea[i * d->ncolors]; + ss->inchain = &d->incarea[i * d->ncolors]; + for (i = 0; i < d->ncolors; i++) + { + ss->outs[i] = NULL; + ss->inchain[i].ss = NULL; + } + return ss; + } + + /* look for oldest, or old enough anyway */ + if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */ + ancient = cp - d->nssets * 2 / 3; + else + ancient = start; + for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags & LOCKED)) + { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); + return ss; + } + for (ss = d->ssets, end = d->search; ss < end; ss++) + if ((ss->lastseen == NULL || ss->lastseen < ancient) && + !(ss->flags & LOCKED)) + { + d->search = ss + 1; + FDEBUG(("replacing c%d\n", (int) (ss - d->ssets))); + return ss; + } + + /* nobody's old enough?!? -- something's really wrong */ + FDEBUG(("cannot find victim to replace!\n")); + ERR(REG_ASSERT); + return NULL; +} diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c new file mode 100644 index 0000000..4a27c25 --- /dev/null +++ b/src/backend/regex/regerror.c @@ -0,0 +1,120 @@ +/* + * regerror - error-code expansion + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. 
+ * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * src/backend/regex/regerror.c + * + */ + +#include "regex/regguts.h" + +/* unknown-error explanation */ +static const char unk[] = "*** unknown regex error code 0x%x ***"; + +/* struct to map among codes, code names, and explanations */ +static const struct rerr +{ + int code; + const char *name; + const char *explain; +} rerrs[] = + +{ + /* the actual table is built from regex.h */ +#include "regex/regerrs.h" /* pgrminclude ignore */ + { + -1, "", "oops" + }, /* explanation special-cased in code */ +}; + +/* + * pg_regerror - the interface to error numbers + */ +/* ARGSUSED */ +size_t /* actual space needed (including NUL) */ +pg_regerror(int errcode, /* error code, or REG_ATOI or REG_ITOA */ + const regex_t *preg, /* associated regex_t (unused at present) */ + char *errbuf, /* result buffer (unless errbuf_size==0) */ + size_t errbuf_size) /* available space in errbuf, can be 0 */ +{ + const struct rerr *r; + const char *msg; + char convbuf[sizeof(unk) + 50]; /* 50 = plenty for int */ + size_t len; + int icode; + + switch (errcode) + { + case REG_ATOI: /* convert name to number */ + for (r = rerrs; r->code >= 0; r++) + if (strcmp(r->name, errbuf) == 0) + break; + sprintf(convbuf, "%d", r->code); /* -1 for unknown */ + msg = convbuf; + break; + case REG_ITOA: /* convert number to name */ + icode = atoi(errbuf); /* not our problem if this fails */ + for (r = rerrs; r->code >= 0; r++) + if (r->code == icode) + break; + if (r->code >= 0) + msg = r->name; + else + { /* unknown; tell him the number */ + sprintf(convbuf, "REG_%u", (unsigned) icode); + msg = convbuf; + } + break; + default: /* a real, normal error code */ + for (r = rerrs; r->code >= 0; r++) + if (r->code == errcode) + break; + if (r->code >= 0) + msg = r->explain; + else + { /* unknown; say so */ + sprintf(convbuf, unk, errcode); + msg = convbuf; + } + break; + } + + len = strlen(msg) + 1; /* space needed, including NUL */ + if (errbuf_size > 0) + { + if (errbuf_size > len) + strcpy(errbuf, msg); 
+ else + { /* truncate to fit */ + memcpy(errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = '\0'; + } + } + + return len; +} diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c new file mode 100644 index 0000000..f7eaa76 --- /dev/null +++ b/src/backend/regex/regexec.c @@ -0,0 +1,1425 @@ +/* + * re_*exec and friends - match REs + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * src/backend/regex/regexec.c + * + */ + +#include "regex/regguts.h" + + + +/* lazy-DFA representation */ +struct arcp +{ /* "pointer" to an outarc */ + struct sset *ss; + color co; +}; + +struct sset +{ /* state set */ + unsigned *states; /* pointer to bitvector */ + unsigned hash; /* hash of bitvector */ +#define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) +#define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ + memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) + int flags; +#define STARTER 01 /* the initial state set */ +#define POSTSTATE 02 /* includes the goal state */ +#define LOCKED 04 /* locked in cache */ +#define NOPROGRESS 010 /* zero-progress state set */ + struct arcp ins; /* chain of inarcs pointing here */ + chr *lastseen; /* last entered on arrival here */ + struct sset **outs; /* outarc vector indexed by color */ + struct arcp *inchain; /* chain-pointer vector for outarcs */ +}; + +struct dfa +{ + int nssets; /* size of cache */ + int nssused; /* how many entries occupied yet */ + int nstates; /* number of states */ + int ncolors; /* length of outarc and inchain vectors */ + int wordsper; /* length of state-set bitvectors */ + struct sset *ssets; /* state-set cache */ + unsigned *statesarea; /* bitvector storage */ + unsigned *work; /* pointer to work area within statesarea */ + struct sset **outsarea; /* outarc-vector storage */ + struct arcp *incarea; /* inchain storage */ + struct cnfa *cnfa; + struct colormap *cm; + chr *lastpost; /* location of last cache-flushed success */ + chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ + struct sset *search; /* replacement-search-pointer memory */ + int cptsmalloced; /* were the areas individually malloced? 
*/ + char *mallocarea; /* self, or master malloced area, or NULL */ +}; + +#define WORK 1 /* number of work bitvectors needed */ + +/* setup for non-malloc allocation for small cases */ +#define FEWSTATES 20 /* must be less than UBITS */ +#define FEWCOLORS 15 +struct smalldfa +{ + struct dfa dfa; + struct sset ssets[FEWSTATES * 2]; + unsigned statesarea[FEWSTATES * 2 + WORK]; + struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS]; + struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; +}; + +#define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ + + + +/* internal variables, bundled for easy passing around */ +struct vars +{ + regex_t *re; + struct guts *g; + int eflags; /* copies of arguments */ + size_t nmatch; + regmatch_t *pmatch; + rm_detail_t *details; + chr *start; /* start of string */ + chr *search_start; /* search start of string */ + chr *stop; /* just past end of string */ + int err; /* error code if any (0 none) */ + struct dfa **subdfas; /* per-tree-subre DFAs */ + struct dfa **ladfas; /* per-lacon-subre DFAs */ + struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */ + chr **lblastcp; /* per-lacon-subre lookbehind restart data */ + struct smalldfa dfa1; + struct smalldfa dfa2; +}; + +#define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ +#define ISERR() VISERR(v) +#define VERR(vv,e) ((vv)->err = ((vv)->err ? 
(vv)->err : (e))) +#define ERR(e) VERR(v, e) /* record an error */ +#define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ +#define OFF(p) ((p) - v->start) +#define LOFF(p) ((long)OFF(p)) + + + +/* + * forward declarations + */ +/* === regexec.c === */ +static struct dfa *getsubdfa(struct vars *, struct subre *); +static struct dfa *getladfa(struct vars *, int); +static int find(struct vars *, struct cnfa *, struct colormap *); +static int cfind(struct vars *, struct cnfa *, struct colormap *); +static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); +static void zapallsubs(regmatch_t *, size_t); +static void zaptreesubs(struct vars *, struct subre *); +static void subset(struct vars *, struct subre *, chr *, chr *); +static int cdissect(struct vars *, struct subre *, chr *, chr *); +static int ccondissect(struct vars *, struct subre *, chr *, chr *); +static int crevcondissect(struct vars *, struct subre *, chr *, chr *); +static int cbrdissect(struct vars *, struct subre *, chr *, chr *); +static int caltdissect(struct vars *, struct subre *, chr *, chr *); +static int citerdissect(struct vars *, struct subre *, chr *, chr *); +static int creviterdissect(struct vars *, struct subre *, chr *, chr *); + +/* === rege_dfa.c === */ +static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); +static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); +static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **); +static chr *lastcold(struct vars *, struct dfa *); +static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); +static void freedfa(struct dfa *); +static unsigned hash(unsigned *, int); +static struct sset *initialize(struct vars *, struct dfa *, chr *); +static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *); +static int lacon(struct vars *, struct cnfa *, 
chr *, color); +static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); +static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); + + +/* + * pg_regexec - match regular expression + */ +int +pg_regexec(regex_t *re, + const chr *string, + size_t len, + size_t search_start, + rm_detail_t *details, + size_t nmatch, + regmatch_t pmatch[], + int flags) +{ + struct vars var; + register struct vars *v = &var; + int st; + size_t n; + size_t i; + int backref; + +#define LOCALMAT 20 + regmatch_t mat[LOCALMAT]; + +#define LOCALDFAS 40 + struct dfa *subdfas[LOCALDFAS]; + + /* sanity checks */ + if (re == NULL || string == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + v->re = re; + v->g = (struct guts *) re->re_guts; + if ((v->g->cflags & REG_EXPECT) && details == NULL) + return REG_INVARG; + if (v->g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + backref = (v->g->info & REG_UBACKREF) ? 
1 : 0; + v->eflags = flags; + if (v->g->cflags & REG_NOSUB) + nmatch = 0; /* override client */ + v->nmatch = nmatch; + if (backref) + { + /* need work area */ + if (v->g->nsub + 1 <= LOCALMAT) + v->pmatch = mat; + else + v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) * + sizeof(regmatch_t)); + if (v->pmatch == NULL) + return REG_ESPACE; + v->nmatch = v->g->nsub + 1; + } + else + v->pmatch = pmatch; + v->details = details; + v->start = (chr *) string; + v->search_start = (chr *) string + search_start; + v->stop = (chr *) string + len; + v->err = 0; + v->subdfas = NULL; + v->ladfas = NULL; + v->lblastcss = NULL; + v->lblastcp = NULL; + /* below this point, "goto cleanup" will behave sanely */ + + assert(v->g->ntree >= 0); + n = (size_t) v->g->ntree; + if (n <= LOCALDFAS) + v->subdfas = subdfas; + else + { + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + } + for (i = 0; i < n; i++) + v->subdfas[i] = NULL; + + assert(v->g->nlacons >= 0); + n = (size_t) v->g->nlacons; + if (n > 0) + { + v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->ladfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + v->ladfas[i] = NULL; + v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *)); + v->lblastcp = (chr **) MALLOC(n * sizeof(chr *)); + if (v->lblastcss == NULL || v->lblastcp == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + { + v->lblastcss[i] = NULL; + v->lblastcp[i] = NULL; + } + } + + /* do it */ + assert(v->g->tree != NULL); + if (backref) + st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); + else + st = find(v, &v->g->tree->cnfa, &v->g->cmap); + + /* copy (portion of) match vector over if necessary */ + if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) + { + zapallsubs(pmatch, nmatch); + n = (nmatch < v->nmatch) ? 
nmatch : v->nmatch; + memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); + } + + /* clean up */ +cleanup: + if (v->pmatch != pmatch && v->pmatch != mat) + FREE(v->pmatch); + if (v->subdfas != NULL) + { + n = (size_t) v->g->ntree; + for (i = 0; i < n; i++) + { + if (v->subdfas[i] != NULL) + freedfa(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); + } + if (v->ladfas != NULL) + { + n = (size_t) v->g->nlacons; + for (i = 0; i < n; i++) + { + if (v->ladfas[i] != NULL) + freedfa(v->ladfas[i]); + } + FREE(v->ladfas); + } + if (v->lblastcss != NULL) + FREE(v->lblastcss); + if (v->lblastcp != NULL) + FREE(v->lblastcp); + + return st; +} + +/* + * getsubdfa - create or re-fetch the DFA for a tree subre node + * + * We only need to create the DFA once per overall regex execution. + * The DFA will be freed by the cleanup step in pg_regexec(). + */ +static struct dfa * +getsubdfa(struct vars *v, + struct subre *t) +{ + if (v->subdfas[t->id] == NULL) + { + v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->subdfas[t->id]; +} + +/* + * getladfa - create or re-fetch the DFA for a LACON subre node + * + * Same as above, but for LACONs. + */ +static struct dfa * +getladfa(struct vars *v, + int n) +{ + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); + + if (v->ladfas[n] == NULL) + { + struct subre *sub = &v->g->lacons[n]; + + v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->ladfas[n]; +} + +/* + * find - find a match for the main NFA (no-complications case) + */ +static int +find(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *begin; + chr *end = NULL; + chr *cold; + chr *open; /* open and close of range of possible starts */ + chr *close; + int hitend; + int shorter = (v->g->tree->flags & SHORTER) ? 
1 : 0; + + /* first, a shot with the search RE */ + s = newdfa(v, &v->g->search, cm, &v->dfa1); + assert(!(ISERR() && s != NULL)); + NOERR(); + MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); + cold = NULL; + close = shortest(v, s, v->search_start, v->search_start, v->stop, + &cold, (int *) NULL); + freedfa(s); + NOERR(); + if (v->g->cflags & REG_EXPECT) + { + assert(v->details != NULL); + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (close == NULL) /* not found */ + return REG_NOMATCH; + if (v->nmatch == 0) /* found, don't need exact location */ + return REG_OKAY; + + /* find starting point and match */ + assert(cold != NULL); + open = cold; + cold = NULL; + MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); + d = newdfa(v, cnfa, cm, &v->dfa1); + assert(!(ISERR() && d != NULL)); + NOERR(); + for (begin = open; begin <= close; begin++) + { + MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); + if (shorter) + end = shortest(v, d, begin, begin, v->stop, + (chr **) NULL, &hitend); + else + end = longest(v, d, begin, v->stop, &hitend); + if (ISERR()) + { + freedfa(d); + return v->err; + } + if (hitend && cold == NULL) + cold = begin; + if (end != NULL) + break; /* NOTE BREAK OUT */ + } + assert(end != NULL); /* search RE succeeded so loop should */ + freedfa(d); + + /* and pin down details */ + assert(v->nmatch > 0); + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + if (v->g->cflags & REG_EXPECT) + { + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + if (v->nmatch == 1) /* no need for submatches */ + return REG_OKAY; + + /* find submatches */ + zapallsubs(v->pmatch, v->nmatch); + return cdissect(v, v->g->tree, begin, end); +} + +/* + * cfind - find a match for the main NFA (with 
complications) + */ +static int +cfind(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm) +{ + struct dfa *s; + struct dfa *d; + chr *cold; + int ret; + + s = newdfa(v, &v->g->search, cm, &v->dfa1); + NOERR(); + d = newdfa(v, cnfa, cm, &v->dfa2); + if (ISERR()) + { + assert(d == NULL); + freedfa(s); + return v->err; + } + + ret = cfindloop(v, cnfa, cm, d, s, &cold); + + freedfa(d); + freedfa(s); + NOERR(); + if (v->g->cflags & REG_EXPECT) + { + assert(v->details != NULL); + if (cold != NULL) + v->details->rm_extend.rm_so = OFF(cold); + else + v->details->rm_extend.rm_so = OFF(v->stop); + v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ + } + return ret; +} + +/* + * cfindloop - the heart of cfind + */ +static int +cfindloop(struct vars *v, + struct cnfa *cnfa, + struct colormap *cm, + struct dfa *d, + struct dfa *s, + chr **coldp) /* where to put coldstart pointer */ +{ + chr *begin; + chr *end; + chr *cold; + chr *open; /* open and close of range of possible starts */ + chr *close; + chr *estart; + chr *estop; + int er; + int shorter = v->g->tree->flags & SHORTER; + int hitend; + + assert(d != NULL && s != NULL); + cold = NULL; + close = v->search_start; + do + { + /* Search with the search RE for match range at/beyond "close" */ + MDEBUG(("\ncsearch at %ld\n", LOFF(close))); + close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL); + if (ISERR()) + { + *coldp = cold; + return v->err; + } + if (close == NULL) + break; /* no more possible match anywhere */ + assert(cold != NULL); + open = cold; + cold = NULL; + /* Search for matches starting between "open" and "close" inclusive */ + MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); + for (begin = open; begin <= close; begin++) + { + MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); + estart = begin; + estop = v->stop; + for (;;) + { + /* Here we use the top node's detailed RE */ + if (shorter) + end = shortest(v, d, begin, estart, + estop, (chr **) NULL, &hitend); + else + 
end = longest(v, d, begin, estop, + &hitend); + if (ISERR()) + { + *coldp = cold; + return v->err; + } + if (hitend && cold == NULL) + cold = begin; + if (end == NULL) + break; /* no match with this begin point, try next */ + MDEBUG(("tentative end %ld\n", LOFF(end))); + /* Dissect the potential match to see if it really matches */ + zapallsubs(v->pmatch, v->nmatch); + er = cdissect(v, v->g->tree, begin, end); + if (er == REG_OKAY) + { + if (v->nmatch > 0) + { + v->pmatch[0].rm_so = OFF(begin); + v->pmatch[0].rm_eo = OFF(end); + } + *coldp = cold; + return REG_OKAY; + } + if (er != REG_NOMATCH) + { + ERR(er); + *coldp = cold; + return er; + } + /* Try next longer/shorter match with same begin point */ + if (shorter) + { + if (end == estop) + break; /* no more, so try next begin point */ + estart = end + 1; + } + else + { + if (end == begin) + break; /* no more, so try next begin point */ + estop = end - 1; + } + } /* end loop over endpoint positions */ + } /* end loop over beginning positions */ + + /* + * If we get here, there is no possible match starting at or before + * "close", so consider matches beyond that. We'll do a fresh search + * with the search RE to find a new promising match range. 
/*
 * zapallsubs - initialize all subexpression matches to "no match"
 *
 * p points to a match vector of n entries.  Entries 1 .. n-1 get both
 * rm_so and rm_eo set to -1; entry 0 (the whole-match slot) is left alone.
 * With n of 0 or 1 there are no subexpression entries and nothing is
 * written.
 */
static void
zapallsubs(regmatch_t *p,
		   size_t n)
{
	size_t		i;

	/*
	 * Count upward from 1.  Counting down from n - 1 would wrap around to
	 * SIZE_MAX for n == 0, since n is unsigned, and scribble far outside
	 * the vector.
	 */
	for (i = 1; i < n; i++)
	{
		p[i].rm_so = -1;
		p[i].rm_eo = -1;
	}
}
+ */ +static int /* regexec return code */ +cdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int er; + + assert(t != NULL); + MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); + + /* handy place to check for operation cancel */ + if (CANCEL_REQUESTED(v->re)) + return REG_CANCEL; + /* ... and stack overrun */ + if (STACK_TOO_DEEP(v->re)) + return REG_ETOOBIG; + + switch (t->op) + { + case '=': /* terminal node */ + assert(t->left == NULL && t->right == NULL); + er = REG_OKAY; /* no action, parent did the work */ + break; + case 'b': /* back reference */ + assert(t->left == NULL && t->right == NULL); + er = cbrdissect(v, t, begin, end); + break; + case '.': /* concatenation */ + assert(t->left != NULL && t->right != NULL); + if (t->left->flags & SHORTER) /* reverse scan */ + er = crevcondissect(v, t, begin, end); + else + er = ccondissect(v, t, begin, end); + break; + case '|': /* alternation */ + assert(t->left != NULL); + er = caltdissect(v, t, begin, end); + break; + case '*': /* iteration */ + assert(t->left != NULL); + if (t->left->flags & SHORTER) /* reverse scan */ + er = creviterdissect(v, t, begin, end); + else + er = citerdissect(v, t, begin, end); + break; + case '(': /* capturing */ + assert(t->left != NULL && t->right == NULL); + assert(t->subno > 0); + er = cdissect(v, t->left, begin, end); + if (er == REG_OKAY) + subset(v, t, begin, end); + break; + default: + er = REG_ASSERT; + break; + } + + /* + * We should never have a match failure unless backrefs lurk below; + * otherwise, either caller failed to check the DFA, or there's some + * inconsistency between the DFA and the node's innards. 
+ */ + assert(er != REG_NOMATCH || (t->flags & BACKR)); + + return er; +} + +/* + * ccondissect - dissect match for concatenation node + */ +static int /* regexec return code */ +ccondissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + assert(!(t->left->flags & SHORTER)); + + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); + MDEBUG(("cconcat %d\n", t->id)); + + /* pick a tentative midpoint */ + mid = longest(v, d, begin, end, (int *) NULL); + NOERR(); + if (mid == NULL) + return REG_NOMATCH; + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); + + /* iterate until satisfaction or failure */ + for (;;) + { + /* try this midpoint on for size */ + if (longest(v, d2, mid, end, (int *) NULL) == end) + { + er = cdissect(v, t->left, begin, mid); + if (er == REG_OKAY) + { + er = cdissect(v, t->right, mid, end); + if (er == REG_OKAY) + { + /* satisfaction */ + MDEBUG(("successful\n")); + return REG_OKAY; + } + } + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + /* that midpoint didn't work, find a new one */ + if (mid == begin) + { + /* all possibilities exhausted */ + MDEBUG(("%d no midpoint\n", t->id)); + return REG_NOMATCH; + } + mid = longest(v, d, begin, mid - 1, (int *) NULL); + NOERR(); + if (mid == NULL) + { + /* failed to find a new one */ + MDEBUG(("%d failed midpoint\n", t->id)); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + zaptreesubs(v, t->left); + zaptreesubs(v, t->right); + } + + /* can't get here */ + return REG_ASSERT; +} + +/* + * crevcondissect - dissect match for concatenation node, shortest-first + */ +static int /* regexec return code */ +crevcondissect(struct vars *v, + struct subre *t, + chr *begin, /* 
beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + struct dfa *d2; + chr *mid; + int er; + + assert(t->op == '.'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->right != NULL && t->right->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + + d = getsubdfa(v, t->left); + NOERR(); + d2 = getsubdfa(v, t->right); + NOERR(); + MDEBUG(("crevcon %d\n", t->id)); + + /* pick a tentative midpoint */ + mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL); + NOERR(); + if (mid == NULL) + return REG_NOMATCH; + MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); + + /* iterate until satisfaction or failure */ + for (;;) + { + /* try this midpoint on for size */ + if (longest(v, d2, mid, end, (int *) NULL) == end) + { + er = cdissect(v, t->left, begin, mid); + if (er == REG_OKAY) + { + er = cdissect(v, t->right, mid, end); + if (er == REG_OKAY) + { + /* satisfaction */ + MDEBUG(("successful\n")); + return REG_OKAY; + } + } + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + /* that midpoint didn't work, find a new one */ + if (mid == end) + { + /* all possibilities exhausted */ + MDEBUG(("%d no midpoint\n", t->id)); + return REG_NOMATCH; + } + mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL); + NOERR(); + if (mid == NULL) + { + /* failed to find a new one */ + MDEBUG(("%d failed midpoint\n", t->id)); + return REG_NOMATCH; + } + MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); + zaptreesubs(v, t->left); + zaptreesubs(v, t->right); + } + + /* can't get here */ + return REG_ASSERT; +} + +/* + * cbrdissect - dissect match for backref node + */ +static int /* regexec return code */ +cbrdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + int n = t->subno; + size_t numreps; + size_t tlen; + size_t brlen; + chr *brstring; + chr *p; + int min = t->min; + int max = t->max; + + assert(t != NULL); + 
assert(t->op == 'b'); + assert(n >= 0); + assert((size_t) n < v->nmatch); + + MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max)); + + /* get the backreferenced string */ + if (v->pmatch[n].rm_so == -1) + return REG_NOMATCH; + brstring = v->start + v->pmatch[n].rm_so; + brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; + + /* special cases for zero-length strings */ + if (brlen == 0) + { + /* + * matches only if target is zero length, but any number of + * repetitions can be considered to be present + */ + if (begin == end && min <= max) + { + MDEBUG(("cbackref matched trivially\n")); + return REG_OKAY; + } + return REG_NOMATCH; + } + if (begin == end) + { + /* matches only if zero repetitions are okay */ + if (min == 0) + { + MDEBUG(("cbackref matched trivially\n")); + return REG_OKAY; + } + return REG_NOMATCH; + } + + /* + * check target length to see if it could possibly be an allowed number of + * repetitions of brstring + */ + assert(end > begin); + tlen = end - begin; + if (tlen % brlen != 0) + return REG_NOMATCH; + numreps = tlen / brlen; + if (numreps < min || (numreps > max && max != DUPINF)) + return REG_NOMATCH; + + /* okay, compare the actual string contents */ + p = begin; + while (numreps-- > 0) + { + if ((*v->g->compare) (brstring, p, brlen) != 0) + return REG_NOMATCH; + p += brlen; + } + + MDEBUG(("cbackref matched\n")); + return REG_OKAY; +} + +/* + * caltdissect - dissect match for alternation node + */ +static int /* regexec return code */ +caltdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + int er; + + /* We loop, rather than tail-recurse, to handle a chain of alternatives */ + while (t != NULL) + { + assert(t->op == '|'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + + MDEBUG(("calt n%d\n", t->id)); + + d = getsubdfa(v, t->left); + NOERR(); + if (longest(v, d, begin, end, (int *) NULL) == end) + { + MDEBUG(("calt matched\n")); + er 
= cdissect(v, t->left, begin, end); + if (er != REG_NOMATCH) + return er; + } + NOERR(); + + t = t->right; + } + + return REG_NOMATCH; +} + +/* + * citerdissect - dissect match for iteration node + */ +static int /* regexec return code */ +citerdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(!(t->left->flags & SHORTER)); + assert(begin <= end); + + /* + * For the moment, assume the minimum number of matches is 1. If zero + * matches are allowed, and the target string is empty, we are allowed to + * match regardless of the contents of the iter node --- but we would + * prefer to match once, so that capturing parens get set. (An example of + * the concern here is a pattern like "()*\1", which historically this + * code has allowed to succeed.) Therefore, we deal with the zero-matches + * case at the bottom, after failing to find any other way to match. + */ + min_matches = t->min; + if (min_matches <= 0) + min_matches = 1; + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != DUPINF) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = getsubdfa(v, t->left); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("citer %d\n", t->id)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->id, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a 
way to divide the string into k sub-matches that + * works so far as the child DFA can tell. If k is an allowed number + * of matches, start the slow part: recurse to verify each sub-match. + * We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zaptreesubs(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->id)); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + FREE(endpts); + + /* + * Now consider the possibility that we can match to a zero-length string + * by using zero repetitions. 
+ */ + if (t->min == 0 && begin == end) + { + MDEBUG(("%d allowing zero matches\n", t->id)); + return REG_OKAY; + } + + MDEBUG(("%d failed\n", t->id)); + return REG_NOMATCH; +} + +/* + * creviterdissect - dissect match for iteration node, shortest-first + */ +static int /* regexec return code */ +creviterdissect(struct vars *v, + struct subre *t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != DUPINF) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = getsubdfa(v, t->left); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("creviter %d\n", t->id)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* if this is the last allowed sub-match, it must reach to the end */ + if (k >= max_matches) + limit = end; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->id, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + 
/* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches that + * works so far as the child DFA can tell. If k is an allowed number + * of matches, start the slow part: recurse to verify each sub-match. + * We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zaptreesubs(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->id)); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + + /* + * Must consider longer versions of the current sub-match. + */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->id)); + FREE(endpts); + return REG_NOMATCH; +} + + + +#include "rege_dfa.c" diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c new file mode 100644 index 0000000..a925a9f --- /dev/null +++ b/src/backend/regex/regexport.c @@ -0,0 +1,292 @@ +/*------------------------------------------------------------------------- + * + * regexport.c + * Functions for exporting info about a regex's NFA + * + * In this implementation, the NFA defines a necessary but not sufficient + * condition for a string to match the regex: that is, there can be strings + * that match the NFA but don't match the full regex, but not vice versa. 
+ * Thus, for example, it is okay for the functions below to treat lookaround + * constraints as no-ops, since they merely constrain the string some more. + * + * Notice that these functions return info into caller-provided arrays + * rather than doing their own malloc's. This simplifies the APIs by + * eliminating a class of error conditions, and in the case of colors + * allows the caller to decide how big is too big to bother with. + * + * + * Portions Copyright (c) 2013-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regexport.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + +#include "regex/regexport.h" + + +/* + * Get total number of NFA states. + */ +int +pg_reg_getnumstates(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->nstates; +} + +/* + * Get initial state of NFA. + */ +int +pg_reg_getinitialstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->pre; +} + +/* + * Get final state of NFA. + */ +int +pg_reg_getfinalstate(const regex_t *regex) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + return cnfa->post; +} + +/* + * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON + * arcs from the caller, treating any LACON as being automatically satisfied. + * Since the output representation does not support arcs that consume no + * character when traversed, we have to recursively traverse LACON arcs here, + * and report whatever normal arcs are reachable by traversing LACON arcs. 
+ * Note that this wouldn't work if it were possible to reach the final state + * via LACON traversal, but the regex library never builds NFAs that have + * LACON arcs leading directly to the final state. (This is because the + * regex executor is designed to consume one character beyond the nominal + * match end --- possibly an EOS indicator --- so there is always a set of + * ordinary arcs leading to the final state.) + * + * traverse_lacons is a recursive subroutine used by both exported functions + * to count and then emit the reachable regular arcs. *arcs_count is + * incremented by the number of reachable arcs, and as many as will fit in + * arcs_len (possibly 0) are emitted into arcs[]. + */ +static void +traverse_lacons(struct cnfa *cnfa, int st, + int *arcs_count, + regex_arc_t *arcs, int arcs_len) +{ + struct carc *ca; + + /* + * Since this function recurses, it could theoretically be driven to stack + * overflow. In practice, this is mostly useful to backstop against a + * failure of the regex compiler to remove a loop of LACON arcs. + */ + check_stack_depth(); + + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co < cnfa->ncolors) + { + /* Ordinary arc, so count and possibly emit it */ + int ndx = (*arcs_count)++; + + if (ndx < arcs_len) + { + arcs[ndx].co = ca->co; + arcs[ndx].to = ca->to; + } + } + else + { + /* LACON arc --- assume it's satisfied and recurse... */ + /* ... but first, assert it doesn't lead directly to post state */ + Assert(ca->to != cnfa->post); + + traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len); + } + } +} + +/* + * Get number of outgoing NFA arcs of state number "st". 
+ */ +int +pg_reg_getnumoutarcs(const regex_t *regex, int st) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates) + return 0; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, NULL, 0); + return arcs_count; +} + +/* + * Write array of outgoing NFA arcs of state number "st" into arcs[], + * whose length arcs_len must be at least as long as indicated by + * pg_reg_getnumoutarcs(), else not all arcs will be returned. + */ +void +pg_reg_getoutarcs(const regex_t *regex, int st, + regex_arc_t *arcs, int arcs_len) +{ + struct cnfa *cnfa; + int arcs_count; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (st < 0 || st >= cnfa->nstates || arcs_len <= 0) + return; + arcs_count = 0; + traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len); +} + +/* + * Get total number of colors. + */ +int +pg_reg_getnumcolors(const regex_t *regex) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + return cm->max + 1; +} + +/* + * Check if color is beginning of line/string. + * + * (We might at some point need to offer more refined handling of pseudocolors, + * but this will do for now.) + */ +int +pg_reg_colorisbegin(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->bos[0] || co == cnfa->bos[1]) + return true; + else + return false; +} + +/* + * Check if color is end of line/string. 
+ */ +int +pg_reg_colorisend(const regex_t *regex, int co) +{ + struct cnfa *cnfa; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cnfa = &((struct guts *) regex->re_guts)->search; + + if (co == cnfa->eos[0] || co == cnfa->eos[1]) + return true; + else + return false; +} + +/* + * Get number of member chrs of color number "co". + * + * Note: we return -1 if the color number is invalid, or if it is a special + * color (WHITE or a pseudocolor), or if the number of members is uncertain. + * Callers should not try to extract the members if -1 is returned. + */ +int +pg_reg_getnumcharacters(const regex_t *regex, int co) +{ + struct colormap *cm; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */ + return -1; + if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */ + return -1; + + /* + * If the color appears anywhere in the high colormap, treat its number of + * members as uncertain. In principle we could determine all the specific + * chrs corresponding to each such entry, but it would be expensive + * (particularly if character class tests are required) and it doesn't + * seem worth it. + */ + if (cm->cd[co].nuchrs != 0) + return -1; + + /* OK, return the known number of member chrs */ + return cm->cd[co].nschrs; +} + +/* + * Write array of member chrs of color number "co" into chars[], + * whose length chars_len must be at least as long as indicated by + * pg_reg_getnumcharacters(), else not all chars will be returned. + * + * Fetching the members of WHITE or a pseudocolor is not supported. + * + * Caution: this is a relatively expensive operation. 
+ */ +void +pg_reg_getcharacters(const regex_t *regex, int co, + pg_wchar *chars, int chars_len) +{ + struct colormap *cm; + chr c; + + assert(regex != NULL && regex->re_magic == REMAGIC); + cm = &((struct guts *) regex->re_guts)->cmap; + + if (co <= 0 || co > cm->max || chars_len <= 0) + return; + if (cm->cd[co].flags & PSEUDO) + return; + + /* + * We need only examine the low character map; there should not be any + * matching entries in the high map. + */ + for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++) + { + if (cm->locolormap[c - CHR_MIN] == co) + { + *chars++ = c; + if (--chars_len == 0) + break; + } + } +} diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c new file mode 100644 index 0000000..ae17ae7 --- /dev/null +++ b/src/backend/regex/regfree.c @@ -0,0 +1,54 @@ +/* + * regfree - free an RE + * + * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. + * + * Development of this software was funded, in part, by Cray Research Inc., + * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics + * Corporation, none of whom are responsible for the results. The author + * thanks all of them. + * + * Redistribution and use in source and binary forms -- with or without + * modification -- are permitted for any purpose, provided that + * redistributions in source form retain this entire copyright notice and + * indicate the origin and nature of any modifications. + * + * I'd appreciate being given credit for this package in the documentation + * of software which uses it, but that is not a requirement. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL + * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * src/backend/regex/regfree.c + * + * + * You might think that this could be incorporated into regcomp.c, and + * that would be a reasonable idea... except that this is a generic + * function (with a generic name), applicable to all compiled REs + * regardless of the size of their characters, whereas the stuff in + * regcomp.c gets compiled once per character size. + */ + +#include "regex/regguts.h" + + +/* + * pg_regfree - free an RE (generic function, punts to RE-specific function) + * + * Ignoring invocation with NULL is a convenience. + */ +void +pg_regfree(regex_t *re) +{ + if (re == NULL) + return; + (*((struct fns *) re->re_fns)->free) (re); +} diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c new file mode 100644 index 0000000..991b868 --- /dev/null +++ b/src/backend/regex/regprefix.c @@ -0,0 +1,260 @@ +/*------------------------------------------------------------------------- + * + * regprefix.c + * Extract a common prefix, if any, from a compiled regex. 
+ * + * + * Portions Copyright (c) 2012-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regprefix.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + + +/* + * forward declarations + */ +static int findprefix(struct cnfa *cnfa, struct colormap *cm, + chr *string, size_t *slength); + + +/* + * pg_regprefix - get common prefix for regular expression + * + * Returns one of: + * REG_NOMATCH: there is no common prefix of strings matching the regex + * REG_PREFIX: there is a common prefix of strings matching the regex + * REG_EXACT: all strings satisfying the regex must match the same string + * or a REG_XXX error code + * + * In the non-failure cases, *string is set to a malloc'd string containing + * the common prefix or exact value, of length *slength (measured in chrs + * not bytes!). + * + * This function does not analyze all complex cases (such as lookaround + * constraints) exactly. Therefore it is possible that some strings matching + * the reported prefix or exact-match string do not satisfy the regex. But + * it should never be the case that a string satisfying the regex does not + * match the reported prefix or exact-match string. 
+ */ +int +pg_regprefix(regex_t *re, + chr **string, + size_t *slength) +{ + struct guts *g; + struct cnfa *cnfa; + int st; + + /* sanity checks */ + if (string == NULL || slength == NULL) + return REG_INVARG; + *string = NULL; /* initialize for failure cases */ + *slength = 0; + if (re == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + g = (struct guts *) re->re_guts; + if (g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + + /* + * This implementation considers only the search NFA for the topmost regex + * tree node. Therefore, constraints such as backrefs are not fully + * applied, which is allowed per the function's API spec. + */ + assert(g->tree != NULL); + cnfa = &g->tree->cnfa; + + /* + * Since a correct NFA should never contain any exit-free loops, it should + * not be possible for our traversal to return to a previously visited NFA + * state. Hence we need at most nstates chrs in the output string. + */ + *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); + if (*string == NULL) + return REG_ESPACE; + + /* do it */ + st = findprefix(cnfa, &g->cmap, *string, slength); + + assert(*slength <= cnfa->nstates); + + /* clean up */ + if (st != REG_PREFIX && st != REG_EXACT) + { + FREE(*string); + *string = NULL; + *slength = 0; + } + + return st; +} + +/* + * findprefix - extract common prefix from cNFA + * + * Results are returned into the preallocated chr array string[], with + * *slength (which must be preset to zero) incremented for each chr. + */ +static int /* regprefix return code */ +findprefix(struct cnfa *cnfa, + struct colormap *cm, + chr *string, + size_t *slength) +{ + int st; + int nextst; + color thiscolor; + chr c; + struct carc *ca; + + /* + * The "pre" state must have only BOS/BOL outarcs, else pattern isn't + * anchored left. 
If we have both BOS and BOL, they must go to the same + * next state. + */ + st = cnfa->pre; + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + return REG_NOMATCH; + } + else + return REG_NOMATCH; + } + if (nextst == -1) + return REG_NOMATCH; + + /* + * Scan through successive states, stopping as soon as we find one with + * more than one acceptable transition character (either multiple colors + * on out-arcs, or a color with more than one member chr). + * + * We could find a state with multiple out-arcs that are all labeled with + * the same singleton color; this comes from patterns like "^ab(cde|cxy)". + * In that case we add the chr "c" to the output string but then exit the + * loop with nextst == -1. This leaves a little bit on the table: if the + * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added + * to the prefix. But chasing multiple parallel state chains doesn't seem + * worth the trouble. + */ + do + { + st = nextst; + nextst = -1; + thiscolor = COLORLESS; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + /* We can ignore BOS/BOL arcs */ + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + continue; + /* ... 
but EOS/EOL arcs terminate the search, as do LACONs */ + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || + ca->co >= cnfa->ncolors) + { + thiscolor = COLORLESS; + break; + } + if (thiscolor == COLORLESS) + { + /* First plain outarc */ + thiscolor = ca->co; + nextst = ca->to; + } + else if (thiscolor == ca->co) + { + /* Another plain outarc for same color */ + nextst = -1; + } + else + { + /* More than one plain outarc color terminates the search */ + thiscolor = COLORLESS; + break; + } + } + /* Done if we didn't find exactly one color on plain outarcs */ + if (thiscolor == COLORLESS) + break; + /* The color must be a singleton */ + if (cm->cd[thiscolor].nschrs != 1) + break; + /* Must not have any high-color-map entries */ + if (cm->cd[thiscolor].nuchrs != 0) + break; + + /* + * Identify the color's sole member chr and add it to the prefix + * string. In general the colormap data structure doesn't provide a + * way to find color member chrs, except by trying GETCOLOR() on each + * possible chr value, which won't do at all. However, for the cases + * we care about it should be sufficient to test the "firstchr" value, + * that is the first chr ever added to the color. There are cases + * where this might no longer be a member of the color (so we do need + * to test), but none of them are likely to arise for a character that + * is a member of a common prefix. If we do hit such a corner case, + * we just fall out without adding anything to the prefix string. + */ + c = cm->cd[thiscolor].firstchr; + if (GETCOLOR(cm, c) != thiscolor) + break; + + string[(*slength)++] = c; + + /* Advance to next state, but only if we have a unique next state */ + } while (nextst != -1); + + /* + * If we ended at a state that only has EOS/EOL outarcs leading to the + * "post" state, then we have an exact-match string. Note this is true + * even if the string is of zero length. 
+ */ + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + { + nextst = -1; + break; + } + } + else + { + nextst = -1; + break; + } + } + if (nextst == cnfa->post) + return REG_EXACT; + + /* + * Otherwise, if we were unable to identify any prefix characters, say + * NOMATCH --- the pattern is anchored left, but doesn't specify any + * particular first character. + */ + if (*slength > 0) + return REG_PREFIX; + + return REG_NOMATCH; +} |