17 files changed, 14108 insertions, 0 deletions
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT
new file mode 100644
index 0000000..e50cfb1
--- /dev/null
+++ b/src/backend/regex/COPYRIGHT
@@ -0,0 +1,84 @@
+This regular expression package was originally developed by Henry Spencer.
+It bears the following copyright notice:
+
+**********************************************************************
+
+Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+
+Development of this software was funded, in part, by Cray Research Inc.,
+UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+Corporation, none of whom are responsible for the results.  The author
+thanks all of them. 
+
+Redistribution and use in source and binary forms -- with or without
+modification -- are permitted for any purpose, provided that
+redistributions in source form retain this entire copyright notice and
+indicate the origin and nature of any modifications.
+
+I'd appreciate being given credit for this package in the documentation
+of software which uses it, but that is not a requirement.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************
+
+PostgreSQL adopted the code out of Tcl 8.4.1.  Portions of regc_locale.c
+and re_syntax.n were developed by Tcl developers other than Henry; these
+files bear the Tcl copyright and license notice:
+
+**********************************************************************
+
+This software is copyrighted by the Regents of the University of
+California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+Corporation and other parties.  The following terms apply to all files
+associated with the software unless explicitly disclaimed in
+individual files.
+
+The authors hereby grant permission to use, copy, modify, distribute,
+and license this software and its documentation for any purpose, provided
+that existing copyright notices are retained in all copies and that this
+notice is included verbatim in any distributions. No written agreement,
+license, or royalty fee is required for any of the authorized uses.
+Modifications to this software may be copyrighted by their authors
+and need not follow the licensing terms described here, provided that
+the new terms are clearly indicated on the first page of each file where
+they apply.
+
+IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+MODIFICATIONS.
+
+GOVERNMENT USE: If you are acquiring this software on behalf of the
+U.S. government, the Government shall have only "Restricted Rights"
+in the software and related documentation as defined in the Federal 
+Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+are acquiring the software on behalf of the Department of Defense, the
+software shall be classified as "Commercial Computer Software" and the
+Government shall have only "Restricted Rights" as defined in Clause
+252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+authors grant the U.S. Government and others acting in its behalf
+permission to use and distribute the software in accordance with the
+terms specified in this license. 
+
+**********************************************************************
+
+Subsequent modifications to the code by the PostgreSQL project follow
+the same license terms as the rest of PostgreSQL.
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
new file mode 100644
index 0000000..5210c16
--- /dev/null
+++ b/src/backend/regex/Makefile
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for backend/regex
+#
+# IDENTIFICATION
+#    src/backend/regex/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/regex
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	regcomp.o \
+	regerror.o \
+	regexec.o \
+	regexport.o \
+	regfree.o \
+	regprefix.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+# mark inclusion dependencies between .c files explicitly
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
+        regc_locale.c regc_pg_locale.c
+
+regexec.o: regexec.c rege_dfa.c
diff --git a/src/backend/regex/README b/src/backend/regex/README
new file mode 100644
index 0000000..f08aab6
--- /dev/null
+++ b/src/backend/regex/README
@@ -0,0 +1,409 @@
+Implementation notes about Henry Spencer's regex library
+========================================================
+
+If Henry ever had any internals documentation, he didn't publish it.
+So this file is an attempt to reverse-engineer some docs.
+
+General source-file layout
+--------------------------
+
+There are six separately-compilable source files, five of which expose
+exactly one exported function apiece:
+	regcomp.c: pg_regcomp
+	regexec.c: pg_regexec
+	regerror.c: pg_regerror
+	regfree.c: pg_regfree
+	regprefix.c: pg_regprefix
+(The pg_ prefixes were added by the Postgres project to distinguish this
+library version from any similar one that might be present on a particular
+system.  They'd need to be removed or replaced in any standalone version
+of the library.)
+
+The sixth file, regexport.c, exposes multiple functions that allow extraction
+of info about a compiled regex (see regexport.h).
+
+There are additional source files regc_*.c that are #include'd in regcomp,
+and similarly additional source files rege_*.c that are #include'd in
+regexec.  This was done to avoid exposing internal symbols globally;
+all functions not meant to be part of the library API are static.
+
+(Actually the above is a lie in one respect: there are two more global
+symbols, pg_set_regex_collation and pg_reg_getcolor in regcomp.  These are
+not meant to be part of the API, but they have to be global because both
+regcomp and regexec call them.  It'd be better to get rid of
+pg_set_regex_collation, as well as the static variables it sets, in favor of
+keeping the needed locale state in the regex structs.  We have not done this
+yet for lack of a design for how to add application-specific state to the
+structs.)
+
+What's where in src/backend/regex/:
+
+regcomp.c		Top-level regex compilation code
+regc_color.c		Color map management
+regc_cvec.c		Character vector (cvec) management
+regc_lex.c		Lexer
+regc_nfa.c		NFA handling
+regc_locale.c		Application-specific locale code from Tcl project
+regc_pg_locale.c	Postgres-added application-specific locale code
+regexec.c		Top-level regex execution code
+rege_dfa.c		DFA creation and execution
+regerror.c		pg_regerror: generate text for a regex error code
+regfree.c		pg_regfree: API to free a no-longer-needed regex_t
+regexport.c		Functions for extracting info from a regex_t
+regprefix.c		Code for extracting a common prefix from a regex_t
+
+The locale-specific code is concerned primarily with case-folding and with
+expanding locale-specific character classes, such as [[:alnum:]].  It
+really needs refactoring if this is ever to become a standalone library.
+
+The header files for the library are in src/include/regex/:
+
+regcustom.h		Customizes library for particular application
+regerrs.h		Error message list
+regex.h			Exported API
+regexport.h		Exported API for regexport.c
+regguts.h		Internals declarations
+
+
+DFAs, NFAs, and all that
+------------------------
+
+This library is a hybrid DFA/NFA regex implementation.  (If you've never
+heard either of those terms, get thee to a first-year comp sci textbook.)
+It might not be clear at first glance what that really means and how it
+relates to what you'll see in the code.  Here's what really happens:
+
+* Initial parsing of a regex generates an NFA representation, with a number
+of states approximately proportional to the length of the regex.
+
+* The NFA is then optimized into a "compact NFA" representation, which is
+basically the same idea but without fields that are not going to be needed
+at runtime.  It is simplified too: the compact format only allows "plain"
+and "LACON" arc types.  The cNFA representation is what is passed from
+regcomp to regexec.
+
+* Unlike traditional NFA-based regex engines, we do not execute directly
+from the NFA representation, as that would require backtracking and so be
+very slow in some cases.  Rather, we execute a DFA, which ideally can
+process an input string in linear time (O(M) for M characters of input)
+without backtracking.  Each state of the DFA corresponds to a set of
+states of the NFA, that is, all the states that the NFA might have been in
+upon reaching the current point in the input string.  Therefore, an NFA
+with N states might require as many as 2^N states in the corresponding
+DFA, which could easily require unreasonable amounts of memory.  We deal
+with this by materializing states of the DFA lazily (only when needed) and
+keeping them in a limited-size cache.  The possible need to build the same
+state of the DFA repeatedly means this approach is not truly O(M) time;
+in the worst case it can take as much as O(M*N).  That's still far better
+than the worst case for a backtracking NFA engine.
+
+If that were the end of it, we'd just say this is a DFA engine, with the
+use of NFAs being merely an implementation detail.  However, a DFA engine
+cannot handle some important regex features such as capturing parens and
+back-references.  If the parser finds that a regex uses these features
+(collectively called "messy cases" in the code), then we have to use
+NFA-style backtracking search after all.
+
+When using the NFA mode, the representation constructed by the parser
+consists of a tree of sub-expressions ("subre"s).  Leaf tree nodes are
+either plain regular expressions (which are executed as DFAs in the manner
+described above) or back-references (which try to match the input to some
+previous substring).  Non-leaf nodes are capture nodes (which save the
+location of the substring currently matching their child node),
+concatenation, alternation, or iteration nodes.  At execution time, the
+executor recursively scans the tree.  At concatenation, alternation, or
+iteration nodes, it considers each possible alternative way of matching the
+input string, that is, each place where the string could be split for a
+concatenation or iteration, or each child node for an alternation.  It
+tries the next alternative if the match fails according to the child nodes.
+This is exactly the sort of backtracking search done by a traditional NFA
+regex engine.  If there are many tree levels it can get very slow.
+
+But all is not lost: we can still be smarter than the average pure NFA
+engine.  To do this, each subre node has an associated DFA, which
+represents what the node could possibly match insofar as a mathematically
+pure regex can describe that, which basically means "no backrefs".
+Before we perform any search of possible alternative sub-matches, we run
+the DFA to see if it thinks the proposed substring could possibly match.
+If not, we can reject the match immediately without iterating through many
+possibilities.
+
+As an example, consider the regex "(a[bc]+)\1".  The compiled
+representation will have a top-level concatenation subre node.  Its left
+child is a capture node, and the child of that is a plain DFA node for
+"a[bc]+".  The concatenation's right child is a backref node for \1.
+The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
+where the backref has been replaced by a copy of the DFA for its referent
+expression.  When executed, the concatenation node will have to search for
+a possible division of the input string that allows its two child nodes to
+each match their part of the string (and although this specific case can
+only succeed when the division is at the middle, the code does not know
+that, nor would it be true in general).  However, we can first run the DFA
+and quickly reject any input that doesn't start with an "a" and contain
+one more "a" plus some number of b's and c's.  If the DFA doesn't match,
+there is no need to recurse to the two child nodes for each possible
+string division point.  In many cases, this prefiltering makes the search
+run much faster than a pure NFA engine could do.  It is this behavior that
+justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
+library.
+
+
+Colors and colormapping
+-----------------------
+
+In many common regex patterns, there are large numbers of characters that
+can be treated alike by the execution engine.  A simple example is the
+pattern "[[:alpha:]][[:alnum:]]*" for an identifier.  Basically the engine
+only needs to care whether an input symbol is a letter, a digit, or other.
+We could build the NFA or DFA with a separate arc for each possible letter
+and digit, but that's very wasteful of space and not so cheap to execute
+either, especially when dealing with Unicode which can have thousands of
+letters.  Instead, the parser builds a "color map" that maps each possible
+input symbol to a "color", or equivalence class.  The NFA or DFA
+representation then has arcs labeled with colors, not specific input
+symbols.  At execution, the first thing the executor does with each input
+symbol is to look up its color in the color map, and then everything else
+works from the color only.
+
+To build the colormap, we start by assigning every possible input symbol
+the color WHITE, which means "other" (that is, at the end of parsing, the
+symbols that are still WHITE are those not explicitly referenced anywhere
+in the regex).  When we see a simple literal character or a bracket
+expression in the regex, we want to assign that character, or all the
+characters represented by the bracket expression, a unique new color that
+can be used to label the NFA arc corresponding to the state transition for
+matching this character or bracket expression.  The basic idea is:
+first, change the color assigned to a character to some new value;
+second, run through all the existing arcs in the partially-built NFA,
+and for each one referencing the character's old color, add a parallel
+arc referencing its new color (this keeps the reassignment from changing
+the semantics of what we already built); and third, add a new arc with
+the character's new color to the current pair of NFA states, denoting
+that seeing this character allows the state transition to be made.
+
+This is complicated a bit by not wanting to create more colors
+(equivalence classes) than absolutely necessary.  In particular, if a
+bracket expression mentions two characters that had the same color before,
+they should still share the same color after we process the bracket, since
+there is still not a need to distinguish them.  But we do need to
+distinguish them from other characters that previously had the same color
+yet are not listed in the bracket expression.  To mechanize this, the code
+has a concept of "parent colors" and "subcolors", where a color's subcolor
+is the new color that we are giving to any characters of that color while
+parsing the current atom.  (The word "parent" is a bit unfortunate here,
+because it suggests a long-lived relationship, but a subcolor link really
+only lasts for the duration of parsing a single atom.)  In other words,
+a subcolor link means that we are in process of splitting the parent color
+into two colors (equivalence classes), depending on whether or not each
+member character should be included by the current regex atom.
+
+As an example, suppose we have the regex "a\d\wx".  Initially all possible
+character codes are labeled WHITE (color 0).  To parse the atom "a", we
+create a new color (1), update "a"'s color map entry to 1, and create an
+arc labeled 1 between the first two states of the NFA.  Now we see \d,
+which is really a bracket expression containing the digits "0"-"9".
+First we process "0", which is currently WHITE, so we create a new color
+(2), update "0"'s color map entry to 2, and create an arc labeled 2
+between the second and third states of the NFA.  We also mark color WHITE
+as having the subcolor 2, which means that future relabelings of WHITE
+characters should also select 2 as the new color.  Thus, when we process
+"1", we won't create a new color but re-use 2.  We update "1"'s color map
+entry to 2, and then find that we don't need a new arc because there is
+already one labeled 2 between the second and third states of the NFA.
+Similarly for the other 8 digits, so there will be only one arc labeled 2
+between NFA states 2 and 3 for all members of this bracket expression.
+At completion of processing of the bracket expression, we call okcolors()
+which breaks all the existing parent/subcolor links; there is no longer a
+marker saying that WHITE characters should be relabeled 2.  (Note:
+actually, we did the same creation and clearing of a subcolor link for the
+primitive atom "a", but it didn't do anything very interesting.)  Now we
+come to the "\w" bracket expression, which for simplicity we assume expands
+to just "[a-z0-9]".  We process "a", but observe that it is already the
+sole member of its color 1.  This means there is no need to subdivide that
+equivalence class more finely, so we do not create any new color.  We just
+make an arc labeled 1 between the third and fourth NFA states.  Next we
+process "b", which is WHITE and far from the only WHITE character, so we
+create a new color (3), link that as WHITE's subcolor, relabel "b" as
+color 3, and make an arc labeled 3.  As we process "c" through "z", each
+is relabeled from WHITE to 3, but no new arc is needed.  Now we come to
+"0", which is not the only member of its color 2, so we suppose that a new
+color is needed and create color 4.  We link 4 as subcolor of 2, relabel
+"0" as color 4 in the map, and add an arc for color 4.  Next "1" through
+"9" are similarly relabeled as color 4, with no additional arcs needed.
+Having finished the bracket expression, we call okcolors(), which breaks
+the subcolor links.  okcolors() further observes that we have removed
+every member of color 2 (the previous color of the digit characters).
+Therefore, it runs through the partial NFA built so far and relabels arcs
+labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is
+relabeled color 4.  Then it frees up color 2, since we have no more use
+for that color.  We now have an NFA in which transitions for digits are
+consistently labeled with color 4.  Last, we come to the atom "x".
+"x" is currently labeled with color 3, and it's not the only member of
+that color, so we realize that we now need to distinguish "x" from other
+letters when we did not before.  We create a new color, which might have
+been 5 but instead we recycle the unused color 2.  "x" is relabeled 2 in
+the color map and 2 is linked as the subcolor of 3, and we add an arc for
+2 between states 4 and 5 of the NFA.  Now we call okcolors(), which breaks
+the subcolor link between colors 3 and 2 and notices that both colors are
+nonempty.  Therefore, it also runs through the existing NFA arcs and adds
+an additional arc labeled 2 wherever there is an arc labeled 3; this
+action ensures that characters of color 2 (i.e., "x") will still be
+considered as allowing any transitions they did before.  We are now done
+parsing the regex, and we have these final color assignments:
+	color 1: "a"
+	color 2: "x"
+	color 3: other letters
+	color 4: digits
+and the NFA has these arcs:
+	states 1 -> 2 on color 1 (hence, "a" only)
+	states 2 -> 3 on color 4 (digits)
+	states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters)
+	states 4 -> 5 on color 2 ("x" only)
+which can be seen to be a correct representation of the regex.
+
+Given this summary, we can see we need the following operations for
+colors:
+
+* A fast way to look up the current color assignment for any character
+  code.  (This is needed during both parsing and execution, while the
+  remaining operations are needed only during parsing.)
+* A way to alter the color assignment for any given character code.
+* We must track the number of characters currently assigned to each
+  color, so that we can detect empty and singleton colors.
+* We must track all existing NFA arcs of a given color, so that we
+  can relabel them at need, or add parallel arcs of a new color when
+  an existing color has to be subdivided.
+
+The last two of these are handled with the "struct colordesc" array and
+the "colorchain" links in NFA arc structs.
+
+Ideally, we'd do the first two operations using a simple linear array
+storing the current color assignment for each character code.
+Unfortunately, that's not terribly workable for large charsets such as
+Unicode.  Our solution is to divide the color map into two parts.  A simple
+linear array is used for character codes up to MAX_SIMPLE_CHR, which can be
+chosen large enough to include all popular characters (so that the
+significantly-slower code paths about to be described are seldom invoked).
+Characters above that need be considered at compile time only if they
+appear explicitly in the regex pattern.  We store each such mentioned
+character or character range as an entry in the "colormaprange" array in
+the colormap.  (Overlapping ranges are split into unique subranges, so that
+each range in the finished list needs only a single color that describes
+all its characters.)  When mapping a character above MAX_SIMPLE_CHR to a
+color at runtime, we search this list of ranges explicitly.
+
+That's still not quite enough, though, because of locale-dependent
+character classes such as [[:alpha:]].  In Unicode locales these classes
+may have thousands of entries that are above MAX_SIMPLE_CHR, and we
+certainly don't want to be searching large colormaprange arrays at runtime.
+Nor do we even want to spend the time to initialize cvec structures that
+exhaustively describe all of those characters.  Our solution is to compute
+exact per-character colors at regex compile time only up to MAX_SIMPLE_CHR.
+For characters above that, we apply the <ctype.h> or <wctype.h> lookup
+functions at runtime for each locale-dependent character class used in the
+regex pattern, constructing a bitmap that describes which classes the
+runtime character belongs to.  The per-character-range data structure
+mentioned above actually holds, for each range, a separate color entry
+for each possible combination of character class properties.  That is,
+the color map for characters above MAX_SIMPLE_CHR is really a 2-D array,
+whose rows correspond to high characters or character ranges that are
+explicitly mentioned in the regex pattern, and whose columns correspond
+to sets of the locale-dependent character classes that are used in the
+regex.
+
+As an example, given the pattern '\w\u1234[\U0001D100-\U0001D1FF]'
+(and supposing that MAX_SIMPLE_CHR is less than 0x1234), we will need
+a high color map with three rows.  One row is for the single character
+U+1234 (represented as a single-element range), one is for the range
+U+1D100..U+1D1FF, and the other row represents all remaining high
+characters.  The color map has two columns, one for characters that
+satisfy iswalnum() and one for those that don't.
+
+We build this color map in parallel with scanning the regex.  Each time
+we detect a new explicit high character (or range) or a locale-dependent
+character class, we split existing entries in the high color map so that
+characters we need to be able to distinguish will have distinct entries
+that can be given separate colors.  Often, though, single entries in the
+high color map will represent very large sets of characters.
+
+If there are both explicit high characters/ranges and locale-dependent
+character classes, we may have entries in the high color map array that
+have non-WHITE colors but don't actually represent any real characters.
+(For example, in a row representing a singleton range, only one of the
+columns could possibly be a live entry; it's the one matching the actual
+locale properties for that single character.)  We don't currently make
+any effort to reclaim such colors.  In principle it could be done, but
+it's not clear that it's worth the trouble.
+
+
+Detailed semantics of an NFA
+----------------------------
+
+When trying to read dumped-out NFAs, it's helpful to know these facts:
+
+State 0 (additionally marked with "@" in dumpnfa's output) is always the
+goal state, and state 1 (additionally marked with ">") is the start state.
+(The code refers to these as the post state and pre state respectively.)
+
+The possible arc types are:
+
+    PLAIN arcs, which specify matching of any character of a given "color"
+    (see above).  These are dumped as "[color_number]->to_state".
+
+    EMPTY arcs, which specify a no-op transition to another state.  These
+    are dumped as "->to_state".
+
+    AHEAD constraints, which represent a "next character must be of this
+    color" constraint.  AHEAD differs from a PLAIN arc in that the input
+    character is not consumed when crossing the arc.  These are dumped as
+    ">color_number>->to_state".
+
+    BEHIND constraints, which represent a "previous character must be of
+    this color" constraint, which likewise consumes no input.  These are
+    dumped as "<color_number<->to_state".
+
+    '^' arcs, which specify a beginning-of-input constraint.  These are
+    dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
+    beginning-of-line constraints respectively.
+
+    '$' arcs, which specify an end-of-input constraint.  These are dumped
+    as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line
+    constraints respectively.
+
+    LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and
+    "(?<!re)" constraints, i.e. the input starting/ending at this point must
+    match (or not match) a given sub-RE, but the matching input is not
+    consumed.  These are dumped as ":subtree_number:->to_state".
+
+If you see anything else (especially any question marks) in the display of
+an arc, it's dumpnfa() trying to tell you that there's something fishy
+about the arc; see the source code.
+
+The regex executor can only handle PLAIN and LACON transitions.  The regex
+optimize() function is responsible for transforming the parser's output
+to get rid of all the other arc types.  In particular, ^ and $ arcs that
+are not dropped as impossible will always end up adjacent to the pre or
+post state respectively, and then will be converted into PLAIN arcs that
+mention the special "colors" for BOS, BOL, EOS, or EOL.
+
+To decide whether a thus-transformed NFA matches a given substring of the
+input string, the executor essentially follows these rules:
+1. Start the NFA "looking at" the character *before* the given substring,
+or if the substring is at the start of the input, prepend an imaginary BOS
+character instead.
+2. Run the NFA until it has consumed the character *after* the given
+substring, or an imaginary following EOS character if the substring is at
+the end of the input.
+3. If the NFA is (or can be) in the goal state at this point, it matches.
+
+So one can mentally execute an untransformed NFA by taking ^ and $ as
+ordinary constraints that match at start and end of input; but plain
+arcs out of the start state should be taken as matches for the character
+before the target substring, and similarly, plain arcs leading to the
+post state are matches for the character after the target substring.
+This definition is necessary to support regexes that begin or end with
+constraints such as \m and \M, which imply requirements on the adjacent
+character if any.  NFAs for simple unanchored patterns will usually have
+pre-state outarcs for all possible character colors as well as BOS and
+BOL, and post-state inarcs for all possible character colors as well as
+EOS and EOL, so that the executor's behavior will work.
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n
new file mode 100644
index 0000000..4621bfc
--- /dev/null
+++ b/src/backend/regex/re_syntax.n
@@ -0,0 +1,979 @@
+'\"
+'\" Copyright (c) 1998 Sun Microsystems, Inc.
+'\" Copyright (c) 1999 Scriptics Corporation
+'\"
+'\" This software is copyrighted by the Regents of the University of
+'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+'\" Corporation and other parties.  The following terms apply to all files
+'\" associated with the software unless explicitly disclaimed in
+'\" individual files.
+'\" 
+'\" The authors hereby grant permission to use, copy, modify, distribute,
+'\" and license this software and its documentation for any purpose, provided
+'\" that existing copyright notices are retained in all copies and that this
+'\" notice is included verbatim in any distributions. No written agreement,
+'\" license, or royalty fee is required for any of the authorized uses.
+'\" Modifications to this software may be copyrighted by their authors
+'\" and need not follow the licensing terms described here, provided that
+'\" the new terms are clearly indicated on the first page of each file where
+'\" they apply.
+'\" 
+'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+'\" POSSIBILITY OF SUCH DAMAGE.
+'\" 
+'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+'\" MODIFICATIONS.
+'\" 
+'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
+'\" U.S. government, the Government shall have only "Restricted Rights"
+'\" in the software and related documentation as defined in the Federal 
+'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+'\" are acquiring the software on behalf of the Department of Defense, the
+'\" software shall be classified as "Commercial Computer Software" and the
+'\" Government shall have only "Restricted Rights" as defined in Clause
+'\" 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+'\" authors grant the U.S. Government and others acting in its behalf
+'\" permission to use and distribute the software in accordance with the
+'\" terms specified in this license. 
+'\" 
+'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp 
+'\"
+.so man.macros
+.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
+.BS
+.SH NAME
+re_syntax \- Syntax of Tcl regular expressions.
+.BE
+
+.SH DESCRIPTION
+.PP
+A \fIregular expression\fR describes strings of characters.
+It's a pattern that matches certain strings and doesn't match others.
+
+.SH "DIFFERENT FLAVORS OF REs"
+Regular expressions (``RE''s), as defined by POSIX, come in two
+flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
+EREs are roughly those of the traditional \fIegrep\fR, while BREs are
+roughly those of the traditional \fIed\fR.  This implementation adds
+a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
+some significant extensions.
+.PP
+This manual page primarily describes AREs.  BREs mostly exist for
+backward compatibility in some old programs; they will be discussed at
+the end.  POSIX EREs are almost an exact subset of AREs.  Features of
+AREs that are not present in EREs will be indicated.
+
+.SH "REGULAR EXPRESSION SYNTAX"
+.PP
+Tcl regular expressions are implemented using the package written by
+Henry Spencer, based on the 1003.2 spec and some (not quite all) of
+the Perl5 extensions (thanks, Henry!).  Much of the description of
+regular expressions below is copied verbatim from his manual entry.
+.PP
+An ARE is one or more \fIbranches\fR,
+separated by `\fB|\fR',
+matching anything that matches any of the branches.
+.PP
+A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
+concatenated.
+It matches a match for the first, followed by a match for the second, etc;
+an empty branch matches the empty string.
+.PP
+A quantified atom is an \fIatom\fR possibly followed
+by a single \fIquantifier\fR.
+Without a quantifier, it matches a match for the atom.
+The quantifiers,
+and what a so-quantified atom matches, are:
+.RS 2
+.TP 6
+\fB*\fR
+a sequence of 0 or more matches of the atom
+.TP
+\fB+\fR
+a sequence of 1 or more matches of the atom
+.TP
+\fB?\fR
+a sequence of 0 or 1 matches of the atom
+.TP
+\fB{\fIm\fB}\fR
+a sequence of exactly \fIm\fR matches of the atom
+.TP
+\fB{\fIm\fB,}\fR
+a sequence of \fIm\fR or more matches of the atom
+.TP
+\fB{\fIm\fB,\fIn\fB}\fR
+a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
+\fIm\fR may not exceed \fIn\fR
+.TP
+\fB*?  +?  ??  {\fIm\fB}?  {\fIm\fB,}?  {\fIm\fB,\fIn\fB}?\fR
+\fInon-greedy\fR quantifiers,
+which match the same possibilities,
+but prefer the smallest number rather than the largest number
+of matches (see MATCHING)
+.RE
+.PP
+The forms using
+\fB{\fR and \fB}\fR
+are known as \fIbound\fRs.
+The numbers
+\fIm\fR and \fIn\fR are unsigned decimal integers
+with permissible values from 0 to 255 inclusive.
+.PP
+An atom is one of:
+.RS 2
+.TP 6
+\fB(\fIre\fB)\fR
+(where \fIre\fR is any regular expression)
+matches a match for
+\fIre\fR, with the match noted for possible reporting
+.TP
+\fB(?:\fIre\fB)\fR
+as previous,
+but does no reporting
+(a ``non-capturing'' set of parentheses)
+.TP
+\fB()\fR
+matches an empty string,
+noted for possible reporting
+.TP
+\fB(?:)\fR
+matches an empty string,
+without reporting
+.TP
+\fB[\fIchars\fB]\fR
+a \fIbracket expression\fR,
+matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
+.TP
+ \fB.\fR
+matches any single character
+.TP
+\fB\e\fIk\fR
+(where \fIk\fR is a non-alphanumeric character)
+matches that character taken as an ordinary character,
+e.g. \e\e matches a backslash character
+.TP
+\fB\e\fIc\fR
+where \fIc\fR is alphanumeric
+(possibly followed by other characters),
+an \fIescape\fR (AREs only),
+see ESCAPES below
+.TP
+\fB{\fR
+when followed by a character other than a digit,
+matches the left-brace character `\fB{\fR';
+when followed by a digit, it is the beginning of a
+\fIbound\fR (see above)
+.TP
+\fIx\fR
+where \fIx\fR is
+a single character with no other significance, matches that character.
+.RE
+.PP
+A \fIconstraint\fR matches an empty string when specific conditions
+are met.
+A constraint may not be followed by a quantifier.
+The simple constraints are as follows; some more constraints are
+described later, under ESCAPES.
+.RS 2
+.TP 8
+\fB^\fR
+matches at the beginning of a line
+.TP
+\fB$\fR
+matches at the end of a line
+.TP
+\fB(?=\fIre\fB)\fR
+\fIpositive lookahead\fR (AREs only), matches at any point
+where a substring matching \fIre\fR begins
+.TP
+\fB(?!\fIre\fB)\fR
+\fInegative lookahead\fR (AREs only), matches at any point
+where no substring matching \fIre\fR begins
+.TP
+\fB(?<=\fIre\fB)\fR
+\fIpositive lookbehind\fR (AREs only), matches at any point
+where a substring matching \fIre\fR ends
+.TP
+\fB(?<!\fIre\fB)\fR
+\fInegative lookbehind\fR (AREs only), matches at any point
+where no substring matching \fIre\fR ends
+.RE
+.PP
+Lookahead and lookbehind constraints may not contain back references
+(see later), and all parentheses within them are considered non-capturing.
+.PP
+An RE may not end with `\fB\e\fR'.
+
+.SH "BRACKET EXPRESSIONS"
+A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
+It normally matches any single character from the list (but see below).
+If the list begins with `\fB^\fR',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+.PP
+If two characters in the list are separated by `\fB\-\fR',
+this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g.
+\fB[0\-9]\fR
+in ASCII matches any decimal digit.
+Two ranges may not share an
+endpoint, so e.g.
+\fBa\-c\-e\fR
+is illegal.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal
+\fB]\fR
+or
+\fB\-\fR
+in the list,
+the simplest method is to
+enclose it in
+\fB[.\fR and \fB.]\fR
+to make it a collating element (see below).
+Alternatively,
+make it the first character
+(following a possible `\fB^\fR'),
+or (AREs only) precede it with `\fB\e\fR'.
+Alternatively, for `\fB\-\fR',
+make it the last character,
+or the second endpoint of a range.
+To use a literal
+\fB\-\fR
+as the first endpoint of a range,
+make it a collating element
+or (AREs only) precede it with `\fB\e\fR'.
+With the exception of these, some combinations using
+\fB[\fR
+(see next
+paragraphs), and escapes,
+all other special characters lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in
+\fB[.\fR and \fB.]\fR
+stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression in a locale that has
+multi-character collating elements
+can thus match more than one character.
+.VS 8.2
+So (insidiously), a bracket expression that starts with \fB^\fR
+can match multi-character collating elements even if none of them
+appear in the bracket expression!
+(\fINote:\fR Tcl currently has no multi-character collating elements.
+This information is only for illustration.)
+.PP
+For example, assume the collating sequence includes a \fBch\fR
+multi-character collating element.
+Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
+matches the first five characters of `\fBchchcc\fR'.
+Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
+(because \fB[^c]\fR matches the multi-character \fBch\fR).
+.VE 8.2
+.PP
+Within a bracket expression, a collating element enclosed in
+\fB[=\fR
+and
+\fB=]\fR
+is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
+and `\fB.]\fR'.)
+For example, if
+\fBo\fR
+and
+\fB\o'o^'\fR
+are the members of an equivalence class,
+then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
+and `\fB[o\o'o^']\fR'\&
+are all synonymous.
+An equivalence class may not be an endpoint
+of a range.
+.VS 8.2
+(\fINote:\fR 
+Tcl currently implements only the Unicode locale.
+It doesn't define any equivalence classes.
+The examples above are just illustrations.)
+.VE 8.2
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in
+\fB[:\fR
+and
+\fB:]\fR
+stands for the list of all characters
+(not all collating elements!)
+belonging to that
+class.
+Standard character classes are:
+.PP
+.RS
+.ne 5
+.nf
+.ta 3c
+\fBalpha\fR	A letter. 
+\fBupper\fR	An upper-case letter. 
+\fBlower\fR	A lower-case letter. 
+\fBdigit\fR	A decimal digit. 
+\fBxdigit\fR	A hexadecimal digit. 
+\fBalnum\fR	An alphanumeric (letter or digit). 
+\fBprint\fR	An alphanumeric (same as alnum).
+\fBblank\fR	A space or tab character.
+\fBspace\fR	A character producing white space in displayed text. 
+\fBpunct\fR	A punctuation character. 
+\fBgraph\fR	A character with a visible representation. 
+\fBcntrl\fR	A control character. 
+.fi
+.RE
+.PP
+A locale may provide others.
+.VS 8.2
+(Note that the current Tcl implementation has only one locale:
+the Unicode locale.)
+.VE 8.2
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases of bracket expressions:
+the bracket expressions
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+are constraints, matching empty strings at
+the beginning and end of a word respectively.
+'\" note, discussion of escapes below references this definition of word
+A word is defined as a sequence of
+word characters
+that is neither preceded nor followed by
+word characters.
+A word character is an
+\fIalnum\fR
+character
+or an underscore
+(\fB_\fR).
+These special bracket expressions are deprecated;
+users of AREs should use constraint escapes instead (see below).
+.SH ESCAPES
+Escapes (AREs only), which begin with a
+\fB\e\fR
+followed by an alphanumeric character,
+come in several varieties:
+character entry, class shorthands, constraint escapes, and back references.
+A
+\fB\e\fR
+followed by an alphanumeric character but not constituting
+a valid escape is illegal in AREs.
+In EREs, there are no escapes:
+outside a bracket expression,
+a
+\fB\e\fR
+followed by an alphanumeric character merely stands for that
+character as an ordinary character,
+and inside a bracket expression,
+\fB\e\fR
+is an ordinary character.
+(The latter is the one actual incompatibility between EREs and AREs.)
+.PP
+Character-entry escapes (AREs only) exist to make it easier to specify
+non-printing and otherwise inconvenient characters in REs:
+.RS 2
+.TP 5
+\fB\ea\fR
+alert (bell) character, as in C
+.TP
+\fB\eb\fR
+backspace, as in C
+.TP
+\fB\eB\fR
+synonym for
+\fB\e\fR
+to help reduce backslash doubling in some
+applications where there are multiple levels of backslash processing
+.TP
+\fB\ec\fIX\fR
+(where \fIX\fR is any character) the character whose
+low-order 5 bits are the same as those of
+\fIX\fR,
+and whose other bits are all zero
+.TP
+\fB\ee\fR
+the character whose collating-sequence name
+is `\fBESC\fR',
+or failing that, the character with octal value 033
+.TP
+\fB\ef\fR
+formfeed, as in C
+.TP
+\fB\en\fR
+newline, as in C
+.TP
+\fB\er\fR
+carriage return, as in C
+.TP
+\fB\et\fR
+horizontal tab, as in C
+.TP
+\fB\eu\fIwxyz\fR
+(where
+\fIwxyz\fR
+is exactly four hexadecimal digits)
+the Unicode character
+\fBU+\fIwxyz\fR
+in the local byte ordering
+.TP
+\fB\eU\fIstuvwxyz\fR
+(where
+\fIstuvwxyz\fR
+is exactly eight hexadecimal digits)
+reserved for a somewhat-hypothetical Unicode extension to 32 bits
+.TP
+\fB\ev\fR
+vertical tab,
+as in C
+.TP
+\fB\ex\fIhhh\fR
+(where
+\fIhhh\fR
+is any sequence of hexadecimal digits)
+the character whose hexadecimal value is
+\fB0x\fIhhh\fR
+(a single character no matter how many hexadecimal digits are used).
+.TP
+\fB\e0\fR
+the character whose value is
+\fB0\fR
+.TP
+\fB\e\fIxy\fR
+(where
+\fIxy\fR
+is exactly two octal digits,
+and is not a
+\fIback reference\fR (see below))
+the character whose octal value is
+\fB0\fIxy\fR
+.TP
+\fB\e\fIxyz\fR
+(where
+\fIxyz\fR
+is exactly three octal digits,
+and is not a
+back reference (see below))
+the character whose octal value is
+\fB0\fIxyz\fR
+.RE
+.PP
+Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
+and `\fBA\fR'-`\fBF\fR'.
+Octal digits are `\fB0\fR'-`\fB7\fR'.
+.PP
+The character-entry escapes are always taken as ordinary characters.
+For example,
+\fB\e135\fR
+is
+\fB]\fR
+in ASCII,
+but
+\fB\e135\fR
+does not terminate a bracket expression.
+Beware, however, that some applications (e.g., C compilers) interpret 
+such sequences themselves before the regular-expression package
+gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
+.PP
+Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
+character classes:
+.RS 2
+.TP 10
+\fB\ed\fR
+\fB[[:digit:]]\fR
+.TP
+\fB\es\fR
+\fB[[:space:]]\fR
+.TP
+\fB\ew\fR
+\fB[[:alnum:]_]\fR
+(note underscore)
+.TP
+\fB\eD\fR
+\fB[^[:digit:]]\fR
+.TP
+\fB\eS\fR
+\fB[^[:space:]]\fR
+.TP
+\fB\eW\fR
+\fB[^[:alnum:]_]\fR
+(note underscore)
+.RE
+.PP
+Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
+and `\fB\ew\fR'\&
+lose their outer brackets,
+and `\fB\eD\fR', `\fB\eS\fR',
+and `\fB\eW\fR'\&
+are illegal.
+.VS 8.2
+(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
+Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+.VE 8.2
+.PP
+A constraint escape (AREs only) is a constraint,
+matching the empty string if specific conditions are met,
+written as an escape:
+.RS 2
+.TP 6
+\fB\eA\fR
+matches only at the beginning of the string
+(see MATCHING, below, for how this differs from `\fB^\fR')
+.TP
+\fB\em\fR
+matches only at the beginning of a word
+.TP
+\fB\eM\fR
+matches only at the end of a word
+.TP
+\fB\ey\fR
+matches only at the beginning or end of a word
+.TP
+\fB\eY\fR
+matches only at a point that is not the beginning or end of a word
+.TP
+\fB\eZ\fR
+matches only at the end of the string
+(see MATCHING, below, for how this differs from `\fB$\fR')
+.TP
+\fB\e\fIm\fR
+(where
+\fIm\fR
+is a nonzero digit) a \fIback reference\fR, see below
+.TP
+\fB\e\fImnn\fR
+(where
+\fIm\fR
+is a nonzero digit, and
+\fInn\fR
+is some more digits,
+and the decimal value
+\fImnn\fR
+is not greater than the number of closing capturing parentheses seen so far)
+a \fIback reference\fR, see below
+.RE
+.PP
+A word is defined as in the specification of
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+above.
+Constraint escapes are illegal within bracket expressions.
+.PP
+A back reference (AREs only) matches the same string matched by the parenthesized
+subexpression specified by the number,
+so that (e.g.)
+\fB([bc])\e1\fR
+matches
+\fBbb\fR
+or
+\fBcc\fR
+but not `\fBbc\fR'.
+The subexpression must entirely precede the back reference in the RE.
+Subexpressions are numbered in the order of their leading parentheses.
+Non-capturing parentheses do not define subexpressions.
+.PP
+There is an inherent historical ambiguity between octal character-entry 
+escapes and back references, which is resolved by heuristics,
+as hinted at above.
+A leading zero always indicates an octal escape.
+A single non-zero digit, not followed by another digit,
+is always taken as a back reference.
+A multi-digit sequence not starting with a zero is taken as a back 
+reference if it comes after a suitable subexpression
+(i.e. the number is in the legal range for a back reference),
+and otherwise is taken as octal.
+.SH "METASYNTAX"
+In addition to the main syntax described above, there are some special
+forms and miscellaneous syntactic facilities available.
+.PP
+Normally the flavor of RE being used is specified by
+application-dependent means.
+However, this can be overridden by a \fIdirector\fR.
+If an RE of any flavor begins with `\fB***:\fR',
+the rest of the RE is an ARE.
+If an RE of any flavor begins with `\fB***=\fR',
+the rest of the RE is taken to be a literal string,
+with all characters considered ordinary characters.
+.PP
+An ARE may begin with \fIembedded options\fR:
+a sequence
+\fB(?\fIxyz\fB)\fR
+(where
+\fIxyz\fR
+is one or more alphabetic characters)
+specifies options affecting the rest of the RE.
+These supplement, and can override,
+any options specified by the application.
+The available option letters are:
+.RS 2
+.TP 3
+\fBb\fR
+rest of RE is a BRE
+.TP 3
+\fBc\fR
+case-sensitive matching (usual default)
+.TP 3
+\fBe\fR
+rest of RE is an ERE
+.TP 3
+\fBi\fR
+case-insensitive matching (see MATCHING, below)
+.TP 3
+\fBm\fR
+historical synonym for
+\fBn\fR
+.TP 3
+\fBn\fR
+newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBp\fR
+partial newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBq\fR
+rest of RE is a literal (``quoted'') string, all ordinary characters
+.TP 3
+\fBs\fR
+non-newline-sensitive matching (usual default)
+.TP 3
+\fBt\fR
+tight syntax (usual default; see below)
+.TP 3
+\fBw\fR
+inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
+.TP 3
+\fBx\fR
+expanded syntax (see below)
+.RE
+.PP
+Embedded options take effect at the
+\fB)\fR
+terminating the sequence.
+They are available only at the start of an ARE,
+and may not be used later within it.
+.PP
+In addition to the usual (\fItight\fR) RE syntax, in which all characters are
+significant, there is an \fIexpanded\fR syntax,
+available in all flavors of RE
+with the \fB\-expanded\fR switch, or in AREs with the embedded \fBx\fR option.
+In the expanded syntax,
+white-space characters are ignored
+and all characters between a
+\fB#\fR
+and the following newline (or the end of the RE) are ignored,
+permitting paragraphing and commenting a complex RE.
+There are three exceptions to that basic rule:
+.RS 2
+.PP
+a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
+.PP
+white space or `\fB#\fR' within a bracket expression is retained
+.PP
+white space and comments are illegal within multi-character symbols
+like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
+.RE
+.PP
+Expanded-syntax white-space characters are blank, tab, newline, and
+.VS 8.2
+any character that belongs to the \fIspace\fR character class.
+.VE 8.2
+.PP
+Finally, in an ARE,
+outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
+(where
+\fIttt\fR
+is any text not containing a `\fB)\fR')
+is a comment,
+completely ignored.
+Again, this is not allowed between the characters of
+multi-character symbols like `\fB(?:\fR'.
+Such comments are more a historical artifact than a useful facility,
+and their use is deprecated;
+use the expanded syntax instead.
+.PP
+\fINone\fR of these metasyntax extensions is available if the application
+(or an initial
+\fB***=\fR
+director)
+has specified that the user's input be treated as a literal string
+rather than as an RE.
+.SH MATCHING
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+its choice is determined by its \fIpreference\fR:
+either the longest substring, or the shortest.
+.PP
+Most atoms, and all constraints, have no preference.
+A parenthesized RE has the same preference (possibly none) as the RE.
+A quantified atom with quantifier
+\fB{\fIm\fB}\fR
+or
+\fB{\fIm\fB}?\fR
+has the same preference (possibly none) as the atom itself.
+A quantified atom with other normal quantifiers (including
+\fB{\fIm\fB,\fIn\fB}\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers longest match.
+A quantified atom with other non-greedy quantifiers (including
+\fB{\fIm\fB,\fIn\fB}?\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers shortest match.
+A branch has the same preference as the first quantified atom in it
+which has a preference.
+An RE consisting of two or more branches connected by the
+\fB|\fR
+operator prefers longest match.
+.PP
+Subject to the constraints imposed by the rules for matching the whole RE,
+subexpressions also match the longest or shortest possible substrings,
+based on their preferences,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that outer subexpressions thus take priority over
+their component subexpressions.
+.PP
+Note that the quantifiers
+\fB{1,1}\fR
+and
+\fB{1,1}?\fR
+can be used to force longest and shortest preference, respectively,
+on a subexpression or a whole RE.
+.PP
+Match lengths are measured in characters, not collating elements.
+An empty string is considered longer than no match at all.
+For example,
+\fBbb*\fR
+matches the three middle characters of `\fBabbbc\fR',
+\fB(week|wee)(night|knights)\fR
+matches all ten characters of `\fBweeknights\fR',
+when
+\fB(.*).*\fR
+is matched against
+\fBabc\fR
+the parenthesized subexpression
+matches all three characters, and
+when
+\fB(a*)*\fR
+is matched against
+\fBbc\fR
+both the whole RE and the parenthesized
+subexpression match an empty string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+so that
+\fBx\fR
+becomes `\fB[xX]\fR'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that
+\fB[x]\fR
+becomes
+\fB[xX]\fR
+and
+\fB[^x]\fR
+becomes `\fB[^xX]\fR'.
+.PP
+If newline-sensitive matching is specified, \fB.\fR
+and bracket expressions using
+\fB^\fR
+will never match the newline character
+(so that matches will never cross newlines unless the RE
+explicitly arranges it)
+and
+\fB^\fR
+and
+\fB$\fR
+will match the empty string after and before a newline
+respectively, in addition to matching at beginning and end of string
+respectively.
+ARE
+\fB\eA\fR
+and
+\fB\eZ\fR
+continue to match beginning or end of string \fIonly\fR.
+.PP
+If partial newline-sensitive matching is specified,
+this affects \fB.\fR
+and bracket expressions
+as with newline-sensitive matching, but not
+\fB^\fR
+and `\fB$\fR'.
+.PP
+If inverse partial newline-sensitive matching is specified,
+this affects
+\fB^\fR
+and
+\fB$\fR
+as with
+newline-sensitive matching,
+but not \fB.\fR
+and bracket expressions.
+This isn't very useful but is provided for symmetry.
+.SH "LIMITS AND COMPATIBILITY"
+No particular limit is imposed on the length of REs.
+Programs intended to be highly portable should not employ REs longer
+than 256 bytes,
+as a POSIX-compliant implementation can refuse to accept such REs.
+.PP
+The only feature of AREs that is actually incompatible with
+POSIX EREs is that
+\fB\e\fR
+does not lose its special
+significance inside bracket expressions.
+All other ARE features use syntax which is illegal or has
+undefined or unspecified effects in POSIX EREs;
+the
+\fB***\fR
+syntax of directors likewise is outside the POSIX
+syntax for both BREs and EREs.
+.PP
+Many of the ARE extensions are borrowed from Perl, but some have
+been changed to clean them up, and a few Perl extensions are not present.
+Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
+the lack of special treatment for a trailing newline,
+the addition of complemented bracket expressions to the things
+affected by newline-sensitive matching,
+the restrictions on parentheses and back references in lookahead/lookbehind
+constraints,
+and the longest/shortest-match (rather than first-match) matching semantics.
+.PP
+The matching rules for REs containing both normal and non-greedy quantifiers
+have changed since early beta-test versions of this package.
+(The new rules are much simpler and cleaner,
+but don't work as hard at guessing the user's real intentions.)
+.PP
+Henry Spencer's original 1986 \fIregexp\fR package,
+still in widespread use (e.g., in pre-8.1 releases of Tcl),
+implemented an early version of today's EREs.
+There are four incompatibilities between \fIregexp\fR's near-EREs
+(`RREs' for short) and AREs.
+In roughly increasing order of significance:
+.PP
+.RS
+In AREs,
+\fB\e\fR
+followed by an alphanumeric character is either an
+escape or an error,
+while in RREs, it was just another way of writing the 
+alphanumeric.
+This should not be a problem because there was no reason to write
+such a sequence in RREs.
+.PP
+\fB{\fR
+followed by a digit in an ARE is the beginning of a bound,
+while in RREs,
+\fB{\fR
+was always an ordinary character.
+Such sequences should be rare,
+and will often result in an error because following characters
+will not look like a valid bound.
+.PP
+In AREs,
+\fB\e\fR
+remains a special character within `\fB[\|]\fR',
+so a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+must be written `\fB\e\e\fR'.
+\fB\e\e\fR
+also gives a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+in RREs,
+but only truly paranoid programmers routinely doubled the backslash.
+.PP
+AREs report the longest/shortest match for the RE,
+rather than the first found in a specified search order.
+This may affect some RREs which were written in the expectation that
+the first match would be reported.
+(The careful crafting of RREs to optimize the search order for fast
+matching is obsolete (AREs examine all possible matches
+in parallel, and their performance is largely insensitive to their
+complexity) but cases where the search order was exploited to deliberately 
+find a match which was \fInot\fR the longest/shortest will need rewriting.)
+.RE
+
+.SH "BASIC REGULAR EXPRESSIONS"
+BREs differ from EREs in several respects.  `\fB|\fR', `\fB+\fR',
+and
+\fB?\fR
+are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are
+\fB\e{\fR
+and `\fB\e}\fR',
+with
+\fB{\fR
+and
+\fB}\fR
+by themselves ordinary characters.
+The parentheses for nested subexpressions are
+\fB\e(\fR
+and `\fB\e)\fR',
+with
+\fB(\fR
+and
+\fB)\fR
+by themselves ordinary characters.
+\fB^\fR
+is an ordinary character except at the beginning of the
+RE or the beginning of a parenthesized subexpression,
+\fB$\fR
+is an ordinary character except at the end of the
+RE or the end of a parenthesized subexpression,
+and
+\fB*\fR
+is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `\fB^\fR').
+Finally,
+single-digit back references are available,
+and
+\fB\e<\fR
+and
+\fB\e>\fR
+are synonyms for
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+respectively;
+no other escapes are available.
+
+.SH "SEE ALSO"
+RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
+
+.SH KEYWORDS
+match, regular expression, string
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
new file mode 100644
index 0000000..f5a4151
--- /dev/null
+++ b/src/backend/regex/regc_color.c
@@ -0,0 +1,1140 @@
+/*
+ * colorings of characters
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_color.c
+ *
+ *
+ * Note that there are some incestuous relationships between this code and
+ * NFA arc maintenance, which perhaps ought to be cleaned up sometime.
+ */
+
+
+
+#define CISERR()	VISERR(cm->v)		/* error test on cm->v; a "cm" must be in scope */
+#define CERR(e)		VERR(cm->v, (e))	/* report e via cm->v; a "cm" must be in scope */
+
+
+
+/*
+ * initcm - set up new colormap, with every chr initially colored WHITE
+ */
+static void
+initcm(struct vars *v,
+	   struct colormap *cm)
+{
+	struct colordesc *cd;
+
+	cm->magic = CMMAGIC;
+	cm->v = v;
+
+	cm->ncds = NINLINECDS;		/* start with the inline cdspace[] array */
+	cm->cd = cm->cdspace;
+	cm->max = 0;				/* WHITE is the only color so far */
+	cm->free = 0;				/* zero means the free list is empty */
+
+	cd = cm->cd;				/* cm->cd[WHITE] */
+	cd->nschrs = MAX_SIMPLE_CHR - CHR_MIN + 1;	/* all locolormap entries */
+	cd->nuchrs = 1;				/* plus hicolormap[0], set below */
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;
+	cd->flags = 0;
+
+	cm->locolormap = (color *)
+		MALLOC((MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color));
+	if (cm->locolormap == NULL)
+	{
+		CERR(REG_ESPACE);
+		cm->cmranges = NULL;	/* prevent failure during freecm */
+		cm->hicolormap = NULL;
+		return;
+	}
+	/* this memset relies on WHITE being zero: */
+	memset(cm->locolormap, WHITE,
+		   (MAX_SIMPLE_CHR - CHR_MIN + 1) * sizeof(color));
+
+	memset(cm->classbits, 0, sizeof(cm->classbits));
+	cm->numcmranges = 0;
+	cm->cmranges = NULL;
+	cm->maxarrayrows = 4;		/* arbitrary initial allocation */
+	cm->hiarrayrows = 1;		/* but we have only one row/col initially */
+	cm->hiarraycols = 1;
+	cm->hicolormap = (color *) MALLOC(cm->maxarrayrows * sizeof(color));
+	if (cm->hicolormap == NULL)
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	/* initialize the "all other characters" row to WHITE */
+	cm->hicolormap[0] = WHITE;
+}
+
+/*
+ * freecm - free dynamically-allocated things in a colormap (not cm itself)
+ */
+static void
+freecm(struct colormap *cm)
+{
+	cm->magic = 0;				/* clear the CMMAGIC tag set by initcm */
+	if (cm->cd != cm->cdspace)	/* cd[] was moved off the inline array */
+		FREE(cm->cd);
+	if (cm->locolormap != NULL)
+		FREE(cm->locolormap);
+	if (cm->cmranges != NULL)
+		FREE(cm->cmranges);
+	if (cm->hicolormap != NULL)
+		FREE(cm->hicolormap);
+}
+
+/*
+ * pg_reg_getcolor - slow case of GETCOLOR(), for chrs above MAX_SIMPLE_CHR
+ */
+color
+pg_reg_getcolor(struct colormap *cm, chr c)
+{
+	int			rownum,
+				colnum,
+				low,
+				high;
+
+	/* Should not be used for chrs in the locolormap */
+	assert(c > MAX_SIMPLE_CHR);
+
+	/*
+	 * Find which row it's in.  The colormapranges are in order, so we can use
+	 * binary search.
+	 */
+	rownum = 0;					/* if no match, use array row zero */
+	low = 0;
+	high = cm->numcmranges;		/* search the half-open interval [low, high) */
+	while (low < high)
+	{
+		int			middle = low + (high - low) / 2;
+		const colormaprange *cmr = &cm->cmranges[middle];
+
+		if (c < cmr->cmin)
+			high = middle;
+		else if (c > cmr->cmax)
+			low = middle + 1;
+		else
+		{
+			rownum = cmr->rownum;	/* found a match */
+			break;
+		}
+	}
+
+	/*
+	 * Find which column it's in --- this is all locale-dependent.
+	 */
+	if (cm->hiarraycols > 1)
+	{
+		colnum = cclass_column_index(cm, c);
+		return cm->hicolormap[rownum * cm->hiarraycols + colnum];	/* row-major */
+	}
+	else
+	{
+		/* fast path if no relevant cclasses */
+		return cm->hicolormap[rownum];
+	}
+}
+
+/*
+ * maxcolor - report largest color number in use (COLORLESS if CISERR())
+ */
+static color
+maxcolor(struct colormap *cm)
+{
+	if (CISERR())
+		return COLORLESS;
+
+	return (color) cm->max;
+}
+
+/*
+ * newcolor - find a new color (must be assigned at once)
+ * Beware:	may relocate the colordescs.
+ */
+static color					/* COLORLESS for error */
+newcolor(struct colormap *cm)
+{
+	struct colordesc *cd;
+	size_t		n;
+
+	if (CISERR())
+		return COLORLESS;
+
+	if (cm->free != 0)			/* reuse a previously freed color */
+	{
+		assert(cm->free > 0);
+		assert((size_t) cm->free < cm->ncds);
+		cd = &cm->cd[cm->free];
+		assert(UNUSEDCOLOR(cd));
+		assert(cd->arcs == NULL);
+		cm->free = cd->sub;		/* free list is chained through "sub" */
+	}
+	else if (cm->max < cm->ncds - 1)	/* still room in the current array */
+	{
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	}
+	else
+	{
+		/* oops, must allocate more */
+		struct colordesc *newCd;
+
+		if (cm->max == MAX_COLOR)
+		{
+			CERR(REG_ECOLORS);
+			return COLORLESS;	/* too many colors */
+		}
+
+		n = cm->ncds * 2;		/* double the array, capped at MAX_COLOR + 1 */
+		if (n > MAX_COLOR + 1)
+			n = MAX_COLOR + 1;
+		if (cm->cd == cm->cdspace)	/* still on the inline array: copy it out */
+		{
+			newCd = (struct colordesc *) MALLOC(n * sizeof(struct colordesc));
+			if (newCd != NULL)
+				memcpy(VS(newCd), VS(cm->cdspace), cm->ncds *
+					   sizeof(struct colordesc));
+		}
+		else
+			newCd = (struct colordesc *)
+				REALLOC(cm->cd, n * sizeof(struct colordesc));
+		if (newCd == NULL)
+		{
+			CERR(REG_ESPACE);
+			return COLORLESS;
+		}
+		cm->cd = newCd;
+		cm->ncds = n;
+		assert(cm->max < cm->ncds - 1);
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	}
+
+	cd->nschrs = 0;
+	cd->nuchrs = 0;
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;		/* in case never set otherwise */
+	cd->flags = 0;
+
+	return (color) (cd - cm->cd);	/* the color number is the array index */
+}
+
+/*
+ * freecolor - free a color (must have no arcs or subcolor); WHITE is ignored
+ */
+static void
+freecolor(struct colormap *cm,
+		  color co)
+{
+	struct colordesc *cd = &cm->cd[co];
+	color		pco,
+				nco;			/* for freelist scan */
+
+	assert(co >= 0);
+	if (co == WHITE)
+		return;
+
+	assert(cd->arcs == NULL);
+	assert(cd->sub == NOSUB);
+	assert(cd->nschrs == 0);
+	assert(cd->nuchrs == 0);
+	cd->flags = FREECOL;
+
+	if ((size_t) co == cm->max)	/* freeing the highest color: shrink max */
+	{
+		while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
+			cm->max--;			/* also drop unused colors just below it */
+		assert(cm->free >= 0);
+		while ((size_t) cm->free > cm->max)	/* skip list heads now above max */
+			cm->free = cm->cd[cm->free].sub;
+		if (cm->free > 0)
+		{
+			assert(cm->free < cm->max);
+			pco = cm->free;
+			nco = cm->cd[pco].sub;
+			while (nco > 0)		/* unlink remaining entries above max */
+				if ((size_t) nco > cm->max)
+				{
+					/* take this one out of freelist */
+					nco = cm->cd[nco].sub;
+					cm->cd[pco].sub = nco;
+				}
+				else
+				{
+					assert(nco < cm->max);
+					pco = nco;
+					nco = cm->cd[pco].sub;
+				}
+		}
+	}
+	else
+	{
+		cd->sub = cm->free;		/* push onto the front of the free list */
+		cm->free = (color) (cd - cm->cd);
+	}
+}
+
+/*
+ * pseudocolor - allocate a false color, to be managed by other means
+ */
+static color
+pseudocolor(struct colormap *cm)
+{
+	color		co;
+	struct colordesc *cd;
+
+	co = newcolor(cm);
+	if (CISERR())				/* newcolor failed, or error was already set */
+		return COLORLESS;
+	cd = &cm->cd[co];
+	cd->nschrs = 0;
+	cd->nuchrs = 1;				/* pretend it is in the upper map */
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->firstchr = CHR_MIN;
+	cd->flags = PSEUDO;			/* rainbow() and colorcomplement() skip these */
+	return co;
+}
+
+/*
+ * subcolor - allocate a new subcolor (if necessary) to this chr
+ *
+ * This works only for chrs that map into the low color map.  COLORLESS on error.
+ */
+static color
+subcolor(struct colormap *cm, chr c)
+{
+	color		co;				/* current color of c */
+	color		sco;			/* new subcolor */
+
+	assert(c <= MAX_SIMPLE_CHR);
+
+	co = cm->locolormap[c - CHR_MIN];
+	sco = newsub(cm, co);
+	if (CISERR())
+		return COLORLESS;
+	assert(sco != COLORLESS);
+
+	if (co == sco)				/* already in an open subcolor */
+		return co;				/* rest is redundant */
+	cm->cd[co].nschrs--;		/* move one chr's worth of count from co to sco */
+	if (cm->cd[sco].nschrs == 0)
+		cm->cd[sco].firstchr = c;	/* c is the first simple chr given to sco */
+	cm->cd[sco].nschrs++;
+	cm->locolormap[c - CHR_MIN] = sco;
+	return sco;
+}
+
+/*
+ * subcolorhi - allocate a new subcolor (if necessary) to this colormap entry
+ *
+ * This is the same processing as subcolor(), but for entries in the high
+ * colormap, which do not necessarily correspond to exactly one chr code.
+ */
+static color
+subcolorhi(struct colormap *cm, color *pco)
+{
+	color		co;				/* current color of entry */
+	color		sco;			/* new subcolor */
+
+	co = *pco;
+	sco = newsub(cm, co);
+	if (CISERR())
+		return COLORLESS;
+	assert(sco != COLORLESS);
+
+	if (co == sco)				/* already in an open subcolor */
+		return co;				/* rest is redundant */
+	cm->cd[co].nuchrs--;		/* one fewer hicolormap entry uses co ... */
+	cm->cd[sco].nuchrs++;		/* ... and one more uses sco */
+	*pco = sco;					/* recolor the entry in place */
+	return sco;
+}
+
+/*
+ * newsub - allocate a new subcolor (if necessary) for a color; COLORLESS on error
+ */
+static color
+newsub(struct colormap *cm,
+	   color co)
+{
+	color		sco;			/* new subcolor */
+
+	sco = cm->cd[co].sub;
+	if (sco == NOSUB)
+	{							/* color has no open subcolor */
+		/* optimization: singly-referenced color need not be subcolored */
+		if ((cm->cd[co].nschrs + cm->cd[co].nuchrs) == 1)
+			return co;			/* subcolor()/subcolorhi() treat co == sco as no-op */
+		sco = newcolor(cm);		/* must create subcolor */
+		if (sco == COLORLESS)
+		{
+			assert(CISERR());
+			return COLORLESS;
+		}
+		cm->cd[co].sub = sco;	/* parent remembers its open subcolor */
+		cm->cd[sco].sub = sco;	/* open subcolor points to self */
+	}
+	assert(sco != NOSUB);
+
+	return sco;
+}
+
+/*
+ * newhicolorrow - get a new row in the hicolormap, cloning it from oldrow
+ *
+ * Returns array index of new row (0 on failure).  Note the array might move.
+ */
+static int
+newhicolorrow(struct colormap *cm,
+			  int oldrow)
+{
+	int			newrow = cm->hiarrayrows;
+	color	   *newrowptr;
+	int			i;
+
+	/* Assign a fresh array row index, enlarging storage if needed */
+	if (newrow >= cm->maxarrayrows)
+	{
+		color	   *newarray;
+
+		if (cm->maxarrayrows >= INT_MAX / (cm->hiarraycols * 2))	/* rows * 2 * cols must fit in int */
+		{
+			CERR(REG_ESPACE);
+			return 0;			/* failure: error has been reported via CERR */
+		}
+		newarray = (color *) REALLOC(cm->hicolormap,
+									 cm->maxarrayrows * 2 *
+									 cm->hiarraycols * sizeof(color));
+		if (newarray == NULL)
+		{
+			CERR(REG_ESPACE);
+			return 0;			/* failure: cm->hicolormap is left untouched */
+		}
+		cm->hicolormap = newarray;
+		cm->maxarrayrows *= 2;
+	}
+	cm->hiarrayrows++;
+
+	/* Copy old row data */
+	newrowptr = &cm->hicolormap[newrow * cm->hiarraycols];
+	memcpy(newrowptr,
+		   &cm->hicolormap[oldrow * cm->hiarraycols],
+		   cm->hiarraycols * sizeof(color));
+
+	/* Increase color reference counts to reflect new colormap entries */
+	for (i = 0; i < cm->hiarraycols; i++)
+		cm->cd[newrowptr[i]].nuchrs++;
+
+	return newrow;
+}
+
+/*
+ * newhicolorcols - create a new set of columns in the high colormap
+ *
+ * Essentially, extends the 2-D array to the right with a copy of itself.
+ */
+static void
+newhicolorcols(struct colormap *cm)
+{
+	color	   *newarray;
+	int			r,
+				c;
+
+	if (cm->hiarraycols >= INT_MAX / (cm->maxarrayrows * 2))	/* rows * cols * 2 must fit in int */
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	newarray = (color *) REALLOC(cm->hicolormap,
+								 cm->maxarrayrows *
+								 cm->hiarraycols * 2 * sizeof(color));
+	if (newarray == NULL)
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	cm->hicolormap = newarray;
+
+	/* Duplicate existing columns to the right, and increase ref counts */
+	/* Must work backwards in the array because we realloc'd in place */
+	for (r = cm->hiarrayrows - 1; r >= 0; r--)
+	{
+		color	   *oldrowptr = &newarray[r * cm->hiarraycols];	/* row r, old layout */
+		color	   *newrowptr = &newarray[r * cm->hiarraycols * 2];	/* row r, widened layout */
+		color	   *newrowptr2 = newrowptr + cm->hiarraycols;	/* its right-hand half */
+
+		for (c = 0; c < cm->hiarraycols; c++)
+		{
+			color		co = oldrowptr[c];
+
+			newrowptr[c] = newrowptr2[c] = co;
+			cm->cd[co].nuchrs++;	/* one additional entry now refers to co */
+		}
+	}
+
+	cm->hiarraycols *= 2;
+}
+
+/*
+ * subcolorcvec - allocate new subcolors to cvec members, fill in arcs
+ *
+ * For each chr "c" represented by the cvec, do the equivalent of
+ * newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
+ *
+ * Note that in typical cases, many of the subcolors are the same.
+ * While newarc() would discard duplicate arc requests, we can save
+ * some cycles by not calling it repetitively to begin with.  This is
+ * mechanized with the "lastsubcolor" state variable.
+ */
+static void
+subcolorcvec(struct vars *v,
+			 struct cvec *cv,
+			 struct state *lp,
+			 struct state *rp)
+{
+	struct colormap *cm = v->cm;
+	color		lastsubcolor = COLORLESS;
+	chr			ch,
+				from,
+				to;
+	const chr  *p;
+	int			i;
+
+	/* ordinary characters */
+	for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
+	{
+		ch = *p;
+		subcoloronechr(v, ch, lp, rp, &lastsubcolor);
+		NOERR();
+	}
+
+	/* and the ranges */
+	for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)	/* stored as (from, to) pairs */
+	{
+		from = *p;
+		to = *(p + 1);
+		if (from <= MAX_SIMPLE_CHR)
+		{
+			/* deal with simple chars one at a time */
+			chr			lim = (to <= MAX_SIMPLE_CHR) ? to : MAX_SIMPLE_CHR;
+
+			while (from <= lim)
+			{
+				color		sco = subcolor(cm, from);
+
+				NOERR();
+				if (sco != lastsubcolor)
+				{
+					newarc(v->nfa, PLAIN, sco, lp, rp);
+					NOERR();
+					lastsubcolor = sco;
+				}
+				from++;			/* loop exits with from == lim + 1 */
+			}
+		}
+		/* deal with any part of the range that's above MAX_SIMPLE_CHR */
+		if (from < to)
+			subcoloronerange(v, from, to, lp, rp, &lastsubcolor);
+		else if (from == to)
+			subcoloronechr(v, from, lp, rp, &lastsubcolor);
+		NOERR();
+	}
+
+	/* and deal with cclass if any */
+	if (cv->cclasscode >= 0)
+	{
+		int			classbit;
+		color	   *pco;
+		int			r,
+					c;
+
+		/* Enlarge array if we don't have a column bit assignment for cclass */
+		if (cm->classbits[cv->cclasscode] == 0)
+		{
+			cm->classbits[cv->cclasscode] = cm->hiarraycols;	/* pre-doubling column count is the bit */
+			newhicolorcols(cm);
+			NOERR();
+		}
+		/* Apply subcolorhi() and make arc for each entry in relevant cols */
+		classbit = cm->classbits[cv->cclasscode];
+		pco = cm->hicolormap;
+		for (r = 0; r < cm->hiarrayrows; r++)
+		{
+			for (c = 0; c < cm->hiarraycols; c++)
+			{
+				if (c & classbit)
+				{
+					color		sco = subcolorhi(cm, pco);
+
+					NOERR();
+					/* add the arc if needed */
+					if (sco != lastsubcolor)
+					{
+						newarc(v->nfa, PLAIN, sco, lp, rp);
+						NOERR();
+						lastsubcolor = sco;
+					}
+				}
+				pco++;			/* step through the array in storage order */
+			}
+		}
+	}
+}
+
+/*
+ * subcoloronechr - do subcolorcvec's work for a singleton chr
+ *
+ * We could just let subcoloronerange do this, but it's a bit more efficient
+ * if we exploit the single-chr case.  Also, callers find it useful for this
+ * to be able to handle both low and high chr codes.
+ */
+static void
+subcoloronechr(struct vars *v,
+			   chr ch,
+			   struct state *lp,
+			   struct state *rp,
+			   color *lastsubcolor)
+{
+	struct colormap *cm = v->cm;
+	colormaprange *newranges;
+	int			numnewranges;
+	colormaprange *oldrange;
+	int			oldrangen;
+	int			newrow;
+
+	/* Easy case for low chr codes */
+	if (ch <= MAX_SIMPLE_CHR)
+	{
+		color		sco = subcolor(cm, ch);
+
+		NOERR();
+		if (sco != *lastsubcolor)
+		{
+			newarc(v->nfa, PLAIN, sco, lp, rp);
+			*lastsubcolor = sco;
+		}
+		return;					/* low chrs never touch cmranges */
+	}
+
+	/*
+	 * Potentially, we could need two more colormapranges than we have now, if
+	 * the given chr is in the middle of some existing range.
+	 */
+	newranges = (colormaprange *)
+		MALLOC((cm->numcmranges + 2) * sizeof(colormaprange));
+	if (newranges == NULL)
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	numnewranges = 0;
+
+	/* Ranges before target are unchanged */
+	for (oldrange = cm->cmranges, oldrangen = 0;
+		 oldrangen < cm->numcmranges;
+		 oldrange++, oldrangen++)
+	{
+		if (oldrange->cmax >= ch)	/* first range not wholly below ch */
+			break;
+		newranges[numnewranges++] = *oldrange;
+	}
+
+	/* Match target chr against current range */
+	if (oldrangen >= cm->numcmranges || oldrange->cmin > ch)
+	{
+		/* chr does not belong to any existing range, make a new one */
+		newranges[numnewranges].cmin = ch;
+		newranges[numnewranges].cmax = ch;
+		/* row state should be cloned from the "all others" row */
+		newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+		numnewranges++;
+	}
+	else if (oldrange->cmin == oldrange->cmax)
+	{
+		/* we have an existing singleton range matching the chr */
+		newranges[numnewranges++] = *oldrange;
+		newrow = oldrange->rownum;
+		/* we've now fully processed this old range */
+		oldrange++, oldrangen++;
+	}
+	else
+	{
+		/* chr is a subset of this existing range, must split it */
+		if (ch > oldrange->cmin)
+		{
+			/* emit portion of old range before chr */
+			newranges[numnewranges].cmin = oldrange->cmin;
+			newranges[numnewranges].cmax = ch - 1;
+			newranges[numnewranges].rownum = oldrange->rownum;
+			numnewranges++;
+		}
+		/* emit chr as singleton range, initially cloning from range */
+		newranges[numnewranges].cmin = ch;
+		newranges[numnewranges].cmax = ch;
+		newranges[numnewranges].rownum = newrow =
+			newhicolorrow(cm, oldrange->rownum);
+		numnewranges++;
+		if (ch < oldrange->cmax)
+		{
+			/* emit portion of old range after chr */
+			newranges[numnewranges].cmin = ch + 1;
+			newranges[numnewranges].cmax = oldrange->cmax;
+			/* must clone the row if we are making two new ranges from old */
+			newranges[numnewranges].rownum =
+				(ch > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) :
+				oldrange->rownum;
+			numnewranges++;
+		}
+		/* we've now fully processed this old range */
+		oldrange++, oldrangen++;
+	}
+
+	/* Update colors in newrow and create arcs as needed */
+	subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+
+	/* Ranges after target are unchanged */
+	for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++)
+	{
+		newranges[numnewranges++] = *oldrange;
+	}
+
+	/* Assert our original space estimate was adequate */
+	assert(numnewranges <= (cm->numcmranges + 2));
+
+	/* And finally, store back the updated list of ranges */
+	if (cm->cmranges != NULL)
+		FREE(cm->cmranges);
+	cm->cmranges = newranges;	/* always reached once the MALLOC succeeded */
+	cm->numcmranges = numnewranges;
+}
+
+/*
+ * subcoloronerange - do subcolorcvec's work for a high range (from..to inclusive)
+ */
+static void
+subcoloronerange(struct vars *v,
+				 chr from,
+				 chr to,
+				 struct state *lp,
+				 struct state *rp,
+				 color *lastsubcolor)
+{
+	struct colormap *cm = v->cm;
+	colormaprange *newranges;
+	int			numnewranges;
+	colormaprange *oldrange;
+	int			oldrangen;
+	int			newrow;
+
+	/* Caller should take care of non-high-range cases */
+	assert(from > MAX_SIMPLE_CHR);
+	assert(from < to);
+
+	/*
+	 * Potentially, if we have N non-adjacent ranges, we could need as many as
+	 * 2N+1 result ranges (consider case where new range spans 'em all).
+	 */
+	newranges = (colormaprange *)
+		MALLOC((cm->numcmranges * 2 + 1) * sizeof(colormaprange));
+	if (newranges == NULL)
+	{
+		CERR(REG_ESPACE);
+		return;
+	}
+	numnewranges = 0;
+
+	/* Ranges before target are unchanged */
+	for (oldrange = cm->cmranges, oldrangen = 0;
+		 oldrangen < cm->numcmranges;
+		 oldrange++, oldrangen++)
+	{
+		if (oldrange->cmax >= from)	/* first range not wholly below "from" */
+			break;
+		newranges[numnewranges++] = *oldrange;
+	}
+
+	/*
+	 * Deal with ranges that (partially) overlap the target.  As we process
+	 * each such range, increase "from" to remove the dealt-with characters
+	 * from the target range.
+	 */
+	while (oldrangen < cm->numcmranges && oldrange->cmin <= to)
+	{
+		if (from < oldrange->cmin)
+		{
+			/* Handle portion of new range that corresponds to no old range */
+			newranges[numnewranges].cmin = from;
+			newranges[numnewranges].cmax = oldrange->cmin - 1;
+			/* row state should be cloned from the "all others" row */
+			newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+			numnewranges++;
+			/* Update colors in newrow and create arcs as needed */
+			subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+			/* We've now fully processed the part of new range before old */
+			from = oldrange->cmin;
+		}
+
+		if (from <= oldrange->cmin && to >= oldrange->cmax)
+		{
+			/* old range is fully contained in new, process it in-place */
+			newranges[numnewranges++] = *oldrange;
+			newrow = oldrange->rownum;
+			from = oldrange->cmax + 1;
+		}
+		else
+		{
+			/* some part of old range does not overlap new range */
+			if (from > oldrange->cmin)
+			{
+				/* emit portion of old range before new range */
+				newranges[numnewranges].cmin = oldrange->cmin;
+				newranges[numnewranges].cmax = from - 1;
+				newranges[numnewranges].rownum = oldrange->rownum;
+				numnewranges++;
+			}
+			/* emit common subrange, initially cloning from old range */
+			newranges[numnewranges].cmin = from;
+			newranges[numnewranges].cmax =
+				(to < oldrange->cmax) ? to : oldrange->cmax;
+			newranges[numnewranges].rownum = newrow =
+				newhicolorrow(cm, oldrange->rownum);
+			numnewranges++;
+			if (to < oldrange->cmax)
+			{
+				/* emit portion of old range after new range */
+				newranges[numnewranges].cmin = to + 1;
+				newranges[numnewranges].cmax = oldrange->cmax;
+				/* must clone the row if we are making two new ranges from old */
+				newranges[numnewranges].rownum =
+					(from > oldrange->cmin) ? newhicolorrow(cm, oldrange->rownum) :
+					oldrange->rownum;
+				numnewranges++;
+			}
+			from = oldrange->cmax + 1;	/* may now exceed "to", leaving nothing to do */
+		}
+		/* Update colors in newrow and create arcs as needed */
+		subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+		/* we've now fully processed this old range */
+		oldrange++, oldrangen++;
+	}
+
+	if (from <= to)
+	{
+		/* Handle portion of new range that corresponds to no old range */
+		newranges[numnewranges].cmin = from;
+		newranges[numnewranges].cmax = to;
+		/* row state should be cloned from the "all others" row */
+		newranges[numnewranges].rownum = newrow = newhicolorrow(cm, 0);
+		numnewranges++;
+		/* Update colors in newrow and create arcs as needed */
+		subcoloronerow(v, newrow, lp, rp, lastsubcolor);
+	}
+
+	/* Ranges after target are unchanged */
+	for (; oldrangen < cm->numcmranges; oldrange++, oldrangen++)
+	{
+		newranges[numnewranges++] = *oldrange;
+	}
+
+	/* Assert our original space estimate was adequate */
+	assert(numnewranges <= (cm->numcmranges * 2 + 1));
+
+	/* And finally, store back the updated list of ranges */
+	if (cm->cmranges != NULL)
+		FREE(cm->cmranges);
+	cm->cmranges = newranges;	/* always reached once the MALLOC succeeded */
+	cm->numcmranges = numnewranges;
+}
+
+/*
+ * subcoloronerow - do subcolorcvec's work for one new row in the high colormap
+ */
+static void
+subcoloronerow(struct vars *v,
+			   int rownum,
+			   struct state *lp,
+			   struct state *rp,
+			   color *lastsubcolor)
+{
+	struct colormap *cm = v->cm;
+	color	   *pco;
+	int			i;
+
+	/* Apply subcolorhi() and make arc for each entry in row */
+	pco = &cm->hicolormap[rownum * cm->hiarraycols];	/* first entry of the row */
+	for (i = 0; i < cm->hiarraycols; pco++, i++)
+	{
+		color		sco = subcolorhi(cm, pco);
+
+		NOERR();
+		/* make the arc if needed */
+		if (sco != *lastsubcolor)
+		{
+			newarc(v->nfa, PLAIN, sco, lp, rp);
+			NOERR();
+			*lastsubcolor = sco;	/* written back to the caller's variable */
+		}
+	}
+}
+
+/*
+ * okcolors - promote subcolors to full colors
+ */
+static void
+okcolors(struct nfa *nfa,
+		 struct colormap *cm)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	struct colordesc *scd;
+	struct arc *a;
+	color		co;
+	color		sco;
+
+	for (cd = cm->cd, co = 0; cd < end; cd++, co++)
+	{
+		sco = cd->sub;
+		if (UNUSEDCOLOR(cd) || sco == NOSUB)
+		{
+			/* has no subcolor, no further action */
+		}
+		else if (sco == co)
+		{
+			/* is subcolor, let parent deal with it */
+		}
+		else if (cd->nschrs == 0 && cd->nuchrs == 0)
+		{
+			/* parent empty, its arcs change color to subcolor */
+			cd->sub = NOSUB;
+			scd = &cm->cd[sco];
+			assert(scd->nschrs > 0 || scd->nuchrs > 0);
+			assert(scd->sub == sco);
+			scd->sub = NOSUB;	/* subcolor no longer points to itself */
+			while ((a = cd->arcs) != NULL)	/* each pass removes the chain head */
+			{
+				assert(a->co == co);
+				uncolorchain(cm, a);
+				a->co = sco;
+				colorchain(cm, a);
+			}
+			freecolor(cm, co);
+		}
+		else
+		{
+			/* parent's arcs must gain parallel subcolor arcs */
+			cd->sub = NOSUB;
+			scd = &cm->cd[sco];
+			assert(scd->nschrs > 0 || scd->nuchrs > 0);
+			assert(scd->sub == sco);
+			scd->sub = NOSUB;	/* subcolor no longer points to itself */
+			for (a = cd->arcs; a != NULL; a = a->colorchain)
+			{
+				assert(a->co == co);
+				newarc(nfa, a->type, sco, a->from, a->to);
+			}
+		}
+	}
+}
+
+/*
+ * colorchain - add this arc at the head of the color chain of its color
+ */
+static void
+colorchain(struct colormap *cm,
+		   struct arc *a)
+{
+	struct colordesc *cd = &cm->cd[a->co];
+
+	if (cd->arcs != NULL)
+		cd->arcs->colorchainRev = a;	/* old head gets a as its predecessor */
+	a->colorchain = cd->arcs;
+	a->colorchainRev = NULL;	/* the head has no predecessor */
+	cd->arcs = a;
+}
+
+/*
+ * uncolorchain - delete this arc from the color chain of its color
+ */
+static void
+uncolorchain(struct colormap *cm,
+			 struct arc *a)
+{
+	struct colordesc *cd = &cm->cd[a->co];
+	struct arc *aa = a->colorchainRev;	/* predecessor, or NULL if a is the head */
+
+	if (aa == NULL)
+	{
+		assert(cd->arcs == a);
+		cd->arcs = a->colorchain;	/* successor becomes the new head */
+	}
+	else
+	{
+		assert(aa->colorchain == a);
+		aa->colorchain = a->colorchain;	/* predecessor skips over a */
+	}
+	if (a->colorchain != NULL)
+		a->colorchain->colorchainRev = aa;	/* fix the successor's back link */
+	a->colorchain = NULL;		/* paranoia */
+	a->colorchainRev = NULL;
+}
+
+/*
+ * rainbow - add arcs of all full colors (but one) between specified states
+ */
+static void
+rainbow(struct nfa *nfa,
+		struct colormap *cm,
+		int type,
+		color but,				/* COLORLESS if no exceptions */
+		struct state *from,
+		struct state *to)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	color		co;
+
+	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)	/* stops early if CISERR() */
+		if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
+			!(cd->flags & PSEUDO))	/* skip unused, self-sub, "but" and pseudo colors */
+			newarc(nfa, type, co, from, to);
+}
+
+/*
+ * colorcomplement - add arcs of complementary colors
+ *
+ * The calling sequence ought to be reconciled with cloneouts().
+ */
+static void
+colorcomplement(struct nfa *nfa,
+				struct colormap *cm,
+				int type,
+				struct state *of,	/* complements of this guy's PLAIN outarcs */
+				struct state *from,
+				struct state *to)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	color		co;
+
+	assert(of != from);
+	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)	/* stops early if CISERR() */
+		if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+			if (findarc(of, PLAIN, co) == NULL)	/* "of" has no PLAIN arc of this color */
+				newarc(nfa, type, co, from, to);
+}
+
+
+#ifdef REG_DEBUG
+
+/*
+ * dumpcolors - debugging output: print the colormap's contents to f
+ */
+static void
+dumpcolors(struct colormap *cm,
+		   FILE *f)
+{
+	struct colordesc *cd;
+	struct colordesc *end;
+	color		co;
+	chr			c;
+
+	fprintf(f, "max %ld\n", (long) cm->max);
+	end = CDEND(cm);
+	for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++) /* skip 0 */
+	{
+		if (!UNUSEDCOLOR(cd))
+		{
+			assert(cd->nschrs > 0 || cd->nuchrs > 0);
+			if (cd->flags & PSEUDO)
+				fprintf(f, "#%2ld(ps): ", (long) co);
+			else
+				fprintf(f, "#%2ld(%2d): ", (long) co, cd->nschrs + cd->nuchrs);
+
+			/*
+			 * Unfortunately, it's hard to do this next bit more efficiently.
+			 */
+			for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)	/* list simple chrs of this color */
+				if (GETCOLOR(cm, c) == co)
+					dumpchr(c, f);
+			fprintf(f, "\n");
+		}
+	}
+	/* dump the high colormap if it contains anything interesting */
+	if (cm->hiarrayrows > 1 || cm->hiarraycols > 1)
+	{
+		int			r,
+					c;			/* column index; shadows the outer chr c */
+		const color *rowptr;
+
+		fprintf(f, "other:\t");	/* row zero of hicolormap comes first */
+		for (c = 0; c < cm->hiarraycols; c++)
+		{
+			fprintf(f, "\t%ld", (long) cm->hicolormap[c]);
+		}
+		fprintf(f, "\n");
+		for (r = 0; r < cm->numcmranges; r++)	/* then one line per colormaprange */
+		{
+			dumpchr(cm->cmranges[r].cmin, f);
+			fprintf(f, "..");
+			dumpchr(cm->cmranges[r].cmax, f);
+			fprintf(f, ":");
+			rowptr = &cm->hicolormap[cm->cmranges[r].rownum * cm->hiarraycols];
+			for (c = 0; c < cm->hiarraycols; c++)
+			{
+				fprintf(f, "\t%ld", (long) rowptr[c]);
+			}
+			fprintf(f, "\n");
+		}
+	}
+}
+
+/*
+ * dumpchr - print a chr
+ *
+ * Kind of char-centric but works well enough for debug use.
+ */
+static void
+dumpchr(chr c,
+		FILE *f)
+{
+	if (c == '\\')
+		fprintf(f, "\\\\");		/* a backslash is printed doubled */
+	else if (c > ' ' && c <= '~')
+		putc((char) c, f);		/* codes above space up to '~' are printed as-is */
+	else
+		fprintf(f, "\\u%04lx", (long) c);	/* all else (space included): hex escape */
+}
+
+#endif							/* REG_DEBUG */
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
new file mode 100644
index 0000000..1030621
--- /dev/null
+++ b/src/backend/regex/regc_cvec.c
@@ -0,0 +1,138 @@
+/*
+ * Utility functions for handling cvecs
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_cvec.c
+ *
+ */
+
+/*
+ * Notes:
+ * Only (selected) functions in _this_ file should treat the chr arrays
+ * of a cvec as non-constant.
+ */
+
+/*
+ * newcvec - allocate a new, empty cvec (returns NULL if MALLOC fails)
+ */
+static struct cvec *
+newcvec(int nchrs,				/* to hold this many chrs... */
+		int nranges)			/* ... and this many ranges */
+{
+	size_t		nc = (size_t) nchrs + (size_t) nranges * 2;	/* two chrs per range */
+	size_t		n = sizeof(struct cvec) + nc * sizeof(chr);
+	struct cvec *cv = (struct cvec *) MALLOC(n);
+
+	if (cv == NULL)
+		return NULL;
+	cv->chrspace = nchrs;
+	cv->chrs = (chr *) (((char *) cv) + sizeof(struct cvec));	/* right after the struct */
+	cv->ranges = cv->chrs + nchrs;	/* range endpoints follow the chrs */
+	cv->rangespace = nranges;
+	return clearcvec(cv);
+}
+
+/*
+ * clearcvec - clear a possibly-new cvec
+ * Returns pointer as convenience.
+ */
+static struct cvec *
+clearcvec(struct cvec *cv)
+{
+	assert(cv != NULL);
+	cv->nchrs = 0;				/* capacities (chrspace/rangespace) are kept */
+	cv->nranges = 0;
+	cv->cclasscode = -1;		/* subcolorcvec acts only on codes >= 0 */
+	return cv;
+}
+
+/*
+ * addchr - add a chr to a cvec (caller must have reserved enough chrspace)
+ */
+static void
+addchr(struct cvec *cv,			/* character vector */
+	   chr c)					/* character to add */
+{
+	assert(cv->nchrs < cv->chrspace);
+	cv->chrs[cv->nchrs++] = c;
+}
+
+/*
+ * addrange - add a range to a cvec (caller must have reserved rangespace)
+ */
+static void
+addrange(struct cvec *cv,		/* character vector */
+		 chr from,				/* first character of range */
+		 chr to)				/* last character of range */
+{
+	assert(cv->nranges < cv->rangespace);
+	cv->ranges[cv->nranges * 2] = from;	/* ranges[] holds (from, to) pairs */
+	cv->ranges[cv->nranges * 2 + 1] = to;
+	cv->nranges++;
+}
+
+/*
+ * getcvec - get a transient cvec, initialized to empty
+ *
+ * The returned cvec is valid only until the next call of getcvec, which
+ * typically will recycle the space.  Callers should *not* free the cvec
+ * explicitly; it will be cleaned up when the struct vars is destroyed.
+ *
+ * This is typically used while interpreting bracket expressions.  In that
+ * usage the cvec is only needed momentarily until we build arcs from it,
+ * so transientness is a convenient behavior.
+ */
+static struct cvec *
+getcvec(struct vars *v,			/* context */
+		int nchrs,				/* to hold this many chrs... */
+		int nranges)			/* ... and this many ranges */
+{
+	/* recycle existing transient cvec if large enough */
+	if (v->cv != NULL && nchrs <= v->cv->chrspace &&
+		nranges <= v->cv->rangespace)
+		return clearcvec(v->cv);
+
+	/* nope, make a new one */
+	if (v->cv != NULL)
+		freecvec(v->cv);
+	v->cv = newcvec(nchrs, nranges);
+	if (v->cv == NULL)
+		ERR(REG_ESPACE);
+
+	return v->cv;				/* NULL if newcvec failed */
+}
+
+/*
+ * freecvec - free a cvec (struct and chr storage are one allocation)
+ */
+static void
+freecvec(struct cvec *cv)
+{
+	FREE(cv);
+}
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
new file mode 100644
index 0000000..1666453
--- /dev/null
+++ b/src/backend/regex/regc_lex.c
@@ -0,0 +1,1173 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_lex.c
+ *
+ */
+
+/* scanning macros; all assume a "struct vars *v" is in scope at the use site */
+#define ATEOS()		(v->now >= v->stop)	/* no input left? */
+#define HAVE(n)		(v->stop - v->now >= (n))	/* at least n chrs left? */
+#define NEXT1(c)	(!ATEOS() && *v->now == CHR(c))	/* next chr is c? */
+#define NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
+						*(v->now+1) == CHR(b) && \
+						*(v->now+2) == CHR(c))
+#define SET(c)		(v->nexttype = (c))	/* set token type only */
+#define SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))	/* type and value */
+#define RET(c)		return (SET(c), 1)	/* set type, return 1 from caller */
+#define RETV(c, n)	return (SETV(c, n), 1)	/* set both, return 1 from caller */
+#define FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
+#define LASTTYPE(t) (v->lasttype == (t))	/* was the previous token type t? */
+
+/* lexical contexts: the values held in v->lexcon, which next() switches on */
+#define L_ERE	1				/* mainline ERE/ARE */
+#define L_BRE	2				/* mainline BRE */
+#define L_Q 3					/* REG_QUOTE */
+#define L_EBND	4				/* ERE/ARE bound */
+#define L_BBND	5				/* BRE bound */
+#define L_BRACK 6				/* brackets */
+#define L_CEL	7				/* collating element */
+#define L_ECL	8				/* equivalence class */
+#define L_CCL	9				/* character class */
+#define INTOCON(c)	(v->lexcon = (c))	/* switch to context c */
+#define INCON(con)	(v->lexcon == (con))	/* currently in context con? */
+
+/* construct pointer past end of chr array (sizeof is applied, so pass a real array) */
+#define ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - ready the lexer: apply prefixes, choose context, fetch token one
+ */
+static void
+lexstart(struct vars *v)
+{
+	prefixes(v);				/* may rewrite v->cflags or report an error */
+	NOERR();
+	/* pick the starting lexical context from the (possibly revised) flags */
+	if (v->cflags & REG_QUOTE)
+	{
+		assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
+		INTOCON(L_Q);
+	}
+	else if (!(v->cflags & REG_EXTENDED))
+	{
+		assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
+		INTOCON(L_BRE);
+	}
+	else
+	{
+		assert(!(v->cflags & REG_QUOTE));
+		INTOCON(L_ERE);
+	}
+
+	SET(EMPTY);					/* marks that no token has been read yet */
+	next(v);					/* load the first real token */
+}
+
+/*
+ * prefixes - implement various special prefixes: "***x" directors, "(?opts)"
+ */
+static void
+prefixes(struct vars *v)
+{
+	/* literal string doesn't get any of this stuff */
+	if (v->cflags & REG_QUOTE)
+		return;
+
+	/* initial "***" gets special things */
+	if (HAVE(4) && NEXT3('*', '*', '*'))
+		switch (*(v->now + 3))	/* the chr right after the "***" */
+		{
+			case CHR('?'):		/* "***?" error, msg shows version */
+				ERR(REG_BADPAT);
+				return;			/* proceed no further */
+				break;
+			case CHR('='):		/* "***=" shifts to literal string */
+				NOTE(REG_UNONPOSIX);
+				v->cflags |= REG_QUOTE;
+				v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
+				v->now += 4;	/* consume the four-chr director */
+				return;			/* and there can be no more prefixes */
+				break;
+			case CHR(':'):		/* "***:" shifts to AREs */
+				NOTE(REG_UNONPOSIX);
+				v->cflags |= REG_ADVANCED;
+				v->now += 4;	/* consume the director, then look for options */
+				break;
+			default:			/* otherwise *** is just an error */
+				ERR(REG_BADRPT);
+				return;
+				break;
+		}
+
+	/* BREs and EREs don't get embedded options */
+	if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
+		return;
+
+	/* embedded options (AREs only) */
+	if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
+	{
+		NOTE(REG_UNONPOSIX);
+		v->now += 2;			/* step over the "(?" */
+		for (; !ATEOS() && iscalpha(*v->now); v->now++)
+			switch (*v->now)	/* one option letter per iteration */
+			{
+				case CHR('b'):	/* BREs (but why???) */
+					v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
+					break;
+				case CHR('c'):	/* case sensitive */
+					v->cflags &= ~REG_ICASE;
+					break;
+				case CHR('e'):	/* plain EREs */
+					v->cflags |= REG_EXTENDED;
+					v->cflags &= ~(REG_ADVF | REG_QUOTE);
+					break;
+				case CHR('i'):	/* case insensitive */
+					v->cflags |= REG_ICASE;
+					break;
+				case CHR('m'):	/* Perloid synonym for n */
+				case CHR('n'):	/* \n affects ^ $ . [^ */
+					v->cflags |= REG_NEWLINE;
+					break;
+				case CHR('p'):	/* ~Perl, \n affects . [^ */
+					v->cflags |= REG_NLSTOP;
+					v->cflags &= ~REG_NLANCH;
+					break;
+				case CHR('q'):	/* literal string */
+					v->cflags |= REG_QUOTE;
+					v->cflags &= ~REG_ADVANCED;
+					break;
+				case CHR('s'):	/* single line, \n ordinary */
+					v->cflags &= ~REG_NEWLINE;
+					break;
+				case CHR('t'):	/* tight syntax */
+					v->cflags &= ~REG_EXPANDED;
+					break;
+				case CHR('w'):	/* weird, \n affects ^ $ only */
+					v->cflags &= ~REG_NLSTOP;
+					v->cflags |= REG_NLANCH;
+					break;
+				case CHR('x'):	/* expanded syntax */
+					v->cflags |= REG_EXPANDED;
+					break;
+				default:		/* a letter that is not a known option */
+					ERR(REG_BADOPT);
+					return;
+			}
+		if (!NEXT1(')'))		/* the option letters must be closed by ")" */
+		{
+			ERR(REG_BADOPT);
+			return;
+		}
+		v->now++;				/* step over the ")" */
+		if (v->cflags & REG_QUOTE)	/* 'q' option: these two flags are dropped */
+			v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
+	}
+}
+
+/*
+ * lexnest - divert the scanner into an interpolated chr string
+ *
+ * The outer position is parked in v->savenow/v->savestop, and next() goes
+ * back to it when the interpolated text runs out.  One level only.
+ */
+static void
+lexnest(struct vars *v, const chr *beginp, const chr *endp)
+{
+	assert(v->savenow == NULL); /* nesting is not allowed */
+	/* park the outer scan window */
+	v->savestop = v->stop;
+	v->savenow = v->now;
+	/* scan [beginp, endp) instead */
+	v->stop = endp;
+	v->now = beginp;
+}
+
+/*
+ * chr strings that next() interpolates via lexnest in place of \d, \s, \w etc.
+ */
+static const chr backd[] = {	/* \d, i.e. "[[:digit:]]" */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static const chr backD[] = {	/* \D, i.e. "[^[:digit:]]" */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static const chr brbackd[] = {	/* \d within brackets, i.e. "[:digit:]" */
+	CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']')
+};
+static const chr backs[] = {	/* \s, i.e. "[[:space:]]" */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static const chr backS[] = {	/* \S, i.e. "[^[:space:]]" */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static const chr brbacks[] = {	/* \s within brackets, i.e. "[:space:]" */
+	CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']')
+};
+static const chr backw[] = {	/* \w, i.e. "[[:alnum:]_]" */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static const chr backW[] = {	/* \W, i.e. "[^[:alnum:]_]" */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static const chr brbackw[] = {	/* \w within brackets, i.e. "[:alnum:]_" */
+	CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_')
+};
+
+/*
+ * lexword - make the scanner read a bracket expression for word characters
+ * (the backw string is used; no "word" character class is looked for)
+ */
+static void
+lexword(struct vars *v)
+{
+	lexnest(v, backw, backw + sizeof(backw) / sizeof(chr));
+}
+
+/*
+ * next - get next token into v->nexttype/v->nextvalue, guided by v->lexcon
+ */
+static int						/* 1 normal, 0 failure */
+next(struct vars *v)
+{
+	chr			c;				/* the chr currently being examined */
+
+	/* errors yield an infinite sequence of failures */
+	if (ISERR())
+		return 0;				/* the error has set nexttype to EOS */
+
+	/* remember flavor of last token */
+	v->lasttype = v->nexttype;
+
+	/* REG_BOSONLY */
+	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
+	{
+		/* at start of a REG_BOSONLY RE */
+		RETV(SBEGIN, 0);		/* same as \A */
+	}
+
+	/* if we're nested and we've hit end, return to outer level */
+	if (v->savenow != NULL && ATEOS())
+	{
+		v->now = v->savenow;
+		v->stop = v->savestop;
+		v->savenow = v->savestop = NULL;	/* marks "not nested" again */
+	}
+
+	/* skip white space etc. if appropriate (not in literal or []) */
+	if (v->cflags & REG_EXPANDED)
+		switch (v->lexcon)
+		{
+			case L_ERE:
+			case L_BRE:
+			case L_EBND:
+			case L_BBND:
+				skip(v);
+				break;
+		}
+
+	/* handle EOS, depending on context */
+	if (ATEOS())
+	{
+		switch (v->lexcon)
+		{
+			case L_ERE:
+			case L_BRE:
+			case L_Q:
+				RET(EOS);
+				break;
+			case L_EBND:
+			case L_BBND:
+				FAILW(REG_EBRACE);	/* input ended inside a bound */
+				break;
+			case L_BRACK:
+			case L_CEL:
+			case L_ECL:
+			case L_CCL:
+				FAILW(REG_EBRACK);	/* input ended inside brackets */
+				break;
+		}
+		assert(NOTREACHED);
+	}
+
+	/* okay, time to actually get a character */
+	c = *v->now++;
+
+	/* deal with the easy contexts, punt EREs to code below */
+	switch (v->lexcon)
+	{
+		case L_BRE:				/* punt BREs to separate function */
+			return brenext(v, c);
+			break;
+		case L_ERE:				/* see below */
+			break;
+		case L_Q:				/* literal strings are easy */
+			RETV(PLAIN, c);
+			break;
+		case L_BBND:			/* bounds are fairly simple */
+		case L_EBND:
+			switch (c)
+			{
+				case CHR('0'):
+				case CHR('1'):
+				case CHR('2'):
+				case CHR('3'):
+				case CHR('4'):
+				case CHR('5'):
+				case CHR('6'):
+				case CHR('7'):
+				case CHR('8'):
+				case CHR('9'):
+					RETV(DIGIT, (chr) DIGITVAL(c));
+					break;
+				case CHR(','):
+					RET(',');
+					break;
+				case CHR('}'):	/* ERE bound ends with } */
+					if (INCON(L_EBND))
+					{
+						INTOCON(L_ERE);
+						if ((v->cflags & REG_ADVF) && NEXT1('?'))
+						{
+							v->now++;	/* consume the '?' too */
+							NOTE(REG_UNONPOSIX);
+							RETV('}', 0);
+						}
+						RETV('}', 1);
+					}
+					else
+						FAILW(REG_BADBR);
+					break;
+				case CHR('\\'): /* BRE bound ends with \} */
+					if (INCON(L_BBND) && NEXT1('}'))
+					{
+						v->now++;
+						INTOCON(L_BRE);
+						RETV('}', 1);
+					}
+					else
+						FAILW(REG_BADBR);
+					break;
+				default:
+					FAILW(REG_BADBR);
+					break;
+			}
+			assert(NOTREACHED);
+			break;
+		case L_BRACK:			/* brackets are not too hard */
+			switch (c)
+			{
+				case CHR(']'):
+					if (LASTTYPE('['))	/* ']' right after '[' is a plain chr */
+						RETV(PLAIN, c);
+					else
+					{
+						INTOCON((v->cflags & REG_EXTENDED) ?
+								L_ERE : L_BRE);
+						RET(']');
+					}
+					break;
+				case CHR('\\'):
+					NOTE(REG_UBBS);
+					if (!(v->cflags & REG_ADVF))
+						RETV(PLAIN, c);
+					NOTE(REG_UNONPOSIX);
+					if (ATEOS())
+						FAILW(REG_EESCAPE);
+					(DISCARD) lexescape(v);
+					switch (v->nexttype)
+					{			/* not all escapes okay here */
+						case PLAIN:
+							return 1;
+							break;
+						case CCLASS:
+							switch (v->nextvalue)
+							{
+								case 'd':
+									lexnest(v, brbackd, ENDOF(brbackd));
+									break;
+								case 's':
+									lexnest(v, brbacks, ENDOF(brbacks));
+									break;
+								case 'w':
+									lexnest(v, brbackw, ENDOF(brbackw));
+									break;
+								default:
+									FAILW(REG_EESCAPE);
+									break;
+							}
+							/* lexnest done, back up and try again */
+							v->nexttype = v->lasttype;
+							return next(v);
+							break;
+					}
+					/* not one of the acceptable escapes */
+					FAILW(REG_EESCAPE);
+					break;
+				case CHR('-'):
+					if (LASTTYPE('[') || NEXT1(']'))	/* first or last: plain */
+						RETV(PLAIN, c);
+					else
+						RETV(RANGE, c);
+					break;
+				case CHR('['):
+					if (ATEOS())
+						FAILW(REG_EBRACK);
+					switch (*v->now++)
+					{
+						case CHR('.'):
+							INTOCON(L_CEL);
+							/* might or might not be locale-specific */
+							RET(COLLEL);
+							break;
+						case CHR('='):
+							INTOCON(L_ECL);
+							NOTE(REG_ULOCALE);
+							RET(ECLASS);
+							break;
+						case CHR(':'):
+							INTOCON(L_CCL);
+							NOTE(REG_ULOCALE);
+							RET(CCLASS);
+							break;
+						default:	/* oops */
+							v->now--;	/* un-consume the chr after '[' */
+							RETV(PLAIN, c);
+							break;
+					}
+					assert(NOTREACHED);
+					break;
+				default:
+					RETV(PLAIN, c);
+					break;
+			}
+			assert(NOTREACHED);
+			break;
+		case L_CEL:				/* collating elements are easy */
+			if (c == CHR('.') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, '.');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		case L_ECL:				/* ditto equivalence classes */
+			if (c == CHR('=') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, '=');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		case L_CCL:				/* ditto character classes */
+			if (c == CHR(':') && NEXT1(']'))
+			{
+				v->now++;
+				INTOCON(L_BRACK);
+				RETV(END, ':');
+			}
+			else
+				RETV(PLAIN, c);
+			break;
+		default:
+			assert(NOTREACHED);
+			break;
+	}
+
+	/* that got rid of everything except EREs and AREs */
+	assert(INCON(L_ERE));
+
+	/* deal with EREs and AREs, except for backslashes */
+	switch (c)
+	{
+		case CHR('|'):
+			RET('|');
+			break;
+		case CHR('*'):			/* value 0 for "*?" (REG_ADVF only), else 1 */
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('*', 0);
+			}
+			RETV('*', 1);
+			break;
+		case CHR('+'):			/* value 0 for "+?" (REG_ADVF only), else 1 */
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('+', 0);
+			}
+			RETV('+', 1);
+			break;
+		case CHR('?'):			/* value 0 for "??" (REG_ADVF only), else 1 */
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				v->now++;
+				NOTE(REG_UNONPOSIX);
+				RETV('?', 0);
+			}
+			RETV('?', 1);
+			break;
+		case CHR('{'):			/* bounds start or plain character */
+			if (v->cflags & REG_EXPANDED)
+				skip(v);
+			if (ATEOS() || !iscdigit(*v->now))
+			{
+				NOTE(REG_UBRACES);
+				NOTE(REG_UUNSPEC);
+				RETV(PLAIN, c);
+			}
+			else
+			{
+				NOTE(REG_UBOUNDS);
+				INTOCON(L_EBND);
+				RET('{');
+			}
+			assert(NOTREACHED);
+			break;
+		case CHR('('):			/* parenthesis, or advanced extension */
+			if ((v->cflags & REG_ADVF) && NEXT1('?'))
+			{
+				NOTE(REG_UNONPOSIX);
+				v->now++;
+				if (ATEOS())
+					FAILW(REG_BADRPT);
+				switch (*v->now++)
+				{
+					case CHR(':'):	/* non-capturing paren */
+						RETV('(', 0);
+						break;
+					case CHR('#'):	/* comment */
+						while (!ATEOS() && *v->now != CHR(')'))
+							v->now++;
+						if (!ATEOS())
+							v->now++;	/* step over the closing ')' */
+						assert(v->nexttype == v->lasttype);
+						return next(v);
+						break;
+					case CHR('='):	/* positive lookahead */
+						NOTE(REG_ULOOKAROUND);
+						RETV(LACON, LATYPE_AHEAD_POS);
+						break;
+					case CHR('!'):	/* negative lookahead */
+						NOTE(REG_ULOOKAROUND);
+						RETV(LACON, LATYPE_AHEAD_NEG);
+						break;
+					case CHR('<'):
+						if (ATEOS())
+							FAILW(REG_BADRPT);
+						switch (*v->now++)
+						{
+							case CHR('='):	/* positive lookbehind */
+								NOTE(REG_ULOOKAROUND);
+								RETV(LACON, LATYPE_BEHIND_POS);
+								break;
+							case CHR('!'):	/* negative lookbehind */
+								NOTE(REG_ULOOKAROUND);
+								RETV(LACON, LATYPE_BEHIND_NEG);
+								break;
+							default:
+								FAILW(REG_BADRPT);
+								break;
+						}
+						assert(NOTREACHED);
+						break;
+					default:
+						FAILW(REG_BADRPT);
+						break;
+				}
+				assert(NOTREACHED);
+			}
+			if (v->cflags & REG_NOSUB)
+				RETV('(', 0);	/* all parens non-capturing */
+			else
+				RETV('(', 1);
+			break;
+		case CHR(')'):
+			if (LASTTYPE('('))
+				NOTE(REG_UUNSPEC);
+			RETV(')', c);
+			break;
+		case CHR('['):			/* easy except for [[:<:]] and [[:>:]] */
+			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+				*(v->now + 1) == CHR(':') &&
+				(*(v->now + 2) == CHR('<') ||
+				 *(v->now + 2) == CHR('>')) &&
+				*(v->now + 3) == CHR(':') &&
+				*(v->now + 4) == CHR(']') &&
+				*(v->now + 5) == CHR(']'))
+			{
+				c = *(v->now + 2);	/* the '<' or '>' */
+				v->now += 6;
+				NOTE(REG_UNONPOSIX);
+				RET((c == CHR('<')) ? '<' : '>');
+			}
+			INTOCON(L_BRACK);
+			if (NEXT1('^'))		/* "[^": value 0; plain "[": value 1 */
+			{
+				v->now++;
+				RETV('[', 0);
+			}
+			RETV('[', 1);
+			break;
+		case CHR('.'):
+			RET('.');
+			break;
+		case CHR('^'):
+			RET('^');
+			break;
+		case CHR('$'):
+			RET('$');
+			break;
+		case CHR('\\'):			/* mostly punt backslashes to code below */
+			if (ATEOS())
+				FAILW(REG_EESCAPE);
+			break;
+		default:				/* ordinary character */
+			RETV(PLAIN, c);
+			break;
+	}
+
+	/* ERE/ARE backslash handling; backslash already eaten */
+	assert(!ATEOS());
+	if (!(v->cflags & REG_ADVF))
+	{							/* only AREs have non-trivial escapes */
+		if (iscalnum(*v->now))
+		{
+			NOTE(REG_UBSALNUM);
+			NOTE(REG_UUNSPEC);
+		}
+		RETV(PLAIN, *v->now++);
+	}
+	(DISCARD) lexescape(v);
+	if (ISERR())
+		FAILW(REG_EESCAPE);
+	if (v->nexttype == CCLASS)
+	{							/* fudge at lexical level */
+		switch (v->nextvalue)
+		{
+			case 'd':
+				lexnest(v, backd, ENDOF(backd));
+				break;
+			case 'D':
+				lexnest(v, backD, ENDOF(backD));
+				break;
+			case 's':
+				lexnest(v, backs, ENDOF(backs));
+				break;
+			case 'S':
+				lexnest(v, backS, ENDOF(backS));
+				break;
+			case 'w':
+				lexnest(v, backw, ENDOF(backw));
+				break;
+			case 'W':
+				lexnest(v, backW, ENDOF(backW));
+				break;
+			default:
+				assert(NOTREACHED);
+				FAILW(REG_ASSERT);
+				break;
+		}
+		/* lexnest done, back up and try again */
+		v->nexttype = v->lasttype;
+		return next(v);
+	}
+	/* otherwise, lexescape has already done the work */
+	return !ISERR();
+}
+
+/*
+ * lexescape - parse an ARE backslash escape (backslash already eaten)
+ * Note slightly nonstandard use of the CCLASS type code (value is d/D/s/S/w/W).
+ */
+static int						/* not actually used, but convenient for RETV */
+lexescape(struct vars *v)
+{
+	chr			c;
+	static const chr alert[] = {	/* name passed to chrnamed for \a */
+		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
+	};
+	static const chr esc[] = {	/* name passed to chrnamed for \e */
+		CHR('E'), CHR('S'), CHR('C')
+	};
+	const chr  *save;			/* position just after a leading 1-9 digit */
+
+	assert(v->cflags & REG_ADVF);
+
+	assert(!ATEOS());
+	c = *v->now++;
+	if (!iscalnum(c))			/* backslash + non-alphanumeric: that chr itself */
+		RETV(PLAIN, c);
+
+	NOTE(REG_UNONPOSIX);
+	switch (c)
+	{
+		case CHR('a'):
+			RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
+			break;
+		case CHR('A'):
+			RETV(SBEGIN, 0);
+			break;
+		case CHR('b'):
+			RETV(PLAIN, CHR('\b'));
+			break;
+		case CHR('B'):
+			RETV(PLAIN, CHR('\\'));
+			break;
+		case CHR('c'):
+			NOTE(REG_UUNPORT);
+			if (ATEOS())
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, (chr) (*v->now++ & 037));	/* low 5 bits of next chr */
+			break;
+		case CHR('d'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'd');
+			break;
+		case CHR('D'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'D');
+			break;
+		case CHR('e'):
+			NOTE(REG_UUNPORT);
+			RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
+			break;
+		case CHR('f'):
+			RETV(PLAIN, CHR('\f'));
+			break;
+		case CHR('m'):
+			RET('<');
+			break;
+		case CHR('M'):
+			RET('>');
+			break;
+		case CHR('n'):
+			RETV(PLAIN, CHR('\n'));
+			break;
+		case CHR('r'):
+			RETV(PLAIN, CHR('\r'));
+			break;
+		case CHR('s'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 's');
+			break;
+		case CHR('S'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'S');
+			break;
+		case CHR('t'):
+			RETV(PLAIN, CHR('\t'));
+			break;
+		case CHR('u'):
+			c = lexdigits(v, 16, 4, 4);	/* exactly 4 hex digits */
+			if (ISERR() || !CHR_IS_IN_RANGE(c))
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('U'):
+			c = lexdigits(v, 16, 8, 8);	/* exactly 8 hex digits */
+			if (ISERR() || !CHR_IS_IN_RANGE(c))
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('v'):
+			RETV(PLAIN, CHR('\v'));
+			break;
+		case CHR('w'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'w');
+			break;
+		case CHR('W'):
+			NOTE(REG_ULOCALE);
+			RETV(CCLASS, 'W');
+			break;
+		case CHR('x'):
+			NOTE(REG_UUNPORT);
+			c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+			if (ISERR() || !CHR_IS_IN_RANGE(c))
+				FAILW(REG_EESCAPE);
+			RETV(PLAIN, c);
+			break;
+		case CHR('y'):
+			NOTE(REG_ULOCALE);
+			RETV(WBDRY, 0);
+			break;
+		case CHR('Y'):
+			NOTE(REG_ULOCALE);
+			RETV(NWBDRY, 0);
+			break;
+		case CHR('Z'):
+			RETV(SEND, 0);
+			break;
+		case CHR('1'):
+		case CHR('2'):
+		case CHR('3'):
+		case CHR('4'):
+		case CHR('5'):
+		case CHR('6'):
+		case CHR('7'):
+		case CHR('8'):
+		case CHR('9'):
+			save = v->now;
+			v->now--;			/* put first digit back */
+			c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			/* ugly heuristic (first test is "exactly 1 digit?") */
+			if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
+			{
+				NOTE(REG_UBACKREF);
+				RETV(BACKREF, c);
+			}
+			/* oops, doesn't look like it's a backref after all... */
+			v->now = save;		/* rewind to just after the first digit */
+			/* and fall through into octal number */
+			/* FALLTHROUGH */
+		case CHR('0'):
+			NOTE(REG_UUNPORT);
+			v->now--;			/* put first digit back */
+			c = lexdigits(v, 8, 1, 3);	/* one to three octal digits */
+			if (ISERR())
+				FAILW(REG_EESCAPE);
+			if (c > 0xff)
+			{
+				/* out of range, so we handled one digit too much */
+				v->now--;
+				c >>= 3;		/* drop that last octal digit from the value */
+			}
+			RETV(PLAIN, c);
+			break;
+		default:
+			assert(iscalpha(c));
+			FAILW(REG_EESCAPE); /* unknown alphabetic escape */
+			break;
+	}
+	assert(NOTREACHED);
+}
+
+/*
+ * lexdigits - slurp up digits and return chr value
+ *
+ * At most maxlen digits of the given base are consumed; scanning stops
+ * early at end of input or at the first chr that is not a digit of that
+ * base, which is left unconsumed.  Fewer than minlen digits is reported
+ * as REG_EESCAPE via ERR.  The accumulation is unsigned and unchecked, so
+ * callers allowing many digits should range-check the result.
+ */
+static chr						/* chr value; errors signalled via ERR */
+lexdigits(struct vars *v,
+		  int base,
+		  int minlen,
+		  int maxlen)
+{
+	uchr		value = 0;		/* unsigned to avoid overflow misbehavior */
+	int			ndigits = 0;
+
+	while (ndigits < maxlen && !ATEOS())
+	{
+		int			digit;
+
+		/* classify the next chr without consuming it yet */
+		switch (*v->now)
+		{
+			case CHR('0'):
+			case CHR('1'):
+			case CHR('2'):
+			case CHR('3'):
+			case CHR('4'):
+			case CHR('5'):
+			case CHR('6'):
+			case CHR('7'):
+			case CHR('8'):
+			case CHR('9'):
+				digit = DIGITVAL(*v->now);
+				break;
+			case CHR('a'):
+			case CHR('A'):
+				digit = 10;
+				break;
+			case CHR('b'):
+			case CHR('B'):
+				digit = 11;
+				break;
+			case CHR('c'):
+			case CHR('C'):
+				digit = 12;
+				break;
+			case CHR('d'):
+			case CHR('D'):
+				digit = 13;
+				break;
+			case CHR('e'):
+			case CHR('E'):
+				digit = 14;
+				break;
+			case CHR('f'):
+			case CHR('F'):
+				digit = 15;
+				break;
+			default:
+				digit = -1;		/* not a digit at all */
+				break;
+		}
+
+		/* stop, leaving the chr in place, unless it is a digit of this base */
+		if (digit < 0 || digit >= base)
+			break;
+
+		v->now++;
+		value = value * (uchr) base + (uchr) digit;
+		ndigits++;
+	}
+
+	if (ndigits < minlen)
+		ERR(REG_EESCAPE);
+
+	return (chr) value;
+}
+
+/*
+ * brenext - get next BRE token
+ *
+ * This is much like EREs except for all the stupid backslashes and the
+ * context-dependency of some things.  Called by next() for context L_BRE.
+ */
+static int						/* 1 normal, 0 failure */
+brenext(struct vars *v,
+		chr c)					/* the chr next() has already consumed */
+{
+	switch (c)
+	{
+		case CHR('*'):			/* plain chr at RE start, after \( or after ^ */
+			if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
+				RETV(PLAIN, c);
+			RETV('*', 1);
+			break;
+		case CHR('['):			/* as in next(): [[:<:]] and [[:>:]] are special */
+			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
+				*(v->now + 1) == CHR(':') &&
+				(*(v->now + 2) == CHR('<') ||
+				 *(v->now + 2) == CHR('>')) &&
+				*(v->now + 3) == CHR(':') &&
+				*(v->now + 4) == CHR(']') &&
+				*(v->now + 5) == CHR(']'))
+			{
+				c = *(v->now + 2);	/* the '<' or '>' */
+				v->now += 6;
+				NOTE(REG_UNONPOSIX);
+				RET((c == CHR('<')) ? '<' : '>');
+			}
+			INTOCON(L_BRACK);
+			if (NEXT1('^'))		/* "[^": value 0; plain "[": value 1 */
+			{
+				v->now++;
+				RETV('[', 0);
+			}
+			RETV('[', 1);
+			break;
+		case CHR('.'):
+			RET('.');
+			break;
+		case CHR('^'):			/* a token only at RE start or after \( */
+			if (LASTTYPE(EMPTY))
+				RET('^');
+			if (LASTTYPE('('))
+			{
+				NOTE(REG_UUNSPEC);
+				RET('^');
+			}
+			RETV(PLAIN, c);
+			break;
+		case CHR('$'):			/* a token only at end of RE or before \) */
+			if (v->cflags & REG_EXPANDED)
+				skip(v);
+			if (ATEOS())
+				RET('$');
+			if (NEXT2('\\', ')'))
+			{
+				NOTE(REG_UUNSPEC);
+				RET('$');
+			}
+			RETV(PLAIN, c);
+			break;
+		case CHR('\\'):
+			break;				/* see below */
+		default:
+			RETV(PLAIN, c);
+			break;
+	}
+
+	assert(c == CHR('\\'));
+
+	if (ATEOS())
+		FAILW(REG_EESCAPE);
+
+	c = *v->now++;				/* the chr following the backslash */
+	switch (c)
+	{
+		case CHR('{'):
+			INTOCON(L_BBND);
+			NOTE(REG_UBOUNDS);
+			RET('{');
+			break;
+		case CHR('('):
+			RETV('(', 1);
+			break;
+		case CHR(')'):
+			RETV(')', c);
+			break;
+		case CHR('<'):
+			NOTE(REG_UNONPOSIX);
+			RET('<');
+			break;
+		case CHR('>'):
+			NOTE(REG_UNONPOSIX);
+			RET('>');
+			break;
+		case CHR('1'):
+		case CHR('2'):
+		case CHR('3'):
+		case CHR('4'):
+		case CHR('5'):
+		case CHR('6'):
+		case CHR('7'):
+		case CHR('8'):
+		case CHR('9'):
+			NOTE(REG_UBACKREF);
+			RETV(BACKREF, (chr) DIGITVAL(c));	/* single-digit backref only */
+			break;
+		default:				/* any other escaped chr is just that chr */
+			if (iscalnum(c))
+			{
+				NOTE(REG_UBSALNUM);
+				NOTE(REG_UUNSPEC);
+			}
+			RETV(PLAIN, c);
+			break;
+	}
+
+	assert(NOTREACHED);
+	return 0;
+}
+
+/*
+ * skip - advance past white space and '#' comments (expanded syntax only)
+ */
+static void
+skip(struct vars *v)
+{
+	const chr  *const origin = v->now;
+
+	assert(v->cflags & REG_EXPANDED);
+
+	for (;;)
+	{
+		/* eat a run of white space */
+		for (; !ATEOS() && iscspace(*v->now); v->now++)
+			continue;
+		if (!NEXT1('#'))
+			break;				/* no comment follows, so we are done */
+		/* eat the comment; its newline is left for the white-space loop */
+		for (; !ATEOS() && *v->now != CHR('\n'); v->now++)
+			continue;
+	}
+
+	if (origin != v->now)
+		NOTE(REG_UNONPOSIX);	/* something was actually skipped */
+}
+
+/*
+ * newline - return the newline chr, helping keep CHR use inside this file
+ */
+static chr
+newline(void)
+{
+	const chr	nl = CHR('\n');
+
+	return nl;
+}
+
+/*
+ * chrnamed - return the chr known by a given (chr string) name
+ *
+ * element() is run with v->err temporarily zeroed so that an error it sets
+ * can be detected; the previous v->err value is put back afterwards.
+ */
+static chr
+chrnamed(struct vars *v,
+		 const chr *startp,		/* start of name */
+		 const chr *endp,		/* just past end of name */
+		 chr lastresort)		/* what to return if name lookup fails */
+{
+	int			savederr = v->err;
+	int			lookuperr;
+	chr			c;
+	struct cvec *cv;
+
+	/* do the name lookup against a clean error slate */
+	v->err = 0;
+	c = element(v, startp, endp);
+	lookuperr = v->err;
+	v->err = savederr;
+
+	if (lookuperr == 0)
+	{
+		cv = range(v, c, c, 0);
+		if (cv->nchrs != 0)
+			return cv->chrs[0];
+	}
+	return lastresort;
+}
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
new file mode 100644
index 0000000..047abc3
--- /dev/null
+++ b/src/backend/regex/regc_locale.c
@@ -0,0 +1,766 @@
+/*
+ * regc_locale.c --
+ *
+ *	This file contains locale-specific regexp routines.
+ *	This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties.  The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ *
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ *
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ *
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license.
+ *
+ * src/backend/regex/regc_locale.c
+ */
+
+/* ASCII character-name table; several codes have more than one name */
+
+static const struct cname
+{
+	const char *name;			/* the character's name */
+	const char	code;			/* the character so named */
+}			cnames[] =
+
+{
+	{
+		"NUL", '\0'
+	},
+	{
+		"SOH", '\001'
+	},
+	{
+		"STX", '\002'
+	},
+	{
+		"ETX", '\003'
+	},
+	{
+		"EOT", '\004'
+	},
+	{
+		"ENQ", '\005'
+	},
+	{
+		"ACK", '\006'
+	},
+	{
+		"BEL", '\007'
+	},
+	{
+		"alert", '\007'
+	},
+	{
+		"BS", '\010'
+	},
+	{
+		"backspace", '\b'
+	},
+	{
+		"HT", '\011'
+	},
+	{
+		"tab", '\t'
+	},
+	{
+		"LF", '\012'
+	},
+	{
+		"newline", '\n'
+	},
+	{
+		"VT", '\013'
+	},
+	{
+		"vertical-tab", '\v'
+	},
+	{
+		"FF", '\014'
+	},
+	{
+		"form-feed", '\f'
+	},
+	{
+		"CR", '\015'
+	},
+	{
+		"carriage-return", '\r'
+	},
+	{
+		"SO", '\016'
+	},
+	{
+		"SI", '\017'
+	},
+	{
+		"DLE", '\020'
+	},
+	{
+		"DC1", '\021'
+	},
+	{
+		"DC2", '\022'
+	},
+	{
+		"DC3", '\023'
+	},
+	{
+		"DC4", '\024'
+	},
+	{
+		"NAK", '\025'
+	},
+	{
+		"SYN", '\026'
+	},
+	{
+		"ETB", '\027'
+	},
+	{
+		"CAN", '\030'
+	},
+	{
+		"EM", '\031'
+	},
+	{
+		"SUB", '\032'
+	},
+	{
+		"ESC", '\033'
+	},
+	{
+		"IS4", '\034'
+	},
+	{
+		"FS", '\034'
+	},
+	{
+		"IS3", '\035'
+	},
+	{
+		"GS", '\035'
+	},
+	{
+		"IS2", '\036'
+	},
+	{
+		"RS", '\036'
+	},
+	{
+		"IS1", '\037'
+	},
+	{
+		"US", '\037'
+	},
+	{
+		"space", ' '
+	},
+	{
+		"exclamation-mark", '!'
+	},
+	{
+		"quotation-mark", '"'
+	},
+	{
+		"number-sign", '#'
+	},
+	{
+		"dollar-sign", '$'
+	},
+	{
+		"percent-sign", '%'
+	},
+	{
+		"ampersand", '&'
+	},
+	{
+		"apostrophe", '\''
+	},
+	{
+		"left-parenthesis", '('
+	},
+	{
+		"right-parenthesis", ')'
+	},
+	{
+		"asterisk", '*'
+	},
+	{
+		"plus-sign", '+'
+	},
+	{
+		"comma", ','
+	},
+	{
+		"hyphen", '-'
+	},
+	{
+		"hyphen-minus", '-'
+	},
+	{
+		"period", '.'
+	},
+	{
+		"full-stop", '.'
+	},
+	{
+		"slash", '/'
+	},
+	{
+		"solidus", '/'
+	},
+	{
+		"zero", '0'
+	},
+	{
+		"one", '1'
+	},
+	{
+		"two", '2'
+	},
+	{
+		"three", '3'
+	},
+	{
+		"four", '4'
+	},
+	{
+		"five", '5'
+	},
+	{
+		"six", '6'
+	},
+	{
+		"seven", '7'
+	},
+	{
+		"eight", '8'
+	},
+	{
+		"nine", '9'
+	},
+	{
+		"colon", ':'
+	},
+	{
+		"semicolon", ';'
+	},
+	{
+		"less-than-sign", '<'
+	},
+	{
+		"equals-sign", '='
+	},
+	{
+		"greater-than-sign", '>'
+	},
+	{
+		"question-mark", '?'
+	},
+	{
+		"commercial-at", '@'
+	},
+	{
+		"left-square-bracket", '['
+	},
+	{
+		"backslash", '\\'
+	},
+	{
+		"reverse-solidus", '\\'
+	},
+	{
+		"right-square-bracket", ']'
+	},
+	{
+		"circumflex", '^'
+	},
+	{
+		"circumflex-accent", '^'
+	},
+	{
+		"underscore", '_'
+	},
+	{
+		"low-line", '_'
+	},
+	{
+		"grave-accent", '`'
+	},
+	{
+		"left-brace", '{'
+	},
+	{
+		"left-curly-bracket", '{'
+	},
+	{
+		"vertical-line", '|'
+	},
+	{
+		"right-brace", '}'
+	},
+	{
+		"right-curly-bracket", '}'
+	},
+	{
+		"tilde", '~'
+	},
+	{
+		"DEL", '\177'
+	},
+	{
+		NULL, 0					/* final entry has no name */
+	}
+};
+
+/*
+ * Valid character class names, NULL-terminated.  Order must match enum classes.
+ */
+static const char *const classNames[NUM_CCLASSES + 1] = {
+	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
+	"lower", "print", "punct", "space", "upper", "xdigit", NULL
+};
+
+enum classes
+{
+	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+};
+
+/*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible.  For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter.  So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
+ * element - translate a collating-element name into the chr it denotes
+ *
+ * Reports REG_ECOLLATE and returns 0 if the name is not recognized.
+ */
+static chr
+element(struct vars *v,			/* context */
+		const chr *startp,		/* points to start of name */
+		const chr *endp)		/* points just past end of name */
+{
+	const struct cname *entry = cnames;
+	size_t		namelen;
+
+	assert(startp < endp);
+	namelen = endp - startp;
+
+	/* a name made of a single chr denotes that chr itself */
+	if (namelen == 1)
+		return *startp;
+
+	NOTE(REG_ULOCALE);
+
+	/* otherwise look for an exact-length match in the cnames table */
+	while (entry->name != NULL)
+	{
+		if (strlen(entry->name) == namelen &&
+			pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
+			return CHR(entry->code);
+		entry++;
+	}
+
+	/* table exhausted without a match */
+	ERR(REG_ECOLLATE);
+	return 0;
+}
+
+/*
+ * range - supply cvec for range a..b; NULL if illegal, too big, or cancelled
+ */
+static struct cvec *
+range(struct vars *v,			/* context */
+	  chr a,					/* range start */
+	  chr b,					/* range end, might equal a */
+	  int cases)				/* case-independent? */
+{
+	int			nchrs;
+	struct cvec *cv;
+	chr			c,
+				cc;
+
+	if (a != b && !before(a, b))
+	{
+		ERR(REG_ERANGE);
+		return NULL;
+	}
+
+	if (!cases)
+	{							/* easy version: the range alone suffices */
+		cv = getcvec(v, 0, 1);
+		NOERRN();
+		addrange(cv, a, b);
+		return cv;
+	}
+
+	/*
+	 * When case-independent, it's hard to decide when cvec ranges are usable,
+	 * so for now at least, we won't try.  We use a range for the originally
+	 * specified chrs and then add on any case-equivalents that are outside
+	 * that range as individual chrs.
+	 *
+	 * To ensure sane behavior if someone specifies a very large range, limit
+	 * the allocation size to 100000 chrs (arbitrary) and check for overrun
+	 * inside the loop below (REG_ETOOBIG is reported when space runs out).
+	 */
+	nchrs = b - a + 1;
+	if (nchrs <= 0 || nchrs > 100000)
+		nchrs = 100000;
+
+	cv = getcvec(v, nchrs, 1);
+	NOERRN();
+	addrange(cv, a, b);
+
+	for (c = a; c <= b; c++)
+	{
+		cc = pg_wc_tolower(c);
+		if (cc != c &&
+			(before(cc, a) || before(b, cc)))
+		{
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, cc);
+		}
+		cc = pg_wc_toupper(c);
+		if (cc != c &&
+			(before(cc, a) || before(b, cc)))
+		{
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, cc);
+		}
+		if (CANCEL_REQUESTED(v->re))
+		{
+			ERR(REG_CANCEL);
+			return NULL;
+		}
+	}
+
+	return cv;
+}
+
+/*
+ * before - range-legality ordering test: does chr x sort before chr y?
+ */
+static int						/* 1 if x precedes y, else 0 */
+before(chr x, chr y)
+{
+	int			precedes = (x < y);
+
+	return precedes;
+}
+
+/*
+ * eclass - supply cvec for an equivalence class
+ * Must include case counterparts on request; NULL (REG_ESPACE) if no cvec.
+ */
+static struct cvec *
+eclass(struct vars *v,			/* context */
+	   chr c,					/* Collating element representing the
+								 * equivalence class. */
+	   int cases)				/* all cases? */
+{
+	int			fake = (v->cflags & REG_FAKE) && c == 'x';	/* testing hook */
+	struct cvec *cv;
+
+	/* no real classes here: a chr is equivalent only to its case variants */
+	if (cases && !fake)
+		return allcases(v, c);
+	cv = getcvec(v, fake ? 4 : 1, 0);
+	if (cv == NULL)
+	{
+		ERR(REG_ESPACE);		/* never hand addchr() a NULL cvec */
+		return NULL;
+	}
+	addchr(cv, fake ? CHR('x') : c);
+	if (fake)
+	{							/* crude fake equivalence class for testing */
+		addchr(cv, CHR('y'));
+		if (cases)
+		{
+			addchr(cv, CHR('X'));
+			addchr(cv, CHR('Y'));
+		}
+	}
+	return cv;
+}
+
+/*
+ * cclass - supply cvec for a character class
+ *
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache().  This is okay
+ * because callers are not supposed to explicitly free the result either way.
+ * Returns NULL after reporting REG_ECTYPE (unknown name) or REG_ESPACE.
+ */
+static struct cvec *
+cclass(struct vars *v,			/* context */
+	   const chr *startp,		/* where the name starts */
+	   const chr *endp,			/* just past the end of the name */
+	   int cases)				/* case-independent? */
+{
+	size_t		len;
+	struct cvec *cv = NULL;
+	const char *const *namePtr;
+	int			i,
+				index;
+
+	/* Map the name to the corresponding enumerated value. */
+	len = endp - startp;
+	index = -1;
+	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
+	{
+		if (strlen(*namePtr) == len &&
+			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
+		{
+			index = i;
+			break;
+		}
+	}
+	if (index == -1)
+	{
+		ERR(REG_ECTYPE);
+		return NULL;
+	}
+
+	/* Remap lower and upper to alpha if the match is case insensitive. */
+	if (cases &&
+		((enum classes) index == CC_LOWER ||
+		 (enum classes) index == CC_UPPER))
+		index = (int) CC_ALPHA;
+
+	/*
+	 * Now compute the character class contents.  For classes that are based
+	 * on the behavior of a <wctype.h> or <ctype.h> function, we use
+	 * pg_ctype_get_cache so that we can cache the results.  Other classes
+	 * have definitions that are hard-wired here, and for those we just
+	 * construct a transient cvec on the fly.
+	 *
+	 * NB: keep this code in sync with cclass_column_index(), below.
+	 */
+	switch ((enum classes) index)
+	{
+		case CC_PRINT:
+			cv = pg_ctype_get_cache(pg_wc_isprint, index);
+			break;
+		case CC_ALNUM:
+			cv = pg_ctype_get_cache(pg_wc_isalnum, index);
+			break;
+		case CC_ALPHA:
+			cv = pg_ctype_get_cache(pg_wc_isalpha, index);
+			break;
+		case CC_ASCII:
+			/* hard-wired meaning */
+			cv = getcvec(v, 0, 1);
+			if (cv)
+				addrange(cv, 0, 0x7f);
+			break;
+		case CC_BLANK:
+			/* hard-wired meaning; fill in only if we actually got a cvec */
+			cv = getcvec(v, 2, 0);
+			if (cv)
+			{
+				addchr(cv, '\t');
+				addchr(cv, ' ');
+			}
+			break;
+		case CC_CNTRL:
+			/* hard-wired meaning; fill in only if we actually got a cvec */
+			cv = getcvec(v, 0, 2);
+			if (cv)
+			{
+				addrange(cv, 0x0, 0x1f);
+				addrange(cv, 0x7f, 0x9f);
+			}
+			break;
+		case CC_DIGIT:
+			cv = pg_ctype_get_cache(pg_wc_isdigit, index);
+			break;
+		case CC_PUNCT:
+			cv = pg_ctype_get_cache(pg_wc_ispunct, index);
+			break;
+		case CC_XDIGIT:
+			/*
+			 * It's not clear how to define this in non-western locales, and
+			 * even less clear that there's any particular use in trying. So
+			 * just hard-wire the meaning.
+			 */
+			cv = getcvec(v, 0, 3);
+			if (cv)
+			{
+				addrange(cv, '0', '9');
+				addrange(cv, 'a', 'f');
+				addrange(cv, 'A', 'F');
+			}
+			break;
+		case CC_SPACE:
+			cv = pg_ctype_get_cache(pg_wc_isspace, index);
+			break;
+		case CC_LOWER:
+			cv = pg_ctype_get_cache(pg_wc_islower, index);
+			break;
+		case CC_UPPER:
+			cv = pg_ctype_get_cache(pg_wc_isupper, index);
+			break;
+		case CC_GRAPH:
+			cv = pg_ctype_get_cache(pg_wc_isgraph, index);
+			break;
+	}
+
+	/* If cv is NULL now, the reason must be "out of memory" */
+	if (cv == NULL)
+		ERR(REG_ESPACE);
+	return cv;
+}
+
+/*
+ * cclass_column_index - high colormap column index for chr: OR of matching classbits
+ */
+static int
+cclass_column_index(struct colormap *cm, chr c)
+{
+	int			colnum = 0;
+
+	/* Shouldn't go through all these pushups for simple chrs */
+	assert(c > MAX_SIMPLE_CHR);
+
+	/*
+	 * Note: we should not see requests to consider cclasses that are not
+	 * treated as locale-specific by cclass(), above; hence the assertions.
+	 */
+	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
+		colnum |= cm->classbits[CC_PRINT];
+	if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
+		colnum |= cm->classbits[CC_ALNUM];
+	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
+		colnum |= cm->classbits[CC_ALPHA];
+	assert(cm->classbits[CC_ASCII] == 0);
+	assert(cm->classbits[CC_BLANK] == 0);
+	assert(cm->classbits[CC_CNTRL] == 0);
+	if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
+		colnum |= cm->classbits[CC_DIGIT];
+	if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
+		colnum |= cm->classbits[CC_PUNCT];
+	assert(cm->classbits[CC_XDIGIT] == 0);
+	if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
+		colnum |= cm->classbits[CC_SPACE];
+	if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
+		colnum |= cm->classbits[CC_LOWER];
+	if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
+		colnum |= cm->classbits[CC_UPPER];
+	if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
+		colnum |= cm->classbits[CC_GRAPH];
+
+	return colnum;
+}
+
+/*
+ * allcases - supply cvec for all case counterparts of a chr (including itself)
+ * This is a shortcut, preferably an efficient one, for simple characters;
+ * messy cases are done via range().  Returns NULL (REG_ESPACE) on failure.
+ */
+static struct cvec *
+allcases(struct vars *v,		/* context */
+		 chr c)					/* character to get case equivs of */
+{
+	struct cvec *cv = getcvec(v, 2, 0);
+	chr			lc = pg_wc_tolower(c);
+	chr			uc = pg_wc_toupper(c);
+
+	if (cv == NULL)
+	{
+		ERR(REG_ESPACE);		/* never hand addchr() a NULL cvec */
+		return NULL;
+	}
+	addchr(cv, lc);
+	if (lc != uc)
+		addchr(cv, uc);
+	return cv;
+}
+
+/*
+ * cmp - compare two chr substrings of identical length
+ *
+ * Used for backrefs, so speed matters.  Only equal/unequal is reported.
+ * The length is exact: embedded NULs do not end the comparison.
+ */
+static int						/* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y, /* strings to compare */
+	size_t len)					/* exact length of comparison */
+{
+	size_t		nbytes = len * sizeof(chr);
+
+	return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - compare two chr substrings of identical length, ignoring case
+ *
+ * Used for REG_ICASE backrefs, so it should preferably be efficient.
+ * Only equal/unequal is reported; no ordering is implied by the result.
+ * The length is exact, so an embedded NUL does not end the comparison:
+ * all len positions are examined unless a mismatch is found first.
+ */
+static int						/* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y, /* strings to compare */
+		size_t len)				/* exact length of comparison */
+{
+	size_t		i;
+
+	for (i = 0; i < len; i++)
+		if (x[i] != y[i] && pg_wc_tolower(x[i]) != pg_wc_tolower(y[i]))
+			return 1;
+	return 0;
+}
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
new file mode 100644
index 0000000..92c9c4d
--- /dev/null
+++ b/src/backend/regex/regc_nfa.c
@@ -0,0 +1,3181 @@
+/*
+ * NFA utilities.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regc_nfa.c
+ *
+ *
+ * One or two things that technically ought to be in here
+ * are actually in color.c, thanks to some incestuous relationships in
+ * the color chains.
+ */
+
+#define NISERR()	VISERR(nfa->v)
+#define NERR(e)		VERR(nfa->v, (e))
+
+
+/*
+ * newnfa - allocate an NFA with pre, post, init and final states plus initial arcs
+ */
+static struct nfa *				/* the NFA, or NULL */
+newnfa(struct vars *v,
+	   struct colormap *cm,
+	   struct nfa *parent)		/* NULL if primary NFA */
+{
+	struct nfa *nfa;
+
+	nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
+	if (nfa == NULL)
+	{
+		ERR(REG_ESPACE);
+		return NULL;
+	}
+
+	nfa->states = NULL;
+	nfa->slast = NULL;
+	nfa->free = NULL;
+	nfa->nstates = 0;
+	nfa->cm = cm;
+	nfa->v = v;
+	nfa->bos[0] = nfa->bos[1] = COLORLESS;
+	nfa->eos[0] = nfa->eos[1] = COLORLESS;
+	nfa->parent = parent;		/* Precedes newfstate so parent is valid. */
+	nfa->post = newfstate(nfa, '@');	/* number 0 */
+	nfa->pre = newfstate(nfa, '>'); /* number 1 */
+
+	nfa->init = newstate(nfa);	/* may become invalid later */
+	nfa->final = newstate(nfa);
+	if (ISERR())
+	{
+		freenfa(nfa);
+		return NULL;
+	}
+	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
+	newarc(nfa, '^', 1, nfa->pre, nfa->init);
+	newarc(nfa, '^', 0, nfa->pre, nfa->init);
+	rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
+	newarc(nfa, '$', 1, nfa->final, nfa->post);
+	newarc(nfa, '$', 0, nfa->final, nfa->post);
+
+	if (ISERR())
+	{
+		freenfa(nfa);
+		return NULL;
+	}
+	return nfa;
+}
+
+/*
+ * freenfa - dispose of all of an NFA's states and then the NFA itself
+ */
+static void
+freenfa(struct nfa *nfa)
+{
+	struct state *st;
+
+	for (st = nfa->states; st != NULL; st = nfa->states)
+	{
+		st->nouts = 0;			/* arcs need no individual cleanup here */
+		st->nins = 0;
+		freestate(nfa, st);
+	}
+	for (st = nfa->free; st != NULL; st = nfa->free)
+	{
+		nfa->free = st->next;	/* unlink before the state is released */
+		destroystate(nfa, st);
+	}
+
+	nfa->post = NULL;
+	nfa->pre = NULL;
+	nfa->nstates = -1;
+	nfa->slast = NULL;
+	FREE(nfa);
+}
+
+/*
+ * newstate - get an NFA state (recycled or new), flag zero, appended to state list
+ */
+static struct state *			/* NULL on error */
+newstate(struct nfa *nfa)
+{
+	struct state *s;
+
+	/*
+	 * This is a handy place to check for operation cancel during regex
+	 * compilation, since no code path will go very long without making a new
+	 * state or arc.
+	 */
+	if (CANCEL_REQUESTED(nfa->v->re))
+	{
+		NERR(REG_CANCEL);
+		return NULL;
+	}
+
+	if (nfa->free != NULL)
+	{
+		s = nfa->free;
+		nfa->free = s->next;
+	}
+	else
+	{
+		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
+		{
+			NERR(REG_ETOOBIG);
+			return NULL;
+		}
+		s = (struct state *) MALLOC(sizeof(struct state));
+		if (s == NULL)
+		{
+			NERR(REG_ESPACE);
+			return NULL;
+		}
+		nfa->v->spaceused += sizeof(struct state);
+		s->oas.next = NULL;
+		s->free = NULL;
+		s->noas = 0;
+	}
+
+	assert(nfa->nstates >= 0);
+	s->no = nfa->nstates++;
+	s->flag = 0;
+	if (nfa->states == NULL)
+		nfa->states = s;
+	s->nins = 0;
+	s->ins = NULL;
+	s->nouts = 0;
+	s->outs = NULL;
+	s->tmp = NULL;
+	s->next = NULL;
+	if (nfa->slast != NULL)
+	{
+		assert(nfa->slast->next == NULL);
+		nfa->slast->next = s;
+	}
+	s->prev = nfa->slast;
+	nfa->slast = s;
+	return s;
+}
+
+/*
+ * newfstate - like newstate, but store the given flag value in the new state
+ */
+static struct state *			/* NULL on error */
+newfstate(struct nfa *nfa, int flag)
+{
+	struct state *fresh = newstate(nfa);
+
+	if (fresh == NULL)
+		return NULL;			/* newstate already reported the error */
+	fresh->flag = (char) flag;
+	return fresh;
+}
+
+/*
+ * dropstate - strip every in-arc and out-arc from a state, then free the state
+ */
+static void
+dropstate(struct nfa *nfa,
+		  struct state *s)
+{
+	/* freearc() unlinks the chain head, so each loop ends when it's empty */
+	while (s->ins != NULL)
+		freearc(nfa, s->ins);
+
+	while (s->outs != NULL)
+		freearc(nfa, s->outs);
+	freestate(nfa, s);
+}
+
+/*
+ * freestate - unlink an arc-less state from the state list and put it on nfa->free
+ */
+static void
+freestate(struct nfa *nfa,
+		  struct state *s)
+{
+	assert(s != NULL);
+	assert(s->nins == 0 && s->nouts == 0);
+
+	s->no = FREESTATE;
+	s->flag = 0;
+	if (s->next != NULL)
+		s->next->prev = s->prev;
+	else
+	{
+		assert(s == nfa->slast);
+		nfa->slast = s->prev;
+	}
+	if (s->prev != NULL)
+		s->prev->next = s->next;
+	else
+	{
+		assert(s == nfa->states);
+		nfa->states = s->next;
+	}
+	s->prev = NULL;
+	s->next = nfa->free;		/* don't delete it, put it on the free list */
+	nfa->free = s;
+}
+
+/*
+ * destroystate - really get rid of an already-freed state and its extra arc batches
+ */
+static void
+destroystate(struct nfa *nfa,
+			 struct state *s)
+{
+	struct arcbatch *ab;
+	struct arcbatch *abnext;
+
+	assert(s->no == FREESTATE);
+	for (ab = s->oas.next; ab != NULL; ab = abnext)
+	{
+		abnext = ab->next;
+		FREE(ab);
+		nfa->v->spaceused -= sizeof(struct arcbatch);
+	}
+	s->ins = NULL;
+	s->outs = NULL;
+	s->next = NULL;
+	FREE(s);
+	nfa->v->spaceused -= sizeof(struct state);
+}
+
+/*
+ * newarc - set up a new arc of type t and color co from "from" to "to"
+ *
+ * This function checks to make sure that no duplicate arcs are created (in
+ * general we never want duplicates); creation itself is done by createarc().
+ */
+static void
+newarc(struct nfa *nfa,
+	   int t,
+	   color co,
+	   struct state *from,
+	   struct state *to)
+{
+	struct arc *a;
+
+	assert(from != NULL && to != NULL);
+
+	/*
+	 * This is a handy place to check for operation cancel during regex
+	 * compilation, since no code path will go very long without making a new
+	 * state or arc.
+	 */
+	if (CANCEL_REQUESTED(nfa->v->re))
+	{
+		NERR(REG_CANCEL);
+		return;
+	}
+
+	/* check for duplicate arc, using whichever chain is shorter */
+	if (from->nouts <= to->nins)
+	{
+		for (a = from->outs; a != NULL; a = a->outchain)
+			if (a->to == to && a->co == co && a->type == t)
+				return;
+	}
+	else
+	{
+		for (a = to->ins; a != NULL; a = a->inchain)
+			if (a->from == from && a->co == co && a->type == t)
+				return;
+	}
+
+	/* no dup, so create the arc */
+	createarc(nfa, t, co, from, to);
+}
+
+/*
+ * createarc - create a new arc within an NFA and link it into both chains
+ *
+ * This function must *only* be used after verifying that there is no existing
+ * identical arc (same type/color/from/to).  No-op if allocarc() reports error.
+ */
+static void
+createarc(struct nfa *nfa,
+		  int t,
+		  color co,
+		  struct state *from,
+		  struct state *to)
+{
+	struct arc *a;
+
+	/* the arc is physically allocated within its from-state */
+	a = allocarc(nfa, from);
+	if (NISERR())
+		return;
+	assert(a != NULL);
+
+	a->type = t;
+	a->co = co;
+	a->to = to;
+	a->from = from;
+
+	/*
+	 * Put the new arc on the beginning, not the end, of the chains; it's
+	 * simpler here, and freearc() is the same cost either way.  See also the
+	 * logic in moveins() and its cohorts, as well as fixempties().
+	 */
+	a->inchain = to->ins;
+	a->inchainRev = NULL;
+	if (to->ins)
+		to->ins->inchainRev = a;
+	to->ins = a;
+	a->outchain = from->outs;
+	a->outchainRev = NULL;
+	if (from->outs)
+		from->outs->outchainRev = a;
+	from->outs = a;
+
+	from->nouts++;
+	to->nins++;
+
+	if (COLORED(a) && nfa->parent == NULL)
+		colorchain(nfa->cm, a);
+}
+
+/*
+ * allocarc - allocate a new out-arc within a state (fields other than type unset)
+ */
+static struct arc *				/* NULL for failure */
+allocarc(struct nfa *nfa,
+		 struct state *s)
+{
+	struct arc *a;
+
+	/* shortcut: take the next never-used slot of the state's own batch */
+	if (s->free == NULL && s->noas < ABSIZE)
+	{
+		a = &s->oas.a[s->noas];
+		s->noas++;
+		return a;
+	}
+
+	/* if none at hand, get more: a new batch, threaded onto the free list */
+	if (s->free == NULL)
+	{
+		struct arcbatch *newAb;
+		int			i;
+
+		if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
+		{
+			NERR(REG_ETOOBIG);
+			return NULL;
+		}
+		newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
+		if (newAb == NULL)
+		{
+			NERR(REG_ESPACE);
+			return NULL;
+		}
+		nfa->v->spaceused += sizeof(struct arcbatch);
+		newAb->next = s->oas.next;
+		s->oas.next = newAb;
+
+		for (i = 0; i < ABSIZE; i++)
+		{
+			newAb->a[i].type = 0;
+			newAb->a[i].freechain = &newAb->a[i + 1];
+		}
+		newAb->a[ABSIZE - 1].freechain = NULL;
+		s->free = &newAb->a[0];
+	}
+	assert(s->free != NULL);
+
+	a = s->free;
+	s->free = a->freechain;
+	return a;
+}
+
+/*
+ * freearc - unlink an arc from its color, out- and in-chains, then recycle it
+ */
+static void
+freearc(struct nfa *nfa,
+		struct arc *victim)
+{
+	struct state *from = victim->from;
+	struct state *to = victim->to;
+	struct arc *predecessor;
+
+	assert(victim->type != 0);
+
+	/* take it off color chain if necessary */
+	if (COLORED(victim) && nfa->parent == NULL)
+		uncolorchain(nfa->cm, victim);
+
+	/* take it off source's out-chain */
+	assert(from != NULL);
+	predecessor = victim->outchainRev;
+	if (predecessor == NULL)
+	{
+		assert(from->outs == victim);
+		from->outs = victim->outchain;
+	}
+	else
+	{
+		assert(predecessor->outchain == victim);
+		predecessor->outchain = victim->outchain;
+	}
+	if (victim->outchain != NULL)
+	{
+		assert(victim->outchain->outchainRev == victim);
+		victim->outchain->outchainRev = predecessor;
+	}
+	from->nouts--;
+
+	/* take it off target's in-chain */
+	assert(to != NULL);
+	predecessor = victim->inchainRev;
+	if (predecessor == NULL)
+	{
+		assert(to->ins == victim);
+		to->ins = victim->inchain;
+	}
+	else
+	{
+		assert(predecessor->inchain == victim);
+		predecessor->inchain = victim->inchain;
+	}
+	if (victim->inchain != NULL)
+	{
+		assert(victim->inchain->inchainRev == victim);
+		victim->inchain->inchainRev = predecessor;
+	}
+	to->nins--;
+
+	/* clean up and place on from-state's free list */
+	victim->type = 0;
+	victim->from = NULL;		/* precautions... */
+	victim->to = NULL;
+	victim->inchain = NULL;
+	victim->inchainRev = NULL;
+	victim->outchain = NULL;
+	victim->outchainRev = NULL;
+	victim->freechain = from->free;
+	from->free = victim;
+}
+
+/*
+ * changearctarget - relink an arc so that it points to a different to state
+ *
+ * Caller must have verified that there is no pre-existing duplicate arc.
+ * Only the in-chains and nins counts of the old and new targets are touched.
+ * Note that because we store arcs in their from state, we can't easily have
+ * a similar changearcsource function.
+ */
+static void
+changearctarget(struct arc *a, struct state *newto)
+{
+	struct state *oldto = a->to;
+	struct arc *predecessor;
+
+	assert(oldto != newto);
+
+	/* take it off old target's in-chain */
+	assert(oldto != NULL);
+	predecessor = a->inchainRev;
+	if (predecessor == NULL)
+	{
+		assert(oldto->ins == a);
+		oldto->ins = a->inchain;
+	}
+	else
+	{
+		assert(predecessor->inchain == a);
+		predecessor->inchain = a->inchain;
+	}
+	if (a->inchain != NULL)
+	{
+		assert(a->inchain->inchainRev == a);
+		a->inchain->inchainRev = predecessor;
+	}
+	oldto->nins--;
+
+	a->to = newto;
+
+	/* prepend it to new target's in-chain */
+	a->inchain = newto->ins;
+	a->inchainRev = NULL;
+	if (newto->ins)
+		newto->ins->inchainRev = a;
+	newto->ins = a;
+	newto->nins++;
+}
+
+/*
+ * hasnonemptyout - report whether any out-arc of the state is not of type EMPTY
+ */
+static int
+hasnonemptyout(struct state *s)
+{
+	struct arc *out = s->outs;
+
+	/* skip over leading EMPTY arcs; anything left over is a non-EMPTY one */
+	while (out != NULL && out->type == EMPTY)
+		out = out->outchain;
+	if (out == NULL)
+		return 0;
+	return 1;
+}
+
+/*
+ * findarc - look up an out-arc of the given state by type and color
+ * Returns NULL if there is none.  If several arcs qualify, callers should
+ * not rely on which one they get (it is the first found on the out-chain).
+ */
+static struct arc *
+findarc(struct state *s,
+		int type,
+		color co)
+{
+	struct arc *cand = s->outs;
+
+	while (cand != NULL && !(cand->type == type && cand->co == co))
+		cand = cand->outchain;
+	return cand;
+}
+
+/*
+ * cparc - make an arc from "from" to "to" with oa's type and color, via newarc()
+ */
+static void
+cparc(struct nfa *nfa,
+	  struct arc *oa,
+	  struct state *from,
+	  struct state *to)
+{
+	newarc(nfa, oa->type, oa->co, from, to);
+}
+
+/*
+ * sortins - sort the in arcs of a state by from/color/type (REG_ESPACE on failure)
+ */
+static void
+sortins(struct nfa *nfa,
+		struct state *s)
+{
+	struct arc **sortarray;
+	struct arc *a;
+	int			n = s->nins;
+	int			i;
+
+	if (n <= 1)
+		return;					/* nothing to do */
+	/* make an array of arc pointers ... */
+	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
+	if (sortarray == NULL)
+	{
+		NERR(REG_ESPACE);
+		return;
+	}
+	i = 0;
+	for (a = s->ins; a != NULL; a = a->inchain)
+		sortarray[i++] = a;
+	assert(i == n);
+	/* ... sort the array */
+	qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
+	/* ... and rebuild arc list in order */
+	/* it seems worth special-casing first and last items to simplify loop */
+	a = sortarray[0];
+	s->ins = a;
+	a->inchain = sortarray[1];
+	a->inchainRev = NULL;
+	for (i = 1; i < n - 1; i++)
+	{
+		a = sortarray[i];
+		a->inchain = sortarray[i + 1];
+		a->inchainRev = sortarray[i - 1];
+	}
+	a = sortarray[i];
+	a->inchain = NULL;
+	a->inchainRev = sortarray[i - 1];
+	FREE(sortarray);
+}
+
+static int
+sortins_cmp(const void *a, const void *b)
+{
+	const struct arc *aa = *((const struct arc *const *) a);
+	const struct arc *bb = *((const struct arc *const *) b);
+
+	/* qsort comparator on arc pointers; fields most likely to differ go first */
+	if (aa->from->no < bb->from->no)
+		return -1;
+	if (aa->from->no > bb->from->no)
+		return 1;
+	if (aa->co < bb->co)
+		return -1;
+	if (aa->co > bb->co)
+		return 1;
+	if (aa->type < bb->type)
+		return -1;
+	if (aa->type > bb->type)
+		return 1;
+	return 0;
+}
+
+/*
+ * sortouts - sort the out arcs of a state by to/color/type (REG_ESPACE on failure)
+ */
+static void
+sortouts(struct nfa *nfa,
+		 struct state *s)
+{
+	struct arc **sortarray;
+	struct arc *a;
+	int			n = s->nouts;
+	int			i;
+
+	if (n <= 1)
+		return;					/* nothing to do */
+	/* make an array of arc pointers ... */
+	sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
+	if (sortarray == NULL)
+	{
+		NERR(REG_ESPACE);
+		return;
+	}
+	i = 0;
+	for (a = s->outs; a != NULL; a = a->outchain)
+		sortarray[i++] = a;
+	assert(i == n);
+	/* ... sort the array */
+	qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
+	/* ... and rebuild arc list in order */
+	/* it seems worth special-casing first and last items to simplify loop */
+	a = sortarray[0];
+	s->outs = a;
+	a->outchain = sortarray[1];
+	a->outchainRev = NULL;
+	for (i = 1; i < n - 1; i++)
+	{
+		a = sortarray[i];
+		a->outchain = sortarray[i + 1];
+		a->outchainRev = sortarray[i - 1];
+	}
+	a = sortarray[i];
+	a->outchain = NULL;
+	a->outchainRev = sortarray[i - 1];
+	FREE(sortarray);
+}
+
+static int
+sortouts_cmp(const void *a, const void *b)
+{
+	const struct arc *aa = *((const struct arc *const *) a);
+	const struct arc *bb = *((const struct arc *const *) b);
+
+	/* qsort comparator on arc pointers; fields most likely to differ go first */
+	if (aa->to->no < bb->to->no)
+		return -1;
+	if (aa->to->no > bb->to->no)
+		return 1;
+	if (aa->co < bb->co)
+		return -1;
+	if (aa->co > bb->co)
+		return 1;
+	if (aa->type < bb->type)
+		return -1;
+	if (aa->type > bb->type)
+		return 1;
+	return 0;
+}
+
+/*
+ * Common decision logic about whether to use arc-by-arc operations or
+ * sort/merge.  If there's just a few source arcs (fewer than 4) we cannot
+ * recoup the cost of sorting the destination arc list, no matter how large
+ * it is.  Otherwise, limit the number of arc-by-arc comparisons to about
+ * 1000, i.e. 32 * 32 (a somewhat arbitrary choice, but the breakeven point
+ * would probably be machine dependent anyway).
+ */
+#define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
+	((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
+
+/*
+ * moveins - move all in arcs of a state to another state
+ *
+ * You might think this could be done better by just updating the
+ * existing arcs, and you would be right if it weren't for the need
+ * for duplicate suppression, which makes it easier to just make new
+ * ones to exploit the suppression built into newarc.
+ *
+ * However, if we have a whole lot of arcs to deal with, retail duplicate
+ * checks become too slow.  In that case we proceed by sorting and merging
+ * the arc lists, and then we can indeed just update the arcs in-place.
+ * The choice between the two methods is made by BULK_ARC_OP_USE_SORT.
+ */
+static void
+moveins(struct nfa *nfa,
+		struct state *oldState,
+		struct state *newState)
+{
+	assert(oldState != newState);
+
+	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+	{
+		/* With not too many arcs, just do them one at a time */
+		struct arc *a;
+
+		while ((a = oldState->ins) != NULL)
+		{
+			cparc(nfa, a, a->from, newState);
+			freearc(nfa, a);
+		}
+	}
+	else
+	{
+		/*
+		 * With many arcs, use a sort-merge approach.  Note changearctarget()
+		 * will put the arc onto the front of newState's chain, so it does not
+		 * break our walk through the sorted part of the chain.
+		 */
+		struct arc *oa;
+		struct arc *na;
+
+		/*
+		 * Because we bypass newarc() in this code path, we'd better include a
+		 * cancel check.
+		 */
+		if (CANCEL_REQUESTED(nfa->v->re))
+		{
+			NERR(REG_CANCEL);
+			return;
+		}
+
+		sortins(nfa, oldState);
+		sortins(nfa, newState);
+		if (NISERR())
+			return;				/* might have failed to sort */
+		oa = oldState->ins;
+		na = newState->ins;
+		while (oa != NULL && na != NULL)
+		{
+			struct arc *a = oa;
+
+			switch (sortins_cmp(&oa, &na))
+			{
+				case -1:
+					/* newState does not have anything matching oa */
+					oa = oa->inchain;
+
+					/*
+					 * Rather than doing createarc+freearc, we can just unlink
+					 * and relink the existing arc struct.
+					 */
+					changearctarget(a, newState);
+					break;
+				case 0:
+					/* match, advance in both lists */
+					oa = oa->inchain;
+					na = na->inchain;
+					/* ... and drop duplicate arc from oldState */
+					freearc(nfa, a);
+					break;
+				case +1:
+					/* advance only na; oa might have a match later */
+					na = na->inchain;
+					break;
+				default:
+					assert(NOTREACHED);
+			}
+		}
+		while (oa != NULL)
+		{
+			/* newState does not have anything matching oa */
+			struct arc *a = oa;
+
+			oa = oa->inchain;
+			changearctarget(a, newState);
+		}
+	}
+
+	assert(oldState->nins == 0);
+	assert(oldState->ins == NULL);
+}
+
+/*
+ * copyins - copy in arcs of a state to another state, skipping ones it already has
+ */
+static void
+copyins(struct nfa *nfa,
+		struct state *oldState,
+		struct state *newState)
+{
+	assert(oldState != newState);
+
+	if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+	{
+		/* With not too many arcs, just do them one at a time */
+		struct arc *a;
+
+		for (a = oldState->ins; a != NULL; a = a->inchain)
+			cparc(nfa, a, a->from, newState);
+	}
+	else
+	{
+		/*
+		 * With many arcs, use a sort-merge approach.  Note that createarc()
+		 * will put new arcs onto the front of newState's chain, so it does
+		 * not break our walk through the sorted part of the chain.
+		 */
+		struct arc *oa;
+		struct arc *na;
+
+		/*
+		 * Because we bypass newarc() in this code path, we'd better include a
+		 * cancel check.
+		 */
+		if (CANCEL_REQUESTED(nfa->v->re))
+		{
+			NERR(REG_CANCEL);
+			return;
+		}
+
+		sortins(nfa, oldState);
+		sortins(nfa, newState);
+		if (NISERR())
+			return;				/* might have failed to sort */
+		oa = oldState->ins;
+		na = newState->ins;
+		while (oa != NULL && na != NULL)
+		{
+			struct arc *a = oa;
+
+			switch (sortins_cmp(&oa, &na))
+			{
+				case -1:
+					/* newState does not have anything matching oa */
+					oa = oa->inchain;
+					createarc(nfa, a->type, a->co, a->from, newState);
+					break;
+				case 0:
+					/* match, advance in both lists */
+					oa = oa->inchain;
+					na = na->inchain;
+					break;
+				case +1:
+					/* advance only na; oa might have a match later */
+					na = na->inchain;
+					break;
+				default:
+					assert(NOTREACHED);
+			}
+		}
+		while (oa != NULL)
+		{
+			/* newState does not have anything matching oa */
+			struct arc *a = oa;
+
+			oa = oa->inchain;
+			createarc(nfa, a->type, a->co, a->from, newState);
+		}
+	}
+}
+
+/*
+ * mergeins - merge an array of candidate inarcs into a state
+ *
+ * Similar in spirit to copyins, except that the arcs to be copied are given
+ * as an array of pointers, which may contain duplicates.  The array is
+ * sorted and compacted in place, so the caller must not rely on its contents
+ * afterwards.
+ */
+static void
+mergeins(struct nfa *nfa,
+		 struct state *s,
+		 struct arc **arcarray,
+		 int arccount)
+{
+	struct arc *existing;
+	int			last;
+	int			scan;
+	int			pos;
+
+	if (arccount <= 0)
+		return;
+
+	/* We do not go through newarc() here, so poll for cancel explicitly */
+	if (CANCEL_REQUESTED(nfa->v->re))
+	{
+		NERR(REG_CANCEL);
+		return;
+	}
+
+	/* Put the state's current inarcs in sorted order ... */
+	sortins(nfa, s);
+	if (NISERR())
+		return;					/* sorting may have failed */
+
+	/* ... and likewise the proposed additions */
+	qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
+
+	/*
+	 * Squeeze duplicates out of the sorted array.  (Doing this inside the
+	 * merge loop below would be possible, but is not worth the complexity.)
+	 */
+	last = 0;
+	for (scan = 1; scan < arccount; scan++)
+	{
+		int			cmp = sortins_cmp(&arcarray[last], &arcarray[scan]);
+
+		if (cmp == -1)
+		{
+			/* distinct from the last kept entry: keep it too */
+			last++;
+			arcarray[last] = arcarray[scan];
+		}
+		else if (cmp != 0)
+		{
+			/* array was supposed to be sorted */
+			assert(NOTREACHED);
+		}
+		/* cmp == 0: duplicate, drop it */
+	}
+	arccount = last + 1;
+
+	/*
+	 * Merge into s's inchain.  createarc() adds at the head of the chain, so
+	 * the sorted tail we are stepping through stays intact.
+	 */
+	pos = 0;
+	existing = s->ins;
+	while (pos < arccount && existing != NULL)
+	{
+		struct arc *cand = arcarray[pos];
+		int			cmp = sortins_cmp(&cand, &existing);
+
+		if (cmp == 0)
+		{
+			/* already present; move on in both sequences */
+			pos++;
+			existing = existing->inchain;
+		}
+		else if (cmp == -1)
+		{
+			/* s has no such arc yet */
+			createarc(nfa, cand->type, cand->co, cand->from, s);
+			pos++;
+		}
+		else if (cmp == +1)
+		{
+			/* a match for cand may appear later in s's chain */
+			existing = existing->inchain;
+		}
+		else
+		{
+			assert(NOTREACHED);
+		}
+	}
+
+	/* Remaining candidates sort after everything s had */
+	for (; pos < arccount; pos++)
+	{
+		struct arc *cand = arcarray[pos];
+
+		createarc(nfa, cand->type, cand->co, cand->from, s);
+	}
+}
+
+/*
+ * moveouts - transfer every out-arc of oldState to newState
+ *
+ * On normal completion oldState has no out-arcs left.  If a cancel request
+ * or sort failure is detected in the bulk path, we return early with the
+ * error recorded and oldState's arcs still in place.
+ */
+static void
+moveouts(struct nfa *nfa,
+		 struct state *oldState,
+		 struct state *newState)
+{
+	struct arc *src;
+	struct arc *dst;
+
+	assert(oldState != newState);
+
+	if (BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+	{
+		/*
+		 * Many arcs: sort both chains and merge.  createarc() inserts at the
+		 * head of newState's chain, leaving the sorted portion we are walking
+		 * undisturbed.
+		 *
+		 * newarc() is bypassed on this path, so check for cancel here.
+		 */
+		if (CANCEL_REQUESTED(nfa->v->re))
+		{
+			NERR(REG_CANCEL);
+			return;
+		}
+
+		sortouts(nfa, oldState);
+		sortouts(nfa, newState);
+		if (NISERR())
+			return;				/* sorting may have failed */
+
+		src = oldState->outs;
+		dst = newState->outs;
+		while (src != NULL && dst != NULL)
+		{
+			struct arc *cur = src;
+			int			cmp = sortouts_cmp(&src, &dst);
+
+			if (cmp == +1)
+			{
+				/* a match for src may still come later in newState */
+				dst = dst->outchain;
+			}
+			else if (cmp == 0 || cmp == -1)
+			{
+				/* either way cur leaves oldState; step off it first */
+				src = src->outchain;
+				if (cmp == 0)
+				{
+					/* newState already has an equal arc: just drop cur */
+					dst = dst->outchain;
+				}
+				else
+				{
+					/* newState lacks it: create the replacement */
+					createarc(nfa, cur->type, cur->co, newState, cur->to);
+				}
+				freearc(nfa, cur);
+			}
+			else
+			{
+				assert(NOTREACHED);
+			}
+		}
+
+		/* Leftover oldState arcs have no counterpart in newState */
+		while (src != NULL)
+		{
+			struct arc *cur = src;
+
+			src = src->outchain;
+			createarc(nfa, cur->type, cur->co, newState, cur->to);
+			freearc(nfa, cur);
+		}
+	}
+	else
+	{
+		/* Few arcs: move them one by one off the head of the chain */
+		for (;;)
+		{
+			src = oldState->outs;
+			if (src == NULL)
+				break;
+			cparc(nfa, src, newState, src->to);
+			freearc(nfa, src);
+		}
+	}
+
+	assert(oldState->nouts == 0);
+	assert(oldState->outs == NULL);
+}
+
+/*
+ * copyouts - give newState a copy of every out-arc of oldState
+ *
+ * oldState itself is left alone.  In the bulk (sort-merge) path, an arc of
+ * oldState that compares equal under sortouts_cmp() to one newState already
+ * owns is skipped rather than duplicated.
+ */
+static void
+copyouts(struct nfa *nfa,
+		 struct state *oldState,
+		 struct state *newState)
+{
+	struct arc *src;
+	struct arc *dst;
+
+	assert(oldState != newState);
+
+	if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+	{
+		/* Few arcs: cheapest to copy them individually */
+		src = oldState->outs;
+		while (src != NULL)
+		{
+			cparc(nfa, src, newState, src->to);
+			src = src->outchain;
+		}
+		return;
+	}
+
+	/*
+	 * Many arcs: sort both chains and merge.  createarc() pushes new arcs
+	 * onto the head of newState's chain, so the already-sorted remainder we
+	 * are walking is not disturbed.
+	 *
+	 * This path does not go through newarc(), so check for a cancel request
+	 * ourselves.
+	 */
+	if (CANCEL_REQUESTED(nfa->v->re))
+	{
+		NERR(REG_CANCEL);
+		return;
+	}
+
+	sortouts(nfa, oldState);
+	sortouts(nfa, newState);
+	if (NISERR())
+		return;					/* sorting may have failed */
+
+	src = oldState->outs;
+	dst = newState->outs;
+	while (src != NULL && dst != NULL)
+	{
+		int			cmp = sortouts_cmp(&src, &dst);
+
+		if (cmp == 0)
+		{
+			/* newState already has this arc; step past it on both sides */
+			src = src->outchain;
+			dst = dst->outchain;
+		}
+		else if (cmp == -1)
+		{
+			/* nothing in newState matches; add a copy */
+			struct arc *missing = src;
+
+			src = src->outchain;
+			createarc(nfa, missing->type, missing->co, newState, missing->to);
+		}
+		else if (cmp == +1)
+		{
+			/* a match for src may still lie further along newState's chain */
+			dst = dst->outchain;
+		}
+		else
+		{
+			assert(NOTREACHED);
+		}
+	}
+
+	/* Whatever remains of oldState's chain has no counterpart in newState */
+	while (src != NULL)
+	{
+		struct arc *missing = src;
+
+		src = src->outchain;
+		createarc(nfa, missing->type, missing->co, newState, missing->to);
+	}
+}
+
+/*
+ * cloneouts - replicate a state's out-arcs between another pair of states
+ *
+ * For every out-arc of "old", request an arc from "from" to "to" that keeps
+ * the original arc's color but carries the caller-supplied type.
+ */
+static void
+cloneouts(struct nfa *nfa,
+		  struct state *old,
+		  struct state *from,
+		  struct state *to,
+		  int type)
+{
+	struct arc *cur;
+
+	assert(old != from);
+
+	cur = old->outs;
+	while (cur != NULL)
+	{
+		newarc(nfa, type, cur->co, from, to);
+		cur = cur->outchain;
+	}
+}
+
+/*
+ * delsub - delete a sub-NFA, updating subre pointers if necessary
+ *
+ * This uses a recursive traversal of the sub-NFA, marking already-seen
+ * states using their tmp pointer.
+ *
+ * lp and rp themselves are expected to survive (see the asserts below); on
+ * success lp ends up with no out-arcs and rp with no in-arcs.
+ *
+ * If the traversal reports an error, we return at once without clearing
+ * the tmp mark placed on rp (or any marks deltraverse left set).
+ */
+static void
+delsub(struct nfa *nfa,
+	   struct state *lp,		/* the sub-NFA goes from here... */
+	   struct state *rp)		/* ...to here, *not* inclusive */
+{
+	assert(lp != rp);
+
+	/* a non-NULL tmp makes deltraverse treat rp as "already in progress" */
+	rp->tmp = rp;				/* mark end */
+
+	deltraverse(nfa, lp, lp);
+	if (NISERR())
+		return;					/* asserts might not hold after failure */
+	assert(lp->nouts == 0 && rp->nins == 0);	/* did the job */
+	assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
+
+	rp->tmp = NULL;				/* unmark end */
+	lp->tmp = NULL;				/* and begin, marked by deltraverse */
+}
+
+/*
+ * deltraverse - recursive worker for delsub
+ *
+ * Destroys every out-arc of state s, first recursing into each arc's target
+ * and freeing any target that ends up with no in-arcs and no tmp mark.
+ */
+static void
+deltraverse(struct nfa *nfa,
+			struct state *leftend,
+			struct state *s)
+{
+	/* Recursion depth is input-dependent, so guard the stack */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* Nothing to do for arc-less states, or ones already being processed */
+	if (s->nouts == 0 || s->tmp != NULL)
+		return;
+
+	s->tmp = s;					/* flag s as being worked on */
+
+	for (;;)
+	{
+		struct arc *victim = s->outs;
+		struct state *dest;
+
+		if (victim == NULL)
+			break;
+
+		dest = victim->to;
+		deltraverse(nfa, leftend, dest);
+		if (NISERR())
+			return;				/* invariants below may not hold on error */
+		assert(dest->nouts == 0 || dest->tmp != NULL);
+		freearc(nfa, victim);
+		if (dest->nins == 0 && dest->tmp == NULL)
+		{
+			assert(dest->nouts == 0);
+			freestate(nfa, dest);
+		}
+	}
+
+	assert(s->no != FREESTATE); /* s itself must not have been freed */
+	assert(s == leftend || s->nins != 0);	/* and must remain reachable */
+	assert(s->nouts == 0);		/* with all its out-arcs gone */
+
+	s->tmp = NULL;				/* finished with s */
+}
+
+/*
+ * dupnfa - duplicate sub-NFA
+ *
+ * The copy of the sub-NFA running from "start" to "stop" is strung between
+ * "from" and "to".  During the recursive walk each original state's tmp
+ * field points at its duplicate, which doubles as the "already visited"
+ * marker; the tmp fields are cleared again before returning.
+ */
+static void
+dupnfa(struct nfa *nfa,
+	   struct state *start,		/* duplicate of subNFA starting here */
+	   struct state *stop,		/* and stopping here */
+	   struct state *from,		/* stringing duplicate from here */
+	   struct state *to)		/* to here */
+{
+	if (start != stop)
+	{
+		/* pre-seed stop's duplicate so the walk terminates at "to" */
+		stop->tmp = to;
+		duptraverse(nfa, start, from);
+
+		/* copying is done; all that is left is resetting the tmp pointers */
+		stop->tmp = NULL;
+		cleartraverse(nfa, start);
+	}
+	else
+	{
+		/* degenerate sub-NFA: a single EMPTY arc stands in for it */
+		newarc(nfa, EMPTY, 0, from, to);
+	}
+}
+
+/*
+ * duptraverse - recursive worker for dupnfa
+ *
+ * Ensures s has a duplicate (stmp if supplied, else a freshly made state),
+ * records it in s->tmp, then copies each out-arc of s onto the duplicate
+ * after recursively duplicating the arc's target.
+ */
+static void
+duptraverse(struct nfa *nfa,
+			struct state *s,
+			struct state *stmp) /* s's duplicate, or NULL */
+{
+	struct arc *cur;
+
+	/* Recursion depth is input-dependent, so guard the stack */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* A non-NULL tmp means this state was handled already */
+	if (s->tmp != NULL)
+		return;
+
+	if (stmp != NULL)
+		s->tmp = stmp;
+	else
+		s->tmp = newstate(nfa);
+	if (s->tmp == NULL)
+	{
+		assert(NISERR());
+		return;
+	}
+
+	cur = s->outs;
+	while (cur != NULL && !NISERR())
+	{
+		duptraverse(nfa, cur->to, (struct state *) NULL);
+		if (NISERR())
+			return;
+		assert(cur->to->tmp != NULL);
+		cparc(nfa, cur, s->tmp, cur->to->tmp);
+		cur = cur->outchain;
+	}
+}
+
+/*
+ * cleartraverse - recursively reset tmp pointers left behind by a traversal
+ *
+ * Starting at s, clears tmp and follows out-arcs; a state whose tmp is
+ * already NULL stops the descent along that path.
+ */
+static void
+cleartraverse(struct nfa *nfa,
+			  struct state *s)
+{
+	struct arc *cur;
+
+	/* Recursion depth is input-dependent, so guard the stack */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	if (s->tmp != NULL)
+	{
+		s->tmp = NULL;
+
+		cur = s->outs;
+		while (cur != NULL)
+		{
+			cleartraverse(nfa, cur->to);
+			cur = cur->outchain;
+		}
+	}
+}
+
+/*
+ * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
+ *
+ * When the only way from s1 to s2 is to match exactly one PLAIN arc (there
+ * may be several such arcs in parallel, one per color), the result is the
+ * state owning those arcs; its outarc list then consists solely of PLAIN
+ * arcs leading to the (possibly adjusted) end state.  In every other case
+ * the result is NULL.
+ *
+ * This runs on an unoptimized NFA, so a lone EMPTY arc may sit at either
+ * end; such an arc is stepped over, which is why the returned state need not
+ * be s1 itself.
+ *
+ * Parallel PLAIN arcs are accepted because a bracket expression like [abc]
+ * can compile to either one arc or several, depending on what preceded it,
+ * and we don't want that internal detail to show up as a user-visible
+ * performance difference.
+ */
+static struct state *
+single_color_transition(struct state *s1, struct state *s2)
+{
+	struct state *src = s1;
+	struct state *dst = s2;
+	struct arc *cur;
+
+	/* Step over a sole leading EMPTY arc */
+	if (src->nouts == 1 && src->outs->type == EMPTY)
+		src = src->outs->to;
+	/* ... and a sole trailing one */
+	if (dst->nins == 1 && dst->ins->type == EMPTY)
+		dst = dst->ins->from;
+
+	/*
+	 * Reject if the two ends coincide (could be a single-state loop), or if
+	 * there are no outarcs to examine at all.
+	 */
+	if (src == dst || src->outs == NULL)
+		return NULL;
+
+	/* Every outarc has to be a PLAIN arc landing on dst */
+	cur = src->outs;
+	while (cur != NULL)
+	{
+		if (!(cur->type == PLAIN && cur->to == dst))
+			return NULL;
+		cur = cur->outchain;
+	}
+
+	/* src is the state that owns the qualifying outarcs */
+	return src;
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ *
+ * These are the pseudo-colors standing for BOS, BOL, EOS and EOL.  An NFA
+ * with a parent copies the parent's values; a parentless NFA obtains fresh
+ * ones from its colormap.
+ */
+static void
+specialcolors(struct nfa *nfa)
+{
+	struct nfa *parent = nfa->parent;
+
+	if (parent != NULL)
+	{
+		/* inherit, checking that the parent's colors were actually set */
+		assert(parent->bos[0] != COLORLESS);
+		nfa->bos[0] = parent->bos[0];
+		assert(parent->bos[1] != COLORLESS);
+		nfa->bos[1] = parent->bos[1];
+		assert(parent->eos[0] != COLORLESS);
+		nfa->eos[0] = parent->eos[0];
+		assert(parent->eos[1] != COLORLESS);
+		nfa->eos[1] = parent->eos[1];
+		return;
+	}
+
+	/* no parent: allocate the four pseudo-colors ourselves */
+	nfa->bos[0] = pseudocolor(nfa->cm);
+	nfa->bos[1] = pseudocolor(nfa->cm);
+	nfa->eos[0] = pseudocolor(nfa->cm);
+	nfa->eos[1] = pseudocolor(nfa->cm);
+}
+
+/*
+ * optimize - optimize an NFA
+ *
+ * Despite the name, the real purpose is to bring the NFA into the restricted
+ * form the regex executor (and the cNFA format it consumes) understands:
+ * only PLAIN and LACON arcs.  The parser's output additionally contains
+ * EMPTY (do-nothing) arcs and ^, $, AHEAD, and BEHIND constraint arcs, all
+ * of which must be eliminated here; useless states are discarded along the
+ * way.  EMPTY arcs are removed first, then the constraint arcs.
+ *
+ * In both cases the difficult part is breaking circular loops made up of the
+ * arc type being removed.  That would be necessary regardless, since such a
+ * loop would let the executor go round and round without consuming any
+ * input.
+ *
+ * The return value is whatever analyze() reports for the finished NFA.
+ */
+static long						/* re_info bits */
+optimize(struct nfa *nfa,
+		 FILE *f)				/* for debug output; NULL none */
+{
+#ifdef REG_DEBUG
+	int			verbose = (f != NULL);
+
+	if (verbose)
+		fprintf(f, "\ninitial cleanup:\n");
+#endif
+	/* an up-front cleanup may make the later passes' job easier */
+	cleanup(nfa);
+#ifdef REG_DEBUG
+	if (verbose)
+	{
+		dumpnfa(nfa, f);
+		fprintf(f, "\nempties:\n");
+	}
+#endif
+	/* eliminate EMPTY arcs */
+	fixempties(nfa, f);
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nconstraints:\n");
+#endif
+	/* break constraint loops, then move constraints out to the ends */
+	fixconstraintloops(nfa, f);
+	pullback(nfa, f);
+	pushfwd(nfa, f);
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nfinal cleanup:\n");
+#endif
+	/* tidy up once more */
+	cleanup(nfa);
+#ifdef REG_DEBUG
+	if (verbose)
+		dumpnfa(nfa, f);
+#endif
+	/* finish with analysis of the result */
+	return analyze(nfa);
+}
+
+/*
+ * pullback - pull back constraints backward to eliminate them
+ *
+ * Repeatedly sweeps all states, handing each '^' or BEHIND out-arc to
+ * pull(), until a complete sweep makes no progress or an error is recorded.
+ * Afterwards, '^' arcs leaving the pre state are rewritten as PLAIN arcs on
+ * the BOS/BOL pseudo-colors.
+ */
+static void
+pullback(struct nfa *nfa,
+		 FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *intermediates;
+	int			progress;
+
+	/* find and pull until there are no more */
+	do
+	{
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+		{
+			/* fetch the successor now; s may be dropped further down */
+			nexts = s->next;
+			/* each from-state gets its own list of intermediate states */
+			intermediates = NULL;
+			for (a = s->outs; a != NULL && !NISERR(); a = nexta)
+			{
+				/* fetch the next arc first, since pull() may free a */
+				nexta = a->outchain;
+				if (a->type == '^' || a->type == BEHIND)
+					if (pull(nfa, a, &intermediates))
+						progress = 1;
+			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	/*
+	 * Any ^ constraints we were able to pull to the start state can now be
+	 * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
+	 * be no other ^ or BEHIND arcs left in the NFA, though we do not check
+	 * that here (compact() will fail if so).
+	 */
+	for (a = nfa->pre->outs; a != NULL; a = nexta)
+	{
+		nexta = a->outchain;
+		if (a->type == '^')
+		{
+			/* a->co indexes nfa->bos[], so it must be 0 or 1 */
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ *
+ * nfa: the NFA being modified
+ * con: the constraint arc to pull; it is always freed when we return 1
+ * intermediates: in/out list head of intermediate states made so far for
+ *		this from state
+ *
+ * Returns 1 if successful (which it always is unless the source is the
+ * start state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no outarcs of the constraint's from state other than the given
+ * constraint arc.  This makes the loops in pullback() safe, at the cost that
+ * we may leave useless states behind.  Therefore, we leave it to pullback()
+ * to delete such states.
+ *
+ * If the from state has multiple back-constraint outarcs, and/or multiple
+ * compatible constraint inarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states.  *intermediates
+ * points to a list of such intermediate states for this from state (chained
+ * through their tmp fields).
+ */
+static int
+pull(struct nfa *nfa,
+	 struct arc *con,
+	 struct state **intermediates)
+{
+	struct state *from = con->from;
+	struct state *to = con->to;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *s;
+
+	assert(from != to);			/* should have gotten rid of this earlier */
+	if (from->flag)				/* can't pull back beyond start */
+		return 0;
+	if (from->nins == 0)
+	{							/* unreachable */
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/*
+	 * First, clone from state if necessary to avoid other outarcs.  This may
+	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
+	 * clone state again at the bottom.
+	 */
+	if (from->nouts > 1)
+	{
+		s = newstate(nfa);
+		if (NISERR())
+			return 0;
+		copyins(nfa, from, s);	/* duplicate inarcs */
+		cparc(nfa, con, s, to); /* move constraint arc */
+		freearc(nfa, con);
+		if (NISERR())
+			return 0;
+		/* from here on, operate on the clone and its copy of the constraint */
+		from = s;
+		con = from->outs;
+	}
+	assert(from->nouts == 1);
+
+	/* propagate the constraint into the from state's inarcs */
+	for (a = from->ins; a != NULL && !NISERR(); a = nexta)
+	{
+		/* fetch the next inarc first, since a may be freed below */
+		nexta = a->inchain;
+		switch (combine(con, a))
+		{
+			case INCOMPATIBLE:	/* destroy the arc */
+				freearc(nfa, a);
+				break;
+			case SATISFIED:		/* no action needed */
+				break;
+			case COMPATIBLE:	/* swap the two arcs, more or less */
+				/* need an intermediate state, but might have one already */
+				for (s = *intermediates; s != NULL; s = s->tmp)
+				{
+					assert(s->nins > 0 && s->nouts > 0);
+					if (s->ins->from == a->from && s->outs->to == to)
+						break;
+				}
+				if (s == NULL)
+				{
+					s = newstate(nfa);
+					if (NISERR())
+						return 0;
+					/* push the new state onto the caller's list */
+					s->tmp = *intermediates;
+					*intermediates = s;
+				}
+				/* constraint now precedes s; a's copy follows it */
+				cparc(nfa, con, a->from, s);
+				cparc(nfa, a, s, to);
+				freearc(nfa, a);
+				break;
+			default:
+				assert(NOTREACHED);
+				break;
+		}
+	}
+
+	/* remaining inarcs, if any, incorporate the constraint */
+	moveins(nfa, from, to);
+	freearc(nfa, con);
+	/* from state is now useless, but we leave it to pullback() to clean up */
+	return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to eliminate them
+ *
+ * Repeatedly sweeps all states, handing each '$' or AHEAD in-arc to push(),
+ * until a complete sweep makes no progress or an error is recorded.
+ * Afterwards, '$' arcs entering the post state are rewritten as PLAIN arcs
+ * on the EOS/EOL pseudo-colors.
+ */
+static void
+pushfwd(struct nfa *nfa,
+		FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *intermediates;
+	int			progress;
+
+	/* find and push until there are no more */
+	do
+	{
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+		{
+			/* fetch the successor now; s may be dropped further down */
+			nexts = s->next;
+			/* each to-state gets its own list of intermediate states */
+			intermediates = NULL;
+			for (a = s->ins; a != NULL && !NISERR(); a = nexta)
+			{
+				/* fetch the next arc first, since push() may free a */
+				nexta = a->inchain;
+				if (a->type == '$' || a->type == AHEAD)
+					if (push(nfa, a, &intermediates))
+						progress = 1;
+			}
+			/* clear tmp fields of intermediate states created here */
+			while (intermediates != NULL)
+			{
+				struct state *ns = intermediates->tmp;
+
+				intermediates->tmp = NULL;
+				intermediates = ns;
+			}
+			/* if s is now useless, get rid of it */
+			if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+				dropstate(nfa, s);
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	/*
+	 * Any $ constraints we were able to push to the post state can now be
+	 * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
+	 * be no other $ or AHEAD arcs left in the NFA, though we do not check
+	 * that here (compact() will fail if so).
+	 */
+	for (a = nfa->post->ins; a != NULL; a = nexta)
+	{
+		nexta = a->inchain;
+		if (a->type == '$')
+		{
+			/* a->co indexes nfa->eos[], so it must be 0 or 1 */
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ *
+ * nfa: the NFA being modified
+ * con: the constraint arc to push; it is always freed when we return 1
+ * intermediates: in/out list head of intermediate states made so far for
+ *		this to state
+ *
+ * Returns 1 if successful (which it always is unless the destination is the
+ * post state or we have an internal error), 0 if nothing happened.
+ *
+ * A significant property of this function is that it deletes no pre-existing
+ * states, and no inarcs of the constraint's to state other than the given
+ * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
+ * we may leave useless states behind.  Therefore, we leave it to pushfwd()
+ * to delete such states.
+ *
+ * If the to state has multiple forward-constraint inarcs, and/or multiple
+ * compatible constraint outarcs, we only need to create one new intermediate
+ * state per combination of predecessor and successor states.  *intermediates
+ * points to a list of such intermediate states for this to state (chained
+ * through their tmp fields).
+ */
+static int
+push(struct nfa *nfa,
+	 struct arc *con,
+	 struct state **intermediates)
+{
+	struct state *from = con->from;
+	struct state *to = con->to;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *s;
+
+	assert(to != from);			/* should have gotten rid of this earlier */
+	if (to->flag)				/* can't push forward beyond end */
+		return 0;
+	if (to->nouts == 0)
+	{							/* dead end */
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/*
+	 * First, clone to state if necessary to avoid other inarcs.  This may
+	 * seem wasteful, but it simplifies the logic, and we'll get rid of the
+	 * clone state again at the bottom.
+	 */
+	if (to->nins > 1)
+	{
+		s = newstate(nfa);
+		if (NISERR())
+			return 0;
+		copyouts(nfa, to, s);	/* duplicate outarcs */
+		cparc(nfa, con, from, s);	/* move constraint arc */
+		freearc(nfa, con);
+		if (NISERR())
+			return 0;
+		/* from here on, operate on the clone and its copy of the constraint */
+		to = s;
+		con = to->ins;
+	}
+	assert(to->nins == 1);
+
+	/* propagate the constraint into the to state's outarcs */
+	for (a = to->outs; a != NULL && !NISERR(); a = nexta)
+	{
+		/* fetch the next outarc first, since a may be freed below */
+		nexta = a->outchain;
+		switch (combine(con, a))
+		{
+			case INCOMPATIBLE:	/* destroy the arc */
+				freearc(nfa, a);
+				break;
+			case SATISFIED:		/* no action needed */
+				break;
+			case COMPATIBLE:	/* swap the two arcs, more or less */
+				/* need an intermediate state, but might have one already */
+				for (s = *intermediates; s != NULL; s = s->tmp)
+				{
+					assert(s->nins > 0 && s->nouts > 0);
+					if (s->ins->from == from && s->outs->to == a->to)
+						break;
+				}
+				if (s == NULL)
+				{
+					s = newstate(nfa);
+					if (NISERR())
+						return 0;
+					/* push the new state onto the caller's list */
+					s->tmp = *intermediates;
+					*intermediates = s;
+				}
+				/* constraint now follows s; a's copy precedes it */
+				cparc(nfa, con, s, a->to);
+				cparc(nfa, a, from, s);
+				freearc(nfa, a);
+				break;
+			default:
+				assert(NOTREACHED);
+				break;
+		}
+	}
+
+	/* remaining outarcs, if any, incorporate the constraint */
+	moveouts(nfa, to, from);
+	freearc(nfa, con);
+	/* to state is now useless, but we leave it to pushfwd() to clean up */
+	return 1;
+}
+
+/*
+ * combine - decide what happens when a constraint arc meets another arc
+ *
+ * The verdict is one of:
+ *
+ * #def INCOMPATIBLE	1	// destroys arc
+ * #def SATISFIED		2	// constraint satisfied
+ * #def COMPATIBLE		3	// compatible but not satisfied yet
+ *
+ * "con" is the constraint being moved, "a" the arc it has landed on.
+ */
+static int
+combine(struct arc *con,
+		struct arc *a)
+{
+#define  CA(ct,at)	 (((ct)<<CHAR_BIT) | (at))
+
+	switch (CA(con->type, a->type))
+	{
+			/* ^ or $ against a PLAIN arc (newlines are handled separately) */
+		case CA('^', PLAIN):
+		case CA('$', PLAIN):
+			/* dissimilar constraints colliding in the same direction */
+		case CA('^', BEHIND):
+		case CA(BEHIND, '^'):
+		case CA('$', AHEAD):
+		case CA(AHEAD, '$'):
+			return INCOMPATIBLE;
+
+			/* color constraints meeting colors */
+		case CA(AHEAD, PLAIN):
+		case CA(BEHIND, PLAIN):
+			/* similar constraints colliding */
+		case CA('^', '^'):
+		case CA('$', '$'):
+		case CA(AHEAD, AHEAD):
+		case CA(BEHIND, BEHIND):
+			/* only an exact color match counts as satisfaction */
+			return (con->co == a->co) ? SATISFIED : INCOMPATIBLE;
+
+			/* constraints travelling past one another */
+		case CA('^', '$'):
+		case CA('^', AHEAD):
+		case CA(BEHIND, '$'):
+		case CA(BEHIND, AHEAD):
+		case CA('$', '^'):
+		case CA('$', BEHIND):
+		case CA(AHEAD, '^'):
+		case CA(AHEAD, BEHIND):
+		case CA('^', LACON):
+		case CA(BEHIND, LACON):
+		case CA('$', LACON):
+		case CA(AHEAD, LACON):
+			return COMPATIBLE;
+
+		default:
+			break;
+	}
+	assert(NOTREACHED);
+	return INCOMPATIBLE;		/* keeps compilers quiet about falling off */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ *
+ * nfa: the NFA to modify in place
+ * f: debug output stream, or NULL; when non-NULL the result is dumped
+ *
+ * The work proceeds in phases: fold away states whose sole out-arc (then
+ * sole in-arc) is an EMPTY; copy non-EMPTY inarcs forward across chains of
+ * EMPTY arcs; delete every EMPTY arc; and finally drop states left with no
+ * inarcs or no outarcs.  If an error is recorded along the way we return
+ * early without running the remaining phases.
+ */
+static void
+fixempties(struct nfa *nfa,
+		   FILE *f)				/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *s2;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	int			totalinarcs;
+	struct arc **inarcsorig;
+	struct arc **arcarray;
+	int			arccount;
+	int			prevnins;
+	int			nskip;
+
+	/*
+	 * First, get rid of any states whose sole out-arc is an EMPTY, since
+	 * they're basically just aliases for their successor.  The parsing
+	 * algorithm creates enough of these that it's worth special-casing this.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		/* fetch the successor now; s may be dropped below */
+		nexts = s->next;
+		if (s->flag || s->nouts != 1)
+			continue;
+		a = s->outs;
+		assert(a != NULL && a->outchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		/* skip the move when the EMPTY arc is a self-loop */
+		if (s != a->to)
+			moveins(nfa, s, a->to);
+		dropstate(nfa, s);
+	}
+
+	/*
+	 * Similarly, get rid of any state with a single EMPTY in-arc, by folding
+	 * it into its predecessor.
+	 */
+	for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
+	{
+		nexts = s->next;
+		/* while we're at it, ensure tmp fields are clear for next step */
+		assert(s->tmp == NULL);
+		if (s->flag || s->nins != 1)
+			continue;
+		a = s->ins;
+		assert(a != NULL && a->inchain == NULL);
+		if (a->type != EMPTY)
+			continue;
+		/* skip the move when the EMPTY arc is a self-loop */
+		if (s != a->from)
+			moveouts(nfa, s, a->from);
+		dropstate(nfa, s);
+	}
+
+	if (NISERR())
+		return;
+
+	/*
+	 * For each remaining NFA state, find all other states from which it is
+	 * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
+	 * that eliminate the need for each such chain.
+	 *
+	 * We could replace a chain of EMPTY arcs that leads from a "from" state
+	 * to a "to" state either by pushing non-EMPTY arcs forward (linking
+	 * directly from "from"'s predecessors to "to") or by pulling them back
+	 * (linking directly from "from" to "to"'s successors).  We choose to
+	 * always do the former; this choice is somewhat arbitrary, but the
+	 * approach below requires that we uniformly do one or the other.
+	 *
+	 * Suppose we have a chain of N successive EMPTY arcs (where N can easily
+	 * approach the size of the NFA).  All of the intermediate states must
+	 * have additional inarcs and outarcs, else they'd have been removed by
+	 * the steps above.  Assuming their inarcs are mostly not empties, we will
+	 * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
+	 * state in the chain must be duplicated to lead to all its successor
+	 * states as well.  So there is no hope of doing less than O(N^2) work;
+	 * however, we should endeavor to keep the big-O cost from being even
+	 * worse than that, which it can easily become without care.  In
+	 * particular, suppose we were to copy all S1's inarcs forward to S2, and
+	 * then also to S3, and then later we consider pushing S2's inarcs forward
+	 * to S3.  If we include the arcs already copied from S1 in that, we'd be
+	 * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
+	 * and its cohorts would get rid of the extra arcs, but not without cost.)
+	 *
+	 * We can avoid this cost by treating only arcs that existed at the start
+	 * of this phase as candidates to be pushed forward.  To identify those,
+	 * we remember the first inarc each state had to start with.  We rely on
+	 * the fact that newarc() and friends put new arcs on the front of their
+	 * to-states' inchains, and that this phase never deletes arcs, so that
+	 * the original arcs must be the last arcs in their to-states' inchains.
+	 *
+	 * So the process here is that, for each state in the NFA, we gather up
+	 * all non-EMPTY inarcs of states that can reach the target state via
+	 * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
+	 * target state's inchain.  (We can safely use sort-merge for this as long
+	 * as we update each state's original-arcs pointer after we add arcs to
+	 * it; the sort step of mergeins probably changed the order of the old
+	 * arcs.)
+	 *
+	 * Another refinement worth making is that, because we only add non-EMPTY
+	 * arcs during this phase, and all added arcs have the same from-state as
+	 * the non-EMPTY arc they were cloned from, we know ahead of time that any
+	 * states having only EMPTY outarcs will be useless for lack of outarcs
+	 * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
+	 * they had none to start with.)  So we need not bother to update the
+	 * inchains of such states at all.
+	 */
+
+	/* Remember the states' first original inarcs */
+	/* ... and while at it, count how many old inarcs there are altogether */
+	inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
+	if (inarcsorig == NULL)
+	{
+		NERR(REG_ESPACE);
+		return;
+	}
+	totalinarcs = 0;
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		/* the array is indexed by state number */
+		inarcsorig[s->no] = s->ins;
+		totalinarcs += s->nins;
+	}
+
+	/*
+	 * Create a workspace for accumulating the inarcs to be added to the
+	 * current target state.  totalinarcs is probably a considerable
+	 * overestimate of the space needed, but the NFA is unlikely to be large
+	 * enough at this point to make it worth being smarter.
+	 */
+	arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
+	if (arcarray == NULL)
+	{
+		NERR(REG_ESPACE);
+		FREE(inarcsorig);
+		return;
+	}
+
+	/* And iterate over the target states */
+	for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
+	{
+		/* Ignore target states without non-EMPTY outarcs, per note above */
+		if (!s->flag && !hasnonemptyout(s))
+			continue;
+
+		/* Find predecessor states and accumulate their original inarcs */
+		arccount = 0;
+		/* walk the tmp-linked list from the last state found back to s */
+		for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
+		{
+			/* Add s2's original inarcs to arcarray[], but ignore empties */
+			for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
+			{
+				if (a->type != EMPTY)
+					arcarray[arccount++] = a;
+			}
+
+			/* Reset the tmp fields as we walk back */
+			nexts = s2->tmp;
+			s2->tmp = NULL;
+		}
+		/* s itself terminated the walk above, so clear its mark separately */
+		s->tmp = NULL;
+		assert(arccount <= totalinarcs);
+
+		/* Remember how many original inarcs this state has */
+		prevnins = s->nins;
+
+		/* Add non-duplicate inarcs to target state */
+		mergeins(nfa, s, arcarray, arccount);
+
+		/* Now we must update the state's inarcsorig pointer */
+		/* new arcs sit at the front of the chain; step over that many */
+		nskip = s->nins - prevnins;
+		a = s->ins;
+		while (nskip-- > 0)
+			a = a->inchain;
+		inarcsorig[s->no] = a;
+	}
+
+	FREE(arcarray);
+	FREE(inarcsorig);
+
+	if (NISERR())
+		return;
+
+	/*
+	 * Now remove all the EMPTY arcs, since we don't need them anymore.
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+	{
+		for (a = s->outs; a != NULL; a = nexta)
+		{
+			/* fetch the next arc first, since a may be freed */
+			nexta = a->outchain;
+			if (a->type == EMPTY)
+				freearc(nfa, a);
+		}
+	}
+
+	/*
+	 * And remove any states that have become useless.  (This cleanup is not
+	 * very thorough, and would be even less so if we tried to combine it with
+	 * the previous step; but cleanup() will take care of anything we miss.)
+	 */
+	for (s = nfa->states; s != NULL; s = nexts)
+	{
+		nexts = s->next;
+		if ((s->nins == 0 || s->nouts == 0) && !s->flag)
+			dropstate(nfa, s);
+	}
+
+	if (f != NULL)
+		dumpnfa(nfa, f);
+}
+
+/*
+ * emptyreachable - collect every state that can reach s through EMPTY arcs
+ *
+ * Visited states are threaded together through their tmp fields: s->tmp is
+ * set to the lastfound argument, and each state found afterwards points
+ * back at the one found just before it.  The function returns the most
+ * recently found state, which is the head of that chain.
+ *
+ * Only the inarc chains recorded in inarcsorig[] (indexed by state number)
+ * are followed, and a state whose tmp is already non-NULL is not entered
+ * again.  If STACK_TOO_DEEP trips, REG_ETOOBIG is reported and the chain
+ * built so far is returned unchanged.
+ */
+static struct state *
+emptyreachable(struct nfa *nfa,
+			   struct state *s,
+			   struct state *lastfound,
+			   struct arc **inarcsorig)
+{
+	struct state *chainhead;
+	struct arc *inarc;
+
+	/* Guard against unbounded recursion depth */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return lastfound;
+	}
+
+	s->tmp = lastfound;
+	chainhead = s;
+	for (inarc = inarcsorig[s->no]; inarc != NULL; inarc = inarc->inchain)
+	{
+		if (inarc->type != EMPTY || inarc->from->tmp != NULL)
+			continue;
+		chainhead = emptyreachable(nfa, inarc->from, chainhead, inarcsorig);
+	}
+	return chainhead;
+}
+
+/*
+ * isconstraintarc - report whether an arc carries a constraint label
+ *
+ * Yields 1 when the arc's type is '^', '$', BEHIND, AHEAD or LACON, and 0
+ * for every other arc type.
+ */
+static inline int
+isconstraintarc(struct arc *a)
+{
+	/* character-coded constraint types */
+	if (a->type == '^' || a->type == '$')
+		return 1;
+	/* symbolically named constraint types */
+	if (a->type == BEHIND || a->type == AHEAD || a->type == LACON)
+		return 1;
+	return 0;
+}
+
+/*
+ * hasconstraintout - test whether any outarc of s is a constraint arc
+ *
+ * Returns 1 as soon as one is found, 0 if the out-chain has none.
+ */
+static int
+hasconstraintout(struct state *s)
+{
+	struct arc *outarc = s->outs;
+
+	while (outarc != NULL && !isconstraintarc(outarc))
+		outarc = outarc->outchain;
+
+	return (outarc != NULL) ? 1 : 0;
+}
+
+/*
+ * fixconstraintloops - eliminate cycles made up solely of constraint arcs
+ *
+ * Travelling around such a cycle consumes no input, so it contributes
+ * nothing to a match.  These cycles would also send pullback/pushfwd into
+ * an infinite loop (per the original author), hence they must be removed
+ * before those passes run.
+ */
+static void
+fixconstraintloops(struct nfa *nfa,
+				   FILE *f)		/* for debug output; NULL none */
+{
+	struct state *st;
+	struct state *stnext;
+	struct arc *arc;
+	struct arc *arcnext;
+	int			sawconstraint = 0;
+	int			brokeloop;
+
+	/*
+	 * Pass 1: a constraint arc leading from a state straight back to itself
+	 * is simply deleted; no cloning is needed for that case.  Along the way,
+	 * remember whether any other constraint arc exists, and discard
+	 * unflagged states that end up with no outarcs.
+	 */
+	for (st = nfa->states; st != NULL && !NISERR(); st = stnext)
+	{
+		stnext = st->next;		/* fetch now; st may be dropped below */
+		/* tmp fields must be clear before the loop search starts */
+		assert(st->tmp == NULL);
+		for (arc = st->outs; arc != NULL && !NISERR(); arc = arcnext)
+		{
+			arcnext = arc->outchain;	/* fetch now; arc may be freed */
+			if (!isconstraintarc(arc))
+				continue;
+			if (arc->to == st)
+				freearc(nfa, arc);
+			else
+				sawconstraint = 1;
+		}
+		if (st->nouts == 0 && !st->flag)
+			dropstate(nfa, st);
+	}
+
+	/* Done if an error occurred or no constraint arcs are left */
+	if (NISERR() || !sawconstraint)
+		return;
+
+	/*
+	 * Pass 2: probe outward from each state in turn for a multi-state
+	 * constraint loop.  A nonzero result from findconstraintloop means it
+	 * either broke a loop or reported an error, so the scan begins again
+	 * from the first state; the for-loop's error test ends things promptly
+	 * in the error case.  We finish once a full scan finds nothing.
+	 */
+	do
+	{
+		brokeloop = 0;
+		for (st = nfa->states; st != NULL && !NISERR(); st = st->next)
+		{
+			if (findconstraintloop(nfa, st))
+			{
+				brokeloop = 1;
+				break;
+			}
+		}
+	} while (brokeloop);
+
+	if (NISERR())
+		return;
+
+	/*
+	 * Pass 3: reset every tmp field, since findconstraintloop leaves its
+	 * "no loop from here" markers behind, and drop unflagged states that
+	 * now lack inarcs or outarcs.
+	 */
+	for (st = nfa->states; st != NULL; st = stnext)
+	{
+		stnext = st->next;
+		st->tmp = NULL;
+		if ((st->nins == 0 || st->nouts == 0) && !st->flag)
+			dropstate(nfa, st);
+	}
+
+	if (f != NULL)
+		dumpnfa(nfa, f);
+}
+
+/*
+ * findconstraintloop - depth-first search for a cycle of constraint arcs
+ *
+ * Returns 1 if a loop was found and handed to breakconstraintloop(), or
+ * if the recursion-depth check failed (an error is reported in that case);
+ * returns 0 if no constraint loop is reachable from s.
+ *
+ * While a state is on the active search path its tmp field points at the
+ * constraint-arc successor currently being explored, so meeting a state
+ * with a non-NULL tmp that is not itself means the path has closed into a
+ * loop.  Once a state has been fully explored without finding a loop, its
+ * tmp is set to point at the state itself; later calls see that marker
+ * and return 0 immediately.  The marker cannot be confused with a path
+ * link because self-loops were removed before this search runs.
+ *
+ * After a loop has been broken the tmp fields have been reset to NULL by
+ * breakconstraintloop().  The loop that gets broken need not contain the
+ * state the search started from; any loop reachable from it will do.
+ *
+ * Recursion can go as deep as the longest loop-free chain of constraint
+ * arcs plus one, which may be large, hence the explicit stack-depth
+ * check at the top.
+ */
+static int
+findconstraintloop(struct nfa *nfa, struct state *s)
+{
+	struct arc *outarc;
+
+	/* Guard against unbounded recursion depth */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return 1;				/* nonzero so that callers unwind at once */
+	}
+
+	/* A self-pointing tmp means s was already proven loop-free */
+	if (s->tmp == s)
+		return 0;
+
+	/* Any other non-NULL tmp means s is on the current search path: a loop */
+	if (s->tmp != NULL)
+	{
+		breakconstraintloop(nfa, s);
+		return 1;
+	}
+
+	/* Descend along each constraint outarc, recording the step in s->tmp */
+	for (outarc = s->outs; outarc != NULL; outarc = outarc->outchain)
+	{
+		if (!isconstraintarc(outarc))
+			continue;
+		assert(outarc->to != s);
+		s->tmp = outarc->to;
+		if (findconstraintloop(nfa, outarc->to))
+			return 1;
+	}
+
+	/* Nothing found below s; mark it so later searches can skip it */
+	s->tmp = s;
+	return 0;
+}
+
+/*
+ * breakconstraintloop - break a loop of constraint arcs
+ *
+ * sinitial is any one member state of the loop.  Each loop member's tmp
+ * field links to its successor within the loop.  (Note that this function
+ * will reset all the tmp fields to NULL.)
+ *
+ * We can break the loop by, for any one state S1 in the loop, cloning its
+ * loop successor state S2 (and possibly following states), and then moving
+ * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
+ * copy any non-constraint outarcs of S2.  Constraint outarcs should be
+ * dropped if they point back to S1, else they need to be copied as arcs to
+ * similarly cloned states S3, S4, etc.  In general, each cloned state copies
+ * non-constraint outarcs, drops constraint outarcs that would lead to itself
+ * or any earlier cloned state, and sends other constraint outarcs to newly
+ * cloned states.  No cloned state will have any inarcs that aren't constraint
+ * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
+ * constraint back-arcs since they would not take us to any state we've not
+ * already been in; therefore, no new constraint loop is created.  In this way
+ * we generate a modified NFA that can still represent every useful state
+ * sequence, but not sequences that represent state loops with no consumption
+ * of input data.  Note that the set of cloned states will certainly include
+ * all of the loop member states other than S1, and it may also include
+ * non-loop states that are reachable from S2 via constraint arcs.  This is
+ * important because there is no guarantee that findconstraintloop found a
+ * maximal loop (and searching for one would be NP-hard, so don't try).
+ * Frequently the "non-loop states" are actually part of a larger loop that
+ * we didn't notice, and indeed there may be several overlapping loops.
+ * This technique ensures convergence in such cases, while considering only
+ * the originally-found loop does not.
+ *
+ * If there is only one S1->S2 constraint arc, then that constraint is
+ * certainly satisfied when we enter any of the clone states.  This means that
+ * in the common case where many of the constraint arcs are identically
+ * labeled, we can merge together clone states linked by a similarly-labeled
+ * constraint: if we can get to the first one we can certainly get to the
+ * second, so there's no need to distinguish.  This greatly reduces the number
+ * of new states needed, so we preferentially break the given loop at a state
+ * pair where this is true.
+ *
+ * Furthermore, it's fairly common to find that a cloned successor state has
+ * no outarcs, especially if we're a bit aggressive about removing unnecessary
+ * outarcs.  If that happens, then there is simply not any interesting state
+ * that can be reached through the predecessor's loop arcs, which means we can
+ * break the loop just by removing those loop arcs, with no new states added.
+ */
+static void
+breakconstraintloop(struct nfa *nfa, struct state *sinitial)
+{
+	struct state *s;
+	struct state *shead;		/* S1: state whose loop arcs get redirected */
+	struct state *stail;		/* S2: shead's loop successor, gets cloned */
+	struct state *sclone;		/* the clone of stail; NULL if not needed */
+	struct state *nexts;
+	struct arc *refarc;			/* lone S1->S2 constraint arc, else NULL */
+	struct arc *a;
+	struct arc *nexta;
+
+	/*
+	 * Start by identifying which loop step we want to break at.
+	 * Preferentially this is one with only one constraint arc.  (XXX are
+	 * there any other secondary heuristics we want to use here?)  Set refarc
+	 * to point to the selected lone constraint arc, if there is one.
+	 */
+	refarc = NULL;
+	s = sinitial;
+	do
+	{
+		nexts = s->tmp;			/* s's successor around the loop */
+		assert(nexts != s);		/* should not see any one-element loops */
+		if (refarc == NULL)		/* still looking for a single-arc step */
+		{
+			int			narcs = 0;
+
+			for (a = s->outs; a != NULL; a = a->outchain)
+			{
+				if (a->to == nexts && isconstraintarc(a))
+				{
+					refarc = a;
+					narcs++;
+				}
+			}
+			assert(narcs > 0);
+			if (narcs > 1)
+				refarc = NULL;	/* multiple constraint arcs here, no good */
+		}
+		s = nexts;
+	} while (s != sinitial);	/* stop after one trip around the loop */
+
+	if (refarc)
+	{
+		/* break at the refarc */
+		shead = refarc->from;
+		stail = refarc->to;
+		assert(stail == shead->tmp);
+	}
+	else
+	{
+		/* for lack of a better idea, break after sinitial */
+		shead = sinitial;
+		stail = sinitial->tmp;
+	}
+
+	/*
+	 * Reset the tmp fields so that we can use them for local storage in
+	 * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
+	 * going to abandon its search anyway.)
+	 */
+	for (s = nfa->states; s != NULL; s = s->next)
+		s->tmp = NULL;
+
+	/*
+	 * Recursively build clone state(s) as needed.
+	 */
+	sclone = newstate(nfa);
+	if (sclone == NULL)
+	{
+		assert(NISERR());
+		return;
+	}
+
+	clonesuccessorstates(nfa, stail, sclone, shead, refarc,
+						 NULL, NULL, nfa->nstates);	/* no donemaps yet */
+
+	if (NISERR())
+		return;
+
+	/*
+	 * It's possible that sclone has no outarcs at all, in which case it's
+	 * useless.  (We don't try extremely hard to get rid of useless states
+	 * here, but this is an easy and fairly common case.)
+	 */
+	if (sclone->nouts == 0)
+	{
+		freestate(nfa, sclone);
+		sclone = NULL;			/* tells the loop below to just drop arcs */
+	}
+
+	/*
+	 * Move shead's constraint-loop arcs to point to sclone, or just drop them
+	 * if we discovered we don't need sclone.
+	 */
+	for (a = shead->outs; a != NULL; a = nexta)
+	{
+		nexta = a->outchain;	/* fetch now, since a may be freed below */
+		if (a->to == stail && isconstraintarc(a))
+		{
+			if (sclone)
+				cparc(nfa, a, shead, sclone);
+			freearc(nfa, a);	/* the original arc goes away either way */
+			if (NISERR())
+				break;
+		}
+	}
+}
+
+/*
+ * clonesuccessorstates - create a tree of constraint-arc successor states
+ *
+ * ssource is the state to be cloned, and sclone is the state to copy its
+ * outarcs into.  sclone's inarcs, if any, should already be set up.
+ *
+ * spredecessor is the original predecessor state that we are trying to build
+ * successors for (it may not be the immediate predecessor of ssource).
+ * refarc, if not NULL, is the original constraint arc that is known to have
+ * been traversed out of spredecessor to reach the successor(s).
+ *
+ * For each cloned successor state, we transiently create a "donemap" that is
+ * a boolean array showing which source states we've already visited for this
+ * clone state.  This prevents infinite recursion as well as useless repeat
+ * visits to the same state subtree (which can add up fast, since typical NFAs
+ * have multiple redundant arc pathways).  Each donemap is a char array
+ * indexed by state number.  The donemaps are all of the same size "nstates",
+ * which is nfa->nstates as of the start of the recursion.  This is enough to
+ * have entries for all pre-existing states, but *not* entries for clone
+ * states created during the recursion.  That's okay since we have no need to
+ * mark those.
+ *
+ * curdonemap is NULL when recursing to a new sclone state, or sclone's
+ * donemap when we are recursing without having created a new state (which we
+ * do when we decide we can merge a successor state into the current clone
+ * state).  outerdonemap is NULL at the top level and otherwise the parent
+ * clone state's donemap.
+ *
+ * The successor states we create and fill here form a strict tree structure,
+ * with each state having exactly one predecessor, except that the toplevel
+ * state has no inarcs as yet (breakconstraintloop will add its inarcs from
+ * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
+ * to the root, plus refarc if any, to identify the set of constraints already
+ * known valid at the current point.  This allows us to avoid generating extra
+ * successor states.
+ */
+static void
+clonesuccessorstates(struct nfa *nfa,
+					 struct state *ssource,
+					 struct state *sclone,
+					 struct state *spredecessor,
+					 struct arc *refarc,
+					 char *curdonemap,
+					 char *outerdonemap,
+					 int nstates)
+{
+	char	   *donemap;		/* the donemap in effect for sclone */
+	struct arc *a;
+
+	/* Since this is recursive, it could be driven to stack overflow */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	/* If this state hasn't already got a donemap, create one */
+	donemap = curdonemap;
+	if (donemap == NULL)
+	{
+		donemap = (char *) MALLOC(nstates * sizeof(char));
+		if (donemap == NULL)
+		{
+			NERR(REG_ESPACE);
+			return;
+		}
+
+		if (outerdonemap != NULL)
+		{
+			/*
+			 * Not at outermost recursion level, so copy the outer level's
+			 * donemap; this ensures that we see states in process of being
+			 * visited at outer levels, or already merged into predecessor
+			 * states, as ones we shouldn't traverse back to.
+			 */
+			memcpy(donemap, outerdonemap, nstates * sizeof(char));
+		}
+		else
+		{
+			/* At outermost level, only spredecessor is off-limits */
+			memset(donemap, 0, nstates * sizeof(char));
+			assert(spredecessor->no < nstates);
+			donemap[spredecessor->no] = 1;
+		}
+	}
+
+	/* Mark ssource as visited in the donemap */
+	assert(ssource->no < nstates);
+	assert(donemap[ssource->no] == 0);
+	donemap[ssource->no] = 1;
+
+	/*
+	 * We proceed by first cloning all of ssource's outarcs, creating new
+	 * clone states as needed but not doing more with them than that.  Then in
+	 * a second pass, recurse to process the child clone states.  This allows
+	 * us to have only one child clone state per reachable source state, even
+	 * when there are multiple outarcs leading to the same state.  Also, when
+	 * we do visit a child state, its set of inarcs is known exactly, which
+	 * makes it safe to apply the constraint-is-already-checked optimization.
+	 * Also, this ensures that we've merged all the states we can into the
+	 * current clone before we recurse to any children, thus possibly saving
+	 * them from making extra images of those states.
+	 *
+	 * While this function runs, child clone states of the current state are
+	 * marked by setting their tmp fields to point to the original state they
+	 * were cloned from.  This makes it possible to detect multiple outarcs
+	 * leading to the same state, and also makes it easy to distinguish clone
+	 * states from original states (which will have tmp == NULL).
+	 */
+	for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
+	{
+		struct state *sto = a->to;
+
+		/*
+		 * We do not consider cloning successor states that have no constraint
+		 * outarcs; just link to them as-is.  They cannot be part of a
+		 * constraint loop so there is no need to make copies.  In particular,
+		 * this rule keeps us from trying to clone the post state, which would
+		 * be a bad idea.
+		 */
+		if (isconstraintarc(a) && hasconstraintout(sto))
+		{
+			struct state *prevclone;	/* existing child clone of sto */
+			int			canmerge;
+			struct arc *a2;
+
+			/*
+			 * Back-link constraint arcs must not be followed.  Nor is there a
+			 * need to revisit states previously merged into this clone.
+			 */
+			assert(sto->no < nstates);
+			if (donemap[sto->no] != 0)
+				continue;
+
+			/*
+			 * Check whether we already have a child clone state for this
+			 * source state.
+			 */
+			prevclone = NULL;
+			for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
+			{
+				if (a2->to->tmp == sto)
+				{
+					prevclone = a2->to;
+					break;
+				}
+			}
+
+			/*
+			 * If this arc is labeled the same as refarc, or the same as any
+			 * arc we must have traversed to get to sclone, then no additional
+			 * constraints need to be met to get to sto, so we should just
+			 * merge its outarcs into sclone.
+			 */
+			if (refarc && a->type == refarc->type && a->co == refarc->co)
+				canmerge = 1;
+			else
+			{
+				struct state *s;
+
+				canmerge = 0;
+				/* climb from sclone through first inarcs toward the root */
+				for (s = sclone; s->ins; s = s->ins->from)
+				{
+					if (s->nins == 1 &&
+						a->type == s->ins->type && a->co == s->ins->co)
+					{
+						canmerge = 1;
+						break;
+					}
+				}
+			}
+
+			if (canmerge)
+			{
+				/*
+				 * We can merge into sclone.  If we previously made a child
+				 * clone state, drop it; there's no need to visit it.  (This
+				 * can happen if ssource has multiple pathways to sto, and we
+				 * only just now found one that is provably a no-op.)
+				 */
+				if (prevclone)
+					dropstate(nfa, prevclone);	/* kills our outarc, too */
+
+				/* Recurse to merge sto's outarcs into sclone */
+				clonesuccessorstates(nfa,
+									 sto,
+									 sclone,
+									 spredecessor,
+									 refarc,
+									 donemap,
+									 outerdonemap,
+									 nstates);
+				/* sto should now be marked as previously visited */
+				assert(NISERR() || donemap[sto->no] == 1);
+			}
+			else if (prevclone)
+			{
+				/*
+				 * We already have a clone state for this successor, so just
+				 * make another arc to it.
+				 */
+				cparc(nfa, a, sclone, prevclone);
+			}
+			else
+			{
+				/*
+				 * We need to create a new successor clone state.
+				 */
+				struct state *stoclone;
+
+				stoclone = newstate(nfa);
+				if (stoclone == NULL)
+				{
+					assert(NISERR());
+					break;		/* error is already recorded; stop cloning */
+				}
+				/* Mark it as to what it's a clone of */
+				stoclone->tmp = sto;
+				/* ... and add the outarc leading to it */
+				cparc(nfa, a, sclone, stoclone);
+			}
+		}
+		else
+		{
+			/*
+			 * Non-constraint outarcs just get copied to sclone, as do outarcs
+			 * leading to states with no constraint outarc.
+			 */
+			cparc(nfa, a, sclone, sto);
+		}
+	}
+
+	/*
+	 * If we are at outer level for this clone state, recurse to all its child
+	 * clone states, clearing their tmp fields as we go.  (If we're not
+	 * outermost for sclone, leave this to be done by the outer call level.)
+	 * Note that if we have multiple outarcs leading to the same clone state,
+	 * it will only be recursed-to once.
+	 */
+	if (curdonemap == NULL)
+	{
+		for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
+		{
+			struct state *stoclone = a->to;
+			struct state *sto = stoclone->tmp;	/* NULL unless a child clone */
+
+			if (sto != NULL)
+			{
+				stoclone->tmp = NULL;	/* so later arcs to it are skipped */
+				clonesuccessorstates(nfa,
+									 sto,
+									 stoclone,
+									 spredecessor,
+									 refarc,
+									 NULL,
+									 donemap,
+									 nstates);
+			}
+		}
+
+		/* Don't forget to free sclone's donemap when done with it */
+		FREE(donemap);
+	}
+}
+
+/*
+ * cleanup - prune useless states from the NFA, then renumber the survivors
+ */
+static void
+cleanup(struct nfa *nfa)
+{
+	struct state *st;
+	struct state *following;
+	int			count = 0;
+
+	if (NISERR())
+		return;					/* nothing is done once an error is pending */
+
+	/* tag states reachable from pre, then re-tag those that can reach post */
+	markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
+	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+	/* every unflagged state that did not end up tagged with post goes away */
+	st = nfa->states;
+	while (st != NULL && !NISERR())
+	{
+		following = st->next;
+		if (!st->flag && st->tmp != nfa->post)
+			dropstate(nfa, st);
+		st = following;
+	}
+	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+	cleartraverse(nfa, nfa->pre);
+	assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
+
+	/* hand out consecutive state numbers, starting from zero */
+	for (st = nfa->states; st != NULL; st = st->next)
+		st->no = count++;
+	nfa->nstates = count;
+}
+
+/*
+ * markreachable - tag with "mark" each "okay"-tagged state reachable from s
+ */
+static void
+markreachable(struct nfa *nfa,
+			  struct state *s,
+			  struct state *okay,	/* consider only states with this mark */
+			  struct state *mark)	/* the value to mark with */
+{
+	struct arc *outarc;
+
+	/* Guard against unbounded recursion depth */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	if (s->tmp == okay)			/* anything else ends the walk here */
+	{
+		s->tmp = mark;
+		for (outarc = s->outs; outarc != NULL; outarc = outarc->outchain)
+			markreachable(nfa, outarc->to, okay, mark);
+	}
+}
+
+/*
+ * markcanreach - tag with "mark" each "okay"-tagged state that can reach s
+ */
+static void
+markcanreach(struct nfa *nfa,
+			 struct state *s,
+			 struct state *okay,	/* consider only states with this mark */
+			 struct state *mark)	/* the value to mark with */
+{
+	struct arc *inarc;
+
+	/* Guard against unbounded recursion depth */
+	if (STACK_TOO_DEEP(nfa->v->re))
+	{
+		NERR(REG_ETOOBIG);
+		return;
+	}
+
+	if (s->tmp == okay)			/* anything else ends the walk here */
+	{
+		s->tmp = mark;
+		for (inarc = s->ins; inarc != NULL; inarc = inarc->inchain)
+			markcanreach(nfa, inarc->from, okay, mark);
+	}
+}
+
+/*
+ * analyze - derive re_info flag bits from the shape of the finished NFA
+ */
+static long						/* re_info bits to be ORed in */
+analyze(struct nfa *nfa)
+{
+	struct arc *via;			/* an outarc of the pre state */
+	struct arc *hit = NULL;		/* arc from via's target into post, if any */
+
+	if (NISERR())
+		return 0;
+	if (nfa->pre->outs == NULL)
+		return REG_UIMPOSSIBLE;
+
+	/* look for post exactly two arcs beyond pre; stop at the first hit */
+	for (via = nfa->pre->outs; via != NULL && hit == NULL; via = via->outchain)
+		for (hit = via->to->outs; hit != NULL && hit->to != nfa->post;)
+			hit = hit->outchain;
+	return (hit != NULL) ? REG_UEMPTYMATCH : 0;
+}
+
+/*
+ * compact - build the compact, array-based representation of an NFA
+ */
+static void
+compact(struct nfa *nfa,
+		struct cnfa *cnfa)
+{
+	struct state *st;
+	struct arc *arc;
+	struct carc *slot;			/* next unused entry of cnfa->arcs */
+	struct carc *runstart;		/* first entry belonging to current state */
+	size_t		statecount = 0;
+	size_t		arccount = 0;
+
+	assert(!NISERR());
+
+	/* Size the arrays: every state needs its outarcs plus one end marker */
+	for (st = nfa->states; st != NULL; st = st->next)
+	{
+		statecount++;
+		arccount += st->nouts + 1;
+	}
+
+	cnfa->stflags = (char *) MALLOC(statecount * sizeof(char));
+	cnfa->states = (struct carc **) MALLOC(statecount * sizeof(struct carc *));
+	cnfa->arcs = (struct carc *) MALLOC(arccount * sizeof(struct carc));
+	if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
+	{
+		if (cnfa->stflags != NULL)
+			FREE(cnfa->stflags);
+		if (cnfa->states != NULL)
+			FREE(cnfa->states);
+		if (cnfa->arcs != NULL)
+			FREE(cnfa->arcs);
+		NERR(REG_ESPACE);		/* after releasing whatever did get allocated */
+		return;
+	}
+
+	cnfa->nstates = statecount;
+	cnfa->pre = nfa->pre->no;
+	cnfa->post = nfa->post->no;
+	cnfa->bos[0] = nfa->bos[0];
+	cnfa->bos[1] = nfa->bos[1];
+	cnfa->eos[0] = nfa->eos[0];
+	cnfa->eos[1] = nfa->eos[1];
+	cnfa->ncolors = maxcolor(nfa->cm) + 1;
+	cnfa->flags = 0;
+
+	slot = cnfa->arcs;			/* arcs are laid out state by state */
+	for (st = nfa->states; st != NULL; st = st->next)
+	{
+		assert((size_t) st->no < statecount);
+		cnfa->stflags[st->no] = 0;
+		cnfa->states[st->no] = slot;
+		runstart = slot;
+		for (arc = st->outs; arc != NULL; arc = arc->outchain)
+		{
+			if (arc->type == PLAIN)
+			{
+				slot->co = arc->co;
+				slot->to = arc->to->no;
+				slot++;
+			}
+			else if (arc->type == LACON)
+			{
+				assert(st->no != cnfa->pre);
+				slot->co = (color) (cnfa->ncolors + arc->co);
+				slot->to = arc->to->no;
+				slot++;
+				cnfa->flags |= HASLACONS;
+			}
+			else
+				NERR(REG_ASSERT);	/* any other arc type is reported */
+		}
+		carcsort(runstart, slot - runstart);
+		slot->co = COLORLESS;	/* end marker closing this state's run */
+		slot->to = 0;
+		slot++;
+	}
+	assert(slot == &cnfa->arcs[arccount]);
+	assert(cnfa->nstates != 0);
+
+	/* Flag the pre state and its direct successors as no-progress states */
+	for (arc = nfa->pre->outs; arc != NULL; arc = arc->outchain)
+		cnfa->stflags[arc->to->no] = CNFA_NOPROGRESS;
+	cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
+}
+
+/*
+ * carcsort - put n compacted arcs into carc_cmp order (color, then target)
+ */
+static void
+carcsort(struct carc *first, size_t n)
+{
+	if (n >= 2)					/* shorter runs are already in order */
+		qsort(first, n, sizeof(*first), carc_cmp);
+}
+
+/*
+ * carc_cmp - qsort comparator for compacted arcs: ascending by color,
+ * with equal colors ordered by ascending target state
+ */
+static int
+carc_cmp(const void *a, const void *b)
+{
+	const struct carc *lhs = (const struct carc *) a;
+	const struct carc *rhs = (const struct carc *) b;
+
+	if (lhs->co != rhs->co)
+		return (lhs->co < rhs->co) ? -1 : +1;
+	if (lhs->to != rhs->to)
+		return (lhs->to < rhs->to) ? -1 : +1;
+	return 0;
+}
+
+/*
+ * freecnfa - release the arrays owned by a compacted NFA
+ */
+static void
+freecnfa(struct cnfa *cnfa)
+{
+	assert(cnfa->nstates != 0); /* must not have been emptied before */
+	FREE(cnfa->arcs);
+	FREE(cnfa->states);
+	FREE(cnfa->stflags);
+	cnfa->nstates = 0;			/* zero state count marks the cnfa as empty */
+}
+
+/*
+ * dumpnfa - write a readable listing of an NFA to f (REG_DEBUG builds only)
+ */
+static void
+dumpnfa(struct nfa *nfa,
+		FILE *f)
+{
+#ifdef REG_DEBUG
+	struct state *st;
+	int			statetotal = 0;
+	int			arctotal = 0;
+
+	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+	if (nfa->bos[0] != COLORLESS)
+		fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
+	if (nfa->bos[1] != COLORLESS)
+		fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
+	if (nfa->eos[0] != COLORLESS)
+		fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
+	if (nfa->eos[1] != COLORLESS)
+		fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+	fprintf(f, "\n");
+	/* one entry per state, tallying states and outarcs as we go */
+	for (st = nfa->states; st != NULL; st = st->next, statetotal++)
+	{
+		arctotal += st->nouts;
+		dumpstate(st, f);
+	}
+	fprintf(f, "total of %d states, %d arcs\n", statetotal, arctotal);
+	if (nfa->parent == NULL)
+		dumpcolors(nfa->cm, f);
+	fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG				/* subordinates of dumpnfa */
+
+/*
+ * dumpstate - print one NFA state, with sanity checks on its linkage
+ */
+static void
+dumpstate(struct state *s,
+		  FILE *f)
+{
+	struct arc *inarc;
+	const char *tmpmark = (s->tmp == NULL) ? "" : "T";
+
+	fprintf(f, "%d%s%c", s->no, tmpmark, (s->flag) ? s->flag : '.');
+	if (s->prev != NULL && s->prev->next != s)
+		fprintf(f, "\tstate chain bad\n");
+	if (s->nouts != 0)
+		dumparcs(s, f);
+	else
+		fprintf(f, "\tno out arcs\n");
+	fflush(f);
+
+	/* every arc on the in-chain ought to point at s; report any that don't */
+	for (inarc = s->ins; inarc != NULL; inarc = inarc->inchain)
+		if (inarc->to != s)
+			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+					inarc->from->no, inarc->to->no, s->no);
+}
+
+/*
+ * dumparcs - print a state's outarcs, five to a line, starting from the
+ * tail of the out-chain and working back toward its head
+ */
+static void
+dumparcs(struct state *s,
+		 FILE *f)
+{
+	struct arc *cur = s->outs;
+	int			onthisline = 0; /* arcs printed since the last newline */
+
+	assert(cur != NULL);
+
+	/* run to the tail of the out-chain; printing proceeds backward from it */
+	for (; cur->outchain != NULL; cur = cur->outchain)
+		;
+
+	for (; cur != NULL; cur = cur->outchainRev)
+	{
+		dumparc(cur, s, f);
+		if (++onthisline == 5)
+		{
+			fprintf(f, "\n");
+			onthisline = 0;
+		}
+	}
+
+	/* finish off a partially filled line */
+	if (onthisline != 0)
+		fprintf(f, "\n");
+}
+
+/*
+ * dumparc - print one outarc of s, tab first, with "?" marks for bad links
+ */
+static void
+dumparc(struct arc *a,
+		struct state *s,
+		FILE *f)
+{
+	struct arc *scan;
+	struct arcbatch *bat;
+	int			found = 0;
+
+	fprintf(f, "\t");
+	/* the label, in a notation that depends on the arc type */
+	switch (a->type)
+	{
+		case EMPTY:
+			break;
+		case '^':
+		case '$':
+			fprintf(f, "%c%d", a->type, (int) a->co);
+			break;
+		case PLAIN:
+			fprintf(f, "[%ld]", (long) a->co);
+			break;
+		case LACON:
+			fprintf(f, ":%ld:", (long) a->co);
+			break;
+		case AHEAD:
+			fprintf(f, ">%ld>", (long) a->co);
+			break;
+		case BEHIND:
+			fprintf(f, "<%ld<", (long) a->co);
+			break;
+		default:
+			fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
+			break;
+	}
+	if (a->from != s)
+		fprintf(f, "?%d?", a->from->no);
+
+	/* is the arc one of the slots in some batch owned by its from-state? */
+	for (bat = &a->from->oas; bat != NULL && !found; bat = bat->next)
+		for (scan = &bat->a[0]; scan < &bat->a[ABSIZE] && !found; scan++)
+			found = (scan == a);
+	if (!found)
+		fprintf(f, "?!?");		/* not in allocated space */
+
+	fprintf(f, "->");
+	if (a->to == NULL)
+	{
+		fprintf(f, "NULL");
+		return;
+	}
+	fprintf(f, "%d", a->to->no);
+	/* and does it actually appear on the to-state's in-chain? */
+	for (scan = a->to->ins; scan != NULL && scan != a; scan = scan->inchain)
+		;
+	if (scan == NULL)
+		fprintf(f, "?!?");		/* missing from in-chain */
+}
+#endif							/* REG_DEBUG */
+
+/*
+ * dumpcnfa - write a readable listing of a compacted NFA to f
+ */
+#ifdef REG_DEBUG
+static void
+dumpcnfa(struct cnfa *cnfa,
+		 FILE *f)
+{
+	int			stateno = 0;
+
+	fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
+	if (cnfa->bos[0] != COLORLESS)
+		fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
+	if (cnfa->bos[1] != COLORLESS)
+		fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
+	if (cnfa->eos[0] != COLORLESS)
+		fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
+	if (cnfa->eos[1] != COLORLESS)
+		fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
+	if ((cnfa->flags & HASLACONS) != 0)
+		fprintf(f, ", haslacons");
+	fprintf(f, "\n");
+	while (stateno < cnfa->nstates)
+		dumpcstate(stateno++, cnfa, f);
+	fflush(f);
+}
+#endif
+
+#ifdef REG_DEBUG				/* subordinates of dumpcnfa */
+
+/*
+ * dumpcstate - print one compacted-NFA state and its arcs, five per line
+ */
+static void
+dumpcstate(int st,
+		   struct cnfa *cnfa,
+		   FILE *f)
+{
+	struct carc *first = cnfa->states[st];
+	struct carc *cur;
+	int			onthisline = 0; /* arcs printed since the last newline */
+
+	/* state number, then ":" if flagged CNFA_NOPROGRESS or "." otherwise */
+	fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
+	for (cur = first; cur->co != COLORLESS; cur++)
+	{
+		if (cur->co >= cnfa->ncolors)
+			fprintf(f, "\t:%ld:->%d", (long) (cur->co - cnfa->ncolors), cur->to);
+		else
+			fprintf(f, "\t[%ld]->%d", (long) cur->co, cur->to);
+		if (++onthisline == 5)
+		{
+			fprintf(f, "\n");
+			onthisline = 0;
+		}
+	}
+	/* terminate a partial line; a state with no arcs gets a bare newline */
+	if (cur == first || onthisline != 0)
+		fprintf(f, "\n");
+	fflush(f);
+}
+
+#endif							/* REG_DEBUG */
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
new file mode 100644
index 0000000..3cc2d4d
--- /dev/null
+++ b/src/backend/regex/regc_pg_locale.c
@@ -0,0 +1,935 @@
+/*-------------------------------------------------------------------------
+ *
+ * regc_pg_locale.c
+ *	  ctype functions adapted to work on pg_wchar (a/k/a chr),
+ *	  and functions to cache the results of wholesale ctype probing.
+ *
+ * This file is #included by regcomp.c; it's not meant to compile standalone.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regc_pg_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "catalog/pg_collation.h"
+#include "utils/pg_locale.h"
+
+/*
+ * To provide as much functionality as possible on a variety of platforms,
+ * without going so far as to implement everything from scratch, we use
+ * several implementation strategies depending on the situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
+ * collations don't give a fig about multibyte characters.
+ *
+ * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
+ *
+ * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * This assumes that every platform uses Unicode codepoints directly
+ * as the wchar_t representation of Unicode.  On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * values up to 255, and punt for values above that.  This is 100% correct
+ * only in single-byte encodings such as LATINn.  However, non-Unicode
+ * multibyte encodings are mostly Far Eastern character sets for which the
+ * properties being tested here aren't very relevant for higher code values
+ * anyway.  The difficulty with using the <wctype.h> functions with
+ * non-Unicode multibyte encodings is that we can have no certainty that
+ * the platform's wchar_t representation matches what we do in pg_wchar
+ * conversions.
+ *
+ * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
+ * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
+ * functions, under exactly the same cases as #2.
+ *
+ * There is one notable difference between cases 2 and 3: in the "default"
+ * collation we force ASCII letters to follow ASCII upcase/downcase rules,
+ * while in a non-default collation we just let the library functions do what
+ * they will.  The case where this matters is treatment of I/i in Turkish,
+ * and the behavior is meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables.  In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+
+typedef enum
+{
+	PG_REGEX_LOCALE_C,			/* C locale (encoding independent) */
+	PG_REGEX_LOCALE_WIDE,		/* Use <wctype.h> functions */
+	PG_REGEX_LOCALE_1BYTE,		/* Use <ctype.h> functions */
+	PG_REGEX_LOCALE_WIDE_L,		/* Use locale_t <wctype.h> functions */
+	PG_REGEX_LOCALE_1BYTE_L,	/* Use locale_t <ctype.h> functions */
+	PG_REGEX_LOCALE_ICU			/* Use ICU uchar.h functions */
+} PG_Locale_Strategy;
+
+static PG_Locale_Strategy pg_regex_strategy;	/* set by pg_set_regex_collation */
+static pg_locale_t pg_regex_locale; /* 0 for C and default collations */
+static Oid	pg_regex_collation; /* C_COLLATION_OID, or collation as given */
+
+/*
+ * Hard-wired character properties for C locale, indexed by 7-bit ASCII code
+ */
+#define PG_ISDIGIT	0x01
+#define PG_ISALPHA	0x02
+#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)	/* a test matches either bit */
+#define PG_ISUPPER	0x04
+#define PG_ISLOWER	0x08
+#define PG_ISGRAPH	0x10
+#define PG_ISPRINT	0x20
+#define PG_ISPUNCT	0x40
+#define PG_ISSPACE	0x80
+
+static const unsigned char pg_char_properties[128] = {
+	 /* NUL */ 0,
+	 /* ^A */ 0,
+	 /* ^B */ 0,
+	 /* ^C */ 0,
+	 /* ^D */ 0,
+	 /* ^E */ 0,
+	 /* ^F */ 0,
+	 /* ^G */ 0,
+	 /* ^H */ 0,
+	 /* ^I */ PG_ISSPACE,
+	 /* ^J */ PG_ISSPACE,
+	 /* ^K */ PG_ISSPACE,
+	 /* ^L */ PG_ISSPACE,
+	 /* ^M */ PG_ISSPACE,
+	 /* ^N */ 0,
+	 /* ^O */ 0,
+	 /* ^P */ 0,
+	 /* ^Q */ 0,
+	 /* ^R */ 0,
+	 /* ^S */ 0,
+	 /* ^T */ 0,
+	 /* ^U */ 0,
+	 /* ^V */ 0,
+	 /* ^W */ 0,
+	 /* ^X */ 0,
+	 /* ^Y */ 0,
+	 /* ^Z */ 0,
+	 /* ^[ */ 0,
+	 /* ^\ */ 0,
+	 /* ^] */ 0,
+	 /* ^^ */ 0,
+	 /* ^_ */ 0,
+	 /* SP */ PG_ISPRINT | PG_ISSPACE,
+	 /* !  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* "  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* #  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* $  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* %  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* &  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* '  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* (  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* )  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* *  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* +  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ,  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* -  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* .  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* /  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* 0  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 1  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 2  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 3  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 4  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 5  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 6  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 7  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 8  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* 9  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	 /* :  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ;  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* <  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* =  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* >  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ?  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* @  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* A  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* B  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* C  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* D  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* E  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* F  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* G  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* H  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* I  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* J  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* K  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* L  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* M  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* N  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* O  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* P  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* Q  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* R  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* S  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* T  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* U  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* V  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* W  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* X  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* Y  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* Z  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	 /* [  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* \  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ]  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ^  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* _  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* `  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* a  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* b  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* c  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* d  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* e  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* f  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* g  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* h  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* i  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* j  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* k  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* l  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* m  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* n  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* o  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* p  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* q  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* r  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* s  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* t  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* u  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* v  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* w  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* x  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* y  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* z  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	 /* {  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* |  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* }  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* ~  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	 /* DEL */ 0
+};
+
+
+/*
+ * pg_set_regex_collation: set collation for these functions to obey
+ *
+ * Called when beginning compilation or execution of a regexp.  It records
+ * the classification strategy, the locale object (if any) and the collation
+ * in static variables, which is okay since regexp operations need not be
+ * reentrant.  An invalid collation OID or a nondeterministic collation is
+ * reported with ereport(ERROR).
+ */
+void
+pg_set_regex_collation(Oid collation)
+{
+	/* C/POSIX collations use this path regardless of database encoding */
+	if (lc_ctype_is_c(collation))
+	{
+		pg_regex_strategy = PG_REGEX_LOCALE_C;
+		pg_regex_locale = NULL;
+		pg_regex_collation = C_COLLATION_OID;
+		return;
+	}
+
+	/* Otherwise, find the locale object; none is needed for the default */
+	if (collation == DEFAULT_COLLATION_OID)
+		pg_regex_locale = NULL;
+	else if (OidIsValid(collation))
+	{
+		/*
+		 * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; the
+		 * case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not have to
+		 * be considered below.
+		 */
+		pg_regex_locale = pg_newlocale_from_collation(collation);
+	}
+	else
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for regular expression"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+
+	if (pg_regex_locale != NULL && !pg_regex_locale->deterministic)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("nondeterministic collations are not supported for regular expressions")));
+
+	/*
+	 * Pick the strategy: ICU if the locale uses that provider, else the wide
+	 * (<wctype.h>) functions in UTF8 and the 1-byte (<ctype.h>) ones in any
+	 * other encoding; the _L variants apply when we have a locale object.
+	 */
+#ifdef USE_ICU
+	if (pg_regex_locale != NULL &&
+		pg_regex_locale->provider == COLLPROVIDER_ICU)
+		pg_regex_strategy = PG_REGEX_LOCALE_ICU;
+	else
+#endif
+	if (GetDatabaseEncoding() == PG_UTF8)
+		pg_regex_strategy = (pg_regex_locale != NULL) ?
+			PG_REGEX_LOCALE_WIDE_L : PG_REGEX_LOCALE_WIDE;
+	else
+		pg_regex_strategy = (pg_regex_locale != NULL) ?
+			PG_REGEX_LOCALE_1BYTE_L : PG_REGEX_LOCALE_1BYTE;
+
+	pg_regex_collation = collation;
+}
+
+/*
+ * pg_wc_isdigit - is c a digit according to the active regex strategy?
+ */
+static int
+pg_wc_isdigit(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswdigit((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isdigit(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isalpha - is c alphabetic according to the active regex strategy?
+ */
+static int
+pg_wc_isalpha(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswalpha((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isalpha(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isalnum - is c alphanumeric according to the active regex strategy?
+ */
+static int
+pg_wc_isalnum(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswalnum((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isalnum(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isupper - is c upper case according to the active regex strategy?
+ */
+static int
+pg_wc_isupper(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswupper((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isupper(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_islower - is c lower case according to the active regex strategy?
+ */
+static int
+pg_wc_islower(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswlower((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					islower_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_islower(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isgraph - is c a graphic chr according to the active regex strategy?
+ */
+static int
+pg_wc_isgraph(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswgraph((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isgraph(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isprint - is c printable according to the active regex strategy?
+ */
+static int
+pg_wc_isprint(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswprint((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isprint(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_ispunct - is c punctuation according to the active regex strategy?
+ */
+static int
+pg_wc_ispunct(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswpunct((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_ispunct(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_isspace - is c white space according to the active regex strategy?
+ */
+static int
+pg_wc_isspace(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE);
+		case PG_REGEX_LOCALE_WIDE:
+			if (wide_ok)
+				return iswspace((wint_t) c);
+			/* FALL THRU - wchar_t is too narrow to hold c */
+		case PG_REGEX_LOCALE_1BYTE:
+			return c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c);
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_isspace(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_toupper - map c to upper case per the active regex strategy
+ *
+ * In the C, WIDE and 1BYTE strategies ASCII always follows C rules.  A chr
+ * the selected library function cannot take is returned unchanged.
+ */
+static pg_wchar
+pg_wc_toupper(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+		case PG_REGEX_LOCALE_WIDE:
+		case PG_REGEX_LOCALE_1BYTE:
+			/* all of these force C behavior for ASCII characters */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_toupper((unsigned char) c);
+			if (pg_regex_strategy == PG_REGEX_LOCALE_C)
+				return c;
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE && wide_ok)
+				return towupper((wint_t) c);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return toupper((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
+#endif
+			return c;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_toupper(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+/*
+ * pg_wc_tolower - map c to lower case per the active regex strategy
+ *
+ * In the C, WIDE and 1BYTE strategies ASCII always follows C rules.  A chr
+ * the selected library function cannot take is returned unchanged.
+ */
+static pg_wchar
+pg_wc_tolower(pg_wchar c)
+{
+	bool		wide_ok = (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF);
+
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+		case PG_REGEX_LOCALE_WIDE:
+		case PG_REGEX_LOCALE_1BYTE:
+			/* all of these force C behavior for ASCII characters */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_tolower((unsigned char) c);
+			if (pg_regex_strategy == PG_REGEX_LOCALE_C)
+				return c;
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE && wide_ok)
+				return towlower((wint_t) c);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return tolower((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE_L:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (pg_regex_strategy == PG_REGEX_LOCALE_WIDE_L && wide_ok)
+				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
+#endif
+			return c;
+		case PG_REGEX_LOCALE_ICU:
+#ifdef USE_ICU
+			return u_tolower(c);
+#endif
+			break;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+
+
+/*
+ * These functions cache the results of probing libc's ctype behavior for
+ * all character codes of interest in a given encoding/collation.  The
+ * result is provided as a "struct cvec", but notice that the representation
+ * is a touch different from a cvec created by regc_cvec.c: we allocate the
+ * chrs[] and ranges[] arrays separately from the struct so that we can
+ * realloc them larger at need.  This is okay since the cvecs made here
+ * should never be freed by freecvec().
+ *
+ * We use malloc not palloc since we mustn't lose control on out-of-memory;
+ * the main regex code expects us to return a failure indication instead.
+ */
+
+typedef int (*pg_wc_probefunc) (pg_wchar c);	/* nonzero if c is in class */
+
+typedef struct pg_ctype_cache
+{
+	pg_wc_probefunc probefunc;	/* pg_wc_isalpha or a sibling */
+	Oid			collation;		/* collation this entry is for */
+	struct cvec cv;				/* cache entry contents */
+	struct pg_ctype_cache *next;	/* chain link */
+} pg_ctype_cache;
+
+static pg_ctype_cache *pg_ctype_cache_list = NULL;	/* newest entry first */
+
+/*
+ * Add a chr (nchrs == 1) or a range of nchrs consecutive chrs starting at
+ * chr1 to pcc->cv, doubling the target array when it is full.  Return false
+ * if run out of memory; the cvec is then left exactly as it was.
+ */
+static bool
+store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
+{
+	struct cvec *cv = &pcc->cv;
+	chr		   *grown;
+
+	if (nchrs > 1)
+	{
+		if (cv->nranges >= cv->rangespace)
+		{
+			grown = (chr *) realloc(cv->ranges,
+									sizeof(chr) * 2 * cv->rangespace * 2);
+			if (grown == NULL)
+				return false;
+			cv->ranges = grown;
+			cv->rangespace *= 2;	/* only now that the array really grew */
+		}
+		cv->ranges[cv->nranges * 2] = chr1;
+		cv->ranges[cv->nranges * 2 + 1] = chr1 + nchrs - 1;
+		cv->nranges++;
+		return true;
+	}
+	assert(nchrs == 1);
+	if (cv->nchrs >= cv->chrspace)
+	{
+		grown = (chr *) realloc(cv->chrs, sizeof(chr) * cv->chrspace * 2);
+		if (grown == NULL)
+			return false;
+		cv->chrs = grown;
+		cv->chrspace *= 2;
+	}
+	cv->chrs[cv->nchrs++] = chr1;
+	return true;
+}
+
+/*
+ * pg_ctype_get_cache - get a struct cvec of all chrs satisfying probefunc
+ *
+ * probefunc is a probe function such as pg_wc_isalpha.  The active collation
+ * is the one previously set by pg_set_regex_collation.  Results are cached
+ * per (probefunc, collation) pair, so the result must not be freed or
+ * modified by caller.  Return NULL if out of memory.
+ */
+static struct cvec *
+pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
+{
+	pg_ctype_cache *pcc;
+	pg_wchar	max_chr;
+	pg_wchar	cur_chr;
+	int			nmatches;
+	chr		   *newchrs;
+
+	/*
+	 * Do we already have the answer cached?
+	 */
+	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
+	{
+		if (pcc->probefunc == probefunc &&
+			pcc->collation == pg_regex_collation)
+			return &pcc->cv;
+	}
+
+	/*
+	 * Nope, so initialize some workspace ...
+	 */
+	pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
+	if (pcc == NULL)
+		return NULL;
+	pcc->probefunc = probefunc;
+	pcc->collation = pg_regex_collation;
+	pcc->cv.nchrs = 0;
+	pcc->cv.chrspace = 128;
+	pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
+	pcc->cv.nranges = 0;
+	pcc->cv.rangespace = 64;
+	pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
+	if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
+		goto out_of_memory;
+	pcc->cv.cclasscode = cclasscode;
+
+	/*
+	 * Decide how many character codes we ought to look through.  In general
+	 * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
+	 * runtime using the "high colormap" mechanism.  However, in C locale
+	 * there's no need to go further than 127, and if we only have a 1-byte
+	 * <ctype.h> API there's no need to go further than that can handle.
+	 *
+	 * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
+	 * output cvec as not having any locale-dependent behavior, since there
+	 * will be no need to do any run-time locale checks.  (The #if's here
+	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
+	 * useful to allow it to be small for testing purposes.)
+	 */
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+#if MAX_SIMPLE_CHR >= 127
+			max_chr = (pg_wchar) 127;
+			pcc->cv.cclasscode = -1;
+#else
+			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
+			break;
+		case PG_REGEX_LOCALE_WIDE:
+		case PG_REGEX_LOCALE_WIDE_L:
+			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+			break;
+		case PG_REGEX_LOCALE_1BYTE:
+		case PG_REGEX_LOCALE_1BYTE_L:
+#if MAX_SIMPLE_CHR >= UCHAR_MAX
+			max_chr = (pg_wchar) UCHAR_MAX;
+			pcc->cv.cclasscode = -1;
+#else
+			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
+			break;
+		case PG_REGEX_LOCALE_ICU:
+			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+			break;
+		default:
+			max_chr = 0;		/* can't get here, but keep compiler quiet */
+			break;
+	}
+
+	/*
+	 * And scan 'em: a run of nmatches consecutive matching chrs is flushed
+	 * to the cvec as soon as a non-matching chr (or the end) is reached.
+	 */
+	nmatches = 0;				/* number of consecutive matches */
+
+	for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
+	{
+		if ((*probefunc) (cur_chr))
+			nmatches++;
+		else if (nmatches > 0)
+		{
+			if (!store_match(pcc, cur_chr - nmatches, nmatches))
+				goto out_of_memory;
+			nmatches = 0;
+		}
+	}
+
+	if (nmatches > 0)
+		if (!store_match(pcc, cur_chr - nmatches, nmatches))
+			goto out_of_memory;
+
+	/*
+	 * We might have allocated more memory than needed, if so free it.  A
+	 * failed shrink is not an error: realloc leaves the original block
+	 * valid in that case, so we just keep the larger allocation.
+	 */
+	if (pcc->cv.nchrs == 0)
+	{
+		free(pcc->cv.chrs);
+		pcc->cv.chrs = NULL;
+		pcc->cv.chrspace = 0;
+	}
+	else if (pcc->cv.nchrs < pcc->cv.chrspace)
+	{
+		newchrs = (chr *) realloc(pcc->cv.chrs,
+								  pcc->cv.nchrs * sizeof(chr));
+		if (newchrs != NULL)
+		{
+			pcc->cv.chrs = newchrs;
+			pcc->cv.chrspace = pcc->cv.nchrs;
+		}
+	}
+	if (pcc->cv.nranges == 0)
+	{
+		free(pcc->cv.ranges);
+		pcc->cv.ranges = NULL;
+		pcc->cv.rangespace = 0;
+	}
+	else if (pcc->cv.nranges < pcc->cv.rangespace)
+	{
+		newchrs = (chr *) realloc(pcc->cv.ranges,
+								  pcc->cv.nranges * sizeof(chr) * 2);
+		if (newchrs != NULL)
+		{
+			pcc->cv.ranges = newchrs;
+			pcc->cv.rangespace = pcc->cv.nranges;
+		}
+	}
+
+	/* Success, link it into cache chain */
+	pcc->next = pg_ctype_cache_list;
+	pg_ctype_cache_list = pcc;
+
+	return &pcc->cv;
+
+	/* Failure, clean up; free(NULL) is a no-op, so no guards are needed */
+out_of_memory:
+	free(pcc->cv.chrs);
+	free(pcc->cv.ranges);
+	free(pcc);
+
+	return NULL;
+}
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
new file mode 100644
index 0000000..91078dc
--- /dev/null
+++ b/src/backend/regex/regcomp.c
@@ -0,0 +1,2194 @@
+/*
+ * re_*comp and friends - compile REs
+ * This file #includes several others (see the bottom).
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regcomp.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+/*
+ * Forward declarations of static functions; up here so forward datatypes etc. are defined early.
+ */
+/* === regcomp.c (functions defined in this file) === */
+static void moresubs(struct vars *, int);	/* enlarge subRE vector */
+static int	freev(struct vars *, int);	/* free vars' substructures, return error code */
+static void makesearch(struct vars *, struct nfa *);	/* turn NFA into search NFA */
+static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
+static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
+static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
+static void nonword(struct vars *, int, struct state *, struct state *);
+static void word(struct vars *, int, struct state *, struct state *);
+static int	scannum(struct vars *);
+static void repeat(struct vars *, struct state *, struct state *, int, int);
+static void bracket(struct vars *, struct state *, struct state *);
+static void cbracket(struct vars *, struct state *, struct state *);
+static void brackpart(struct vars *, struct state *, struct state *);
+static const chr *scanplain(struct vars *);
+static void onechr(struct vars *, chr, struct state *, struct state *);
+static void wordchrs(struct vars *);
+static void processlacon(struct vars *, struct state *, struct state *, int,
+						 struct state *, struct state *);
+static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
+static void freesubre(struct vars *, struct subre *);
+static void freesrnode(struct vars *, struct subre *);
+static void optst(struct vars *, struct subre *);
+static int	numst(struct subre *, int);
+static void markst(struct subre *);
+static void cleanst(struct vars *);
+static long nfatree(struct vars *, struct subre *, FILE *);
+static long nfanode(struct vars *, struct subre *, int, FILE *);
+static int	newlacon(struct vars *, struct state *, struct state *, int);
+static void freelacons(struct subre *, int);
+static void rfree(regex_t *);	/* installed in the "functions" table below */
+static int	rcancelrequested(void);	/* installed in the "functions" table below */
+static int	rstacktoodeep(void);	/* installed in the "functions" table below */
+
+#ifdef REG_DEBUG
+static void dump(regex_t *, FILE *);
+static void dumpst(struct subre *, FILE *, int);
+static void stdump(struct subre *, FILE *, int);
+static const char *stid(struct subre *, char *, size_t);
+#endif
+/* === regc_lex.c (lexer routines; definitions are not in this part of the file) === */
+static void lexstart(struct vars *);	/* pg_regcomp notes this also handles prefixes */
+static void prefixes(struct vars *);
+static void lexnest(struct vars *, const chr *, const chr *);
+static void lexword(struct vars *);
+static int	next(struct vars *);	/* wrapped by the NEXT() and EAT() macros */
+static int	lexescape(struct vars *);
+static chr	lexdigits(struct vars *, int, int, int);
+static int	brenext(struct vars *, chr);
+static void skip(struct vars *);
+static chr	newline(void);	/* pg_regcomp passes the result to subcolor() for newline's color */
+static chr	chrnamed(struct vars *, const chr *, const chr *, chr);
+
+/* === regc_color.c (routines operating on struct colormap; definitions not shown here) === */
+static void initcm(struct vars *, struct colormap *);
+static void freecm(struct colormap *);
+static color maxcolor(struct colormap *);
+static color newcolor(struct colormap *);
+static void freecolor(struct colormap *, color);
+static color pseudocolor(struct colormap *);
+static color subcolor(struct colormap *, chr);
+static color subcolorhi(struct colormap *, color *);
+static color newsub(struct colormap *, color);
+static int	newhicolorrow(struct colormap *, int);
+static void newhicolorcols(struct colormap *);
+static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *);
+static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *);
+static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *);
+static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *);
+static void okcolors(struct nfa *, struct colormap *);
+static void colorchain(struct colormap *, struct arc *);
+static void uncolorchain(struct colormap *, struct arc *);
+static void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *);
+static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
+
+#ifdef REG_DEBUG
+static void dumpcolors(struct colormap *, FILE *);
+static void dumpchr(chr, FILE *);
+#endif
+/* === regc_nfa.c (routines operating on struct nfa/state/arc; definitions not shown here) === */
+static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
+static void freenfa(struct nfa *);
+static struct state *newstate(struct nfa *);
+static struct state *newfstate(struct nfa *, int flag);
+static void dropstate(struct nfa *, struct state *);
+static void freestate(struct nfa *, struct state *);
+static void destroystate(struct nfa *, struct state *);
+static void newarc(struct nfa *, int, color, struct state *, struct state *);
+static void createarc(struct nfa *, int, color, struct state *, struct state *);
+static struct arc *allocarc(struct nfa *, struct state *);
+static void freearc(struct nfa *, struct arc *);
+static void changearctarget(struct arc *, struct state *);
+static int	hasnonemptyout(struct state *);
+static struct arc *findarc(struct state *, int, color);
+static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
+static void sortins(struct nfa *, struct state *);
+static int	sortins_cmp(const void *, const void *);
+static void sortouts(struct nfa *, struct state *);
+static int	sortouts_cmp(const void *, const void *);
+static void moveins(struct nfa *, struct state *, struct state *);
+static void copyins(struct nfa *, struct state *, struct state *);
+static void mergeins(struct nfa *, struct state *, struct arc **, int);
+static void moveouts(struct nfa *, struct state *, struct state *);
+static void copyouts(struct nfa *, struct state *, struct state *);
+static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
+static void delsub(struct nfa *, struct state *, struct state *);
+static void deltraverse(struct nfa *, struct state *, struct state *);
+static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
+static void duptraverse(struct nfa *, struct state *, struct state *);
+static void cleartraverse(struct nfa *, struct state *);
+static struct state *single_color_transition(struct state *, struct state *);
+static void specialcolors(struct nfa *);
+static long optimize(struct nfa *, FILE *);
+static void pullback(struct nfa *, FILE *);
+static int	pull(struct nfa *, struct arc *, struct state **);
+static void pushfwd(struct nfa *, FILE *);
+static int	push(struct nfa *, struct arc *, struct state **);
+
+#define INCOMPATIBLE	1		/* destroys arc */
+#define SATISFIED	2			/* constraint satisfied */
+#define COMPATIBLE	3			/* compatible but not satisfied yet */
+static int	combine(struct arc *, struct arc *);	/* NOTE(review): presumably returns one of the codes above */
+static void fixempties(struct nfa *, FILE *);
+static struct state *emptyreachable(struct nfa *, struct state *,
+									struct state *, struct arc **);
+static int	isconstraintarc(struct arc *);
+static int	hasconstraintout(struct state *);
+static void fixconstraintloops(struct nfa *, FILE *);
+static int	findconstraintloop(struct nfa *, struct state *);
+static void breakconstraintloop(struct nfa *, struct state *);
+static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
+								 struct state *, struct arc *,
+								 char *, char *, int);
+static void cleanup(struct nfa *);
+static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
+static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
+static long analyze(struct nfa *);
+static void compact(struct nfa *, struct cnfa *);
+static void carcsort(struct carc *, size_t);
+static int	carc_cmp(const void *, const void *);
+static void freecnfa(struct cnfa *);
+static void dumpnfa(struct nfa *, FILE *);	/* declared even without REG_DEBUG, unlike the dumpers below */
+
+#ifdef REG_DEBUG
+static void dumpstate(struct state *, FILE *);
+static void dumparcs(struct state *, FILE *);
+static void dumparc(struct arc *, struct state *, FILE *);
+static void dumpcnfa(struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
+#endif
+/* === regc_cvec.c (routines operating on struct cvec; definitions not shown here) === */
+static struct cvec *newcvec(int, int);	/* may return NULL; pg_regcomp treats that as REG_ESPACE */
+static struct cvec *clearcvec(struct cvec *);
+static void addchr(struct cvec *, chr);
+static void addrange(struct cvec *, chr, chr);
+static struct cvec *getcvec(struct vars *, int, int);
+static void freecvec(struct cvec *);
+
+/* === regc_pg_locale.c (character classification/case mapping on pg_wchar; definitions not shown here) === */
+static int	pg_wc_isdigit(pg_wchar c);
+static int	pg_wc_isalpha(pg_wchar c);
+static int	pg_wc_isalnum(pg_wchar c);
+static int	pg_wc_isupper(pg_wchar c);
+static int	pg_wc_islower(pg_wchar c);
+static int	pg_wc_isgraph(pg_wchar c);
+static int	pg_wc_isprint(pg_wchar c);
+static int	pg_wc_ispunct(pg_wchar c);
+static int	pg_wc_isspace(pg_wchar c);
+static pg_wchar pg_wc_toupper(pg_wchar c);
+static pg_wchar pg_wc_tolower(pg_wchar c);
+
+/* === regc_locale.c (definitions not shown in this part of the file) === */
+static chr	element(struct vars *, const chr *, const chr *);
+static struct cvec *range(struct vars *, chr, chr, int);
+static int	before(chr, chr);
+static struct cvec *eclass(struct vars *, chr, int);
+static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
+static int	cclass_column_index(struct colormap *, chr);
+static struct cvec *allcases(struct vars *, chr);
+static int	cmp(const chr *, const chr *, size_t);	/* pg_regcomp stores this in g->compare when REG_ICASE is off */
+static int	casecmp(const chr *, const chr *, size_t);	/* pg_regcomp stores this in g->compare when REG_ICASE is on */
+
+
+/* internal compile-time state, bundled into one struct for easy passing around */
+struct vars
+{
+	regex_t    *re;				/* regex being compiled; pg_regcomp NULLs it on success */
+	const chr  *now;			/* scan pointer into string */
+	const chr  *stop;			/* end of string */
+	const chr  *savenow;		/* saved now and stop for "subroutine call" */
+	const chr  *savestop;
+	int			err;			/* error code (0 if none) */
+	int			cflags;			/* copy of compile flags */
+	int			lasttype;		/* type of previous token */
+	int			nexttype;		/* type of next token */
+	chr			nextvalue;		/* value (if any) of next token */
+	int			lexcon;			/* lexical context type (see lex.c) */
+	int			nsubexp;		/* subexpression count */
+	struct subre **subs;		/* subRE pointer vector; points at sub10 until enlarged */
+	size_t		nsubs;			/* length of vector */
+	struct subre *sub10[10];	/* initial vector, enough for most */
+	struct nfa *nfa;			/* the NFA */
+	struct colormap *cm;		/* character color map */
+	color		nlcolor;		/* color of newline */
+	struct state *wordchrs;		/* state in nfa holding word-char outarcs */
+	struct subre *tree;			/* subexpression tree */
+	struct subre *treechain;	/* all tree nodes allocated */
+	struct subre *treefree;		/* any free tree nodes */
+	int			ntree;			/* number of tree nodes, plus one */
+	struct cvec *cv;			/* interface cvec */
+	struct cvec *cv2;			/* utility cvec */
+	struct subre *lacons;		/* lookaround-constraint vector */
+	int			nlacons;		/* size of lacons[]; note that only slots
+								 * numbered 1 .. nlacons-1 are used */
+	size_t		spaceused;		/* approx. space used for compilation */
+};
+
+/* parsing macros; most assume a local variable `v' that is the struct vars pointer */
+#define NEXT()	(next(v))		/* advance by one token */
+#define SEE(t)	(v->nexttype == (t))	/* is next token this? */
+#define EAT(t)	(SEE(t) && next(v)) /* if next is this, swallow it */
+#define VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
+#define ISERR() VISERR(v)
+#define VERR(vv,e)	((vv)->nexttype = EOS, \
+					 (vv)->err = ((vv)->err ? (vv)->err : (e)))
+#define ERR(e)	VERR(v, e)		/* record an error: forces next token to EOS, keeps first nonzero code */
+#define NOERR() {if (ISERR()) return;}	/* if error seen, return */
+#define NOERRN()	{if (ISERR()) return NULL;} /* NOERR with retval */
+#define NOERRZ()	{if (ISERR()) return 0;}	/* NOERR with retval */
+#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
+#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
+#define EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)
+
+/* token type codes (single-character constants), some also used as NFA arc types */
+#define EMPTY	'n'				/* no token present */
+#define EOS 'e'					/* end of string */
+#define PLAIN	'p'				/* ordinary character */
+#define DIGIT	'd'				/* digit (in bound) */
+#define BACKREF 'b'				/* back reference */
+#define COLLEL	'I'				/* start of [. */
+#define ECLASS	'E'				/* start of [= */
+#define CCLASS	'C'				/* start of [: */
+#define END 'X'					/* end of [. [= [: */
+#define RANGE	'R'				/* - within [] which might be range delim. */
+#define LACON	'L'				/* lookaround constraint subRE */
+#define AHEAD	'a'				/* color-lookahead arc */
+#define BEHIND	'r'				/* color-lookbehind arc */
+#define WBDRY	'w'				/* word boundary constraint */
+#define NWBDRY	'W'				/* non-word-boundary constraint */
+#define SBEGIN	'A'				/* beginning of string (even if not BOL) */
+#define SEND	'Z'				/* end of string (even if not EOL) */
+
+/* is an arc colored (type PLAIN, AHEAD or BEHIND), and hence on a color chain? */
+#define COLORED(a) \
+	((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
+
+
+/* static function list; pg_regcomp() stores its address in re->re_fns */
+static const struct fns functions = {
+	rfree,						/* regfree insides */
+	rcancelrequested,			/* check for cancel request */
+	rstacktoodeep				/* check for stack getting dangerously deep */
+};
+
+
+
+/*
+ * pg_regcomp - compile the regular expression string[0..len-1] into *re
+ *
+ * Returns 0 on success, else a REG_xxx error code.  On failure, no resources
+ * remain allocated, so pg_regfree() need not be applied to re.
+ */
+int
+pg_regcomp(regex_t *re,
+		   const chr *string,
+		   size_t len,
+		   int flags,
+		   Oid collation)
+{
+	struct vars var;
+	struct vars *v = &var;
+	struct guts *g;
+	int			i;
+	size_t		j;
+
+#ifdef REG_DEBUG
+	FILE	   *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
+#else
+	FILE	   *debug = (FILE *) NULL;
+#endif
+
+#define  CNOERR()	 { if (ISERR()) return freev(v, v->err); }
+
+	/* sanity checks: reject NULL arguments and contradictory flag combinations */
+
+	if (re == NULL || string == NULL)
+		return REG_INVARG;
+	if ((flags & REG_QUOTE) &&
+		(flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
+		return REG_INVARG;
+	if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
+		return REG_INVARG;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(collation);
+
+	/* initial setup (after which freev() is callable) */
+	v->re = re;
+	v->now = string;
+	v->stop = v->now + len;
+	v->savenow = v->savestop = NULL;
+	v->err = 0;
+	v->cflags = flags;
+	v->nsubexp = 0;
+	v->subs = v->sub10;			/* start with the in-struct vector */
+	v->nsubs = 10;
+	for (j = 0; j < v->nsubs; j++)
+		v->subs[j] = NULL;
+	v->nfa = NULL;
+	v->cm = NULL;
+	v->nlcolor = COLORLESS;
+	v->wordchrs = NULL;
+	v->tree = NULL;
+	v->treechain = NULL;
+	v->treefree = NULL;
+	v->cv = NULL;
+	v->cv2 = NULL;
+	v->lacons = NULL;
+	v->nlacons = 0;
+	v->spaceused = 0;
+	re->re_magic = REMAGIC;
+	re->re_info = 0;			/* bits get set during parse */
+	re->re_csize = sizeof(chr);
+	re->re_collation = collation;
+	re->re_guts = NULL;
+	re->re_fns = VS(&functions);
+
+	/* more complex setup, malloced things */
+	re->re_guts = VS(MALLOC(sizeof(struct guts)));
+	if (re->re_guts == NULL)
+		return freev(v, REG_ESPACE);
+	g = (struct guts *) re->re_guts;
+	g->tree = NULL;
+	initcm(v, &g->cmap);
+	v->cm = &g->cmap;
+	g->lacons = NULL;
+	g->nlacons = 0;
+	ZAPCNFA(g->search);
+	v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
+	CNOERR();
+	/* set up a reasonably-sized transient cvec for getcvec usage */
+	v->cv = newcvec(100, 20);
+	if (v->cv == NULL)
+		return freev(v, REG_ESPACE);
+
+	/* parsing */
+	lexstart(v);				/* also handles prefixes */
+	if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
+	{
+		/* assign newline a unique color */
+		v->nlcolor = subcolor(v->cm, newline());
+		okcolors(v->nfa, v->cm);
+	}
+	CNOERR();
+	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
+	assert(SEE(EOS));			/* even if error; ISERR() => SEE(EOS) */
+	CNOERR();
+	assert(v->tree != NULL);
+
+	/* finish setup of nfa and its subre tree */
+	specialcolors(v->nfa);
+	CNOERR();
+#ifdef REG_DEBUG
+	if (debug != NULL)
+	{
+		fprintf(debug, "\n\n\n========= RAW ==========\n");
+		dumpnfa(v->nfa, debug);
+		dumpst(v->tree, debug, 1);
+	}
+#endif
+	optst(v, v->tree);
+	v->ntree = numst(v->tree, 1);
+	markst(v->tree);
+	cleanst(v);
+#ifdef REG_DEBUG
+	if (debug != NULL)
+	{
+		fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
+		dumpst(v->tree, debug, 1);
+	}
+#endif
+
+	/* build compacted NFAs for tree and lacons */
+	re->re_info |= nfatree(v, v->tree, debug);
+	CNOERR();
+	assert(v->nlacons == 0 || v->lacons != NULL);
+	for (i = 1; i < v->nlacons; i++)	/* slot 0 of lacons[] is unused */
+	{
+		struct subre *lasub = &v->lacons[i];
+
+#ifdef REG_DEBUG
+		if (debug != NULL)
+			fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
+#endif
+
+		/* Prepend .* to pattern if it's a lookbehind LACON */
+		nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
+	}
+	CNOERR();
+	if (v->tree->flags & SHORTER)
+		NOTE(REG_USHORTEST);
+
+	/* build the compacted NFA used for fast search (stored in g->search) */
+#ifdef REG_DEBUG
+	if (debug != NULL)
+		fprintf(debug, "\n\n\n========= SEARCH ==========\n");
+#endif
+	/* can sacrifice main NFA now, so use it as work area */
+	(DISCARD) optimize(v->nfa, debug);
+	CNOERR();
+	makesearch(v, v->nfa);
+	CNOERR();
+	compact(v->nfa, &g->search);
+	CNOERR();
+
+	/* looks okay, package it up */
+	re->re_nsub = v->nsubexp;
+	v->re = NULL;				/* freev no longer frees re */
+	g->magic = GUTSMAGIC;
+	g->cflags = v->cflags;
+	g->info = re->re_info;
+	g->nsub = re->re_nsub;
+	g->tree = v->tree;
+	v->tree = NULL;				/* tree now belongs to g, so freev must not free it */
+	g->ntree = v->ntree;
+	g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
+	g->lacons = v->lacons;
+	v->lacons = NULL;			/* likewise, lacons now belong to g */
+	g->nlacons = v->nlacons;
+
+#ifdef REG_DEBUG
+	if (flags & REG_DUMP)
+		dump(re, stdout);
+#endif
+
+	assert(v->err == 0);
+	return freev(v, 0);			/* releases remaining transient state, returns v->err */
+}
+
+/*
+ * moresubs - enlarge subRE vector so that index "wanted" fits; sets REG_ESPACE on failure
+ */
+static void
+moresubs(struct vars *v,
+		 int wanted)			/* want enough room for this one */
+{
+	struct subre **p;
+	size_t		n;
+
+	assert(wanted > 0 && (size_t) wanted >= v->nsubs);
+	n = (size_t) wanted * 3 / 2 + 1;	/* new length: about 1.5 * wanted */
+
+	if (v->subs == v->sub10)	/* still using the in-struct vector? */
+	{
+		p = (struct subre **) MALLOC(n * sizeof(struct subre *));
+		if (p != NULL)
+			memcpy(VS(p), VS(v->subs),
+				   v->nsubs * sizeof(struct subre *));
+	}
+	else
+		p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
+	if (p == NULL)
+	{
+		ERR(REG_ESPACE);		/* v->subs and v->nsubs are left unchanged */
+		return;
+	}
+	v->subs = p;
+	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
+		*p = NULL;				/* NULL-fill the newly added slots */
+	assert(v->nsubs == n);
+	assert((size_t) wanted < v->nsubs);
+}
+
+/*
+ * freev - free vars struct's substructures where necessary
+ *
+ * Optionally records "err" (if nonzero and no earlier error is set), and always
+ * returns v->err, to make error-handling code terser.
+ */
+static int
+freev(struct vars *v,
+	  int err)
+{
+	if (v->re != NULL)			/* pg_regcomp NULLs v->re once compile succeeds */
+		rfree(v->re);
+	if (v->subs != v->sub10)	/* free only if moresubs() replaced the in-struct vector */
+		FREE(v->subs);
+	if (v->nfa != NULL)
+		freenfa(v->nfa);
+	if (v->tree != NULL)
+		freesubre(v, v->tree);
+	if (v->treechain != NULL)
+		cleanst(v);
+	if (v->cv != NULL)
+		freecvec(v->cv);
+	if (v->cv2 != NULL)
+		freecvec(v->cv2);
+	if (v->lacons != NULL)
+		freelacons(v->lacons, v->nlacons);
+	ERR(err);					/* nop if err==0 */
+
+	return v->err;
+}
+
+/*
+ * makesearch - turn an NFA into a search NFA (implicit prepend of .*?), in place.
+ * NFA must have been optimize()d already.  Returns early if an error is noted.
+ */
+static void
+makesearch(struct vars *v,
+		   struct nfa *nfa)
+{
+	struct arc *a;
+	struct arc *b;
+	struct state *pre = nfa->pre;
+	struct state *s;
+	struct state *s2;
+	struct state *slist;
+
+	/* no loops are needed if it's anchored */
+	for (a = pre->outs; a != NULL; a = a->outchain)
+	{
+		assert(a->type == PLAIN);
+		if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
+			break;				/* found an outarc whose color is neither bos[] color */
+	}
+	if (a != NULL)				/* loop above stopped early, so not anchored */
+	{
+		/* add implicit .* in front */
+		rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
+
+		/* and ^* and \A* too -- not always necessary, but harmless */
+		newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
+		newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
+	}
+
+	/*
+	 * Now here's the subtle part.  Because many REs have no lookback
+	 * constraints, often knowing when you were in the pre state tells you
+	 * little; it's the next state(s) that are informative.  But some of them
+	 * may have other inarcs, i.e. it may be possible to make actual progress
+	 * and then return to one of them.  We must de-optimize such cases,
+	 * splitting each such state into progress and no-progress states.
+	 */
+
+	/* first, make a list of the states reachable from pre and elsewhere */
+	slist = NULL;
+	for (a = pre->outs; a != NULL; a = a->outchain)
+	{
+		s = a->to;
+		for (b = s->ins; b != NULL; b = b->inchain)
+		{
+			if (b->from != pre)
+				break;			/* s has an inarc from some state other than pre */
+		}
+
+		/*
+		 * We want to mark states as being in the list already by having non
+		 * NULL tmp fields, but we can't just store the old slist value in tmp
+		 * because that doesn't work for the first such state.  Instead, the
+		 * first list entry gets its own address in tmp.
+		 */
+		if (b != NULL && s->tmp == NULL)
+		{
+			s->tmp = (slist != NULL) ? slist : s;
+			slist = s;
+		}
+	}
+
+	/* do the splits */
+	for (s = slist; s != NULL; s = s2)
+	{
+		s2 = newstate(nfa);		/* new state that will receive s's non-pre inarcs */
+		NOERR();
+		copyouts(nfa, s, s2);
+		NOERR();
+		for (a = s->ins; a != NULL; a = b)
+		{
+			b = a->inchain;		/* fetch successor first, since a may be freed below */
+			if (a->from != pre)
+			{
+				cparc(nfa, a, a->from, s2);
+				freearc(nfa, a);
+			}
+		}
+		s2 = (s->tmp != s) ? s->tmp : NULL;	/* next list entry; a self-link marks the end */
+		s->tmp = NULL;			/* clean up while we're at it */
+	}
+}
+
+/*
+ * parse - parse an RE
+ *
+ * This is actually just the top level, which parses a bunch of branches
+ * tied together with '|'.  They appear in the tree as the left children
+ * of a chain of '|' subres.  Can return NULL once an error has been noted.
+ */
+static struct subre *
+parse(struct vars *v,
+	  int stopper,				/* EOS or ')' */
+	  int type,					/* LACON (lookaround subRE) or PLAIN */
+	  struct state *init,		/* initial state */
+	  struct state *final)		/* final state */
+{
+	struct state *left;			/* scaffolding for branch */
+	struct state *right;
+	struct subre *branches;		/* top level */
+	struct subre *branch;		/* current branch */
+	struct subre *t;			/* temporary */
+	int			firstbranch;	/* is this the first branch? */
+
+	assert(stopper == ')' || stopper == EOS);
+
+	branches = subre(v, '|', LONGER, init, final);
+	NOERRN();
+	branch = branches;
+	firstbranch = 1;
+	do
+	{							/* a branch */
+		if (!firstbranch)
+		{
+			/* need a place to hang it */
+			branch->right = subre(v, '|', LONGER, init, final);
+			NOERRN();
+			branch = branch->right;
+		}
+		firstbranch = 0;
+		left = newstate(v->nfa);
+		right = newstate(v->nfa);
+		NOERRN();
+		EMPTYARC(init, left);
+		EMPTYARC(right, final);
+		NOERRN();
+		branch->left = parsebranch(v, stopper, type, left, right, 0);
+		NOERRN();
+		branch->flags |= UP(branch->flags | branch->left->flags);
+		if ((branch->flags & ~branches->flags) != 0)	/* new flags */
+			for (t = branches; t != branch; t = t->right)
+				t->flags |= branch->flags;	/* copy them to earlier '|' nodes */
+	} while (EAT('|'));
+	assert(SEE(stopper) || SEE(EOS));
+
+	if (!SEE(stopper))
+	{
+		assert(stopper == ')' && SEE(EOS));
+		ERR(REG_EPAREN);		/* reached EOS while still expecting ')' */
+	}
+
+	/* optimize out simple cases */
+	if (branch == branches)
+	{							/* only one branch */
+		assert(branch->right == NULL);
+		t = branch->left;
+		branch->left = NULL;	/* detach the child before freeing the '|' node */
+		freesubre(v, branches);
+		branches = t;
+	}
+	else if (!MESSY(branches->flags))
+	{							/* no interesting innards */
+		freesubre(v, branches->left);
+		branches->left = NULL;
+		freesubre(v, branches->right);
+		branches->right = NULL;
+		branches->op = '=';		/* top node becomes a childless '=' node */
+	}
+
+	return branches;
+}
+
+/*
+ * parsebranch - parse one branch of an RE (up to '|', the stopper, or EOS)
+ *
+ * This mostly manages concatenation, working closely with parseqatom().
+ * Concatenated things are bundled up as much as possible, with separate
+ * ',' nodes introduced only when necessary due to substructure.
+ */
+static struct subre *
+parsebranch(struct vars *v,
+			int stopper,		/* EOS or ')' */
+			int type,			/* LACON (lookaround subRE) or PLAIN */
+			struct state *left, /* leftmost state */
+			struct state *right,	/* rightmost state */
+			int partial)		/* is this only part of a branch? */
+{
+	struct state *lp;			/* left end of current construct */
+	int			seencontent;	/* is there anything in this branch yet? */
+	struct subre *t;
+
+	lp = left;
+	seencontent = 0;
+	t = subre(v, '=', 0, left, right);	/* op '=' is tentative */
+	NOERRN();
+	while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
+	{
+		if (seencontent)
+		{						/* implicit concat operator */
+			lp = newstate(v->nfa);
+			NOERRN();
+			moveins(v->nfa, right, lp); /* NOTE(review): presumably re-points right's inarcs at lp */
+		}
+		seencontent = 1;
+
+		/* NB, recursion in parseqatom() may swallow rest of branch */
+		parseqatom(v, stopper, type, lp, right, t);
+		NOERRN();
+	}
+
+	if (!seencontent)
+	{							/* empty branch: just link left to right with an EMPTY arc */
+		if (!partial)
+			NOTE(REG_UUNSPEC);
+		assert(lp == left);
+		EMPTYARC(left, right);
+	}
+
+	return t;
+}
+
+/*
+ * parseqatom - parse one quantified atom or constraint of an RE
+ *
+ * The bookkeeping near the end cooperates very closely with parsebranch();
+ * in particular, it contains a recursion that can involve parsing the rest
+ * of the branch, making this function's name somewhat inaccurate.
+ */
+static void
+parseqatom(struct vars *v,
+		   int stopper,			/* EOS or ')' */
+		   int type,			/* LACON (lookaround subRE) or PLAIN */
+		   struct state *lp,	/* left state to hang it on */
+		   struct state *rp,	/* right state to hang it on */
+		   struct subre *top)	/* subtree top */
+{
+	struct state *s;			/* temporaries for new states */
+	struct state *s2;
+
+#define  ARCV(t, val)	 newarc(v->nfa, t, val, lp, rp)
+	int			m,
+				n;
+	struct subre *atom;			/* atom's subtree */
+	struct subre *t;
+	int			cap;			/* capturing parens? */
+	int			latype;			/* lookaround constraint type */
+	int			subno;			/* capturing-parens or backref number */
+	int			atomtype;
+	int			qprefer;		/* quantifier short/long preference */
+	int			f;
+	struct subre **atomp;		/* where the pointer to atom is */
+
+	/* initial bookkeeping */
+	atom = NULL;
+	assert(lp->nouts == 0);		/* must string new code */
+	assert(rp->nins == 0);		/* between lp and rp */
+	subno = 0;					/* just to shut lint up */
+
+	/* an atom or constraint... */
+	atomtype = v->nexttype;
+	switch (atomtype)
+	{
+			/* first, constraints, which end by returning */
+		case '^':
+			ARCV('^', 1);
+			if (v->cflags & REG_NLANCH)
+				ARCV(BEHIND, v->nlcolor);
+			NEXT();
+			return;
+			break;
+		case '$':
+			ARCV('$', 1);
+			if (v->cflags & REG_NLANCH)
+				ARCV(AHEAD, v->nlcolor);
+			NEXT();
+			return;
+			break;
+		case SBEGIN:
+			ARCV('^', 1);		/* BOL */
+			ARCV('^', 0);		/* or BOS */
+			NEXT();
+			return;
+			break;
+		case SEND:
+			ARCV('$', 1);		/* EOL */
+			ARCV('$', 0);		/* or EOS */
+			NEXT();
+			return;
+			break;
+		case '<':
+			wordchrs(v);		/* does NEXT() */
+			s = newstate(v->nfa);
+			NOERR();
+			nonword(v, BEHIND, lp, s);
+			word(v, AHEAD, s, rp);
+			return;
+			break;
+		case '>':
+			wordchrs(v);		/* does NEXT() */
+			s = newstate(v->nfa);
+			NOERR();
+			word(v, BEHIND, lp, s);
+			nonword(v, AHEAD, s, rp);
+			return;
+			break;
+		case WBDRY:
+			wordchrs(v);		/* does NEXT() */
+			s = newstate(v->nfa);
+			NOERR();
+			nonword(v, BEHIND, lp, s);
+			word(v, AHEAD, s, rp);
+			s = newstate(v->nfa);
+			NOERR();
+			word(v, BEHIND, lp, s);
+			nonword(v, AHEAD, s, rp);
+			return;
+			break;
+		case NWBDRY:
+			wordchrs(v);		/* does NEXT() */
+			s = newstate(v->nfa);
+			NOERR();
+			word(v, BEHIND, lp, s);
+			word(v, AHEAD, s, rp);
+			s = newstate(v->nfa);
+			NOERR();
+			nonword(v, BEHIND, lp, s);
+			nonword(v, AHEAD, s, rp);
+			return;
+			break;
+		case LACON:				/* lookaround constraint */
+			latype = v->nextvalue;
+			NEXT();
+			s = newstate(v->nfa);
+			s2 = newstate(v->nfa);
+			NOERR();
+			t = parse(v, ')', LACON, s, s2);
+			freesubre(v, t);	/* internal structure irrelevant */
+			NOERR();
+			assert(SEE(')'));
+			NEXT();
+			processlacon(v, s, s2, latype, lp, rp);
+			return;
+			break;
+			/* then errors, to get them out of the way */
+		case '*':
+		case '+':
+		case '?':
+		case '{':
+			ERR(REG_BADRPT);
+			return;
+			break;
+		default:
+			ERR(REG_ASSERT);
+			return;
+			break;
+			/* then plain characters, and minor variants on that theme */
+		case ')':				/* unbalanced paren */
+			if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
+			{
+				ERR(REG_EPAREN);
+				return;
+			}
+			/* legal in EREs due to specification botch */
+			NOTE(REG_UPBOTCH);
+			/* fall through into case PLAIN */
+			/* FALLTHROUGH */
+		case PLAIN:
+			onechr(v, v->nextvalue, lp, rp);
+			okcolors(v->nfa, v->cm);
+			NOERR();
+			NEXT();
+			break;
+		case '[':
+			if (v->nextvalue == 1)
+				bracket(v, lp, rp);
+			else
+				cbracket(v, lp, rp);
+			assert(SEE(']') || ISERR());
+			NEXT();
+			break;
+		case '.':
+			rainbow(v->nfa, v->cm, PLAIN,
+					(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
+					lp, rp);
+			NEXT();
+			break;
+			/* and finally the ugly stuff */
+		case '(':				/* value flags as capturing or non */
+			cap = (type == LACON) ? 0 : v->nextvalue;
+			if (cap)
+			{
+				v->nsubexp++;
+				subno = v->nsubexp;
+				if ((size_t) subno >= v->nsubs)
+					moresubs(v, subno);
+				assert((size_t) subno < v->nsubs);
+			}
+			else
+				atomtype = PLAIN;	/* something that's not '(' */
+			NEXT();
+			/* need new endpoints because tree will contain pointers */
+			s = newstate(v->nfa);
+			s2 = newstate(v->nfa);
+			NOERR();
+			EMPTYARC(lp, s);
+			EMPTYARC(s2, rp);
+			NOERR();
+			atom = parse(v, ')', type, s, s2);
+			assert(SEE(')') || ISERR());
+			NEXT();
+			NOERR();
+			if (cap)
+			{
+				v->subs[subno] = atom;
+				t = subre(v, '(', atom->flags | CAP, lp, rp);
+				NOERR();
+				t->subno = subno;
+				t->left = atom;
+				atom = t;
+			}
+			/* postpone everything else pending possible {0} */
+			break;
+		case BACKREF:			/* the Feature From The Black Lagoon */
+			INSIST(type != LACON, REG_ESUBREG);
+			INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
+			INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+			NOERR();
+			assert(v->nextvalue > 0);
+			atom = subre(v, 'b', BACKR, lp, rp);
+			NOERR();
+			subno = v->nextvalue;
+			atom->subno = subno;
+			EMPTYARC(lp, rp);	/* temporarily, so there's something */
+			NEXT();
+			break;
+	}
+
+	/* ...and an atom may be followed by a quantifier */
+	switch (v->nexttype)
+	{
+		case '*':
+			m = 0;
+			n = DUPINF;
+			qprefer = (v->nextvalue) ? LONGER : SHORTER;
+			NEXT();
+			break;
+		case '+':
+			m = 1;
+			n = DUPINF;
+			qprefer = (v->nextvalue) ? LONGER : SHORTER;
+			NEXT();
+			break;
+		case '?':
+			m = 0;
+			n = 1;
+			qprefer = (v->nextvalue) ? LONGER : SHORTER;
+			NEXT();
+			break;
+		case '{':
+			NEXT();
+			m = scannum(v);
+			if (EAT(','))
+			{
+				if (SEE(DIGIT))
+					n = scannum(v);
+				else
+					n = DUPINF;
+				if (m > n)
+				{
+					ERR(REG_BADBR);
+					return;
+				}
+				/* {m,n} exercises preference, even if it's {m,m} */
+				qprefer = (v->nextvalue) ? LONGER : SHORTER;
+			}
+			else
+			{
+				n = m;
+				/* {m} passes operand's preference through */
+				qprefer = 0;
+			}
+			if (!SEE('}'))
+			{					/* catches errors too */
+				ERR(REG_BADBR);
+				return;
+			}
+			NEXT();
+			break;
+		default:				/* no quantifier */
+			m = n = 1;
+			qprefer = 0;
+			break;
+	}
+
+	/* annoying special case:  {0} or {0,0} cancels everything */
+	if (m == 0 && n == 0)
+	{
+		if (atom != NULL)
+			freesubre(v, atom);
+		if (atomtype == '(')
+			v->subs[subno] = NULL;
+		delsub(v->nfa, lp, rp);
+		EMPTYARC(lp, rp);
+		return;
+	}
+
+	/* if not a messy case, avoid hard part */
+	assert(!MESSY(top->flags));
+	f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
+	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
+	{
+		if (!(m == 1 && n == 1))
+			repeat(v, lp, rp, m, n);
+		if (atom != NULL)
+			freesubre(v, atom);
+		top->flags = f;
+		return;
+	}
+
+	/*
+	 * hard part:  something messy
+	 *
+	 * That is, capturing parens, back reference, short/long clash, or an atom
+	 * with substructure containing one of those.
+	 */
+
+	/* now we'll need a subre for the contents even if they're boring */
+	if (atom == NULL)
+	{
+		atom = subre(v, '=', 0, lp, rp);
+		NOERR();
+	}
+
+	/*----------
+	 * Prepare a general-purpose state skeleton.
+	 *
+	 * In the no-backrefs case, we want this:
+	 *
+	 * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+	 *
+	 * where prefix is some repetitions of atom.  In the general case we need
+	 *
+	 * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+	 *
+	 * where the iterator wraps around [begin] ---atom---> [end]
+	 *
+	 * We make the s state here for both cases; s2 is made below if needed
+	 *----------
+	 */
+	s = newstate(v->nfa);		/* first, new endpoints for the atom */
+	s2 = newstate(v->nfa);
+	NOERR();
+	moveouts(v->nfa, lp, s);
+	moveins(v->nfa, rp, s2);
+	NOERR();
+	atom->begin = s;
+	atom->end = s2;
+	s = newstate(v->nfa);		/* set up starting state */
+	NOERR();
+	EMPTYARC(lp, s);
+	NOERR();
+
+	/* break remaining subRE into x{...} and what follows */
+	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
+	NOERR();
+	t->left = atom;
+	atomp = &t->left;
+
+	/* here we should recurse... but we must postpone that to the end */
+
+	/* split top into prefix and remaining */
+	assert(top->op == '=' && top->left == NULL && top->right == NULL);
+	top->left = subre(v, '=', top->flags, top->begin, lp);
+	NOERR();
+	top->op = '.';
+	top->right = t;
+
+	/* if it's a backref, now is the time to replicate the subNFA */
+	if (atomtype == BACKREF)
+	{
+		assert(atom->begin->nouts == 1);	/* just the EMPTY */
+		delsub(v->nfa, atom->begin, atom->end);
+		assert(v->subs[subno] != NULL);
+
+		/*
+		 * And here's why the recursion got postponed: it must wait until the
+		 * skeleton is filled in, because it may hit a backref that wants to
+		 * copy the filled-in skeleton.
+		 */
+		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+			   atom->begin, atom->end);
+		NOERR();
+	}
+
+	/*
+	 * It's quantifier time.  If the atom is just a backref, we'll let it deal
+	 * with quantifiers internally.
+	 */
+	if (atomtype == BACKREF)
+	{
+		/* special case:  backrefs have internal quantifiers */
+		EMPTYARC(s, atom->begin);	/* empty prefix */
+		/* just stuff everything into atom */
+		repeat(v, atom->begin, atom->end, m, n);
+		atom->min = (short) m;
+		atom->max = (short) n;
+		atom->flags |= COMBINE(qprefer, atom->flags);
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
+	}
+	else if (m == 1 && n == 1 &&
+			 (qprefer == 0 ||
+			  (atom->flags & (LONGER | SHORTER | MIXED)) == 0 ||
+			  qprefer == (atom->flags & (LONGER | SHORTER | MIXED))))
+	{
+		/* no/vacuous quantifier:  done */
+		EMPTYARC(s, atom->begin);	/* empty prefix */
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
+	}
+	else if (m > 0 && !(atom->flags & BACKR))
+	{
+		/*
+		 * If there's no backrefs involved, we can turn x{m,n} into
+		 * x{m-1,n-1}x, with capturing parens in only the second x.  This is
+		 * valid because we only care about capturing matches from the final
+		 * iteration of the quantifier.  It's a win because we can implement
+		 * the backref-free left side as a plain DFA node, since we don't
+		 * really care where its submatches are.
+		 */
+		dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
+		assert(m >= 1 && m != DUPINF && n >= 1);
+		repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? n : n - 1);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '.', f, s, atom->end); /* prefix and atom */
+		NOERR();
+		t->left = subre(v, '=', PREF(f), s, atom->begin);
+		NOERR();
+		t->right = atom;
+		*atomp = t;
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
+	}
+	else
+	{
+		/* general case: need an iteration node */
+		s2 = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, atom->end, s2);
+		NOERR();
+		dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+		repeat(v, s, s2, m, n);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '*', f, s, s2);
+		NOERR();
+		t->min = (short) m;
+		t->max = (short) n;
+		t->left = atom;
+		*atomp = t;
+		/* rest of branch is to be strung from iteration's end state */
+	}
+
+	/* and finally, look after that postponed recursion */
+	t = top->right;
+	if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
+		t->right = parsebranch(v, stopper, type, s2, rp, 1);
+	else
+	{
+		EMPTYARC(s2, rp);
+		t->right = subre(v, '=', 0, s2, rp);
+	}
+	NOERR();
+	assert(SEE('|') || SEE(stopper) || SEE(EOS));
+	t->flags |= COMBINE(t->flags, t->right->flags);
+	top->flags |= COMBINE(top->flags, t->flags);
+}
+
+/*
+ * nonword - generate arcs for non-word-character ahead or behind
+ *
+ * dir (AHEAD or BEHIND) picks the anchor type used: '$' or '^' respectively.
+ */
+static void
+nonword(struct vars *v, int dir, struct state *lp, struct state *rp)
+{
+	int			boundary;
+
+	assert(dir == AHEAD || dir == BEHIND);
+	boundary = (dir == AHEAD) ? '$' : '^';
+	newarc(v->nfa, boundary, 1, lp, rp);
+	newarc(v->nfa, boundary, 0, lp, rp);
+	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
+	/* (newline needs no special treatment here) */
+}
+
+/*
+ * word - generate arcs for word character ahead or behind
+ *
+ * dir must be AHEAD or BEHIND.  The arcs come from cloneouts() applied to
+ * the v->wordchrs state, strung between lp and rp.
+ */
+static void
+word(struct vars *v, int dir, struct state *lp, struct state *rp)
+{
+	assert(dir == AHEAD || dir == BEHIND);
+	cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
+	/* (newline needs no special treatment here) */
+}
+
+/*
+ * scannum - scan a number, stopping as soon as it reaches DUPMAX
+ *
+ * Extra digits or a value beyond DUPMAX yield REG_BADBR and a result of 0.
+ */
+static int						/* value, <= DUPMAX */
+scannum(struct vars *v)
+{
+	int			value = 0;
+
+	while (SEE(DIGIT) && value < DUPMAX)
+	{
+		value = 10 * value + v->nextvalue;
+		NEXT();
+	}
+	if (!SEE(DIGIT) && value <= DUPMAX)
+		return value;
+	ERR(REG_BADBR);
+	return 0;
+}
+
+/*
+ * repeat - replicate subNFA for quantifiers
+ *
+ * Rewrites the sub-NFA strung from lp to rp in place so that it represents
+ * m to n repetitions of its initial contents (n may be DUPINF).  m and n are
+ * first reduced to 0, 1, SOME (more than 1) or INF; some of the resulting
+ * pairs are built directly, the others by recursion on smaller counts.
+ *
+ * The duplication sequences are chosen so that pointers into the original
+ * subexpression end up pointing into its last occurrence (which may not be
+ * strung between the same end states, however!).  That used to matter for
+ * the subRE tree; in-line code in parse() now handles the important bits.
+ */
+static void
+repeat(struct vars *v,
+	   struct state *lp,
+	   struct state *rp,
+	   int m,
+	   int n)
+{
+#define  SOME	 2
+#define  INF	 3
+#define  PAIR(x, y)  ((x)*4 + (y))
+#define  REDUCE(x)	 ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
+	const int	rm = REDUCE(m);
+	const int	rn = REDUCE(n);
+	struct state *s;
+	struct state *s2;
+
+	switch (PAIR(rm, rn))
+	{
+		case PAIR(0, 0):		/* empty string */
+			delsub(v->nfa, lp, rp);
+			EMPTYARC(lp, rp);
+			break;
+		case PAIR(0, 1):		/* do as x| */
+			EMPTYARC(lp, rp);
+			break;
+		case PAIR(0, SOME):		/* do as x{1,n}| */
+			repeat(v, lp, rp, 1, n);
+			NOERR();
+			EMPTYARC(lp, rp);
+			break;
+		case PAIR(0, INF):		/* loop x around */
+			s = newstate(v->nfa);
+			NOERR();
+			moveouts(v->nfa, lp, s);
+			moveins(v->nfa, rp, s);
+			EMPTYARC(lp, s);
+			EMPTYARC(s, rp);
+			break;
+		case PAIR(1, 1):		/* no action required */
+			break;
+		case PAIR(1, SOME):		/* do as x{0,n-1}x = (x{1,n-1}|)x */
+			s = newstate(v->nfa);
+			NOERR();
+			moveouts(v->nfa, lp, s);
+			dupnfa(v->nfa, s, rp, lp, s);
+			NOERR();
+			repeat(v, lp, s, 1, n - 1);
+			NOERR();
+			EMPTYARC(lp, s);
+			break;
+		case PAIR(1, INF):		/* add loopback arc */
+			s = newstate(v->nfa);
+			s2 = newstate(v->nfa);
+			NOERR();
+			moveouts(v->nfa, lp, s);
+			moveins(v->nfa, rp, s2);
+			EMPTYARC(lp, s);
+			EMPTYARC(s2, rp);
+			EMPTYARC(s2, s);
+			break;
+		case PAIR(SOME, SOME):	/* do as x{m-1,n-1}x */
+			s = newstate(v->nfa);
+			NOERR();
+			moveouts(v->nfa, lp, s);
+			dupnfa(v->nfa, s, rp, lp, s);
+			NOERR();
+			repeat(v, lp, s, m - 1, n - 1);
+			break;
+		case PAIR(SOME, INF):	/* do as x{m-1,}x */
+			s = newstate(v->nfa);
+			NOERR();
+			moveouts(v->nfa, lp, s);
+			dupnfa(v->nfa, s, rp, lp, s);
+			NOERR();
+			repeat(v, lp, s, m - 1, n);
+			break;
+		default:				/* any (m, n) pairing not listed above */
+			ERR(REG_ASSERT);
+			break;
+	}
+}
+
+/*
+ * bracket - handle non-complemented bracket expression
+ *
+ * Expects the current token to be '[' and stops with ']' (or an error) as
+ * the current token.  cbracket() reuses this for the complemented form.
+ */
+static void
+bracket(struct vars *v, struct state *lp, struct state *rp)
+{
+	assert(SEE('['));
+	NEXT();
+	while (!(SEE(']') || SEE(EOS)))
+		brackpart(v, lp, rp);	/* one item or range per call */
+	assert(SEE(']') || ISERR());
+	okcolors(v->nfa, v->cm);
+}
+
+/*
+ * cbracket - handle complemented bracket expression
+ *
+ * The bracket contents are first built by bracket() between two scratch
+ * states, and the complement of that arc set is then strung from lp to rp.
+ * (Starting from rainbow() and deleting arcs as the bracket expression is
+ * read would be the alternative, but that gets messy.)
+ */
+static void
+cbracket(struct vars *v, struct state *lp, struct state *rp)
+{
+	struct state *scratchl;
+	struct state *scratchr;
+
+	scratchl = newstate(v->nfa);
+	scratchr = newstate(v->nfa);
+	NOERR();
+	bracket(v, scratchl, scratchr);
+	/* with REG_NLSTOP, a v->nlcolor arc joins the set to be complemented */
+	if (v->cflags & REG_NLSTOP)
+		newarc(v->nfa, PLAIN, v->nlcolor, scratchl, scratchr);
+	NOERR();
+
+	assert(lp->nouts == 0);		/* all outarcs will be ours */
+
+	/* complementing is all there is to do since the MCCE code was removed */
+	colorcomplement(v->nfa, v->cm, PLAIN, scratchl, lp, rp);
+	NOERR();
+	dropstate(v->nfa, scratchl);
+	assert(scratchr->nins == 0);
+	freestate(v->nfa, scratchr);
+}
+
+/*
+ * brackpart - handle one item (or range) within a bracket expression
+ *
+ * A single character, equivalence class or character class is turned into
+ * arcs right away; otherwise the item is a range start point, and an optional
+ * RANGE token plus end point follow.  Arcs go from lp to rp.  Problems are
+ * reported through ERR()/INSIST(), after which we simply return.
+ */
+static void
+brackpart(struct vars *v, struct state *lp, struct state *rp)
+{
+	chr			lo;
+	chr			hi;
+	struct cvec *set;
+	const chr  *namebeg;
+	const chr  *nameend;
+	chr			one[1];
+
+	/* first item: dispose of the self-contained forms, else compute lo */
+	switch (v->nexttype)
+	{
+		case RANGE:				/* a-b-c or other botch */
+			ERR(REG_ERANGE);
+			return;
+		case PLAIN:
+			one[0] = v->nextvalue;
+			NEXT();
+			if (!SEE(RANGE))
+			{
+				/* ordinary chr, not a range start: take the shortcut */
+				onechr(v, one[0], lp, rp);
+				return;
+			}
+			lo = element(v, one, one + 1);
+			NOERR();
+			break;
+		case COLLEL:
+			/* scanplain() delimits the name, element() resolves it */
+			namebeg = v->now;
+			nameend = scanplain(v);
+			INSIST(namebeg < nameend, REG_ECOLLATE);
+			NOERR();
+			lo = element(v, namebeg, nameend);
+			NOERR();
+			break;
+		case ECLASS:			/* handled completely here, no range */
+			namebeg = v->now;
+			nameend = scanplain(v);
+			INSIST(namebeg < nameend, REG_ECOLLATE);
+			NOERR();
+			lo = element(v, namebeg, nameend);
+			NOERR();
+			set = eclass(v, lo, (v->cflags & REG_ICASE));
+			NOERR();
+			subcolorcvec(v, set, lp, rp);
+			return;
+		case CCLASS:			/* likewise handled completely here */
+			namebeg = v->now;
+			nameend = scanplain(v);
+			INSIST(namebeg < nameend, REG_ECTYPE);
+			NOERR();
+			set = cclass(v, namebeg, nameend, (v->cflags & REG_ICASE));
+			NOERR();
+			subcolorcvec(v, set, lp, rp);
+			return;
+		default:
+			ERR(REG_ASSERT);
+			return;
+	}
+
+	/* second item, if any: a RANGE token followed by the end point */
+	hi = lo;
+	if (SEE(RANGE))
+	{
+		NEXT();
+		switch (v->nexttype)
+		{
+			case PLAIN:
+			case RANGE:
+				/* both token types supply the end chr via nextvalue */
+				one[0] = v->nextvalue;
+				NEXT();
+				hi = element(v, one, one + 1);
+				NOERR();
+				break;
+			case COLLEL:
+				namebeg = v->now;
+				nameend = scanplain(v);
+				INSIST(namebeg < nameend, REG_ECOLLATE);
+				NOERR();
+				hi = element(v, namebeg, nameend);
+				NOERR();
+				break;
+			default:
+				ERR(REG_ERANGE);
+				return;
+		}
+	}
+
+	/*
+	 * Ranges are unportable.  Actually, standard C does guarantee that digits
+	 * are contiguous, but making that an exception is just too complicated.
+	 */
+	if (lo != hi)
+		NOTE(REG_UUNPORT);
+	set = range(v, lo, hi, (v->cflags & REG_ICASE));
+	NOERR();
+	subcolorcvec(v, set, lp, rp);
+}
+
+/*
+ * scanplain - scan PLAIN contents of [. etc.
+ *
+ * Steps over the opening token, the PLAIN tokens, and the closing END.
+ * Certain bits of trickery in lex.c know that this code does not try
+ * to look past the final bracket of the [. etc.
+ */
+static const chr *				/* just after end of sequence */
+scanplain(struct vars *v)
+{
+	const chr  *after;
+
+	assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
+	NEXT();
+
+	/* remember v->now as of the last PLAIN token (or the start, if none) */
+	for (after = v->now; SEE(PLAIN);)
+	{
+		after = v->now;
+		NEXT();
+	}
+
+	assert(SEE(END) || ISERR());
+	NEXT();
+	return after;
+}
+
+/*
+ * onechr - fill in arcs for a plain character, and possible case complements
+ *
+ * Mostly a shortcut for the common case-sensitive situation; with REG_ICASE
+ * the character goes through allcases() and subcolorcvec() instead.
+ */
+static void
+onechr(struct vars *v, chr c, struct state *lp, struct state *rp)
+{
+	if (v->cflags & REG_ICASE)
+	{
+		/* rats, need general case anyway... */
+		subcolorcvec(v, allcases(v, c), lp, rp);
+	}
+	else
+	{
+		color		prevsub = COLORLESS;
+
+		subcoloronechr(v, c, lp, rp, &prevsub);
+	}
+}
+
+/*
+ * wordchrs - set up word-chr list for word-boundary stuff, if needed
+ *
+ * The list is a bunch of arcs between two dummy states, remembered through
+ * v->wordchrs; it's disposed of by the unreachable-states sweep in NFA
+ * optimization.  Does NEXT().  Must not be called from any unusual lexical
+ * context.  This should be reconciled with the \w etc. handling in lex.c,
+ * and should be cleaned up to reduce dependencies on input scanning.
+ */
+static void
+wordchrs(struct vars *v)
+{
+	struct state *from;
+	struct state *to;
+
+	if (v->wordchrs != NULL)
+	{
+		/* list already built; advance anyway, for consistency */
+		NEXT();
+		return;
+	}
+	from = newstate(v->nfa);
+	to = newstate(v->nfa);
+	NOERR();
+	/* fine point:	implemented with [::], and lexer will set REG_ULOCALE */
+	lexword(v);
+	NEXT();
+	assert(v->savenow != NULL && SEE('['));
+	bracket(v, from, to);
+	assert((v->savenow != NULL && SEE(']')) || ISERR());
+	NEXT();
+	NOERR();
+	v->wordchrs = from;
+}
+
+/*
+ * processlacon - generate the NFA representation of a LACON
+ *
+ * In the general case this is just newlacon() + newarc(), but some cases
+ * can be optimized: when the lookaround RE is a single plain color arc (or
+ * set of arcs), it is replaced by AHEAD/BEHIND constraint arcs strung
+ * directly from lp to rp, and no LACON is created at all.
+ *
+ * latype is one of the four LATYPE_xxx codes handled below.
+ */
+static void
+processlacon(struct vars *v,
+			 struct state *begin,	/* start of parsed LACON sub-re */
+			 struct state *end, /* end of parsed LACON sub-re */
+			 int latype,
+			 struct state *lp,	/* left state to hang it on */
+			 struct state *rp)	/* right state to hang it on */
+{
+	struct state *colorset;
+	int			laconno;
+
+	/*
+	 * Check for lookaround RE consisting of a single plain color arc (or set
+	 * of arcs); this would typically be a simple chr or a bracket expression.
+	 * colorset is non-NULL when the check succeeds.
+	 */
+	colorset = single_color_transition(begin, end);
+	switch (latype)
+	{
+		case LATYPE_AHEAD_POS:
+			/* If lookahead RE is just colorset C, convert to AHEAD(C) */
+			if (colorset == NULL)
+				break;
+			cloneouts(v->nfa, colorset, lp, rp, AHEAD);
+			return;
+		case LATYPE_AHEAD_NEG:
+			/* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
+			if (colorset == NULL)
+				break;
+			colorcomplement(v->nfa, v->cm, AHEAD, colorset, lp, rp);
+			newarc(v->nfa, '$', 1, lp, rp);
+			newarc(v->nfa, '$', 0, lp, rp);
+			return;
+		case LATYPE_BEHIND_POS:
+			/* If lookbehind RE is just colorset C, convert to BEHIND(C) */
+			if (colorset == NULL)
+				break;
+			cloneouts(v->nfa, colorset, lp, rp, BEHIND);
+			return;
+		case LATYPE_BEHIND_NEG:
+			/* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
+			if (colorset == NULL)
+				break;
+			colorcomplement(v->nfa, v->cm, BEHIND, colorset, lp, rp);
+			newarc(v->nfa, '^', 1, lp, rp);
+			newarc(v->nfa, '^', 0, lp, rp);
+			return;
+		default:
+			assert(NOTREACHED);
+	}
+
+	/*
+	 * General case: we need a LACON subre and arc.  The arc is given the
+	 * number that newlacon() returns for the new entry.
+	 */
+	laconno = newlacon(v, begin, end, latype);
+	newarc(v->nfa, LACON, laconno, lp, rp);
+}
+
+/*
+ * subre - allocate a subre
+ *
+ * Reuses a node from v->treefree when possible, else MALLOCs one and links
+ * it into v->treechain.  Returns NULL with REG_ETOOBIG (stack too deep) or
+ * REG_ESPACE (out of memory) set on failure.
+ */
+static struct subre *
+subre(struct vars *v, int op, int flags,
+	  struct state *begin, struct state *end)
+{
+	struct subre *node;
+
+	/*
+	 * Checking for stack overflow here is sufficient to protect parse() and
+	 * its recursive subroutines.
+	 */
+	if (STACK_TOO_DEEP(v->re))
+	{
+		ERR(REG_ETOOBIG);
+		return NULL;
+	}
+	node = v->treefree;
+	if (node == NULL)
+	{
+		node = (struct subre *) MALLOC(sizeof(struct subre));
+		if (node == NULL)
+		{
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		node->chain = v->treechain;
+		v->treechain = node;
+	}
+	else
+		v->treefree = node->left;	/* free list is threaded through left */
+
+	assert(strchr("=b|.*(", op) != NULL);
+
+	node->op = op;
+	node->flags = flags;
+	node->id = 0;				/* will be assigned later */
+	node->subno = 0;
+	node->min = node->max = 1;
+	node->left = NULL;
+	node->right = NULL;
+	node->begin = begin;
+	node->end = end;
+	ZAPCNFA(node->cnfa);
+	return node;
+}
+
+/*
+ * freesubre - free a subRE subtree
+ *
+ * Children are released before the node itself.  A NULL sr is a no-op,
+ * which also takes care of absent children.
+ */
+static void
+freesubre(struct vars *v,		/* might be NULL */
+		  struct subre *sr)
+{
+	if (sr != NULL)
+	{
+		freesubre(v, sr->left);
+		freesubre(v, sr->right);
+		freesrnode(v, sr);
+	}
+}
+
+/*
+ * freesrnode - free one node in a subRE subtree (cnfa released, flags zeroed)
+ */
+static void
+freesrnode(struct vars *v,		/* might be NULL */
+		   struct subre *sr)
+{
+	if (sr == NULL)
+		return;
+
+	if (!NULLCNFA(sr->cnfa))
+		freecnfa(&sr->cnfa);
+	sr->flags = 0;
+
+	if (v == NULL || v->treechain == NULL)
+	{
+		FREE(sr);
+		return;
+	}
+	/* we're still parsing, maybe we can reuse the subre */
+	sr->left = v->treefree;
+	v->treefree = sr;
+}
+
+/*
+ * optst - optimize a subRE subtree
+ *
+ * Currently a placeholder: neither v nor t is examined or changed.
+ */
+static void
+optst(struct vars *v, struct subre *t)
+{
+	/*
+	 * DGP (2007-11-13): I assume it was the programmer's intent to eventually
+	 * come back and add code to optimize subRE trees, but the routine coded
+	 * just spends effort traversing the tree and doing nothing. We can do
+	 * nothing with less effort.
+	 */
+}
+
+/*
+ * numst - number tree nodes (assigning "id" indexes)
+ *
+ * Numbering is pre-order: t gets start, then the left subtree, then the right.
+ */
+static int						/* next number */
+numst(struct subre *t,
+	  int start)				/* starting point for subtree numbers */
+{
+	int			nextid = start + 1;
+
+	assert(t != NULL);
+	t->id = (short) start;
+	if (t->left != NULL)
+		nextid = numst(t->left, nextid);
+	if (t->right != NULL)
+		nextid = numst(t->right, nextid);
+	return nextid;
+}
+
+/*
+ * markst - mark tree nodes as INUSE
+ *
+ * Note: this is a great deal more subtle than it looks.  During initial
+ * parsing of a regex, all subres are linked into the treechain list;
+ * discarded ones are also linked into the treefree list for possible reuse.
+ * After we are done creating all subres required for a regex, we run markst()
+ * then cleanst(), which results in discarding all subres not reachable from
+ * v->tree.  We then clear v->treechain, indicating that subres must be found
+ * by descending from v->tree.  This changes the behavior of freesubre(): it
+ * will henceforth FREE() unwanted subres rather than sticking them into the
+ * treefree list.  (Doing that any earlier would result in dangling links in
+ * the treechain list.)  This all means that freev() will clean up correctly
+ * if invoked before or after markst()+cleanst(); but it would not work if
+ * called partway through this state conversion, so we mustn't error out
+ * in or between these two functions.
+ */
+static void
+markst(struct subre *t)
+{
+	assert(t != NULL);
+	for (; t != NULL; t = t->right)	/* walk right links, recurse left */
+	{
+		t->flags |= INUSE;
+		if (t->left != NULL)
+			markst(t->left);
+	}
+}
+
+/*
+ * cleanst - free any tree nodes not marked INUSE, then forget both lists
+ */
+static void
+cleanst(struct vars *v)
+{
+	struct subre *cur = v->treechain;
+
+	while (cur != NULL)
+	{
+		struct subre *following = cur->chain;	/* fetch before freeing */
+
+		if ((cur->flags & INUSE) == 0)
+			FREE(cur);
+		cur = following;
+	}
+	v->treechain = v->treefree = NULL;	/* treefree just on principle */
+}
+
+/*
+ * nfatree - turn a subRE subtree into a tree of compacted NFAs (kids first)
+ */
+static long						/* optimize results from top node */
+nfatree(struct vars *v, struct subre *t,
+		FILE *f)				/* for debug output */
+{
+	long		topresult;
+
+	assert(t != NULL && t->begin != NULL);
+	if (t->left != NULL)
+		(DISCARD) nfatree(v, t->left, f);
+	if (t->right != NULL)
+		(DISCARD) nfatree(v, t->right, f);
+	topresult = nfanode(v, t, 0, f);	/* 0: converttosearch is off */
+	return topresult;
+}
+
+/*
+ * nfanode - do one NFA for nfatree or lacons
+ *
+ * Copies the sub-NFA between t->begin and t->end into a scratch NFA and,
+ * while no error is pending, leaves its compacted form in t->cnfa.  If
+ * converttosearch is true, makesearch() is applied before compaction.
+ */
+static long						/* optimize results */
+nfanode(struct vars *v, struct subre *t, int converttosearch,
+		FILE *f)				/* for debug output */
+{
+	struct nfa *work;
+	long		optresult = 0;
+
+	assert(t->begin != NULL);
+
+#ifdef REG_DEBUG
+	if (f != NULL)
+	{
+		char		idbuf[50];
+
+		fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
+				stid(t, idbuf, sizeof(idbuf)));
+	}
+#endif
+	work = newnfa(v, v->cm, v->nfa);
+	NOERRZ();
+	dupnfa(work, t->begin, t->end, work->init, work->final);
+	if (!ISERR())
+		specialcolors(work);
+	if (!ISERR())
+		optresult = optimize(work, f);
+	if (!ISERR() && converttosearch)
+		makesearch(v, work);
+	if (!ISERR())
+		compact(work, &t->cnfa);
+
+	freenfa(work);				/* scratch copy is no longer needed */
+	return optresult;
+}
+
+/*
+ * newlacon - allocate a lookaround-constraint subRE
+ *
+ * v->lacons[] grows by one entry per call; slot 0 is never handed out, so
+ * the first call creates two slots.  On allocation failure REG_ESPACE is
+ * raised and 0 is returned.
+ * Only begin, end, subno and cnfa of the new entry are set here.
+ */
+static int						/* lacon number */
+newlacon(struct vars *v, struct state *begin, struct state *end, int latype)
+{
+	int			slot;
+	size_t		nbytes;
+	struct subre *grown;
+	struct subre *entry;
+
+	slot = (v->nlacons == 0) ? 1 : v->nlacons;	/* skip 0th */
+	nbytes = (slot + 1) * sizeof(struct subre);
+	if (v->nlacons == 0)
+		grown = (struct subre *) MALLOC(nbytes);
+	else
+		grown = (struct subre *) REALLOC(v->lacons, nbytes);
+	if (grown == NULL)
+	{
+		ERR(REG_ESPACE);
+		return 0;
+	}
+	v->lacons = grown;
+	v->nlacons = slot + 1;
+
+	entry = &v->lacons[slot];
+	entry->begin = begin;
+	entry->end = end;
+	entry->subno = latype;		/* subno doubles as the lookaround type */
+	ZAPCNFA(entry->cnfa);
+
+	return slot;
+}
+
+/*
+ * freelacons - free lookaround-constraint subRE vector
+ *
+ * n is the vector length including the unused 0th slot.
+ */
+static void
+freelacons(struct subre *subs, int n)
+{
+	int			idx;
+
+	assert(n > 0);
+	for (idx = 1; idx < n; idx++)	/* no 0th */
+		if (!NULLCNFA(subs[idx].cnfa))
+			freecnfa(&subs[idx].cnfa);
+	FREE(subs);
+}
+
+/*
+ * rfree - free a whole RE (insides of regfree); no-op without valid magic
+ */
+static void
+rfree(regex_t *re)
+{
+	struct guts *innards;
+
+	if (re == NULL || re->re_magic != REMAGIC)
+		return;
+
+	/* detach the guts and invalidate the RE before freeing anything */
+	innards = (struct guts *) re->re_guts;
+	re->re_magic = 0;
+	re->re_guts = NULL;
+	re->re_fns = NULL;
+	if (innards == NULL)
+		return;
+	innards->magic = 0;
+	freecm(&innards->cmap);
+	if (innards->tree != NULL)
+		freesubre((struct vars *) NULL, innards->tree);
+	if (innards->lacons != NULL)
+		freelacons(innards->lacons, innards->nlacons);
+	if (!NULLCNFA(innards->search))
+		freecnfa(&innards->search);
+	FREE(innards);
+}
+
+/*
+ * rcancelrequested - check for external request to cancel regex operation
+ *
+ * Return nonzero to fail the operation with error code REG_CANCEL,
+ * zero to keep going; nonzero requires InterruptPending plus a cancel or die
+ *
+ * The current implementation is Postgres-specific.  If we ever get around
+ * to splitting the regex code out as a standalone library, there will need
+ * to be some API to let applications define a callback function for this.
+ */
+static int
+rcancelrequested(void)
+{
+	return (InterruptPending ? (QueryCancelPending || ProcDiePending) : 0);
+}
+
+/*
+ * rstacktoodeep - check for stack getting dangerously deep
+ *
+ * Return nonzero to fail the operation with error code REG_ETOOBIG,
+ * zero to keep going; the answer comes straight from stack_is_too_deep()
+ *
+ * The current implementation is Postgres-specific.  If we ever get around
+ * to splitting the regex code out as a standalone library, there will need
+ * to be some API to let applications define a callback function for this.
+ */
+static int
+rstacktoodeep(void)
+{
+	return (int) stack_is_too_deep();
+}
+
+#ifdef REG_DEBUG
+
+/*
+ * dump - dump an RE in human-readable form
+ * (bad magic numbers are reported but do not stop the dump; NULL guts do)
+ */
+static void
+dump(regex_t *re, FILE *f)
+{
+	struct guts *g;
+	int			lano;
+
+	if (re->re_magic != REMAGIC)
+		fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
+				REMAGIC);
+	if (re->re_guts == NULL)
+	{
+		fputs("NULL guts!!!\n", f);
+		return;
+	}
+	g = (struct guts *) re->re_guts;
+	if (g->magic != GUTSMAGIC)
+		fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
+				GUTSMAGIC);
+
+	fputs("\n\n\n========= DUMP ==========\n", f);
+	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
+			(int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
+
+	dumpcolors(&g->cmap, f);
+	if (!NULLCNFA(g->search))
+	{
+		fputs("\nsearch:\n", f);
+		dumpcnfa(&g->search, f);
+	}
+	for (lano = 1; lano < g->nlacons; lano++)	/* slot 0 is skipped */
+	{
+		struct subre *lasub = &g->lacons[lano];
+		const char *kind;
+
+		switch (lasub->subno)
+		{
+			case LATYPE_AHEAD_POS:
+				kind = "positive lookahead";
+				break;
+			case LATYPE_AHEAD_NEG:
+				kind = "negative lookahead";
+				break;
+			case LATYPE_BEHIND_POS:
+				kind = "positive lookbehind";
+				break;
+			case LATYPE_BEHIND_NEG:
+				kind = "negative lookbehind";
+				break;
+			default:
+				kind = "???";
+				break;
+		}
+		fprintf(f, "\nla%d (%s):\n", lano, kind);
+		dumpcnfa(&lasub->cnfa, f);
+	}
+	fputs("\n", f);
+	dumpst(g->tree, f, 0);
+}
+
+/*
+ * dumpst - dump a subRE tree (or say so if there is none), then flush f
+ */
+static void
+dumpst(struct subre *t,
+	   FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
+{
+	if (t != NULL)
+		stdump(t, f, nfapresent);
+	else
+		fputs("null tree\n", f);
+	fflush(f);
+}
+
+/*
+ * stdump - recursive guts of dumpst
+ * Prints a description of t, then its cnfa if any, then recurses to children.
+ */
+static void
+stdump(struct subre *t, FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
+{
+	char		idbuf[50];
+
+	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
+	if (t->flags & LONGER)
+		fputs(" longest", f);
+	if (t->flags & SHORTER)
+		fputs(" shortest", f);
+	if (t->flags & MIXED)
+		fputs(" hasmixed", f);
+	if (t->flags & CAP)
+		fputs(" hascapture", f);
+	if (t->flags & BACKR)
+		fputs(" hasbackref", f);
+	if (!(t->flags & INUSE))
+		fputs(" UNUSED", f);
+	if (t->subno != 0)
+		fprintf(f, " (#%d)", t->subno);
+	if (t->min != 1 || t->max != 1)
+	{
+		fprintf(f, " {%d,", t->min);
+		if (t->max != DUPINF)
+			fprintf(f, "%d", t->max);
+		fputs("}", f);
+	}
+	if (nfapresent)
+		fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
+	if (t->left != NULL)
+		fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
+	if (t->right != NULL)
+		fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
+	if (!NULLCNFA(t->cnfa))
+	{
+		fputs("\n", f);
+		dumpcnfa(&t->cnfa, f);
+	}
+	fputs("\n", f);
+	if (t->left != NULL)
+		stdump(t->left, f, nfapresent);
+	if (t->right != NULL)
+		stdump(t->right, f, nfapresent);
+}
+
+/*
+ * stid - identify a subtree node for dumping
+ *
+ * Writes t->id if nonzero, else t's address; "unable" if buf looks too small.
+ */
+static const char *				/* points to buf or constant string */
+stid(struct subre *t, char *buf, size_t bufsize)
+{
+	/* big enough for hex int or decimal t->id? */
+	if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1)
+		return "unable";
+	if (t->id != 0)
+		snprintf(buf, bufsize, "%d", t->id);
+	else
+		snprintf(buf, bufsize, "%p", (void *) t);	/* %p wants void * */
+	return buf;
+}
+#endif							/* REG_DEBUG */
+
+
+#include "regc_lex.c"
+#include "regc_color.c"
+#include "regc_nfa.c"
+#include "regc_cvec.c"
+#include "regc_pg_locale.c"
+#include "regc_locale.c"
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
new file mode 100644
index 0000000..5695e15
--- /dev/null
+++ b/src/backend/regex/rege_dfa.c
@@ -0,0 +1,929 @@
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/rege_dfa.c
+ *
+ */
+
+/*
+ * longest - longest-preferred matching engine
+ *
+ * On success, returns match endpoint address.  Returns NULL on no match.
+ * Internal errors also return NULL, with v->err set.
+ */
+static chr *
+longest(struct vars *v,
+		struct dfa *d,
+		chr *start,				/* where the match should start */
+		chr *stop,				/* match must end at or before here */
+		int *hitstopp)			/* record whether hit v->stop, if non-NULL */
+{
+	chr		   *cp;				/* current scan position */
+	chr		   *realstop = (stop == v->stop) ? stop : stop + 1;
+	color		co;
+	struct sset *css;			/* current stateset */
+	struct sset *ss;			/* candidate next stateset */
+	chr		   *post;			/* latest lastseen of a POSTSTATE set */
+	int			i;
+	struct colormap *cm = d->cm;
+
+	/* prevent "uninitialized variable" warnings */
+	if (hitstopp != NULL)
+		*hitstopp = 0;
+
+	/* initialize: reset DFA bookkeeping and obtain the starter stateset */
+	css = initialize(v, d, start);
+	if (css == NULL)
+		return NULL;
+	cp = start;
+
+	/* startup: feed the BOS pseudo-color, or the color of the prior chr */
+	FDEBUG(("+++ startup +++\n"));
+	if (cp == v->start)
+	{
+		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+	}
+	else
+	{
+		co = GETCOLOR(cm, *(cp - 1));
+		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+	}
+	css = miss(v, d, css, co, cp, start);
+	if (css == NULL)
+		return NULL;
+	css->lastseen = cp;
+
+	/*
+	 * Main text-scanning loop, in two copies so that the non-tracing path
+	 * pays no REG_FTRACE test overhead even in REG_DEBUG builds.  It runs to
+	 * realstop, ie one chr past stop unless stop is the end of the data.
+	 */
+#ifdef REG_DEBUG
+	if (v->eflags & REG_FTRACE)
+	{
+		while (cp < realstop)
+		{
+			FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+			ss = css->outs[co];
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	}
+	else
+#endif
+	{
+		while (cp < realstop)
+		{
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];	/* cached transition, if any */
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;	/* stamp set with the position after the chr */
+			css = ss;
+		}
+	}
+
+	if (ISERR())
+		return NULL;
+
+	/* shutdown: if we ran to end of data, feed the EOS pseudo-color too */
+	FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
+	if (cp == v->stop && stop == v->stop)
+	{
+		if (hitstopp != NULL)
+			*hitstopp = 1;
+		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+		ss = miss(v, d, css, co, cp, start);
+		if (ISERR())
+			return NULL;
+		/* special case:  match ended at eol? */
+		if (ss != NULL && (ss->flags & POSTSTATE))
+			return cp;
+		else if (ss != NULL)
+			ss->lastseen = cp;	/* to be tidy */
+	}
+
+	/* find last match, if any: latest stamp among POSTSTATE sets */
+	post = d->lastpost;			/* covers POSTSTATE sets already recycled */
+	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+		if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
+			(post == NULL || post < ss->lastseen))
+			post = ss->lastseen;
+	if (post != NULL)			/* found one */
+		return post - 1;		/* stamps are one chr past the match end */
+
+	return NULL;
+}
+
+/*
+ * shortest - shortest-preferred matching engine
+ *
+ * On success, returns match endpoint address.  Returns NULL on no match.
+ * Internal errors also return NULL, with v->err set.
+ */
+static chr *
+shortest(struct vars *v,
+		 struct dfa *d,
+		 chr *start,			/* where the match should start */
+		 chr *min,				/* match must end at or after here */
+		 chr *max,				/* match must end at or before here */
+		 chr **coldp,			/* store coldstart pointer here, if non-NULL */
+		 int *hitstopp)			/* record whether hit v->stop, if non-NULL */
+{
+	chr		   *cp;				/* current scan position */
+	chr		   *realmin = (min == v->stop) ? min : min + 1;
+	chr		   *realmax = (max == v->stop) ? max : max + 1;
+	color		co;
+	struct sset *css;			/* current stateset */
+	struct sset *ss;			/* candidate next stateset */
+	struct colormap *cm = d->cm;
+
+	/* prevent "uninitialized variable" warnings */
+	if (coldp != NULL)
+		*coldp = NULL;
+	if (hitstopp != NULL)
+		*hitstopp = 0;
+
+	/* initialize: reset DFA bookkeeping and obtain the starter stateset */
+	css = initialize(v, d, start);
+	if (css == NULL)
+		return NULL;
+	cp = start;
+
+	/* startup: feed the BOS pseudo-color, or the color of the prior chr */
+	FDEBUG(("--- startup ---\n"));
+	if (cp == v->start)
+	{
+		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+	}
+	else
+	{
+		co = GETCOLOR(cm, *(cp - 1));
+		FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
+	}
+	css = miss(v, d, css, co, cp, start);
+	if (css == NULL)
+		return NULL;
+	css->lastseen = cp;
+	ss = css;
+
+	/*
+	 * Main text-scanning loop, in two copies so that the non-tracing path
+	 * pays no REG_FTRACE test overhead even in REG_DEBUG builds.  Unlike
+	 * longest(), it stops at the first POSTSTATE set seen at/after realmin.
+	 */
+#ifdef REG_DEBUG
+	if (v->eflags & REG_FTRACE)
+	{
+		while (cp < realmax)
+		{
+			FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+			ss = css->outs[co];
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+			if ((ss->flags & POSTSTATE) && cp >= realmin)
+				break;			/* NOTE BREAK OUT */
+		}
+	}
+	else
+#endif
+	{
+		while (cp < realmax)
+		{
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];	/* cached transition, if any */
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;	/* stamp set with the position after the chr */
+			css = ss;
+			if ((ss->flags & POSTSTATE) && cp >= realmin)
+				break;			/* NOTE BREAK OUT */
+		}
+	}
+
+	if (ss == NULL)				/* miss() found a dead end, or failed */
+		return NULL;
+
+	if (coldp != NULL)			/* report last no-progress state set, if any */
+		*coldp = lastcold(v, d);
+
+	if ((ss->flags & POSTSTATE) && cp > min)
+	{
+		assert(cp >= realmin);
+		cp--;					/* the match ended one chr before cp */
+	}
+	else if (cp == v->stop && max == v->stop)
+	{
+		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+		ss = miss(v, d, css, co, cp, start);
+		/* match might have ended at eol */
+		if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
+			*hitstopp = 1;		/* ran into end of data without a match */
+	}
+
+	if (ss == NULL || !(ss->flags & POSTSTATE))
+		return NULL;
+
+	return cp;
+}
+
+/*
+ * matchuntil - incremental matching engine
+ *
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*).  We determine whether a
+ * match exists starting at v->start and ending at probe.  Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ *
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+static int
+matchuntil(struct vars *v,
+		   struct dfa *d,
+		   chr *probe,			/* we want to know if a match ends here */
+		   struct sset **lastcss,	/* state storage across calls */
+		   chr **lastcp)		/* state storage across calls */
+{
+	chr		   *cp = *lastcp;	/* resume where the previous call stopped */
+	color		co;
+	struct sset *css = *lastcss;
+	struct sset *ss;
+	struct colormap *cm = d->cm;
+
+	/* initialize and startup, or restart, if necessary */
+	if (cp == NULL || cp > probe)
+	{
+		cp = v->start;			/* first call, or probe moved backwards */
+		css = initialize(v, d, cp);
+		if (css == NULL)
+			return 0;
+
+		FDEBUG((">>> startup >>>\n"));
+		co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+
+		css = miss(v, d, css, co, cp, v->start);
+		if (css == NULL)
+			return 0;
+		css->lastseen = cp;
+	}
+	else if (css == NULL)
+	{
+		/* we previously found that no match is possible beyond *lastcp */
+		return 0;
+	}
+	ss = css;
+
+	/*
+	 * Main text-scanning loop, in two copies so that the non-tracing path
+	 * pays no REG_FTRACE test overhead even in REG_DEBUG builds.  It only
+	 * advances the saved state from cp up to probe.
+	 */
+#ifdef REG_DEBUG
+	if (v->eflags & REG_FTRACE)
+	{
+		while (cp < probe)
+		{
+			FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+			ss = css->outs[co];
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, v->start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	}
+	else
+#endif
+	{
+		while (cp < probe)
+		{
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];	/* cached transition, if any */
+			if (ss == NULL)
+			{
+				ss = miss(v, d, css, co, cp + 1, v->start);
+				if (ss == NULL)
+					break;		/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	}
+
+	*lastcss = ss;				/* save scan state for the next call; */
+	*lastcp = cp;				/* a NULL ss records "no match beyond here" */
+
+	if (ss == NULL)
+		return 0;				/* impossible match, or internal error */
+
+	/* We need to process one more chr, or the EOS symbol, to check match */
+	if (cp < v->stop)
+	{
+		FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+		co = GETCOLOR(cm, *cp);
+		FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+		ss = css->outs[co];
+		if (ss == NULL)
+			ss = miss(v, d, css, co, cp + 1, v->start);
+	}
+	else
+	{
+		assert(cp == v->stop);
+		co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long) co));
+		ss = miss(v, d, css, co, cp, v->start);
+	}
+
+	if (ss == NULL || !(ss->flags & POSTSTATE))
+		return 0;				/* this lookahead step is not saved in *lastcss */
+
+	return 1;
+}
+
+/*
+ * lastcold - find the latest position reached by any no-progress stateset
+ */
+static chr *					/* endpoint, or NULL */
+lastcold(struct vars *v,
+		 struct dfa *d)
+{
+	chr		   *latest = (d->lastnopr != NULL) ? d->lastnopr : v->start;
+	struct sset *const limit = d->ssets + d->nssused;
+	struct sset *cur;
+
+	/* d->lastnopr only covers recycled sets, so check the live ones too */
+	for (cur = d->ssets; cur < limit; cur++)
+	{
+		if ((cur->flags & NOPROGRESS) != 0 && latest < cur->lastseen)
+			latest = cur->lastseen;
+	}
+	return latest;
+}
+
+/*
+ * newdfa - set up a fresh DFA; returns NULL with REG_ESPACE set if out of memory
+ */
+static struct dfa *
+newdfa(struct vars *v,
+	   struct cnfa *cnfa,
+	   struct colormap *cm,
+	   struct smalldfa *sml)	/* preallocated space, may be NULL */
+{
+	struct dfa *d;
+	size_t		nss = cnfa->nstates * 2;	/* stateset cache capacity */
+	int			wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+	struct smalldfa *smallwas = sml;	/* remember whether caller gave space */
+
+	assert(cnfa != NULL && cnfa->nstates != 0);
+
+	if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
+	{
+		/* small case: everything lives inside one struct smalldfa */
+		assert(wordsper == 1);
+		if (sml == NULL)
+		{
+			sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
+			if (sml == NULL)
+			{
+				ERR(REG_ESPACE);
+				return NULL;
+			}
+		}
+		d = &sml->dfa;
+		d->ssets = sml->ssets;
+		d->statesarea = sml->statesarea;
+		d->work = &d->statesarea[nss];
+		d->outsarea = sml->outsarea;
+		d->incarea = sml->incarea;
+		d->cptsmalloced = 0;
+		d->mallocarea = (smallwas == NULL) ? (char *) sml : NULL;
+	}
+	else
+	{
+		/* big case: the header and each work array are malloc'd separately */
+		d = (struct dfa *) MALLOC(sizeof(struct dfa));
+		if (d == NULL)
+		{
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
+		d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
+											sizeof(unsigned));
+		d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
+											  sizeof(struct sset *));
+		d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
+											sizeof(struct arcp));
+		d->cptsmalloced = 1;	/* set before the check so freedfa() works */
+		d->mallocarea = (char *) d;
+		if (d->ssets == NULL || d->statesarea == NULL ||
+			d->outsarea == NULL || d->incarea == NULL)
+		{
+			freedfa(d);
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		/* index into statesarea only once it is known to be non-NULL */
+		d->work = &d->statesarea[nss * wordsper];
+	}
+
+	d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
+	d->nssused = 0;
+	d->nstates = cnfa->nstates;
+	d->ncolors = cnfa->ncolors;
+	d->wordsper = wordsper;
+	d->cnfa = cnfa;
+	d->cm = cm;
+	d->lastpost = NULL;
+	d->lastnopr = NULL;
+	d->search = d->ssets;
+	/* initialization of sset fields is done as needed */
+	return d;
+}
+
+/*
+ * freedfa - release the storage owned by a DFA
+ */
+static void
+freedfa(struct dfa *d)
+{
+	if (d->cptsmalloced)
+	{							/* work arrays were allocated one by one */
+		if (d->incarea)
+			FREE(d->incarea);
+		if (d->outsarea)
+			FREE(d->outsarea);
+		if (d->statesarea)
+			FREE(d->statesarea);
+		if (d->ssets)
+			FREE(d->ssets);
+	}
+
+	if (d->mallocarea)
+		FREE(d->mallocarea);	/* may contain *d itself, so do it last */
+}
+
+/*
+ * hash - fold a bitvector of n words into a one-word hash code
+ *
+ * A plain XOR of the words; better hashes exist but would cost more.
+ */
+static unsigned
+hash(unsigned *uv,
+	 int n)
+{
+	unsigned	acc = 0;
+	unsigned   *wp = uv;
+
+	/* n <= 0 folds in nothing and so yields 0 */
+	for (; n > 0; n--)
+		acc ^= *wp++;
+	return acc;
+}
+
+/*
+ * initialize - ready the DFA for a new scan and return its starter stateset
+ */
+static struct sset *
+initialize(struct vars *v,
+		   struct dfa *d,
+		   chr *start)
+{
+	struct sset *starter;
+	int			k;
+
+	if (d->nssused <= 0 || !(d->ssets[0].flags & STARTER))
+	{
+		/* no starter set in the first cache slot: hand-craft one */
+		starter = getvacant(v, d, start, start);
+		if (starter == NULL)
+			return NULL;
+		for (k = 0; k < d->wordsper; k++)
+			starter->states[k] = 0;
+		BSET(starter->states, d->cnfa->pre);	/* only the pre state is set */
+		starter->hash = HASH(starter->states, d->wordsper);
+		assert(d->cnfa->pre != d->cnfa->post);
+		starter->flags = STARTER | LOCKED | NOPROGRESS;
+	}
+	else
+		starter = &d->ssets[0]; /* the previous one is still there */
+
+	/* discard the position records left over from any earlier scan */
+	d->lastpost = NULL;
+	d->lastnopr = NULL;
+	for (k = d->nssused - 1; k >= 0; k--)
+		d->ssets[k].lastseen = NULL;
+	starter->lastseen = start;	/* maybe untrue, but harmless */
+	return starter;
+}
+
+/*
+ * miss - handle a stateset cache miss
+ *
+ * css is the current stateset, co is the color of the current input character,
+ * cp points to the character after that (which is where we may need to test
+ * LACONs).  start does not affect matching behavior but is needed for pickss'
+ * heuristics about which stateset cache entry to replace.
+ *
+ * Ordinarily, returns the address of the next stateset (the one that is
+ * valid after consuming the input character).  Returns NULL if no valid
+ * NFA states remain, ie we have a certain match failure.
+ * Internal errors also return NULL, with v->err set.
+ */
+static struct sset *
+miss(struct vars *v,
+	 struct dfa *d,
+	 struct sset *css,
+	 color co,
+	 chr *cp,					/* next chr */
+	 chr *start)				/* where the attempt got started */
+{
+	struct cnfa *cnfa = d->cnfa;
+	int			i;
+	unsigned	h;				/* hash of the new stateset bitmap */
+	struct carc *ca;
+	struct sset *p;
+	int			ispost;			/* does the new set include cnfa->post? */
+	int			noprogress;		/* is every state added so far no-progress? */
+	int			gotstate;		/* did any plain arc consume co? */
+	int			dolacons;		/* is another LACON closure pass needed? */
+	int			sawlacons;		/* was any LACON arc actually evaluated? */
+
+	/* for convenience, we can be called even if it might not be a miss */
+	if (css->outs[co] != NULL)
+	{
+		FDEBUG(("hit\n"));
+		return css->outs[co];
+	}
+	FDEBUG(("miss\n"));
+
+	/*
+	 * Checking for operation cancel in the inner text search loop seems
+	 * unduly expensive.  As a compromise, check during cache misses.
+	 */
+	if (CANCEL_REQUESTED(v->re))
+	{
+		ERR(REG_CANCEL);
+		return NULL;
+	}
+
+	/*
+	 * What set of states would we end up in after consuming the co character?
+	 * We first consider PLAIN arcs that consume the character, and then look
+	 * to see what LACON arcs could be traversed after consuming it.
+	 */
+	for (i = 0; i < d->wordsper; i++)
+		d->work[i] = 0;			/* build new stateset bitmap in d->work */
+	ispost = 0;
+	noprogress = 1;
+	gotstate = 0;
+	for (i = 0; i < d->nstates; i++)
+		if (ISBSET(css->states, i))
+			for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
+				if (ca->co == co)
+				{
+					BSET(d->work, ca->to);
+					gotstate = 1;
+					if (ca->to == cnfa->post)
+						ispost = 1;
+					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
+						noprogress = 0;
+					FDEBUG(("%d -> %d\n", i, ca->to));
+				}
+	if (!gotstate)
+		return NULL;			/* character cannot reach any new state */
+	dolacons = (cnfa->flags & HASLACONS);
+	sawlacons = 0;
+	/* outer loop handles transitive closure of reachable-by-LACON states */
+	while (dolacons)
+	{
+		dolacons = 0;			/* set again below if this pass adds a state */
+		for (i = 0; i < d->nstates; i++)
+			if (ISBSET(d->work, i))
+				for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
+				{
+					if (ca->co < cnfa->ncolors)
+						continue;	/* not a LACON arc */
+					if (ISBSET(d->work, ca->to))
+						continue;	/* arc would be a no-op anyway */
+					sawlacons = 1;	/* this LACON affects our result */
+					if (!lacon(v, cnfa, cp, ca->co))
+					{
+						if (ISERR())
+							return NULL;
+						continue;	/* LACON arc cannot be traversed */
+					}
+					if (ISERR())
+						return NULL;
+					BSET(d->work, ca->to);
+					dolacons = 1;
+					if (ca->to == cnfa->post)
+						ispost = 1;
+					if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
+						noprogress = 0;
+					FDEBUG(("%d :> %d\n", i, ca->to));
+				}
+	}
+	h = HASH(d->work, d->wordsper);
+
+	/* Is this stateset already in the cache? */
+	for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+		if (HIT(h, d->work, p, d->wordsper))
+		{
+			FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
+			break;				/* NOTE BREAK OUT */
+		}
+	if (i == 0)
+	{							/* nope, need a new cache entry */
+		p = getvacant(v, d, cp, start);
+		if (p == NULL)
+			return NULL;
+		assert(p != css);
+		for (i = 0; i < d->wordsper; i++)
+			p->states[i] = d->work[i];
+		p->hash = h;
+		p->flags = (ispost) ? POSTSTATE : 0;
+		if (noprogress)
+			p->flags |= NOPROGRESS;
+		/* lastseen to be dealt with by caller */
+	}
+
+	/*
+	 * Link new stateset to old, unless a LACON affected the result, in which
+	 * case we don't create the link.  That forces future transitions across
+	 * this same arc (same prior stateset and character color) to come through
+	 * miss() again, so that we can recheck the LACON(s), which might or might
+	 * not pass since context will be different.
+	 */
+	if (!sawlacons)
+	{
+		FDEBUG(("c%d[%d]->c%d\n",
+				(int) (css - d->ssets), co, (int) (p - d->ssets)));
+		css->outs[co] = p;
+		css->inchain[co] = p->ins;	/* push this arc on p's inarc chain */
+		p->ins.ss = css;
+		p->ins.co = co;
+	}
+	return p;
+}
+
+/*
+ * lacon - lookaround-constraint checker for miss()
+ */
+static int						/* predicate:  constraint satisfied? */
+lacon(struct vars *v,
+	  struct cnfa *pcnfa,		/* parent cnfa */
+	  chr *cp,
+	  color co)					/* "color" of the lookaround constraint */
+{
+	int			idx;			/* index of the constraint in v->g->lacons */
+	struct subre *la;
+	struct dfa *ladfa;
+	int			matched;		/* did the constraint's RE match at cp? */
+	int			satisfied;
+
+	/* lacon -> shortest/matchuntil -> miss -> lacon recurses; limit depth */
+	if (STACK_TOO_DEEP(v->re))
+	{
+		ERR(REG_ETOOBIG);
+		return 0;
+	}
+
+	idx = co - pcnfa->ncolors;
+	assert(idx > 0 && idx < v->g->nlacons && v->g->lacons != NULL);
+	FDEBUG(("=== testing lacon %d\n", idx));
+	la = &v->g->lacons[idx];
+	ladfa = getladfa(v, idx);
+	if (ladfa == NULL)
+		return 0;
+	if (!LATYPE_IS_AHEAD(la->subno))
+	{
+		/*
+		 * Lookbehind.  matchuntil() caches its DFA state in the per-lacon
+		 * slots v->lblastcss[]/v->lblastcp[], making repeated probes of an
+		 * N-chr string O(N) overall rather than O(N^2); it restarts only
+		 * when the probe point moves backwards.  The NFA here is a search
+		 * NFA, so scanning text ahead of the nominal match is harmless.
+		 */
+		matched = matchuntil(v, ladfa, cp,
+							 &v->lblastcss[idx], &v->lblastcp[idx]);
+	}
+	else
+	{
+		/* Lookahead.  shortest() is used as it could be much cheaper */
+		matched = (shortest(v, ladfa, cp, cp, v->stop,
+							(chr **) NULL, (int *) NULL) != NULL);
+	}
+
+	/* a negative constraint is satisfied exactly when nothing matched */
+	satisfied = LATYPE_IS_POS(la->subno) ? matched : !matched;
+	FDEBUG(("=== lacon %d satisfied %d\n", idx, satisfied));
+	return satisfied;
+}
+
+/*
+ * getvacant - get a vacant state set
+ *
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ */
+static struct sset *
+getvacant(struct vars *v,
+		  struct dfa *d,
+		  chr *cp,
+		  chr *start)
+{
+	int			i;
+	struct sset *ss;			/* the set being handed out */
+	struct sset *p;
+	struct arcp ap;				/* cursor along an inarc chain */
+	color		co;
+
+	ss = pickss(v, d, cp, start);
+	if (ss == NULL)
+		return NULL;
+	assert(!(ss->flags & LOCKED));
+
+	/* clear out its inarcs, including self-referential ones */
+	ap = ss->ins;
+	while ((p = ap.ss) != NULL)
+	{
+		co = ap.co;
+		FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
+		p->outs[co] = NULL;
+		ap = p->inchain[co];	/* fetch next link before wiping this one */
+		p->inchain[co].ss = NULL;	/* paranoia */
+	}
+	ss->ins.ss = NULL;
+
+	/* take it off the inarc chains of the ssets reached by its outarcs */
+	for (i = 0; i < d->ncolors; i++)
+	{
+		p = ss->outs[i];
+		assert(p != ss);		/* not self-referential */
+		if (p == NULL)
+			continue;			/* NOTE CONTINUE */
+		FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
+		if (p->ins.ss == ss && p->ins.co == i)
+			p->ins = ss->inchain[i];	/* our arc heads the chain: pop it */
+		else
+		{
+			struct arcp lastap = {NULL, 0};
+
+			assert(p->ins.ss != NULL);
+			for (ap = p->ins; ap.ss != NULL &&
+				 !(ap.ss == ss && ap.co == i);
+				 ap = ap.ss->inchain[ap.co])
+				lastap = ap;	/* remember the predecessor link */
+			assert(ap.ss != NULL);
+			lastap.ss->inchain[lastap.co] = ss->inchain[i];
+		}
+		ss->outs[i] = NULL;
+		ss->inchain[i].ss = NULL;
+	}
+
+	/* if ss was a success state, may need to remember location */
+	if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
+		(d->lastpost == NULL || d->lastpost < ss->lastseen))
+		d->lastpost = ss->lastseen;
+
+	/* likewise for a no-progress state */
+	if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
+		(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
+		d->lastnopr = ss->lastseen;
+
+	return ss;
+}
+
+/*
+ * pickss - pick the next stateset to be used (NULL + REG_ASSERT if none)
+ */
+static struct sset *
+pickss(struct vars *v,
+	   struct dfa *d,
+	   chr *cp,
+	   chr *start)
+{
+	int			i;
+	struct sset *ss;
+	struct sset *end;
+	chr		   *ancient;		/* sets last seen before this are replaceable */
+
+	/* shortcut for cases where cache isn't full */
+	if (d->nssused < d->nssets)
+	{
+		i = d->nssused;
+		d->nssused++;
+		ss = &d->ssets[i];
+		FDEBUG(("new c%d\n", i));
+		/* set up innards */
+		ss->states = &d->statesarea[i * d->wordsper];
+		ss->flags = 0;
+		ss->ins.ss = NULL;
+		ss->ins.co = WHITE;		/* give it some value */
+		ss->outs = &d->outsarea[i * d->ncolors];
+		ss->inchain = &d->incarea[i * d->ncolors];
+		for (i = 0; i < d->ncolors; i++)
+		{
+			ss->outs[i] = NULL;
+			ss->inchain[i].ss = NULL;
+		}
+		return ss;
+	}
+
+	/* look for oldest, or old enough anyway */
+	if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
+		ancient = cp - d->nssets * 2 / 3;
+	else
+		ancient = start;
+	for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
+		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+			!(ss->flags & LOCKED))
+		{
+			d->search = ss + 1;	/* next search resumes just past this one */
+			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
+			return ss;
+		}
+	for (ss = d->ssets, end = d->search; ss < end; ss++)	/* wrap around */
+		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+			!(ss->flags & LOCKED))
+		{
+			d->search = ss + 1;
+			FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
+			return ss;
+		}
+
+	/* nobody's old enough?!? -- something's really wrong */
+	FDEBUG(("cannot find victim to replace!\n"));
+	ERR(REG_ASSERT);
+	return NULL;
+}
diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c
new file mode 100644
index 0000000..4a27c25
--- /dev/null
+++ b/src/backend/regex/regerror.c
@@ -0,0 +1,120 @@
+/*
+ * regerror - error-code expansion
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regerror.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+/* printf-style message for codes not found in rerrs[]; shows the code in hex */
+static const char unk[] = "*** unknown regex error code 0x%x ***";
+
+/* struct to map among codes, code names, and explanations */
+static const struct rerr
+{
+	int			code;			/* error number; negative ends the table */
+	const char *name;			/* symbolic name, matched for REG_ATOI */
+	const char *explain;		/* message returned for a normal lookup */
+}			rerrs[] =
+
+{
+	/* the actual table is built from regex.h */
+#include "regex/regerrs.h"		/* pgrminclude ignore */
+	{
+		-1, "", "oops"
+	},							/* explanation special-cased in code */
+};
+
+/*
+ * pg_regerror - the interface to error numbers
+ *
+ * Copies as much of the message as fits into errbuf (NUL-terminated
+ * whenever errbuf_size > 0) and returns the full space it would need.
+ */
+/* ARGSUSED */
+size_t							/* actual space needed (including NUL) */
+pg_regerror(int errcode,		/* error code, or REG_ATOI or REG_ITOA */
+			const regex_t *preg,	/* associated regex_t (unused at present) */
+			char *errbuf,		/* result buffer (unless errbuf_size==0) */
+			size_t errbuf_size) /* available space in errbuf, can be 0 */
+{
+	const struct rerr *ent = rerrs;
+	const char *text;
+	char		numbuf[sizeof(unk) + 50];	/* 50 = plenty for int */
+	size_t		needed;
+
+	if (errcode == REG_ATOI)
+	{
+		/* errbuf holds a code name on entry; answer with its number */
+		while (ent->code >= 0 && strcmp(ent->name, errbuf) != 0)
+			ent++;
+		sprintf(numbuf, "%d", ent->code);	/* -1 for unknown */
+		text = numbuf;
+	}
+	else if (errcode == REG_ITOA)
+	{
+		/* errbuf holds a decimal code on entry; answer with its name */
+		int			wanted = atoi(errbuf);	/* not our problem if this fails */
+
+		while (ent->code >= 0 && ent->code != wanted)
+			ent++;
+		if (ent->code < 0)
+		{						/* unknown; tell him the number */
+			sprintf(numbuf, "REG_%u", (unsigned) wanted);
+			text = numbuf;
+		}
+		else
+			text = ent->name;
+	}
+	else
+	{
+		/* a real, normal error code: look up its explanation */
+		while (ent->code >= 0 && ent->code != errcode)
+			ent++;
+		if (ent->code < 0)
+		{						/* unknown; say so */
+			sprintf(numbuf, unk, errcode);
+			text = numbuf;
+		}
+		else
+			text = ent->explain;
+	}
+
+	needed = strlen(text) + 1;	/* space needed, including NUL */
+	if (errbuf_size > needed)
+		strcpy(errbuf, text);	/* whole message fits */
+	else if (errbuf_size > 0)
+	{							/* truncate to fit */
+		memcpy(errbuf, text, errbuf_size - 1);
+		errbuf[errbuf_size - 1] = '\0';
+	}
+
+	return needed;
+}
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
new file mode 100644
index 0000000..f7eaa76
--- /dev/null
+++ b/src/backend/regex/regexec.c
@@ -0,0 +1,1425 @@
+/*
+ * re_*exec and friends - match REs
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regexec.c
+ *
+ */
+
+#include "regex/regguts.h"
+
+
+
+/* lazy-DFA representation */
+struct arcp
+{								/* "pointer" to an outarc */
+	struct sset *ss;			/* state set the outarc leaves from -- presumably */
+	color		co;				/* color indexing that set's outs[] -- presumably */
+};
+
+struct sset
+{								/* state set */
+	unsigned   *states;			/* pointer to bitvector */
+	unsigned	hash;			/* hash of bitvector */
+#define  HASH(bv, nw)	 (((nw) == 1) ? *(bv) : hash(bv, nw))
+#define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
+		memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
+	int			flags;			/* bit flags; see the #defines just below */
+#define  STARTER	 01			/* the initial state set */
+#define  POSTSTATE	 02			/* includes the goal state */
+#define  LOCKED		 04			/* locked in cache */
+#define  NOPROGRESS  010		/* zero-progress state set */
+	struct arcp ins;			/* chain of inarcs pointing here */
+	chr		   *lastseen;		/* last entered on arrival here */
+	struct sset **outs;			/* outarc vector indexed by color */
+	struct arcp *inchain;		/* chain-pointer vector for outarcs */
+};
+
+struct dfa
+{								/* one lazily-built DFA and its cache */
+	int			nssets;			/* size of cache */
+	int			nssused;		/* how many entries occupied yet */
+	int			nstates;		/* number of states */
+	int			ncolors;		/* length of outarc and inchain vectors */
+	int			wordsper;		/* length of state-set bitvectors */
+	struct sset *ssets;			/* state-set cache */
+	unsigned   *statesarea;		/* bitvector storage */
+	unsigned   *work;			/* pointer to work area within statesarea */
+	struct sset **outsarea;		/* outarc-vector storage */
+	struct arcp *incarea;		/* inchain storage */
+	struct cnfa *cnfa;			/* NFA being run -- presumably set by newdfa() */
+	struct colormap *cm;		/* colormap for it -- presumably set by newdfa() */
+	chr		   *lastpost;		/* location of last cache-flushed success */
+	chr		   *lastnopr;		/* location of last cache-flushed NOPROGRESS */
+	struct sset *search;		/* replacement-search-pointer memory */
+	int			cptsmalloced;	/* were the areas individually malloced? */
+	char	   *mallocarea;		/* self, or master malloced area, or NULL */
+};
+
+#define WORK	1				/* number of work bitvectors needed */
+
+/* setup for non-malloc allocation for small cases */
+#define FEWSTATES	20			/* must be less than UBITS */
+#define FEWCOLORS	15			/* color count sizing the arrays below */
+struct smalldfa
+{								/* a struct dfa plus fixed-size storage areas */
+	struct dfa	dfa;
+	struct sset ssets[FEWSTATES * 2];	/* presumably backs dfa.ssets */
+	unsigned	statesarea[FEWSTATES * 2 + WORK];	/* ditto dfa.statesarea */
+	struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];	/* ditto dfa.outsarea */
+	struct arcp incarea[FEWSTATES * 2 * FEWCOLORS]; /* ditto dfa.incarea */
+};
+
+#define DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */
+
+
+
+/* internal variables, bundled for easy passing around */
+struct vars
+{
+	regex_t    *re;				/* the compiled regex being executed */
+	struct guts *g;				/* its internals, taken from re->re_guts */
+	int			eflags;			/* copies of arguments */
+	size_t		nmatch;
+	regmatch_t *pmatch;
+	rm_detail_t *details;
+	chr		   *start;			/* start of string */
+	chr		   *search_start;	/* search start of string */
+	chr		   *stop;			/* just past end of string */
+	int			err;			/* error code if any (0 none) */
+	struct dfa **subdfas;		/* per-tree-subre DFAs */
+	struct dfa **ladfas;		/* per-lacon-subre DFAs */
+	struct sset **lblastcss;	/* per-lacon-subre lookbehind restart data */
+	chr		  **lblastcp;		/* per-lacon-subre lookbehind restart data */
+	struct smalldfa dfa1;		/* in-place storage handed to newdfa() */
+	struct smalldfa dfa2;		/* ditto; cfind() holds two DFAs at once */
+};
+
+#define VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
+#define ISERR() VISERR(v)		/* same, for the local variable "v" */
+#define VERR(vv,e)	((vv)->err = ((vv)->err ? (vv)->err : (e)))	/* first error wins */
+#define ERR(e)	VERR(v, e)		/* record an error */
+#define NOERR() {if (ISERR()) return v->err;}	/* if error seen, return it */
+#define OFF(p)	((p) - v->start)	/* offset of p from start of string */
+#define LOFF(p) ((long)OFF(p))	/* same, as a long (used with %ld) */
+
+
+
+/*
+ * forward declarations (everything here is static, i.e. file-local)
+ */
+/* === regexec.c === */
+static struct dfa *getsubdfa(struct vars *, struct subre *);
+static struct dfa *getladfa(struct vars *, int);
+static int	find(struct vars *, struct cnfa *, struct colormap *);
+static int	cfind(struct vars *, struct cnfa *, struct colormap *);
+static int	cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
+static void zapallsubs(regmatch_t *, size_t);
+static void zaptreesubs(struct vars *, struct subre *);
+static void subset(struct vars *, struct subre *, chr *, chr *);
+static int	cdissect(struct vars *, struct subre *, chr *, chr *);
+static int	ccondissect(struct vars *, struct subre *, chr *, chr *);
+static int	crevcondissect(struct vars *, struct subre *, chr *, chr *);
+static int	cbrdissect(struct vars *, struct subre *, chr *, chr *);
+static int	caltdissect(struct vars *, struct subre *, chr *, chr *);
+static int	citerdissect(struct vars *, struct subre *, chr *, chr *);
+static int	creviterdissect(struct vars *, struct subre *, chr *, chr *);
+
+/* === rege_dfa.c === */
+static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
+static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static int	matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
+static chr *lastcold(struct vars *, struct dfa *);
+static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
+static void freedfa(struct dfa *);
+static unsigned hash(unsigned *, int);
+static struct sset *initialize(struct vars *, struct dfa *, chr *);
+static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *);
+static int	lacon(struct vars *, struct cnfa *, chr *, color);
+static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
+static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
+
+
+/*
+ * pg_regexec - match regular expression
+ *
+ * Searches string[0 .. len) for a match, starting the search at offset
+ * search_start.  Returns REG_OKAY, REG_NOMATCH, or an error code.  On
+ * REG_OKAY, up to nmatch pmatch[] entries are set (none under REG_NOSUB).
+ */
+int
+pg_regexec(regex_t *re,
+		   const chr *string,
+		   size_t len,
+		   size_t search_start,
+		   rm_detail_t *details,
+		   size_t nmatch,
+		   regmatch_t pmatch[],
+		   int flags)
+{
+	struct vars var;
+	register struct vars *v = &var;
+	int			st;
+	size_t		n;
+	size_t		i;
+	int			backref;
+
+#define  LOCALMAT	 20
+	regmatch_t	mat[LOCALMAT];
+#define  LOCALDFAS	 40
+	struct dfa *subdfas[LOCALDFAS];
+
+	/* sanity checks */
+	if (re == NULL || string == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+	if (search_start > len)		/* can't match; avoid out-of-range pointers */
+		return REG_NOMATCH;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
+
+	/* setup */
+	v->re = re;
+	v->g = (struct guts *) re->re_guts;
+	if ((v->g->cflags & REG_EXPECT) && details == NULL)
+		return REG_INVARG;
+	if (v->g->info & REG_UIMPOSSIBLE)
+		return REG_NOMATCH;
+	backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
+	v->eflags = flags;
+	if (v->g->cflags & REG_NOSUB)
+		nmatch = 0;				/* override client */
+	v->nmatch = nmatch;
+	if (backref)
+	{
+		/* need work area */
+		if (v->g->nsub + 1 <= LOCALMAT)
+			v->pmatch = mat;
+		else
+			v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
+		if (v->pmatch == NULL)
+			return REG_ESPACE;
+		v->nmatch = v->g->nsub + 1;
+	}
+	else
+		v->pmatch = pmatch;
+	v->details = details;
+	v->start = (chr *) string;
+	v->search_start = (chr *) string + search_start;
+	v->stop = (chr *) string + len;
+	v->err = 0;
+	v->subdfas = NULL;
+	v->ladfas = NULL;
+	v->lblastcss = NULL;
+	v->lblastcp = NULL;
+	/* below this point, "goto cleanup" will behave sanely */
+
+	assert(v->g->ntree >= 0);
+	n = (size_t) v->g->ntree;
+	if (n <= LOCALDFAS)
+		v->subdfas = subdfas;
+	else
+	{
+		v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+		if (v->subdfas == NULL)
+		{
+			st = REG_ESPACE;
+			goto cleanup;
+		}
+	}
+	for (i = 0; i < n; i++)
+		v->subdfas[i] = NULL;
+
+	assert(v->g->nlacons >= 0);
+	n = (size_t) v->g->nlacons;
+	if (n > 0)
+	{
+		v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+		if (v->ladfas == NULL)
+		{
+			st = REG_ESPACE;
+			goto cleanup;
+		}
+		for (i = 0; i < n; i++)
+			v->ladfas[i] = NULL;
+		v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+		v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+		if (v->lblastcss == NULL || v->lblastcp == NULL)
+		{
+			st = REG_ESPACE;
+			goto cleanup;
+		}
+		for (i = 0; i < n; i++)
+		{
+			v->lblastcss[i] = NULL;
+			v->lblastcp[i] = NULL;
+		}
+	}
+
+	/* do it */
+	assert(v->g->tree != NULL);
+	if (backref)
+		st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
+	else
+		st = find(v, &v->g->tree->cnfa, &v->g->cmap);
+
+	/* copy (portion of) match vector over if necessary */
+	if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+	{
+		zapallsubs(pmatch, nmatch);
+		n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
+		memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+	}
+
+	/* clean up */
+cleanup:
+	if (v->pmatch != pmatch && v->pmatch != mat)
+		FREE(v->pmatch);
+	if (v->subdfas != NULL)
+	{
+		n = (size_t) v->g->ntree;
+		for (i = 0; i < n; i++)
+			if (v->subdfas[i] != NULL)
+				freedfa(v->subdfas[i]);
+		if (v->subdfas != subdfas)
+			FREE(v->subdfas);
+	}
+	if (v->ladfas != NULL)
+	{
+		n = (size_t) v->g->nlacons;
+		for (i = 0; i < n; i++)
+			if (v->ladfas[i] != NULL)
+				freedfa(v->ladfas[i]);
+		FREE(v->ladfas);
+	}
+	if (v->lblastcss != NULL)
+		FREE(v->lblastcss);
+	if (v->lblastcp != NULL)
+		FREE(v->lblastcp);
+
+	return st;
+}
+
+/*
+ * getsubdfa - return the DFA for a tree subre node, building it on first use
+ *
+ * Each node's DFA is created at most once per pg_regexec() call and cached in
+ * v->subdfas[] under the node's id; pg_regexec()'s cleanup step frees it.
+ */
+static struct dfa *
+getsubdfa(struct vars *v,
+		  struct subre *t)
+{
+	struct dfa **slot = &v->subdfas[t->id];
+
+	if (*slot != NULL)
+		return *slot;			/* already built earlier in this execution */
+	/* first request for this node: build its DFA, reporting failure as NULL */
+	*slot = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
+	return ISERR() ? NULL : *slot;
+}
+
+/*
+ * getladfa - return the DFA for LACON subre node n, building it on first
+ * use
+ *
+ * The DFA is cached in v->ladfas[n]; pg_regexec()'s cleanup step frees it.
+ */
+static struct dfa *
+getladfa(struct vars *v,
+		 int n)
+{
+	struct dfa **slot;
+
+	assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+
+	slot = &v->ladfas[n];
+	if (*slot != NULL)
+		return *slot;			/* already built earlier in this execution */
+	/* first request: build from the LACON's own NFA, failure yields NULL */
+	*slot = newdfa(v, &v->g->lacons[n].cnfa, &v->g->cmap, DOMALLOC);
+	return ISERR() ? NULL : *slot;
+}
+
+/*
+ * find - find a match for the main NFA (no-complications case)
+ */
+static int
+find(struct vars *v,
+	 struct cnfa *cnfa,
+	 struct colormap *cm)
+{
+	struct dfa *s;				/* DFA for the search RE (v->g->search) */
+	struct dfa *d;				/* DFA for the main RE (cnfa) */
+	chr		   *begin;			/* candidate match start */
+	chr		   *end = NULL;		/* match end found for "begin", if any */
+	chr		   *cold;			/* coldstart point, reported via rm_extend */
+	chr		   *open;			/* open and close of range of possible starts */
+	chr		   *close;
+	int			hitend;			/* out-parameter of shortest()/longest() */
+	int			shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
+
+	/* first, a shot with the search RE */
+	s = newdfa(v, &v->g->search, cm, &v->dfa1);
+	assert(!(ISERR() && s != NULL));
+	NOERR();
+	MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
+	cold = NULL;
+	close = shortest(v, s, v->search_start, v->search_start, v->stop,
+					 &cold, (int *) NULL);
+	freedfa(s);
+	NOERR();
+	if (v->g->cflags & REG_EXPECT)
+	{
+		assert(v->details != NULL);
+		if (cold != NULL)
+			v->details->rm_extend.rm_so = OFF(cold);
+		else
+			v->details->rm_extend.rm_so = OFF(v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+	}
+	if (close == NULL)			/* not found */
+		return REG_NOMATCH;
+	if (v->nmatch == 0)			/* found, don't need exact location */
+		return REG_OKAY;
+
+	/* find starting point and match */
+	assert(cold != NULL);
+	open = cold;
+	cold = NULL;
+	MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
+	d = newdfa(v, cnfa, cm, &v->dfa1);	/* dfa1 is free again: s was freed */
+	assert(!(ISERR() && d != NULL));
+	NOERR();
+	for (begin = open; begin <= close; begin++)
+	{
+		MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
+		if (shorter)
+			end = shortest(v, d, begin, begin, v->stop,
+						   (chr **) NULL, &hitend);
+		else
+			end = longest(v, d, begin, v->stop, &hitend);
+		if (ISERR())
+		{
+			freedfa(d);
+			return v->err;
+		}
+		if (hitend && cold == NULL)
+			cold = begin;
+		if (end != NULL)
+			break;				/* NOTE BREAK OUT */
+	}
+	assert(end != NULL);		/* search RE succeeded so loop should */
+	freedfa(d);
+
+	/* and pin down details */
+	assert(v->nmatch > 0);
+	v->pmatch[0].rm_so = OFF(begin);
+	v->pmatch[0].rm_eo = OFF(end);
+	if (v->g->cflags & REG_EXPECT)
+	{
+		if (cold != NULL)
+			v->details->rm_extend.rm_so = OFF(cold);
+		else
+			v->details->rm_extend.rm_so = OFF(v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+	}
+	if (v->nmatch == 1)			/* no need for submatches */
+		return REG_OKAY;
+
+	/* find submatches */
+	zapallsubs(v->pmatch, v->nmatch);
+	return cdissect(v, v->g->tree, begin, end);
+}
+
+/*
+ * cfind - find a match for the main NFA (with complications)
+ *
+ * Builds both DFAs, runs cfindloop(), then fills in details if REG_EXPECT.
+ */
+static int
+cfind(struct vars *v,
+	  struct cnfa *cnfa,
+	  struct colormap *cm)
+{
+	struct dfa *searchdfa;
+	struct dfa *maindfa;
+	chr		   *coldstart;
+	int			result;
+
+	searchdfa = newdfa(v, &v->g->search, cm, &v->dfa1);
+	if (ISERR())
+		return v->err;
+	maindfa = newdfa(v, cnfa, cm, &v->dfa2);
+	if (ISERR())
+	{
+		assert(maindfa == NULL);
+		freedfa(searchdfa);
+		return v->err;
+	}
+
+	result = cfindloop(v, cnfa, cm, maindfa, searchdfa, &coldstart);
+	freedfa(maindfa);
+	freedfa(searchdfa);
+	if (ISERR())
+		return v->err;
+	if (v->g->cflags & REG_EXPECT)
+	{
+		assert(v->details != NULL);
+		v->details->rm_extend.rm_so = OFF(coldstart != NULL ? coldstart : v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
+	}
+	return result;
+}
+
+/*
+ * cfindloop - the heart of cfind
+ */
+static int
+cfindloop(struct vars *v,
+		  struct cnfa *cnfa,
+		  struct colormap *cm,
+		  struct dfa *d,
+		  struct dfa *s,
+		  chr **coldp)			/* where to put coldstart pointer */
+{
+	chr		   *begin;			/* candidate match start */
+	chr		   *end;			/* tentative match end for "begin" */
+	chr		   *cold;			/* coldstart pointer, handed back via *coldp */
+	chr		   *open;			/* open and close of range of possible starts */
+	chr		   *close;
+	chr		   *estart;			/* bounds on where the match end may fall */
+	chr		   *estop;
+	int			er;				/* result of cdissect() */
+	int			shorter = v->g->tree->flags & SHORTER;
+	int			hitend;			/* out-parameter of shortest()/longest() */
+
+	assert(d != NULL && s != NULL);
+	cold = NULL;
+	close = v->search_start;
+	do
+	{
+		/* Search with the search RE for match range at/beyond "close" */
+		MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
+		close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
+		if (ISERR())
+		{
+			*coldp = cold;
+			return v->err;
+		}
+		if (close == NULL)
+			break;				/* no more possible match anywhere */
+		assert(cold != NULL);
+		open = cold;
+		cold = NULL;
+		/* Search for matches starting between "open" and "close" inclusive */
+		MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
+		for (begin = open; begin <= close; begin++)
+		{
+			MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
+			estart = begin;
+			estop = v->stop;
+			for (;;)
+			{
+				/* Here we use the top node's detailed RE */
+				if (shorter)
+					end = shortest(v, d, begin, estart,
+								   estop, (chr **) NULL, &hitend);
+				else
+					end = longest(v, d, begin, estop,
+								  &hitend);
+				if (ISERR())
+				{
+					*coldp = cold;
+					return v->err;
+				}
+				if (hitend && cold == NULL)
+					cold = begin;
+				if (end == NULL)
+					break;		/* no match with this begin point, try next */
+				MDEBUG(("tentative end %ld\n", LOFF(end)));
+				/* Dissect the potential match to see if it really matches */
+				zapallsubs(v->pmatch, v->nmatch);
+				er = cdissect(v, v->g->tree, begin, end);
+				if (er == REG_OKAY)
+				{
+					if (v->nmatch > 0)
+					{
+						v->pmatch[0].rm_so = OFF(begin);
+						v->pmatch[0].rm_eo = OFF(end);
+					}
+					*coldp = cold;
+					return REG_OKAY;
+				}
+				if (er != REG_NOMATCH)
+				{
+					ERR(er);
+					*coldp = cold;
+					return er;
+				}
+				/* Try next longer/shorter match with same begin point */
+				if (shorter)
+				{
+					if (end == estop)
+						break;	/* no more, so try next begin point */
+					estart = end + 1;
+				}
+				else
+				{
+					if (end == begin)
+						break;	/* no more, so try next begin point */
+					estop = end - 1;
+				}
+			}					/* end loop over endpoint positions */
+		}						/* end loop over beginning positions */
+
+		/*
+		 * If we get here, there is no possible match starting at or before
+		 * "close", so consider matches beyond that.  We'll do a fresh search
+		 * with the search RE to find a new promising match range.
+		 */
+		close++;
+	} while (close < v->stop);
+
+	*coldp = cold;
+	return REG_NOMATCH;
+}
+
+/*
+ * zapallsubs - mark subexpression matches p[1 .. n-1] as "no match"
+ *
+ * p[0] (the overall match) is left alone.  Safe for n == 0 or 1: counting up
+ * avoids the size_t wraparound that "n - 1" would suffer when n is zero.
+ */
+static void
+zapallsubs(regmatch_t *p,
+		   size_t n)
+{
+	size_t		i;
+
+	for (i = 1; i < n; i++)
+		p[i].rm_so = p[i].rm_eo = -1;
+}
+
+/*
+ * zaptreesubs - reset capture data for every capturing node in a subtree
+ *
+ * Walks t and both of its children; each '(' node whose subno falls inside
+ * v->pmatch[] gets its match offsets set to -1 ("no match").
+ */
+static void
+zaptreesubs(struct vars *v,
+			struct subre *t)
+{
+	if (t->op == '(')
+	{
+		size_t		subno = (size_t) t->subno;
+
+		assert(t->subno > 0);
+		if (subno < v->nmatch)
+			v->pmatch[subno].rm_so = v->pmatch[subno].rm_eo = -1;
+	}
+	/* either child may be absent, so test before descending */
+	if (t->left != NULL)
+		zaptreesubs(v, t->left);
+	if (t->right != NULL)
+		zaptreesubs(v, t->right);
+}
+
+/*
+ * subset - record [begin, end) for capture sub, if v->pmatch[] has room
+ */
+static void
+subset(struct vars *v,
+	   struct subre *sub,
+	   chr *begin,
+	   chr *end)
+{
+	int			subno = sub->subno;
+
+	assert(subno > 0);
+	if ((size_t) subno < v->nmatch)
+	{
+		MDEBUG(("setting %d\n", subno));
+		v->pmatch[subno].rm_so = OFF(begin);
+		v->pmatch[subno].rm_eo = OFF(end);
+	}
+}
+
+/*
+ * cdissect - check backrefs and determine subexpression matches
+ *
+ * Recursively walks a subre tree over the proposed match, which runs from
+ * "begin" up to but not including "end", checking that backrefs match and
+ * recording submatch boundaries for capture nodes --- in effect "dissecting"
+ * the match to see where the submatches are.
+ *
+ * Preconditions: the caller has already run this node's DFA and found that
+ * the substring satisfies it.  (Concatenation and iteration nodes check all
+ * the candidate substrings against child DFAs before recursing, which is much
+ * faster than leaving it to us.)  The caller must also have cleared the
+ * subexpression match data with zaptreesubs, or zapallsubs at the top level.
+ *
+ * Returns a regexec return code (REG_OKAY, REG_NOMATCH, or an error).
+ */
+static int						/* regexec return code */
+cdissect(struct vars *v,
+		 struct subre *t,
+		 chr *begin,			/* beginning of relevant substring */
+		 chr *end)				/* end of same */
+{
+	int			rc;
+
+	assert(t != NULL);
+	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
+
+	/* every recursion level polls for operation cancel ... */
+	if (CANCEL_REQUESTED(v->re))
+		return REG_CANCEL;
+	/* ... and for stack overrun */
+	if (STACK_TOO_DEEP(v->re))
+		return REG_ETOOBIG;
+
+	/* dispatch on node type */
+	switch (t->op)
+	{
+		case '=':				/* terminal node */
+			assert(t->left == NULL && t->right == NULL);
+			rc = REG_OKAY;		/* no action, parent did the work */
+			break;
+		case 'b':				/* back reference */
+			assert(t->left == NULL && t->right == NULL);
+			rc = cbrdissect(v, t, begin, end);
+			break;
+		case '.':				/* concatenation */
+			assert(t->left != NULL && t->right != NULL);
+			rc = (t->left->flags & SHORTER) ?
+				crevcondissect(v, t, begin, end) :	/* reverse scan */
+				ccondissect(v, t, begin, end);
+			break;
+		case '|':				/* alternation */
+			assert(t->left != NULL);
+			rc = caltdissect(v, t, begin, end);
+			break;
+		case '*':				/* iteration */
+			assert(t->left != NULL);
+			rc = (t->left->flags & SHORTER) ?
+				creviterdissect(v, t, begin, end) : /* reverse scan */
+				citerdissect(v, t, begin, end);
+			break;
+		case '(':				/* capturing */
+			assert(t->left != NULL && t->right == NULL);
+			assert(t->subno > 0);
+			rc = cdissect(v, t->left, begin, end);
+			if (rc == REG_OKAY)
+				subset(v, t, begin, end);	/* record only on success */
+			break;
+		default:
+			rc = REG_ASSERT;
+			break;
+	}
+
+	/*
+	 * A match failure is legitimate only when backrefs lurk below this node.
+	 * Anything else means the caller skipped the DFA check, or the DFA and
+	 * the node's innards disagree.
+	 */
+	assert(rc != REG_NOMATCH || (t->flags & BACKR));
+
+	return rc;
+}
+
+/*
+ * ccondissect - dissect match for concatenation node, longest-first
+ */
+static int						/* regexec return code */
+ccondissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;				/* DFA for the left child */
+	struct dfa *d2;				/* DFA for the right child */
+	chr		   *mid;			/* tentative boundary between left and right */
+	int			er;				/* result of recursive cdissect() calls */
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+	assert(!(t->left->flags & SHORTER));
+
+	d = getsubdfa(v, t->left);
+	NOERR();
+	d2 = getsubdfa(v, t->right);
+	NOERR();
+	MDEBUG(("cconcat %d\n", t->id));
+
+	/* pick a tentative midpoint */
+	mid = longest(v, d, begin, end, (int *) NULL);
+	NOERR();
+	if (mid == NULL)
+		return REG_NOMATCH;
+	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+
+	/* iterate until satisfaction or failure */
+	for (;;)
+	{
+		/* try this midpoint on for size */
+		if (longest(v, d2, mid, end, (int *) NULL) == end)
+		{
+			er = cdissect(v, t->left, begin, mid);
+			if (er == REG_OKAY)
+			{
+				er = cdissect(v, t->right, mid, end);
+				if (er == REG_OKAY)
+				{
+					/* satisfaction */
+					MDEBUG(("successful\n"));
+					return REG_OKAY;
+				}
+			}
+			if (er != REG_NOMATCH)
+				return er;
+		}
+		NOERR();
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == begin)
+		{
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->id));
+			return REG_NOMATCH;
+		}
+		mid = longest(v, d, begin, mid - 1, (int *) NULL);
+		NOERR();
+		if (mid == NULL)
+		{
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->id));
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
+		zaptreesubs(v, t->left);
+		zaptreesubs(v, t->right);
+	}
+
+	/* can't get here */
+	return REG_ASSERT;
+}
+
+/*
+ * crevcondissect - dissect match for concatenation node, shortest-first
+ */
+static int						/* regexec return code */
+crevcondissect(struct vars *v,
+			   struct subre *t,
+			   chr *begin,		/* beginning of relevant substring */
+			   chr *end)		/* end of same */
+{
+	struct dfa *d;				/* DFA for the left child */
+	struct dfa *d2;				/* DFA for the right child */
+	chr		   *mid;			/* tentative boundary between left and right */
+	int			er;				/* result of recursive cdissect() calls */
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+	assert(t->left->flags & SHORTER);
+
+	d = getsubdfa(v, t->left);
+	NOERR();
+	d2 = getsubdfa(v, t->right);
+	NOERR();
+	MDEBUG(("crevcon %d\n", t->id));
+
+	/* pick a tentative midpoint */
+	mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
+	NOERR();
+	if (mid == NULL)
+		return REG_NOMATCH;
+	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+
+	/* iterate until satisfaction or failure */
+	for (;;)
+	{
+		/* try this midpoint on for size */
+		if (longest(v, d2, mid, end, (int *) NULL) == end)
+		{
+			er = cdissect(v, t->left, begin, mid);
+			if (er == REG_OKAY)
+			{
+				er = cdissect(v, t->right, mid, end);
+				if (er == REG_OKAY)
+				{
+					/* satisfaction */
+					MDEBUG(("successful\n"));
+					return REG_OKAY;
+				}
+			}
+			if (er != REG_NOMATCH)
+				return er;
+		}
+		NOERR();
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == end)
+		{
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->id));
+			return REG_NOMATCH;
+		}
+		mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
+		NOERR();
+		if (mid == NULL)
+		{
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->id));
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
+		zaptreesubs(v, t->left);
+		zaptreesubs(v, t->right);
+	}
+
+	/* can't get here */
+	return REG_ASSERT;
+}
+
+/*
+ * cbrdissect - dissect match for backref node
+ *
+ * Succeeds iff [begin, end) consists of an allowed number (t->min..t->max)
+ * of copies of the string already captured by subexpression t->subno.
+ */
+static int						/* regexec return code */
+cbrdissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
+{
+	int			n;
+	size_t		numreps;
+	size_t		tlen;
+	size_t		brlen;
+	chr		   *brstring;
+	chr		   *p;
+	int			min;
+	int			max;
+
+	/* check t before reading through it, else the assert proves nothing */
+	assert(t != NULL);
+	assert(t->op == 'b');
+	n = t->subno;
+	min = t->min;
+	max = t->max;
+	assert(n >= 0);
+	assert((size_t) n < v->nmatch);
+	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
+
+	/* get the backreferenced string */
+	if (v->pmatch[n].rm_so == -1)
+		return REG_NOMATCH;
+	brstring = v->start + v->pmatch[n].rm_so;
+	brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+
+	/* special cases for zero-length strings */
+	if (brlen == 0)
+	{
+		/* matches only a zero-length target, with any repetition count */
+		if (begin == end && min <= max)
+		{
+			MDEBUG(("cbackref matched trivially\n"));
+			return REG_OKAY;
+		}
+		return REG_NOMATCH;
+	}
+	if (begin == end)
+	{
+		/* matches only if zero repetitions are okay */
+		if (min == 0)
+		{
+			MDEBUG(("cbackref matched trivially\n"));
+			return REG_OKAY;
+		}
+		return REG_NOMATCH;
+	}
+
+	/* target length must be an allowed whole multiple of brlen */
+	assert(end > begin);
+	tlen = end - begin;
+	if (tlen % brlen != 0)
+		return REG_NOMATCH;
+	numreps = tlen / brlen;
+	if (numreps < min || (numreps > max && max != DUPINF))
+		return REG_NOMATCH;
+
+	/* okay, compare the actual string contents */
+	p = begin;
+	while (numreps-- > 0)
+	{
+		if ((*v->g->compare) (brstring, p, brlen) != 0)
+			return REG_NOMATCH;
+		p += brlen;
+	}
+
+	MDEBUG(("cbackref matched\n"));
+	return REG_OKAY;
+}
+
+/*
+ * caltdissect - dissect match for alternation node
+ *
+ * Walks the chain of '|' nodes linked through ->right; the first branch that
+ * spans [begin, end) and dissects to anything but REG_NOMATCH decides.
+ */
+static int						/* regexec return code */
+caltdissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *altdfa;
+	int			rc;
+
+	/* iterate, rather than tail-recurse, along the chain of alternatives */
+	for (; t != NULL; t = t->right)
+	{
+		assert(t->op == '|');
+		assert(t->left != NULL && t->left->cnfa.nstates > 0);
+		MDEBUG(("calt n%d\n", t->id));
+
+		altdfa = getsubdfa(v, t->left);
+		NOERR();
+		if (longest(v, altdfa, begin, end, (int *) NULL) == end)
+		{
+			MDEBUG(("calt matched\n"));
+			rc = cdissect(v, t->left, begin, end);
+			if (rc != REG_NOMATCH)
+				return rc;
+		}
+		NOERR();
+	}
+
+	return REG_NOMATCH;
+}
+
+/*
+ * citerdissect - dissect match for iteration node, longest-first
+ */
+static int						/* regexec return code */
+citerdissect(struct vars *v,
+			 struct subre *t,
+			 chr *begin,		/* beginning of relevant substring */
+			 chr *end)			/* end of same */
+{
+	struct dfa *d;				/* DFA for the iterated child (t->left) */
+	chr		  **endpts;			/* endpts[k] = end of k'th sub-match */
+	chr		   *limit;			/* furthest end allowed for next longest() */
+	int			min_matches;	/* t->min, but at least 1 (see below) */
+	size_t		max_matches;	/* cap on sub-matches; sizes endpts[] */
+	int			nverified;		/* sub-matches 1..nverified passed cdissect */
+	int			k;				/* sub-match currently being placed */
+	int			i;
+	int			er;
+
+	assert(t->op == '*');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(!(t->left->flags & SHORTER));
+	assert(begin <= end);
+
+	/*
+	 * For the moment, assume the minimum number of matches is 1.  If zero
+	 * matches are allowed, and the target string is empty, we are allowed to
+	 * match regardless of the contents of the iter node --- but we would
+	 * prefer to match once, so that capturing parens get set.  (An example of
+	 * the concern here is a pattern like "()*\1", which historically this
+	 * code has allowed to succeed.)  Therefore, we deal with the zero-matches
+	 * case at the bottom, after failing to find any other way to match.
+	 */
+	min_matches = t->min;
+	if (min_matches <= 0)
+		min_matches = 1;
+
+	/*
+	 * We need workspace to track the endpoints of each sub-match.  Normally
+	 * we consider only nonzero-length sub-matches, so there can be at most
+	 * end-begin of them.  However, if min is larger than that, we will also
+	 * consider zero-length sub-matches in order to find enough matches.
+	 *
+	 * For convenience, endpts[0] contains the "begin" pointer and we store
+	 * sub-match endpoints in endpts[1..max_matches].
+	 */
+	max_matches = end - begin;
+	if (max_matches > t->max && t->max != DUPINF)
+		max_matches = t->max;
+	if (max_matches < min_matches)
+		max_matches = min_matches;
+	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+	if (endpts == NULL)
+		return REG_ESPACE;
+	endpts[0] = begin;
+
+	d = getsubdfa(v, t->left);
+	if (ISERR())
+	{
+		FREE(endpts);
+		return v->err;
+	}
+	MDEBUG(("citer %d\n", t->id));
+
+	/*
+	 * Our strategy is to first find a set of sub-match endpoints that are
+	 * valid according to the child node's DFA, and then recursively dissect
+	 * each sub-match to confirm validity.  If any validity check fails,
+	 * backtrack the last sub-match and try again.  And, when we next try for
+	 * a validity check, we need not recheck any successfully verified
+	 * sub-matches that we didn't move the endpoints of.  nverified remembers
+	 * how many sub-matches are currently known okay.
+	 */
+
+	/* initialize to consider first sub-match */
+	nverified = 0;
+	k = 1;
+	limit = end;
+
+	/* iterate until satisfaction or failure */
+	while (k > 0)
+	{
+		/* try to find an endpoint for the k'th sub-match */
+		endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
+		if (ISERR())
+		{
+			FREE(endpts);
+			return v->err;
+		}
+		if (endpts[k] == NULL)
+		{
+			/* no match possible, so see if we can shorten previous one */
+			k--;
+			goto backtrack;
+		}
+		MDEBUG(("%d: working endpoint %d: %ld\n",
+				t->id, k, LOFF(endpts[k])));
+
+		/* k'th sub-match can no longer be considered verified */
+		if (nverified >= k)
+			nverified = k - 1;
+
+		if (endpts[k] != end)
+		{
+			/* haven't reached end yet, try another iteration if allowed */
+			if (k >= max_matches)
+			{
+				/* must try to shorten some previous match */
+				k--;
+				goto backtrack;
+			}
+
+			/* reject zero-length match unless necessary to achieve min */
+			if (endpts[k] == endpts[k - 1] &&
+				(k >= min_matches || min_matches - k < end - endpts[k]))
+				goto backtrack;
+
+			k++;
+			limit = end;
+			continue;
+		}
+
+		/*
+		 * We've identified a way to divide the string into k sub-matches that
+		 * works so far as the child DFA can tell.  If k is an allowed number
+		 * of matches, start the slow part: recurse to verify each sub-match.
+		 * We always have k <= max_matches, needn't check that.
+		 */
+		if (k < min_matches)
+			goto backtrack;
+
+		MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
+
+		for (i = nverified + 1; i <= k; i++)
+		{
+			zaptreesubs(v, t->left);
+			er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+			if (er == REG_OKAY)
+			{
+				nverified = i;
+				continue;
+			}
+			if (er == REG_NOMATCH)
+				break;
+			/* oops, something failed */
+			FREE(endpts);
+			return er;
+		}
+
+		if (i > k)
+		{
+			/* satisfaction */
+			MDEBUG(("%d successful\n", t->id));
+			FREE(endpts);
+			return REG_OKAY;
+		}
+
+		/* match failed to verify, so backtrack */
+
+backtrack:
+
+		/*
+		 * Must consider shorter versions of the current sub-match.  However,
+		 * we'll only ask for a zero-length match if necessary.
+		 */
+		while (k > 0)
+		{
+			chr		   *prev_end = endpts[k - 1];
+
+			if (endpts[k] > prev_end)
+			{
+				limit = endpts[k] - 1;
+				if (limit > prev_end ||
+					(k < min_matches && min_matches - k >= end - prev_end))
+				{
+					/* break out of backtrack loop, continue the outer one */
+					break;
+				}
+			}
+			/* can't shorten k'th sub-match any more, consider previous one */
+			k--;
+		}
+	}
+
+	/* all possibilities exhausted */
+	FREE(endpts);
+
+	/*
+	 * Now consider the possibility that we can match to a zero-length string
+	 * by using zero repetitions.
+	 */
+	if (t->min == 0 && begin == end)
+	{
+		MDEBUG(("%d allowing zero matches\n", t->id));
+		return REG_OKAY;
+	}
+
+	MDEBUG(("%d failed\n", t->id));
+	return REG_NOMATCH;
+}
+
+/*
+ * creviterdissect - dissect match for iteration node, shortest-first
+ *
+ * Divides begin..end into sub-matches of t->left, verifying each by cdissect.
+ */
+static int						/* REG_OKAY, REG_NOMATCH, or an error code */
+creviterdissect(struct vars *v,
+				struct subre *t,
+				chr *begin,		/* beginning of relevant substring */
+				chr *end)		/* end of same */
+{
+	struct dfa *d;				/* DFA for t->left, from getsubdfa() */
+	chr		  **endpts;		/* sub-match endpoints; endpts[0] is begin */
+	chr		   *limit;			/* earliest end allowed for k'th sub-match */
+	int			min_matches;	/* t->min, except 1 if that was <= 0 */
+	size_t		max_matches;	/* cap on sub-matches; endpts[] has one more */
+	int			nverified;		/* sub-matches 1..nverified passed cdissect */
+	int			k;				/* sub-match being placed; 0 ends the search */
+	int			i;				/* loop index over sub-matches to verify */
+	int			er;				/* result of the latest cdissect() call */
+
+	assert(t->op == '*');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->left->flags & SHORTER);
+	assert(begin <= end);
+
+	/*
+	 * If zero matches are allowed, and target string is empty, just declare
+	 * victory.  OTOH, if target string isn't empty, zero matches can't work
+	 * so we pretend the min is 1.
+	 */
+	min_matches = t->min;
+	if (min_matches <= 0)
+	{
+		if (begin == end)
+			return REG_OKAY;
+		min_matches = 1;
+	}
+
+	/*
+	 * We need workspace to track the endpoints of each sub-match.  Normally
+	 * we consider only nonzero-length sub-matches, so there can be at most
+	 * end-begin of them.  However, if min is larger than that, we will also
+	 * consider zero-length sub-matches in order to find enough matches.
+	 *
+	 * For convenience, endpts[0] contains the "begin" pointer and we store
+	 * sub-match endpoints in endpts[1..max_matches].
+	 */
+	max_matches = end - begin;
+	if (max_matches > t->max && t->max != DUPINF)
+		max_matches = t->max;
+	if (max_matches < min_matches)
+		max_matches = min_matches;
+	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
+	if (endpts == NULL)
+		return REG_ESPACE;
+	endpts[0] = begin;
+
+	d = getsubdfa(v, t->left);
+	if (ISERR())
+	{
+		FREE(endpts);
+		return v->err;
+	}
+	MDEBUG(("creviter %d\n", t->id));
+
+	/*
+	 * Our strategy is to first find a set of sub-match endpoints that are
+	 * valid according to the child node's DFA, and then recursively dissect
+	 * each sub-match to confirm validity.  If any validity check fails,
+	 * backtrack the last sub-match and try again.  And, when we next try for
+	 * a validity check, we need not recheck any successfully verified
+	 * sub-matches that we didn't move the endpoints of.  nverified remembers
+	 * how many sub-matches are currently known okay.
+	 */
+
+	/* initialize to consider first sub-match, shortest candidate first */
+	nverified = 0;
+	k = 1;
+	limit = begin;
+
+	/* iterate until satisfaction or failure (k reaching 0 means failure) */
+	while (k > 0)
+	{
+		/* disallow zero-length match unless necessary to achieve min */
+		if (limit == endpts[k - 1] &&
+			limit != end &&
+			(k >= min_matches || min_matches - k < end - limit))
+			limit++;
+
+		/* if this is the last allowed sub-match, it must reach to the end */
+		if (k >= max_matches)
+			limit = end;
+
+		/* try to find an endpoint, between limit and end, for k'th sub-match */
+		endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
+							 (chr **) NULL, (int *) NULL);
+		if (ISERR())
+		{
+			FREE(endpts);
+			return v->err;
+		}
+		if (endpts[k] == NULL)
+		{
+			/* no match possible, so see if we can lengthen previous one */
+			k--;
+			goto backtrack;
+		}
+		MDEBUG(("%d: working endpoint %d: %ld\n",
+				t->id, k, LOFF(endpts[k])));
+
+		/* k'th sub-match has a new endpoint, so it is no longer verified */
+		if (nverified >= k)
+			nverified = k - 1;
+
+		if (endpts[k] != end)
+		{
+			/* haven't reached end yet, try another iteration if allowed */
+			if (k >= max_matches)
+			{
+				/* must try to lengthen some previous match */
+				k--;
+				goto backtrack;
+			}
+
+			k++;
+			limit = endpts[k - 1];	/* loop top decides if zero length is OK */
+			continue;
+		}
+
+		/*
+		 * We've identified a way to divide the string into k sub-matches that
+		 * works so far as the child DFA can tell.  If k is an allowed number
+		 * of matches, start the slow part: recurse to verify each sub-match.
+		 * We always have k <= max_matches, needn't check that.
+		 */
+		if (k < min_matches)
+			goto backtrack;
+
+		MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
+
+		/* sub-matches 1..nverified are still known good; check the rest */
+		for (i = nverified + 1; i <= k; i++)
+		{
+			zaptreesubs(v, t->left);
+			er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+			if (er == REG_OKAY)
+			{
+				nverified = i;
+				continue;
+			}
+			if (er == REG_NOMATCH)
+				break;
+			/* oops, something failed */
+			FREE(endpts);
+			return er;
+		}
+
+		if (i > k)
+		{
+			/* satisfaction: the loop above verified every sub-match */
+			MDEBUG(("%d successful\n", t->id));
+			FREE(endpts);
+			return REG_OKAY;
+		}
+
+		/* match failed to verify, so backtrack */
+
+backtrack:
+
+		/* must consider longer versions of the current sub-match */
+		while (k > 0)
+		{
+			if (endpts[k] < end)
+			{
+				limit = endpts[k] + 1;
+				/* break out of backtrack loop, continue the outer one */
+				break;
+			}
+			/* can't lengthen k'th sub-match any more, consider previous one */
+			k--;
+		}
+	}
+
+	/* all possibilities exhausted */
+	MDEBUG(("%d failed\n", t->id));
+	FREE(endpts);
+	return REG_NOMATCH;
+}
+
+
+
+#include "rege_dfa.c"
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c
new file mode 100644
index 0000000..a925a9f
--- /dev/null
+++ b/src/backend/regex/regexport.c
@@ -0,0 +1,292 @@
+/*-------------------------------------------------------------------------
+ *
+ * regexport.c
+ *	  Functions for exporting info about a regex's NFA
+ *
+ * In this implementation, the NFA defines a necessary but not sufficient
+ * condition for a string to match the regex: that is, there can be strings
+ * that match the NFA but don't match the full regex, but not vice versa.
+ * Thus, for example, it is okay for the functions below to treat lookaround
+ * constraints as no-ops, since they merely constrain the string some more.
+ *
+ * Notice that these functions return info into caller-provided arrays
+ * rather than doing their own malloc's.  This simplifies the APIs by
+ * eliminating a class of error conditions, and in the case of colors
+ * allows the caller to decide how big is too big to bother with.
+ *
+ *
+ * Portions Copyright (c) 2013-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regexport.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+#include "regex/regexport.h"
+
+
+/*
+ * pg_reg_getnumstates
+ *
+ * Report how many states there are in the regex's "search" cNFA.  The
+ * caller must pass a valid compiled regex (checked by assertion only).
+ */
+int
+pg_reg_getnumstates(const regex_t *regex)
+{
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+
+	return ((struct guts *) regex->re_guts)->search.nstates;
+}
+
+/*
+ * pg_reg_getinitialstate
+ *
+ * Report the state number recorded as "pre" in the regex's "search" cNFA,
+ * which is the NFA's initial state.
+ */
+int
+pg_reg_getinitialstate(const regex_t *regex)
+{
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+
+	return ((struct guts *) regex->re_guts)->search.pre;
+}
+
+/*
+ * pg_reg_getfinalstate
+ *
+ * Report the state number recorded as "post" in the regex's "search" cNFA,
+ * which is the NFA's final state.
+ */
+int
+pg_reg_getfinalstate(const regex_t *regex)
+{
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+
+	return ((struct guts *) regex->re_guts)->search.post;
+}
+
+/*
+ * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() hide LACON arcs from their
+ * callers by treating every LACON as automatically satisfied.  The output
+ * representation has no way to express an arc that consumes no character,
+ * so LACON arcs are followed recursively here and the ordinary arcs that
+ * become reachable through them are reported instead.
+ *
+ * This scheme would break down if the final state could be reached by LACON
+ * traversal, but the regex library never builds NFAs in which a LACON arc
+ * leads directly to the final state.  (The regex executor is designed to
+ * consume one character beyond the nominal match end --- possibly an EOS
+ * indicator --- so there is always a set of ordinary arcs leading to the
+ * final state.)
+ *
+ * traverse_lacons is the recursive worker shared by both exported functions.
+ * It adds the number of reachable ordinary arcs to *arcs_count, and stores
+ * into arcs[] those whose running index is below arcs_len (possibly none).
+ */
+static void
+traverse_lacons(struct cnfa *cnfa, int st,
+				int *arcs_count,
+				regex_arc_t *arcs, int arcs_len)
+{
+	struct carc *arc;
+
+	/*
+	 * This function recurses, so in theory it could be driven to stack
+	 * overflow.  The check mostly serves as a backstop in case the regex
+	 * compiler failed to remove a loop of LACON arcs.
+	 */
+	check_stack_depth();
+
+	arc = cnfa->states[st];
+	while (arc->co != COLORLESS)
+	{
+		if (arc->co >= cnfa->ncolors)
+		{
+			/* LACON arc: it must not lead directly to the post state ... */
+			Assert(arc->to != cnfa->post);
+			/* ... so take it as satisfied and report what lies beyond it */
+			traverse_lacons(cnfa, arc->to, arcs_count, arcs, arcs_len);
+		}
+		else
+		{
+			/* ordinary arc: always counted, stored only while room remains */
+			if (*arcs_count < arcs_len)
+			{
+				arcs[*arcs_count].co = arc->co;
+				arcs[*arcs_count].to = arc->to;
+			}
+			(*arcs_count)++;
+		}
+		arc++;
+	}
+}
+
+/*
+ * Count the outgoing NFA arcs of state number "st" (0 if "st" is invalid).
+ */
+int
+pg_reg_getnumoutarcs(const regex_t *regex, int st)
+{
+	struct guts *g;
+	int			total = 0;
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	g = (struct guts *) regex->re_guts;
+
+	/* with no output array supplied, traverse_lacons() merely counts */
+	if (st >= 0 && st < g->search.nstates)
+		traverse_lacons(&g->search, st, &total, NULL, 0);
+
+	return total;
+}
+
+/*
+ * Fill arcs[] with the outgoing NFA arcs of state number "st".  At most
+ * arcs_len entries are stored, so size the array from pg_reg_getnumoutarcs()
+ * to get them all.  Nothing is written if "st" is invalid or arcs_len <= 0.
+ */
+void
+pg_reg_getoutarcs(const regex_t *regex, int st,
+				  regex_arc_t *arcs, int arcs_len)
+{
+	struct guts *g;
+	int			nfound = 0;		/* running count, discarded afterwards */
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	g = (struct guts *) regex->re_guts;
+
+	if (arcs_len <= 0 || st < 0 || st >= g->search.nstates)
+		return;
+
+	traverse_lacons(&g->search, st, &nfound, arcs, arcs_len);
+}
+
+/*
+ * pg_reg_getnumcolors
+ *
+ * Report the total number of colors, which is one more than the colormap's
+ * "max" field.
+ */
+int
+pg_reg_getnumcolors(const regex_t *regex)
+{
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+
+	return ((struct guts *) regex->re_guts)->cmap.max + 1;
+}
+
+/*
+ * pg_reg_colorisbegin
+ *
+ * Report whether color "co" is a beginning-of-line/string color, that is,
+ * whether it equals either bos[] entry of the "search" cNFA.
+ *
+ * (We might at some point need to offer more refined handling of pseudocolors,
+ * but this will do for now.)
+ */
+int
+pg_reg_colorisbegin(const regex_t *regex, int co)
+{
+	struct guts *g;
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	g = (struct guts *) regex->re_guts;
+
+	return (co == g->search.bos[0] || co == g->search.bos[1]);
+}
+
+/*
+ * pg_reg_colorisend
+ *
+ * Report whether color "co" is an end-of-line/string color, that is,
+ * whether it equals either eos[] entry of the "search" cNFA.
+ */
+int
+pg_reg_colorisend(const regex_t *regex, int co)
+{
+	struct guts *g;
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	g = (struct guts *) regex->re_guts;
+
+	return (co == g->search.eos[0] || co == g->search.eos[1]);
+}
+
+/*
+ * Report how many member chrs color number "co" has.
+ *
+ * The result is -1 when the color number is out of range, when it names a
+ * special color (WHITE, which is color 0, or a pseudocolor), or when the
+ * member count is uncertain.  Callers must not attempt to fetch the members
+ * after getting -1.
+ */
+int
+pg_reg_getnumcharacters(const regex_t *regex, int co)
+{
+	struct colormap *map;
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	map = &((struct guts *) regex->re_guts)->cmap;
+
+	/* color numbers above max are invalid; 0 is WHITE, rejected as well */
+	if (co <= 0 || co > map->max)
+		return -1;
+
+	/*
+	 * Refuse pseudocolors (BOS etc).  Also refuse any color that shows up in
+	 * the high colormap (nuchrs != 0): its member count is treated as
+	 * uncertain.  In principle all the specific chrs behind each such entry
+	 * could be determined, but that would be expensive (particularly if
+	 * character class tests are required) and doesn't seem worth it.
+	 */
+	if ((map->cd[co].flags & PSEUDO) != 0 || map->cd[co].nuchrs != 0)
+		return -1;
+
+	/* what remains is a color whose number of member chrs is known */
+	return map->cd[co].nschrs;
+}
+
+/*
+ * Store the member chrs of color number "co" into chars[].  At most
+ * chars_len entries are written, so size the array from
+ * pg_reg_getnumcharacters() if all members are wanted.
+ * Nothing is written for WHITE, for a pseudocolor, for an out-of-range
+ * color number, or when chars_len is not positive.
+ *
+ * Caution: this is a relatively expensive operation.
+ */
+void
+pg_reg_getcharacters(const regex_t *regex, int co,
+					 pg_wchar *chars, int chars_len)
+{
+	struct colormap *map;
+	chr			ch;
+	int			nstored = 0;
+
+	assert(regex != NULL && regex->re_magic == REMAGIC);
+	map = &((struct guts *) regex->re_guts)->cmap;
+
+	if (chars_len <= 0 || co <= 0 || co > map->max)
+		return;
+	if ((map->cd[co].flags & PSEUDO) != 0)
+		return;
+
+	/*
+	 * Only the low character map is scanned; there should not be any
+	 * matching entries in the high map.
+	 */
+	for (ch = CHR_MIN; ch <= MAX_SIMPLE_CHR; ch++)
+	{
+		if (map->locolormap[ch - CHR_MIN] != co)
+			continue;
+		chars[nstored++] = ch;
+		if (nstored == chars_len)
+			return;				/* output array is full */
+	}
+}
diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c
new file mode 100644
index 0000000..ae17ae7
--- /dev/null
+++ b/src/backend/regex/regfree.c
@@ -0,0 +1,54 @@
+/*
+ * regfree - free an RE
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ *
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them.
+ *
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ *
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * src/backend/regex/regfree.c
+ *
+ *
+ * You might think that this could be incorporated into regcomp.c, and
+ * that would be a reasonable idea... except that this is a generic
+ * function (with a generic name), applicable to all compiled REs
+ * regardless of the size of their characters, whereas the stuff in
+ * regcomp.c gets compiled once per character size.
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * pg_regfree - free an RE
+ *
+ * This is the generic entry point; the real work is done by the "free"
+ * routine in the RE's own function table.  A NULL argument is ignored.
+ */
+void
+pg_regfree(regex_t *re)
+{
+	if (re != NULL)
+		(*((struct fns *) re->re_fns)->free) (re);
+}
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644
index 0000000..991b868
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,260 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ *	  Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int	findprefix(struct cnfa *cnfa, struct colormap *cm,
+					   chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * The result is one of:
+ *	REG_NOMATCH: strings matching the regex share no common prefix
+ *	REG_PREFIX: strings matching the regex share a common prefix
+ *	REG_EXACT: every string satisfying the regex must equal one fixed string
+ *	or else a REG_XXX error code
+ *
+ * For REG_PREFIX and REG_EXACT, *string receives a malloc'd array holding the
+ * common prefix or exact value, and *slength its length (counted in chrs,
+ * not bytes!).  In every other case *string is NULL and *slength is zero.
+ *
+ * Complex cases (such as lookaround constraints) are not analyzed exactly.
+ * So some strings that match the reported prefix or exact-match string may
+ * fail to satisfy the regex; but a string that does satisfy the regex should
+ * always match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+			 chr **string,
+			 size_t *slength)
+{
+	struct guts *internals;
+	struct cnfa *top;
+	int			result;
+
+	/* the output pointers are mandatory; preset them for failure exits */
+	if (string == NULL || slength == NULL)
+		return REG_INVARG;
+	*string = NULL;
+	*slength = 0;
+
+	/* the RE itself must be valid, and built for our size of chr */
+	if (re == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
+
+	/* a regex flagged REG_UIMPOSSIBLE has no prefix to report */
+	internals = (struct guts *) re->re_guts;
+	if ((internals->info & REG_UIMPOSSIBLE) != 0)
+		return REG_NOMATCH;
+
+	/*
+	 * Only the cNFA of the topmost regex tree node is examined, so constraints
+	 * such as backrefs are not fully applied; the API spec above permits that.
+	 */
+	assert(internals->tree != NULL);
+	top = &internals->tree->cnfa;
+
+	/*
+	 * A correct NFA has no exit-free loops, so the traversal should never
+	 * revisit a state; nstates chrs of output space is therefore enough.
+	 */
+	*string = (chr *) MALLOC(top->nstates * sizeof(chr));
+	if (*string == NULL)
+		return REG_ESPACE;
+
+	/* walk the NFA, collecting prefix chrs into the new array */
+	result = findprefix(top, &internals->cmap, *string, slength);
+
+	assert(*slength <= top->nstates);
+
+	/* on success the array is handed over to the caller */
+	if (result == REG_PREFIX || result == REG_EXACT)
+		return result;
+
+	/* otherwise release it and restore the failure-case outputs */
+	FREE(*string);
+	*string = NULL;
+	*slength = 0;
+
+	return result;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int						/* REG_EXACT, REG_PREFIX or REG_NOMATCH */
+findprefix(struct cnfa *cnfa,
+		   struct colormap *cm,
+		   chr *string,
+		   size_t *slength)
+{
+	int			st;				/* state currently being examined */
+	int			nextst;			/* unique successor state, or -1 if none */
+	color		thiscolor;		/* sole plain outarc color, or COLORLESS */
+	chr			c;				/* candidate member chr of thiscolor */
+	struct carc *ca;			/* scans a state's COLORLESS-ended arc list */
+
+	/*
+	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+	 * anchored left.  If we have both BOS and BOL, they must go to the same
+	 * next state.
+	 */
+	st = cnfa->pre;
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+				return REG_NOMATCH;
+		}
+		else
+			return REG_NOMATCH;
+	}
+	/* a "pre" state with no outarcs at all gives us nowhere to go */
+	if (nextst == -1)
+		return REG_NOMATCH;
+
+	/*
+	 * Scan through successive states, stopping as soon as we find one with
+	 * more than one acceptable transition character (either multiple colors
+	 * on out-arcs, or a color with more than one member chr).
+	 *
+	 * We could find a state with multiple out-arcs that are all labeled with
+	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+	 * In that case we add the chr "c" to the output string but then exit the
+	 * loop with nextst == -1.  This leaves a little bit on the table: if the
+	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
+	 * worth the trouble.
+	 */
+	do
+	{
+		st = nextst;
+		nextst = -1;
+		thiscolor = COLORLESS;
+		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+		{
+			/* We can ignore BOS/BOL arcs */
+			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+				continue;
+			/* ... but EOS/EOL arcs terminate the search, as do LACONs */
+			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
+				ca->co >= cnfa->ncolors)
+			{
+				thiscolor = COLORLESS;
+				break;
+			}
+			if (thiscolor == COLORLESS)
+			{
+				/* First plain outarc */
+				thiscolor = ca->co;
+				nextst = ca->to;
+			}
+			else if (thiscolor == ca->co)
+			{
+				/* Another plain outarc for same color: no unique successor */
+				nextst = -1;
+			}
+			else
+			{
+				/* More than one plain outarc color terminates the search */
+				thiscolor = COLORLESS;
+				break;
+			}
+		}
+		/* Done if we didn't find exactly one color on plain outarcs */
+		if (thiscolor == COLORLESS)
+			break;
+		/* The color must be a singleton */
+		if (cm->cd[thiscolor].nschrs != 1)
+			break;
+		/* Must not have any high-color-map entries */
+		if (cm->cd[thiscolor].nuchrs != 0)
+			break;
+
+		/*
+		 * Identify the color's sole member chr and add it to the prefix
+		 * string.  In general the colormap data structure doesn't provide a
+		 * way to find color member chrs, except by trying GETCOLOR() on each
+		 * possible chr value, which won't do at all.  However, for the cases
+		 * we care about it should be sufficient to test the "firstchr" value,
+		 * that is the first chr ever added to the color.  There are cases
+		 * where this might no longer be a member of the color (so we do need
+		 * to test), but none of them are likely to arise for a character that
+		 * is a member of a common prefix.  If we do hit such a corner case,
+		 * we just fall out without adding anything to the prefix string.
+		 */
+		c = cm->cd[thiscolor].firstchr;
+		if (GETCOLOR(cm, c) != thiscolor)
+			break;
+
+		string[(*slength)++] = c;
+
+		/* Advance to next state, but only if we have a unique next state */
+	} while (nextst != -1);
+
+	/*
+	 * st is now the last state examined above.  If it has only EOS/EOL
+	 * outarcs, all leading to the "post" state, then we have an exact-match
+	 * string.  Note this is true even if the string is of zero length.
+	 */
+	nextst = -1;
+	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+	{
+		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+		{
+			if (nextst == -1)
+				nextst = ca->to;
+			else if (nextst != ca->to)
+			{
+				nextst = -1;
+				break;
+			}
+		}
+		else
+		{
+			nextst = -1;
+			break;
+		}
+	}
+	if (nextst == cnfa->post)
+		return REG_EXACT;
+
+	/*
+	 * Otherwise, if we were unable to identify any prefix characters, say
+	 * NOMATCH --- the pattern is anchored left, but doesn't specify any
+	 * particular first character.
+	 */
+	if (*slength > 0)
+		return REG_PREFIX;
+
+	return REG_NOMATCH;
+}