natefw

natefw Mercurial Source Tree


Root/src/Pattern.cpp

/**
  From the author (Jeff Stuart)
  "
  Let me start by saying this file is pretty big. If you feel up to it, you can
  try making changes yourself, but you would be better off to just email me at
  stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you
  would like added. This project is very "near and dear" to me, so I am fairly
  quick to make bug fixes. The header files for Pattern and Matcher are fairly
  well documented and the function names are pretty self-explanatory, but if you
  are having any trouble, feel free to email me at stuart@cs.ucdavis.edu.
 
  If you email me, make sure you put something like C++RE in the subject because
  I tend to delete email if I don't recognize the name and the subject is
  something like "I Need Your Help" or "Got A Second" or "I Found It".
  "
 */
 
/*
  Detailed documentation is provided in this class' header file
 
  @author   Jeffery Stuart
  @since    November 24th, 2007
  @version  1.09.00
*/
 
#ifdef _WIN32
  #pragma warning(push)
  #pragma warning(disable:4996)
  #define str_icmp stricmp
#else
  #define str_icmp strcasecmp
#endif
 
#include "regexp/Pattern.h"
#include "regexp/Matcher.h"
#include <cstdio>
#include <cstring>
#include <typeinfo>
#include <algorithm>
 
std::map<std::string, Pattern *> Pattern::compiledPatterns;
std::map<std::string, std::pair<std::string, unsigned long> > Pattern::registeredPatterns;
 
const int Pattern::MIN_QMATCH = 0x00000000;
const int Pattern::MAX_QMATCH = 0x7FFFFFFF;
 
const unsigned long Pattern::CASE_INSENSITIVE       = 0x01;
const unsigned long Pattern::LITERAL                = 0x02;
const unsigned long Pattern::DOT_MATCHES_ALL        = 0x04;
const unsigned long Pattern::MULTILINE_MATCHING     = 0x08;
const unsigned long Pattern::UNIX_LINE_MODE         = 0x10;
 
Pattern::Pattern(const std::string & rhs)
{
  matcher = NULL;
  pattern = rhs;
  curInd = 0;
  groupCount = 0;
  nonCapGroupCount = 0;
  error = 0;
  head = NULL;
}
// convenience function in case we want to add any extra debugging output
void Pattern::raiseError()
{
  switch (pattern[curInd - 1])
  {
  case '*':
  case ')':
  case '+':
  case '?':
  case ']':
  case '}':
    fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
    fprintf(stderr, "Syntax Error near here. Possible unescaped meta character.\n");
    break;
  default:
    fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
    fprintf(stderr, "Syntax Error near here. \n");
    break;
  }
  error = 1;
}
NFANode * Pattern::registerNode(NFANode * node)
{
  nodes[node] = 1;
  return node;
}
 
std::string Pattern::classUnion      (std::string s1, std::string s2)  const
{
  char out[300];
  std::sort(s1.begin(), s1.end());
  std::sort(s2.begin(), s2.end());
  *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
  return out;
}
std::string Pattern::classIntersect  (std::string s1, std::string s2)  const
{
  char out[300];
  std::sort(s1.begin(), s1.end());
  std::sort(s2.begin(), s2.end());
  *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
  return out;
}
std::string Pattern::classNegate     (std::string s1)                  const
{
  char out[300];
  int i, ind = 0;
  std::map<char, bool> m;
 
  for (i = 0; i < (int)s1.size(); ++i)
  {
    m[s1[i]] = 1;
    if (flags & CASE_INSENSITIVE)
    {
      if      (s1[i] >= 'a' && s1[i] <= 'z') m[s1[i] - 'a' + 'A'] = 1;
      else if (s1[i] >= 'A' && s1[i] <= 'Z') m[s1[i] - 'A' + 'a'] = 1;
    }
  }
  for (i = 0xFF; i >= 0; --i) if (m.find((char)i) == m.end()) out[ind++] = (char)i;
  out[ind] = 0;
  return std::string(out, ind);
}
std::string Pattern::classCreateRange(char low, char hi)    const
{
  char out[300];
  int ind = 0;
  while (low != hi) out[ind++] = low++;
  out[ind++] = low;
  return std::string(out, ind);
}
 
int Pattern::getInt(int start, int end)
{
  int ret = 0;
  for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - '0');
  return ret;
}
bool Pattern::quantifyCurly(int & sNum, int & eNum)
{
  bool good = 1;
  int i, ci = curInd + 1;
  int commaInd = ci, endInd = ci, len = pattern.size();
  sNum = eNum = 0;
 
  while (endInd   < len     && pattern[endInd  ] != '}') ++endInd;
  while (commaInd < endInd  && pattern[commaInd] != ',') ++commaInd;
  if (endInd >= len) { raiseError(); return 0; }
  for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0;
  if (!good && commaInd < endInd) { raiseError(); return 0; }
  if (!good) return 0;
  /* so now everything in here is either a comma (and there is at most one comma) or a digit */
  if (commaInd == ci) // {,*}
  {
    if (endInd == commaInd + 1)    { sNum = MIN_QMATCH;               eNum = MAX_QMATCH;                        } // {,} = *
    else                           { sNum = MIN_QMATCH;               eNum = getInt(commaInd + 1, endInd - 1);  } // {,+}
  }
  else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH;                        } // {+,}
  else if (commaInd == endInd)     { sNum = getInt(ci, endInd - 1);   eNum = sNum;                              } // {+}
  else                             { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1);  } // {+,+}
  curInd = endInd + 1;
  return 1;
}
NFANode * Pattern::quantifyGroup(NFANode * start, NFANode * stop, const int gn)
{
  NFANode * newNode = NULL;
  int type = 0;
 
  if (curInd < (int)pattern.size())
  {
    char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
    switch (pattern[curInd])
    {
    case '*':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '?':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, 1, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '+':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, 1, MAX_QMATCH, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '{':
      {
        int s, e;
        if (quantifyCurly(s, e))
        {
          ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
          switch (ch)
          {
          case '?': ++curInd; type = 1; break;
          case '+': ++curInd; type = 2; break;
          }
          newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
          newNode->next = registerNode(new NFAGroupLoopNode(start, s, e, gn, type));
          stop->next = newNode->next;
          return newNode;
        }
      }
    default:
      break;
    }
  }
  return NULL;
}
 
NFANode * Pattern::quantify(NFANode * newNode)
{
  if (curInd < (int)pattern.size())
  {
    char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
    switch (pattern[curInd])
    {
    case '*':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      }
      break;
    case '?':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, MIN_QMATCH, 1)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, 1)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, MIN_QMATCH, 1)); break;
      }
      break;
    case '+':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, 1, MAX_QMATCH)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, 1, MAX_QMATCH)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, 1, MAX_QMATCH)); break;
      }
      break;
    case '{':
      {
        int s, e;
        if (quantifyCurly(s, e))
        {
          ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
          switch (ch)
          {
          case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, s, e)); break;
          case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, s, e)); break;
          default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, s, e)); break;
          }
        }
      }
      break;
    default:
      break;
    }
  }
  return newNode;
}
std::string Pattern::parseClass()
{
  std::string t, ret = "";
  char ch, c1, c2;
  bool inv = 0, neg = 0, quo = 0;
 
  if (curInd < (int)pattern.size() && pattern[curInd] == '^')
  {
    ++curInd;
    neg = 1;
  }
  while (curInd < (int)pattern.size() && pattern[curInd] != ']')
  {
    ch = pattern[curInd++];
    if (ch == '[')
    {
      t = parseClass();
      ret = classUnion(ret, t);
    }
    /*else if (ch == '-')
    {
      raiseError();
      curInd = pattern.size();
    }*/
    else if (ch == '&' && curInd < (int)pattern.size() && pattern[curInd] == '&')
    {
      if (pattern[++curInd] != '[')
      {
        raiseError();
        curInd = pattern.size();
      }
      else
      {
        ++curInd;
        t = parseClass();
        ret = classIntersect(ret, t);
      }
    }
    else if (ch == '\\')
    {
      t = parseEscape(inv, quo);
      if (quo)
      {
        raiseError();
        curInd = pattern.size();
      }
      else if (inv || t.size() > 1) // cant be part of a range (a-z)
      {
        if (inv) t = classNegate(t);
        ret = classUnion(ret, t);
      }
      else if (curInd < (int)pattern.size() && pattern[curInd] == '-') // part of a range (a-z)
      {
        c1 = t[0];
        ++curInd;
        if (curInd >= (int)pattern.size()) raiseError();
        else
        {
          c2 = pattern[curInd++];
          if (c2 == '\\')
          {
            t = parseEscape(inv, quo);
            if (quo)
            {
              raiseError();
              curInd = pattern.size();
            }
            else if (inv || t.size() > 1) raiseError();
            else ret = classUnion(ret, classCreateRange(c1, c2));
          }
          else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
          {
            raiseError();
            curInd = pattern.size();
          }
          else ret = classUnion(ret, classCreateRange(c1, c2));
        }
      }
      else
      {
        ret = classUnion(ret, t);
      }
    }
    else if (curInd < (int)pattern.size() && pattern[curInd] == '-')
    {
      c1 = ch;
      ++curInd;
      if (curInd >= (int)pattern.size()) raiseError();
      else
      {
        c2 = pattern[curInd++];
        if (c2 == '\\')
        {
          t = parseEscape(inv, quo);
          if (quo)
          {
            raiseError();
            curInd = pattern.size();
          }
          else if (inv || t.size() > 1) raiseError();
          else ret = classUnion(ret, classCreateRange(c1, c2));
        }
        else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
        {
          raiseError();
          curInd = pattern.size();
        }
        else
        {
          ret = classUnion(ret, classCreateRange(c1, c2));
        }
      }
    }
    else
    {
      ret += " ";
      ret[ret.size() - 1] = ch;
    }
  }
  if (curInd >= (int)pattern.size() || pattern[curInd] != ']')
  {
    raiseError();
    ret = "";
  }
  else
  {
    ++curInd;
    if (neg) ret = classNegate(ret);
  }
  return ret;
}
std::string Pattern::parsePosix()
{
  std::string s7 = pattern.substr(curInd, 7);
  if (s7 == "{Lower}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyz";                                                                       }
  if (s7 == "{Upper}")  { curInd += 7; return "ABCDEFGHIJKLMNOPQRSTUVWXYZ";                                                                       }
  if (s7 == "{Alpha}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";                                             }
  if (s7 == "{Digit}")  { curInd += 7; return "0123456789";                                                                                       }
  if (s7 == "{Alnum}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";                                   }
  if (s7 == "{Punct}")  { curInd += 7; return "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";                                                               }
  if (s7 == "{Graph}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == "{Print}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == "{Blank}")  { curInd += 7; return " \t";                                                                                              }
  if (s7 == "{Space}")  { curInd += 7; return " \t\n\x0B\f\r";                                                                                    }
  if (s7 == "{Cntrl}")
  {
    int i;
    std::string s = " ";
 
    for (i = 0; i < 5; ++i) s += s;
    s += " ";
    for (i = 0; i <= 0x1F; ++i) s[i] = i;
    s[0x20] = 0x7F;
    curInd += 7;
    return s;
  }
  if (s7 == "{ASCII}")
  {
    std::string s(0x80, ' ');
    for (int i = 0; i < 0x80; ++i) s[i] = i;
    curInd += 7;
    return s;
  }
  if (pattern.substr(curInd, 8) == "{XDigit}") { curInd += 8; return "abcdefABCDEF0123456789"; }
  raiseError();
  return "";
}
NFANode * Pattern::parseBackref()
{
  #define is_dig(x) ((x) >= '0' && (x) <= '9')
  #define to_int(x) ((x) - '0')
  int ci = curInd;
  int oldRef = 0, ref = 0;
 
  while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount))
  {
    oldRef = ref;
    ref = ref * 10 + to_int(pattern[ci++]);
  }
  if (ci == (int)pattern.size())
  {
    oldRef = ref;
    ++ci;
  }
  if (oldRef < 0 || ci <= curInd)
  {
    raiseError();
    return registerNode(new NFAReferenceNode(-1));
  }
  curInd = ci;
  return registerNode(new NFAReferenceNode(ref));
 
  #undef is_dig
  #undef to_int
}
std::string Pattern::parseOctal()
{
  #define islowoc(x)  ((x) >= '0' && (x) <= '3')
  #define isoc(x)     ((x) >= '0' && (x) <= '7')
  #define fromoc(x)   ((x) - '0')
  int ci = curInd;
  char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
  char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
  char ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1;
  std::string s = " ";
 
  if (islowoc(ch1) && isoc(ch2))
  {
    curInd += 2;
    s[0] = fromoc(ch1) * 8 + fromoc(ch2);
    if (isoc(ch3))
    {
      ++curInd;
      s[0] = s[0] * 8 + fromoc(ch3);
    }
  }
  else if (isoc(ch1) && isoc(ch2))
  {
    curInd += 2;
    s[0] = fromoc(ch1) * 8 + fromoc(ch2);
  }
  else raiseError();
 
  return s;
  #undef islowoc
  #undef isoc
  #undef fromoc
}
std::string Pattern::parseHex()
{
  #define to_low(x)   (((x) >= 'A' && (x) <= 'Z') ? ((x) - 'A' + 'a') : (x))
  #define is_dig(x)   ((x) >= '0' && (x) <= '9')
  #define is_hex(x)   (is_dig(x) || (to_low(x) >= 'a' && to_low(x) <= 'f'))
  #define to_int(x)   ((is_dig(x)) ? ((x) - '0') : (to_low(x) - 'a' + 10))
 
  int ci = curInd;
  char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
  char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
  std::string s = " ";
 
  if (is_hex(ch1) && is_hex(ch2))
  {
    curInd += 2;
    s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F);
  }
 
  return s;
  #undef to_low
  #undef is_dig
  #undef is_hex
  #undef to_int
}
std::string Pattern::parseEscape(bool & inv, bool & quo)
{
  char ch = pattern[curInd++];
  std::string classes = "";
 
  if (curInd > (int)pattern.size())
  {
    raiseError();
    return NULL;
  }
 
  quo = 0;
  inv = 0;
  switch (ch)
  {
  case 'p': classes = parsePosix();                                                         break;
  case 'P': classes = "!!"; classes += parsePosix();                                        break;
  case 'd': classes = "0123456789";                                                         break;
  case 'D': classes = "!!0123456789";                                                       break;
  case 's': classes = " \t\r\n\f";                                                          break;
  case 'S': classes = "!! \t\r\n\f";                                                        break;
  case 'w': classes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";    break;
  case 'W': classes = "!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"break;
  case '0': classes = parseOctal(); break;
  case 'x': classes = parseHex();   break;
 
  case 'Q': quo = 1;        break;
  case 't': classes = "\t"; break;
  case 'r': classes = "\r"; break;
  case 'n': classes = "\n"; break;
  case 'f': classes = "\f"; break;
  case 'a': classes = "\a"; break;
  case 'e': classes = "\r"; break;
  default:  classes = " "; classes[0] = ch; break;
  }
  if (classes.substr(0, 2) == "!!")
  {
    classes = classes.substr(2);
    inv = 1;
  }
  return classes;
}
NFANode * Pattern::parseRegisteredPattern(NFANode ** end)
{
  int i, j;
  std::string s;
  NFANode * ret = NULL;
  for (i = curInd; i < (int)pattern.size() && pattern[i] != '}'; ++i) { }
  if (pattern[i] != '}') { raiseError(); return NULL; }
  if (i == curInd + 1)   { raiseError(); return NULL; } // {}
  if (
      !(
        (pattern[curInd] >= 'a' && pattern[curInd] <= 'z') ||
        (pattern[curInd] >= 'A' && pattern[curInd] <= 'Z') ||
        (pattern[curInd] == '_')
       )
     )
  {
    raiseError();
    return NULL;
  }
  for (j = curInd; !error && j < i; ++j)
  {
    if (
        !(
          (pattern[j] >= 'a' && pattern[j] <= 'z') ||
          (pattern[j] >= 'A' && pattern[j] <= 'Z') ||
          (pattern[j] >= '0' && pattern[j] <= '9') ||
          (pattern[j] == '_')
         )
        )
    {
      raiseError();
      return NULL;
    }
  }
  s = pattern.substr(curInd, i - curInd);
  if (registeredPatterns.find(s) == registeredPatterns.end()) raiseError();
  else
  {
    unsigned long oflags = flags;
    std::string op = pattern;
    int ci = i + 1;
 
    pattern = registeredPatterns[s].first;
    curInd = 0;
    flags = registeredPatterns[s].second;
 
    --groupCount;
    ret = parse(0, 0, end);
 
    pattern = op;
    curInd = ci;
    flags = oflags;
  }
  if (error) { *end = ret = NULL; }
  return ret;
}
 
// look behind should interpret everything as a literal (except \\) since the
// pattern must have a concrete length
NFANode * Pattern::parseBehind(const bool pos, NFANode ** end)
{
  std::string t = "";
  while (curInd < (int)pattern.size() && pattern[curInd] != ')')
  {
    char ch = pattern[curInd++];
    t += " ";
    if (ch == '\\')
    {
      if (curInd + 1 >= (int)pattern.size())
      {
        raiseError();
        return *end = registerNode(new NFACharNode(' '));
      }
      ch = pattern[curInd++];
    }
    t[t.size() - 1] = ch;
  }
  if (curInd >= (int)pattern.size() || pattern[curInd] != ')') raiseError();
  else ++curInd;
  return *end = registerNode(new NFALookBehindNode(t, pos));
}
NFANode * Pattern::parseQuote()
{
  bool done = 0;
  std::string s = "";
 
  while (!done)
  {
    if      (curInd >= (int)pattern.size())
    {
      raiseError();
      done = 1;
    }
    else if (pattern.substr(curInd, 2) == "\\E")
    {
      curInd += 2;
      done = 1;
    }
    else if (pattern[curInd] == '\\')
    {
      s += " ";
      s[s.size() - 1] = pattern[++curInd];
      ++curInd;
    }
    else
    {
      s += " ";
      s[s.size() - 1] = pattern[curInd++];
    }
  }
  if ((flags & Pattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteNode(s));
  return registerNode(new NFAQuoteNode(s));
}
NFANode * Pattern::parse(const bool inParen, const bool inOr, NFANode ** end, const int orGroup, const bool inOrCap)
{
  NFANode * start, * cur, * next = NULL;
  std::string t;
  int grc = groupCount++;
  bool inv, quo;
  bool ahead = false, pos = false, noncap = false, indep = false;
  unsigned long oldFlags = flags;
 
  if (inParen)
  {
    if (inOr)
    {
      --groupCount;
      noncap = !inOrCap;
      grc = orGroup;
      if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc));
      else        cur = start = registerNode(new NFASubStartNode);
    }
    else if (pattern[curInd] == '?')
    {
      ++curInd;
      --groupCount;
      if      (pattern[curInd]           == ':')  { noncap = 1; ++curInd;     grc = --nonCapGroupCount; }
      else if (pattern[curInd]           == '=')  { ++curInd;     ahead = 1;  pos = 1;                  }
      else if (pattern[curInd]           == '!')  { ++curInd;     ahead = 1;  pos = 0;                  }
      else if (pattern.substr(curInd, 2) == "<=") { curInd += 2;  return parseBehind(1, end);           }
      else if (pattern.substr(curInd, 2) == "<!") { curInd += 2;  return parseBehind(0, end);           }
      else if (pattern[curInd]           == '>')  { ++curInd;     indep = 1;                            }
      else
      {
        bool negate = false, done = false;
        while (!done)
        {
          if (curInd >= (int)pattern.size())
          {
            raiseError();
            return NULL;
          }
          else if (negate)
          {
            switch (pattern[curInd])
            {
            case 'i': flags &= ~Pattern::CASE_INSENSITIVE;    break;
            case 'd': flags &= ~Pattern::UNIX_LINE_MODE;      break;
            case 'm': flags &= ~Pattern::MULTILINE_MATCHING;  break;
            case 's': flags &= ~Pattern::DOT_MATCHES_ALL;     break;
            case ':': done = true;                            break;
            case ')':
              ++curInd;
              *end = registerNode(new NFALookBehindNode("", true));
              return *end;
            case '-':
            default: raiseError(); return NULL;
            }
          }
          else
          {
            switch (pattern[curInd])
            {
            case 'i': flags |= Pattern::CASE_INSENSITIVE;     break;
            case 'd': flags |= Pattern::UNIX_LINE_MODE;       break;
            case 'm': flags |= Pattern::MULTILINE_MATCHING;   break;
            case 's': flags |= Pattern::DOT_MATCHES_ALL;      break;
            case ':': done = true;                            break;
            case '-': negate = true;                          break;
            case ')':
              ++curInd;
              *end = registerNode(new NFALookBehindNode("", true));
              return *end;
            default:  raiseError(); return NULL;
            }
          }
          ++curInd;
        }
        noncap = 1;
        grc = --nonCapGroupCount;
      }
      if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc));
      else        cur = start = registerNode(new NFASubStartNode);
    }
    else cur = start = registerNode(new NFAGroupHeadNode(grc));
  }
  else cur = start = registerNode(new NFASubStartNode);
  while (curInd < (int)pattern.size())
  {
    char ch = pattern[curInd++];
 
    next = NULL;
    if (error) return NULL;
    switch (ch)
    {
    case '^':
      if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineNode);
      else                                            next = registerNode(new NFAStartOfInputNode);
      break;
    case '$':
      if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineNode);
      else                                            next = registerNode(new NFAEndOfInputNode(0));
      break;
    case '|':
      cur->next = registerNode(new NFAAcceptNode);
      cur = start = registerNode(new NFAOrNode(start, parse(inParen, true, NULL, grc, !noncap)));
      break;
    case '\\':
      if      (curInd < (int)pattern.size())
      {
        bool eoi = 0;
        switch (pattern[curInd])
        {
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': next = parseBackref(); break;
        case 'A': ++curInd; next = registerNode(new NFAStartOfInputNode);     break;
        case 'B': ++curInd; next = registerNode(new NFAWordBoundaryNode(0));  break;
        case 'b': ++curInd; next = registerNode(new NFAWordBoundaryNode(1));  break;
        case 'G': ++curInd; next = registerNode(new NFAEndOfMatchNode);       break;
        case 'Z': eoi = 1;
        case 'z': ++curInd; next = registerNode(new NFAEndOfInputNode(eoi));  break;
        default:
          t = parseEscape(inv, quo);
          if (!quo)
          {
            if (t.size() > 1 || inv)
            {
              if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassNode(t, inv));
              else                                          next = registerNode(new NFAClassNode(t, inv));
            }
            else
            {
              next = registerNode(new NFACharNode(t[0]));
            }
          }
          else
          {
            next = parseQuote();
          }
        }
      }
      else raiseError();
      break;
    case '[':
      if ((flags & Pattern::CASE_INSENSITIVE) == 0)
      {
        NFAClassNode * clazz = new NFAClassNode();
        std::string s = parseClass();
        for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1;
        next = registerNode(clazz);
      }
      else
      {
        NFACIClassNode * clazz = new NFACIClassNode();
        std::string s = parseClass();
        for (int i = 0; i < (int)s.size(); ++i) clazz->vals[tolower(s[i])] = 1;
        next = registerNode(clazz);
      }
      break;
    case '.':
      {
        bool useN = 1, useR = 1;
        NFAClassNode * clazz = new NFAClassNode(1);
        if ((flags & Pattern::UNIX_LINE_MODE)  != 0) useR = 0;
        if ((flags & Pattern::DOT_MATCHES_ALL) != 0) useN = useR = 0;
        if (useN) clazz->vals['\n'] = 1;
        if (useR) clazz->vals['\r'] = 1;
        next = registerNode(clazz);
      }
      break;
    case '(':
      {
        NFANode * end, * t1, * t2;
        t1 = parse(true, false, &end);
        if (!t1) raiseError();
        else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL)
        {
          cur->next = t2;
          cur = t2->next;
        }
        else
        {
          cur->next = t1;
          cur = end;
        }
      }
      break;
    case ')':
      if (!inParen) raiseError();
      else if (inOr)
      {
        --curInd;
        cur = cur->next = registerNode(new NFAAcceptNode);
        flags = oldFlags;
        return start;
      }
      else
      {
        if (ahead)
        {
          cur = cur->next = registerNode(new NFAAcceptNode);
          flags = oldFlags;
          return *end = registerNode(new NFALookAheadNode(start, pos));
        }
        else if (indep)
        {
          cur = cur->next = registerNode(new NFAAcceptNode);
          flags = oldFlags;
          return *end = registerNode(new NFAPossessiveQuantifierNode(this, start, 1, 1));
        }
        else // capping or noncapping, it doesnt matter
        {
          *end = cur = cur->next = registerNode(new NFAGroupTailNode(grc));
          next = quantifyGroup(start, *end, grc);
          if (next)
          {
            start = next;
            *end = next->next;
          }
          flags = oldFlags;
          return start;
        }
      }
      break;
    case '{': // registered pattern
      cur->next = parseRegisteredPattern(&next);
      if (cur->next) cur = next;
      break;
    case '*':
    case '+':
    case '?':
    case '}':
    case ']':
      raiseError();
      break;
    default:
      if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharNode(ch));
      else                                          next = registerNode(new NFACharNode(ch));
      break;
    }
    NFAOrNode * orNode = dynamic_cast<NFAOrNode *>(cur);
    if (orNode) // an orNode will never have a "next" set. thus we don't have to worry about problems with
    {           // quantifying an or node.
      NFANode * one = orNode->one, * two = orNode->two;
      while (one->next) one = one->next;
      while (two->next) two = two->next;
      one->next = two->next = cur = registerNode(new NFAAcceptNode);
    }
    if (next)
    {
      cur = cur->next = quantify(next);
    }
  }
  if (inParen) raiseError();
  else
  {
    if (inOr) cur = cur->next = registerNode(new NFAAcceptNode);
    if (end) *end = cur;
  }
 
  flags = oldFlags;
  if (error) return NULL;
 
  return start;
}
 
Pattern * Pattern::compile(const std::string & pattern, const unsigned long mode)
{
  Pattern * p = new Pattern(pattern);
  NFANode * end;
 
  p->flags = mode;
  if ((mode & Pattern::LITERAL) != 0)
  {
    p->head = p->registerNode(new NFAStartNode);
    if ((mode & Pattern::CASE_INSENSITIVE) != 0)  p->head->next = p->registerNode(new NFACIQuoteNode(pattern));
    else                                          p->head->next = p->registerNode(new NFAQuoteNode(pattern));
    p->head->next->next = p->registerNode(new NFAEndNode);
  }
  else
  {
    p->head = p->parse(0, 0, &end);
    if (!p->head)
    {
      delete p;
      p = NULL;
    }
    else
    {
      if (!(p->head && p->head->isStartOfInputNode()))
      {
        NFANode * n = p->registerNode(new NFAStartNode);
        n->next = p->head;
        p->head = n;
      }
      end->next = p->registerNode(new NFAEndNode);
    }
  }
  if (p != NULL)
  {
    p->matcher = new Matcher(p, "");
  }
 
  return p;
}
 
Pattern * Pattern::compileAndKeep(const std::string & pattern, const unsigned long mode)
{
  Pattern * ret = NULL;
  std::map<std::string, Pattern*>::iterator it = compiledPatterns.find(pattern);
 
  if (it != compiledPatterns.end())
  {
    ret = it->second;
  }
  else
  {
    ret = compile(pattern, mode);
    compiledPatterns[pattern] = ret;
  }
 
  return ret;
}
std::string Pattern::replace(const std::string & pattern, const std::string & str,
                             const std::string & replacementText, const unsigned long mode)
{
  std::string ret;
  Pattern * p = Pattern::compile(pattern, mode);
  if (p)
  {
    ret = p->replace(str, replacementText);
    delete p;
  }
  return ret;
}
 
std::vector<std::string> Pattern::split(const std::string & pattern, const std::string & str, const bool keepEmptys,
                              const unsigned long limit, const unsigned long mode)
{
  std::vector<std::string> ret;
  Pattern * p = Pattern::compile(pattern, mode);
  if (p)
  {
    ret = p->split(str, keepEmptys, limit);
    delete p;
  }
  return ret;
}
 
std::vector<std::string> Pattern::findAll(const std::string & pattern, const std::string & str, const unsigned long mode)
{
  std::vector<std::string> ret;
  Pattern * p = Pattern::compile(pattern, mode);
  if (p)
  {
    ret = p->findAll(str);
    delete p;
  }
  return ret;
}
 
bool Pattern::matches(const std::string & pattern, const std::string & str, const unsigned long mode)
{
  bool ret = 0;
  Pattern * p = compile(pattern, mode);
 
  if (p)
  {
    ret = p->matches(str);
    delete p;
  }
 
  return ret;
}
 
bool Pattern::registerPattern(const std::string & name, const std::string & pattern, const unsigned long mode)
{
  Pattern * p = Pattern::compile(pattern, mode);
  if (!p) return 0;
  Pattern::registeredPatterns[name] = std::make_pair(pattern, mode);
  delete p;
  return 1;
}
 
void Pattern::unregisterPatterns()
{
  registeredPatterns.clear();
}
void Pattern::clearPatternCache()
{
  std::map<std::string, Pattern*>::iterator it;
  for (it = compiledPatterns.begin(); it != compiledPatterns.end(); ++it)
  {
    delete it->second;
  }
  compiledPatterns.clear();
}
 
std::pair<std::string, int>  Pattern::findNthMatch(const std::string & pattern, const std::string & str,
                                         const int matchNum, const unsigned long mode)
{
  std::pair<std::string, int> ret;
  Pattern * p = Pattern::compile(pattern, mode);
 
  ret.second = -1;
  if (p)
  {
    int i = -1;
    p->matcher->setString(str);
    while (i < matchNum && p->matcher->findNextMatch()) { ++i; }
    if (i == matchNum && p->matcher->getStartingIndex() >= 0)
    {
      ret.first = p->matcher->getGroup(0);
      ret.second = p->matcher->getStartingIndex();
    }
    delete p;
  }
 
  return ret;
}
 
Pattern::~Pattern()
{
  if (matcher) delete matcher;
  for (std::map<NFANode*, bool>::iterator it = nodes.begin(); it != nodes.end(); ++it)
  {
    delete it->first;
  }
}
std::string Pattern::replace(const std::string & str, const std::string & replacementText)
{
  int li = 0;
  std::string ret = "";
 
  matcher->setString(str);
  while (matcher->findNextMatch())
  {
    ret += str.substr(li, matcher->getStartingIndex() - li);
    ret += matcher->replaceWithGroups(replacementText);
    li = matcher->getEndingIndex();
  }
  ret += str.substr(li);
 
  return ret;
}
std::vector<std::string> Pattern::split(const std::string & str, const bool keepEmptys, const unsigned long limit)
{
  unsigned long lim = (limit == 0 ? MAX_QMATCH : limit);
  int li = 0;
  std::vector<std::string> ret;
 
  matcher->setString(str);
 
  while (matcher->findNextMatch() && ret.size() < lim)
  {
    if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back("");
    if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys)
    {
      if (li != matcher->getStartingIndex() || keepEmptys)
      {
        ret.push_back(str.substr(li, matcher->getStartingIndex() - li));
      }
      li = matcher->getEndingIndex();
    }
  }
  if (li < (int)str.size()) ret.push_back(str.substr(li));
 
  return ret;
}
std::vector<std::string> Pattern::findAll(const std::string & str)
{
  matcher->setString(str);
  return matcher->findAll();
}
bool Pattern::matches(const std::string & str)
{
  matcher->setString(str);
  return matcher->matches();
}
unsigned long Pattern::getFlags() const
{
  return flags;
}
std::string Pattern::getPattern() const
{
  return pattern;
}
Matcher * Pattern::createMatcher(const std::string & str)
{
  return new Matcher(this, str);
}
void Pattern::print()
{
  printf("Pattern(%s):\n", pattern.c_str());
  if (head) head->print(1);
}
 
 
 
 
 
 
 
static char * getPrintChar(const char ch)
{
  static char buf[1024];
  if      (ch == '0')   strcpy(buf, "\\0");
  else if (ch == '\r'strcpy(buf, "\\r");
  else if (ch == '\t'strcpy(buf, "\\t");
  else if (ch == '\n'strcpy(buf, "\\n");
  else if (ch == 0x0C)  strcpy(buf, "\\p");
  else if (ch < ' ' || ch >= 0x80) sprintf(buf, "0x%02X", ch);
  else                  sprintf(buf, "%c", ch);
  return buf;
}
static void appendRawToClass(const int ch, char * const buf, int & curInd)
{
  switch (ch)
  {
  case '(':
  case ')':
  case '\\':
  case '-':
  case '[':
  case ']':
    buf[curInd++] = '\\';
  default:;
    buf[curInd++] = (char)ch;
  }
}
static void appendToClass0(const int j, int & start, const bool raw, const bool allowRanges, char * const buf, int & curInd)
{
  if (start != -1)
  {
    if (start + 1 == j)
    {
      if (raw)
      {
        appendRawToClass(start, buf, curInd);
      }
      else
      {
        sprintf(buf + curInd, "\\x%02X", start);
        curInd += 4;
      }
    }
    else if (start + 2 == j)
    {
      if (raw)
      {
        appendRawToClass(start, buf, curInd);
        appendRawToClass(start + 1, buf, curInd);
      }
      else
      {
        sprintf(buf + curInd, "\\x%02X\\x%02X", start, start + 1);
        curInd += 8;
      }
    }
    else
    {
      if (allowRanges)
      {
        if (raw)
        {
          appendRawToClass(start, buf, curInd);
          buf[curInd++] = '-';
          appendRawToClass(j - 1, buf, curInd);
        }
        else
        {
          sprintf(buf + curInd, "\\x%02X-\\x%02X", start, j - 1);
          curInd += 9;
        }
      }
      else
      {
        for (int k = start; k < j; ++k)
        {
          appendRawToClass(k, buf, curInd);
        }
      }
    }
  }
  start = -1;
}
static void appendToClass(const std::map<char, bool> & chars, const int * const range, const bool raw, const bool allowRanges, char * const buf, int & curInd)
{
  int start = -1, j;
  for (j = range[0]; j < range[1]; ++j)
  {
    if (chars.find((char)j) != chars.end())
    {
      if (start == -1) start = j;
    }
    else
    {
      appendToClass0(j, start, raw, allowRanges, buf, curInd);
    }
  }
  appendToClass0(j, start, raw, allowRanges, buf, curInd);
}
static char * getClassDesc(const std::map<char, bool> & chars, const bool inv, const bool firstPass = true)
{
  static char buf[1024];
  int curInd = 0;
  bool ok[256] = { false };
 
  buf[curInd++] = '[';
  if (inv)
  {
    buf[curInd++] = '^';
  }
  // check for lower case letters
  if (chars.find(' ')  != chars.end()) { buf[curInd++] = ' '; }
  if (chars.find(0x00) != chars.end()) { strcpy(buf + curInd, "\\0"); curInd += 2; }
  if (chars.find('\t') != chars.end()) { strcpy(buf + curInd, "\\t"); curInd += 2; }
  if (chars.find('\r') != chars.end()) { strcpy(buf + curInd, "\\r"); curInd += 2; }
  if (chars.find('\n') != chars.end()) { strcpy(buf + curInd, "\\n"); curInd += 2; }
  if (chars.find(0x0c) != chars.end()) { strcpy(buf + curInd, "\\p"); curInd += 2; }
  const int HEX_RANGES[][2] =
  {
    { 0x01, 0x09 },
    { 0x0B, 0x0B },
    { 0x0E, 0x20 },
    { 0x80, 0x100 },
  };
  const int RAW_RANGES[][2] =
  {
    { 0x20,     '0'     },
    { '0',      '9' + 1 },
    { '9' + 1,  'A'     },
    { 'A',      'Z' + 1 },
    { 'Z' + 1,  'a'     },
    { 'a',      'z' + 1 },
    { 'z' + 1,  0x80    },
  };
  const bool ALLOW_RANGES[] =
  {
    false,
    true,
    false,
    true,
    false,
    true,
    false,
  };
  const int NUM_HEX = sizeof(HEX_RANGES) / sizeof(HEX_RANGES[0]);
  const int NUM_RAW = sizeof(RAW_RANGES) / sizeof(RAW_RANGES[0]);
 
  for (int i = 0; i < NUM_HEX; ++i)
  {
    appendToClass(chars, HEX_RANGES[i], false, true, buf, curInd);
  }
  for (int i = 0; i < NUM_RAW; ++i)
  {
    appendToClass(chars, RAW_RANGES[i], true, ALLOW_RANGES[i], buf, curInd);
  }
  buf[curInd++] = ']';
  buf[curInd++] = 0;
  if (firstPass)
  {
    char newBuf[1024];
    std::map<char, bool> invChars;
    for (int i = 0; i < 256; ++i)
    {
      if (chars.find((char)i) == chars.end()) invChars[(char)i] = true;
    }
    strcpy(newBuf, buf);
    getClassDesc(invChars, !inv, false);
    if (strlen(buf) > strlen(newBuf)) strcpy(buf, newBuf);
  }
  return buf;
}
 
// NFANode
 
NFANode::NFANode() { next = NULL; }
NFANode::~NFANode() { }
void NFANode::findAllNodes(std::map<NFANode*, bool> & soFar)
{
  if (soFar.find(this) == soFar.end()) return;
  soFar[this] = 1;
  if (next) next->findAllNodes(soFar);
}
void NFANode::print(const int indent)
{
  printf("%*c%s\n", indent * 2, ' ', typeid(*this).name());
  if (next) next->print(indent);
}
 
// NFACharNode
 
NFACharNode::NFACharNode(const char c) { ch = c; }
int NFACharNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1, depth + 1);
  return -1;
}
void NFACharNode::print(const int indent)
{
  printf("%*c%s(0x%02X)\n", indent * 2, ' ', typeid(*this).name(), ch);
  if (next) next->print(indent);
}
 
// NFACICharNode
 
NFACICharNode::NFACICharNode(const char c) { ch = tolower(c); }
int NFACICharNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd < (int)str.size() && tolower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1, depth + 1);
  return -1;
}
void NFACICharNode::print(const int indent)
{
  printf("%*c%s(0x%02X)\n", indent * 2, ' ', typeid(*this).name(), ch);
  if (next) next->print(indent);
}
 
// NFAStartNode
 
NFAStartNode::NFAStartNode() { }
int NFAStartNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ret = -1, ci = curInd;
 
  matcher->starts[0] = curInd;
  if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) == (unsigned int)Matcher::MATCH_ENTIRE_STRING)
  {
    if (curInd != 0)
    {
      matcher->starts[0] = -1;
      return -1;
    }
    return next->match(str, matcher, 0, depth + 1);
  }
  while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size())
  {
    matcher->clearGroups();
    matcher->starts[0] = ++ci;
  }
  if (ret < 0) matcher->starts[0] = -1;
  return ret;
}
 
// NFAEndNode
 
NFAEndNode::NFAEndNode() { }
int NFAEndNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  matcher->ends[0] = curInd;
  if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) != 0)
  {
    if (curInd == (int)str.size()) return curInd;
    matcher->ends[0] = -1;
    return -1;
  }
  return curInd;
}
 
// NFAQuantifierNode
 
void NFAQuantifierNode::findAllNodes(std::map<NFANode*, bool> & soFar)
{
  inner->findAllNodes(soFar);
  NFANode::findAllNodes(soFar);
}
NFAQuantifierNode::NFAQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
{
  inner = internal;
  inner->next = pat->registerNode(new NFAAcceptNode);
  min = (minMatch < Pattern::MIN_QMATCH) ? Pattern::MIN_QMATCH : minMatch;
  max = (maxMatch > Pattern::MAX_QMATCH) ? Pattern::MAX_QMATCH : maxMatch;
}
 
int NFAQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int i0, i1, i2 = 0;
 
  i0 = i1 = curInd;
  while (i2 < min)
  {
    ++i2;
    i1 = inner->match(str, matcher, i0, depth + 1);
    if (i1 <= i0) return i1; // i1 < i0 means i1 is -1
    i0 = i1;
  }
 
  return i1;
}
void NFAQuantifierNode::print(const int indent)
{
  printf("%*c%s(min=%d, max=%d)\n", indent * 2, ' ', typeid(*this).name(), min, max);
  printf("%*cinner\n", (indent + 1) * 2, ' ');
  if (inner) inner->print(indent + 2);
  if (next) next->print(indent);
}
 
// NFAGreedyQuantifierNode
NFAGreedyQuantifierNode::NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
                        : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
int NFAGreedyQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int t = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);
  if (t != -1) return matchInternal(str, matcher, t, min, depth + 1);
  return t;
}
int NFAGreedyQuantifierNode::matchInternal(const std::string & str, Matcher * matcher, const int curInd, const int soFar, const int depth) const
{
  if (soFar >= max) return next->match(str, matcher, curInd, depth + 1);
 
  int i, j;
 
  i = inner->match(str, matcher, curInd, depth + 1);
  if (i != -1)
  {
    j = matchInternal(str, matcher, i, soFar + 1, depth + 1);
    if (j != -1) return j;
  }
  return next->match(str, matcher, curInd, depth + 1);
}
 
// NFALazyQuantifierNode
 
NFALazyQuantifierNode::NFALazyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
                      : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
int NFALazyQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int i, j, m = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);
 
  if (m == -1) return -1;
 
  for (i = min; i < max; ++i)
  {
    j = next->match(str, matcher, m, depth + 1);
    if (j == -1)
    {
      j = inner->match(str, matcher, m, depth + 1);
      // if j < m, then j is -1, so we bail.
      // if j == m, then we would just go and call next->match on the same index,
      // but it already failed trying to match right there, so we know we can
      // just bail
      if (j <= m) return -1;
      m = j;
    }
    else return j;
  }
  return next->match(str, matcher, m, depth + 1);
}
 
// NFAPossessiveQuantifierNode
 
NFAPossessiveQuantifierNode::NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)
                            : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }
int NFAPossessiveQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int i, j, m = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);
 
  if (m == -1) return -1;
  for (i = min; i < max; ++i)
  {
    j = inner->match(str, matcher, m, depth + 1);
    if (j <= m) return next->match(str, matcher, m, depth + 1);
    m = j;
  }
  return next->match(str, matcher, m, depth + 1);
}
 
// NFAAcceptNode
 
NFAAcceptNode::NFAAcceptNode() { }
int NFAAcceptNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (!next) return curInd;
  else return next->match(str, matcher, curInd, depth + 1);
}
 
// NFAClassNode
 
NFAClassNode::NFAClassNode(const bool invert)
{
  inv = invert;
}
NFAClassNode::NFAClassNode(const std::string & clazz, const bool invert)
{
  inv = invert;
  for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1;
}
int NFAClassNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv))
  {
    return next->match(str, matcher, curInd + 1, depth + 1);
  }
  return -1;
}
void NFAClassNode::print(const int indent)
{
  printf("%*c%s(%d vals, inv=%s)\n", indent * 2, ' ', typeid(*this).name(), (int)vals.size(), inv ? "true" : "false");
  if (next) next->print(indent);
}
 
// NFACIClassNode
 
NFACIClassNode::NFACIClassNode(const bool invert)
{
  inv = invert;
}
NFACIClassNode::NFACIClassNode(const std::string & clazz, const bool invert)
{
  inv = invert;
  for (int i = 0; i < (int)clazz.size(); ++i) vals[tolower(clazz[i])] = 1;
}
int NFACIClassNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd < (int)str.size() && ((vals.find(tolower(str[curInd])) != vals.end()) ^ inv))
  {
    return next->match(str, matcher, curInd + 1, depth + 1);
  }
  return -1;
}
void NFACIClassNode::print(const int indent)
{
  if (next) next->print(indent);
}
 
#undef to_lower
 
// NFASubStartNode
 
NFASubStartNode::NFASubStartNode() { }
int NFASubStartNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  return next->match(str, matcher, curInd, depth + 1);
}
 
// NFAOrNode
 
NFAOrNode::NFAOrNode(NFANode * first, NFANode * second) : one(first), two(second) { }
void NFAOrNode::findAllNodes(std::map<NFANode*, bool> & soFar)
{
  if (one) one->findAllNodes(soFar);
  if (two) two->findAllNodes(soFar);
  NFANode::findAllNodes(soFar);
}
int NFAOrNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ci = one->match(str, matcher, curInd, depth + 1);
  if (ci != -1) return ci;
  return two->match(str, matcher, curInd, depth + 1);
/*
  int ci = one->match(str, matcher, curInd, depth + 1);
  if (ci != -1)
  {
    ci = next->match(str, matcher, ci, depth + 1);
    if (ci != -1) return ci;
  }
  ci = two->match(str, matcher, curInd, depth + 1);
  if (ci != -1) ci = next->match(str, matcher, ci, depth + 1);
  return -1;
*/
}
void NFAOrNode::print(const int indent)
{
  printf("%*c%s\n", indent * 2, ' ', typeid(*this).name());
  printf("%*cone\n", (indent + 1) * 2, ' ');
  one->print(indent + 2);
  printf("%*ctwo\n", (indent + 1) * 2, ' ');
  two->print(indent + 2);
}
 
// NFAQuoteNode
 
NFAQuoteNode::NFAQuoteNode(const std::string & quoted) : qStr(quoted) { }
int NFAQuoteNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd + qStr.size() > str.size())       return -1;
  if (str.substr(curInd, qStr.size()) != qStr) return -1;
  return next->match(str, matcher, curInd + qStr.size(), depth + 1);
}
void NFAQuoteNode::print(const int indent)
{
  printf("%*c%s(%s)\n", indent * 2, ' ', typeid(*this).name(), qStr.c_str());
  if (next) next->print(indent);
}
 
// NFACIQuoteNode
 
NFACIQuoteNode::NFACIQuoteNode(const std::string & quoted) : qStr(quoted) { }
int NFACIQuoteNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd + qStr.size() > str.size()) return -1;
  if (str_icmp(str.substr(curInd, qStr.size()).c_str(),  qStr.c_str())) return -1;
  return next->match(str, matcher, qStr.size(), depth + 1);
}
void NFACIQuoteNode::print(const int indent)
{
  printf("%*c%s(%s)\n", indent * 2, ' ', typeid(*this).name(), qStr.c_str());
  if (next) next->print(indent);
}
 
// NFALookAheadNode
 
NFALookAheadNode::NFALookAheadNode(NFANode * internal, const bool positive) : NFANode(), pos(positive), inner(internal) { }
void NFALookAheadNode::findAllNodes(std::map<NFANode*, bool> & soFar)
{
  if (inner) inner->findAllNodes(soFar);
  NFANode::findAllNodes(soFar);
}
int NFALookAheadNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  return ((inner->match(str, matcher, curInd, depth + 1) == -1) ^ pos) ? next->match(str, matcher, curInd, depth + 1) : -1;
}
void NFALookAheadNode::print(const int indent)
{
  printf("%*c%s(pos=%s)\n", indent * 2, ' ', typeid(*this).name(), pos ? "true" : "false");
  inner->print(indent + 1);
  if (next) next->print(indent);
}
 
// NFALookBehindNode
 
NFALookBehindNode::NFALookBehindNode(const std::string & str, const bool positive) : pos(positive), mStr(str) { }
int NFALookBehindNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (pos)
  {
    if (curInd < (int)mStr.size()) return -1;
    if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd, depth + 1);
  }
  else
  {
    if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd, depth + 1);
    if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1;
     return next->match(str, matcher, curInd, depth + 1);
  }
  return -1;
}
void NFALookBehindNode::print(const int indent)
{
  printf("%*c%s(str=%s, pos=%s)\n", indent * 2, ' ', typeid(*this).name(), mStr.c_str(), pos ? "true" : "false");
  if (next) next->print(indent);
}
 
// NFAStartOfLineNode
 
NFAStartOfLineNode::NFAStartOfLineNode() { }
int NFAStartOfLineNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd == 0 || str[curInd - 1] == '\n' || str[curInd - 1] == '\r')
  {
    return next->match(str, matcher, curInd, depth + 1);
  }
  return -1;
}
 
// NFAEndOfLineNode
 
NFAEndOfLineNode::NFAEndOfLineNode() { }
int NFAEndOfLineNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd >= (int)str.size() || str[curInd] == '\n' || str[curInd] == '\r')
  {
    return next->match(str, matcher, curInd, depth + 1);
  }
  return -1;
}
 
// NFAReferenceNode
 
NFAReferenceNode::NFAReferenceNode(const int groupIndex) : gi(groupIndex) { }
int NFAReferenceNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int len = matcher->ends[gi] - matcher->starts[gi];
  int ni = -1;
  if      (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0)   ni = curInd;
  else if (curInd + len > (int)str.size())                                  return -1;
  else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1;
  else                                                                      ni = curInd + len;
 
  return next->match(str, matcher, ni, depth + 1);
}
void NFAReferenceNode::print(const int indent)
{
  printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);
  if (next) next->print(indent);
}
 
// NFAStartOfInputNode
 
NFAStartOfInputNode::NFAStartOfInputNode() { }
int NFAStartOfInputNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd == 0) return next->match(str, matcher, curInd, depth + 1);
  return -1;
}
 
// NFAEndOfInputNode
 
NFAEndOfInputNode::NFAEndOfInputNode(const bool lookForTerm) : term(lookForTerm) { }
int NFAEndOfInputNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int len = (int)str.size();
  if      (curInd == len) return next->match(str, matcher, curInd, depth + 1);
  else if (term)
  {
    if      (curInd == len - 1 && (str[curInd] == '\r' || str[curInd] == '\n'))
    {
      return next->match(str, matcher, curInd, depth + 1);
    }
    else if (curInd == len - 2 && str.substr(curInd, 2) == "\r\n")
    {
      return next->match(str, matcher, curInd, depth + 1);
    }
  }
  return -1;
}
void NFAEndOfInputNode::print(const int indent)
{
  printf("%*c%s(term=%s)\n", indent * 2, ' ', typeid(*this).name(), term ? "true" : "false");
  if (next) next->print(indent);
}
 
// NFAWordBoundaryNode
 
NFAWordBoundaryNode::NFAWordBoundaryNode(const bool positive) : pos(positive) { }
int NFAWordBoundaryNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  #define is_alpha(x) (((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
 
  int len = (int)str.size();
  bool ok = 0;
  char c1 = (curInd - 1 < len) ? str[curInd - 1] : -1;
  char c2 = (curInd     < len) ? str[curInd    ] : -1;
 
  if      (curInd == len) return next->match(str, matcher, curInd, depth + 1);
  if      (is_alpha(c1) ^ is_alpha(c2)) ok = 1;
  if (ok && pos) return next->match(str, matcher, curInd, depth + 1);
  return -1;
 
  #undef is_alpha
}
void NFAWordBoundaryNode::print(const int indent)
{
  printf("%*c%s(pos=%s)\n", indent * 2, ' ', typeid(*this).name(), pos ? "true" : "false");
  if (next) next->print(indent);
}
 
// NFAEndOfMatchNode
 
NFAEndOfMatchNode::NFAEndOfMatchNode() { }
int NFAEndOfMatchNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  if (curInd == matcher->lm) return next->match(str, matcher, curInd, depth + 1);
  return -1;
}
 
// NFAGroupHeadNode
 
NFAGroupHeadNode::NFAGroupHeadNode(const int groupIndex) : gi(groupIndex) { }
int NFAGroupHeadNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ret, o = matcher->starts[gi];
 
  matcher->starts[gi] = curInd;
  ret = next->match(str, matcher, curInd, depth + 1);
  if (ret < 0) matcher->starts[gi] = o;
 
  return ret;
}
void NFAGroupHeadNode::print(const int indent)
{
  printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);
  if (next) next->print(indent);
}
 
// NFAGroupTailNode
 
NFAGroupTailNode::NFAGroupTailNode(const int groupIndex) : gi(groupIndex) { }
int NFAGroupTailNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ret, o = matcher->ends[gi];
  matcher->ends[gi] = curInd;
  ret = next->match(str, matcher, curInd, depth + 1);
  if (ret < 0) matcher->ends[gi] = o;
 
  return ret;
}
void NFAGroupTailNode::print(const int indent)
{
  printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);
  if (next) next->print(indent);
}
 
// NFAGroupLoopPrologueNode
 
NFAGroupLoopPrologueNode::NFAGroupLoopPrologueNode(const int groupIndex) : gi(groupIndex) { }
int NFAGroupLoopPrologueNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi];
 
  matcher->groups[gi] = 0;
  matcher->groupPos[gi] = 0;
  matcher->groupIndeces[gi] = -1;
  ret = next->match(str, matcher, curInd, depth + 1);
  if (ret < 0)
  {
    matcher->groups[gi] = o1;
    matcher->groupPos[gi] = o2;
    matcher->groupIndeces[gi] = o3;
  }
 
  return ret;
}
void NFAGroupLoopPrologueNode::print(const int indent)
{
  printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);
  if (next) next->print(indent);
}
 
// NFAGroupLoopNode
 
NFAGroupLoopNode::NFAGroupLoopNode(NFANode * internal, const int minMatch, const int maxMatch,
                                   const int groupIndex, const int matchType)
{
  inner = internal;
  min = minMatch;
  max = maxMatch;
  gi = groupIndex;
  type = matchType;
}
void NFAGroupLoopNode::findAllNodes(std::map<NFANode*, bool> & soFar)
{
  if (inner) inner->findAllNodes(soFar);
  NFANode::findAllNodes(soFar);
}
int NFAGroupLoopNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  bool b = (curInd > matcher->groupIndeces[gi]);
 
  if (b && matcher->groups[gi] < min)
  {
    ++matcher->groups[gi];
    int o = matcher->groupIndeces[gi];
    matcher->groupIndeces[gi] = curInd;
    int ret = inner->match(str, matcher, curInd, depth + 1);
    if (ret < 0)
    {
      matcher->groupIndeces[gi] = o;
      --matcher->groups[gi];
    }
    return ret;
  }
  else if (!b || matcher->groups[gi] >= max)
  {
    return next->match(str, matcher, curInd, depth + 1);
  }
  else
  {
    switch (type)
    {
    case 0: return matchGreedy(str, matcher, curInd, depth + 1);
    case 1: return matchLazy(str, matcher, curInd, depth + 1);
    case 2: return matchPossessive(str, matcher, curInd, depth + 1);
    }
  }
  return -1;
}
int NFAGroupLoopNode::matchGreedy(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int o = matcher->groupIndeces[gi];            // save our info for backtracking
  matcher->groupIndeces[gi] = curInd;           // move along
  ++matcher->groups[gi];
  int ret = inner->match(str, matcher, curInd, depth + 1); // match internally
  if (ret < 0)
  {                                             // if we failed, then restore info and match next
    --matcher->groups[gi];
    matcher->groupIndeces[gi] = o;
    ret = next->match(str, matcher, curInd, depth + 1);
  }
  return ret;
}
int NFAGroupLoopNode::matchLazy(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int ret = next->match(str, matcher, curInd, depth + 1);  // be lazy, just go on
  if (ret < 0)
  {
    int o = matcher->groupIndeces[gi];          // save info for backtracking
    matcher->groupIndeces[gi] = curInd;         // advance our position
    ++matcher->groups[gi];
    ret = inner->match(str, matcher, curInd, depth + 1);   // match our internal stuff
    if (ret < 0)                                // if we failed, then restore the info
    {
      --matcher->groups[gi];
      matcher->groupIndeces[gi] = o;
    }
  }
  return ret;
}
int NFAGroupLoopNode::matchPossessive(const std::string & str, Matcher * matcher, const int curInd, const int depth) const
{
  int o = matcher->groupIndeces[gi];            // save info for backtracking
  matcher->groupPos[gi] = matcher->groups[gi];  // set a flag stating we have matcher at least this much
  matcher->groupIndeces[gi] = curInd;           // move along
  ++matcher->groups[gi];
  int ret = inner->match(str, matcher, curInd, depth + 1); // try and match again
  if (ret < 0)
  {                                             // if we fail, back off, but to an extent
    --matcher->groups[gi];
    matcher->groupIndeces[gi] = o;
    if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd, depth + 1);
  }
  return ret;
}
void NFAGroupLoopNode::print(const int indent)
{
  printf("%*c%s(gi=%d, min=%d, max=%d, type=%s)\n", indent * 2, ' ', typeid(*this).name(), gi, min, max, type == 0 ? "greedy" : type == 1 ? "reluctant" : "possessive");
  if (next) next->print(indent);
}
 
#ifdef _WIN32
  #pragma warning(pop)
#endif

Archive Download this file

Branches

Tags

Page rendered in 1.17176s using 11 queries.