| /**␍␊ |
| From the author (Jeff Stuart)␍␊ |
| "␍␊ |
| Let me start by saying this file is pretty big. If you feel up to it, you can␍␊ |
| try making changes yourself, but you would be better off to just email me at␍␊ |
| stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you␍␊ |
| would like added. This project is very "near and dear" to me, so I am fairly␍␊ |
| quick to make bug fixes. The header files for Pattern and Matcher are fairly␍␊ |
| well documented and the function names are pretty self-explanatory, but if you␍␊ |
| are having any trouble, feel free to email me at stuart@cs.ucdavis.edu.␍␊ |
| ␍␊ |
| If you email me, make sure you put something like C++RE in the subject because␍␊ |
| I tend to delete email if I don't recognize the name and the subject is␍␊ |
| something like "I Need Your Help" or "Got A Second" or "I Found It".␍␊ |
| "␍␊ |
| */␍␊ |
| ␍␊ |
| /*␍␊ |
| Detailed documentation is provided in this class' header file␍␊ |
| ␍␊ |
| @author Jeffery Stuart␍␊ |
| @since November 24th, 2007␍␊ |
| @version 1.09.00␍␊ |
| */␍␊ |
| ␍␊ |
| #ifdef _WIN32␍␊ |
| #pragma warning(push)␍␊ |
| #pragma warning(disable:4996)␍␊ |
| #define str_icmp stricmp␍␊ |
| #else␍␊ |
| #define str_icmp strcasecmp␍␊ |
| #endif␍␊ |
| ␍␊ |
| #include "regexp/Pattern.h"␍␊ |
| #include "regexp/Matcher.h"␍␊ |
| #include <cstdio>␍␊ |
| #include <cstring>␍␊ |
| #include <typeinfo>␍␊ |
| #include <algorithm>␍␊ |
| ␍␊ |
| std::map<std::string, Pattern *> Pattern::compiledPatterns;␍␊ |
| std::map<std::string, std::pair<std::string, unsigned long> > Pattern::registeredPatterns;␍␊ |
| ␍␊ |
| const int Pattern::MIN_QMATCH = 0x00000000;␍␊ |
| const int Pattern::MAX_QMATCH = 0x7FFFFFFF;␍␊ |
| ␍␊ |
| const unsigned long Pattern::CASE_INSENSITIVE = 0x01;␍␊ |
| const unsigned long Pattern::LITERAL = 0x02;␍␊ |
| const unsigned long Pattern::DOT_MATCHES_ALL = 0x04;␍␊ |
| const unsigned long Pattern::MULTILINE_MATCHING = 0x08;␍␊ |
| const unsigned long Pattern::UNIX_LINE_MODE = 0x10;␍␊ |
| ␍␊ |
| Pattern::Pattern(const std::string & rhs)␍␊ |
| {␍␊ |
| matcher = NULL;␍␊ |
| pattern = rhs;␍␊ |
| curInd = 0;␍␊ |
| groupCount = 0;␍␊ |
| nonCapGroupCount = 0;␍␊ |
| error = 0;␍␊ |
| head = NULL;␍␊ |
| }␍␊ |
| // convenience function in case we want to add any extra debugging output␍␊ |
| void Pattern::raiseError()␍␊ |
| {␍␊ |
| switch (pattern[curInd - 1])␍␊ |
| {␍␊ |
| case '*':␍␊ |
| case ')':␍␊ |
| case '+':␍␊ |
| case '?':␍␊ |
| case ']':␍␊ |
| case '}':␍␊ |
| fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');␍␊ |
| fprintf(stderr, "Syntax Error near here. Possible unescaped meta character.\n");␍␊ |
| break;␍␊ |
| default:␍␊ |
| fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');␍␊ |
| fprintf(stderr, "Syntax Error near here. \n");␍␊ |
| break;␍␊ |
| }␍␊ |
| error = 1;␍␊ |
| }␍␊ |
| NFANode * Pattern::registerNode(NFANode * node)␍␊ |
| {␍␊ |
| nodes[node] = 1;␍␊ |
| return node;␍␊ |
| }␍␊ |
| ␍␊ |
| std::string Pattern::classUnion (std::string s1, std::string s2) const␍␊ |
| {␍␊ |
| char out[300];␍␊ |
| std::sort(s1.begin(), s1.end());␍␊ |
| std::sort(s2.begin(), s2.end());␍␊ |
| *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;␍␊ |
| return out;␍␊ |
| }␍␊ |
| std::string Pattern::classIntersect (std::string s1, std::string s2) const␍␊ |
| {␍␊ |
| char out[300];␍␊ |
| std::sort(s1.begin(), s1.end());␍␊ |
| std::sort(s2.begin(), s2.end());␍␊ |
| *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;␍␊ |
| return out;␍␊ |
| }␍␊ |
| std::string Pattern::classNegate (std::string s1) const␍␊ |
| {␍␊ |
| char out[300];␍␊ |
| int i, ind = 0;␍␊ |
| std::map<char, bool> m;␍␊ |
| ␍␊ |
| for (i = 0; i < (int)s1.size(); ++i)␍␊ |
| {␍␊ |
| m[s1[i]] = 1;␍␊ |
| if (flags & CASE_INSENSITIVE)␍␊ |
| {␍␊ |
| if (s1[i] >= 'a' && s1[i] <= 'z') m[s1[i] - 'a' + 'A'] = 1;␍␊ |
| else if (s1[i] >= 'A' && s1[i] <= 'Z') m[s1[i] - 'A' + 'a'] = 1;␍␊ |
| }␍␊ |
| }␍␊ |
| for (i = 0xFF; i >= 0; --i) if (m.find((char)i) == m.end()) out[ind++] = (char)i;␍␊ |
| out[ind] = 0;␍␊ |
| return std::string(out, ind);␍␊ |
| }␍␊ |
| std::string Pattern::classCreateRange(char low, char hi) const␍␊ |
| {␍␊ |
| char out[300];␍␊ |
| int ind = 0;␍␊ |
| while (low != hi) out[ind++] = low++;␍␊ |
| out[ind++] = low;␍␊ |
| return std::string(out, ind);␍␊ |
| }␍␊ |
| ␍␊ |
| int Pattern::getInt(int start, int end)␍␊ |
| {␍␊ |
| int ret = 0;␍␊ |
| for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - '0');␍␊ |
| return ret;␍␊ |
| }␍␊ |
| bool Pattern::quantifyCurly(int & sNum, int & eNum)␍␊ |
| {␍␊ |
| bool good = 1;␍␊ |
| int i, ci = curInd + 1;␍␊ |
| int commaInd = ci, endInd = ci, len = pattern.size();␍␊ |
| sNum = eNum = 0;␍␊ |
| ␍␊ |
| while (endInd < len && pattern[endInd ] != '}') ++endInd;␍␊ |
| while (commaInd < endInd && pattern[commaInd] != ',') ++commaInd;␍␊ |
| if (endInd >= len) { raiseError(); return 0; }␍␊ |
| for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0;␍␊ |
| if (!good && commaInd < endInd) { raiseError(); return 0; }␍␊ |
| if (!good) return 0;␍␊ |
| /* so now everything in here is either a comma (and there is at most one comma) or a digit */␍␊ |
| if (commaInd == ci) // {,*}␍␊ |
| {␍␊ |
| if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = *␍␊ |
| else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+}␍␊ |
| }␍␊ |
| else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,}␍␊ |
| else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+}␍␊ |
| else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+}␍␊ |
| curInd = endInd + 1;␍␊ |
| return 1;␍␊ |
| }␍␊ |
| NFANode * Pattern::quantifyGroup(NFANode * start, NFANode * stop, const int gn)␍␊ |
| {␍␊ |
| NFANode * newNode = NULL;␍␊ |
| int type = 0;␍␊ |
| ␍␊ |
| if (curInd < (int)pattern.size())␍␊ |
| {␍␊ |
| char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];␍␊ |
| switch (pattern[curInd])␍␊ |
| {␍␊ |
| case '*':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; type = 1; break;␍␊ |
| case '+': ++curInd; type = 2; break;␍␊ |
| }␍␊ |
| newNode = registerNode(new NFAGroupLoopPrologueNode(gn));␍␊ |
| newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));␍␊ |
| stop->next = newNode->next;␍␊ |
| return newNode;␍␊ |
| case '?':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; type = 1; break;␍␊ |
| case '+': ++curInd; type = 2; break;␍␊ |
| }␍␊ |
| newNode = registerNode(new NFAGroupLoopPrologueNode(gn));␍␊ |
| newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, 1, gn, type));␍␊ |
| stop->next = newNode->next;␍␊ |
| return newNode;␍␊ |
| case '+':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; type = 1; break;␍␊ |
| case '+': ++curInd; type = 2; break;␍␊ |
| }␍␊ |
| newNode = registerNode(new NFAGroupLoopPrologueNode(gn));␍␊ |
| newNode->next = registerNode(new NFAGroupLoopNode(start, 1, MAX_QMATCH, gn, type));␍␊ |
| stop->next = newNode->next;␍␊ |
| return newNode;␍␊ |
| case '{':␍␊ |
| {␍␊ |
| int s, e;␍␊ |
| if (quantifyCurly(s, e))␍␊ |
| {␍␊ |
| ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; type = 1; break;␍␊ |
| case '+': ++curInd; type = 2; break;␍␊ |
| }␍␊ |
| newNode = registerNode(new NFAGroupLoopPrologueNode(gn));␍␊ |
| newNode->next = registerNode(new NFAGroupLoopNode(start, s, e, gn, type));␍␊ |
| stop->next = newNode->next;␍␊ |
| return newNode;␍␊ |
| }␍␊ |
| }␍␊ |
| default:␍␊ |
| break;␍␊ |
| }␍␊ |
| }␍␊ |
| return NULL;␍␊ |
| }␍␊ |
| ␍␊ |
| NFANode * Pattern::quantify(NFANode * newNode)␍␊ |
| {␍␊ |
| if (curInd < (int)pattern.size())␍␊ |
| {␍␊ |
| char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];␍␊ |
| switch (pattern[curInd])␍␊ |
| {␍␊ |
| case '*':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;␍␊ |
| case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;␍␊ |
| default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;␍␊ |
| }␍␊ |
| break;␍␊ |
| case '?':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break;␍␊ |
| case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, 1)); break;␍␊ |
| default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, MIN_QMATCH, 1)); break;␍␊ |
| }␍␊ |
| break;␍␊ |
| case '+':␍␊ |
| ++curInd;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break;␍␊ |
| case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, 1, MAX_QMATCH)); break;␍␊ |
| default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, 1, MAX_QMATCH)); break;␍␊ |
| }␍␊ |
| break;␍␊ |
| case '{':␍␊ |
| {␍␊ |
| int s, e;␍␊ |
| if (quantifyCurly(s, e))␍␊ |
| {␍␊ |
| ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode (this, newNode, s, e)); break;␍␊ |
| case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, s, e)); break;␍␊ |
| default: newNode = registerNode(new NFAGreedyQuantifierNode (this, newNode, s, e)); break;␍␊ |
| }␍␊ |
| }␍␊ |
| }␍␊ |
| break;␍␊ |
| default:␍␊ |
| break;␍␊ |
| }␍␊ |
| }␍␊ |
| return newNode;␍␊ |
| }␍␊ |
| std::string Pattern::parseClass()␍␊ |
| {␍␊ |
| std::string t, ret = "";␍␊ |
| char ch, c1, c2;␍␊ |
| bool inv = 0, neg = 0, quo = 0;␍␊ |
| ␍␊ |
| if (curInd < (int)pattern.size() && pattern[curInd] == '^')␍␊ |
| {␍␊ |
| ++curInd;␍␊ |
| neg = 1;␍␊ |
| }␍␊ |
| while (curInd < (int)pattern.size() && pattern[curInd] != ']')␍␊ |
| {␍␊ |
| ch = pattern[curInd++];␍␊ |
| if (ch == '[')␍␊ |
| {␍␊ |
| t = parseClass();␍␊ |
| ret = classUnion(ret, t);␍␊ |
| }␍␊ |
| /*else if (ch == '-')␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }*/␍␊ |
| else if (ch == '&' && curInd < (int)pattern.size() && pattern[curInd] == '&')␍␊ |
| {␍␊ |
| if (pattern[++curInd] != '[')␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ++curInd;␍␊ |
| t = parseClass();␍␊ |
| ret = classIntersect(ret, t);␍␊ |
| }␍␊ |
| }␍␊ |
| else if (ch == '\\')␍␊ |
| {␍␊ |
| t = parseEscape(inv, quo);␍␊ |
| if (quo)␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else if (inv || t.size() > 1) // cant be part of a range (a-z)␍␊ |
| {␍␊ |
| if (inv) t = classNegate(t);␍␊ |
| ret = classUnion(ret, t);␍␊ |
| }␍␊ |
| else if (curInd < (int)pattern.size() && pattern[curInd] == '-') // part of a range (a-z)␍␊ |
| {␍␊ |
| c1 = t[0];␍␊ |
| ++curInd;␍␊ |
| if (curInd >= (int)pattern.size()) raiseError();␍␊ |
| else␍␊ |
| {␍␊ |
| c2 = pattern[curInd++];␍␊ |
| if (c2 == '\\')␍␊ |
| {␍␊ |
| t = parseEscape(inv, quo);␍␊ |
| if (quo)␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else if (inv || t.size() > 1) raiseError();␍␊ |
| else ret = classUnion(ret, classCreateRange(c1, c2));␍␊ |
| }␍␊ |
| else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else ret = classUnion(ret, classCreateRange(c1, c2));␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ret = classUnion(ret, t);␍␊ |
| }␍␊ |
| }␍␊ |
| else if (curInd < (int)pattern.size() && pattern[curInd] == '-')␍␊ |
| {␍␊ |
| c1 = ch;␍␊ |
| ++curInd;␍␊ |
| if (curInd >= (int)pattern.size()) raiseError();␍␊ |
| else␍␊ |
| {␍␊ |
| c2 = pattern[curInd++];␍␊ |
| if (c2 == '\\')␍␊ |
| {␍␊ |
| t = parseEscape(inv, quo);␍␊ |
| if (quo)␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else if (inv || t.size() > 1) raiseError();␍␊ |
| else ret = classUnion(ret, classCreateRange(c1, c2));␍␊ |
| }␍␊ |
| else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| curInd = pattern.size();␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ret = classUnion(ret, classCreateRange(c1, c2));␍␊ |
| }␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ret += " ";␍␊ |
| ret[ret.size() - 1] = ch;␍␊ |
| }␍␊ |
| }␍␊ |
| if (curInd >= (int)pattern.size() || pattern[curInd] != ']')␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| ret = "";␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ++curInd;␍␊ |
| if (neg) ret = classNegate(ret);␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| std::string Pattern::parsePosix()␍␊ |
| {␍␊ |
| std::string s7 = pattern.substr(curInd, 7);␍␊ |
| if (s7 == "{Lower}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyz"; }␍␊ |
| if (s7 == "{Upper}") { curInd += 7; return "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; }␍␊ |
| if (s7 == "{Alpha}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; }␍␊ |
| if (s7 == "{Digit}") { curInd += 7; return "0123456789"; }␍␊ |
| if (s7 == "{Alnum}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; }␍␊ |
| if (s7 == "{Punct}") { curInd += 7; return "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }␍␊ |
| if (s7 == "{Graph}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }␍␊ |
| if (s7 == "{Print}") { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }␍␊ |
| if (s7 == "{Blank}") { curInd += 7; return " \t"; }␍␊ |
| if (s7 == "{Space}") { curInd += 7; return " \t\n\x0B\f\r"; }␍␊ |
| if (s7 == "{Cntrl}")␍␊ |
| {␍␊ |
| int i;␍␊ |
| std::string s = " ";␍␊ |
| ␍␊ |
| for (i = 0; i < 5; ++i) s += s;␍␊ |
| s += " ";␍␊ |
| for (i = 0; i <= 0x1F; ++i) s[i] = i;␍␊ |
| s[0x20] = 0x7F;␍␊ |
| curInd += 7;␍␊ |
| return s;␍␊ |
| }␍␊ |
| if (s7 == "{ASCII}")␍␊ |
| {␍␊ |
| std::string s(0x80, ' ');␍␊ |
| for (int i = 0; i < 0x80; ++i) s[i] = i;␍␊ |
| curInd += 7;␍␊ |
| return s;␍␊ |
| }␍␊ |
| if (pattern.substr(curInd, 8) == "{XDigit}") { curInd += 8; return "abcdefABCDEF0123456789"; }␍␊ |
| raiseError();␍␊ |
| return "";␍␊ |
| }␍␊ |
| NFANode * Pattern::parseBackref()␍␊ |
| {␍␊ |
| #define is_dig(x) ((x) >= '0' && (x) <= '9')␍␊ |
| #define to_int(x) ((x) - '0')␍␊ |
| int ci = curInd;␍␊ |
| int oldRef = 0, ref = 0;␍␊ |
| ␍␊ |
| while (ci < (int)pattern.size() && is_dig(pattern[ci]) && (ref < 10 || ref < groupCount))␍␊ |
| {␍␊ |
| oldRef = ref;␍␊ |
| ref = ref * 10 + to_int(pattern[ci++]);␍␊ |
| }␍␊ |
| if (ci == (int)pattern.size())␍␊ |
| {␍␊ |
| oldRef = ref;␍␊ |
| ++ci;␍␊ |
| }␍␊ |
| if (oldRef < 0 || ci <= curInd)␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return registerNode(new NFAReferenceNode(-1));␍␊ |
| }␍␊ |
| curInd = ci;␍␊ |
| return registerNode(new NFAReferenceNode(ref));␍␊ |
| ␍␊ |
| #undef is_dig␍␊ |
| #undef to_int␍␊ |
| }␍␊ |
| std::string Pattern::parseOctal()␍␊ |
| {␍␊ |
| #define islowoc(x) ((x) >= '0' && (x) <= '3')␍␊ |
| #define isoc(x) ((x) >= '0' && (x) <= '7')␍␊ |
| #define fromoc(x) ((x) - '0')␍␊ |
| int ci = curInd;␍␊ |
| char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;␍␊ |
| char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;␍␊ |
| char ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1;␍␊ |
| std::string s = " ";␍␊ |
| ␍␊ |
| if (islowoc(ch1) && isoc(ch2))␍␊ |
| {␍␊ |
| curInd += 2;␍␊ |
| s[0] = fromoc(ch1) * 8 + fromoc(ch2);␍␊ |
| if (isoc(ch3))␍␊ |
| {␍␊ |
| ++curInd;␍␊ |
| s[0] = s[0] * 8 + fromoc(ch3);␍␊ |
| }␍␊ |
| }␍␊ |
| else if (isoc(ch1) && isoc(ch2))␍␊ |
| {␍␊ |
| curInd += 2;␍␊ |
| s[0] = fromoc(ch1) * 8 + fromoc(ch2);␍␊ |
| }␍␊ |
| else raiseError();␍␊ |
| ␍␊ |
| return s;␍␊ |
| #undef islowoc␍␊ |
| #undef isoc␍␊ |
| #undef fromoc␍␊ |
| }␍␊ |
| std::string Pattern::parseHex()␍␊ |
| {␍␊ |
| #define to_low(x) (((x) >= 'A' && (x) <= 'Z') ? ((x) - 'A' + 'a') : (x))␍␊ |
| #define is_dig(x) ((x) >= '0' && (x) <= '9')␍␊ |
| #define is_hex(x) (is_dig(x) || (to_low(x) >= 'a' && to_low(x) <= 'f'))␍␊ |
| #define to_int(x) ((is_dig(x)) ? ((x) - '0') : (to_low(x) - 'a' + 10))␍␊ |
| ␍␊ |
| int ci = curInd;␍␊ |
| char ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;␍␊ |
| char ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;␍␊ |
| std::string s = " ";␍␊ |
| ␍␊ |
| if (is_hex(ch1) && is_hex(ch2))␍␊ |
| {␍␊ |
| curInd += 2;␍␊ |
| s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F);␍␊ |
| }␍␊ |
| ␍␊ |
| return s;␍␊ |
| #undef to_low␍␊ |
| #undef is_dig␍␊ |
| #undef is_hex␍␊ |
| #undef to_int␍␊ |
| }␍␊ |
| std::string Pattern::parseEscape(bool & inv, bool & quo)␍␊ |
| {␍␊ |
| char ch = pattern[curInd++];␍␊ |
| std::string classes = "";␍␊ |
| ␍␊ |
| if (curInd > (int)pattern.size())␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return NULL;␍␊ |
| }␍␊ |
| ␍␊ |
| quo = 0;␍␊ |
| inv = 0;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case 'p': classes = parsePosix(); break;␍␊ |
| case 'P': classes = "!!"; classes += parsePosix(); break;␍␊ |
| case 'd': classes = "0123456789"; break;␍␊ |
| case 'D': classes = "!!0123456789"; break;␍␊ |
| case 's': classes = " \t\r\n\f"; break;␍␊ |
| case 'S': classes = "!! \t\r\n\f"; break;␍␊ |
| case 'w': classes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;␍␊ |
| case 'W': classes = "!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; break;␍␊ |
| case '0': classes = parseOctal(); break;␍␊ |
| case 'x': classes = parseHex(); break;␍␊ |
| ␍␊ |
| case 'Q': quo = 1; break;␍␊ |
| case 't': classes = "\t"; break;␍␊ |
| case 'r': classes = "\r"; break;␍␊ |
| case 'n': classes = "\n"; break;␍␊ |
| case 'f': classes = "\f"; break;␍␊ |
| case 'a': classes = "\a"; break;␍␊ |
| case 'e': classes = "\r"; break;␍␊ |
| default: classes = " "; classes[0] = ch; break;␍␊ |
| }␍␊ |
| if (classes.substr(0, 2) == "!!")␍␊ |
| {␍␊ |
| classes = classes.substr(2);␍␊ |
| inv = 1;␍␊ |
| }␍␊ |
| return classes;␍␊ |
| }␍␊ |
| NFANode * Pattern::parseRegisteredPattern(NFANode ** end)␍␊ |
| {␍␊ |
| int i, j;␍␊ |
| std::string s;␍␊ |
| NFANode * ret = NULL;␍␊ |
| for (i = curInd; i < (int)pattern.size() && pattern[i] != '}'; ++i) { }␍␊ |
| if (pattern[i] != '}') { raiseError(); return NULL; }␍␊ |
| if (i == curInd + 1) { raiseError(); return NULL; } // {}␍␊ |
| if (␍␊ |
| !(␍␊ |
| (pattern[curInd] >= 'a' && pattern[curInd] <= 'z') ||␍␊ |
| (pattern[curInd] >= 'A' && pattern[curInd] <= 'Z') ||␍␊ |
| (pattern[curInd] == '_')␍␊ |
| )␍␊ |
| )␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return NULL;␍␊ |
| }␍␊ |
| for (j = curInd; !error && j < i; ++j)␍␊ |
| {␍␊ |
| if (␍␊ |
| !(␍␊ |
| (pattern[j] >= 'a' && pattern[j] <= 'z') ||␍␊ |
| (pattern[j] >= 'A' && pattern[j] <= 'Z') ||␍␊ |
| (pattern[j] >= '0' && pattern[j] <= '9') ||␍␊ |
| (pattern[j] == '_')␍␊ |
| )␍␊ |
| )␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return NULL;␍␊ |
| }␍␊ |
| }␍␊ |
| s = pattern.substr(curInd, i - curInd);␍␊ |
| if (registeredPatterns.find(s) == registeredPatterns.end()) raiseError();␍␊ |
| else␍␊ |
| {␍␊ |
| unsigned long oflags = flags;␍␊ |
| std::string op = pattern;␍␊ |
| int ci = i + 1;␍␊ |
| ␍␊ |
| pattern = registeredPatterns[s].first;␍␊ |
| curInd = 0;␍␊ |
| flags = registeredPatterns[s].second;␍␊ |
| ␍␊ |
| --groupCount;␍␊ |
| ret = parse(0, 0, end);␍␊ |
| ␍␊ |
| pattern = op;␍␊ |
| curInd = ci;␍␊ |
| flags = oflags;␍␊ |
| }␍␊ |
| if (error) { *end = ret = NULL; }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| // look behind should interpret everything as a literal (except \\) since the␍␊ |
| // pattern must have a concrete length␍␊ |
| NFANode * Pattern::parseBehind(const bool pos, NFANode ** end)␍␊ |
| {␍␊ |
| std::string t = "";␍␊ |
| while (curInd < (int)pattern.size() && pattern[curInd] != ')')␍␊ |
| {␍␊ |
| char ch = pattern[curInd++];␍␊ |
| t += " ";␍␊ |
| if (ch == '\\')␍␊ |
| {␍␊ |
| if (curInd + 1 >= (int)pattern.size())␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return *end = registerNode(new NFACharNode(' '));␍␊ |
| }␍␊ |
| ch = pattern[curInd++];␍␊ |
| }␍␊ |
| t[t.size() - 1] = ch;␍␊ |
| }␍␊ |
| if (curInd >= (int)pattern.size() || pattern[curInd] != ')') raiseError();␍␊ |
| else ++curInd;␍␊ |
| return *end = registerNode(new NFALookBehindNode(t, pos));␍␊ |
| }␍␊ |
| NFANode * Pattern::parseQuote()␍␊ |
| {␍␊ |
| bool done = 0;␍␊ |
| std::string s = "";␍␊ |
| ␍␊ |
| while (!done)␍␊ |
| {␍␊ |
| if (curInd >= (int)pattern.size())␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| done = 1;␍␊ |
| }␍␊ |
| else if (pattern.substr(curInd, 2) == "\\E")␍␊ |
| {␍␊ |
| curInd += 2;␍␊ |
| done = 1;␍␊ |
| }␍␊ |
| else if (pattern[curInd] == '\\')␍␊ |
| {␍␊ |
| s += " ";␍␊ |
| s[s.size() - 1] = pattern[++curInd];␍␊ |
| ++curInd;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| s += " ";␍␊ |
| s[s.size() - 1] = pattern[curInd++];␍␊ |
| }␍␊ |
| }␍␊ |
| if ((flags & Pattern::CASE_INSENSITIVE) != 0) return registerNode(new NFACIQuoteNode(s));␍␊ |
| return registerNode(new NFAQuoteNode(s));␍␊ |
| }␍␊ |
| NFANode * Pattern::parse(const bool inParen, const bool inOr, NFANode ** end, const int orGroup, const bool inOrCap)␍␊ |
| {␍␊ |
| NFANode * start, * cur, * next = NULL;␍␊ |
| std::string t;␍␊ |
| int grc = groupCount++;␍␊ |
| bool inv, quo;␍␊ |
| bool ahead = false, pos = false, noncap = false, indep = false;␍␊ |
| unsigned long oldFlags = flags;␍␊ |
| ␍␊ |
| if (inParen)␍␊ |
| {␍␊ |
| if (inOr)␍␊ |
| {␍␊ |
| --groupCount;␍␊ |
| noncap = !inOrCap;␍␊ |
| grc = orGroup;␍␊ |
| if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc));␍␊ |
| else cur = start = registerNode(new NFASubStartNode);␍␊ |
| }␍␊ |
| else if (pattern[curInd] == '?')␍␊ |
| {␍␊ |
| ++curInd;␍␊ |
| --groupCount;␍␊ |
| if (pattern[curInd] == ':') { noncap = 1; ++curInd; grc = --nonCapGroupCount; }␍␊ |
| else if (pattern[curInd] == '=') { ++curInd; ahead = 1; pos = 1; }␍␊ |
| else if (pattern[curInd] == '!') { ++curInd; ahead = 1; pos = 0; }␍␊ |
| else if (pattern.substr(curInd, 2) == "<=") { curInd += 2; return parseBehind(1, end); }␍␊ |
| else if (pattern.substr(curInd, 2) == "<!") { curInd += 2; return parseBehind(0, end); }␍␊ |
| else if (pattern[curInd] == '>') { ++curInd; indep = 1; }␍␊ |
| else␍␊ |
| {␍␊ |
| bool negate = false, done = false;␍␊ |
| while (!done)␍␊ |
| {␍␊ |
| if (curInd >= (int)pattern.size())␍␊ |
| {␍␊ |
| raiseError();␍␊ |
| return NULL;␍␊ |
| }␍␊ |
| else if (negate)␍␊ |
| {␍␊ |
| switch (pattern[curInd])␍␊ |
| {␍␊ |
| case 'i': flags &= ~Pattern::CASE_INSENSITIVE; break;␍␊ |
| case 'd': flags &= ~Pattern::UNIX_LINE_MODE; break;␍␊ |
| case 'm': flags &= ~Pattern::MULTILINE_MATCHING; break;␍␊ |
| case 's': flags &= ~Pattern::DOT_MATCHES_ALL; break;␍␊ |
| case ':': done = true; break;␍␊ |
| case ')':␍␊ |
| ++curInd;␍␊ |
| *end = registerNode(new NFALookBehindNode("", true));␍␊ |
| return *end;␍␊ |
| case '-':␍␊ |
| default: raiseError(); return NULL;␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| switch (pattern[curInd])␍␊ |
| {␍␊ |
| case 'i': flags |= Pattern::CASE_INSENSITIVE; break;␍␊ |
| case 'd': flags |= Pattern::UNIX_LINE_MODE; break;␍␊ |
| case 'm': flags |= Pattern::MULTILINE_MATCHING; break;␍␊ |
| case 's': flags |= Pattern::DOT_MATCHES_ALL; break;␍␊ |
| case ':': done = true; break;␍␊ |
| case '-': negate = true; break;␍␊ |
| case ')':␍␊ |
| ++curInd;␍␊ |
| *end = registerNode(new NFALookBehindNode("", true));␍␊ |
| return *end;␍␊ |
| default: raiseError(); return NULL;␍␊ |
| }␍␊ |
| }␍␊ |
| ++curInd;␍␊ |
| }␍␊ |
| noncap = 1;␍␊ |
| grc = --nonCapGroupCount;␍␊ |
| }␍␊ |
| if (noncap) cur = start = registerNode(new NFAGroupHeadNode(grc));␍␊ |
| else cur = start = registerNode(new NFASubStartNode);␍␊ |
| }␍␊ |
| else cur = start = registerNode(new NFAGroupHeadNode(grc));␍␊ |
| }␍␊ |
| else cur = start = registerNode(new NFASubStartNode);␍␊ |
| while (curInd < (int)pattern.size())␍␊ |
| {␍␊ |
| char ch = pattern[curInd++];␍␊ |
| ␍␊ |
| next = NULL;␍␊ |
| if (error) return NULL;␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '^':␍␊ |
| if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineNode);␍␊ |
| else next = registerNode(new NFAStartOfInputNode);␍␊ |
| break;␍␊ |
| case '$':␍␊ |
| if ((flags & Pattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineNode);␍␊ |
| else next = registerNode(new NFAEndOfInputNode(0));␍␊ |
| break;␍␊ |
| case '|':␍␊ |
| cur->next = registerNode(new NFAAcceptNode);␍␊ |
| cur = start = registerNode(new NFAOrNode(start, parse(inParen, true, NULL, grc, !noncap)));␍␊ |
| break;␍␊ |
| case '\\':␍␊ |
| if (curInd < (int)pattern.size())␍␊ |
| {␍␊ |
| bool eoi = 0;␍␊ |
| switch (pattern[curInd])␍␊ |
| {␍␊ |
| case '1':␍␊ |
| case '2':␍␊ |
| case '3':␍␊ |
| case '4':␍␊ |
| case '5':␍␊ |
| case '6':␍␊ |
| case '7':␍␊ |
| case '8':␍␊ |
| case '9': next = parseBackref(); break;␍␊ |
| case 'A': ++curInd; next = registerNode(new NFAStartOfInputNode); break;␍␊ |
| case 'B': ++curInd; next = registerNode(new NFAWordBoundaryNode(0)); break;␍␊ |
| case 'b': ++curInd; next = registerNode(new NFAWordBoundaryNode(1)); break;␍␊ |
| case 'G': ++curInd; next = registerNode(new NFAEndOfMatchNode); break;␍␊ |
| case 'Z': eoi = 1;␍␊ |
| case 'z': ++curInd; next = registerNode(new NFAEndOfInputNode(eoi)); break;␍␊ |
| default:␍␊ |
| t = parseEscape(inv, quo);␍␊ |
| if (!quo)␍␊ |
| {␍␊ |
| if (t.size() > 1 || inv)␍␊ |
| {␍␊ |
| if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassNode(t, inv));␍␊ |
| else next = registerNode(new NFAClassNode(t, inv));␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| next = registerNode(new NFACharNode(t[0]));␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| next = parseQuote();␍␊ |
| }␍␊ |
| }␍␊ |
| }␍␊ |
| else raiseError();␍␊ |
| break;␍␊ |
| case '[':␍␊ |
| if ((flags & Pattern::CASE_INSENSITIVE) == 0)␍␊ |
| {␍␊ |
| NFAClassNode * clazz = new NFAClassNode();␍␊ |
| std::string s = parseClass();␍␊ |
| for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1;␍␊ |
| next = registerNode(clazz);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| NFACIClassNode * clazz = new NFACIClassNode();␍␊ |
| std::string s = parseClass();␍␊ |
| for (int i = 0; i < (int)s.size(); ++i) clazz->vals[tolower(s[i])] = 1;␍␊ |
| next = registerNode(clazz);␍␊ |
| }␍␊ |
| break;␍␊ |
| case '.':␍␊ |
| {␍␊ |
| bool useN = 1, useR = 1;␍␊ |
| NFAClassNode * clazz = new NFAClassNode(1);␍␊ |
| if ((flags & Pattern::UNIX_LINE_MODE) != 0) useR = 0;␍␊ |
| if ((flags & Pattern::DOT_MATCHES_ALL) != 0) useN = useR = 0;␍␊ |
| if (useN) clazz->vals['\n'] = 1;␍␊ |
| if (useR) clazz->vals['\r'] = 1;␍␊ |
| next = registerNode(clazz);␍␊ |
| }␍␊ |
| break;␍␊ |
| case '(':␍␊ |
| {␍␊ |
| NFANode * end, * t1, * t2;␍␊ |
| t1 = parse(true, false, &end);␍␊ |
| if (!t1) raiseError();␍␊ |
| else if (t1->isGroupHeadNode() && (t2 = quantifyGroup(t1, end, grc)) != NULL)␍␊ |
| {␍␊ |
| cur->next = t2;␍␊ |
| cur = t2->next;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| cur->next = t1;␍␊ |
| cur = end;␍␊ |
| }␍␊ |
| }␍␊ |
| break;␍␊ |
| case ')':␍␊ |
| if (!inParen) raiseError();␍␊ |
| else if (inOr)␍␊ |
| {␍␊ |
| --curInd;␍␊ |
| cur = cur->next = registerNode(new NFAAcceptNode);␍␊ |
| flags = oldFlags;␍␊ |
| return start;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| if (ahead)␍␊ |
| {␍␊ |
| cur = cur->next = registerNode(new NFAAcceptNode);␍␊ |
| flags = oldFlags;␍␊ |
| return *end = registerNode(new NFALookAheadNode(start, pos));␍␊ |
| }␍␊ |
| else if (indep)␍␊ |
| {␍␊ |
| cur = cur->next = registerNode(new NFAAcceptNode);␍␊ |
| flags = oldFlags;␍␊ |
| return *end = registerNode(new NFAPossessiveQuantifierNode(this, start, 1, 1));␍␊ |
| }␍␊ |
| else // capping or noncapping, it doesnt matter␍␊ |
| {␍␊ |
| *end = cur = cur->next = registerNode(new NFAGroupTailNode(grc));␍␊ |
| next = quantifyGroup(start, *end, grc);␍␊ |
| if (next)␍␊ |
| {␍␊ |
| start = next;␍␊ |
| *end = next->next;␍␊ |
| }␍␊ |
| flags = oldFlags;␍␊ |
| return start;␍␊ |
| }␍␊ |
| }␍␊ |
| break;␍␊ |
| case '{': // registered pattern␍␊ |
| cur->next = parseRegisteredPattern(&next);␍␊ |
| if (cur->next) cur = next;␍␊ |
| break;␍␊ |
| case '*':␍␊ |
| case '+':␍␊ |
| case '?':␍␊ |
| case '}':␍␊ |
| case ']':␍␊ |
| raiseError();␍␊ |
| break;␍␊ |
| default:␍␊ |
| if ((flags & Pattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACICharNode(ch));␍␊ |
| else next = registerNode(new NFACharNode(ch));␍␊ |
| break;␍␊ |
| }␍␊ |
| NFAOrNode * orNode = dynamic_cast<NFAOrNode *>(cur);␍␊ |
| if (orNode) // an orNode will never have a "next" set. thus we don't have to worry about problems with␍␊ |
| { // quantifying an or node.␍␊ |
| NFANode * one = orNode->one, * two = orNode->two;␍␊ |
| while (one->next) one = one->next;␍␊ |
| while (two->next) two = two->next;␍␊ |
| one->next = two->next = cur = registerNode(new NFAAcceptNode);␍␊ |
| }␍␊ |
| if (next)␍␊ |
| {␍␊ |
| cur = cur->next = quantify(next);␍␊ |
| }␍␊ |
| }␍␊ |
| if (inParen) raiseError();␍␊ |
| else␍␊ |
| {␍␊ |
| if (inOr) cur = cur->next = registerNode(new NFAAcceptNode);␍␊ |
| if (end) *end = cur;␍␊ |
| }␍␊ |
| ␍␊ |
| flags = oldFlags;␍␊ |
| if (error) return NULL;␍␊ |
| ␍␊ |
| return start;␍␊ |
| }␍␊ |
| ␍␊ |
| Pattern * Pattern::compile(const std::string & pattern, const unsigned long mode)␍␊ |
| {␍␊ |
| Pattern * p = new Pattern(pattern);␍␊ |
| NFANode * end;␍␊ |
| ␍␊ |
| p->flags = mode;␍␊ |
| if ((mode & Pattern::LITERAL) != 0)␍␊ |
| {␍␊ |
| p->head = p->registerNode(new NFAStartNode);␍␊ |
| if ((mode & Pattern::CASE_INSENSITIVE) != 0) p->head->next = p->registerNode(new NFACIQuoteNode(pattern));␍␊ |
| else p->head->next = p->registerNode(new NFAQuoteNode(pattern));␍␊ |
| p->head->next->next = p->registerNode(new NFAEndNode);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| p->head = p->parse(0, 0, &end);␍␊ |
| if (!p->head)␍␊ |
| {␍␊ |
| delete p;␍␊ |
| p = NULL;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| if (!(p->head && p->head->isStartOfInputNode()))␍␊ |
| {␍␊ |
| NFANode * n = p->registerNode(new NFAStartNode);␍␊ |
| n->next = p->head;␍␊ |
| p->head = n;␍␊ |
| }␍␊ |
| end->next = p->registerNode(new NFAEndNode);␍␊ |
| }␍␊ |
| }␍␊ |
| if (p != NULL)␍␊ |
| {␍␊ |
| p->matcher = new Matcher(p, "");␍␊ |
| }␍␊ |
| ␍␊ |
| return p;␍␊ |
| }␍␊ |
| ␍␊ |
| Pattern * Pattern::compileAndKeep(const std::string & pattern, const unsigned long mode)␍␊ |
| {␍␊ |
| Pattern * ret = NULL;␍␊ |
| std::map<std::string, Pattern*>::iterator it = compiledPatterns.find(pattern);␍␊ |
| ␍␊ |
| if (it != compiledPatterns.end())␍␊ |
| {␍␊ |
| ret = it->second;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| ret = compile(pattern, mode);␍␊ |
| compiledPatterns[pattern] = ret;␍␊ |
| }␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| std::string Pattern::replace(const std::string & pattern, const std::string & str,␍␊ |
| const std::string & replacementText, const unsigned long mode)␍␊ |
| {␍␊ |
| std::string ret;␍␊ |
| Pattern * p = Pattern::compile(pattern, mode);␍␊ |
| if (p)␍␊ |
| {␍␊ |
| ret = p->replace(str, replacementText);␍␊ |
| delete p;␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| std::vector<std::string> Pattern::split(const std::string & pattern, const std::string & str, const bool keepEmptys,␍␊ |
| const unsigned long limit, const unsigned long mode)␍␊ |
| {␍␊ |
| std::vector<std::string> ret;␍␊ |
| Pattern * p = Pattern::compile(pattern, mode);␍␊ |
| if (p)␍␊ |
| {␍␊ |
| ret = p->split(str, keepEmptys, limit);␍␊ |
| delete p;␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| std::vector<std::string> Pattern::findAll(const std::string & pattern, const std::string & str, const unsigned long mode)␍␊ |
| {␍␊ |
| std::vector<std::string> ret;␍␊ |
| Pattern * p = Pattern::compile(pattern, mode);␍␊ |
| if (p)␍␊ |
| {␍␊ |
| ret = p->findAll(str);␍␊ |
| delete p;␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| bool Pattern::matches(const std::string & pattern, const std::string & str, const unsigned long mode)␍␊ |
| {␍␊ |
| bool ret = 0;␍␊ |
| Pattern * p = compile(pattern, mode);␍␊ |
| ␍␊ |
| if (p)␍␊ |
| {␍␊ |
| ret = p->matches(str);␍␊ |
| delete p;␍␊ |
| }␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| bool Pattern::registerPattern(const std::string & name, const std::string & pattern, const unsigned long mode)␍␊ |
| {␍␊ |
| Pattern * p = Pattern::compile(pattern, mode);␍␊ |
| if (!p) return 0;␍␊ |
| Pattern::registeredPatterns[name] = std::make_pair(pattern, mode);␍␊ |
| delete p;␍␊ |
| return 1;␍␊ |
| }␍␊ |
| ␍␊ |
| void Pattern::unregisterPatterns()␍␊ |
| {␍␊ |
| registeredPatterns.clear();␍␊ |
| }␍␊ |
| void Pattern::clearPatternCache()␍␊ |
| {␍␊ |
| std::map<std::string, Pattern*>::iterator it;␍␊ |
| for (it = compiledPatterns.begin(); it != compiledPatterns.end(); ++it)␍␊ |
| {␍␊ |
| delete it->second;␍␊ |
| }␍␊ |
| compiledPatterns.clear();␍␊ |
| }␍␊ |
| ␍␊ |
| std::pair<std::string, int> Pattern::findNthMatch(const std::string & pattern, const std::string & str,␍␊ |
| const int matchNum, const unsigned long mode)␍␊ |
| {␍␊ |
| std::pair<std::string, int> ret;␍␊ |
| Pattern * p = Pattern::compile(pattern, mode);␍␊ |
| ␍␊ |
| ret.second = -1;␍␊ |
| if (p)␍␊ |
| {␍␊ |
| int i = -1;␍␊ |
| p->matcher->setString(str);␍␊ |
| while (i < matchNum && p->matcher->findNextMatch()) { ++i; }␍␊ |
| if (i == matchNum && p->matcher->getStartingIndex() >= 0)␍␊ |
| {␍␊ |
| ret.first = p->matcher->getGroup(0);␍␊ |
| ret.second = p->matcher->getStartingIndex();␍␊ |
| }␍␊ |
| delete p;␍␊ |
| }␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| Pattern::~Pattern()␍␊ |
| {␍␊ |
| if (matcher) delete matcher;␍␊ |
| for (std::map<NFANode*, bool>::iterator it = nodes.begin(); it != nodes.end(); ++it)␍␊ |
| {␍␊ |
| delete it->first;␍␊ |
| }␍␊ |
| }␍␊ |
| std::string Pattern::replace(const std::string & str, const std::string & replacementText)␍␊ |
| {␍␊ |
| int li = 0;␍␊ |
| std::string ret = "";␍␊ |
| ␍␊ |
| matcher->setString(str);␍␊ |
| while (matcher->findNextMatch())␍␊ |
| {␍␊ |
| ret += str.substr(li, matcher->getStartingIndex() - li);␍␊ |
| ret += matcher->replaceWithGroups(replacementText);␍␊ |
| li = matcher->getEndingIndex();␍␊ |
| }␍␊ |
| ret += str.substr(li);␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| std::vector<std::string> Pattern::split(const std::string & str, const bool keepEmptys, const unsigned long limit)␍␊ |
| {␍␊ |
| unsigned long lim = (limit == 0 ? MAX_QMATCH : limit);␍␊ |
| int li = 0;␍␊ |
| std::vector<std::string> ret;␍␊ |
| ␍␊ |
| matcher->setString(str);␍␊ |
| ␍␊ |
| while (matcher->findNextMatch() && ret.size() < lim)␍␊ |
| {␍␊ |
| if (matcher->getStartingIndex() == 0 && keepEmptys) ret.push_back("");␍␊ |
| if ((matcher->getStartingIndex() != matcher->getEndingIndex()) || keepEmptys)␍␊ |
| {␍␊ |
| if (li != matcher->getStartingIndex() || keepEmptys)␍␊ |
| {␍␊ |
| ret.push_back(str.substr(li, matcher->getStartingIndex() - li));␍␊ |
| }␍␊ |
| li = matcher->getEndingIndex();␍␊ |
| }␍␊ |
| }␍␊ |
| if (li < (int)str.size()) ret.push_back(str.substr(li));␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| std::vector<std::string> Pattern::findAll(const std::string & str)␍␊ |
| {␍␊ |
| matcher->setString(str);␍␊ |
| return matcher->findAll();␍␊ |
| }␍␊ |
| bool Pattern::matches(const std::string & str)␍␊ |
| {␍␊ |
| matcher->setString(str);␍␊ |
| return matcher->matches();␍␊ |
| }␍␊ |
| unsigned long Pattern::getFlags() const␍␊ |
| {␍␊ |
| return flags;␍␊ |
| }␍␊ |
| std::string Pattern::getPattern() const␍␊ |
| {␍␊ |
| return pattern;␍␊ |
| }␍␊ |
| Matcher * Pattern::createMatcher(const std::string & str)␍␊ |
| {␍␊ |
| return new Matcher(this, str);␍␊ |
| }␍␊ |
| void Pattern::print()␍␊ |
| {␍␊ |
| printf("Pattern(%s):\n", pattern.c_str());␍␊ |
| if (head) head->print(1);␍␊ |
| }␍␊ |
| ␍␊ |
| ␍␊ |
| ␍␊ |
| ␍␊ |
| ␍␊ |
| ␍␊ |
| ␍␊ |
| static char * getPrintChar(const char ch)␍␊ |
| {␍␊ |
| static char buf[1024];␍␊ |
| if (ch == '0') strcpy(buf, "\\0");␍␊ |
| else if (ch == '\r') strcpy(buf, "\\r");␍␊ |
| else if (ch == '\t') strcpy(buf, "\\t");␍␊ |
| else if (ch == '\n') strcpy(buf, "\\n");␍␊ |
| else if (ch == 0x0C) strcpy(buf, "\\p");␍␊ |
| else if (ch < ' ' || ch >= 0x80) sprintf(buf, "0x%02X", ch);␍␊ |
| else sprintf(buf, "%c", ch);␍␊ |
| return buf;␍␊ |
| }␍␊ |
| static void appendRawToClass(const int ch, char * const buf, int & curInd)␍␊ |
| {␍␊ |
| switch (ch)␍␊ |
| {␍␊ |
| case '(':␍␊ |
| case ')':␍␊ |
| case '\\':␍␊ |
| case '-':␍␊ |
| case '[':␍␊ |
| case ']':␍␊ |
| buf[curInd++] = '\\';␍␊ |
| default:;␍␊ |
| buf[curInd++] = (char)ch;␍␊ |
| }␍␊ |
| }␍␊ |
| static void appendToClass0(const int j, int & start, const bool raw, const bool allowRanges, char * const buf, int & curInd)␍␊ |
| {␍␊ |
| if (start != -1)␍␊ |
| {␍␊ |
| if (start + 1 == j)␍␊ |
| {␍␊ |
| if (raw)␍␊ |
| {␍␊ |
| appendRawToClass(start, buf, curInd);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| sprintf(buf + curInd, "\\x%02X", start);␍␊ |
| curInd += 4;␍␊ |
| }␍␊ |
| }␍␊ |
| else if (start + 2 == j)␍␊ |
| {␍␊ |
| if (raw)␍␊ |
| {␍␊ |
| appendRawToClass(start, buf, curInd);␍␊ |
| appendRawToClass(start + 1, buf, curInd);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| sprintf(buf + curInd, "\\x%02X\\x%02X", start, start + 1);␍␊ |
| curInd += 8;␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| if (allowRanges)␍␊ |
| {␍␊ |
| if (raw)␍␊ |
| {␍␊ |
| appendRawToClass(start, buf, curInd);␍␊ |
| buf[curInd++] = '-';␍␊ |
| appendRawToClass(j - 1, buf, curInd);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| sprintf(buf + curInd, "\\x%02X-\\x%02X", start, j - 1);␍␊ |
| curInd += 9;␍␊ |
| }␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| for (int k = start; k < j; ++k)␍␊ |
| {␍␊ |
| appendRawToClass(k, buf, curInd);␍␊ |
| }␍␊ |
| }␍␊ |
| }␍␊ |
| }␍␊ |
| start = -1;␍␊ |
| }␍␊ |
| static void appendToClass(const std::map<char, bool> & chars, const int * const range, const bool raw, const bool allowRanges, char * const buf, int & curInd)␍␊ |
| {␍␊ |
| int start = -1, j;␍␊ |
| for (j = range[0]; j < range[1]; ++j)␍␊ |
| {␍␊ |
| if (chars.find((char)j) != chars.end())␍␊ |
| {␍␊ |
| if (start == -1) start = j;␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| appendToClass0(j, start, raw, allowRanges, buf, curInd);␍␊ |
| }␍␊ |
| }␍␊ |
| appendToClass0(j, start, raw, allowRanges, buf, curInd);␍␊ |
| }␍␊ |
| static char * getClassDesc(const std::map<char, bool> & chars, const bool inv, const bool firstPass = true)␍␊ |
| {␍␊ |
| static char buf[1024];␍␊ |
| int curInd = 0;␍␊ |
| bool ok[256] = { false };␍␊ |
| ␍␊ |
| buf[curInd++] = '[';␍␊ |
| if (inv)␍␊ |
| {␍␊ |
| buf[curInd++] = '^';␍␊ |
| }␍␊ |
| // check for lower case letters␍␊ |
| if (chars.find(' ') != chars.end()) { buf[curInd++] = ' '; }␍␊ |
| if (chars.find(0x00) != chars.end()) { strcpy(buf + curInd, "\\0"); curInd += 2; }␍␊ |
| if (chars.find('\t') != chars.end()) { strcpy(buf + curInd, "\\t"); curInd += 2; }␍␊ |
| if (chars.find('\r') != chars.end()) { strcpy(buf + curInd, "\\r"); curInd += 2; }␍␊ |
| if (chars.find('\n') != chars.end()) { strcpy(buf + curInd, "\\n"); curInd += 2; }␍␊ |
| if (chars.find(0x0c) != chars.end()) { strcpy(buf + curInd, "\\p"); curInd += 2; }␍␊ |
| const int HEX_RANGES[][2] =␍␊ |
| {␍␊ |
| { 0x01, 0x09 },␍␊ |
| { 0x0B, 0x0B },␍␊ |
| { 0x0E, 0x20 },␍␊ |
| { 0x80, 0x100 },␍␊ |
| };␍␊ |
| const int RAW_RANGES[][2] =␍␊ |
| {␍␊ |
| { 0x20, '0' },␍␊ |
| { '0', '9' + 1 },␍␊ |
| { '9' + 1, 'A' },␍␊ |
| { 'A', 'Z' + 1 },␍␊ |
| { 'Z' + 1, 'a' },␍␊ |
| { 'a', 'z' + 1 },␍␊ |
| { 'z' + 1, 0x80 },␍␊ |
| };␍␊ |
| const bool ALLOW_RANGES[] =␍␊ |
| {␍␊ |
| false,␍␊ |
| true,␍␊ |
| false,␍␊ |
| true,␍␊ |
| false,␍␊ |
| true,␍␊ |
| false,␍␊ |
| };␍␊ |
| const int NUM_HEX = sizeof(HEX_RANGES) / sizeof(HEX_RANGES[0]);␍␊ |
| const int NUM_RAW = sizeof(RAW_RANGES) / sizeof(RAW_RANGES[0]);␍␊ |
| ␍␊ |
| for (int i = 0; i < NUM_HEX; ++i)␍␊ |
| {␍␊ |
| appendToClass(chars, HEX_RANGES[i], false, true, buf, curInd);␍␊ |
| }␍␊ |
| for (int i = 0; i < NUM_RAW; ++i)␍␊ |
| {␍␊ |
| appendToClass(chars, RAW_RANGES[i], true, ALLOW_RANGES[i], buf, curInd);␍␊ |
| }␍␊ |
| buf[curInd++] = ']';␍␊ |
| buf[curInd++] = 0;␍␊ |
| if (firstPass)␍␊ |
| {␍␊ |
| char newBuf[1024];␍␊ |
| std::map<char, bool> invChars;␍␊ |
| for (int i = 0; i < 256; ++i)␍␊ |
| {␍␊ |
| if (chars.find((char)i) == chars.end()) invChars[(char)i] = true;␍␊ |
| }␍␊ |
| strcpy(newBuf, buf);␍␊ |
| getClassDesc(invChars, !inv, false);␍␊ |
| if (strlen(buf) > strlen(newBuf)) strcpy(buf, newBuf);␍␊ |
| }␍␊ |
| return buf;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFANode␍␊ |
| ␍␊ |
| NFANode::NFANode() { next = NULL; }␍␊ |
| NFANode::~NFANode() { }␍␊ |
| void NFANode::findAllNodes(std::map<NFANode*, bool> & soFar)␍␊ |
| {␍␊ |
| if (soFar.find(this) == soFar.end()) return;␍␊ |
| soFar[this] = 1;␍␊ |
| if (next) next->findAllNodes(soFar);␍␊ |
| }␍␊ |
| void NFANode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s\n", indent * 2, ' ', typeid(*this).name());␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFACharNode␍␊ |
| ␍␊ |
| NFACharNode::NFACharNode(const char c) { ch = c; }␍␊ |
| int NFACharNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd < (int)str.size() && str[curInd] == ch) return next->match(str, matcher, curInd + 1, depth + 1);␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFACharNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(0x%02X)\n", indent * 2, ' ', typeid(*this).name(), ch);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFACICharNode␍␊ |
| ␍␊ |
| NFACICharNode::NFACICharNode(const char c) { ch = tolower(c); }␍␊ |
| int NFACICharNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd < (int)str.size() && tolower(str[curInd]) == ch) return next->match(str, matcher, curInd + 1, depth + 1);␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFACICharNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(0x%02X)\n", indent * 2, ' ', typeid(*this).name(), ch);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAStartNode␍␊ |
| ␍␊ |
| NFAStartNode::NFAStartNode() { }␍␊ |
| int NFAStartNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ret = -1, ci = curInd;␍␊ |
| ␍␊ |
| matcher->starts[0] = curInd;␍␊ |
| if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) == (unsigned int)Matcher::MATCH_ENTIRE_STRING)␍␊ |
| {␍␊ |
| if (curInd != 0)␍␊ |
| {␍␊ |
| matcher->starts[0] = -1;␍␊ |
| return -1;␍␊ |
| }␍␊ |
| return next->match(str, matcher, 0, depth + 1);␍␊ |
| }␍␊ |
| while ((ret = next->match(str, matcher, ci)) == -1 && ci < (int)str.size())␍␊ |
| {␍␊ |
| matcher->clearGroups();␍␊ |
| matcher->starts[0] = ++ci;␍␊ |
| }␍␊ |
| if (ret < 0) matcher->starts[0] = -1;␍␊ |
| return ret;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAEndNode␍␊ |
| ␍␊ |
| NFAEndNode::NFAEndNode() { }␍␊ |
| int NFAEndNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| matcher->ends[0] = curInd;␍␊ |
| if ((matcher->getFlags() & Matcher::MATCH_ENTIRE_STRING) != 0)␍␊ |
| {␍␊ |
| if (curInd == (int)str.size()) return curInd;␍␊ |
| matcher->ends[0] = -1;␍␊ |
| return -1;␍␊ |
| }␍␊ |
| return curInd;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAQuantifierNode␍␊ |
| ␍␊ |
| void NFAQuantifierNode::findAllNodes(std::map<NFANode*, bool> & soFar)␍␊ |
| {␍␊ |
| inner->findAllNodes(soFar);␍␊ |
| NFANode::findAllNodes(soFar);␍␊ |
| }␍␊ |
| NFAQuantifierNode::NFAQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)␍␊ |
| {␍␊ |
| inner = internal;␍␊ |
| inner->next = pat->registerNode(new NFAAcceptNode);␍␊ |
| min = (minMatch < Pattern::MIN_QMATCH) ? Pattern::MIN_QMATCH : minMatch;␍␊ |
| max = (maxMatch > Pattern::MAX_QMATCH) ? Pattern::MAX_QMATCH : maxMatch;␍␊ |
| }␍␊ |
| ␍␊ |
| int NFAQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int i0, i1, i2 = 0;␍␊ |
| ␍␊ |
| i0 = i1 = curInd;␍␊ |
| while (i2 < min)␍␊ |
| {␍␊ |
| ++i2;␍␊ |
| i1 = inner->match(str, matcher, i0, depth + 1);␍␊ |
| if (i1 <= i0) return i1; // i1 < i0 means i1 is -1␍␊ |
| i0 = i1;␍␊ |
| }␍␊ |
| ␍␊ |
| return i1;␍␊ |
| }␍␊ |
| void NFAQuantifierNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(min=%d, max=%d)\n", indent * 2, ' ', typeid(*this).name(), min, max);␍␊ |
| printf("%*cinner\n", (indent + 1) * 2, ' ');␍␊ |
| if (inner) inner->print(indent + 2);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAGreedyQuantifierNode␍␊ |
| NFAGreedyQuantifierNode::NFAGreedyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)␍␊ |
| : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }␍␊ |
| int NFAGreedyQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int t = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);␍␊ |
| if (t != -1) return matchInternal(str, matcher, t, min, depth + 1);␍␊ |
| return t;␍␊ |
| }␍␊ |
| int NFAGreedyQuantifierNode::matchInternal(const std::string & str, Matcher * matcher, const int curInd, const int soFar, const int depth) const␍␊ |
| {␍␊ |
| if (soFar >= max) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| ␍␊ |
| int i, j;␍␊ |
| ␍␊ |
| i = inner->match(str, matcher, curInd, depth + 1);␍␊ |
| if (i != -1)␍␊ |
| {␍␊ |
| j = matchInternal(str, matcher, i, soFar + 1, depth + 1);␍␊ |
| if (j != -1) return j;␍␊ |
| }␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFALazyQuantifierNode␍␊ |
| ␍␊ |
| NFALazyQuantifierNode::NFALazyQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)␍␊ |
| : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }␍␊ |
| int NFALazyQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int i, j, m = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);␍␊ |
| ␍␊ |
| if (m == -1) return -1;␍␊ |
| ␍␊ |
| for (i = min; i < max; ++i)␍␊ |
| {␍␊ |
| j = next->match(str, matcher, m, depth + 1);␍␊ |
| if (j == -1)␍␊ |
| {␍␊ |
| j = inner->match(str, matcher, m, depth + 1);␍␊ |
| // if j < m, then j is -1, so we bail.␍␊ |
| // if j == m, then we would just go and call next->match on the same index,␍␊ |
| // but it already failed trying to match right there, so we know we can␍␊ |
| // just bail␍␊ |
| if (j <= m) return -1;␍␊ |
| m = j;␍␊ |
| }␍␊ |
| else return j;␍␊ |
| }␍␊ |
| return next->match(str, matcher, m, depth + 1);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAPossessiveQuantifierNode␍␊ |
| ␍␊ |
| NFAPossessiveQuantifierNode::NFAPossessiveQuantifierNode(Pattern * pat, NFANode * internal, const int minMatch, const int maxMatch)␍␊ |
| : NFAQuantifierNode(pat, internal, minMatch, maxMatch) { }␍␊ |
| int NFAPossessiveQuantifierNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int i, j, m = NFAQuantifierNode::match(str, matcher, curInd, depth + 1);␍␊ |
| ␍␊ |
| if (m == -1) return -1;␍␊ |
| for (i = min; i < max; ++i)␍␊ |
| {␍␊ |
| j = inner->match(str, matcher, m, depth + 1);␍␊ |
| if (j <= m) return next->match(str, matcher, m, depth + 1);␍␊ |
| m = j;␍␊ |
| }␍␊ |
| return next->match(str, matcher, m, depth + 1);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAAcceptNode␍␊ |
| ␍␊ |
| NFAAcceptNode::NFAAcceptNode() { }␍␊ |
| int NFAAcceptNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (!next) return curInd;␍␊ |
| else return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAClassNode␍␊ |
| ␍␊ |
| NFAClassNode::NFAClassNode(const bool invert)␍␊ |
| {␍␊ |
| inv = invert;␍␊ |
| }␍␊ |
| NFAClassNode::NFAClassNode(const std::string & clazz, const bool invert)␍␊ |
| {␍␊ |
| inv = invert;␍␊ |
| for (int i = 0; i < (int)clazz.size(); ++i) vals[clazz[i]] = 1;␍␊ |
| }␍␊ |
| int NFAClassNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd < (int)str.size() && ((vals.find(str[curInd]) != vals.end()) ^ inv))␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd + 1, depth + 1);␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFAClassNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(%d vals, inv=%s)\n", indent * 2, ' ', typeid(*this).name(), (int)vals.size(), inv ? "true" : "false");␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFACIClassNode␍␊ |
| ␍␊ |
| NFACIClassNode::NFACIClassNode(const bool invert)␍␊ |
| {␍␊ |
| inv = invert;␍␊ |
| }␍␊ |
| NFACIClassNode::NFACIClassNode(const std::string & clazz, const bool invert)␍␊ |
| {␍␊ |
| inv = invert;␍␊ |
| for (int i = 0; i < (int)clazz.size(); ++i) vals[tolower(clazz[i])] = 1;␍␊ |
| }␍␊ |
| int NFACIClassNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd < (int)str.size() && ((vals.find(tolower(str[curInd])) != vals.end()) ^ inv))␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd + 1, depth + 1);␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFACIClassNode::print(const int indent)␍␊ |
| {␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| #undef to_lower␍␊ |
| ␍␊ |
| // NFASubStartNode␍␊ |
| ␍␊ |
| NFASubStartNode::NFASubStartNode() { }␍␊ |
| int NFASubStartNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAOrNode␍␊ |
| ␍␊ |
| NFAOrNode::NFAOrNode(NFANode * first, NFANode * second) : one(first), two(second) { }␍␊ |
| void NFAOrNode::findAllNodes(std::map<NFANode*, bool> & soFar)␍␊ |
| {␍␊ |
| if (one) one->findAllNodes(soFar);␍␊ |
| if (two) two->findAllNodes(soFar);␍␊ |
| NFANode::findAllNodes(soFar);␍␊ |
| }␍␊ |
| int NFAOrNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ci = one->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ci != -1) return ci;␍␊ |
| return two->match(str, matcher, curInd, depth + 1);␍␊ |
| /*␍␊ |
| int ci = one->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ci != -1)␍␊ |
| {␍␊ |
| ci = next->match(str, matcher, ci, depth + 1);␍␊ |
| if (ci != -1) return ci;␍␊ |
| }␍␊ |
| ci = two->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ci != -1) ci = next->match(str, matcher, ci, depth + 1);␍␊ |
| return -1;␍␊ |
| */␍␊ |
| }␍␊ |
| void NFAOrNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s\n", indent * 2, ' ', typeid(*this).name());␍␊ |
| printf("%*cone\n", (indent + 1) * 2, ' ');␍␊ |
| one->print(indent + 2);␍␊ |
| printf("%*ctwo\n", (indent + 1) * 2, ' ');␍␊ |
| two->print(indent + 2);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAQuoteNode␍␊ |
| ␍␊ |
| NFAQuoteNode::NFAQuoteNode(const std::string & quoted) : qStr(quoted) { }␍␊ |
| int NFAQuoteNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd + qStr.size() > str.size()) return -1;␍␊ |
| if (str.substr(curInd, qStr.size()) != qStr) return -1;␍␊ |
| return next->match(str, matcher, curInd + qStr.size(), depth + 1);␍␊ |
| }␍␊ |
| void NFAQuoteNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(%s)\n", indent * 2, ' ', typeid(*this).name(), qStr.c_str());␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFACIQuoteNode␍␊ |
| ␍␊ |
| NFACIQuoteNode::NFACIQuoteNode(const std::string & quoted) : qStr(quoted) { }␍␊ |
| int NFACIQuoteNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd + qStr.size() > str.size()) return -1;␍␊ |
| if (str_icmp(str.substr(curInd, qStr.size()).c_str(), qStr.c_str())) return -1;␍␊ |
| return next->match(str, matcher, qStr.size(), depth + 1);␍␊ |
| }␍␊ |
| void NFACIQuoteNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(%s)\n", indent * 2, ' ', typeid(*this).name(), qStr.c_str());␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFALookAheadNode␍␊ |
| ␍␊ |
| NFALookAheadNode::NFALookAheadNode(NFANode * internal, const bool positive) : NFANode(), pos(positive), inner(internal) { }␍␊ |
| void NFALookAheadNode::findAllNodes(std::map<NFANode*, bool> & soFar)␍␊ |
| {␍␊ |
| if (inner) inner->findAllNodes(soFar);␍␊ |
| NFANode::findAllNodes(soFar);␍␊ |
| }␍␊ |
| int NFALookAheadNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| return ((inner->match(str, matcher, curInd, depth + 1) == -1) ^ pos) ? next->match(str, matcher, curInd, depth + 1) : -1;␍␊ |
| }␍␊ |
| void NFALookAheadNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(pos=%s)\n", indent * 2, ' ', typeid(*this).name(), pos ? "true" : "false");␍␊ |
| inner->print(indent + 1);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFALookBehindNode␍␊ |
| ␍␊ |
| NFALookBehindNode::NFALookBehindNode(const std::string & str, const bool positive) : pos(positive), mStr(str) { }␍␊ |
| int NFALookBehindNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (pos)␍␊ |
| {␍␊ |
| if (curInd < (int)mStr.size()) return -1;␍␊ |
| if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| if (curInd < (int)mStr.size()) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| if (str.substr(curInd - mStr.size(), mStr.size()) == mStr) return -1;␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFALookBehindNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(str=%s, pos=%s)\n", indent * 2, ' ', typeid(*this).name(), mStr.c_str(), pos ? "true" : "false");␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAStartOfLineNode␍␊ |
| ␍␊ |
| NFAStartOfLineNode::NFAStartOfLineNode() { }␍␊ |
| int NFAStartOfLineNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd == 0 || str[curInd - 1] == '\n' || str[curInd - 1] == '\r')␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAEndOfLineNode␍␊ |
| ␍␊ |
| NFAEndOfLineNode::NFAEndOfLineNode() { }␍␊ |
| int NFAEndOfLineNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd >= (int)str.size() || str[curInd] == '\n' || str[curInd] == '\r')␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAReferenceNode␍␊ |
| ␍␊ |
| NFAReferenceNode::NFAReferenceNode(const int groupIndex) : gi(groupIndex) { }␍␊ |
| int NFAReferenceNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int len = matcher->ends[gi] - matcher->starts[gi];␍␊ |
| int ni = -1;␍␊ |
| if (gi < 1 || matcher->ends[gi] < matcher->starts[gi] || len == 0) ni = curInd;␍␊ |
| else if (curInd + len > (int)str.size()) return -1;␍␊ |
| else if (str.substr(curInd, len) != str.substr(matcher->starts[gi], len)) return -1;␍␊ |
| else ni = curInd + len;␍␊ |
| ␍␊ |
| return next->match(str, matcher, ni, depth + 1);␍␊ |
| }␍␊ |
| void NFAReferenceNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAStartOfInputNode␍␊ |
| ␍␊ |
| NFAStartOfInputNode::NFAStartOfInputNode() { }␍␊ |
| int NFAStartOfInputNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd == 0) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| return -1;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAEndOfInputNode␍␊ |
| ␍␊ |
| NFAEndOfInputNode::NFAEndOfInputNode(const bool lookForTerm) : term(lookForTerm) { }␍␊ |
| int NFAEndOfInputNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int len = (int)str.size();␍␊ |
| if (curInd == len) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| else if (term)␍␊ |
| {␍␊ |
| if (curInd == len - 1 && (str[curInd] == '\r' || str[curInd] == '\n'))␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| else if (curInd == len - 2 && str.substr(curInd, 2) == "\r\n")␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| void NFAEndOfInputNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(term=%s)\n", indent * 2, ' ', typeid(*this).name(), term ? "true" : "false");␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAWordBoundaryNode␍␊ |
| ␍␊ |
| NFAWordBoundaryNode::NFAWordBoundaryNode(const bool positive) : pos(positive) { }␍␊ |
| int NFAWordBoundaryNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| #define is_alpha(x) (((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))␍␊ |
| ␍␊ |
| int len = (int)str.size();␍␊ |
| bool ok = 0;␍␊ |
| char c1 = (curInd - 1 < len) ? str[curInd - 1] : -1;␍␊ |
| char c2 = (curInd < len) ? str[curInd ] : -1;␍␊ |
| ␍␊ |
| if (curInd == len) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| if (is_alpha(c1) ^ is_alpha(c2)) ok = 1;␍␊ |
| if (ok && pos) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| return -1;␍␊ |
| ␍␊ |
| #undef is_alpha␍␊ |
| }␍␊ |
| void NFAWordBoundaryNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(pos=%s)\n", indent * 2, ' ', typeid(*this).name(), pos ? "true" : "false");␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAEndOfMatchNode␍␊ |
| ␍␊ |
| NFAEndOfMatchNode::NFAEndOfMatchNode() { }␍␊ |
| int NFAEndOfMatchNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| if (curInd == matcher->lm) return next->match(str, matcher, curInd, depth + 1);␍␊ |
| return -1;␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAGroupHeadNode␍␊ |
| ␍␊ |
| NFAGroupHeadNode::NFAGroupHeadNode(const int groupIndex) : gi(groupIndex) { }␍␊ |
| int NFAGroupHeadNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ret, o = matcher->starts[gi];␍␊ |
| ␍␊ |
| matcher->starts[gi] = curInd;␍␊ |
| ret = next->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ret < 0) matcher->starts[gi] = o;␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| void NFAGroupHeadNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAGroupTailNode␍␊ |
| ␍␊ |
| NFAGroupTailNode::NFAGroupTailNode(const int groupIndex) : gi(groupIndex) { }␍␊ |
| int NFAGroupTailNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ret, o = matcher->ends[gi];␍␊ |
| matcher->ends[gi] = curInd;␍␊ |
| ret = next->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ret < 0) matcher->ends[gi] = o;␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| void NFAGroupTailNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAGroupLoopPrologueNode␍␊ |
| ␍␊ |
| NFAGroupLoopPrologueNode::NFAGroupLoopPrologueNode(const int groupIndex) : gi(groupIndex) { }␍␊ |
| int NFAGroupLoopPrologueNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ret, o1 = matcher->groups[gi], o2 = matcher->groupPos[gi], o3 = matcher->groupIndeces[gi];␍␊ |
| ␍␊ |
| matcher->groups[gi] = 0;␍␊ |
| matcher->groupPos[gi] = 0;␍␊ |
| matcher->groupIndeces[gi] = -1;␍␊ |
| ret = next->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ret < 0)␍␊ |
| {␍␊ |
| matcher->groups[gi] = o1;␍␊ |
| matcher->groupPos[gi] = o2;␍␊ |
| matcher->groupIndeces[gi] = o3;␍␊ |
| }␍␊ |
| ␍␊ |
| return ret;␍␊ |
| }␍␊ |
| void NFAGroupLoopPrologueNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(gi=%d)\n", indent * 2, ' ', typeid(*this).name(), gi);␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| // NFAGroupLoopNode␍␊ |
| ␍␊ |
| NFAGroupLoopNode::NFAGroupLoopNode(NFANode * internal, const int minMatch, const int maxMatch,␍␊ |
| const int groupIndex, const int matchType)␍␊ |
| {␍␊ |
| inner = internal;␍␊ |
| min = minMatch;␍␊ |
| max = maxMatch;␍␊ |
| gi = groupIndex;␍␊ |
| type = matchType;␍␊ |
| }␍␊ |
| void NFAGroupLoopNode::findAllNodes(std::map<NFANode*, bool> & soFar)␍␊ |
| {␍␊ |
| if (inner) inner->findAllNodes(soFar);␍␊ |
| NFANode::findAllNodes(soFar);␍␊ |
| }␍␊ |
| int NFAGroupLoopNode::match(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| bool b = (curInd > matcher->groupIndeces[gi]);␍␊ |
| ␍␊ |
| if (b && matcher->groups[gi] < min)␍␊ |
| {␍␊ |
| ++matcher->groups[gi];␍␊ |
| int o = matcher->groupIndeces[gi];␍␊ |
| matcher->groupIndeces[gi] = curInd;␍␊ |
| int ret = inner->match(str, matcher, curInd, depth + 1);␍␊ |
| if (ret < 0)␍␊ |
| {␍␊ |
| matcher->groupIndeces[gi] = o;␍␊ |
| --matcher->groups[gi];␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| else if (!b || matcher->groups[gi] >= max)␍␊ |
| {␍␊ |
| return next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| else␍␊ |
| {␍␊ |
| switch (type)␍␊ |
| {␍␊ |
| case 0: return matchGreedy(str, matcher, curInd, depth + 1);␍␊ |
| case 1: return matchLazy(str, matcher, curInd, depth + 1);␍␊ |
| case 2: return matchPossessive(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| }␍␊ |
| return -1;␍␊ |
| }␍␊ |
| int NFAGroupLoopNode::matchGreedy(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int o = matcher->groupIndeces[gi]; // save our info for backtracking␍␊ |
| matcher->groupIndeces[gi] = curInd; // move along␍␊ |
| ++matcher->groups[gi];␍␊ |
| int ret = inner->match(str, matcher, curInd, depth + 1); // match internally␍␊ |
| if (ret < 0)␍␊ |
| { // if we failed, then restore info and match next␍␊ |
| --matcher->groups[gi];␍␊ |
| matcher->groupIndeces[gi] = o;␍␊ |
| ret = next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| int NFAGroupLoopNode::matchLazy(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int ret = next->match(str, matcher, curInd, depth + 1); // be lazy, just go on␍␊ |
| if (ret < 0)␍␊ |
| {␍␊ |
| int o = matcher->groupIndeces[gi]; // save info for backtracking␍␊ |
| matcher->groupIndeces[gi] = curInd; // advance our position␍␊ |
| ++matcher->groups[gi];␍␊ |
| ret = inner->match(str, matcher, curInd, depth + 1); // match our internal stuff␍␊ |
| if (ret < 0) // if we failed, then restore the info␍␊ |
| {␍␊ |
| --matcher->groups[gi];␍␊ |
| matcher->groupIndeces[gi] = o;␍␊ |
| }␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| int NFAGroupLoopNode::matchPossessive(const std::string & str, Matcher * matcher, const int curInd, const int depth) const␍␊ |
| {␍␊ |
| int o = matcher->groupIndeces[gi]; // save info for backtracking␍␊ |
| matcher->groupPos[gi] = matcher->groups[gi]; // set a flag stating we have matcher at least this much␍␊ |
| matcher->groupIndeces[gi] = curInd; // move along␍␊ |
| ++matcher->groups[gi];␍␊ |
| int ret = inner->match(str, matcher, curInd, depth + 1); // try and match again␍␊ |
| if (ret < 0)␍␊ |
| { // if we fail, back off, but to an extent␍␊ |
| --matcher->groups[gi];␍␊ |
| matcher->groupIndeces[gi] = o;␍␊ |
| if (matcher->groups[gi] == matcher->groupPos[gi]) ret = next->match(str, matcher, curInd, depth + 1);␍␊ |
| }␍␊ |
| return ret;␍␊ |
| }␍␊ |
| void NFAGroupLoopNode::print(const int indent)␍␊ |
| {␍␊ |
| printf("%*c%s(gi=%d, min=%d, max=%d, type=%s)\n", indent * 2, ' ', typeid(*this).name(), gi, min, max, type == 0 ? "greedy" : type == 1 ? "reluctant" : "possessive");␍␊ |
| if (next) next->print(indent);␍␊ |
| }␍␊ |
| ␍␊ |
| #ifdef _WIN32␍␊ |
| #pragma warning(pop)␍␊ |
| #endif␍␊ |