-
Notifications
You must be signed in to change notification settings - Fork 0
/
regexEngine.js
179 lines (168 loc) · 8.29 KB
/
regexEngine.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
var regexEngine = function(regex, string) {
// Since JavaScript doesn't support pointer arithmetic, we mimic
// that behavior using counter variables which indicate which position
// in the array we want to be referencing at each step in the recursion
regexCounter = 0;
stringCounter = 0;
function match() {
// If regex starts with ^ then string must begin with match of remainder of expression
if (regex[0] === '^') {
// If regex matches starting with first character, this will return true
// Otherwise it will return false
return matchHere(regexCounter+1, stringCounter);
}
// Else, walk through string and check to see if string match beginning of expression at each point in string
// This is an example of backtracking. If the regex doesn't match at this point in the string, the next
// character in the string will be consumed and the regex will be checked again starting at that point.
for (var i = 0; i < string.length; i++) {
if (matchHere(regexCounter, stringCounter + i)) {
return true;
}
}
// If the regex can't be matched starting at any point in the string, return false
return false;
}
// Variable shadowing --- Again, replacement for pointer arithmetic
function matchHere(regexCounter, stringCounter) {
// If we reach the end of the regex pattern, we've found a match
if (regexCounter === regex.length) {
return true;
}
// Handle * case
if (regex[regexCounter + 1] === '*') {
return matchStar(regex[regexCounter], regexCounter + 2, stringCounter);
}
// Handle + case
if (regex[regexCounter + 1] === '+') {
return matchPlus(regex[regexCounter], regexCounter + 2, stringCounter);
}
// If the current regex char specfies the end of the string then return true or false depending on whether we've reached the end of the string
if (regex[regexCounter] === '$' && regexCounter === regex.length -1) {
return stringCounter === string.length;
}
// If we're not at the end of the string and the regular expression character is a wildcard or the same as the string character, continue recursing
if (stringCounter !== string.length && (regex[regexCounter] === '.' || regex[regexCounter] === string[stringCounter])) {
return matchHere(regexCounter + 1, stringCounter + 1);
}
// If all else fails, return false. No match.
return false;
};
// This is a lazy (non-greedy) implementation of matchStart. This means that matchStart will attempt to match the minimum number of characters
// possible, as opposed to greedily matching as many as possible. There is no difference in the final output between using greedy and non-greedy
// matching, but it will alter how fast the regex-engine runs on certain inputs based on the amount of backtracking that needs to occur.
function matchStar(starChar, regexCounter, stringCounter) {
// Lazy (non-greedy) implementation of star. Since the minimum number of characters a start can match is 0, can immediately check if regex
// matches from this point onward, return true since the * does not need to consume any characters.
if (matchHere(regexCounter, stringCounter)) {
return true;
};
// If consuming 0 characters fails, consume one character and check, then continue consuming a character and checking if the regex pattern
// matches until there are no more characters to consume.
stringCounter++;
while (stringCounter !== string.length && (string[stringCounter] === starChar || starChar === '.')) {
if (matchHere(regexCounter, stringCounter)) {
return true;
}
stringCounter++;
}
// If the regex pattern was never able to match return false
return false;
};
// This is a greedy implementation of plus. It will match / consume as many character as possible. If matching the regex pattern after it has coonsumed
// as many characters as possible fails, then it will relinquish a character and try again, repeating this process until only one character remains. If
// the regex still fails to match at that point, then it will return false.
function matchPlus(plusChar, regexCounter, stringCounter) {
// Count for keeping track of how many characters have been matched. Minimum is 1 for plus to be successful.
var numFound = 0;
// If the first character doesn't match (and we're not using a . to match all characters) then return false
if (string[stringCounter] !== plusChar && plusChar !== '.') {
return false;
}
// This is where the greediness happens. Keep incrementing stringCounter (equivalent to consuming a character) until the character no longer matches
while (string[stringCounter] === plusChar || (plusChar === '.' && stringCounter !== string.length)) {
stringCounter++;
numFound++;
}
// After greedily consuming all the characters, check if the regamining regex/string combination matches.
if (matchHere(regexCounter, stringCounter) && numFound >= 1) {
return true;
}
// If it doesn't match, then relinquish a character and try again. Continue doing so until a match is found, or all characters (except one) have
// been relinquished. If only one character remains and the pattern has not matched, then return false as plus requires at least one character match.
else {
stringCounter--;
numFound--;
while (string[stringCounter] === plusChar || (plusChar === '.' && stringCounter !== string.length && stringCounter !== 0)) {
if (matchHere(regexCounter, stringCounter) && numFound >= 1) {
return true;
}
stringCounter--;
numFound--;
}
return false;
}
};
// This function is for instruction only, it shows what a lazy (non-greedy) implementation of match plus would look like. It handles certain pathological
// regex patterns / string combinations (like regex: a+a+a+a+a+a+a+a+ and string: aaaaaaaaaaaaaaaaaaaaaab) much more efficiently
function matchPlusLazy(plusChar, regexCounter, stringCounter) {
var numFound = 0;
if (string[stringCounter] !== plusChar && plusChar !== '.') {
return false;
}
// This is lazy (non-greedy) implementation. It consumes on character only, then tries to match the remaining regex / string. If that fails, it then consumes
// another token. Rinse and repeat until it reaches the end of the string.
while (string[stringCounter] === plusChar || (plusChar === '.' && stringCounter !== string.length)) {
stringCounter++;
numFound++;
if (matchHere(regexCounter, stringCounter) && numFound >= 1) {
return true;
}
}
// Return false if it never matches
return false;
};
return match();
}
// Simple testing functionality
var tester = {
numAssertions: 0,
numSuccessful: 0,
runTest: function(regex,string, result) {
this.numAssertions++;
if (regexEngine(regex, string) === result) {
this.numSuccessful++;
}
},
finishTests: function() {
console.log('Tests complete: ' + this.numSuccessful + '/' + this.numAssertions + ' assertions successful');
}
};
tester.runTest('a','a', true);
tester.runTest('a', 'b', false);
tester.runTest('ab', 'ab', true);
tester.runTest('ab', 'abc', true);
tester.runTest('ab$', 'abc', false);
tester.runTest('^dab', 'dabb', true);
tester.runTest('a*b', 'b', true);
tester.runTest('a*b', 'ab', true);
tester.runTest('a*b', 'aaaaab', true);
tester.runTest('a*b', 'a', false);
tester.runTest('a+', 'a', true);
tester.runTest('a+', 'ab', true);
tester.runTest('a+', 'b', false);
tester.runTest('a+', '', false);
tester.runTest('.*ab', 'asdfadsab', true);
tester.runTest('.*ab', 'ab', true);
tester.runTest('.*ab', 'absdfadsfa', true);
tester.runTest('.+ab', 'asdfadsab', true);
tester.runTest('.+ab', 'ab', false);
tester.runTest('.+ab', 'absdfadsfa', false);
tester.runTest('.+ab', 'aab', true);
tester.runTest('d+a+bb', 'dabb', true);
tester.runTest('d+a+abb', 'dabb', false);
tester.runTest('d+a+a*bb', 'dabb', true);
tester.runTest('d+a+a*bb', 'daaaaaabb', true);
// Example of a pathological regex / string combination that causes a LOT of backtracking with the greedy plus character, potentially crashes browser in some cases.
// However if its run with the lazy version of matchPlus, it executes extremely quickly.
// tester.runTest('a+a+a+a+a+a+a+a+a+', 'aaaaaaaaaaaaaaaaaaaaaab', true);
tester.finishTests();