Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support atomic grouping and possessive quantifiers, and named group, and lookbehind #55

Open
wants to merge 6 commits into
base: legacy
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions dist/regulex.js

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@
display:block;
margin:0 auto;
}
#graphCt svg text {
white-space: pre;
}
footer div.normal {
text-align: center;
border-top:1px solid grey;
Expand Down
23 changes: 20 additions & 3 deletions src/RegExp.js
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ var NFABuilders=(function _() {
var groupStart=[newState()];
var ts=[{
from:from,to:groupStart,charset:false,
action:!node.nonCapture && function _groupStart(stack,c,i) {
action:!node.nonCapture && !node.atomicGroup && function _groupStart(stack,c,i) {
stack.unshift({type:GROUP_CAPTURE_START,num:node.num,index:i});
}
}];
Expand All @@ -223,7 +223,7 @@ var NFABuilders=(function _() {
var groupEnd=[newState()];
ts.push({
from:a.accepts,to:groupEnd,charset:false,
action:!node.nonCapture && function _groupEnd(stack,c,i) {
action:!node.nonCapture && !node.atomicGroup && function _groupEnd(stack,c,i) {
stack.unshift({type:GROUP_CAPTURE_END,num:node.num,index:i});
}
});
Expand Down Expand Up @@ -284,6 +284,12 @@ var NFABuilders=(function _() {
case AssertNegativeLookahead:
f=_negativeLookahead(node);
break;
case AssertLookbehind:
f=_lookbehind(node);
break;
case AssertNegativeLookbehind:
f=_negativeLookbehind(node);
break;
}
return _newAssert(node,from,f);

Expand All @@ -304,10 +310,21 @@ var NFABuilders=(function _() {
return ret.acceptable;
};
}
function _lookbehind(node) {
var m=NFA(tree2NFA(node.sub,['start']));
return function _Lookbehind(stack,c,i,state,s) {
var ret=m.input(s,i,null,stack);
return ret.acceptable;
};
}
function _negativeLookahead(node) {
var f=_lookahead(node);
return function _NLookahead() {return !f.apply(this,arguments)};
}
function _negativeLookbehind(node) {
var f=_lookbehind(node);
return function _NLookbehind() {return !f.apply(this,arguments)};
}

function _isBoundary(i,s) {return !!(_isWordChar(i-1,s) ^ _isWordChar(i,s))}
function _isWordChar(i,s) {return i!==-1 && i!==s.length && /\w/.test(s[i])}
Expand Down Expand Up @@ -359,7 +376,7 @@ function repeatNFA(node,from) {
});
}
var endState=[newState()];
if (repeat.nonGreedy) {
if (repeat.nonGreedy || repeat.possessive) {
trans.push({
from:accepts,to:endState,charset:false
});
Expand Down
146 changes: 120 additions & 26 deletions src/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ var Constants={
//Assertion Type Constants
AssertLookahead:"AssertLookahead",
AssertNegativeLookahead:"AssertNegativeLookahead",
AssertLookbehind:"AssertLookbehind",
AssertNegativeLookbehind:"AssertNegativeLookbehind",
AssertNonWordBoundary:"AssertNonWordBoundary",
AssertWordBoundary:"AssertWordBoundary",
AssertEnd:"AssertEnd",
Expand Down Expand Up @@ -350,8 +352,14 @@ function _checkRepeat(node) {
if (node.repeat) {
var astype = node.assertionType;
var msg = 'Nothing to repeat! Repeat after assertion doesn\'t make sense!';
if (astype === 'AssertLookahead' || astype === 'AssertNegativeLookahead' ) {
var assertifier = astype === 'AssertLookahead' ? '?=' : '?!';
var assertifiers = {
'AssertLookahead': '?=',
'AssertNegativeLookahead': '?!',
'AssertLookbehind': '?<=',
'AssertNegativeLookbehind': '?<!',
}
var assertifier = assertifiers[astype];
if (assertifier !== undefined) {
var pattern = '('+assertifier+'b)';
msg += '\n/a'+pattern+'+/、/a'+pattern+'{1,n}/ are the same as /a'+pattern+'/。\n' +
'/a'+pattern+'*/、/a'+pattern+'{0,n}/、/a'+pattern+'?/ are the same as /a/。';
Expand Down Expand Up @@ -492,7 +500,7 @@ var actions=(function _() {
function repeat01(stack,c,i,state,s) { _repeat(stack,0,1,i,s) } // e.g. /a?/
function repeat1(stack,c,i,state,s) { _repeat(stack,1,Infinity,i,s) } // e.g. /a+/
function _repeat(stack,min,max,charEndIndex,s) {
var last=stack[0],repeat={min:min,max:max,nonGreedy:false},
var last=stack[0],repeat={min:min,max:max,nonGreedy:false,possessive:false},
charIndex=charEndIndex-1;
if (last.chars && last.chars.length===1) charIndex=last.indices[0];
if (last.type===EXACT_NODE) { // exact node only repeat last char
Expand All @@ -509,6 +517,7 @@ var actions=(function _() {
_set(repeat,'beginIndex',charEndIndex-stack[0].indices[0]);
}
function repeatNonGreedy(stack) { stack[0].repeat.nonGreedy=true}
function repeatPossessive(stack) { stack[0].repeat.possessive=true}

function escapeStart(stack,c,i) {
stack.unshift({
Expand Down Expand Up @@ -566,14 +575,38 @@ var actions=(function _() {
group.num=undefined;
stack.groupCounter.i--;
}
function groupToAssertion(stack,c,i) { // Convert /(?!)/,/(?=)/ to AssertNode
function groupToAssertion(stack,c,i,state) { // Convert /(?!)/,/(?=)/ to AssertNode
var group=stack._parentGroup;
group.type=ASSERT_NODE;
group.assertionType= c=='=' ? AssertLookahead : AssertNegativeLookahead ;
if (state === 'groupNameStart') {
group.assertionType= c==='=' ? AssertLookbehind : AssertNegativeLookbehind ;
}
// Caveat!!! Assertion group no need to capture
group.num=undefined;
stack.groupCounter.i--;
}
function groupAtomicGroup(stack) { // /(?>)/
var group=stack._parentGroup
group.atomicGroup=true;
group.num=undefined;
stack.groupCounter.i--;
}
function groupNameP(stack) {
var group=stack._parentGroup
group.nameP=true;
}
function groupName(stack,c,i,state,s) {
var group=stack._parentGroup
if (state === 'groupNameStart') {
group.name = c;
} else if (state === 'groupNameStartApos') {
group.name = c;
group.aops = true;
} else {
group.name += c;
}
}
function groupEnd(stack,c,i,state,s) {
stack=endChoice(stack); // restore group's stack from choice
var group=stack._parentGroup;
Expand Down Expand Up @@ -747,11 +780,6 @@ var actions=(function _() {
type:'InvalidBackReference',lastIndex:i,astStack:stack,lastState:state,
message:'Back reference number('+n+') greater than current groups count('+cn+').'
});
} else if (_isRecursive(n,stack)) {
throw new RegexSyntaxError({
type:'InvalidBackReference',lastIndex:i,astStack:stack,lastState:state,
message:'Recursive back reference in group ('+n+') itself.'
});
}
last.num=n;

Expand All @@ -762,18 +790,45 @@ var actions=(function _() {
}
}

function nameBackref(stack,c,i,state) {
var last=stack[0];
if (state === 'nameBackrefStartP') {
var group=stack._parentGroup;
delete stack._parentGroup;
delete stack.groupCounter;
stack=group._parentStack;
delete group._parentStack;
last={type:BACKREF_NODE,indices:[i-4]};
stack.unshift(last);
last.name = '';
stack.groupCounter.i--;
} else if (state === 'nameBackrefStart' || state === 'nameBackrefStartAops' || state === 'nameBackrefStartCub') {
last={type:BACKREF_NODE,indices:[i-3]};
stack.unshift(last);
last.name = '';
}
last.name += c;
if (state === 'nameBackrefStartP') {
return stack;
}
}

//console.log(K.locals(_));

return {
escapeStart:escapeStart,
exact:exact,dot:dot,nullChar:nullChar,assertBegin:assertBegin,
assertEnd:assertEnd,assertWordBoundary:assertWordBoundary,
repeatnStart:repeatnStart,repeatnComma:repeatnComma,repeatNonGreedy:repeatNonGreedy,
repeatPossessive:repeatPossessive,
repeatnEnd:repeatnEnd,repeat1:repeat1,repeat01:repeat01,repeat0:repeat0,
charClassEscape:charClassEscape,normalEscape:normalEscape,
unicodeEscape:unicodeEscape,hexEscape:hexEscape,charClassEscape:charClassEscape,
groupStart:groupStart,groupNonCapture:groupNonCapture,backref:backref,
nameBackref:nameBackref,
groupAtomicGroup:groupAtomicGroup,
groupToAssertion:groupToAssertion,groupEnd:groupEnd,
groupName:groupName, groupNameP:groupNameP,
choice:choice,endChoice:endChoice,
charsetStart:charsetStart,charsetExclude:charsetExclude,
charsetContent:charsetContent,charsetNullChar:charsetNullChar,
Expand Down Expand Up @@ -806,7 +861,8 @@ var normalEscapeInCharsetEX='^'+charClassEscape+unicodeEscape+hexEscape+'0-9';

// 'rntvf\\' escape ,others return raw
// Also need exclude \b\B assertion and backref
var normalEscapeEX=normalEscapeInCharsetEX+'bB1-9';
// Also need exclude \k\g (named backref)
var normalEscapeEX=normalEscapeInCharsetEX+'bB1-9'+'kg';

//var controlEscape;//Never TODO.Same reason as OctEscape.

Expand All @@ -829,18 +885,18 @@ var charsetRangeEndIncompleteEscapeStates=charsetRangeEndIncompleteEscapeFirstSt

var config={
compact:true,
accepts:'start,begin,end,repeat0,repeat1,exact,repeatn,repeat01,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates),
accepts:'start,begin,end,repeat0,repeat1,exact,repeatn,repeat01,repeatNonGreedy,repeatPossessive,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates),
trans:[
['start,begin,end,exact,repeatNonGreedy,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,choice,repeatn>exact',exactEXCharset,actions.exact],
['start,begin,end,exact,repeatNonGreedy,repeatPossessive,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,groupNameEnd,choice,repeatn>exact',exactEXCharset,actions.exact],
// e.g. /\u54/ means /u54/
[allHexEscapeStates+'>exact',exactEXCharset+hexDigit,actions.exact],
// e.g. /\0abc/ is exact "\0abc",but /\012/ is an error
['nullChar>exact',exactEXCharset+digit,actions.exact],
//[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>exact',exactEXCharset+'']
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',start,begin,end,exact,repeatNonGreedy,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,choice,repeatn>exact','.',actions.dot],
['start,groupStart,groupQualifiedStart,end,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates+'>begin','^',actions.assertBegin],
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',exact>repeatnStart','{',actions.repeatnStart],
['start,begin,end,groupQualifiedStart,groupStart,repeat0,repeat1,repeatn,repeat01,repeatNonGreedy,choice>repeatnErrorStart','{',actions.exact],//No repeat,treat as exact char e.g. /{/,/^{/,/a|{/
[(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+',start,begin,end,exact,repeatNonGreedy,repeatPossessive,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,groupNameEnd,choice,repeatn>exact','.',actions.dot],
['start,groupStart,groupQualifiedStart,groupNameEnd,end,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,repeatPossessive,choice,'+repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates+'>begin','^',actions.assertBegin],
[(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+',exact>repeatnStart','{',actions.repeatnStart],
['start,begin,end,groupQualifiedStart,groupNameEnd,groupStart,repeat0,repeat1,repeatn,repeat01,repeatNonGreedy,repeatPossessive,choice>repeatnErrorStart','{',actions.exact],//No repeat,treat as exact char e.g. /{/,/^{/,/a|{/
['repeatnStart>repeatn_1',digit,actions.exact], // Now maybe /a{1/
['repeatn_1>repeatn_1',digit,actions.exact], // Could be /a{11/
['repeatn_1>repeatn_2',',',actions.repeatnComma], // Now maybe /a{1,/
Expand All @@ -865,15 +921,16 @@ var config={
// "/a{2,a/" and "/{3,a" are valid
['repeatn_2,repeatnError_2>exact',exactEXCharset+digit+'}',actions.exact],

['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat0','*',actions.repeat0],
['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat1','+',actions.repeat1],
['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat01','?',actions.repeat01],
['exact,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat0','*',actions.repeat0],
['exact,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat1','+',actions.repeat1],
['exact,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat01','?',actions.repeat01],
['choice>repeatErrorFinal','*+?'],
['repeat0,repeat1,repeat01,repeatn>repeatNonGreedy','?',actions.repeatNonGreedy],
['repeat0,repeat1,repeat01,repeatn>repeatErrorFinal','+*'],
['repeat0,repeat1,repeat01,repeatn>repeatPossessive','+',actions.repeatPossessive],
['repeat0,repeat1,repeat01,repeatn>repeatErrorFinal','*'],

// Escape
['start,begin,end,groupStart,groupQualifiedStart,exact,repeatNonGreedy,repeat0,repeat1,repeat01,repeatn,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>escape','\\',actions.escapeStart],
['start,begin,end,groupStart,groupQualifiedStart,groupNameEnd,exact,repeatNonGreedy,repeatPossessive,repeat0,repeat1,repeat01,repeatn,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>escape','\\',actions.escapeStart],
['escape>nullChar','0',actions.nullChar],
['nullChar>digitFollowNullError','0-9'], // "/\0123/" is invalid in standard
['escape>exact',normalEscapeEX,actions.normalEscape],
Expand All @@ -891,22 +948,59 @@ var config={
['escape>digitBackref','1-9',actions.backref],
['digitBackref>digitBackref',digit,actions.backref],
['digitBackref>exact',exactEXCharset+digit,actions.exact],
['escape>nameBackrefK','k'],
// k<name>
['nameBackrefK>nameBackrefStart','<'],
['nameBackrefStart>nameBackref','a-zA-Z_',actions.nameBackref],
['nameBackref>nameBackref','a-zA-Z_0-9',actions.nameBackref],
['nameBackref>nameBackrefEnd','>'],
// k'name'
['nameBackrefK>nameBackrefStartAops','\''],
['nameBackrefStartAops>nameBackrefAops','a-zA-Z_',actions.nameBackref],
['nameBackrefAops>nameBackrefAops','a-zA-Z_0-9',actions.nameBackref],
['nameBackrefAops>nameBackrefEnd','\''],
// k{name}, g{name}
['escape>nameBackrefG','g'],
['nameBackrefK,nameBackrefG>nameBackrefStartCub','{'],
['nameBackrefStartCub>nameBackrefCub','a-zA-Z_',actions.nameBackref],
['nameBackrefCub>nameBackrefCub','a-zA-Z_0-9',actions.nameBackref],
['nameBackrefCub>nameBackrefEnd','}'],
// python format: (?P=...)
['groupNameP>nameBackrefStartP','='],
['nameBackrefStartP>nameBackrefP','a-zA-Z_',actions.nameBackref],
['nameBackrefP>nameBackrefP','a-zA-Z_0-9',actions.nameBackref],
['nameBackrefP>nameBackrefEnd',')'],
['nameBackrefEnd>exact',exactEXCharset+digit,actions.exact],

// Group start
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,start,groupStart,groupQualifiedStart,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>groupStart','(',actions.groupStart],
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,repeatPossessive,start,groupStart,groupQualifiedStart,groupNameEnd,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>groupStart','(',actions.groupStart],
['groupStart>groupQualify','?'],
['groupQualify>groupQualifiedStart',':',actions.groupNonCapture],//group non-capturing
['groupQualify>groupQualifiedStart','=',actions.groupToAssertion],//group positive lookahead
['groupQualify>groupQualifiedStart','!',actions.groupToAssertion],//group negative lookahead
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',groupStart,groupQualifiedStart,begin,end,exact,repeat1,repeat0,repeat01,repeatn,repeatNonGreedy,choice>exact',')',actions.groupEnd],//group end
['groupNameStart>groupQualifiedStart','=',actions.groupToAssertion],//group positive lookbehind
['groupNameStart>groupQualifiedStart','!',actions.groupToAssertion],//group negative lookbehind
['groupQualify>groupQualifiedStart','>',actions.groupAtomicGroup],//group atomic-group
// python
['groupQualify>groupNameP','P',actions.groupNameP],
['groupQualify,groupNameP>groupNameStart','<'],
['groupNameStart>groupName','a-zA-Z_', actions.groupName],//group name
['groupName>groupName','a-zA-Z_0-9', actions.groupName],//group name
// single quote
['groupQualify>groupNameStartApos','\''],
['groupNameStartApos>groupNameApos','a-zA-Z_', actions.groupName],//group name
['groupNameApos>groupNameApos','a-zA-Z_0-9', actions.groupName],//group name
['groupNameApos>groupNameEnd','\''],
['groupName>groupNameEnd','>'],
[(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+',groupStart,groupQualifiedStart,groupNameEnd,begin,end,exact,repeat1,repeat0,repeat01,repeatn,repeatNonGreedy,repeatPossessive,choice>exact',')',actions.groupEnd],//group end

//choice
['start,begin,end,groupStart,groupQualifiedStart,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>choice','|', actions.choice],
['start,begin,end,groupStart,groupQualifiedStart,groupNameEnd,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,repeatPossessive,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>choice','|', actions.choice],

['start,groupStart,groupQualifiedStart,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>end','$',actions.assertEnd],
['start,groupStart,groupQualifiedStart,groupNameEnd,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,repeatPossessive,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>end','$',actions.assertEnd],

// Charset [HA-HO]
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,groupQualifiedStart,groupStart,start,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>charsetStart','[',actions.charsetStart],
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,repeatPossessive,groupQualifiedStart,groupNameEnd,groupStart,start,choice,'+(repeatnStates+',nullChar,digitBackref,nameBackrefEnd,'+unicodeEscapeStates+','+hexEscapeStates)+'>charsetStart','[',actions.charsetStart],
['charsetStart>charsetExclude','^',actions.charsetExclude],
['charsetStart>charsetContent','^\\]^',actions.charsetContent],
['charsetExclude>charsetContent','^\\]',actions.charsetContent], // "[^^]" is valid
Expand Down
Loading