"use strict";
/**
* Generic parser implementation
*
* @author
* Copyright (c) 2010, 2013 Thomas 'PointedEars' Lahn <js@PointedEars.de>
* @requires object.js
*/
/**
 * Root namespace of the library; reused when it already exists.
 *
 * @namespace
 */
var jsx;
if (typeof jsx == "undefined")
{
  /* Nothing has defined the namespace yet: start with an empty one */
  jsx = {};
}

if (typeof jsx.string == "undefined")
{
  /**
   * Namespace for string-related features; reused when it already exists.
   *
   * @namespace
   */
  jsx.string = {};
}
/**
* @namespace
*/
jsx.string.parser = (/** @constructor */ function () {
/**
* Returns a global regular expression.
*
* Returns a {@link RegExp} that has the same pattern as
* <code>rx</code>, but has the <code>global</code> flag
* set and has other flags set according to a property source.
*
* @param {RegExp|String} rx
* @param {Object} propertySource
* @return {RegExp}
*/
function _globalize (rx, propertySource)
{
  /* Use rx.source when it is truthy, otherwise rx itself */
  var pattern = rx.source || rx;

  /* The result is always global; the other flags are optional */
  var flags = ["g"];

  /* Case-insensitivity is taken from the property source only */
  if (propertySource.ignoreCase)
  {
    flags.push("i");
  }

  /* Multiline is set if either rx or the property source asks for it */
  if (rx.multiline || propertySource.multiline)
  {
    flags.push("m");
  }

  return new RegExp(pattern, flags.join(""));
}
/**
* A token as recognized by a {@link #Lexer}.
*
* @property {RegExp|String} pattern
* See the constructor's pattern argument
* @property type
* See the constructor's type argument
* @property {Array} match
* The text and substrings that the token matched last.
* @method parse({@link #Parser} parser)
* When this method is defined, a {@link #Parser} calls it instead of
* the parser's {@link #Parser.prototype.parseToken parseToken(Token)} method.
* A <code>Token</code> may implement this method to parse the token using
* a specialized {@link #Lexer}. An implementation may
* {@link jsx.object#clone clone()} the parser to create a lexer
* that can find more or less tokens than the suggested one.
*/
var _Token =
  /**
   * Creates a token description from positional arguments or from
   * a parameter object.
   *
   * @constructor
   * @param {RegExp|String|Object} pattern
   *   The pattern to match the token.  Parenthesized subexpressions
   *   must be marked as non-capturing (<code>(?:…)</code>) for
   *   the correct token to be returned by the lexer.
   *   Any value that is neither a {@link RegExp} nor a string
   *   (primitive or {@link String} object) is treated as
   *   a parameter object whose <code>pattern</code> and
   *   <code>type</code> properties are used instead; the
   *   <code>type</code> argument is ignored in that case.
   * @param type (optional)
   *   A value to set the object's <code>type</code> property which
   *   can be used for recognizing the type of the matched token in
   *   the parser's {@link Parser.prototype#parseToken parseToken()}
   *   method.  Using a constant value is recommended.
   * @param {string} nextState (optional)
   *   The state that a lexer should enter after this token
   *   was read.  Always taken from this argument, also when
   *   a parameter object is passed.
   */
  function (pattern, type, nextState) {
    /*
     * A primitive string is not an instance of String, so it must be
     * detected with typeof; String objects are still accepted.
     */
    var isString = (typeof pattern == "string" || pattern instanceof String);

    if (pattern instanceof RegExp || isString)
    {
      this.pattern = pattern;
      this.type = type;
    }
    else
    {
      /* Parameter object */
      this.pattern = pattern.pattern;
      this.type = pattern.type;
    }

    this.nextState = nextState;
  };
/**
* A <code>Lexer</code> (lexical scanner) converts a sequence
* of characters in an input string into a sequence of
* {@link #Token Token}s which can be used by a {@link #Parser}.
*
* The list of patterns to match the tokens can also be built
* by calling {@link Lexer.prototype#addToken addToken()},
* where the calls for the preferred matches must come first,
* or with {@link Lexer.prototype#addTokens addTokens()}, where
* the arguments for the preferred matches must come first.
*
* A <code>Lexer</code> has a state that defines the
* <code>Token</code>s that it recognizes only when it is in
* that state.  When a <code>Token</code> has been recognized,
* the <code>Lexer</code>'s state can be changed (see
* {@link #Token} for details).  Initially, the state is
* {@link Lexer.STATE_START}.
*/
var _Lexer = jsx.object.extend(
  /**
   * @constructor
   * @param {Array} tokens
   *   (optional) Tokens to register right away using
   *   {@link Lexer.prototype#addTokens addTokens()}
   */
  function (tokens) {
    /* Registered tokens; stays undefined until addToken() is called */
    var _tokens;

    /**
     * Remembers, per token index, whether the pattern of that token
     * has already been replaced by its global {@link RegExp}.
     *
     * @private
     * @memberOf __jsx.string.parser.Lexer
     * @type Object{int: boolean}
     */
    var _compiled = {};

    this.state = _Lexer.STATE_START;

    /**
     * Returns the global {@link RegExp} for a token, creating it
     * on first use.  The <code>pattern</code> property of the token
     * is overwritten with that expression, and the flags are
     * derived from this lexer's properties.
     *
     * @protected
     * @memberOf jsx.string.parser.Lexer
     * @param {int} index
     *   Index of the token to be compiled
     * @return {RegExp}
     * @throws jsx.InvalidArgumentError
     *   if <code>index</code> is not an integer
     */
    this._compile = function (index) {
      if (typeof index != "number" || index % 1 !== 0)
      {
        return jsx.throwThis(jsx.InvalidArgumentError, "index must be an integer");
      }

      var expression = _tokens[index].pattern;
      if (!_compiled[index])
      {
        expression = _tokens[index].pattern = _globalize(
          _tokens[index].pattern,
          this);
        _compiled[index] = true;
      }

      return expression;
    };

    /**
     * Appends a {@link #Token} to the list of tokens.
     *
     * @param {jsx.string.parser.Token} token
     * @return {Lexer}
     *   This object
     */
    this.addToken = function (token) {
      if (!_tokens)
      {
        _tokens = [];
      }

      _tokens.push(token);

      return this;
    };

    /**
     * Returns the internal list of tokens (not a copy).
     *
     * @return {Array}
     *   The list of tokens, or <code>undefined</code> if no token
     *   has been added yet
     */
    this.getTokens = function () {
      return _tokens;
    };

    if (tokens)
    {
      this.addTokens(tokens);
    }
  },
  {
    /**
     * Initial state of a lexer
     *
     * @memberOf jsx.string.parser.Lexer
     */
    STATE_START: "S"
  }
).extend(null, {
  /**
   * <code>true</code> if character case should not matter
   *
   * @memberOf jsx.string.parser.Lexer.prototype
   */
  ignoreCase: false,

  /**
   * <code>true</code> if newline should be included in `.' matches.
   * NOTE(review): _globalize() does not read this property, so it
   * has no effect on the compiled patterns as far as visible here.
   */
  dotAll: false,

  /**
   * <code>true</code> if the longest match, not the first one
   * in the list of token expressions, should be used.  This is
   * important where there is ambiguity in token prefixes, but
   * wasteful in runtime complexity where there is not, because
   * the length of the matches for each token need to be compared
   * against one another then.  Use with caution.
   */
  longestMatchWins: false,

  /**
   * Text position where to continue scanning
   */
  lastIndex: 0,

  /**
   * Appends one or more {@link #Token Tokens} to the list of tokens.
   *
   * @param {TokenList|Array} tokens
   * @return {Lexer}
   *   This object
   */
  addTokens: function (tokens) {
    for (var i = 0, len = tokens.length; i < len; ++i)
    {
      this.addToken(tokens[i]);
    }

    return this;
  },

  /**
   * Returns the next token in an input string.
   *
   * Every token pattern is applied starting at
   * <code>lastIndex</code>.  The match that starts earliest wins;
   * among matches starting at the same position the token added
   * first wins, unless <code>longestMatchWins</code> is
   * <code>true</code>, in which case the longest of them wins.
   * On success, <code>lastIndex</code> is moved behind the match,
   * <code>state</code> is set to the token's <code>nextState</code>
   * (if defined), and the matched text is stored in the token's
   * <code>match</code> property.
   *
   * @param sText
   *   The text to scan.  It is converted to a string and stored in
   *   the <code>text</code> property on the first call only; later
   *   calls continue with the stored text.
   * @return {#Token}
   *   The next token in <code>sText</code>, or <code>null</code>
   *   if no token matched a non-empty text
   * @throws jsx.InvalidArgumentError
   *   if there is no text to scan
   */
  getNextToken: function (sText) {
    if (arguments.length > 0 && typeof this.text == "undefined")
    {
      /**
       * @type string
       */
      this.text = String(sText);
    }

    sText = this.text;
    if (!sText)
    {
      return jsx.throwThis(jsx.InvalidArgumentError, "Nothing to scan");
    }

    /* getTokens() yields undefined as long as no token was added */
    var tokens = this.getTokens() || [];
    var longest_match_wins = this.longestMatchWins;
    var last_index = this.lastIndex;
    var used_match = {
      index: Infinity,
      lastIndex: last_index,
      length: 0
    };

    /*
     * NOTE: Must always use loop in order to find first match
     * regardless of pattern position
     */
    for (var i = 0, len = tokens.length; i < len; ++i)
    {
      var rx = this._compile(i);

      /* The expression is global, so exec() searches from lastIndex */
      rx.lastIndex = last_index;
      var matches = rx.exec(sText);
      if (!matches)
      {
        continue;
      }

      var index = matches.index;
      var match_length = matches[0].length;

      /*
       * An earlier match always wins; a longer match at the same
       * position wins only if the longest match should win.
       */
      var is_better = (index < used_match.index)
        || (longest_match_wins
            && index === used_match.index
            && match_length > used_match.length);

      if (is_better)
      {
        used_match.token = tokens[i];

        /*
         * Both modes store the matched text, so that the type of
         * Token.match does not depend on longestMatchWins.
         */
        used_match.match = matches[0];
        used_match.length = match_length;
        used_match.index = index;
        used_match.lastIndex = rx.lastIndex;
      }
    }

    /* An empty match is treated like no match at all */
    if (used_match.match)
    {
      this.lastIndex = used_match.lastIndex;

      var next_state = used_match.token.nextState;
      if (typeof next_state != "undefined")
      {
        this.state = next_state;
      }

      used_match.token.match = used_match.match;

      return used_match.token;
    }

    return null;
  }
});
/**
* An Array-like object whose items are {@link #Token}s.
*
* @param {Array|jsx.string.parser.Token} list
* If an {@link Array}, its elements are used as list
* (type conversion). Otherwise all arguments are used.
* @throws jsx.InvalidArgumentError
* if an item is not a {@link #Token}
*/
var _TokenList = function (list) {
  /* Use the array's elements if an array was passed, else all arguments */
  var source = arguments;
  if (jsx.object.isArray(list))
  {
    source = list;
  }

  var count = source.length;
  var pos = 0;
  while (pos < count)
  {
    var candidate = source[pos];
    if (candidate instanceof _Token)
    {
      /* Store the item and grow the list by one */
      this[pos] = candidate;
      pos += 1;
      this.length = pos;
      continue;
    }

    /* Items stored before the offending one are kept */
    return jsx.throwThis(jsx.InvalidArgumentError,
      "jsx.string.parser.TokenList: saw " + candidate + ", expected jsx.string.parser.Token");
  }
}.extend(Array);
return {
/**
* @memberOf jsx.string.parser
* @param {Object[TokenList]} data
*/
StateCollection: function (data) {
var keys = jsx.object.getKeys(data);
for (var i = 0, len = keys.length; i < len; ++i)
{
var state = keys[i];
if (typeof state != "function")
{
if (!(state instanceof _TokenList))
{
return jsx.throwThis(jsx.InvalidArgumentError,
"jsx.string.parser.StateCollection: saw " + state
+ ", expected jsx.string.parser.TokenList");
}
this[state] = data[state];
}
}
},
Token: _Token,
TokenList: _TokenList,
Lexer: _Lexer,
/**
* A <code>Parser</code> handles {@link #Token Tokens} in an input string
* as provided by a {@link #Lexer}.
*
* @function
* @property {Lexer} _lexer
* The lexer used by this parser
*/
Parser: (
  /**
   * @constructor
   * @param {Lexer} lexer
   *   The lexer to be used by this parser.  If no argument is
   *   passed, the <code>_lexer</code> property is not set by
   *   this constructor.
   * @throws jsx.InvalidArgumentError
   *   if an argument is passed that is not a {@link #Lexer}
   */
  function (lexer) {
    if (arguments.length > 0)
    {
      if (!(lexer instanceof _Lexer))
      {
        return jsx.throwThis(jsx.InvalidArgumentError,
          "jsx.string.parser.Parser: saw " + lexer + ", expected jsx.string.parser.Lexer");
      }

      /**
       * @memberOf jsx.string.parser.Parser
       */
      this._lexer = lexer;
    }
  }
).extend(null, {
  /**
   * Parses an input string requesting the next token from a {@link #Lexer}.
   *
   * Each token returned by the lexer is passed to
   * {@link #Parser.prototype.parseToken parseToken()}; parsing stops
   * when the lexer returns no more tokens or when
   * <code>parseToken()</code> returns a false-value.
   *
   * @memberOf jsx.string.parser.Parser.prototype
   * @param {String} sText
   */
  parse: function (sText) {
    var token;
    var lexer = this._lexer;

    while ((token = lexer.getNextToken(sText)))
    {
      var tokensLeft = this.parseToken(token);

      /* Break if all tokens were consumed or contract was violated */
      if (!tokensLeft)
      {
        break;
      }
    }
  },

  /**
   * Parses an input token as provided by a {@link #Lexer}.
   *
   * Calls the token's own <code>parse()</code> method with this
   * parser if the token has one; otherwise the token is accepted
   * as it is.
   *
   * @param {jsx.string.parser.Token} token
   * @return A false-value if all tokens have been consumed
   *   or an unrecoverable syntax error has been detected.
   *   <code>true</code> if the token has no <code>parse()</code>
   *   method, otherwise the return value of that method.
   */
  parseToken: function (token) {
    var result = true;

    if (typeof token.parse == "function")
    {
      result = token.parse(this);
    }

    return result;
  }
})
};
}());