/**
* Generic parser implementation
*
* @author
* Copyright (c) 2010 Thomas 'PointedEars' Lahn <js@PointedEars.de>
*/
/* Create the parser namespace unless it has been defined already */
if (typeof jsx.string.parser === "undefined")
{
  /**
   * @namespace
   */
  jsx.string.parser = {};
}
/**
 * A token as recognized by a {@link #Lexer}.
 *
 * @param pattern : RegExp|String
 *   The pattern to match the token.  Parenthesized subexpressions
 *   must be marked as non-capturing ((?:…)) for
 *   the correct token to be returned by the lexer.
 * @param type : optional any
 *   A value to set the object's type property, which can be used
 *   for recognizing the type of the matched token in the parser's
 *   {@link Parser.prototype#parseToken parseToken()} method.  Using
 *   a constant value is recommended; the default is the constructor
 *   identifier, or "Token" if the identifier is unavailable.
 *   However, whenever a token requires further parsing, it is recommended
 *   to use an object whose constructor calls {@link Token Token()}, and
 *   to compare against its constructor property instead.
 * @property pattern : RegExp|String
 *   See the constructor's pattern argument
 * @property type
 *   See the constructor's type argument
 * @property match : Array
 *   The text and substrings that the token matched.  If the {@link #Lexer}'s
 *   {@link #Lexer.keepHistory keepHistory} property is false,
 *   the text and substrings that the token matched last.
 * @method parse(lexer : Lexer)
 *   When this method is defined, a {@link #Parser} calls it instead of
 *   the parser's {@link Parser.prototype#parseToken parseToken(Token)} method.
 *   A Token may implement this method to parse the token using
 *   a specialized {@link #Lexer}.  An implementation may
 *   {@link jsx.object#clone clone()} the referred lexer to create a lexer
 *   that can find more or fewer tokens than the suggested one.
 */
jsx.string.parser.Token = function (pattern, type) {
  this.pattern = pattern;

  /* default the type to the constructor's name, falling back to "Token"
     when the function name cannot be determined */
  if (!type)
  {
    type = jsx.object.getFunctionName(this.constructor) || "Token";
  }

  this.type = type;
};
/**
 * A Lexer (lexical analyzer) converts a sequence
 * of characters in an input string into a sequence of
 * {@link #Token Tokens} which can be used by a {@link Parser}.
 *
 * The list of patterns to match the tokens can be built by calling
 * {@link Lexer.prototype#addToken addToken()}, whereas the calls
 * for the preferred matches must come first, or with
 * {@link Lexer.prototype#addTokens addTokens()}, whereas the arguments
 * for the preferred matches must come first.
 *
 * @param text : optional String
 *   The text to be analyzed by this lexer; the default is the empty string.
 * @property text : string
 *   The text to be analyzed
 * @property _tokens : Array[Token]
 *   The list of {@link #Token Tokens}
 * @property keepHistory : boolean = false
 *   If true, each match initializes and returns a new {@link #Token};
 *   use this if you need to retain a history of tokens.  Otherwise the
 *   existing {@link #Token Tokens} are reused, which requires less memory.
 */
jsx.string.parser.Lexer = function (text) {
  /**
   * @type String
   */
  this.text = "";
  if (text)
  {
    this.text = String(text);
  }

  /* filled by addToken()/addTokens() */
  this._tokens = [];

  /* reuse Token objects by default (see nextToken()) */
  this.keepHistory = false;
};
jsx.string.parser.Lexer.extend(null, {
  /**
   * true if the tokens have been compiled into a
   * single regular expression.  Always false if the longest match
   * should win.
   *
   * @see _longestMatchWins
   */
  _compiled: false,

  /**
   * true if character case should not matter
   */
  _ignoreCase: false,

  /**
   * true if newline should be included in `.' matches
   */
  _dotAll: false,

  /**
   * true if the longest match, not the first one
   * in the list of token expressions, should be used.  This is
   * important where there is ambiguity in token prefixes, but
   * wasteful in runtime complexity where there is not, because
   * the length of the matches for each token need to be compared
   * against one another then.  Use with caution.
   */
  _longestMatchWins: false,

  /**
   * Text position where to continue scanning
   */
  _offset: 0,

  /**
   * Appends a {@link #Token} to the list of tokens.
   *
   * @memberOf jsx.string.parser.Lexer#prototype
   * @param token : RegExp|String|Token
   *   If a reference to a RegExp or a String,
   *   the value is used for the token pattern; if a Token,
   *   the reference is used and tokenType is ignored.
   *
   *   NOTE: Parenthesized subexpressions must be marked as
   *   non-capturing ((?:…)) for the correct
   *   token to be returned by the lexer.
   * @param tokenType : optional String|Function
   *   If a String, passed as the type name to {@link #Token Token()};
   *   if a Function, the function is used as
   *   constructor to which token is passed.
   * @return {Lexer}
   *   This object
   * @throws jsx.InvalidArgumentError if token is a false-value
   */
  addToken: function (token, tokenType) {
    if (!token)
    {
      jsx.throwThis("jsx.InvalidArgumentError",
        ["Invalid token", token + " : " + typeof token
         + (token ? "[" + token.constructor + "]" : ""),
         "(token: RegExp|String)"]);
    }

    /* wrap plain patterns in a Token (or a caller-provided constructor) */
    if (token.constructor == RegExp
        || typeof token == "string")
    {
      if (typeof tokenType == "function")
      {
        token = new tokenType(token);
      }
      else
      {
        token = new jsx.string.parser.Token(token, tokenType);
      }
    }

    this._tokens.push(token);

    /* the combined expression must be rebuilt on the next scan */
    this._compiled = false;

    return this;
  },

  /**
   * Appends one or more {@link #Token Tokens} to the list of tokens.
   *
   * @params : [RegExp|String|Token, optional String|Function]
   *   Each argument is an Array whose elements are passed as the
   *   arguments of {@link #addToken addToken()}.
   * @return {Lexer}
   *   This object
   */
  addTokens: function () {
    for (var i = 0, len = arguments.length; i < len; ++i)
    {
      var arg = arguments[i];
      this.addToken(arg[0], arg[1]);
    }

    return this;
  },

  /**
   * Reference to jsx.regexp.RegExp if that extension is available,
   * otherwise a false-value.
   */
  _jsx_RegExp: jsx.object.getFeature(jsx, "regexp", "RegExp"),

  /**
   * Constructor used for compiling the combined token expression:
   * jsx.regexp.RegExp if available, the built-in RegExp otherwise.
   */
  _RegExp: jsx.object.getFeature(jsx, "regexp", "RegExp") || RegExp,

  /**
   * Compiles _expression from the token patterns.
   *
   * Each token pattern becomes one capturing group, so that the
   * index of the first non-empty submatch identifies the token.
   *
   * (The former IIFE wrapper declared two closure variables that
   * were never used; it has been removed.)
   *
   * @protected
   * @return RegExp
   *   The compiled expression
   */
  _compile: function () {
    var patterns = this._tokens.map(function (token) {
      /* use the source of RegExp patterns, string patterns verbatim */
      return token.pattern.source ? token.pattern.source : token.pattern;
    });

    this._expression = new this._RegExp(
      "(" + patterns.join(")|(") + ")",
      "g" + (this._ignoreCase ? 'i' : '')
        + ((this._RegExp == this._jsx_RegExp && this._dotAll) ? 's' : ''));
    this._compiled = true;

    return this._expression;
  },

  /**
   * Returns the next token in the input string.
   *
   * @return Token|null
   *   The next token in the text assigned with this lexer,
   *   or null if there is none.
   * @throws jsx.Error if no tokens have been added
   */
  nextToken: function () {
    var tokens = this._tokens;
    var keepHistory = this.keepHistory;
    var token = null;
    var match = null;

    if (this._longestMatchWins)
    {
      var matches = [];
      var max_match_len = 0;
      var max_index = -1;

      /* match from where we left off
         (BUGFIX: was this._text, which is never assigned) */
      var rest = this.text.substring(this._offset);

      for (var index = tokens.length; index--;)
      {
        /* BUGFIX: was token.exec(…)/exec(token, …) although
           Token objects have no exec() method; use the pattern
           (assumed to be a RegExp here — TODO confirm for
           string patterns) */
        var pattern = tokens[index].pattern;
        var candidateMatch = this._jsx_RegExp
          ? this._jsx_RegExp.exec(pattern, rest)
          : pattern.exec(rest);
        if (candidateMatch)
        {
          matches[index] = candidateMatch;
          var candidateLen = candidateMatch[0].length;

          /* ">=" so that on ties the token added first wins, consistent
             with "preferred matches must come first" (the loop scans
             the token list backwards; the original ">" let the
             last-added token win ties) */
          if (candidateLen >= max_match_len)
          {
            max_match_len = candidateLen;
            max_index = index;
          }
        }
      }

      if (max_index > -1)
      {
        /* BUGFIX: was token_matches[max_index], i.e. the matched
           *string*, and the match object of the last-scanned token
           instead of the best one */
        token = tokens[max_index];
        match = matches[max_index];

        if (token.match && keepHistory)
        {
          token = new token.constructor(token.pattern, token.type);
        }

        /* correct for offsetted search */
        match.index += this._offset;
        match.input = this.text;
        token.match = match;
      }
    }
    else
    {
      var expression = this._expression;
      if (!this._compiled)
      {
        expression = this._compile();
      }

      if (expression.source == "()")
      {
        jsx.throwThis(jsx.Error, "No tokens added");
        return null;
      }

      /* the global flag keeps the scan position in expression.lastIndex */
      match = this._jsx_RegExp
        ? this._jsx_RegExp.exec(expression, this.text)
        : expression.exec(this.text);
      if (match)
      {
        /* the index of the first non-empty capturing group
           identifies the token that matched */
        for (var i = 1, len = match.length; i < len; ++i)
        {
          if (match[i])
          {
            token = tokens[i - 1];

            if (token.match && keepHistory)
            {
              token = new token.constructor(token.pattern, token.type);
            }

            token.match = match;
            break;
          }
        }
      }
    }

    if (match)
    {
      /* advance offset for the next _longestMatchWins search */
      this._offset += match[0].length;
    }

    return token;
  },

  getIgnoreCase: function () {
    return this._ignoreCase;
  },

  setIgnoreCase: function (value) {
    value = !!value;

    /* force recompilation if the flag changed */
    if (value !== this._ignoreCase)
    {
      this._compiled = false;
    }

    this._ignoreCase = value;
  },

  getDotAll: function () {
    return this._dotAll;
  },

  setDotAll: function (value) {
    value = !!value;

    /* force recompilation if the flag changed */
    if (value !== this._dotAll)
    {
      this._compiled = false;
    }

    this._dotAll = value;
  },

  getExpression: function () {
    return this._expression;
  },

  getOffset: function () {
    return this._offset;
  },

  getTokens: function () {
    return this._tokens;
  }
});
/**
 * A Parser handles {@link #Token Tokens} in an input string
 * as provided by a {@link #Lexer}.
 *
 * @param lexer : Lexer
 *   The lexer to be used by this parser
 * @property _lexer : Lexer
 *   The lexer used by this parser
 */
jsx.string.parser.Parser = function Parser (lexer) {
  /* warn (but proceed) when something other than a Lexer is passed */
  var isLexer = jsx.object.isInstanceOf(lexer, jsx.string.parser.Lexer);
  if (!isLexer)
  {
    jsx.warn("jsx.string.parser.Parser: saw " + lexer + ", expected jsx.string.parser.Lexer");
  }

  this._lexer = lexer;
};
jsx.string.parser.Parser.extend(null, {
  /**
   * Request the next token from the {@link #Lexer}.
   *
   * @memberOf jsx.string.parser.Parser#prototype
   * @return jsx.string.parser#Token
   */
  nextToken: function () {
    return this._lexer.nextToken();
  },

  /**
   * Parses an input string, requesting one token after the other
   * from the {@link #Lexer} until the lexer yields no more tokens.
   */
  parse: function () {
    var lexer = this._lexer;

    for (var token = lexer.nextToken(); token; token = lexer.nextToken())
    {
      if (typeof token.parse == "function")
      {
        /* the token knows how to parse itself */
        token.parse(lexer);
      }
      else
      {
        this.parseToken(token);
      }
    }
  },

  /**
   * Parses an input token as provided by a {@link #Lexer}.
   * Stub to be overridden by subtypes or instances.
   *
   * @param token : Token
   */
  parseToken: function (token) {
    /* stub */
  }
});