Subversion Repositories JSX

Compare Revisions

Last modification

Ignore whitespace Rev 515 → Rev 516

/trunk/regexp.js
34,15 → 34,7
var jsx = {};
}
 
if (typeof jsx.regexp == "undefined")
{
/* (for JSDT only) */
jsx.regexp = {};
}
 
/**
* @type jsx.regexp
* @memberOf __jsx.regexp
* @namespace
*/
jsx.regexp = (/** @constructor */ function () {
49,10 → 41,9
var _jsx_object = jsx.object;
var _getClass = _jsx_object.getClass;
var _getDataObject = _jsx_object.getDataObject;
var _isString = _jsx_object.isString;
 
/**
* @type jsx.regexp.RegExp
* @memberOf __jsx.regexp.RegExp
* @namespace
* @property pattern : String
* The original pattern string, including pattern-match
92,792 → 83,803
* @function
*/
var _RegExp2 = jsx.object.extend(
(/** @constructor */function () {
var
_destructure = jsx.array.destructure,
_WideString = _jsx_object.getFeature(
jsx, "string", "unicode", "WideString"),
_fromCharCode = function (codePoint) {
if (codePoint > 0xFFFF)
{
return _WideString.fromCharCode(codePoint);
}
(
/** @constructor */
function jsx_regexp_RegExp () {
var
_destructure = jsx.array.destructure,
_WideString = _jsx_object.getFeature(
jsx, "string", "unicode", "WideString"),
_fromCharCode = function (codePoint) {
if (codePoint > 0xFFFF)
{
return _WideString.fromCharCode(codePoint);
}
 
return String.fromCharCode(codePoint);
},
return String.fromCharCode(codePoint);
},
 
propertyClasses,
ucdFields = ["codePoint",, "propertyClass"],
propertyClasses,
ucdFields = ["codePoint",, "propertyClass"],
 
_parseUCDText = function () {
(new jsx.net.http.Request(
_RegExp2.ucdTextPath, "GET", false,
function (xhr) {
var lines = xhr.responseText.split(/\r?\n|\r/).map(
function (e) {
var entry = _destructure(e.split(";"), ucdFields);
entry.codePoint = parseInt(entry.codePoint, 16);
return entry;
});
_parseUCDText = function () {
(new jsx.net.http.Request(
_RegExp2.ucdTextPath, "GET", false,
function (xhr) {
var lines = xhr.responseText.split(/\r?\n|\r/).map(
function (e) {
var entry = _destructure(e.split(";"), ucdFields);
entry.codePoint = parseInt(entry.codePoint, 16);
return entry;
});
 
lines.sort(function (a, b) {
if (a.propertyClass < b.propertyClass)
{
return -1;
}
lines.sort(function (a, b) {
if (a.propertyClass < b.propertyClass)
{
return -1;
}
 
if (a.propertyClass > b.propertyClass)
{
return 1;
}
if (a.propertyClass > b.propertyClass)
{
return 1;
}
 
if (a.codePoint < b.codePoint)
{
return -1;
}
if (a.codePoint < b.codePoint)
{
return -1;
}
 
if (a.codePoint > b.codePoint)
{
return 1;
}
if (a.codePoint > b.codePoint)
{
return 1;
}
 
return 0;
});
return 0;
});
 
propertyClasses = _RegExp2.propertyClasses = {};
propertyClasses = _RegExp2.propertyClasses = {};
 
for (var i = 0, len = lines.length; i < len; ++i)
{
var
line = lines[i],
propertyClass = line.propertyClass,
prevClass,
codePoint = line.codePoint,
prevCodePoint;
 
if (isNaN(codePoint) || (codePoint > 0xFFFF && !_WideString))
for (var i = 0, len = lines.length; i < len; ++i)
{
continue;
}
var
line = lines[i],
propertyClass = line.propertyClass,
prevClass,
codePoint = line.codePoint,
prevCodePoint;
 
if (propertyClass != prevClass)
{
if (codePoint != prevCodePoint + 1)
if (isNaN(codePoint) || (codePoint > 0xFFFF && !_WideString))
{
if (startRange)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
}
continue;
}
 
propertyClasses[propertyClass] =
_fromCharCode(codePoint);
 
var startRange = false;
}
else
{
if (codePoint != prevCodePoint + 1)
if (propertyClass != prevClass)
{
if (startRange)
if (codePoint != prevCodePoint + 1)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
 
startRange = false;
if (startRange)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
}
}
 
propertyClasses[propertyClass] +=
propertyClasses[propertyClass] =
_fromCharCode(codePoint);
 
var startRange = false;
}
else
{
startRange = true;
if (codePoint != prevCodePoint + 1)
{
if (startRange)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
 
startRange = false;
}
 
propertyClasses[propertyClass] +=
_fromCharCode(codePoint);
}
else
{
startRange = true;
}
}
 
prevClass = propertyClass,
prevCodePoint = codePoint;
}
 
prevClass = propertyClass,
prevCodePoint = codePoint;
if (startRange)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
}
}
)).send();
},
 
if (startRange)
{
propertyClasses[prevClass] +=
"-" + _fromCharCode(prevCodePoint);
}
/**
* @param {String} charClassContent
* @param {boolean} bUnicodeMode
* @return {string}
*/
_normalizeCharClass = function (charClassContent, bUnicodeMode) {
var negEscapes = [];
 
if (charClassContent == "")
{
return "[]";
}
)).send();
},
 
/**
* @param {String} charClassContent
* @param {boolean} bUnicodeMode
* @return {string}
*/
_normalizeCharClass = function (charClassContent, bUnicodeMode) {
var negEscapes = [];
if (charClassContent == "^")
{
return "[^]";
}
 
if (charClassContent == "")
{
return "[]";
}
var reduced = charClassContent.replace(
/\\((P)\{([^\}]+)\}|(W))/g,
function (m, p1, cP, charProperty, cW) {
var escapeChar = cP || cW;
if (escapeChar == "P" || bUnicodeMode)
{
negEscapes.push("\\" + escapeChar.toLowerCase()
+ (charProperty ? "{" + charProperty + "}" : ""));
return "";
}
 
if (charClassContent == "^")
{
return "[^]";
}
return m;
});
 
var reduced = charClassContent.replace(
/\\((P)\{([^\}]+)\}|(W))/g,
function (m, p1, cP, charProperty, cW) {
var escapeChar = cP || cW;
if (escapeChar == "P" || bUnicodeMode)
if (negEscapes.length > 0)
{
/* Do not let negated empty class from reduction match everything */
if (reduced == "^")
{
negEscapes.push("\\" + escapeChar.toLowerCase()
+ (charProperty ? "{" + charProperty + "}" : ""));
return "";
reduced = "";
}
 
return m;
});
if (reduced != "")
{
jsx.warn(
"jsx.regexp.RegExp: Combined negative escapes in character classes"
+ " require support for non-capturing parentheses");
}
 
if (negEscapes.length > 0)
{
/* Do not let negated empty class from reduction match everything */
if (reduced == "^")
{
reduced = "";
return (reduced ? "(?:[" + reduced + "]|" : "")
+ "[" + (charClassContent.charAt(0) == "^" ? "" : "^")
+ negEscapes.join("") + "]"
+ (reduced ? ")" : "");
}
 
if (reduced != "")
{
jsx.warn(
"jsx.regexp.RegExp: Combined negative escapes in character classes"
+ " require support for non-capturing parentheses");
}
return "[" + reduced + "]";
},
 
return (reduced ? "(?:[" + reduced + "]|" : "")
+ "[" + (charClassContent.charAt(0) == "^" ? "" : "^")
+ negEscapes.join("") + "]"
+ (reduced ? ")" : "");
}
sPropertyEscapes = "\\\\(p)\\{([^\\}]+)\\}",
rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi"),
sNonPropEscInRange = "([^\\]\\\\]|\\\\[^p])*",
sEscapes =
"\\[(\\^?(" + sNonPropEscInRange + "(" + sPropertyEscapes
+ ")+" + sNonPropEscInRange + ")+)\\]"
+ "|" + sPropertyEscapes + "",
rxEscapes = new RegExp(sEscapes, "gi"),
 
return "[" + reduced + "]";
},
fEscapeMapper = function (match, classRanges, p2, p3, p4, p5, p6, p7,
standalonePropSpec, standaloneClass) {
propertyClasses = _RegExp2.propertyClasses;
 
sPropertyEscapes = "\\\\(p)\\{([^\\}]+)\\}",
rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi"),
sNonPropEscInRange = "([^\\]\\\\]|\\\\[^p])*",
sEscapes =
"\\[(\\^?(" + sNonPropEscInRange + "(" + sPropertyEscapes
+ ")+" + sNonPropEscInRange + ")+)\\]"
+ "|" + sPropertyEscapes + "",
rxEscapes = new RegExp(sEscapes, "gi"),
 
fEscapeMapper = function (match, classRanges, p2, p3, p4, p5, p6, p7,
standalonePropSpec, standaloneClass) {
propertyClasses = _RegExp2.propertyClasses;
 
/* If the Unicode Character Database (UCD) is not statically loaded */
if (!propertyClasses)
{
/* load it dynamically, ignore exceptions */
var ucdScriptPath = _RegExp2.ucdScriptPath;
if (ucdScriptPath)
/* If the Unicode Character Database (UCD) is not statically loaded */
if (!propertyClasses)
{
jsx.tryThis(function () { jsx.importFrom(ucdScriptPath); });
/* load it dynamically, ignore exceptions */
var ucdScriptPath = _RegExp2.ucdScriptPath;
if (ucdScriptPath)
{
jsx.tryThis(function () { jsx.importFrom(ucdScriptPath); });
 
propertyClasses = _RegExp2.propertyClasses;
}
propertyClasses = _RegExp2.propertyClasses;
}
 
/* if this failed */
if (!propertyClasses)
{
if (!jsx.net || !jsx.net.http
|| typeof jsx.net.http.Request != "function")
/* if this failed */
if (!propertyClasses)
{
jsx.throwThis("jsx.regexp.UCDLoadError",
['"' + _RegExp2.ucdScriptPath + '" (jsx.regexp.RegExp.ucdScriptPath)',
"http.js"]);
if (!jsx.net || !jsx.net.http
|| typeof jsx.net.http.Request != "function")
{
jsx.throwThis("jsx.regexp.UCDLoadError",
['"' + _RegExp2.ucdScriptPath + '" (jsx.regexp.RegExp.ucdScriptPath)',
"http.js"]);
}
 
/* parse the text version of the UCD */
_parseUCDText();
}
 
/* parse the text version of the UCD */
_parseUCDText();
/*
* Define property classes required for Unicode mode
* if not already defined (not available from text version
* of UCD)
*/
_jsx_object.extend(propertyClasses, {
L: "\\p{Ll}\\p{Lm}\\p{Lo}\\p{Lt}\\p{Lu}",
M: "\\p{Mc}\\p{Me}\\p{Mn}",
N: "\\p{Nd}\\p{Nl}\\p{No}",
Digit: "\\p{Nd}",
Space: "\u0009\u000a\u000c\u000d\u0020\u0085\u00a0"
+ "\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005"
+ "\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f"
+ "\u205f\u3000",
Word: "\\p{L}\\p{M}\\p{N}\\p{Pc}"
});
}
 
/*
* Define property classes required for Unicode mode
* if not already defined (not available from text version
* of UCD)
var _rangesStack = [];
/**
* @return {string}
*/
_jsx_object.extend(propertyClasses, {
L: "\\p{Ll}\\p{Lm}\\p{Lo}\\p{Lt}\\p{Lu}",
M: "\\p{Mc}\\p{Me}\\p{Mn}",
N: "\\p{Nd}\\p{Nl}\\p{No}",
Digit: "\\p{Nd}",
Space: "\u0009\u000a\u000c\u000d\u0020\u0085\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000",
Word: "\\p{L}\\p{M}\\p{N}\\p{Pc}"
});
}
_rangesStack.toString = function () {
return this.join(" --> ");
};
 
var _rangesStack = [];
/**
* @return {string}
*/
_rangesStack.toString = function () {
return this.join(" --> ");
};
var _propertyClassReplacer = function (match, propertySpecifier, propertyClass) {
if (propertySpecifier === "P")
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
_rangesStack.pop()
+ " contains the negative property specifier \\P{" + propertyClass + "}");
return;
}
 
var _propertyClassReplacer = function (match, propertySpecifier, propertyClass) {
if (propertySpecifier === "P")
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
_rangesStack.pop()
+ " contains the negative property specifier \\P{" + propertyClass + "}");
return;
}
return _getRanges(propertyClass);
};
 
return _getRanges(propertyClass);
};
var _getRanges =
/**
* Retrieves class ranges by property class, and throws a specialized
* exception if this fails.
 
var _getRanges =
/**
* Retrieves class ranges by property class, and throws a specialized
* exception if this fails.
* @param {String} propertyClass
* @throws jsx.regexp#UndefinedPropertyClassError
*/
function (propertyClass) {
return jsx.tryThis(
function () {
if (_rangesStack.indexOf(propertyClass) > -1)
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
propertyClass + " is cyclically defined ("
+ _rangesStack + " --> " + propertyClass
+ ")");
return;
}
 
* @param {String} propertyClass
* @throws jsx.regexp#UndefinedPropertyClassError
*/
function (propertyClass) {
return jsx.tryThis(
function () {
if (_rangesStack.indexOf(propertyClass) > -1)
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
propertyClass + " is cyclically defined ("
+ _rangesStack + " --> " + propertyClass
+ ")");
return;
}
_rangesStack.push(propertyClass);
 
_rangesStack.push(propertyClass);
var escapedRange = _jsx_object.getProperty(propertyClasses, propertyClass);
 
var escapedRange = _jsx_object.getProperty(propertyClasses, propertyClass);
/*
* Resolve property class references in property class values,
* watch for cyclic structures.
*/
var rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi");
var unescapedRange = escapedRange.replace(rxPropertyEscapes, _propertyClassReplacer);
 
/*
* Resolve property class references in property class values,
* watch for cyclic structures.
*/
var rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi");
var unescapedRange = escapedRange.replace(rxPropertyEscapes, _propertyClassReplacer);
_rangesStack.pop();
 
_rangesStack.pop();
return unescapedRange;
},
function (e) {
if (e.name == "jsx.object.PropertyError")
{
jsx.throwThis("jsx.regexp.UndefinedPropertyClassError",
propertyClass + (_rangesStack.length > 1 ? " in " + _rangesStack : ""));
}
else
{
jsx.rethrowThis(e);
}
});
};
 
return unescapedRange;
},
function (e) {
if (e.name == "jsx.object.PropertyError")
{
jsx.throwThis("jsx.regexp.UndefinedPropertyClassError",
propertyClass + (_rangesStack.length > 1 ? " in " + _rangesStack : ""));
}
else
{
jsx.rethrowThis(e);
}
});
};
/* We can handle standalone class references … */
if (standaloneClass)
{
var result = _getRanges(standaloneClass);
result = "[" + (standalonePropSpec == "P" ? "^" : "") + result + "]";
}
else
{
/* … and class references in character classes */
result = _normalizeCharClass(classRanges);
 
/* We can handle standalone class references … */
if (standaloneClass)
{
var result = _getRanges(standaloneClass);
result = "[" + (standalonePropSpec == "P" ? "^" : "") + result + "]";
}
else
{
/* … and class references in character classes */
result = _normalizeCharClass(classRanges);
result = result.replace(
rxPropertyEscapes,
function (match, propertySpecifier, propertyClass) {
var ranges = _getRanges(propertyClass);
return ranges;
});
}
 
result = result.replace(
rxPropertyEscapes,
function (match, propertySpecifier, propertyClass) {
var ranges = _getRanges(propertyClass);
return ranges;
});
}
return result;
};
 
return result;
};
 
/**
* Creates and returns an extended {@link RegExp} object.
*
* This constructor accepts pattern and flags arguments where you
* can use some features of Perl and Perl-compatible regular
* expressions (PCRE); like {@link RegExp()}, it can also be called
* as a function to do the same. The {@link RegExp} instance it
* returns is augmented with properties to support those features
* when matching it against a string.
*
* The following additional features are currently supported:
* <ul>
* <li>Flags:
* <ul>
* <li><tt>s</tt> (PCRE_DOTALL) – the <tt>.</tt> metacharacter
* matches newline as well.</li>
* <li><tt>u</tt> (Unicode mode) – the meaning of
* character class escape sequences <tt>\b</tt>, <tt>\w</tt>,
* and <tt>\W</tt> is extended to include Unicode character
* properties.</li>
* <li><tt>x</tt> (PCRE_EXTENDED) – whitespace within
* the pattern is ignored, so that it is easier for
* humans to read.</li>
* </ul><p>
* Flags except for Unicode mode can be set and unset for
* parts of the expression outside of character classes using
* the <tt>(?…)</tt> and <tt>(?-…)</tt> notations.
* </li>
* <li>Unicode property classes using e.g. the \p{…} notation</li>
* <li>Named capturing groups by passing strings with the
* <tt>(?P&lt;name>…)</tt> or <tt>(?P'name'…)</tt> notation,
* where the <tt>P</tt> is optional in either form.</li>
* </ul><p>
* This is facilitated through the following steps:
* </p><ol>
* <li>The flags <code>x</code>, <code>s</code> and <code>u</code>
* in the optional <var>sFlags</var> argument set the initial
* state of the pattern-match modifiers; the extended
* {@link RegExp}'s <code>extended</code>, <code>dotAll</code>,
* and <code>unicodeMode</code> properties are set accordingly.
* These flags are removed from the <var>sFlags</var>
* argument subsequently, as it is reused to create the
* {@link RegExp} instance. [Conforming implementations of
* ECMA-262-5.1 MUST throw a <code>SyntaxError</code>
* exception on flags other than <code>g</code>, <code>i</code>,
* and <code>m</code> (section 15.10.4.1); Mozilla JavaScript
* may also support the <code>y</code> (sticky) flag,
* but nothing else.]</li>
* <li>The pattern is run through several passes, where in each
* one it is scanned from left to right using another
* {@link RegExp}:
* <ol style="margin-bottom: 1em; list-style-type: lower-roman">
* <li><p>Capturing groups and pattern-match modifiers in the
* pattern are matched and replaced.
* <p>Capturing groups are replaced with the opening
* parenthesis if they were assigned a name. The
* extended {@link RegExp}'s <code>groups</code>,
* <code>names</code>, and <code>_patternGroups</code>
* properties are set accordingly. They are used in an
* overwritten <code>exec()</code> method and when matching
* against a <code>jsx.regexp.String</code> using its
* <tt>match(…)</tt> method.</p>
* <p style="margin-bottom: 0">
* Pattern-match modifiers are set and unset as they
* are scanned. The corresponding substrings are
* removed from the pattern. If the group is otherwise
* empty, and therefore is not a group at all,
* the entire pseudo-group is removed.</p>
* <ol style="margin-top: 0; list-style-type: lower-latin">
* <li>With PCRE_EXTENDED set, single-line
* comments starting with <tt>#</tt> and unescaped
* whitespace are removed from the pattern. The backslash
* is removed from the pattern when in front of
* whitespace.</li>
* <li>With PCRE_DOTALL set, unescaped <tt>.</tt>
* (period) characters are replaced with the character class
* <tt>[\S\s]</tt> which matches all Unicode characters.</li>
* </ol>
* <p><em>NOTE: Unlike in Perl and PCRE, a pattern-match
* modifier affects all of the pattern that follows,
* even outside the group in which the modifier was
* set/unset. This will be fixed in a later version.</em>
* </p></li>
* <li>When in Unicode mode,
* <ol style="list-style-type: lower-latin">
* <li>in the second pass, character class escape sequences
* <tt>\w</tt> and <tt>\W</tt> are replaced with
* corresponding uses of <tt>\p{Word}</tt>.</li>
* <li>in the third pass, <tt>\b</tt> is replaced with
* corresponding uses of character classes and negative
* lookahead.</li>
* </ol></li>
* <li style="margin-top: 1em">The <tt>\p{…}</tt> and <tt>\P{…}</tt>
* escape sequences are replaced with the corresponding
* character classes.</li>
* </ol></li>
* <li>The resulting expression and remaining flags are passed
* to the {@link RegExp} constructor.</li>
* <li>The created {@link RegExp} instance is augmented with
* properties and returned.</li>
* </ol><p>
* There are the following possibilities to make Unicode property
* classes known to this constructor:
* </p><ol>
* <li>Provide the Unicode Character Database, or parts thereof,
* as an Object;</li>
* <li>Provide the Unicode Character Database, or parts thereof,
* as a plain text resource that is accessed with
* XMLHttpRequest;</li>
* <li>Define property classes manually</li>
* </ol>
* <p>
* Variant #1 requires you to define a mapping object with
* the following namespace and structure:
* </p>
* <pre><code>
* jsx.regexp.RegExp.propertyClasses = {
* ...,
* Sc: "\u20AC...",
* ...
* };
* </code></pre>
* <p>
* The property name is the name of the Unicode property class
* (here: <tt>Sc</tt>). The property value (a string) defines
* which characters belong to that class. You may use "-"
* to specify character ranges, i.e., the range of characters
* including the characters having the boundaries as code point
* value, and all characters that have a code point value
* in-between. (For a literal "-", you may use "\\-".)
* An example file to mirror the Unicode 5.0 Character Database,
* UnicodeData.js, is distributed with this file. Include it
* <em>after</em> the file that declares the constructor (this
* file) to use it. If you do not include it, but use the
* <code>\p{...}</code> notation, an attempt will be made to load
* the file specified by the <code>ucdScriptPath</code> property
* (default: <code>"/scripts/UnicodeData.js"</code>) using
* synchronous XHR (see below).
* </p>
* <p>
* Variant #2 is going to support two different methods:
* Synchronous and asynchronous request-response handling.
* Synchronous request-response handling requests the (partial)
* Unicode Character Database from the resource specified by
* the <code>ucdTextPath</code> property (default:
* <code>"/scripts/UnicodeData.txt"</code>) and halts execution
* until a response has been received or the connection timed out.
* Asynchronous request-response handling allows script execution
* to continue while the request and response are in progress, but
* you need to provide a callback as third argument where actions
* related to the regular expression must be performed.
* Asynchronous handling is recommended for applications that need
* to be responsive to user input. <strong>Currently, only
* synchronous handling is implemented.</strong>
* </p>
* <p>
* Variant #3 can be combined with the other variants.
* The constructor has a definePropertyClasses() method which can
* be used to define and redefine property classes. This allows
* an extended RegExp object to support only a subset of Unicode
* property classes, and to support user-defined character
* property classes.
* </p>
*
* The returned {@link RegExp} has additional properties to
* accommodate syntax extensions in the pattern string:
*
* @param {String|RegExp} expression
* A regular expression pattern string that may use the features
* described above. If it is a {@link RegExp}, its
* <code>source</code> property is used and combined with
* <var>sFlags</var>. That is, <code>jsx.regexp.RegExp(/foo/, "i")</code>
* returns the same as <code>jsx.regexp.RegExp(/foo/i)</code>.
* @param {String} sFlags
* Optional string containing none, one or more of the standard
* {@link RegExp} modifiers and the flags described above.
* Unsupported flags are ignored, but passed on to {@link RegExp}.
* Note that modifiers in <var>expression</var> can temporarily
* unset and set the "s" and "x" flags. Following Perl, the "u"
* flag (Unicode mode) can only be enabled, but not disabled.
* @return {RegExp}
* A regular expression with the property class escape sequences
* expanded according to the specified data, with the specified
* flags set if they are natively supported.
*/
function jsx_regexp_RegExp (expression, sFlags)
{
if (expression && _getClass(expression) == "RegExp")
/**
* Creates and returns an extended {@link RegExp} object.
*
* This constructor accepts pattern and flags arguments where you
* can use some features of Perl and Perl-compatible regular
* expressions (PCRE); like {@link RegExp()}, it can also be called
* as a function to do the same. The {@link RegExp} instance it
* returns is augmented with properties to support those features
* when matching it against a string.
*
* The following additional features are currently supported:
* <ul>
* <li>Flags:
* <ul>
* <li><tt>s</tt> (PCRE_DOTALL) – the <tt>.</tt> metacharacter
* matches newline as well.</li>
* <li><tt>u</tt> (Unicode mode) – the meaning of
* character class escape sequences <tt>\b</tt>, <tt>\w</tt>,
* and <tt>\W</tt> is extended to include Unicode character
* properties.</li>
* <li><tt>x</tt> (PCRE_EXTENDED) – whitespace within
* the pattern is ignored, so that it is easier for
* humans to read.</li>
* </ul><p>
* Flags except for Unicode mode can be set and unset for
* parts of the expression outside of character classes using
* the <tt>(?…)</tt> and <tt>(?-…)</tt> notations.
* </li>
* <li>Unicode property classes using e.g. the \p{…} notation</li>
* <li>Named capturing groups by passing strings with the
* <tt>(?P&lt;name>…)</tt> or <tt>(?P'name'…)</tt> notation,
* where the <tt>P</tt> is optional in either form.</li>
* </ul><p>
* This is facilitated through the following steps:
* </p><ol>
* <li>The flags <code>x</code>, <code>s</code> and <code>u</code>
* in the optional <var>sFlags</var> argument set the initial
* state of the pattern-match modifiers; the extended
* {@link RegExp}'s <code>extended</code>, <code>dotAll</code>,
* and <code>unicodeMode</code> properties are set accordingly.
* These flags are removed from the <var>sFlags</var>
* argument subsequently, as it is reused to create the
* {@link RegExp} instance. [Conforming implementations of
* ECMA-262-5.1 MUST throw a <code>SyntaxError</code>
* exception on flags other than <code>g</code>, <code>i</code>,
* and <code>m</code> (section 15.10.4.1); Mozilla JavaScript
* may also support the <code>y</code> (sticky) flag,
* but nothing else.]</li>
* <li>The pattern is run through several passes, where in each
* one it is scanned from left to right using another
* {@link RegExp}:
* <ol style="margin-bottom: 1em; list-style-type: lower-roman">
* <li><p>Capturing groups and pattern-match modifiers in the
* pattern are matched and replaced.
* <p>Capturing groups are replaced with the opening
* parenthesis if they were assigned a name. The
* extended {@link RegExp}'s <code>groups</code>,
* <code>names</code>, and <code>_patternGroups</code>
* properties are set accordingly. They are used in an
* overwritten <code>exec()</code> method and when matching
* against a <code>jsx.regexp.String</code> using its
* <tt>match(…)</tt> method.</p>
* <p style="margin-bottom: 0">
* Pattern-match modifiers are set and unset as they
* are scanned. The corresponding substrings are
* removed from the pattern. If the group is otherwise
* empty, and therefore is not a group at all,
* the entire pseudo-group is removed.</p>
* <ol style="margin-top: 0; list-style-type: lower-latin">
* <li>With PCRE_EXTENDED set, single-line
* comments starting with <tt>#</tt> and unescaped
* whitespace are removed from the pattern. The backslash
* is removed from the pattern when in front of
* whitespace.</li>
* <li>With PCRE_DOTALL set, unescaped <tt>.</tt>
* (period) characters are replaced with the character class
* <tt>[\S\s]</tt> which matches all Unicode characters.</li>
* </ol>
* <p><em>NOTE: Unlike in Perl and PCRE, a pattern-match
* modifier affects all of the pattern that follows,
* even outside the group in which the modifier was
* set/unset. This will be fixed in a later version.</em>
* </p></li>
* <li>When in Unicode mode,
* <ol style="list-style-type: lower-latin">
* <li>in the second pass, character class escape sequences
* <tt>\w</tt> and <tt>\W</tt> are replaced with
* corresponding uses of <tt>\p{Word}</tt>.</li>
* <li>in the third pass, <tt>\b</tt> is replaced with
* corresponding uses of character classes and negative
* lookahead.</li>
* </ol></li>
* <li style="margin-top: 1em">The <tt>\p{…}</tt> and <tt>\P{…}</tt>
* escape sequences are replaced with the corresponding
* character classes.</li>
* </ol></li>
* <li>The resulting expression and remaining flags are passed
* to the {@link RegExp} constructor.</li>
* <li>The created {@link RegExp} instance is augmented with
* properties and returned.</li>
* </ol><p>
* There are the following possibilities to make Unicode property
* classes known to this constructor:
* </p><ol>
* <li>Provide the Unicode Character Database, or parts thereof,
* as an Object;</li>
* <li>Provide the Unicode Character Database, or parts thereof,
* as a plain text resource that is accessed with
* XMLHttpRequest;</li>
* <li>Define property classes manually</li>
* </ol>
* <p>
* Variant #1 requires you to define a mapping object with
* the following namespace and structure:
* </p>
* <pre><code>
* jsx.regexp.RegExp.propertyClasses = {
* ...,
* Sc: "\u20AC...",
* ...
* };
* </code></pre>
* <p>
* The property name is the name of the Unicode property class
* (here: <tt>Sc</tt>). The property value (a string) defines
* which characters belong to that class. You may use "-"
* to specify character ranges, i.e., the range of characters
* including the characters having the boundaries as code point
* value, and all characters that have a code point value
* in-between. (For a literal "-", you may use "\\-".)
* An example file to mirror the Unicode 5.0 Character Database,
* UnicodeData.js, is distributed with this file. Include it
* <em>after</em> the file that declares the constructor (this
* file) to use it. If you do not include it, but use the
* <code>\p{...}</code> notation, an attempt will be made to load
* the file specified by the <code>ucdScriptPath</code> property
* (default: <code>"/scripts/UnicodeData.js"</code>) using
* synchronous XHR (see below).
* </p>
* <p>
* Variant #2 is going to support two different methods:
* Synchronous and asynchronous request-response handling.
* Synchronous request-response handling requests the (partial)
* Unicode Character Database from the resource specified by
* the <code>ucdTextPath</code> property (default:
* <code>"/scripts/UnicodeData.txt"</code>) and halts execution
* until a response has been received or the connection timed out.
* Asynchronous request-response handling allows script execution
* to continue while the request and response are in progress, but
* you need to provide a callback as third argument where actions
* related to the regular expression must be performed.
* Asynchronous handling is recommended for applications that need
* to be responsive to user input. <strong>Currently, only
* synchronous handling is implemented.</strong>
* </p>
* <p>
* Variant #3 can be combined with the other variants.
* The constructor has a definePropertyClasses() method which can
* be used to define and redefine property classes. This allows
* an extended RegExp object to support only a subset of Unicode
* property classes, and to support user-defined character
* property classes.
* </p>
*
* The returned {@link RegExp} has additional properties to
* accommodate syntax extensions in the pattern string:
*
* @param {String|RegExp} expression
* A regular expression pattern string that may use the features
* described above. If it is a {@link RegExp}, its
* <code>source</code> property is used and combined with
* <var>sFlags</var>. That is, <code>jsx.regexp.RegExp(/foo/, "i")</code>
* returns the same as <code>jsx.regexp.RegExp(/foo/i)</code>.
* @param {String} sFlags
* Optional string containing none, one or more of the standard
* {@link RegExp} modifiers and the flags described above.
* Unsupported flags are ignored, but passed on to {@link RegExp}.
* Note that modifiers in <var>expression</var> can temporarily
* unset and set the "s" and "x" flags. Following Perl, the "u"
* flag (Unicode mode) can only be enabled, but not disabled.
* @return {RegExp}
* A regular expression with the property class escape sequences
* expanded according to the specified data, with the specified
* flags set if they are natively supported.
*/
function jsx_regexp_RegExp (expression, sFlags)
{
expression = expression.source;
}
 
var t = typeof expression;
if (t != "string")
{
if (arguments.length < 1)
if (expression && _getClass(expression) == "RegExp")
{
expression = "";
expression = expression.source;
}
else
 
var t = typeof expression;
if (t != "string")
{
expression = String(expression);
if (arguments.length < 1)
{
expression = "";
}
else
{
expression = String(expression);
}
}
}
 
var pattern = expression;
var flags = sFlags || "";
var pattern = expression;
var flags = sFlags || "";
 
var extended = false;
var dotAll = false;
var unicodeMode = false;
var extended = false;
var dotAll = false;
var unicodeMode = false;
 
if (sFlags)
{
if (sFlags.indexOf("x") > -1)
if (sFlags)
{
var originalExtended = extended = true;
}
if (sFlags.indexOf("x") > -1)
{
var originalExtended = extended = true;
}
 
if (sFlags.indexOf("s") > -1)
{
var originalDotAll = dotAll = true;
}
if (sFlags.indexOf("s") > -1)
{
var originalDotAll = dotAll = true;
}
 
if (sFlags.indexOf("u") > -1)
{
unicodeMode = true;
if (sFlags.indexOf("u") > -1)
{
unicodeMode = true;
}
 
sFlags = sFlags.replace(/[xsu]/g, "");
}
 
sFlags = sFlags.replace(/[xsu]/g, "");
}
/* Support for capturing and special groups */
var groupCount = 0;
var groups = _getDataObject();
var names = _getDataObject();
var patternGroups = [expression];
 
/* Support for capturing and special groups */
var groupCount = 0;
var groups = _getDataObject();
var names = _getDataObject();
var patternGroups = [expression];
 
expression = expression.replace(
/(\\\()/.concat(
"|",
/(\((\?P?(([adlupimsx]+)?(-([imsx]+))?)(<([^>]+)>|'([^']+)'|([:!]))?(\))?)?)/g,
"|",
/(#.*(\r?\n|\r|$))|\\(\s)/,
"|",
/\[([^\\\]]|\\.)*\]|(\s+)|\\\.|(\.)/g
),
function (match, escapedLParen,
group, specialGroup, modifierGroup,
positiveModifiers, negativeModifiers_opt, negativeModifiers,
namedGroup, bracketedName, quotedName,
nonCapturingGroup, emptyGroup,
comment, newline,
escapedWS, charClassContent, whitespace,
plainDot,
index, all) {
if (group)
{
var capturingGroup = (!nonCapturingGroup && !(modifierGroup && emptyGroup));
if (capturingGroup)
expression = expression.replace(
/(\\\()/.concat(
"|",
/(\((\?P?(([adlupimsx]+)?(-([imsx]+))?)(<([^>]+)>|'([^']+)'|([:!]))?(\))?)?)/g,
"|",
/(#.*(\r?\n|\r|$))|\\(\s)/,
"|",
/\[([^\\\]]|\\.)*\]|(\s+)|\\\.|(\.)/g
),
function (match, escapedLParen,
group, specialGroup, modifierGroup,
positiveModifiers, negativeModifiers_opt, negativeModifiers,
namedGroup, bracketedName, quotedName,
nonCapturingGroup, emptyGroup,
comment, newline,
escapedWS, charClassContent, whitespace,
plainDot,
index, all) {
if (group)
{
++groupCount;
}
var capturingGroup = (!nonCapturingGroup && !(modifierGroup && emptyGroup));
if (capturingGroup)
{
++groupCount;
}
 
if (positiveModifiers)
{
var
rxPosModifiers = /[sx]/g,
m;
if (positiveModifiers)
{
var
rxPosModifiers = /[sx]/g,
m;
 
while ((m = rxPosModifiers.exec(positiveModifiers)))
{
switch (m[0])
while ((m = rxPosModifiers.exec(positiveModifiers)))
{
case "s":
dotAll = true;
break;
switch (m[0])
{
case "s":
dotAll = true;
break;
 
case "x":
extended = true;
case "x":
extended = true;
}
}
}
}
 
if (negativeModifiers)
{
var rxNegModifiers = /[sx]/g;
if (negativeModifiers)
{
var rxNegModifiers = /[sx]/g;
 
while ((m = rxNegModifiers.exec(negativeModifiers)))
{
switch (m[0])
while ((m = rxNegModifiers.exec(negativeModifiers)))
{
case "s":
dotAll = false;
break;
switch (m[0])
{
case "s":
dotAll = false;
break;
 
case "x":
extended = false;
case "x":
extended = false;
}
}
}
}
 
if (capturingGroup)
{
/* Support for named capturing groups (PCRE-compliant) */
var name = bracketedName || quotedName;
if (name)
if (capturingGroup)
{
if (names[name])
/* Support for named capturing groups (PCRE-compliant) */
var name = bracketedName || quotedName;
if (name)
{
jsx.throwThis("SyntaxError", "Duplicate symbolic name");
if (names[name])
{
jsx.throwThis("SyntaxError", "Duplicate symbolic name");
}
 
groups[groupCount] = name;
names[name] = groupCount;
}
 
groups[groupCount] = name;
names[name] = groupCount;
/*
* NOTE: Helps with determining in exec() and match()
* whether \b matched at beginning and \Ws need to be
* ltrimmed from match
*/
patternGroups.push(all.substring(index));
 
return "(";
}
 
/*
* NOTE: Helps with determining in exec() and match()
* whether \b matched at beginning and \Ws need to be
* ltrimmed from match
*/
patternGroups.push(all.substring(index));
 
return "(";
return emptyGroup ? "" : "(?" + nonCapturingGroup;
}
 
return emptyGroup ? "" : "(?" + nonCapturingGroup;
}
 
/* PCRE_EXTENDED */
if (extended)
{
/* Remove comments */
if (comment)
/* PCRE_EXTENDED */
if (extended)
{
return "";
}
/* Remove comments */
if (comment)
{
return "";
}
 
/* Keep escaped whitespace, remove escape */
if (escapedWS)
{
return escapedWS;
/* Keep escaped whitespace, remove escape */
if (escapedWS)
{
return escapedWS;
}
 
/* Remove unescaped whitespace */
if (whitespace)
{
return "";
}
}
 
/* Remove unescaped whitespace */
if (whitespace)
/* PCRE_DOTALL */
if (dotAll && plainDot)
{
return "";
return "[\\S\\s]";
}
}
 
/* PCRE_DOTALL */
if (dotAll && plainDot)
{
return "[\\S\\s]";
}
return match;
});
 
return match;
});
groups.length = groupCount;
 
groups.length = groupCount;
/* Unicode mode */
if (unicodeMode)
{
var characterEscapes = {
"d": "\\p{Digit}",
"s": "\\p{Space}",
"w": "\\p{Word}"
};
 
/* Unicode mode */
if (unicodeMode)
{
var characterEscapes = {
"d": "\\p{Digit}",
"s": "\\p{Space}",
"w": "\\p{Word}"
};
expression = expression.replace(
/\[(([^\]\\]|\\.)*)\]|(\\([dsw]))/gi,
function (match, charClassContent, p2, classCharacter, escapeLetter) {
if (charClassContent)
{
var normalized = _normalizeCharClass(charClassContent, true);
 
expression = expression.replace(
/\[(([^\]\\]|\\.)*)\]|(\\([dsw]))/gi,
function (match, charClassContent, p2, classCharacter, escapeLetter) {
if (charClassContent)
{
var normalized = _normalizeCharClass(charClassContent, true);
 
return normalized.replace(
/\\\\|(\\([dsw]))/gi,
function (match, classCharacter, escapeLetter) {
if (classCharacter)
{
if (escapeLetter >= "A" && escapeLetter <= "Z")
return normalized.replace(
/\\\\|(\\([dsw]))/gi,
function (match, classCharacter, escapeLetter) {
if (classCharacter)
{
if (charClassContent.charAt(0) != "^")
if (escapeLetter >= "A" && escapeLetter <= "Z")
{
jsx.warn("jsx.regexp.RegExp: Negative character"
+ " class escape sequences in character"
+ " class not yet supported in Unicode mode."
+ " Use positive escape sequences in negated"
+ " character classes in the meantime.");
if (charClassContent.charAt(0) != "^")
{
jsx.warn("jsx.regexp.RegExp: Negative character"
+ " class escape sequences in character"
+ " class not yet supported in Unicode mode."
+ " Use positive escape sequences in negated"
+ " character classes in the meantime.");
 
return classCharacter;
return classCharacter;
}
}
 
return characterEscapes[escapeLetter.toLowerCase()];
}
 
return characterEscapes[escapeLetter.toLowerCase()];
}
return match;
});
}
 
return match;
});
}
if (classCharacter)
{
return "["
+ (escapeLetter >= "A" && escapeLetter <= "Z" ? "^" : "")
+ characterEscapes[escapeLetter.toLowerCase()] + "]";
}
 
if (classCharacter)
{
return "["
+ (escapeLetter >= "A" && escapeLetter <= "Z" ? "^" : "")
+ characterEscapes[escapeLetter.toLowerCase()] + "]";
}
return match;
});
 
return match;
});
/* Replace \b */
var firstGroup = expression.match(/\((\?(P?(<([^>]+)>|'([^']+)')|[:!]))?/);
var afterFirstGroup = (firstGroup && (firstGroup.index + firstGroup[0].length) || 0);
var wordEscape = characterEscapes.w;
expression = expression.replace(
/\\\\|(\\b)/g,
function (match, wordBorder, index, all) {
if (wordBorder)
{
/* Handle \b in leading groups properly */
if (index > afterFirstGroup)
{
return "(?!" + wordEscape + ")";
}
 
/* Replace \b */
var firstGroup = expression.match(/\((\?(P?(<([^>]+)>|'([^']+)')|[:!]))?/);
var afterFirstGroup = (firstGroup && (firstGroup.index + firstGroup[0].length) || 0);
var wordEscape = characterEscapes.w;
expression = expression.replace(
/\\\\|(\\b)/g,
function (match, wordBorder, index, all) {
if (wordBorder)
{
/* Handle \b in leading groups properly */
if (index > afterFirstGroup)
{
return "(?!" + wordEscape + ")";
return "(?:^|[^" + wordEscape + "])";
}
 
return "(?:^|[^" + wordEscape + "])";
}
return match;
});
}
 
return match;
});
}
/* Support for Unicode character property classes (PCRE-compliant) */
expression = expression.replace(rxEscapes, fEscapeMapper);
 
/* Support for Unicode character property classes (PCRE-compliant) */
expression = expression.replace(rxEscapes, fEscapeMapper);
var rx = new RegExp(expression, sFlags);
 
var rx = new RegExp(expression, sFlags);
/* Augmented properties */
rx.pattern = pattern;
rx._patternGroups = patternGroups;
rx.groups = groups;
rx.names = names;
rx.flags = flags;
rx.dotAll = !!originalDotAll;
rx.extended = !!originalExtended;
rx.unicodeMode = unicodeMode;
 
/* Augmented properties */
rx.pattern = pattern;
rx._patternGroups = patternGroups;
rx.groups = groups;
rx.names = names;
rx.flags = flags;
rx.dotAll = !!originalDotAll;
rx.extended = !!originalExtended;
rx.unicodeMode = unicodeMode;
rx._oldExec = rx.exec;
rx.exec = jsx_regexp_RegExp.exec;
 
rx._oldExec = rx.exec;
rx.exec = jsx_regexp_RegExp.exec;
return rx;
}
 
return rx;
}
 
return jsx_regexp_RegExp;
}()),
return jsx_regexp_RegExp;
}()
),
{
/**
* @memberOf jsx.regexp.RegExp
*/
ucdScriptPath: "/scripts/UnicodeData.js",
ucdTextPath: "/scripts/UnicodeData.txt",
 
/**
* Determines if an object has been constructed using this constructor
* @memberOf jsx.regexp.RegExp
* Determines if an object has been constructed using this constructor.
*
* @param rx
* @return {boolean}
*/
isInstance: function (rx) {
return !!rx.pattern;
938,6 → 940,15
return _exec;
}()),
 
/**
* (Re-)defines one or more property classes.
*
* @param {Object} o
* Object whose own enumerable properties are used
* for property class definitions
* @return {jsx.regexp.RegExp}
* This object
*/
definePropertyClasses: function (o) {
for (var keys = _jsx_object.getKeys(o), i = 0, len = keys.length;
i < len; ++i)
945,8 → 956,17
var p = keys[i];
this.propertyClasses[p] = o[p];
}
 
return this;
},
 
/**
* Deletes a property class.
*
* @param {String} p
* @return {boolean}
* <code>true</code> if successful, <code>false</code> otherwise.
*/
deletePropertyClass: function (p) {
return (delete this.propertyClasses[p]);
}
993,8 → 1013,7
* Exception thrown if a referred character property class
* cannot be resolved
*
* @type jsx.regexp.UndefinedPropertyClassError
* @extends jsx.object#PropertyError
* @extends jsx.object.PropertyError
*/
UndefinedPropertyClassError:
/**
1017,7 → 1036,7
*
* @constructor
* @param sMsg
* @extends jsx.object#ObjectError
* @extends jsx.object.ObjectError
*/
InvalidPropertyClassError:
function jsx_regexp_InvalidPropertyClassError (sMsg) {
1031,9 → 1050,8
RegExp: _RegExp2,
 
/**
* @type jsx.regexp.String
* @memberOf __jsx.regexp.String
* @constructor
* @extends String
*/
String: function jsx_regexp_String (s) {
if (this.constructor != jsx_regexp_String)
1045,7 → 1063,6
this.value = String(s);
}.extend(String, (function () {
var _replace = String.prototype.replace;
var _getDataObject = jsx.object.getDataObject;
 
function _toString ()
{
1188,14 → 1205,12
* Concatenates strings or regular expressions ({@link RegExp})
* and returns the resulting <code>RegExp</code>.
*
* If flags are used with either <code>RegExp</code> argument, the
* If flags are set with either <code>RegExp</code> argument, the
* resulting <code>RegExp</code> has all of those flags set.
*
* @author Copyright (c) 2005
* Thomas Lahn &lt;regexp.js@PointedEars.de&gt;
* @partof
* http://pointedears.de/scripts/regexp.js
* @params
* @param {RegExp|String}
* Expressions to be concatenated. If not a {@link RegExp},
* the argument is converted to {@link String}; this allows
* for expressions to be grouped and used in alternation.
1283,8 → 1298,8
* Returns a {@link RegExp} that is an intersection of two
* regular expressions.
*
* @param pattern2
* @param pattern1
* @param {RegExp} pattern2
* @param {RegExp} pattern1
* @return {RegExp}
* A regular expression which matches the strings that both
* <var>pattern1</var> (or this object) and <var>pattern2</var>
1309,6 → 1324,7
return null;
}
 
/* Remove outer parentheses */
var
s = pattern1.source.replace(/^\(?([^)]*)\)?$/, "$1"),
s2 = pattern2.source.replace(/^\(?([^)]*)\)?$/, "$1");
1332,27 → 1348,24
}
 
/* Compose the new alternation out of common parts */
var hOP = (
function () {
if (typeof Object.prototype.hasOwnProperty == "function")
{
return function (o, p) {
return o.hasOwnProperty(p);
};
}
var hasOwnProperty = (function () {
return (
(typeof Object.prototype.hasOwnProperty == "function")
? function (o, p) {
return o.hasOwnProperty(p);
}
: function (o, p) {
/* suffices _here_ */
return typeof o[p] != "undefined"
&& typeof o.constructor.prototype[p] == "undefined";
}
);
}());
 
/* suffices *here* */
return function (o, p) {
return typeof o[p] != "undefined"
&& typeof o.constructor.prototype[p] == "undefined";
};
}
)();
 
a = [];
for (var p in o)
{
if (hOP(o2, p))
if (hasOwnProperty(o2, p))
{
a.push(p);
}
1370,7 → 1383,7
* @return {string}
*/
escape: function (s) {
if (arguments.length == 0 && this.constructor == String)
if (arguments.length == 0 && _isString(this.constructor))
{
s = this;
}
1400,14 → 1413,33
 
// jsx.regexp.docURL = jsx.regexp.path + "regexp.htm";
 
/*
* Backwards-compatibility aliases: each deprecated variable below refers
* to the corresponding jsx.regexp function, and the same function is
* assigned to the matching built-in prototype property.
*
* NOTE(review): elsewhere in this file a regular expression literal is
* called with .concat(...), which appears to rely on the
* RegExp.prototype.concat assignment below -- confirm before removing it.
*/

/**
* Alias for jsx.regexp.toString2; also assigned to
* RegExp.prototype.toString2.
*
* @deprecated
*/
var regexp2str = jsx.regexp.toString2;
RegExp.prototype.toString2 = regexp2str;

/**
* Alias for jsx.regexp.concat; also assigned to
* RegExp.prototype.concat.
*
* @deprecated
*/
var regexp_concat = jsx.regexp.concat;
RegExp.prototype.concat = regexp_concat;

/**
* Alias for jsx.regexp.intersect; also assigned to
* RegExp.prototype.intersect.
*
* @deprecated
*/
var regexp_intersect = jsx.regexp.intersect;
RegExp.prototype.intersect = regexp_intersect;

/**
* Alias for jsx.regexp.escape; also assigned to
* String.prototype.regExpEscape.
*
* @deprecated
*/
var strRegExpEscape = jsx.regexp.escape;
String.prototype.regExpEscape = strRegExpEscape;

/*
* Opt-in augmentation: only when jsx.options.augmentPrototypes is truthy,
* the jsx.regexp functions are passed to jsx.object.extend() together
* with the built-in prototype objects.
*
* NOTE(review): the four properties listed here are already assigned
* unconditionally by the statements above, so as written this guard does
* not prevent the prototypes from being augmented -- confirm which of the
* two is intended.
*/
if (jsx.options.augmentPrototypes)
{
jsx.object.extend(RegExp.prototype, {
/**
* @memberOf RegExp.prototype
*/
intersect: jsx.regexp.intersect,
concat: jsx.regexp.concat,
toString2: jsx.regexp.toString2
});

jsx.object.extend(String.prototype, {
/**
* @memberOf String.prototype
*/
regExpEscape: jsx.regexp.escape
});
}