1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sw=4 et tw=99 ft=cpp:
3 : *
4 : * ***** BEGIN LICENSE BLOCK *****
5 : * Copyright (C) 2009 Apple Inc. All rights reserved.
6 : * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18 : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 : * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
21 : * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 : * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 : * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 : * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 : * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 : *
29 : * ***** END LICENSE BLOCK ***** */
30 :
31 : #include "YarrPattern.h"
32 :
33 : #include "Yarr.h"
34 : #include "YarrParser.h"
35 :
36 : using namespace WTF;
37 :
38 : namespace JSC { namespace Yarr {
39 :
40 : #include "RegExpJitTables.h"
41 :
// BASE_FRAME_SIZE is the call-frame size that offset computation starts from:
// it is passed as the initial call frame size to setupDisjunctionOffsets() for
// the pattern body and for non-terminal, non-"once" parentheses.
// It is 24 when WTF_CPU_SPARC is set and 0 otherwise.
// NOTE(review): presumably this reserves frame space the SPARC backend needs - confirm.
42 : #if WTF_CPU_SPARC
43 : #define BASE_FRAME_SIZE 24
44 : #else
45 : #define BASE_FRAME_SIZE 0
46 : #endif
47 :
48 58352 : class CharacterClassConstructor {
49 : public:
50 58352 : CharacterClassConstructor(bool isCaseInsensitive = false)
51 58352 : : m_isCaseInsensitive(isCaseInsensitive)
52 : {
53 58352 : }
54 :
55 30655 : void reset()
56 : {
57 30655 : m_matches.clear();
58 30655 : m_ranges.clear();
59 30655 : m_matchesUnicode.clear();
60 30655 : m_rangesUnicode.clear();
61 30655 : }
62 :
63 763 : void append(const CharacterClass* other)
64 : {
65 1489 : for (size_t i = 0; i < other->m_matches.size(); ++i)
66 726 : addSorted(m_matches, other->m_matches[i]);
67 2807 : for (size_t i = 0; i < other->m_ranges.size(); ++i)
68 2044 : addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
69 1555 : for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
70 792 : addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
71 952 : for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
72 189 : addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
73 763 : }
74 :
75 36017 : void putChar(UChar ch)
76 : {
77 36017 : if (ch <= 0x7f) {
78 35990 : if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
79 3578 : addSorted(m_matches, toASCIIUpper(ch));
80 3578 : addSorted(m_matches, toASCIILower(ch));
81 : } else
82 32412 : addSorted(m_matches, ch);
83 : } else {
84 : UChar upper, lower;
85 27 : if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
86 0 : addSorted(m_matchesUnicode, upper);
87 0 : addSorted(m_matchesUnicode, lower);
88 : } else
89 27 : addSorted(m_matchesUnicode, ch);
90 : }
91 36017 : }
92 :
93 : // returns true if this character has another case, and 'ch' is the upper case form.
94 0 : static inline bool isUnicodeUpper(UChar ch)
95 : {
96 0 : return ch != Unicode::toLower(ch);
97 : }
98 :
99 : // returns true if this character has another case, and 'ch' is the lower case form.
100 0 : static inline bool isUnicodeLower(UChar ch)
101 : {
102 0 : return ch != Unicode::toUpper(ch);
103 : }
104 :
105 41891 : void putRange(UChar lo, UChar hi)
106 : {
107 41891 : if (lo <= 0x7f) {
108 41846 : char asciiLo = lo;
109 41846 : char asciiHi = std::min(hi, (UChar)0x7f);
110 41846 : addSortedRange(m_ranges, lo, asciiHi);
111 :
112 41846 : if (m_isCaseInsensitive) {
113 38184 : if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
114 12 : addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
115 38184 : if ((asciiLo <= 'z') && (asciiHi >= 'a'))
116 20535 : addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
117 : }
118 : }
119 41891 : if (hi >= 0x80) {
120 48 : uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
121 48 : addSortedRange(m_rangesUnicode, unicodeCurr, hi);
122 :
123 48 : if (m_isCaseInsensitive) {
124 0 : while (unicodeCurr <= hi) {
125 : // If the upper bound of the range (hi) is 0xffff, the increments to
126 : // unicodeCurr in this loop may take it to 0x10000. This is fine
127 : // (if so we won't re-enter the loop, since the loop condition above
128 : // will definitely fail) - but this does mean we cannot use a UChar
129 : // to represent unicodeCurr, we must use a 32-bit value instead.
130 0 : ASSERT(unicodeCurr <= 0xffff);
131 :
132 0 : if (isUnicodeUpper(unicodeCurr)) {
133 0 : UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
134 0 : UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
135 0 : while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
136 0 : lowerCaseRangeEnd++;
137 0 : addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
138 0 : } else if (isUnicodeLower(unicodeCurr)) {
139 0 : UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
140 0 : UChar upperCaseRangeEnd = upperCaseRangeBegin;
141 0 : while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
142 0 : upperCaseRangeEnd++;
143 0 : addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
144 : } else
145 0 : ++unicodeCurr;
146 : }
147 : }
148 : }
149 41891 : }
150 :
151 30655 : CharacterClass* charClass()
152 : {
153 30655 : CharacterClass* characterClass = js::OffTheBooks::new_<CharacterClass>(PassRefPtr<CharacterClassTable>(0));
154 :
155 30655 : characterClass->m_matches.append(m_matches);
156 30655 : characterClass->m_ranges.append(m_ranges);
157 30655 : characterClass->m_matchesUnicode.append(m_matchesUnicode);
158 30655 : characterClass->m_rangesUnicode.append(m_rangesUnicode);
159 :
160 30655 : reset();
161 :
162 30655 : return characterClass;
163 : }
164 :
165 : private:
166 41113 : void addSorted(Vector<UChar>& matches, UChar ch)
167 : {
168 41113 : unsigned pos = 0;
169 41113 : unsigned range = matches.size();
170 :
171 : // binary chop, find position to insert char.
172 153631 : while (range) {
173 71513 : unsigned index = range >> 1;
174 :
175 71513 : int val = matches[pos+index] - ch;
176 71513 : if (!val)
177 108 : return;
178 71405 : else if (val > 0)
179 30242 : range = index;
180 : else {
181 41163 : pos += (index+1);
182 41163 : range -= (index+1);
183 : }
184 : }
185 :
186 41005 : if (pos == matches.size())
187 28305 : matches.append(ch);
188 : else
189 12700 : matches.insert(pos, ch);
190 : }
191 :
192 64674 : void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
193 : {
194 64674 : unsigned end = ranges.size();
195 :
196 : // Simple linear scan - I doubt there are that many ranges anyway...
197 : // feel free to fix this with something faster (eg binary chop).
198 71522 : for (unsigned i = 0; i < end; ++i) {
199 : // does the new range fall before the current position in the array
200 44328 : if (hi < ranges[i].begin) {
201 : // optional optimization: concatenate appending ranges? - may not be worthwhile.
202 37425 : if (hi == (ranges[i].begin - 1)) {
203 27 : ranges[i].begin = lo;
204 27 : return;
205 : }
206 37398 : ranges.insert(i, CharacterRange(lo, hi));
207 37398 : return;
208 : }
209 : // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
210 : // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
211 : // end of the last range they concatenate, which is just as good.
212 6903 : if (lo <= (ranges[i].end + 1)) {
213 : // found an intersect! we'll replace this entry in the array.
214 55 : ranges[i].begin = std::min(ranges[i].begin, lo);
215 55 : ranges[i].end = std::max(ranges[i].end, hi);
216 :
217 : // now check if the new range can subsume any subsequent ranges.
218 55 : unsigned next = i+1;
219 : // each iteration of the loop we will either remove something from the list, or break the loop.
220 119 : while (next < ranges.size()) {
221 37 : if (ranges[next].begin <= (ranges[i].end + 1)) {
222 : // the next entry now overlaps / concatenates this one.
223 9 : ranges[i].end = std::max(ranges[i].end, ranges[next].end);
224 9 : ranges.remove(next);
225 : } else
226 28 : break;
227 : }
228 :
229 55 : return;
230 : }
231 : }
232 :
233 : // CharacterRange comes after all existing ranges.
234 27194 : ranges.append(CharacterRange(lo, hi));
235 : }
236 :
237 : bool m_isCaseInsensitive;
238 :
239 : Vector<UChar> m_matches;
240 : Vector<CharacterRange> m_ranges;
241 : Vector<UChar> m_matchesUnicode;
242 : Vector<CharacterRange> m_rangesUnicode;
243 : };
244 :
245 : class YarrPatternConstructor {
246 : public:
247 58352 : YarrPatternConstructor(YarrPattern& pattern)
248 : : m_pattern(pattern)
249 : , m_characterClassConstructor(pattern.m_ignoreCase)
250 58352 : , m_invertParentheticalAssertion(false)
251 : {
252 58352 : m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
253 58352 : m_alternative = m_pattern.m_body->addNewAlternative();
254 58352 : m_pattern.m_disjunctions.append(m_pattern.m_body);
255 58352 : }
256 :
257 58352 : ~YarrPatternConstructor()
258 58352 : {
259 58352 : }
260 :
261 0 : void reset()
262 : {
263 0 : m_pattern.reset();
264 0 : m_characterClassConstructor.reset();
265 :
266 0 : m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
267 0 : m_alternative = m_pattern.m_body->addNewAlternative();
268 0 : m_pattern.m_disjunctions.append(m_pattern.m_body);
269 0 : }
270 :
271 19833 : void assertionBOL()
272 : {
273 19833 : if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
274 19833 : m_alternative->m_startsWithBOL = true;
275 19833 : m_alternative->m_containsBOL = true;
276 19833 : m_pattern.m_containsBOL = true;
277 : }
278 19833 : m_alternative->m_terms.append(PatternTerm::BOL());
279 19833 : }
280 21546 : void assertionEOL()
281 : {
282 21546 : m_alternative->m_terms.append(PatternTerm::EOL());
283 21546 : }
284 423 : void assertionWordBoundary(bool invert)
285 : {
286 423 : m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
287 423 : }
288 :
289 201798 : void atomPatternCharacter(UChar ch)
290 : {
291 : // We handle case-insensitive checking of unicode characters which do have both
292 : // cases by handling them as if they were defined using a CharacterClass.
293 201798 : if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
294 0 : atomCharacterClassBegin();
295 0 : atomCharacterClassAtom(ch);
296 0 : atomCharacterClassEnd();
297 : } else
298 201798 : m_alternative->m_terms.append(PatternTerm(ch));
299 201798 : }
300 :
301 31580 : void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
302 : {
303 31580 : switch (classID) {
304 : case DigitClassID:
305 25230 : m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
306 25230 : break;
307 : case SpaceClassID:
308 807 : m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
309 807 : break;
310 : case WordClassID:
311 372 : m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
312 372 : break;
313 : case NewlineClassID:
314 5171 : m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
315 5171 : break;
316 : }
317 31580 : }
318 :
319 30655 : void atomCharacterClassBegin(bool invert = false)
320 : {
321 30655 : m_invertCharacterClass = invert;
322 30655 : }
323 :
324 36017 : void atomCharacterClassAtom(UChar ch)
325 : {
326 36017 : m_characterClassConstructor.putChar(ch);
327 36017 : }
328 :
329 41891 : void atomCharacterClassRange(UChar begin, UChar end)
330 : {
331 41891 : m_characterClassConstructor.putRange(begin, end);
332 41891 : }
333 :
334 763 : void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
335 : {
336 763 : ASSERT(classID != NewlineClassID);
337 :
338 763 : switch (classID) {
339 : case DigitClassID:
340 28 : m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
341 28 : break;
342 :
343 : case SpaceClassID:
344 108 : m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
345 108 : break;
346 :
347 : case WordClassID:
348 627 : m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
349 627 : break;
350 :
351 : default:
352 0 : ASSERT_NOT_REACHED();
353 : }
354 763 : }
355 :
356 30655 : void atomCharacterClassEnd()
357 : {
358 30655 : CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
359 30655 : m_pattern.m_userCharacterClasses.append(newCharacterClass);
360 30655 : m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
361 30655 : }
362 :
363 49831 : void atomParenthesesSubpatternBegin(bool capture = true)
364 : {
365 49831 : unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
366 49831 : if (capture)
367 31303 : m_pattern.m_numSubpatterns++;
368 :
369 49831 : PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
370 49831 : m_pattern.m_disjunctions.append(parenthesesDisjunction);
371 49831 : m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
372 49831 : m_alternative = parenthesesDisjunction->addNewAlternative();
373 49831 : }
374 :
375 72 : void atomParentheticalAssertionBegin(bool invert = false)
376 : {
377 72 : PatternDisjunction* parenthesesDisjunction = js::OffTheBooks::new_<PatternDisjunction>(m_alternative);
378 72 : m_pattern.m_disjunctions.append(parenthesesDisjunction);
379 72 : m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
380 72 : m_alternative = parenthesesDisjunction->addNewAlternative();
381 72 : m_invertParentheticalAssertion = invert;
382 72 : }
383 :
384 49903 : void atomParenthesesEnd()
385 : {
386 49903 : ASSERT(m_alternative->m_parent);
387 49903 : ASSERT(m_alternative->m_parent->m_parent);
388 :
389 49903 : PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
390 49903 : m_alternative = m_alternative->m_parent->m_parent;
391 :
392 49903 : PatternTerm& lastTerm = m_alternative->lastTerm();
393 :
394 49903 : unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
395 49903 : unsigned numBOLAnchoredAlts = 0;
396 :
397 105986 : for (unsigned i = 0; i < numParenAlternatives; i++) {
398 : // Bubble up BOL flags
399 56083 : if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
400 126 : numBOLAnchoredAlts++;
401 : }
402 :
403 49903 : if (numBOLAnchoredAlts) {
404 126 : m_alternative->m_containsBOL = true;
405 : // If all the alternatives in parens start with BOL, then so does this one
406 126 : if (numBOLAnchoredAlts == numParenAlternatives)
407 0 : m_alternative->m_startsWithBOL = true;
408 : }
409 :
410 49903 : lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
411 49903 : m_invertParentheticalAssertion = false;
412 49903 : }
413 :
414 18 : void atomBackReference(unsigned subpatternId)
415 : {
416 18 : ASSERT(subpatternId);
417 18 : m_pattern.m_containsBackreferences = true;
418 18 : m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
419 :
420 18 : if (subpatternId > m_pattern.m_numSubpatterns) {
421 0 : m_alternative->m_terms.append(PatternTerm::ForwardReference());
422 0 : return;
423 : }
424 :
425 18 : PatternAlternative* currentAlternative = m_alternative;
426 18 : ASSERT(currentAlternative);
427 :
428 : // Note to self: if we waited until the AST was baked, we could also remove forwards refs
429 54 : while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
430 18 : PatternTerm& term = currentAlternative->lastTerm();
431 18 : ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
432 :
433 18 : if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
434 0 : m_alternative->m_terms.append(PatternTerm::ForwardReference());
435 0 : return;
436 : }
437 : }
438 :
439 18 : m_alternative->m_terms.append(PatternTerm(subpatternId));
440 : }
441 :
442 : // deep copy the argument disjunction. If filterStartsWithBOL is true,
443 : // skip alternatives with m_startsWithBOL set true.
444 25507 : PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
445 : {
446 25507 : PatternDisjunction* newDisjunction = 0;
447 51635 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
448 26128 : PatternAlternative* alternative = disjunction->m_alternatives[alt];
449 26128 : if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
450 6405 : if (!newDisjunction) {
451 5946 : newDisjunction = js::OffTheBooks::new_<PatternDisjunction>();
452 5946 : newDisjunction->m_parent = disjunction->m_parent;
453 : }
454 6405 : PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
455 22962 : for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
456 16557 : newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
457 : }
458 : }
459 :
460 25507 : if (newDisjunction)
461 5946 : m_pattern.m_disjunctions.append(newDisjunction);
462 25507 : return newDisjunction;
463 : }
464 :
465 57558 : PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
466 : {
467 57558 : if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
468 51765 : return PatternTerm(term);
469 :
470 5793 : PatternTerm termCopy = term;
471 5793 : termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
472 5793 : return termCopy;
473 : }
474 :
475 67933 : void quantifyAtom(unsigned min, unsigned max, bool greedy)
476 : {
477 67933 : ASSERT(min <= max);
478 67933 : ASSERT(m_alternative->m_terms.size());
479 :
480 67933 : if (!max) {
481 0 : m_alternative->removeLastTerm();
482 0 : return;
483 : }
484 :
485 67933 : PatternTerm& term = m_alternative->lastTerm();
486 67933 : ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
487 67933 : ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
488 :
489 : // For any assertion with a zero minimum, not matching is valid and has no effect,
490 : // remove it. Otherwise, we need to match as least once, but there is no point
491 : // matching more than once, so remove the quantifier. It is not entirely clear
492 : // from the spec whether or not this behavior is correct, but I believe this
493 : // matches Firefox. :-/
494 67933 : if (term.type == PatternTerm::TypeParentheticalAssertion) {
495 0 : if (!min)
496 0 : m_alternative->removeLastTerm();
497 0 : return;
498 : }
499 :
500 67933 : if (min == 0)
501 24226 : term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
502 43707 : else if (min == max)
503 2706 : term.quantify(min, QuantifierFixedCount);
504 : else {
505 41001 : term.quantify(min, QuantifierFixedCount);
506 41001 : m_alternative->m_terms.append(copyTerm(term));
507 : // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
508 41001 : m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
509 41001 : if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
510 2793 : m_alternative->lastTerm().parentheses.isCopy = true;
511 : }
512 : }
513 :
514 7719 : void disjunction()
515 : {
516 7719 : m_alternative = m_alternative->m_parent->addNewAlternative();
517 7719 : }
518 :
519 122379 : ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned *callFrameSizeOut)
520 : {
521 122379 : alternative->m_hasFixedSize = true;
522 122379 : unsigned currentInputPosition = initialInputPosition;
523 :
524 535693 : for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
525 413314 : PatternTerm& term = alternative->m_terms[i];
526 :
527 413314 : switch (term.type) {
528 : case PatternTerm::TypeAssertionBOL:
529 : case PatternTerm::TypeAssertionEOL:
530 : case PatternTerm::TypeAssertionWordBoundary:
531 41928 : term.inputPosition = currentInputPosition;
532 41928 : break;
533 :
534 : case PatternTerm::TypeBackReference:
535 18 : term.inputPosition = currentInputPosition;
536 18 : term.frameLocation = currentCallFrameSize;
537 18 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
538 18 : alternative->m_hasFixedSize = false;
539 18 : break;
540 :
541 : case PatternTerm::TypeForwardReference:
542 0 : break;
543 :
544 : case PatternTerm::TypePatternCharacter:
545 214565 : term.inputPosition = currentInputPosition;
546 214565 : if (term.quantityType != QuantifierFixedCount) {
547 6067 : term.frameLocation = currentCallFrameSize;
548 6067 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
549 6067 : alternative->m_hasFixedSize = false;
550 : } else
551 208498 : currentInputPosition += term.quantityCount;
552 214565 : break;
553 :
554 : case PatternTerm::TypeCharacterClass:
555 101107 : term.inputPosition = currentInputPosition;
556 101107 : if (term.quantityType != QuantifierFixedCount) {
557 46462 : term.frameLocation = currentCallFrameSize;
558 46462 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
559 46462 : alternative->m_hasFixedSize = false;
560 : } else
561 54645 : currentInputPosition += term.quantityCount;
562 101107 : break;
563 :
564 : case PatternTerm::TypeParenthesesSubpattern:
565 : // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
566 55624 : term.frameLocation = currentCallFrameSize;
567 55624 : if (term.quantityCount == 1 && !term.parentheses.isCopy) {
568 49882 : if (term.quantityType != QuantifierFixedCount)
569 12596 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
570 49882 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, ¤tCallFrameSize))
571 0 : return error;
572 : // If quantity is fixed, then pre-check its minimum size.
573 49882 : if (term.quantityType == QuantifierFixedCount)
574 37286 : currentInputPosition += term.parentheses.disjunction->m_minimumSize;
575 49882 : term.inputPosition = currentInputPosition;
576 5742 : } else if (term.parentheses.isTerminal) {
577 2739 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
578 2739 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition, ¤tCallFrameSize))
579 0 : return error;
580 2739 : term.inputPosition = currentInputPosition;
581 : } else {
582 3003 : term.inputPosition = currentInputPosition;
583 : unsigned dummy;
584 3003 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, BASE_FRAME_SIZE, currentInputPosition, &dummy))
585 0 : return error;
586 3003 : currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
587 : }
588 : // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
589 55624 : alternative->m_hasFixedSize = false;
590 55624 : break;
591 :
592 : case PatternTerm::TypeParentheticalAssertion:
593 72 : term.inputPosition = currentInputPosition;
594 72 : term.frameLocation = currentCallFrameSize;
595 72 : if (ErrorCode error = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition, ¤tCallFrameSize))
596 0 : return error;
597 72 : break;
598 : }
599 : }
600 :
601 122379 : alternative->m_minimumSize = currentInputPosition - initialInputPosition;
602 122379 : *callFrameSizeOut = currentCallFrameSize;
603 122379 : return NoError;
604 : }
605 :
606 114048 : ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned *maximumCallFrameSizeOut)
607 : {
608 114048 : if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
609 5912 : initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
610 :
611 114048 : unsigned minimumInputSize = UINT_MAX;
612 114048 : unsigned maximumCallFrameSize = 0;
613 114048 : bool hasFixedSize = true;
614 :
615 236427 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
616 122379 : PatternAlternative* alternative = disjunction->m_alternatives[alt];
617 : unsigned currentAlternativeCallFrameSize;
618 122379 : if (ErrorCode error = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition, ¤tAlternativeCallFrameSize))
619 0 : return error;
620 122379 : minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
621 122379 : maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
622 122379 : hasFixedSize &= alternative->m_hasFixedSize;
623 : }
624 :
625 114048 : if (minimumInputSize == UINT_MAX)
626 0 : return PatternTooLarge;
627 :
628 114048 : ASSERT(maximumCallFrameSize >= initialCallFrameSize);
629 :
630 114048 : disjunction->m_hasFixedSize = hasFixedSize;
631 114048 : disjunction->m_minimumSize = minimumInputSize;
632 114048 : disjunction->m_callFrameSize = maximumCallFrameSize;
633 114048 : *maximumCallFrameSizeOut = maximumCallFrameSize;
634 114048 : return NoError;
635 : }
636 :
637 58352 : ErrorCode setupOffsets()
638 : {
639 : unsigned dummy;
640 58352 : return setupDisjunctionOffsets(m_pattern.m_body, BASE_FRAME_SIZE, 0, &dummy);
641 : }
642 :
643 : // This optimization identifies sets of parentheses that we will never need to backtrack.
644 : // In these cases we do not need to store state from prior iterations.
645 : // We can presently avoid backtracking for:
646 : // * where the parens are at the end of the regular expression (last term in any of the
647 : // alternatives of the main body disjunction).
648 : // * where the parens are non-capturing, and quantified unbounded greedy (*).
649 : // * where the parens do not contain any capturing subpatterns.
650 58352 : void checkForTerminalParentheses()
651 : {
652 : // This check is much too crude; should be just checking whether the candidate
653 : // node contains nested capturing subpatterns, not the whole expression!
654 58352 : if (m_pattern.m_numSubpatterns)
655 26356 : return;
656 :
657 31996 : Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
658 65252 : for (size_t i = 0; i < alternatives.size(); ++i) {
659 33256 : Vector<PatternTerm>& terms = alternatives[i]->m_terms;
660 33256 : if (terms.size()) {
661 33238 : PatternTerm& term = terms.last();
662 35968 : if (term.type == PatternTerm::TypeParenthesesSubpattern
663 : && term.quantityType == QuantifierGreedy
664 : && term.quantityCount == quantifyInfinite
665 2730 : && !term.capture())
666 2730 : term.parentheses.isTerminal = true;
667 : }
668 : }
669 : }
670 :
671 58352 : void optimizeBOL()
672 : {
673 : // Look for expressions containing beginning of line (^) anchoring and unroll them.
674 : // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
675 : // This code relies on the parsing code tagging alternatives with m_containsBOL and
676 : // m_startsWithBOL and rolling those up to containing alternatives.
677 : // At this point, this is only valid for non-multiline expressions.
678 58352 : PatternDisjunction* disjunction = m_pattern.m_body;
679 :
680 58352 : if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
681 38638 : return;
682 :
683 19714 : PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
684 :
685 : // Set alternatives in disjunction to "onceThrough"
686 39464 : for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
687 19750 : disjunction->m_alternatives[alt]->setOnceThrough();
688 :
689 19714 : if (loopDisjunction) {
690 : // Move alternatives from loopDisjunction to disjunction
691 306 : for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
692 153 : disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
693 :
694 153 : loopDisjunction->m_alternatives.clear();
695 : }
696 : }
697 :
698 : private:
699 : YarrPattern& m_pattern;
700 : PatternAlternative* m_alternative;
701 : CharacterClassConstructor m_characterClassConstructor;
702 : bool m_invertCharacterClass;
703 : bool m_invertParentheticalAssertion;
704 : };
705 :
706 58352 : ErrorCode YarrPattern::compile(const UString& patternString)
707 : {
708 116704 : YarrPatternConstructor constructor(*this);
709 :
710 58352 : if (ErrorCode error = parse(constructor, patternString))
711 0 : return error;
712 :
713 : // If the pattern contains illegal backreferences reset & reparse.
714 : // Quoting Netscape's "What's new in JavaScript 1.2",
715 : // "Note: if the number of left parentheses is less than the number specified
716 : // in \#, the \# is taken as an octal escape as described in the next row."
717 58352 : if (containsIllegalBackReference()) {
718 0 : unsigned numSubpatterns = m_numSubpatterns;
719 :
720 0 : constructor.reset();
721 : #if !ASSERT_DISABLED
722 : ErrorCode error =
723 : #endif
724 0 : parse(constructor, patternString, numSubpatterns);
725 :
726 0 : ASSERT(!error);
727 0 : ASSERT(numSubpatterns == m_numSubpatterns);
728 : }
729 :
730 58352 : constructor.checkForTerminalParentheses();
731 58352 : constructor.optimizeBOL();
732 :
733 58352 : if (ErrorCode error = constructor.setupOffsets())
734 0 : return error;
735 :
736 58352 : return NoError;
737 : }
738 :
739 58352 : YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error)
740 : : m_ignoreCase(ignoreCase)
741 : , m_multiline(multiline)
742 : , m_containsBackreferences(false)
743 : , m_containsBOL(false)
744 : , m_numSubpatterns(0)
745 : , m_maxBackReference(0)
746 : , newlineCached(0)
747 : , digitsCached(0)
748 : , spacesCached(0)
749 : , wordcharCached(0)
750 : , nondigitsCached(0)
751 : , nonspacesCached(0)
752 58352 : , nonwordcharCached(0)
753 : {
754 58352 : *error = compile(pattern);
755 58352 : }
756 :
757 : } }
|