Ignore:
Timestamp:
Mar 26, 2012, 1:13:39 PM (13 years ago)
Author:
[email protected]
Message:

Greek sigma is handled wrong in case independent regexp.
https://bugs.webkit.org/show_bug.cgi?id=82063

Reviewed by Oliver Hunt.

Source/JavaScriptCore:

The bug here is that we assume that any given codepoint has at most one additional value it
should match under a case insensitive match, and that the pair of codepoints that match (if
a codepoint does not only match itself) can be determined by calling toUpper/toLower on the
given codepoint). Life is not that simple.

Instead, pre-calculate a set of tables mapping from a UCS2 codepoint to the set of characters
it may match, under the ES5.1 case-insensitive matching rules. Since unicode is fairly regular
we can pack this table quite nicely, and get it down to 364 entries. This means we can use a
simple binary search to find an entry in typically eight compares.

  • CMakeLists.txt:
  • GNUmakefile.list.am:
  • JavaScriptCore.gypi:
  • JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.vcproj:
  • JavaScriptCore.xcodeproj/project.pbxproj:
  • yarr/yarr.pri:
    • Added new files to build systems.
  • yarr/YarrCanonicalizeUCS2.cpp: Added.
    • New - autogenerated, UCS2 canonicalized comparison tables.
  • yarr/YarrCanonicalizeUCS2.h: Added.

(JSC::Yarr::rangeInfoFor):

  • Look up the canonicalization info for a UCS2 character.

(JSC::Yarr::getCanonicalPair):

  • For a UCS2 character with a single equivalent value, look it up.

(JSC::Yarr::isCanonicallyUnique):

  • Returns true if no other UCS2 code points are canonically equal.

(JSC::Yarr::areCanonicallyEquivalent):

  • Compare two values, under canonicalization rules.
  • yarr/YarrCanonicalizeUCS2.js: Added.
    • script used to generate YarrCanonicalizeUCS2.cpp.
  • yarr/YarrInterpreter.cpp:

(JSC::Yarr::Interpreter::tryConsumeBackReference):

  • Use isCanonicallyUnique, rather than Unicode toUpper/toLower.
  • yarr/YarrJIT.cpp:

(JSC::Yarr::YarrGenerator::jumpIfCharNotEquals):
(JSC::Yarr::YarrGenerator::generatePatternCharacterOnce):
(JSC::Yarr::YarrGenerator::generatePatternCharacterFixed):

  • Use isCanonicallyUnique, rather than Unicode toUpper/toLower.
  • yarr/YarrPattern.cpp:

(JSC::Yarr::CharacterClassConstructor::putChar):

  • Updated to determine canonical equivalents correctly.

(JSC::Yarr::CharacterClassConstructor::putUnicodeIgnoreCase):

  • Added, used to put a non-ascii, non-unique character in a case-insensitive match.

(JSC::Yarr::CharacterClassConstructor::putRange):

  • Updated to determine canonical equivalents correctly.

(JSC::Yarr::YarrPatternConstructor::atomPatternCharacter):

  • Changed to call putUnicodeIgnoreCase, instead of putChar, avoid a double lookup of rangeInfo.

LayoutTests:

  • fast/regex/script-tests/unicodeCaseInsensitive.js: Added.

(shouldBeTrue.ucs2CodePoint):

  • fast/regex/unicodeCaseInsensitive-expected.txt: Added.
  • fast/regex/unicodeCaseInsensitive.html: Added.
    • Added test cases for case-insensitive matches of non-ascii characters.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r106748 r112143  
    2929
    3030#include "Yarr.h"
     31#include "YarrCanonicalizeUCS2.h"
    3132#include "YarrParser.h"
    3233#include <wtf/Vector.h>
     
    6768    void putChar(UChar ch)
    6869    {
     70        // Handle ascii cases.
    6971        if (ch <= 0x7f) {
    7072            if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
     
    7375            } else
    7476                addSorted(m_matches, ch);
     77            return;
     78        }
     79
     80        // Simple case, not a case-insensitive match.
     81        if (!m_isCaseInsensitive) {
     82            addSorted(m_matchesUnicode, ch);
     83            return;
     84        }
     85
     86        // Add multiple matches, if necessary.
     87        UCS2CanonicalizationRange* info = rangeInfoFor(ch);
     88        if (info->type == CanonicalizeUnique)
     89            addSorted(m_matchesUnicode, ch);
     90        else
     91            putUnicodeIgnoreCase(ch, info);
     92    }
     93
     94    void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
     95    {
     96        ASSERT(m_isCaseInsensitive);
     97        ASSERT(ch > 0x7f);
     98        ASSERT(ch >= info->begin && ch <= info->end);
     99        ASSERT(info->type != CanonicalizeUnique);
     100        if (info->type == CanonicalizeSet) {
     101            for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
     102                addSorted(m_matchesUnicode, ch);
    75103        } else {
    76             UChar upper, lower;
    77             if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
    78                 addSorted(m_matchesUnicode, upper);
    79                 addSorted(m_matchesUnicode, lower);
    80             } else
    81                 addSorted(m_matchesUnicode, ch);
    82         }
    83     }
    84 
    85     // returns true if this character has another case, and 'ch' is the upper case form.
    86     static inline bool isUnicodeUpper(UChar ch)
    87     {
    88         return ch != Unicode::toLower(ch);
    89     }
    90 
    91     // returns true if this character has another case, and 'ch' is the lower case form.
    92     static inline bool isUnicodeLower(UChar ch)
    93     {
    94         return ch != Unicode::toUpper(ch);
     104            addSorted(m_matchesUnicode, ch);
     105            addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
     106        }
    95107    }
    96108
     
    109121            }
    110122        }
    111         if (hi >= 0x80) {
    112             uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
    113             addSortedRange(m_rangesUnicode, unicodeCurr, hi);
    114            
    115             if (m_isCaseInsensitive) {
    116                 while (unicodeCurr <= hi) {
    117                     // If the upper bound of the range (hi) is 0xffff, the increments to
    118                     // unicodeCurr in this loop may take it to 0x10000.  This is fine
    119                     // (if so we won't re-enter the loop, since the loop condition above
    120                     // will definitely fail) - but this does mean we cannot use a UChar
    121                     // to represent unicodeCurr, we must use a 32-bit value instead.
    122                     ASSERT(unicodeCurr <= 0xffff);
    123 
    124                     if (isUnicodeUpper(unicodeCurr)) {
    125                         UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
    126                         UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
    127                         while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
    128                             lowerCaseRangeEnd++;
    129                         addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
    130                     } else if (isUnicodeLower(unicodeCurr)) {
    131                         UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
    132                         UChar upperCaseRangeEnd = upperCaseRangeBegin;
    133                         while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
    134                             upperCaseRangeEnd++;
    135                         addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
    136                     } else
    137                         ++unicodeCurr;
    138                 }
    139             }
    140         }
     123        if (hi <= 0x7f)
     124            return;
     125
     126        lo = std::max(lo, (UChar)0x80);
     127        addSortedRange(m_rangesUnicode, lo, hi);
     128       
     129        if (!m_isCaseInsensitive)
     130            return;
     131
     132        UCS2CanonicalizationRange* info = rangeInfoFor(lo);
     133        while (true) {
     134            // Handle the range [lo .. end]
     135            UChar end = std::min(info->end, hi);
     136
     137            switch (info->type) {
     138            case CanonicalizeUnique:
     139                // Nothing to do - no canonical equivalents.
     140                break;
     141            case CanonicalizeSet: {
     142                UChar ch;
     143                for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
     144                    addSorted(m_matchesUnicode, ch);
     145                break;
     146            }
     147            case CanonicalizeRangeLo:
     148                addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
     149                break;
     150            case CanonicalizeRangeHi:
     151                addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
     152                break;
     153            case CanonicalizeAlternatingAligned:
     154                // Use addSortedRange since there is likely an abutting range to combine with.
     155                if (lo & 1)
     156                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
     157                if (!(end & 1))
     158                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
     159                break;
     160            case CanonicalizeAlternatingUnaligned:
     161                // Use addSortedRange since there is likely an abutting range to combine with.
     162                if (!(lo & 1))
     163                    addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
     164                if (end & 1)
     165                    addSortedRange(m_rangesUnicode, end + 1, end + 1);
     166                break;
     167            }
     168
     169            if (hi == end)
     170                return;
     171
     172            ++info;
     173            lo = info->begin;
     174        };
     175
    141176    }
    142177
     
    281316        // We handle case-insensitive checking of unicode characters which do have both
    282317        // cases by handling them as if they were defined using a CharacterClass.
    283         if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
    284             atomCharacterClassBegin();
    285             atomCharacterClassAtom(ch);
    286             atomCharacterClassEnd();
    287         } else
     318        if (!m_pattern.m_ignoreCase || isASCII(ch)) {
    288319            m_alternative->m_terms.append(PatternTerm(ch));
     320            return;
     321        }
     322
     323        UCS2CanonicalizationRange* info = rangeInfoFor(ch);
     324        if (info->type == CanonicalizeUnique) {
     325            m_alternative->m_terms.append(PatternTerm(ch));
     326            return;
     327        }
     328
     329        m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
     330        CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
     331        m_pattern.m_userCharacterClasses.append(newCharacterClass);
     332        m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
    289333    }
    290334
Note: See TracChangeset for help on using the changeset viewer.