Summary: | Missing Unicode support in RegExp | ||
---|---|---|---|
Product: | [Applications] konqueror | Reporter: | Kyle Schmitt <kyleaschmitt> |
Component: | kjs | Assignee: | Konqueror Developers <konq-bugs> |
Status: | RESOLVED FIXED | ||
Severity: | normal | CC: | maksim |
Priority: | NOR | ||
Version: | unspecified | ||
Target Milestone: | --- | ||
Platform: | Compiled Sources | ||
OS: | Linux | ||
Latest Commit: | Version Fixed In: | ||
Sentry Crash Report: | |||
Attachments: | Test-case showing the problem |
Description
Kyle Schmitt
2005-03-12 23:51:33 UTC
Created attachment 10087 [details]
Test-case showing the problem
This shows the problem.
Confirmed on 3.4.0. The problem is not related to case sensitivity. It's due to the general lack of Unicode support in the RegExp object. Changing the summary. Right now we only use the lower byte which leads to ambiguity between ASCII character and other Unicode chars with the same lower byte. var s = "\u3041"; var reg = RegExp('A'); // == \u0041 document.write(s.match(reg) ? "FAIL" : "PASS"); The problem is that RegExp object uses conversion from Unicode (16-bit) into 8-bit with simple unsigned char() conversion. It simply cannot work... It should use UTF-8 internally and send that to libpcre (that is used for Perl compatible regular expressions), then convert the result to Unicode from UTF-8 back. This is a major bug, because it goes against ECMA-262. Moreover, the problem is in the whole JavaScript code (kjs). I looked in the code of 3.5.2 (kdelibs/kjs) and also strings are not able to handle Unicode: toLowerCase and toUpperCase uses normal glibc's tolower(int) and toupper(int). Qt has nice functions to convert strings to/from UTF-8 and also to make them upper case and lower case. Why it is not used? If the KDE team doesn't want a dependency on Qt in kjs, then those functions should be reimplemented to fully support Unicode. I think I have a fix for this in trunk, will consider merging back in for 3.5.6 (too late for 3.5.5...) Thanks for the testcase. As for unicode stuff: we will link to Qt for it in 4.0. It probably will not change for 3.x though. SVN commit 604411 by orlovich: Fix Unicode support in RegExp handling, and also be more robust vs. embedded nulls. Fixes problems with some cyrillic characters in gmail (#116512) Makes google calendar somewhat work, though not fully (part #135445) Also fixes #135246, #101398 BUG:135246 BUG:101398 BUG:116512 CCBUG:135445 M +149 -8 regexp.cpp M +24 -1 regexp.h M +2 -0 regexp_object.cpp M +28 -4 string_object.cpp --- branches/KDE/3.5/kdelibs/kjs/regexp.cpp #604410:604411 @@ -2,6 +2,8 @@ /* * This file is part of the KDE libraries * Copyright (C) 1999-2001 Harri Porten (porten@kde.org) + * Copyright (C) 2003,2004 Apple Computer, Inc. + * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -22,22 +24,35 @@ #include "regexp.h" #include "lexer.h" +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> using namespace KJS; +RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; + RegExp::RegExp(const UString &p, int f) - : pat(p), flgs(f), m_notEmpty(false), valid(true) + : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0) { + // Determine whether libpcre has unicode support if need be.. + if (utf8Support == Unknown) { + int supported; + pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); + utf8Support = supported ? Supported : Unsupported; + } + nrSubPatterns = 0; // determined in match() with POSIX regex. // JS regexps can contain Unicode escape sequences (\uxxxx) which // are rather uncommon elsewhere. As our regexp libs don't understand // them we do the unescaping ourselves internally. + // Also make sure to expand out any nulls as pcre_compile + // expects null termination.. UString intern; - if (p.find('\\') >= 0) { + const char* const nil = "\\x00"; + if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) { bool escape = false; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; @@ -52,7 +67,12 @@ if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) && Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) { c = Lexer::convertUnicode(c0, c1, c2, c3); - intern += UString(&c, 1); + if (c.unicode() == 0) { + // Make sure to encode 0, to avoid terminating the string + intern += UString(nil); + } else { + intern += UString(&c, 1); + } i += 4; continue; } @@ -62,6 +82,8 @@ } else { if (c == '\\') escape = true; + else if (c == '\0') + intern += UString(nil); else intern += UString(&c, 1); } @@ -81,8 +103,16 @@ if (flgs & Multiline) pcreflags |= PCRE_MULTILINE; - pcregex = pcre_compile(intern.ascii(), pcreflags, + if (utf8Support == Supported) + pcreflags |= PCRE_UTF8; + + // Fill our buffer with an encoded version, whether utf-8, or, + // if PCRE is incapable, truncated. + prepareMatch(intern); + + pcregex = pcre_compile(buffer, pcreflags, &perrormsg, &errorOffset, NULL); + doneMatch(); // Cleanup buffers if (!pcregex) { #ifndef NDEBUG fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); @@ -128,6 +158,7 @@ RegExp::~RegExp() { + doneMatch(); // Be 100% sure buffers are freed #ifdef HAVE_PCREPOSIX if (pcregex) pcre_free(pcregex); @@ -137,8 +168,97 @@ #endif } +void RegExp::prepareUtf8(const UString& s) +{ + // Allocate a buffer big enough to hold all the characters plus \0 + const int length = s.size(); + buffer = new char[length * 3 + 1]; + + // Also create buffer for positions. We need one extra character in there, + // even past the \0 since the non-empty handling may jump one past the end + originalPos = new int[length * 3 + 2]; + + // Convert to runs of 8-bit characters, and generate indeces + // Note that we do NOT combine surrogate pairs here, as + // regexps operate on them as separate characters + char *p = buffer; + int *posOut = originalPos; + const UChar *d = s.data(); + for (int i = 0; i != length; ++i) { + unsigned short c = d[i].unicode(); + + int sequenceLen; + if (c < 0x80) { + *p++ = (char)c; + sequenceLen = 1; + } else if (c < 0x800) { + *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 + *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + sequenceLen = 2; + } else { + *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 + *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set + *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + sequenceLen = 3; + } + + while (sequenceLen > 0) { + *posOut = i; + ++posOut; + --sequenceLen; + } + } + + bufferSize = p - buffer; + + *p++ = '\0'; + + // Record positions for \0, and the fictional character after that. + *posOut = length; + *(posOut+1) = length+1; +} + +void RegExp::prepareASCII (const UString& s) +{ + originalPos = 0; + + // Best-effort attempt to get something done + // when we don't have utf 8 available -- use + // truncated version, and pray for the best + CString truncated = s.cstring(); + buffer = new char[truncated.size() + 1]; + memcpy(buffer, truncated.c_str(), truncated.size()); + buffer[truncated.size()] = '\0'; // For _compile use + bufferSize = truncated.size(); +} + +void RegExp::prepareMatch(const UString &s) +{ + delete[] originalPos; // Just to be sure.. + delete[] buffer; + if (utf8Support == Supported) + prepareUtf8(s); + else + prepareASCII(s); + +#ifndef NDEBUG + originalS = s; +#endif +} + +void RegExp::doneMatch() +{ + delete[] originalPos; originalPos = 0; + delete[] buffer; buffer = 0; +} + UString RegExp::match(const UString &s, int i, int *pos, int **ovector) { +#ifndef NDEBUG + assert(s.data() == originalS.data()); // Make sure prepareMatch got called right.. +#endif + assert(valid); + if (i < 0) i = 0; if (ovector) @@ -151,14 +271,28 @@ return UString::null; #ifdef HAVE_PCREPOSIX - CString buffer(s.cstring()); - int bufferSize = buffer.size(); int ovecsize = (nrSubPatterns+1)*3; // see pcre docu if (ovector) *ovector = new int[ovecsize]; if (!pcregex) return UString::null; - if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i, + int startPos; + int nextPos; + + if (utf8Support == Supported) { + startPos = i; + while (originalPos[startPos] < i) + ++startPos; + + nextPos = startPos; + while (originalPos[nextPos] < (i + 1)) + ++nextPos; + } else { + startPos = i; + nextPos = i + 1; + } + + if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos, m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) { @@ -172,7 +306,7 @@ fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n"); #endif m_notEmpty = 0; - if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0, + if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) return UString::null; } @@ -181,6 +315,13 @@ } // Got a match, proceed with it. + // But fix up the ovector if need be.. + if (ovector && originalPos) { + for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) { + if ((*ovector)[c] != -1) + (*ovector)[c] = originalPos[(*ovector)[c]]; + } + } if (!ovector) return UString::null; // don't rely on the return value if you pass ovector==0 --- branches/KDE/3.5/kdelibs/kjs/regexp.h #604410:604411 @@ -51,16 +51,39 @@ // RegExp.exec, so it has to store $1 etc. // bool test(const UString &s, int i = -1); unsigned int subPatterns() const { return nrSubPatterns; } + + //These methods should be called around the match of the same string.. + void prepareMatch(const UString &s); + void doneMatch(); private: const UString pat; - int flgs; + int flgs : 8; bool m_notEmpty; bool valid; + + // Cached encoding info... + char* buffer; + int* originalPos; + int bufferSize; + void prepareUtf8 (const UString& s); + void prepareASCII (const UString& s); +#ifndef NDEBUG + UString originalS; // the original string, used for sanity-checking +#endif + #ifndef HAVE_PCREPOSIX regex_t preg; #else pcre *pcregex; + + enum UTF8SupportState { + Unknown, + Supported, + Unsupported + }; + + static UTF8SupportState utf8Support; #endif unsigned int nrSubPatterns; --- branches/KDE/3.5/kdelibs/kjs/regexp_object.cpp #604410:604411 @@ -120,7 +120,9 @@ RegExpObjectImp* regExpObj = static_cast<RegExpObjectImp*>(exec->lexicalInterpreter()->builtinRegExp().imp()); int **ovector = regExpObj->registerRegexp( re, s.value() ); + re->prepareMatch(s.value()); str = re->match(s.value(), i, 0L, ovector); + re->doneMatch(); regExpObj->setSubPatterns(re->subPatterns()); if (id == Test) --- branches/KDE/3.5/kdelibs/kjs/string_object.cpp #604410:604411 @@ -294,6 +294,7 @@ } RegExpObjectImp* regExpObj = static_cast<RegExpObjectImp*>(exec->interpreter()->builtinRegExp().imp()); int **ovector = regExpObj->registerRegexp(reg, s); + reg->prepareMatch(s); UString mstr = reg->match(s, -1, &pos, ovector); if (id == Search) { result = Number(pos); @@ -316,6 +317,7 @@ result = exec->lexicalInterpreter()->builtinArray().construct(exec, list); } } + reg->doneMatch(); delete tmpReg; break; } @@ -337,13 +339,17 @@ else u3 = a1.toString(exec); // 2nd arg is the replacement string + UString out; + // This is either a loop (if global is set) or a one-way (if not). + reg->prepareMatch(s); do { int **ovector = regExpObj->registerRegexp( reg, s ); UString mstr = reg->match(s, lastIndex, &pos, ovector); regExpObj->setSubPatterns(reg->subPatterns()); if (pos == -1) break; + len = mstr.size(); UString rstr; @@ -381,12 +387,27 @@ Object thisObj = exec->interpreter()->globalObject(); rstr = o1.call( exec, thisObj, l ).toString(exec); } - lastIndex = pos + rstr.size(); - s = s.substr(0, pos) + rstr + s.substr(pos + len); - //fprintf(stderr,"pos=%d,len=%d,lastIndex=%d,s=%s\n",pos,len,lastIndex,s.ascii()); + + // Append the stuff we skipped over to get to the match -- + // that would be [lastIndex, pos) of the original.. + if (pos != lastIndex) + out += s.substr(lastIndex, pos - lastIndex); + + // Append the replacement.. + out += rstr; + + lastIndex = pos + len; // Skip over the matched stuff... } while (global); - result = String(s); + // Append the rest of the string to the output... + if (lastIndex == 0 && out.size() == 0) // Don't copy stuff if nothing changed + out = s; + else + out += s.substr(lastIndex, s.size() - lastIndex); + + reg->doneMatch(); + + result = String(out); } else { // First arg is a string u2 = a0.toString(exec); pos = s.find(u2); @@ -433,8 +454,10 @@ if (a0.type() == ObjectType && Object::dynamicCast(a0).inherits(&RegExpImp::info)) { Object obj0 = Object::dynamicCast(a0); RegExp reg(obj0.get(exec,"source").toString(exec)); + reg.prepareMatch(s); if (s.isEmpty() && !reg.match(s, 0).isNull()) { // empty string matched by regexp -> empty array + reg.doneMatch(); res.put(exec, lengthPropertyName, Number(0), DontDelete|ReadOnly|DontEnum); break; } @@ -454,6 +477,7 @@ i++; } } + reg.doneMatch(); } else { u2 = a0.toString(exec); if (u2.isEmpty()) { |