Version: (using KDE Devel) Installed from: Compiled sources Compiler: g++ (GCC) 4.1.0 (SUSE Linux) x86_64 OS: Linux I tried Google calendar (http://www.google.com/calendar/render) with a recent (after 3.5.5) Konqueror, but it doesn't show the calendar at all. Some menus appear in the left and top area, but it is useless. I tried different UA strings, with no luck.
SVN commit 604411 by orlovich: Fix Unicode support in RegExp handling, and also be more robust vs. embedded nulls. Fixes problems with some cyrillic characters in gmail (#116512) Makes google calendar somewhat work, though not fully (part #135445) Also fixes #135246, #101398 BUG:135246 BUG:101398 BUG:116512 CCBUG:135445 M +149 -8 regexp.cpp M +24 -1 regexp.h M +2 -0 regexp_object.cpp M +28 -4 string_object.cpp --- branches/KDE/3.5/kdelibs/kjs/regexp.cpp #604410:604411 @@ -2,6 +2,8 @@ /* * This file is part of the KDE libraries * Copyright (C) 1999-2001 Harri Porten (porten@kde.org) + * Copyright (C) 2003,2004 Apple Computer, Inc. + * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -22,22 +24,35 @@ #include "regexp.h" #include "lexer.h" +#include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> using namespace KJS; +RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; + RegExp::RegExp(const UString &p, int f) - : pat(p), flgs(f), m_notEmpty(false), valid(true) + : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0) { + // Determine whether libpcre has unicode support if need be.. + if (utf8Support == Unknown) { + int supported; + pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); + utf8Support = supported ? Supported : Unsupported; + } + nrSubPatterns = 0; // determined in match() with POSIX regex. // JS regexps can contain Unicode escape sequences (\uxxxx) which // are rather uncommon elsewhere. As our regexp libs don't understand // them we do the unescaping ourselves internally. + // Also make sure to expand out any nulls as pcre_compile + // expects null termination.. UString intern; - if (p.find('\\') >= 0) { + const char* const nil = "\\x00"; + if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) { bool escape = false; for (int i = 0; i < p.size(); ++i) { UChar c = p[i]; @@ -52,7 +67,12 @@ if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) && Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) { c = Lexer::convertUnicode(c0, c1, c2, c3); - intern += UString(&c, 1); + if (c.unicode() == 0) { + // Make sure to encode 0, to avoid terminating the string + intern += UString(nil); + } else { + intern += UString(&c, 1); + } i += 4; continue; } @@ -62,6 +82,8 @@ } else { if (c == '\\') escape = true; + else if (c == '\0') + intern += UString(nil); else intern += UString(&c, 1); } @@ -81,8 +103,16 @@ if (flgs & Multiline) pcreflags |= PCRE_MULTILINE; - pcregex = pcre_compile(intern.ascii(), pcreflags, + if (utf8Support == Supported) + pcreflags |= PCRE_UTF8; + + // Fill our buffer with an encoded version, whether utf-8, or, + // if PCRE is incapable, truncated. + prepareMatch(intern); + + pcregex = pcre_compile(buffer, pcreflags, &perrormsg, &errorOffset, NULL); + doneMatch(); // Cleanup buffers if (!pcregex) { #ifndef NDEBUG fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); @@ -128,6 +158,7 @@ RegExp::~RegExp() { + doneMatch(); // Be 100% sure buffers are freed #ifdef HAVE_PCREPOSIX if (pcregex) pcre_free(pcregex); @@ -137,8 +168,97 @@ #endif } +void RegExp::prepareUtf8(const UString& s) +{ + // Allocate a buffer big enough to hold all the characters plus \0 + const int length = s.size(); + buffer = new char[length * 3 + 1]; + + // Also create buffer for positions. We need one extra character in there, + // even past the \0 since the non-empty handling may jump one past the end + originalPos = new int[length * 3 + 2]; + + // Convert to runs of 8-bit characters, and generate indeces + // Note that we do NOT combine surrogate pairs here, as + // regexps operate on them as separate characters + char *p = buffer; + int *posOut = originalPos; + const UChar *d = s.data(); + for (int i = 0; i != length; ++i) { + unsigned short c = d[i].unicode(); + + int sequenceLen; + if (c < 0x80) { + *p++ = (char)c; + sequenceLen = 1; + } else if (c < 0x800) { + *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 + *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + sequenceLen = 2; + } else { + *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 + *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set + *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + sequenceLen = 3; + } + + while (sequenceLen > 0) { + *posOut = i; + ++posOut; + --sequenceLen; + } + } + + bufferSize = p - buffer; + + *p++ = '\0'; + + // Record positions for \0, and the fictional character after that. + *posOut = length; + *(posOut+1) = length+1; +} + +void RegExp::prepareASCII (const UString& s) +{ + originalPos = 0; + + // Best-effort attempt to get something done + // when we don't have utf 8 available -- use + // truncated version, and pray for the best + CString truncated = s.cstring(); + buffer = new char[truncated.size() + 1]; + memcpy(buffer, truncated.c_str(), truncated.size()); + buffer[truncated.size()] = '\0'; // For _compile use + bufferSize = truncated.size(); +} + +void RegExp::prepareMatch(const UString &s) +{ + delete[] originalPos; // Just to be sure.. + delete[] buffer; + if (utf8Support == Supported) + prepareUtf8(s); + else + prepareASCII(s); + +#ifndef NDEBUG + originalS = s; +#endif +} + +void RegExp::doneMatch() +{ + delete[] originalPos; originalPos = 0; + delete[] buffer; buffer = 0; +} + UString RegExp::match(const UString &s, int i, int *pos, int **ovector) { +#ifndef NDEBUG + assert(s.data() == originalS.data()); // Make sure prepareMatch got called right.. +#endif + assert(valid); + if (i < 0) i = 0; if (ovector) @@ -151,14 +271,28 @@ return UString::null; #ifdef HAVE_PCREPOSIX - CString buffer(s.cstring()); - int bufferSize = buffer.size(); int ovecsize = (nrSubPatterns+1)*3; // see pcre docu if (ovector) *ovector = new int[ovecsize]; if (!pcregex) return UString::null; - if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i, + int startPos; + int nextPos; + + if (utf8Support == Supported) { + startPos = i; + while (originalPos[startPos] < i) + ++startPos; + + nextPos = startPos; + while (originalPos[nextPos] < (i + 1)) + ++nextPos; + } else { + startPos = i; + nextPos = i + 1; + } + + if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos, m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0, // see man pcretest ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) { @@ -172,7 +306,7 @@ fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n"); #endif m_notEmpty = 0; - if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0, + if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, 0, ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH) return UString::null; } @@ -181,6 +315,13 @@ } // Got a match, proceed with it. + // But fix up the ovector if need be.. + if (ovector && originalPos) { + for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) { + if ((*ovector)[c] != -1) + (*ovector)[c] = originalPos[(*ovector)[c]]; + } + } if (!ovector) return UString::null; // don't rely on the return value if you pass ovector==0 --- branches/KDE/3.5/kdelibs/kjs/regexp.h #604410:604411 @@ -51,16 +51,39 @@ // RegExp.exec, so it has to store $1 etc. // bool test(const UString &s, int i = -1); unsigned int subPatterns() const { return nrSubPatterns; } + + //These methods should be called around the match of the same string.. + void prepareMatch(const UString &s); + void doneMatch(); private: const UString pat; - int flgs; + int flgs : 8; bool m_notEmpty; bool valid; + + // Cached encoding info... + char* buffer; + int* originalPos; + int bufferSize; + void prepareUtf8 (const UString& s); + void prepareASCII (const UString& s); +#ifndef NDEBUG + UString originalS; // the original string, used for sanity-checking +#endif + #ifndef HAVE_PCREPOSIX regex_t preg; #else pcre *pcregex; + + enum UTF8SupportState { + Unknown, + Supported, + Unsupported + }; + + static UTF8SupportState utf8Support; #endif unsigned int nrSubPatterns; --- branches/KDE/3.5/kdelibs/kjs/regexp_object.cpp #604410:604411 @@ -120,7 +120,9 @@ RegExpObjectImp* regExpObj = static_cast<RegExpObjectImp*>(exec->lexicalInterpreter()->builtinRegExp().imp()); int **ovector = regExpObj->registerRegexp( re, s.value() ); + re->prepareMatch(s.value()); str = re->match(s.value(), i, 0L, ovector); + re->doneMatch(); regExpObj->setSubPatterns(re->subPatterns()); if (id == Test) --- branches/KDE/3.5/kdelibs/kjs/string_object.cpp #604410:604411 @@ -294,6 +294,7 @@ } RegExpObjectImp* regExpObj = static_cast<RegExpObjectImp*>(exec->interpreter()->builtinRegExp().imp()); int **ovector = regExpObj->registerRegexp(reg, s); + reg->prepareMatch(s); UString mstr = reg->match(s, -1, &pos, ovector); if (id == Search) { result = Number(pos); @@ -316,6 +317,7 @@ result = exec->lexicalInterpreter()->builtinArray().construct(exec, list); } } + reg->doneMatch(); delete tmpReg; break; } @@ -337,13 +339,17 @@ else u3 = a1.toString(exec); // 2nd arg is the replacement string + UString out; + // This is either a loop (if global is set) or a one-way (if not). + reg->prepareMatch(s); do { int **ovector = regExpObj->registerRegexp( reg, s ); UString mstr = reg->match(s, lastIndex, &pos, ovector); regExpObj->setSubPatterns(reg->subPatterns()); if (pos == -1) break; + len = mstr.size(); UString rstr; @@ -381,12 +387,27 @@ Object thisObj = exec->interpreter()->globalObject(); rstr = o1.call( exec, thisObj, l ).toString(exec); } - lastIndex = pos + rstr.size(); - s = s.substr(0, pos) + rstr + s.substr(pos + len); - //fprintf(stderr,"pos=%d,len=%d,lastIndex=%d,s=%s\n",pos,len,lastIndex,s.ascii()); + + // Append the stuff we skipped over to get to the match -- + // that would be [lastIndex, pos) of the original.. + if (pos != lastIndex) + out += s.substr(lastIndex, pos - lastIndex); + + // Append the replacement.. + out += rstr; + + lastIndex = pos + len; // Skip over the matched stuff... } while (global); - result = String(s); + // Append the rest of the string to the output... + if (lastIndex == 0 && out.size() == 0) // Don't copy stuff if nothing changed + out = s; + else + out += s.substr(lastIndex, s.size() - lastIndex); + + reg->doneMatch(); + + result = String(out); } else { // First arg is a string u2 = a0.toString(exec); pos = s.find(u2); @@ -433,8 +454,10 @@ if (a0.type() == ObjectType && Object::dynamicCast(a0).inherits(&RegExpImp::info)) { Object obj0 = Object::dynamicCast(a0); RegExp reg(obj0.get(exec,"source").toString(exec)); + reg.prepareMatch(s); if (s.isEmpty() && !reg.match(s, 0).isNull()) { // empty string matched by regexp -> empty array + reg.doneMatch(); res.put(exec, lengthPropertyName, Number(0), DontDelete|ReadOnly|DontEnum); break; } @@ -454,6 +477,7 @@ i++; } } + reg.doneMatch(); } else { u2 = a0.toString(exec); if (u2.isEmpty()) {
Works pretty well in 4.0.x, though not perfect..
Thank you Maksim Orlovich, testin qith Kubuntu Gutsy with KDE 4.0.1 works pretty well ;-)
Closing as fixed as of comments #2 and #3.