Version: (using KDE KDE 3.2.3) Installed from: Debian testing/unstable Packages Thai words are not separated using space like English. Libthai (http://linux.thai.net/libthai) provides the function to break the string of words at the right position. Here is the patch by Lars Knoll and me to let khtml make use of libthai to break the Thai words correctly (http://linux.thai.net/cgi-bin/viewcvs.cgi/software/kde/kdelibs-3.2.3/). More info here: http://linux.thai.net/Members/ott/qt/. diff -uNr kdelibs-3.2.3.org/khtml/khtml_factory.cpp kdelibs-3.2.3/khtml/khtml_factory.cpp --- kdelibs-3.2.3.org/khtml/khtml_factory.cpp 2004-02-29 21:32:10.000000000 +0900 +++ kdelibs-3.2.3/khtml/khtml_factory.cpp 2004-07-28 19:29:15.000000000 +0900 @@ -25,6 +25,7 @@ #include "css/cssstyleselector.h" #include "html/html_imageimpl.h" #include "rendering/render_style.h" +#include "rendering/break_lines.h" #include "misc/loader.h" #include <kinstance.h> @@ -94,6 +95,7 @@ khtml::CSSStyleSelector::clear(); khtml::RenderStyle::cleanup(); khtml::Cache::clear(); + khtml::cleanup_thaibreaks(); } else deref(); diff -uNr kdelibs-3.2.3.org/khtml/rendering/break_lines.cpp kdelibs-3.2.3/khtml/rendering/break_lines.cpp --- kdelibs-3.2.3.org/khtml/rendering/break_lines.cpp 2002-09-06 03:12:06.000000000 +0900 +++ kdelibs-3.2.3/khtml/rendering/break_lines.cpp 2004-07-29 02:11:34.000000000 +0900 @@ -1 +1,103 @@ #include <break_lines.h> +#include <klibloader.h> +#include "qcstring.h" +#include <qtextcodec.h> +#include <qcleanuphandler.h> + + +typedef int (*th_brk_def)(const char*, int[], int); +static th_brk_def th_brk; + +namespace khtml { + struct ThaiCache + { + ThaiCache() { + string = 0; + allocated = 0x400; + wbrpos = (int *) malloc(allocated*sizeof(int)); + numwbrpos = 0; + numisbreakable = 0x400; + isbreakable = (int *) malloc(numisbreakable*sizeof(int)); + } + ~ThaiCache() { + free(wbrpos); + free(isbreakable); + library->unload(); + } + const QChar *string; + int *wbrpos; + int *isbreakable; + int 
allocated; + int numwbrpos,numisbreakable; + KLibrary *library; + }; + static ThaiCache *cache = 0; + + void cleanup_thaibreaks() + { + delete cache; + } + + bool isBreakableThai( const QChar *string, const int pos, const int len) + { + static QTextCodec *thaiCodec = QTextCodec::codecForMib(2259); + + //printf("Entering isBreakableThai with pos = %d\n", pos); + + KLibrary *lib = 0; + + /* load libthai dynamically */ + if (( !th_brk ) && thaiCodec ) { + KLibLoader *loader = KLibLoader::self(); + lib = loader->library("libthai.so.0"); + if (lib && lib->hasSymbol("th_brk")) { + th_brk = (th_brk_def) lib->symbol("th_brk"); + } else { + // indication that loading failed and we shouldn't try to load again + printf("Error, can't load libthai...\n"); + thaiCodec = 0; + if (lib) + lib->unload(); + } + } + + if (!th_brk ) { + return true; + } + + if (!cache ) { + cache = new ThaiCache; + cache->library = lib; + } + + // build up string of thai chars + if ( string != cache->string ) { + //fprintf(stderr,"new string found (not in cache), calling libthai\n"); + QCString cstr = thaiCodec->fromUnicode( QConstString(string,len).string()); + //printf("About to call libthai::th_brk with str: %s",cstr.data()); + + cache->numwbrpos = th_brk(cstr.data(), cache->wbrpos, cache->allocated); + //fprintf(stderr,"libthai returns with value %d\n",cache->numwbrpos); + if (cache->numwbrpos > cache->allocated) { + cache->allocated = cache->numwbrpos; + cache->wbrpos = (int *)realloc(cache->wbrpos, cache->allocated*sizeof(int)); + cache->numwbrpos = th_brk(cstr.data(), cache->wbrpos, cache->allocated); + } + if ( len > cache->numisbreakable ) { + cache->numisbreakable=len; + cache->isbreakable = (int *)realloc(cache->isbreakable, cache->numisbreakable*sizeof(int)); + } + for (int i = 0 ; i < len ; ++i) { + cache->isbreakable[i] = 0; + } + if ( cache->numwbrpos > 0 ) { + for (int i = cache->numwbrpos-1; i >= 0; --i) { + cache->isbreakable[cache->wbrpos[i]] = 1; + } + } + cache->string = string; + 
} + //printf("Returning %d\n", cache->isbreakable[pos]); + return cache->isbreakable[pos]; + } +} diff -uNr kdelibs-3.2.3.org/khtml/rendering/break_lines.h kdelibs-3.2.3/khtml/rendering/break_lines.h --- kdelibs-3.2.3.org/khtml/rendering/break_lines.h 2004-03-01 00:27:59.000000000 +0900 +++ kdelibs-3.2.3/khtml/rendering/break_lines.h 2004-07-28 19:29:15.000000000 +0900 @@ -116,7 +116,10 @@ } } - inline bool isBreakable( const QChar *str, const int pos, int /*len*/ ) + bool isBreakableThai( const QChar *string, const int pos, const int len); + void cleanup_thaibreaks(); + + inline bool isBreakable( const QChar *str, const int pos, int len ) { const QChar *c = str+pos; unsigned short ch = c->unicode(); @@ -126,9 +129,8 @@ if ( row == 0x0e ) { // 0e00 - 0e7f == Thai if ( c->cell() < 0x80 ) { - // we don't a have a thai line breaking lib at the moment, allow - // breaks everywhere except directly before punctuation. - return true; + // consult libthai + return isBreakableThai(str, pos, len); } else return false; }
OK, here is a test page that shows whether the browser supports Thai word breaking: <a href="http://linux.thai.net/~ott/thaiwbrtest/">http://linux.thai.net/~ott/thaiwbrtest/</a>
After discussions on the kfm-devel list, the patch was refined (thanks to the whole kfm-devel crew) and committed to CVS. KHTML in KDE 3.4 should include Thai word-break support.
Hence, the problem is solved.