Bug 86199 - Thai word break support in KHTML
Summary: Thai word break support in KHTML
Status: RESOLVED WORKSFORME
Alias: None
Product: konqueror
Classification: Applications
Component: khtml (show other bugs)
Version: unspecified
Platform: Debian testing Linux
Importance: NOR wishlist
Target Milestone: ---
Assignee: Konqueror Developers
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2004-07-29 06:14 UTC by Pattara Kiatisevi
Modified: 2005-03-09 03:53 UTC (History)
0 users

See Also:
Latest Commit:
Version Fixed In:


Attachments

Note: You need to log in before you can comment on or make changes to this bug.
Description Pattara Kiatisevi 2004-07-29 06:14:49 UTC
Version:            (using KDE 3.2.3)
Installed from:    Debian testing/unstable Packages

Thai words are not separated by spaces the way English words are. Libthai (http://linux.thai.net/libthai) provides a function that breaks a string of Thai text at the correct word boundaries. Here is a patch by Lars Knoll and me that lets KHTML use libthai to break Thai words correctly (http://linux.thai.net/cgi-bin/viewcvs.cgi/software/kde/kdelibs-3.2.3/). More info here: http://linux.thai.net/Members/ott/qt/.

diff -uNr kdelibs-3.2.3.org/khtml/khtml_factory.cpp kdelibs-3.2.3/khtml/khtml_factory.cpp
--- kdelibs-3.2.3.org/khtml/khtml_factory.cpp	2004-02-29 21:32:10.000000000 +0900
+++ kdelibs-3.2.3/khtml/khtml_factory.cpp	2004-07-28 19:29:15.000000000 +0900
@@ -25,6 +25,7 @@
 #include "css/cssstyleselector.h"
 #include "html/html_imageimpl.h"
 #include "rendering/render_style.h"
+#include "rendering/break_lines.h"
 #include "misc/loader.h"
 
 #include <kinstance.h>
@@ -94,6 +95,7 @@
         khtml::CSSStyleSelector::clear();
         khtml::RenderStyle::cleanup();
         khtml::Cache::clear();
+        khtml::cleanup_thaibreaks();
     }
     else
         deref();
diff -uNr kdelibs-3.2.3.org/khtml/rendering/break_lines.cpp kdelibs-3.2.3/khtml/rendering/break_lines.cpp
--- kdelibs-3.2.3.org/khtml/rendering/break_lines.cpp	2002-09-06 03:12:06.000000000 +0900
+++ kdelibs-3.2.3/khtml/rendering/break_lines.cpp	2004-07-29 02:11:34.000000000 +0900
@@ -1 +1,103 @@
 #include <break_lines.h>
+#include <klibloader.h>
+#include "qcstring.h"
+#include <qtextcodec.h>
+#include <qcleanuphandler.h>
+
+
+typedef int (*th_brk_def)(const char*, int[], int);
+static th_brk_def th_brk;
+
+namespace khtml {
+    struct ThaiCache
+    {
+        ThaiCache() {
+            string = 0;
+            allocated = 0x400;
+            wbrpos = (int *) malloc(allocated*sizeof(int));
+            numwbrpos = 0;
+            numisbreakable = 0x400;
+            isbreakable = (int *) malloc(numisbreakable*sizeof(int));
+        }
+        ~ThaiCache() {
+            free(wbrpos);
+            free(isbreakable);
+            library->unload();
+        }
+        const QChar *string;
+        int *wbrpos;
+        int *isbreakable;
+        int allocated;
+        int numwbrpos,numisbreakable;
+        KLibrary *library;
+    };
+    static ThaiCache *cache = 0;
+
+    void cleanup_thaibreaks()
+    {
+        delete cache;
+    }
+
+    bool isBreakableThai( const QChar *string, const int pos, const int len)
+    {
+        static QTextCodec *thaiCodec = QTextCodec::codecForMib(2259);
+
+	//printf("Entering isBreakableThai with pos = %d\n", pos);
+	
+	KLibrary *lib = 0;
+
+        /* load libthai dynamically */
+	if (( !th_brk ) && thaiCodec  ) {
+            KLibLoader *loader = KLibLoader::self();
+            lib = loader->library("libthai.so.0");
+            if (lib && lib->hasSymbol("th_brk")) {
+                th_brk = (th_brk_def) lib->symbol("th_brk");
+            } else {
+                // indication that loading failed and we shouldn't try to load again
+		printf("Error, can't load libthai...\n");
+                thaiCodec = 0;
+                if (lib)
+                    lib->unload();
+            }
+        }
+
+        if (!th_brk ) {
+            return true;
+        }
+
+	if (!cache ) {
+            cache = new ThaiCache;
+            cache->library = lib;
+	}
+
+        // build up string of thai chars
+        if ( string != cache->string ) {
+            //fprintf(stderr,"new string found (not in cache), calling libthai\n");
+            QCString cstr = thaiCodec->fromUnicode( QConstString(string,len).string());
+            //printf("About to call libthai::th_brk with str: %s",cstr.data());
+
+            cache->numwbrpos = th_brk(cstr.data(), cache->wbrpos, cache->allocated);
+            //fprintf(stderr,"libthai returns with value %d\n",cache->numwbrpos);
+            if (cache->numwbrpos > cache->allocated) {
+                cache->allocated = cache->numwbrpos;
+                cache->wbrpos = (int *)realloc(cache->wbrpos, cache->allocated*sizeof(int));
+                cache->numwbrpos = th_brk(cstr.data(), cache->wbrpos, cache->allocated);
+            }
+	    if ( len > cache->numisbreakable ) {
+		cache->numisbreakable=len;
+                cache->isbreakable = (int *)realloc(cache->isbreakable, cache->numisbreakable*sizeof(int));
+	    }
+	    for (int i = 0 ; i < len ; ++i) {
+		cache->isbreakable[i] = 0;
+	    }
+            if ( cache->numwbrpos > 0 ) {
+            	for (int i = cache->numwbrpos-1; i >= 0; --i) {
+                	cache->isbreakable[cache->wbrpos[i]] = 1;
+		}
+	    }
+            cache->string = string;
+        }
+	//printf("Returning %d\n", cache->isbreakable[pos]);
+	return cache->isbreakable[pos];
+    }
+}
diff -uNr kdelibs-3.2.3.org/khtml/rendering/break_lines.h kdelibs-3.2.3/khtml/rendering/break_lines.h
--- kdelibs-3.2.3.org/khtml/rendering/break_lines.h	2004-03-01 00:27:59.000000000 +0900
+++ kdelibs-3.2.3/khtml/rendering/break_lines.h	2004-07-28 19:29:15.000000000 +0900
@@ -116,7 +116,10 @@
         }
     }
     
-    inline bool isBreakable( const QChar *str, const int pos, int /*len*/ )
+    bool isBreakableThai( const QChar *string, const int pos, const int len);
+    void cleanup_thaibreaks();
+
+    inline bool isBreakable( const QChar *str, const int pos, int len )
     {
 	const QChar *c = str+pos;
 	unsigned short ch = c->unicode();
@@ -126,9 +129,8 @@
 	    if ( row == 0x0e ) {
 		// 0e00 - 0e7f == Thai
 		if ( c->cell() < 0x80 ) {
-		    // we don't a have a thai line breaking lib at the moment, allow
-		    // breaks everywhere except directly before punctuation.
-		    return true;
+		    // consult libthai
+		    return isBreakableThai(str, pos, len);
 		} else
 		    return false;
 	    }
Comment 1 Pattara Kiatisevi 2004-08-03 09:18:44 UTC
OK, here is a test page where you can see whether the browser supports Thai word breaking:
<A HREF=http://linux.thai.net/~ott/thaiwbrtest/>http://linux.thai.net/~ott/thaiwbrtest/</A>
Comment 2 Pattara Kiatisevi 2005-03-09 03:52:53 UTC
After discussions on the kfm-devel list, the patch was refined (thanks to all the kfm-devel crew) and committed to CVS. KHTML in KDE 3.4 should include Thai word-break support.
Comment 3 Pattara Kiatisevi 2005-03-09 03:53:57 UTC
Hence, the problem is solved.