Version: (using KDE KDE 3.1) Installed from: Debian testing/unstable Packages OS: Linux The Web archiver in konq-plugins saves the HTML file using the encoding of Qt's current locale, effectively converting any Unicode characters that encoding cannot represent to question marks if the default encoding happens to be Latin-1 or another 8-bit charset. To reproduce, open a Chinese page and try to archive it.
Created attachment 1169 [details] This patch should fix the problem
.
While the patch helps the situation, it doesn't completely fix the problem. If the original HTML file contains &#1234;-style numeric character references, the parser translates them to 16-bit Unicode before storing them in the DOM tree. Therefore even the document's original character encoding scheme may not be able to represent all characters in the decoded document tree. The best solution would probably be for the re-encoder to detect such characters and pass them to an external, caller-supplied callback (in this case an HTML-specific function that converts a character to the &#1245; form).
CVS commit by waba: Save html as utf-8 (BR55929) CCMAIL: 55929-done@bugs.kde.org M +55 -3 archivedialog.cpp 1.9 --- kdeaddons/konq-plugins/webarchiver/archivedialog.cpp #1.8:1.9 @@ -45,4 +45,6 @@ #undef DEBUG_WAR +#define CONTENT_TYPE "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">" + ArchiveDialog::ArchiveDialog(QWidget *parent, const QString &filename, KHTMLPart *part) : @@ -127,5 +129,5 @@ void ArchiveDialog::setSavingState() KTempFile tmpFile; QTextStream* textStream = tmpFile.textStream(); - textStream->setEncoding(QTextStream::Locale); + textStream->setEncoding(QTextStream::UnicodeUTF8); m_widget->progressBar->setProgress(m_widget->progressBar->totalSteps()); @@ -176,4 +178,41 @@ void ArchiveDialog::saveToArchive(QTextS } +static bool hasAttribute(const DOM::Node &pNode, const QString &attrName, const QString &attrValue) +{ + const DOM::Element element = (const DOM::Element) pNode; + DOM::Attr attr; + DOM::NamedNodeMap attrs = element.attributes(); + unsigned long lmap = attrs.length(); + for( unsigned int j=0; j<lmap; j++ ) { + attr = static_cast<DOM::Attr>(attrs.item(j)); + if ((attr.name().string().upper() == attrName) && + (attr.value().string().upper() == attrValue)) + return true; + } + return false; +} + +static bool hasChildNode(const DOM::Node &pNode, const QString &nodeName) +{ + DOM::Node child; + try + { + // We might throw a DOM exception + child = pNode.firstChild(); + } + catch (...) + { + // No children, stop recursion here + child = DOM::Node(); + } + + while(!child.isNull()) { + if (child.nodeName().string().upper() == nodeName) + return true; + child = child.nextSibling(); + } + return false; +} + /* Transform DOM-Tree to HTML */ @@ -199,4 +238,7 @@ void ArchiveDialog::saveArchiveRecursive * Saving SCRIPT but they can cause trouble! */ + } else if ((nodeName == "META") && hasAttribute(pNode, "HTTP-EQUIV", "CONTENT-TYPE")) { + /* Skip content-type meta tag, we provide our own. 
+ */ } else { if (!m_bPreserveWS) { @@ -255,4 +297,14 @@ void ArchiveDialog::saveArchiveRecursive text += attributes.simplifyWhiteSpace(); text += ">"; + + if (nodeName == "HTML") { + /* Search for a HEAD tag, if not found, generate one. + */ + if (!hasChildNode(pNode, "HEAD")) + text += "\n" + strIndent + " <HEAD>" CONTENT_TYPE "</HEAD>"; + } + else if (nodeName == "HEAD") { + text += "\n" + strIndent + " " + CONTENT_TYPE; + } } } else {