Bug 126128 - cannot find link tag in html
Summary: cannot find link tag in html
Status: RESOLVED FIXED
Alias: None
Product: akregator
Classification: Applications
Component: general (show other bugs)
Version: unspecified
Platform: Ubuntu Linux
Importance: NOR normal
Target Milestone: ---
Assignee: kdepim bugs
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2006-04-23 21:26 UTC by Mik Mifflin
Modified: 2006-08-20 19:13 UTC (History)
0 users

See Also:
Latest Commit:
Version Fixed In:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Mik Mifflin 2006-04-23 21:26:05 UTC
Version:            (using KDE KDE 3.5.2)
Installed from:    Ubuntu Packages

When I drag the URL of an HTML page into aKregator, it doesn't always find the RSS feed. I found this bug while trying to add both http://softiesonrails.com/ and http://podcast.rubyonrails.org/. The link tags look fine, and I don't see any reason why aKregator would fail to find the feed.
Comment 1 Eckhart Wörner 2006-04-23 22:31:27 UTC
Confirmed.

The reason is that the regexp only matches when the attributes "rel" and "href" appear in that order (src/librss/loader.cpp).
Comment 2 Frank Osterfeld 2006-08-20 19:13:38 UTC
SVN commit 575039 by osterfeld:

use the more robust FeedDetector class from the konq plugin instead of the old and unmaintained "feed discovery" code in loader.cpp
BUG: 126128


 M  +1 -1      Makefile.am  
 A             feeddetector.cpp   [License: GPL (v2+) (+Qt exception)]
 A             feeddetector.h   [License: GPL (v2+) (+Qt exception)]
 M  +25 -63    loader.cpp  


--- branches/KDE/3.5/kdepim/akregator/src/librss/Makefile.am #575038:575039
@@ -9,7 +9,7 @@
   loader.h librss.h enclosure.h
 
 librsslocal_la_SOURCES = article.cpp document.cpp image.cpp textinput.cpp \
-  tools_p.cpp loader.cpp enclosure.cpp category.cpp
+  tools_p.cpp loader.cpp enclosure.cpp category.cpp feeddetector.cpp
 
 librsslocal_la_METASOURCES = AUTO
 
--- branches/KDE/3.5/kdepim/akregator/src/librss/loader.cpp #575038:575039
@@ -10,6 +10,7 @@
  */
 #include "loader.h"
 #include "document.h"
+#include "feeddetector.h"
 
 #include <kio/job.h>
 #include <kprocess.h>
@@ -377,74 +378,35 @@
 void Loader::discoverFeeds(const QByteArray &data)
 {
     QString str = QString(data).simplifyWhiteSpace();
-    QString s2;
-    //QTextStream ts( &str, IO_WriteOnly );
-    //ts << data.data();
-
-    // "<[\\s]link[^>]*rel[\\s]=[\\s]\\\"[\\s]alternate[\\s]\\\"[^>]*>"
-    // "type[\\s]=[\\s]\\\"application/rss+xml\\\""
-    // "href[\\s]=[\\s]\\\"application/rss+xml\\\""
-    QRegExp rx( "(?:REL)[^=]*=[^sAa]*(?:service.feed|ALTERNATE)[\\s]*[^s][^s](?:[^>]*)(?:HREF)[^=]*=[^A-Z0-9-_~,./$]*([^'\">\\s]*)", false);
-    if (rx.search(str)!=-1)
-        s2=rx.cap(1);
-    else{
-    // does not support Atom/RSS autodiscovery.. try finding feeds by brute force....
-        int pos=0;
-        QStringList feeds;
-        QString host=d->url.host();
-        rx.setPattern("(?:<A )[^H]*(?:HREF)[^=]*=[^A-Z0-9-_~,./]*([^'\">\\s]*)");
-        while ( pos >= 0 ) {
-            pos = rx.search( str, pos );
-            s2=rx.cap(1);
-            if (s2.endsWith(".rdf") || s2.endsWith(".rss") || s2.endsWith(".xml"))
-                    feeds.append(s2);
-            if ( pos >= 0 ) {
-                pos += rx.matchedLength();
-            }
-        }
-
-        s2=feeds.first();
-        KURL testURL;
-        // loop through, prefer feeds on same host
-        QStringList::Iterator end( feeds.end() );
-        for ( QStringList::Iterator it = feeds.begin(); it != end; ++it ) {
-            testURL=*it;
-            if (testURL.host()==host)
-            {
-                s2=*it;
-                break;
-            }
-        }
-    }
-
-    if (s2.isNull()) {
-        //kdDebug() << "No feed found for a site" << endl;
-        return;
-    }
-
-    if (KURL::isRelativeURL(s2))
+    
+    QStringList feeds; 
+    
+    FeedDetectorEntryList list = FeedDetector::extractFromLinkTags(str); 
+    
+    for (FeedDetectorEntryList::ConstIterator it = list.begin(); it != list.end(); ++it)
     {
-        if (s2.startsWith("//"))
+        feeds += (*it).url();
+    }  
+    
+    if (list.isEmpty())
+        feeds = FeedDetector::extractBruteForce(str);
+        
+    QString feed = feeds.first();
+    QString host = d->url.host();
+    KURL testURL;
+    // loop through, prefer feeds on same host
+    QStringList::Iterator end( feeds.end() );
+    for ( QStringList::Iterator it = feeds.begin(); it != end; ++it) 
+    {
+        testURL=*it;
+        if (testURL.host() == host)
         {
-            s2=s2.prepend(d->url.protocol()+":");
-            d->discoveredFeedURL=s2;
+            feed = *it;
+            break;
         }
-        else if (s2.startsWith("/"))
-        {
-            d->discoveredFeedURL=d->url;
-            d->discoveredFeedURL.setPath(s2);
-        }
-        else
-        {
-            d->discoveredFeedURL=d->url;
-            d->discoveredFeedURL.addPath(s2);
-        }
-        d->discoveredFeedURL.cleanPath();
     }
-    else
-        d->discoveredFeedURL=s2;
 
-    d->discoveredFeedURL.cleanPath();
+    d->discoveredFeedURL = feed.isNull() ? QString() : FeedDetector::fixRelativeURL(feed, d->url); 
 }
 
 #include "loader.moc"