From a707b6abbb2601ca581607c7d82b27d2ad74a797 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sun, 14 Nov 2010 22:49:06 +0000 Subject: [PATCH] BibleGateway: Fix bitrotted scraper, use mobile.* BibleGateway has changed a fair bit. On the one hand it now claims to be XHTML so it is more valid than it was. On the other hand the interface to different Bible versions has changed and there are now about 3 different ways of referring to a Bible version. Some still use numeric IDs, but most use the full name, munged into the URL. All use a short name for searching. We use the mobile version at mobile.biblegateway.com as it's more likely to be valid (mobile browsers are more pedantic), and likely to be more compact (to save bandwidth). The list of versions is now extracted directly from the versions list at http://mobile.biblegateway.com/versions/. The table is read and the URLs to the version info pages are extracted. The version info pages then provide more info, as well as a table linking to every chapter in the Bible, sorted into rows for each book, which can be read as they were before. When reading the actual Bible text, use the contents of sup tags instead of the value attribute, and clip the footnotes section. 
--- .../biblegateway/KwBibleManagerBibleGateway.cpp | 123 +++++++++++++++++---- .../biblegateway/KwBibleManagerBibleGateway.h | 4 +- .../biblegateway/KwBibleModuleBibleGateway.cpp | 18 ++- .../bible/biblegateway/KwBibleModuleBibleGateway.h | 2 +- 4 files changed, 118 insertions(+), 29 deletions(-) diff --git a/kworship/bible/biblegateway/KwBibleManagerBibleGateway.cpp b/kworship/bible/biblegateway/KwBibleManagerBibleGateway.cpp index 82276f9..b7089f2 100644 --- a/kworship/bible/biblegateway/KwBibleManagerBibleGateway.cpp +++ b/kworship/bible/biblegateway/KwBibleManagerBibleGateway.cpp @@ -48,7 +48,7 @@ KwBibleManagerBibleGateway::KwBibleManagerBibleGateway(QObject* parent, const QS : KwBibleManager(parent, params) , m_cached(false) , m_languages() -, m_versionsById() +, m_versions() , m_versionsByName() , m_versionsByLanguage() { @@ -58,7 +58,7 @@ KwBibleManagerBibleGateway::KwBibleManagerBibleGateway(QObject* parent, const QS /// Destructor. KwBibleManagerBibleGateway::~KwBibleManagerBibleGateway() { - foreach (Version* version, m_versionsById) + foreach (Version* version, m_versions) { delete version->module; delete version; @@ -99,7 +99,7 @@ KwBibleModule* KwBibleManagerBibleGateway::module(const QString& name) if (!moduleIds.isEmpty()) { int id = moduleIds.first(); - version = m_versionsById[id]; + version = m_versions[id]; } } } @@ -107,7 +107,7 @@ KwBibleModule* KwBibleManagerBibleGateway::module(const QString& name) { if (0 == version->module) { - version->module = new KwBibleModuleBibleGateway(version->id); + version->module = new KwBibleModuleBibleGateway(version->url); } return version->module; } @@ -133,7 +133,7 @@ QStringList KwBibleManagerBibleGateway::moduleNamesInLanguage(const QString& lan const QList& moduleIds = m_versionsByLanguage[languageId]; foreach (int id, moduleIds) { - names << m_versionsById[id]->name; + names << m_versions[id]->name; } } return names; @@ -155,14 +155,15 @@ void KwBibleManagerBibleGateway::ensureCached() if (!m_cached) { 
QString tmpFile; - if (KIO::NetAccess::download(KUrl("http://www.biblegateway.com/"), tmpFile, 0)) + // First get list from drop down menu on main page +#if 0 + if (KIO::NetAccess::download(KUrl("http://mobile.biblegateway.com/"), tmpFile, 0)) { QFile file(tmpFile); if (file.open(QFile::ReadOnly | QFile::Text)) { QByteArray rawPage = file.readAll(); file.close(); - /// @todo Convert to use KDE DOM as its a bit more flexible of invalid XML QString page = QString::fromUtf8(rawPage); QRegExp rx(".*"); if (-1 != rx.indexIn(page)) @@ -190,24 +191,102 @@ void KwBibleManagerBibleGateway::ensureCached() } else if (langId >= 0) { - bool ok; - int versionId = el.attribute("value").toInt(&ok); - if (ok) + QString versionId = el.attribute("value"); + // Ensure there is none with this shortname already + QHash::const_iterator it = m_versionsByShortName.constFind(versionId); + if (it == m_versionsByShortName.constEnd()) { - // Ensure there is none with this id already - QHash::const_iterator it = m_versionsById.constFind(versionId); - if (it == m_versionsById.constEnd()) + Version* version = new Version; + version->name = text; + version->shortname = versionId; + version->id = m_versionsById.size(); + version->module = 0; + m_versionsById[version->id] = version; + m_versionsByName[text] = version; + m_versionsByShortName[versionId] = version; + *langMods << version->id; + } + } + } + } + } + } + } + } +#endif + if (KIO::NetAccess::download(KUrl("http://mobile.biblegateway.com/versions"), tmpFile, 0)) + { + QFile file(tmpFile); + if (file.open(QFile::ReadOnly | QFile::Text)) + { + QByteArray rawPage = file.readAll(); + file.close(); + QString page = QString::fromUtf8(rawPage); + QRegExp rx("]*class=\"infotable\">.*"); + if (-1 != rx.indexIn(page)) + { + QDomDocument dom; + if (dom.setContent(rx.cap(), false)) + { + // Go through all the options (languages and versions) + int langId = -1; + int rowSpan = 0; + QList* langMods = 0; + QDomNodeList rows = dom.elementsByTagName("tr"); 
+ for (int i = 0; i < rows.count(); ++i) + { + QDomElement row = rows.at(i).toElement(); + if (!row.isNull()) + { + QDomNodeList cols = row.elementsByTagName("td"); + for (int j = 0; j < cols.count(); ++j) + { + QDomElement cell = cols.at(j).toElement(); + if (!cell.isNull()) + { + // if first columnn and rowspan, its a language + QString rowSpanStr = cell.attribute("rowspan"); + bool ok; + int newRowSpan = rowSpanStr.toInt(&ok); + if (!j && ok && newRowSpan) + { + QString text = cell.text(); + m_languages << text; + ++langId; + langMods = &m_versionsByLanguage[langId]; + rowSpan = newRowSpan; + } + else if (langId >= 0) { - Version* version = new Version; - version->name = text; - version->id = versionId; - version->module = 0; - m_versionsById[versionId] = version; - m_versionsByName[text] = version; - *langMods << versionId; + QDomNodeList as = cell.elementsByTagName("a"); + if (as.count()) + { + QDomElement ael = as.at(0).toElement(); + if (!ael.isNull()) + { + QString href = ael.attribute("href"); + QString text = ael.text(); + QHash::const_iterator it = m_versionsByName.constFind(text); + if (it == m_versionsByName.constEnd()) + { + Version* version = new Version; + version->name = text; + version->id = m_versions.size(); + version->module = 0; + version->url = href; + m_versions.push_back(version); + m_versionsByName[text] = version; + *langMods << version->id; + } + } + } + // jump out of column loop + break; } } } + if (rowSpan) + --rowSpan; } } m_cached = true; @@ -229,14 +308,14 @@ void KwBibleManagerBibleGateway::clear() { if (m_cached) { - foreach (Version* version, m_versionsById) + foreach (Version* version, m_versions) { delete version->module; delete version; } m_cached = false; m_languages.clear(); - m_versionsById.clear(); + m_versions.clear(); m_versionsByName.clear(); m_versionsByLanguage.clear(); } diff --git a/kworship/bible/biblegateway/KwBibleManagerBibleGateway.h b/kworship/bible/biblegateway/KwBibleManagerBibleGateway.h index 
d036181..52ca636 100644 --- a/kworship/bible/biblegateway/KwBibleManagerBibleGateway.h +++ b/kworship/bible/biblegateway/KwBibleManagerBibleGateway.h @@ -31,6 +31,7 @@ #include #include #include +#include #include class KwBibleModuleBibleGateway; @@ -100,13 +101,14 @@ class KwBibleManagerBibleGateway : public KwBibleManager struct Version { QString name; + QString url; int id; int lang; KwBibleModuleBibleGateway* module; }; /// Versions by id. - QHash m_versionsById; + QVector m_versions; /// Versions by name. QHash m_versionsByName; diff --git a/kworship/bible/biblegateway/KwBibleModuleBibleGateway.cpp b/kworship/bible/biblegateway/KwBibleModuleBibleGateway.cpp index e5b7e73..cf32a70 100644 --- a/kworship/bible/biblegateway/KwBibleModuleBibleGateway.cpp +++ b/kworship/bible/biblegateway/KwBibleModuleBibleGateway.cpp @@ -41,10 +41,10 @@ */ /// Default constructor. -KwBibleModuleBibleGateway::KwBibleModuleBibleGateway(int id) +KwBibleModuleBibleGateway::KwBibleModuleBibleGateway(QString vurl) : KwBibleModule() { - KUrl url(QString("http://www.biblegateway.com/versions/index.php?action=getVersionInfo&vid=%1").arg(id)); + KUrl url("http://mobile.biblegateway.com/" + vurl); QString tmpFile; if (KIO::NetAccess::download(url, tmpFile, 0)) @@ -105,7 +105,7 @@ KwBibleModuleBibleGateway::KwBibleModuleBibleGateway(int id) // Get the link book->chapters.push_back(Chapter()); Chapter* chapter = &book->chapters[book->chapters.size()-1]; - chapter->url = "http://www.biblegateway.com" + link.getAttribute("href").string(); + chapter->url = "http://mobile.biblegateway.com/" + link.getAttribute("href").string(); chapter->fetched = false; } } @@ -236,7 +236,7 @@ KwBibleModuleBibleGateway::Chapter* KwBibleModuleBibleGateway::fetchChapter(int { // Get the verse number and validate bool numeric; - QString verseNumber = sup.getAttribute("value").string(); + QString verseNumber = sup.innerText().string(); Verse verseInfo; int check = verseNumber.toInt(&numeric); if (!numeric) @@ -290,12 
+290,20 @@ KwBibleModuleBibleGateway::Chapter* KwBibleModuleBibleGateway::fetchChapter(int { break; } - /// @todo Handle footnotes properly + // ignore footnote references else if (siblingElement.getAttribute("class") == "footnote") { append = false; } } + // and the actual footnotes section + else if (siblingElement.tagName() == "div") + { + if (siblingElement.getAttribute("class") == "footnotes") + { + break; + } + } // Also stop at headings DOM::HTMLHeadingElement heading = siblingElement; diff --git a/kworship/bible/biblegateway/KwBibleModuleBibleGateway.h b/kworship/bible/biblegateway/KwBibleModuleBibleGateway.h index 80a8609..adb92d8 100644 --- a/kworship/bible/biblegateway/KwBibleModuleBibleGateway.h +++ b/kworship/bible/biblegateway/KwBibleModuleBibleGateway.h @@ -42,7 +42,7 @@ class KwBibleModuleBibleGateway : public KwBibleModule */ /// Default constructor. - KwBibleModuleBibleGateway(int id); + KwBibleModuleBibleGateway(QString url); /// Destructor. virtual ~KwBibleModuleBibleGateway(); -- 2.11.4.GIT