Make a branch to make krunner Good Enough For Aaron™.
[kdebase/uwolfer.git] / runtime / nepomuk / strigibackend / sopranoindexreader.cpp
blob550168b6d29c6f2b62b502af679742ba85ded417
1 /*
2 $Id: sourceheader 511311 2006-02-19 14:51:05Z trueg $
4 This file is part of the Strigi project.
5 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of
10 the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this library; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
23 #include "sopranoindexreader.h"
24 #include "tstring.h"
25 #include <strigi/query.h>
26 #include <strigi/queryparser.h>
27 #include <strigi/fieldtypes.h>
28 #include "util.h"
30 #include <Soprano/Soprano>
31 #include <Soprano/Index/IndexFilterModel>
32 #include <Soprano/Index/CLuceneIndex>
33 #include <Soprano/Vocabulary/XMLSchema>
35 #include <map>
36 #include <utility>
37 #include <sstream>
39 #include <CLucene.h>
41 #include <QtCore/QThread>
42 #include <QtCore/QDateTime>
43 #include <QtCore/QDebug>
44 #include <QtCore/QString>
45 #include <QtCore/QLatin1String>
48 using namespace Soprano;
51 static lucene::index::Term* createWildCardTerm( const TString& name,
52 const string& value );
53 static lucene::index::Term* createTerm( const TString& name,
54 const string& value );
55 static lucene::index::Term* createKeywordTerm( const TString& name,
56 const string& value );
57 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
58 static lucene::search::Query* createQuery( const Strigi::Query& query );
59 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
60 static lucene::search::Query* createSingleFieldQuery( const string& field,
61 const Strigi::Query& query );
62 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
65 static lucene::index::Term* createWildCardTerm( const TString& name,
66 const string& value )
68 TString v = TString::fromUtf8( value.c_str() );
69 return _CLNEW lucene::index::Term( name.data(), v.data() );
72 static lucene::index::Term* createTerm( const TString& name,
73 const string& value )
75 qDebug() << "createTerm" << name << value.c_str();
77 TString v = TString::fromUtf8( value.c_str() );
79 lucene::util::StringReader sr( v.data() );
80 lucene::analysis::standard::StandardAnalyzer a;
81 lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
82 lucene::analysis::Token* to = ts->next();
83 const wchar_t *tv;
84 if (to) {
85 tv = to->termText();
86 } else {
87 tv = v.data();
89 lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
90 if (to) {
91 _CLDELETE(to);
93 _CLDELETE(ts);
94 return t;
97 static lucene::index::Term* createKeywordTerm( const TString& name,
98 const string& value )
100 TString v = TString::fromUtf8( value.c_str() );
101 lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
102 return t;
105 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
107 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
108 bool isAnd = query.type() == Strigi::Query::And;
109 const vector<Strigi::Query>& sub = query.subQueries();
110 for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
111 lucene::search::Query* q = createQuery(*i);
112 bq->add(q, true, isAnd, i->negate());
114 return bq;
117 static lucene::search::Query* createQuery( const Strigi::Query& query )
119 return query.subQueries().size()
120 ? createBooleanQuery(query)
121 : createSimpleQuery(query);
124 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
126 switch (query.fields().size()) {
127 case 0: return createSingleFieldQuery("text", query);
128 case 1: return createSingleFieldQuery(query.fields()[0], query);
129 default: return createMultiFieldQuery(query);
133 static lucene::search::Query* createSingleFieldQuery( const string& field,
134 const Strigi::Query& query ) {
135 qDebug() << "Creating single field query: " << field.c_str();
136 TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
137 lucene::search::Query* q;
138 lucene::index::Term* t;
139 const string& val = query.term().string();
140 switch (query.type()) {
141 case Strigi::Query::LessThan:
142 t = createTerm(fieldname, val.c_str());
143 q = _CLNEW lucene::search::RangeQuery(0, t, false);
144 break;
145 case Strigi::Query::LessThanEquals:
146 t = createTerm(fieldname, query.term().string());
147 q = _CLNEW lucene::search::RangeQuery(0, t, true);
148 break;
149 case Strigi::Query::GreaterThan:
150 t = createTerm(fieldname, query.term().string());
151 q = _CLNEW lucene::search::RangeQuery(t, 0, false);
152 break;
153 case Strigi::Query::GreaterThanEquals:
154 t = createTerm(fieldname, query.term().string());
155 q = _CLNEW lucene::search::RangeQuery(t, 0, true);
156 break;
157 case Strigi::Query::Keyword:
158 t = createKeywordTerm(fieldname, query.term().string());
159 q = _CLNEW lucene::search::TermQuery(t);
160 break;
161 default:
162 if (strpbrk(val.c_str(), "*?")) {
163 t = createWildCardTerm(fieldname, val);
164 q = _CLNEW lucene::search::WildcardQuery(t);
165 } else {
166 t = createTerm(fieldname, val);
167 q = _CLNEW lucene::search::TermQuery(t);
170 _CLDECDELETE(t);
171 return q;
174 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
176 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
177 for (vector<string>::const_iterator i = query.fields().begin();
178 i != query.fields().end(); ++i) {
179 lucene::search::Query* q = createSingleFieldQuery(*i, query);
180 bq->add(q, true, false, false);
182 return bq;
186 class Strigi::Soprano::IndexReader::Private
188 public:
189 bool createDocument( const Node& res, IndexedDocument& doc ) {
190 StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
191 if ( it.lastError() ) {
192 return false;
195 // use the resource URI as fallback file URI
196 doc.uri = res.uri().toLocalFile().toUtf8().data();
198 while ( it.next() ) {
199 Statement s = *it;
200 if ( s.object().isLiteral() ) {
201 std::string fieldName = Util::fieldName( s.predicate().uri() );
202 std::string value = s.object().toString().toUtf8().data();
204 if (fieldName == "text") {
205 doc.fragment = value;
207 else if (fieldName == FieldRegister::pathFieldName) {
208 qDebug() << "Setting IndexedDocument uri=" << value.c_str();
209 doc.uri = value;
211 else if (fieldName == FieldRegister::mimetypeFieldName) {
212 doc.mimetype = value;
214 else if (fieldName == FieldRegister::mtimeFieldName) {
215 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
216 if ( s.object().literal().isDateTime() ) {
217 doc.mtime = s.object().literal().toDateTime().toTime_t();
219 else {
220 doc.mtime = s.object().literal().toUnsignedInt();
223 else if (fieldName == FieldRegister::sizeFieldName) {
224 doc.size = s.object().literal().toInt64();
226 else {
227 doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
230 else {
231 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
232 // not literal statements
236 return true;
239 // ::Soprano::Index::IndexFilterModel* repository;
240 ::Soprano::Model* repository;
244 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
245 : Strigi::IndexReader()
247 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
248 d = new Private;
249 d->repository = model;
253 Strigi::Soprano::IndexReader::~IndexReader()
255 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
256 delete d;
260 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
262 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
264 lucene::search::Query* q = createQuery( query );
265 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
266 ::Soprano::Query::QUERY_LANGUAGE_USER,
267 QLatin1String( "lucene" ) );
268 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
269 int s = 0;
270 while ( hits.next() ) {
271 qDebug() << "Query hit:" << hits.binding( 0 );
272 ++s;
274 _CLDELETE(q);
275 return s;
279 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
280 const std::vector<std::string>& fields,
281 const std::vector<Strigi::Variant::Type>& types,
282 std::vector<std::vector<Strigi::Variant> >& result,
283 int off, int max )
285 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
286 lucene::search::Query* bq = createQuery( query );
287 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
288 ::Soprano::Query::QUERY_LANGUAGE_USER,
289 QLatin1String( "lucene" ) );
290 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
292 int i = -1;
293 while ( hits.next() ) {
294 ++i;
295 if ( i < off ) {
296 continue;
298 if ( i > max ) {
299 break;
302 // ::Soprano::Index::QueryHit hit = *hits;
303 std::vector<Strigi::Variant> resultRow;
304 std::vector<std::string>::const_iterator fieldIt = fields.begin();
305 std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
306 while ( fieldIt != fields.end() ) {
307 if ( typesIt == types.end() ) {
308 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
309 return;
312 StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
313 Util::fieldUri( *fieldIt ),
314 Node() ) );
315 // FIXME: what if we have a field with a cardinality > 1?
316 if ( it.next() ) {
317 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
319 else {
320 resultRow.push_back( Strigi::Variant() );
323 ++fieldIt;
324 ++typesIt;
327 result.push_back( resultRow );
329 _CLDELETE(bq);
333 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
335 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
336 vector<IndexedDocument> results;
337 lucene::search::Query* bq = createQuery( query );
338 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
339 ::Soprano::Query::QUERY_LANGUAGE_USER,
340 "lucene" );
341 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
343 int i = -1;
344 while ( hits.next() ) {
345 ++i;
346 if ( i < off ) {
347 continue;
349 if ( i > max ) {
350 break;
353 IndexedDocument result;
354 // ::Soprano::Index::QueryHit hit = *hits;
355 result.score = hits.binding( 1 ).literal().toDouble();
356 if ( d->createDocument( hits.binding( 0 ), result ) ) {
357 results.push_back( result );
359 else {
360 qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
363 _CLDELETE(bq);
364 return results;
368 // an empty parent url is perfectly valid as strigi stores a parent url for everything
369 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
370 std::map<std::string, time_t>& children )
372 qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
373 QString query = QString( "select distinct ?path ?mtime where { ?r <%1> \"%2\"^^<%3> . ?r <%4> ?mtime . ?r <%5> ?path . }")
374 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString() )
375 .arg( QString::fromUtf8( parent.c_str() ) )
376 .arg( Vocabulary::XMLSchema::string().toString() )
377 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
378 .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() );
380 qDebug() << "running getChildren query:" << query;
382 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
384 while ( result.next() ) {
385 Node pathNode = result.binding( "path" );
386 Node mTimeNode = result.binding( "mtime" );
387 qDebug() << "file in index: " << pathNode.toString() << "mtime:" << mTimeNode.literal().toDateTime() << "(" << mTimeNode.literal().toDateTime().toTime_t() << ")";
389 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
390 if ( mTimeNode.literal().isDateTime() ) {
391 children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toDateTime().toTime_t();
393 else {
394 children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toUnsignedInt();
400 int32_t Strigi::Soprano::IndexReader::countDocuments()
402 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
403 // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
404 return 0;
408 int32_t Strigi::Soprano::IndexReader::countWords()
410 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
411 // FIXME: what to do here? use the index? Count the predicates?
412 return -1;
416 int64_t Strigi::Soprano::IndexReader::indexSize()
418 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
419 return d->repository->statementCount();
423 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
425 qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
426 QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
427 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
428 .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() )
429 .arg( QString::fromUtf8( uri.c_str() ) )
430 .arg( Vocabulary::XMLSchema::string().toString() );
432 qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
434 QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
436 time_t mtime = 0;
437 if ( it.next() ) {
438 ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
440 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
441 if ( val.isDateTime() ) {
442 mtime = val.toDateTime().toTime_t();
444 else {
445 mtime = val.toUnsignedInt();
448 return mtime;
452 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
454 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
455 // This is a weird method
456 // Our list of field names (the predicates) is probably awefully long.
458 std::vector<std::string> fields;
459 QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
460 while ( it.next() ) {
461 fields.push_back( Util::fieldName( it.binding("p").uri() ) );
463 return fields;
467 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
468 const std::string& fieldname,
469 const std::string& labeltype )
471 Q_UNUSED(query);
472 Q_UNUSED(fieldname);
473 Q_UNUSED(labeltype);
475 // FIXME: what is meant by fieldname and labeltype?
476 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
477 // IMPLEMENTME? Seems not like a very important method though.
478 return std::vector<std::pair<std::string,uint32_t> >();
482 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
483 const std::vector<std::string>& fieldnames)
485 Q_UNUSED(keywordprefix);
486 Q_UNUSED(fieldnames);
488 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
489 // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
490 return 2;
494 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
495 const std::vector<std::string>& fieldnames,
496 uint32_t max, uint32_t offset )
498 Q_UNUSED(keywordmatch);
499 Q_UNUSED(fieldnames);
500 Q_UNUSED(max);
501 Q_UNUSED(offset);
503 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
504 // IMPLEMENTME? Seems like a rarely used method...
505 return std::vector<std::string>();