2 $Id: sourceheader 511311 2006-02-19 14:51:05Z trueg $
4 This file is part of the Strigi project.
5 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of
10 the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this library; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
23 #include "sopranoindexreader.h"
25 #include <strigi/query.h>
26 #include <strigi/queryparser.h>
27 #include <strigi/fieldtypes.h>
30 #include <Soprano/Soprano>
31 #include <Soprano/Index/IndexFilterModel>
32 #include <Soprano/Index/CLuceneIndex>
33 #include <Soprano/Vocabulary/XMLSchema>
41 #include <QtCore/QThread>
42 #include <QtCore/QDateTime>
43 #include <QtCore/QDebug>
44 #include <QtCore/QString>
45 #include <QtCore/QLatin1String>
48 using namespace Soprano
;
// Forward declarations of the file-local (static) helpers that translate a
// Strigi::Query into CLucene Term / Query objects. All of them return raw
// pointers; the definitions further down allocate the objects with _CLNEW.
// NOTE(review): who releases the returned objects is not visible from these
// declarations - confirm at the call sites.
51 static lucene::index::Term
* createWildCardTerm( const TString
& name
,
52 const string
& value
);
53 static lucene::index::Term
* createTerm( const TString
& name
,
54 const string
& value
);
55 static lucene::index::Term
* createKeywordTerm( const TString
& name
,
56 const string
& value
);
57 static lucene::search::BooleanQuery
* createBooleanQuery( const Strigi::Query
& query
);
58 static lucene::search::Query
* createQuery( const Strigi::Query
& query
);
59 static lucene::search::Query
* createSimpleQuery( const Strigi::Query
& query
);
60 static lucene::search::Query
* createSingleFieldQuery( const string
& field
,
61 const Strigi::Query
& query
);
62 static lucene::search::Query
* createMultiFieldQuery( const Strigi::Query
& query
);
// Builds a CLucene Term for field `name` whose text is `value` converted from
// UTF-8 to a TString. No analyzer is applied to the text here.
// Returns a Term allocated with _CLNEW.
// NOTE(review): the embedded numbering jumps from 65 to 68, so the line that
// declares the `value` parameter is not visible in this listing.
65 static lucene::index::Term
* createWildCardTerm( const TString
& name
,
68 TString v
= TString::fromUtf8( value
.c_str() );
69 return _CLNEW
lucene::index::Term( name
.data(), v
.data() );
// Builds a CLucene Term for field `name` from the UTF-8 text in `value`.
// The text is converted to a TString, wrapped in a StringReader and passed
// through a StandardAnalyzer token stream; the first token is fetched into
// `to`. The Term itself is allocated with _CLNEW.
// NOTE(review): `tv`, the term text handed to the Term constructor, is
// assigned on lines that are not visible here (numbering jumps from 82 to
// 89) - presumably it is the text of the first token; confirm against the
// full file. Cleanup of `to` / `ts` and the return are likewise not visible.
72 static lucene::index::Term
* createTerm( const TString
& name
,
75 qDebug() << "createTerm" << name
<< value
.c_str();
77 TString v
= TString::fromUtf8( value
.c_str() );
// Tokenize the value with the standard analyzer and fetch the first token.
79 lucene::util::StringReader
sr( v
.data() );
80 lucene::analysis::standard::StandardAnalyzer a
;
81 lucene::analysis::TokenStream
* ts
= a
.tokenStream(name
.data(), &sr
);
82 lucene::analysis::Token
* to
= ts
->next();
89 lucene::index::Term
* t
= _CLNEW
lucene::index::Term(name
.data(), tv
);
// Builds a CLucene Term for field `name` directly from the UTF-8 text in
// `value`; no analyzer step appears in the visible lines. The Term is
// allocated with _CLNEW.
// NOTE(review): the `value` parameter line and the return statement are not
// visible in this listing (gaps in the embedded numbering).
97 static lucene::index::Term
* createKeywordTerm( const TString
& name
,
100 TString v
= TString::fromUtf8( value
.c_str() );
101 lucene::index::Term
* t
= _CLNEW
lucene::index::Term( name
.data(), v
.data() );
// Combines the sub-queries of `query` into a single BooleanQuery allocated
// with _CLNEW. Every sub-query is converted with createQuery() and added
// with the arguments (q, true, isAnd, i->negate()), where isAnd is true
// exactly when query.type() is Strigi::Query::And.
// NOTE(review): the meaning of the three boolean arguments of
// BooleanQuery::add is presumably (deleteQuery, required, prohibited) -
// confirm against the CLucene headers. The return statement is not visible.
105 static lucene::search::BooleanQuery
* createBooleanQuery( const Strigi::Query
& query
)
107 lucene::search::BooleanQuery
* bq
= _CLNEW
lucene::search::BooleanQuery();
108 bool isAnd
= query
.type() == Strigi::Query::And
;
109 const vector
<Strigi::Query
>& sub
= query
.subQueries();
110 for (vector
<Strigi::Query
>::const_iterator i
= sub
.begin(); i
!= sub
.end(); ++i
) {
111 lucene::search::Query
* q
= createQuery(*i
);
112 bq
->add(q
, true, isAnd
, i
->negate());
// Entry point of the query translation: a query that has sub-queries is
// handed to createBooleanQuery(), a query without sub-queries to
// createSimpleQuery(). The result of the chosen helper is returned.
117 static lucene::search::Query
* createQuery( const Strigi::Query
& query
)
119 return query
.subQueries().size()
120 ? createBooleanQuery(query
)
121 : createSimpleQuery(query
);
// Picks the builder for a query without sub-queries by the number of fields
// it names:
//   0 fields -> single-field query on the field "text"
//   1 field  -> single-field query on that field
//   more     -> createMultiFieldQuery()
124 static lucene::search::Query
* createSimpleQuery( const Strigi::Query
& query
)
126 switch (query
.fields().size()) {
127 case 0: return createSingleFieldQuery("text", query
);
128 case 1: return createSingleFieldQuery(query
.fields()[0], query
);
129 default: return createMultiFieldQuery(query
);
// Builds the CLucene query that matches `query` against one `field`.
// The field name is first mapped with Util::convertSearchField(). The query
// type then selects what is built:
//   LessThan / LessThanEquals       -> RangeQuery(0, t, false / true)
//   GreaterThan / GreaterThanEquals -> RangeQuery(t, 0, false / true)
//   Keyword                         -> TermQuery on a createKeywordTerm() term
//   remaining branch                -> WildcardQuery if the value contains
//                                      '*' or '?', TermQuery otherwise
// NOTE(review): the `break` statements, the label of the last branch, the
// `else` and the return are on lines missing from this listing (gaps in the
// embedded numbering) - presumably the last branch is `default:`; confirm.
// What happens to the Term `t` after it is handed to a query is also not
// visible here.
133 static lucene::search::Query
* createSingleFieldQuery( const string
& field
,
134 const Strigi::Query
& query
) {
135 qDebug() << "Creating single field query: " << field
.c_str();
136 TString fieldname
= Strigi::Soprano::Util::convertSearchField( field
);
137 lucene::search::Query
* q
;
138 lucene::index::Term
* t
;
139 const string
& val
= query
.term().string();
140 switch (query
.type()) {
// Upper-bounded range; the third argument is false for the strict comparison.
141 case Strigi::Query::LessThan
:
142 t
= createTerm(fieldname
, val
.c_str());
143 q
= _CLNEW
lucene::search::RangeQuery(0, t
, false);
145 case Strigi::Query::LessThanEquals
:
146 t
= createTerm(fieldname
, query
.term().string());
147 q
= _CLNEW
lucene::search::RangeQuery(0, t
, true);
// Lower-bounded range; the term is passed as the first argument instead.
149 case Strigi::Query::GreaterThan
:
150 t
= createTerm(fieldname
, query
.term().string());
151 q
= _CLNEW
lucene::search::RangeQuery(t
, 0, false);
153 case Strigi::Query::GreaterThanEquals
:
154 t
= createTerm(fieldname
, query
.term().string());
155 q
= _CLNEW
lucene::search::RangeQuery(t
, 0, true);
// Keyword queries use the term built by createKeywordTerm().
157 case Strigi::Query::Keyword
:
158 t
= createKeywordTerm(fieldname
, query
.term().string());
159 q
= _CLNEW
lucene::search::TermQuery(t
);
// A value containing '*' or '?' becomes a wildcard query.
162 if (strpbrk(val
.c_str(), "*?")) {
163 t
= createWildCardTerm(fieldname
, val
);
164 q
= _CLNEW
lucene::search::WildcardQuery(t
);
166 t
= createTerm(fieldname
, val
);
167 q
= _CLNEW
lucene::search::TermQuery(t
);
// Builds a BooleanQuery (allocated with _CLNEW) that contains one
// createSingleFieldQuery() result per entry of query.fields(). Each is added
// with the arguments (q, true, false, false).
// NOTE(review): the return statement is not visible in this listing.
174 static lucene::search::Query
* createMultiFieldQuery( const Strigi::Query
& query
)
176 lucene::search::BooleanQuery
* bq
= _CLNEW
lucene::search::BooleanQuery();
177 for (vector
<string
>::const_iterator i
= query
.fields().begin();
178 i
!= query
.fields().end(); ++i
) {
179 lucene::search::Query
* q
= createSingleFieldQuery(*i
, query
);
180 bq
->add(q
, true, false, false);
// Private data of IndexReader: the Soprano model that is queried
// (`repository`) plus createDocument(), which fills an IndexedDocument from
// the statements stored about one resource.
186 class Strigi::Soprano::IndexReader::Private
// Fills `doc` from all statements whose subject is `res`.
// Literal objects are mapped by field name: "text" -> fragment,
// mimetype field -> mimetype, mtime field -> mtime, size field -> size.
// NOTE(review): several lines of this method are missing from the listing
// (declaration of `s`, the error branch body, the path-field assignment, the
// `else` keywords, the return value) - the notes below only describe what is
// visible.
189 bool createDocument( const Node
& res
, IndexedDocument
& doc
) {
190 StatementIterator it
= repository
->listStatements( Statement( res
, Node(), Node() ) );
191 if ( it
.lastError() ) {
195 // use the resource URI as fallback file URI
196 doc
.uri
= res
.uri().toLocalFile().toUtf8().data();
198 while ( it
.next() ) {
// Only statements with a literal object are handled in the visible code.
200 if ( s
.object().isLiteral() ) {
201 std::string fieldName
= Util::fieldName( s
.predicate().uri() );
202 std::string value
= s
.object().toString().toUtf8().data();
204 if (fieldName
== "text") {
205 doc
.fragment
= value
;
// NOTE(review): only the debug output of the path branch is visible here.
207 else if (fieldName
== FieldRegister::pathFieldName
) {
208 qDebug() << "Setting IndexedDocument uri=" << value
.c_str();
211 else if (fieldName
== FieldRegister::mimetypeFieldName
) {
212 doc
.mimetype
= value
;
214 else if (fieldName
== FieldRegister::mtimeFieldName
) {
215 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
216 if ( s
.object().literal().isDateTime() ) {
217 doc
.mtime
= s
.object().literal().toDateTime().toTime_t();
// Non-DateTime literal: the value is read as an unsigned integer.
220 doc
.mtime
= s
.object().literal().toUnsignedInt();
223 else if (fieldName
== FieldRegister::sizeFieldName
) {
224 doc
.size
= s
.object().literal().toInt64();
// The (fieldName, value) pair is stored in the generic properties container.
227 doc
.properties
.insert( make_pair
<const string
, string
>( fieldName
, value
) );
231 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
232 // not literal statements
239 // ::Soprano::Index::IndexFilterModel* repository;
// The model all queries in this file are executed against.
240 ::Soprano::Model
* repository
;
// Constructs a reader that works on `model`: logs the current thread and
// stores the pointer in d->repository.
// NOTE(review): the allocation of `d` is on a line missing from this listing
// (numbering jumps from 247 to 249); whether the reader takes ownership of
// `model` cannot be told from here.
244 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model
* model
)
245 : Strigi::IndexReader()
247 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
249 d
->repository
= model
;
// Destructor. Only the debug output is visible in this listing.
// NOTE(review): any cleanup of `d` would be on lines not shown here - confirm.
253 Strigi::Soprano::IndexReader::~IndexReader()
255 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
// Runs `query` against the repository and walks over all hits.
// The Strigi query is converted with createQuery(), serialized with
// toString() and executed as a user query language named "lucene". Each hit's
// first binding is written to the debug log.
// NOTE(review): the hit counter, the release of `q` and the return statement
// are on lines missing from this listing - confirm against the full file.
260 int32_t Strigi::Soprano::IndexReader::countHits( const Query
& query
)
262 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
264 lucene::search::Query
* q
= createQuery( query
);
265 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( q
->toString(), true ),
266 ::Soprano::Query::QUERY_LANGUAGE_USER
,
267 QLatin1String( "lucene" ) );
268 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
270 while ( hits
.next() ) {
271 qDebug() << "Query hit:" << hits
.binding( 0 );
// Runs `query` and appends one row per hit to `result`.
// For each hit, a row is built with one Variant per entry of `fields`: the
// statements with the hit's "resource" binding as subject and the field's URI
// as predicate are listed, and either the object of the current statement
// (converted with Util::nodeToVariant) or an empty Variant is appended.
// `types` is walked in parallel with `fields`; running out of types before
// fields is treated as fatal (qFatal).
// NOTE(review): the remaining parameters of the signature, the condition
// choosing between the two push_back calls, and the iterator increments are
// on lines missing from this listing.
279 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query
& query
,
280 const std::vector
<std::string
>& fields
,
281 const std::vector
<Strigi::Variant::Type
>& types
,
282 std::vector
<std::vector
<Strigi::Variant
> >& result
,
285 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
286 lucene::search::Query
* bq
= createQuery( query
);
287 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( bq
->toString(), true ),
288 ::Soprano::Query::QUERY_LANGUAGE_USER
,
289 QLatin1String( "lucene" ) );
290 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
293 while ( hits
.next() ) {
302 // ::Soprano::Index::QueryHit hit = *hits;
// One result row per hit, one column per requested field.
303 std::vector
<Strigi::Variant
> resultRow
;
304 std::vector
<std::string
>::const_iterator fieldIt
= fields
.begin();
305 std::vector
<Strigi::Variant::Type
>::const_iterator typesIt
= types
.begin();
306 while ( fieldIt
!= fields
.end() ) {
307 if ( typesIt
== types
.end() ) {
308 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
// Look up the value(s) of this field for the hit resource.
312 StatementIterator it
= d
->repository
->listStatements( Statement( hits
.binding( "resource" ),
313 Util::fieldUri( *fieldIt
),
315 // FIXME: what if we have a field with a cardinality > 1?
317 resultRow
.push_back( Util::nodeToVariant( it
.current().object() ) );
320 resultRow
.push_back( Strigi::Variant() );
327 result
.push_back( resultRow
);
// Runs `query` and returns the matching documents.
// For every hit, the score is read from binding 1 and the document is built
// from the resource in binding 0 via Private::createDocument(). Documents
// that could be created are appended to the result; failures are logged
// together with the repository's last error.
// NOTE(review): no use of `off` / `max` appears in the visible lines, and the
// last executeQuery argument and the return statement are on lines missing
// from this listing - confirm against the full file.
333 std::vector
<Strigi::IndexedDocument
> Strigi::Soprano::IndexReader::query( const Query
& query
, int off
, int max
)
335 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
336 vector
<IndexedDocument
> results
;
337 lucene::search::Query
* bq
= createQuery( query
);
338 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( bq
->toString(), true ),
339 ::Soprano::Query::QUERY_LANGUAGE_USER
,
341 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
344 while ( hits
.next() ) {
353 IndexedDocument result
;
354 // ::Soprano::Index::QueryHit hit = *hits;
// Binding 1 carries the score, binding 0 the resource.
355 result
.score
= hits
.binding( 1 ).literal().toDouble();
356 if ( d
->createDocument( hits
.binding( 0 ), result
) ) {
357 results
.push_back( result
);
360 qDebug() << "Failed to create indexed document for resource " << hits
.binding( 0 ) << ": " << d
->repository
->lastError();
// Fills `children` with path -> modification time for every resource whose
// parent-location field equals `parent`. The lookup is a SPARQL query built
// from the field URIs of FieldRegister::parentLocationFieldName,
// mtimeFieldName and pathFieldName.
// NOTE(review): `parent` is inserted into the query text with arg() and no
// escaping is visible; a path containing a double quote or a "%N" place
// marker could presumably change the query - confirm and escape if needed.
368 // an empty parent url is perfectly valid as strigi stores a parent url for everything
369 void Strigi::Soprano::IndexReader::getChildren( const std::string
& parent
,
370 std::map
<std::string
, time_t>& children
)
372 qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
// %1 parent-location field, %2 parent value, %3 xsd:string, %4 mtime field, %5 path field.
373 QString query
= QString( "select distinct ?path ?mtime where { ?r <%1> \"%2\"^^<%3> . ?r <%4> ?mtime . ?r <%5> ?path . }")
374 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName
).toString() )
375 .arg( QString::fromUtf8( parent
.c_str() ) )
376 .arg( Vocabulary::XMLSchema::string().toString() )
377 .arg( Util::fieldUri( FieldRegister::mtimeFieldName
).toString() )
378 .arg( Util::fieldUri( FieldRegister::pathFieldName
).toString() );
380 qDebug() << "running getChildren query:" << query
;
382 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
384 while ( result
.next() ) {
385 Node pathNode
= result
.binding( "path" );
386 Node mTimeNode
= result
.binding( "mtime" );
387 qDebug() << "file in index: " << pathNode
.toString() << "mtime:" << mTimeNode
.literal().toDateTime() << "(" << mTimeNode
.literal().toDateTime().toTime_t() << ")";
389 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
390 if ( mTimeNode
.literal().isDateTime() ) {
391 children
[std::string( pathNode
.toString().toUtf8().data() )] = mTimeNode
.literal().toDateTime().toTime_t();
// Non-DateTime literal: the value is read as an unsigned integer.
394 children
[std::string( pathNode
.toString().toUtf8().data() )] = mTimeNode
.literal().toUnsignedInt();
// Document count of the index. Only the debug output and a FIXME are visible
// in this listing.
// NOTE(review): the returned value is on a line not shown here - confirm.
400 int32_t Strigi::Soprano::IndexReader::countDocuments()
402 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
403 // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
// Word count of the index. Only the debug output and a FIXME are visible in
// this listing.
// NOTE(review): the returned value is on a line not shown here - confirm.
408 int32_t Strigi::Soprano::IndexReader::countWords()
410 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
411 // FIXME: what to do here? use the index? Count the predicates?
// Reports the index size as the number of statements in the repository
// (the result of statementCount()), after logging the current thread.
416 int64_t Strigi::Soprano::IndexReader::indexSize()
418 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
419 return d
->repository
->statementCount();
// Looks up the modification time stored for the resource whose path field
// equals `uri`, using a SPARQL query. A DateTime literal is converted with
// toTime_t(); any other literal is read as an unsigned integer.
// NOTE(review): the declaration of `mtime`, the check that the iterator has a
// result, and the return are on lines missing from this listing. `uri` is
// inserted into the query text without visible escaping; a double quote or a
// "%N" place marker in it could presumably change the query - confirm.
423 time_t Strigi::Soprano::IndexReader::mTime( const std::string
& uri
)
425 qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
// Place markers: %1 mtime field, %2 path field, %3 uri value, %4 xsd:string.
426 QString query
= QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
427 .arg( Util::fieldUri( FieldRegister::mtimeFieldName
).toString() )
428 .arg( Util::fieldUri( FieldRegister::pathFieldName
).toString() )
429 .arg( QString::fromUtf8( uri
.c_str() ) )
430 .arg( Vocabulary::XMLSchema::string().toString() );
432 qDebug() << "mTime( " << uri
.c_str() << ") query:" << query
;
434 QueryResultIterator it
= d
->repository
->executeQuery( query
, ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
438 ::Soprano::LiteralValue val
= it
.binding( "mtime" ).literal();
440 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
441 if ( val
.isDateTime() ) {
442 mtime
= val
.toDateTime().toTime_t();
445 mtime
= val
.toUnsignedInt();
// Collects the field names known to the repository: runs a SPARQL query for
// all distinct predicates and converts each predicate URI to a field name
// with Util::fieldName().
// NOTE(review): the return statement is not visible in this listing.
452 std::vector
<std::string
> Strigi::Soprano::IndexReader::fieldNames()
454 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
455 // This is a weird method
456 // Our list of field names (the predicates) is probably awfully long.
458 std::vector
<std::string
> fields
;
459 QueryResultIterator it
= d
->repository
->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
460 while ( it
.next() ) {
461 fields
.push_back( Util::fieldName( it
.binding("p").uri() ) );
// Not implemented: logs the current thread and returns an empty vector.
// None of the parameters is used in the visible lines.
467 std::vector
<std::pair
<std::string
,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string
& query
,
468 const std::string
& fieldname
,
469 const std::string
& labeltype
)
475 // FIXME: what is meant by fieldname and labeltype?
476 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
477 // IMPLEMENTME? Seems not like a very important method though.
478 return std::vector
<std::pair
<std::string
,uint32_t> >();
// Placeholder: both parameters are marked unused and only the current thread
// is logged.
// NOTE(review): the return statement is on a line not shown in this listing;
// the comment below suggests a fixed value is returned - confirm.
482 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string
& keywordprefix
,
483 const std::vector
<std::string
>& fieldnames
)
485 Q_UNUSED(keywordprefix
);
486 Q_UNUSED(fieldnames
);
488 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
489 // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
// Not implemented: `keywordmatch` and `fieldnames` are marked unused, the
// current thread is logged and an empty vector is returned.
// NOTE(review): no use of `max` / `offset` appears in the visible lines.
494 std::vector
<std::string
> Strigi::Soprano::IndexReader::keywords( const std::string
& keywordmatch
,
495 const std::vector
<std::string
>& fieldnames
,
496 uint32_t max
, uint32_t offset
)
498 Q_UNUSED(keywordmatch
);
499 Q_UNUSED(fieldnames
);
503 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
504 // IMPLEMENTME? Seems like a rarely used method...
505 return std::vector
<std::string
>();