2 $Id: sourceheader 511311 2006-02-19 14:51:05Z trueg $
4 This file is part of the Strigi project.
5 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of
10 the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this library; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
23 #include "sopranoindexwriter.h"
26 #include <Soprano/Soprano>
27 #include <Soprano/Index/IndexFilterModel>
28 #include <Soprano/Index/CLuceneIndex>
29 #include <Soprano/Vocabulary/RDF>
31 #include <QtCore/QList>
32 #include <QtCore/QHash>
33 #include <QtCore/QVariant>
34 #include <QtCore/QFileInfo>
35 #include <QtCore/QFile>
36 #include <QtCore/QUrl>
37 #include <QtCore/QDebug>
38 #include <QtCore/QThread>
39 #include <QtCore/QDateTime>
51 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
53 using namespace Soprano
;
56 uint
qHash( const std::string
& s
)
58 return qHash( s
.c_str() );
// Helper used by createResourceUri() to obtain the path of the archive an
// indexed entry lives in.
//
// Visible logic: as long as p still contains a '/' at an index greater than 0,
// it is tested whether p names a regular file.
//
// NOTE(review): the declarations of p and i, the statement that shortens p and
// the return statements are not part of this excerpt - presumably p starts as a
// copy of path and is cut off at the last '/' in every round; confirm against
// the complete file.
62 QString
findArchivePath( const QString
& path
) {
// i receives the position of the last '/'; the loop ends once no '/' is found
// (-1) or only a leading one is left (0)
65 while ( ( i
= p
.lastIndexOf( '/' ) ) > 0 ) {
// QFileInfo::isFile() is true for regular files only, not for directories
67 if ( QFileInfo( p
).isFile() ) {
// Builds the resource URI for the file described by idx.
//
// The path reported by Strigi is decoded from the local file name encoding,
// made absolute and converted to a URL. For entries with depth() > 0 whose
// archive path (see findArchivePath()) exists, the scheme is replaced by
// "tar" or "zip" depending on the archive's file name suffix.
//
// NOTE(review): closing braces and the return statement are not part of this
// excerpt; presumably url is returned - confirm against the complete file.
74 QUrl
createResourceUri( const Strigi::AnalysisResult
* idx
) {
75 // HACK: Strigi includes analysers that recurse into tar or zip archives and index
76 // the files therein. In KDE these files could perfectly be handled through kio slaves
77 // such as tar:/ or zip:/
78 // Here we try to use KDE-compatible URIs for these indexed files the best we can
79 // everything else defaults to file:/
// file names are in local encoding, hence QFile::decodeName() instead of fromUtf8()
80 QString path
= QFile::decodeName( idx
->path().c_str() );
81 QUrl url
= QUrl::fromLocalFile( QFileInfo( path
).absoluteFilePath() );
// depth() > 0: treated as an entry nested inside another file
82 if ( idx
->depth() > 0 ) {
83 QString archivePath
= findArchivePath( path
);
84 if ( QFile::exists( archivePath
) ) {
// the suffix comparison is case sensitive
85 if ( archivePath
.endsWith( QLatin1String( ".tar" ) ) ||
86 archivePath
.endsWith( QLatin1String( ".tar.gz" ) ) ||
87 archivePath
.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
88 url
.setScheme( "tar" );
90 else if ( archivePath
.endsWith( QLatin1String( ".zip" ) ) ) {
91 url
.setScheme( "zip" );
// fall back to file: if no scheme has been set so far
97 if ( url
.scheme().isEmpty() ) {
98 url
.setScheme( "file" );
107 // caching URIs for little speed improvement
// Private data of the IndexWriter: the model all statements are written to
// and the mapping from Strigi field type names to QVariant types.
//
// NOTE(review): several lines of this class (opening brace, access specifier,
// constructor head, closing braces) are not part of this excerpt.
115 class Strigi::Soprano::IndexWriter::Private
// constructor: registers the QVariant type used for each Strigi field type
119 : indexTransactionID( 0 ) {
120 literalTypes
[FieldRegister::stringType
] = QVariant::String
;
121 literalTypes
[FieldRegister::floatType
] = QVariant::Double
;
122 literalTypes
[FieldRegister::integerType
] = QVariant::Int
;
123 literalTypes
[FieldRegister::binaryType
] = QVariant::ByteArray
;
124 literalTypes
[FieldRegister::datetimeType
] = QVariant::DateTime
; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
// Looks up the QVariant type registered for a Strigi field type name.
// Unknown type names fall back to QVariant::String.
// NOTE(review): the branch for a successful lookup is not part of this
// excerpt; presumably it returns the mapped value.
127 QVariant::Type
literalType( const std::string
& strigiType
) {
128 QHash
<std::string
, QVariant::Type
>::const_iterator it
= literalTypes
.find( strigiType
);
129 if ( it
== literalTypes
.constEnd() ) {
130 // qDebug() << "Unknown field type: " << strigiType.c_str() << "falling back to string";
131 return QVariant::String
;
// Converts a raw value delivered by Strigi into a Soprano literal.
// The bytes are interpreted as UTF-8 text. Datetime fields are parsed as an
// unsigned number and converted with QDateTime::fromTime_t(); every other
// type is created through LiteralValue::fromString().
// NOTE(review): the declaration of the size parameter is not part of this
// excerpt.
138 LiteralValue
createLiteraValue( const std::string
& strigiDataType
,
139 const unsigned char* data
,
141 QString value
= QString::fromUtf8( ( const char* )data
, size
);
142 QVariant::Type type
= literalType( strigiDataType
);
143 if ( type
== QVariant::DateTime
) {
// toUInt() yields 0 if the text is not a valid unsigned number
144 return LiteralValue( QDateTime::fromTime_t( value
.toUInt() ) );
147 return LiteralValue::fromString( value
, type
);
151 // ::Soprano::Index::IndexFilterModel* repository;
// model all statements are added to and removed from; assigned in the
// IndexWriter constructor
152 ::Soprano::Model
* repository
;
// initialised to 0 by the constructor; not used anywhere else in this excerpt
153 int indexTransactionID
;
// Strigi field type name -> QVariant type, filled in the constructor
156 QHash
<std::string
, QVariant::Type
> literalTypes
;
// Creates an index writer that stores all data in model.
// The pointer is only stored in the private data.
// NOTE(review): no ownership transfer is visible in this excerpt, and the
// line that sets up d is not part of it either.
160 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model
* model
)
161 : Strigi::IndexWriter()
163 // qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
165 d
->repository
= model
;
166 // qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
170 Strigi::Soprano::IndexWriter::~IndexWriter()
176 void Strigi::Soprano::IndexWriter::commit()
181 // delete all indexed data for the files listed in entries
// For every entry a SPARQL query looks up the graph ?g that is marked as
// indexGraphFor a resource carrying the path field with a matching string
// literal. If the query has a result, that graph is removed as a context
// (the indexed data) and all statements with the graph as subject are
// removed as well (the graph's metadata).
//
// entries: paths of the files whose index data is to be removed
182 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector
<std::string
>& entries
)
184 // qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
// URI of the property that stores a file's path
186 QString systemLocationUri
= Util::fieldUri( FieldRegister::pathFieldName
).toString();
187 for ( unsigned int i
= 0; i
< entries
.size(); ++i
) {
// NOTE(review): createResourceUri() decodes paths with QFile::decodeName()
// while this line uses fromUtf8() - confirm which encoding entries use
188 QString path
= QString::fromUtf8( entries
[i
].c_str() );
189 // QString path = QString::fromUtf8( entries[i].c_str() );
// NOTE(review): the .arg() call that fills %2 is not part of this excerpt;
// presumably it inserts path. If so, a path containing '"' or '\' would
// need escaping before being placed inside the quoted literal - confirm.
190 QString query
= QString( "select ?g where { ?r <%1> \"%2\"^^<%3> . "
191 "?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" )
192 .arg( systemLocationUri
)
194 .arg( Vocabulary::XMLSchema::string().toString() );
196 qDebug() << "deleteEntries query:" << query
;
198 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
// only the first result of the query is handled (if, not while)
199 if ( result
.next() ) {
200 Node indexGraph
= result
.binding( "g" );
203 qDebug() << "Found indexGraph to delete:" << indexGraph
;
205 // delete the indexed data
206 d
->repository
->removeContext( indexGraph
);
208 // delete the metadata
209 d
->repository
->removeAllStatements( Statement( indexGraph
, Node(), Node() ) );
215 void Strigi::Soprano::IndexWriter::deleteAllEntries()
217 // qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
219 // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
220 QString query
= QString( "select ?g where { ?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" );
222 qDebug() << "deleteAllEntries query:" << query
;
224 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
225 QList
<Node
> allIndexGraphs
= result
.iterateBindings( "g" ).allNodes();
226 for ( QList
<Node
>::const_iterator it
= allIndexGraphs
.constBegin(); it
!= allIndexGraphs
.constEnd(); ++it
) {
227 Node indexGraph
= *it
;
229 qDebug() << "Found indexGraph to delete:" << indexGraph
;
231 // delete the indexed data
232 d
->repository
->removeContext( indexGraph
);
234 // delete the metadata
235 d
->repository
->removeAllStatements( Statement( indexGraph
, Node(), Node() ) );
240 // cache the field type mapping in the RegisteredFields
241 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister
& )
247 // cleanup field type caching
248 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister
& )
254 // called for each indexed file
// Creates the FileMetaData record for idx, fills in the file's resource URI
// and the context (graph) URI and attaches the record to idx as writer data.
//
// The context is either taken from the subject of an existing indexGraphFor
// statement or freshly created with Util::uniqueUri().
// NOTE(review): the remaining arguments of listStatements() and the condition
// that selects between the two assignments are not part of this excerpt.
255 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult
* idx
)
257 // qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
// heap allocated and handed to idx below
// NOTE(review): the matching delete is not visible in this excerpt - confirm
// that it is released again
258 FileMetaData
* data
= new FileMetaData();
259 data
->fileUri
= createResourceUri( idx
);
261 // let's check if we already have data on the file
262 StatementIterator it
= d
->repository
->listStatements( Statement( Node(),
263 QUrl( "http://www.strigi.org/fields#indexGraphFor" ), // FIXME: put the URI somewhere else
// reuse the graph found by the query above
266 data
->context
= it
.current().subject().uri();
// otherwise create a new, unused graph URI
269 data
->context
= Util::uniqueUri( "http://www.strigi.org/contexts/", d
->repository
);
272 qDebug() << "Starting analysis for" << data
->fileUri
<< "in thread" << QThread::currentThread();
// the add* methods and finishAnalysis() read the record back via writerData()
274 idx
->setWriterData( data
);
278 // plain text associated with the indexed file but no field name.
279 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult
* idx
, const char* text
, int32_t length
)
281 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
282 md
->content
.append( text
, length
);
286 // convenience method for adding string fields
287 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
288 const RegisteredField
* fieldname
,
289 const std::string
& value
)
291 addValue( idx
, fieldname
, ( unsigned char* )value
.c_str(), value
.length() );
295 // the main addValue method
// Wraps the raw bytes into a std::string and forwards them, together with
// the field's key as name, to the ( field, name, value ) overload.
// NOTE(review): the declaration of the size parameter is not part of this
// excerpt.
296 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
297 const RegisteredField
* fieldname
,
298 const unsigned char* data
,
// the length is passed explicitly, so data does not have to be 0-terminated
301 addValue( idx
, fieldname
, fieldname
->key(), std::string( ( const char* )data
, size
) );
// Stores value as a statement about the file attached to idx. Empty values
// are ignored.
//
// If name is the rdf:type URI, the value is stored twice: as a resource
// object of rdf:type and as a literal of the rdf-string-type property.
// Otherwise a literal is stored for the property Util::fieldUri( name ).
//
// NOTE(review): the final argument(s) of each addStatement() call and the
// else that introduces the last call are not part of this excerpt.
305 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
, const RegisteredField
* field
,
306 const std::string
& name
, const std::string
& value
)
308 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
309 if ( value
.length() > 0 ) {
310 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
// NOTE(review): none of the types registered in Private is QVariant::Invalid
// and the visible fallback of literalType() is QVariant::String, so this
// branch looks unreachable - confirm
312 if ( d
->literalType( field
->type() ) == QVariant::Invalid
) {
313 // FIXME: only save it in the index: binary data (how does strigi handle that anyway??)
315 else if ( QString( name
.c_str() ) == ::Soprano::Vocabulary::RDF::type().toString() ) {
316 // Strigi uses rdf:type improperly since it stores the value as a string. We have to
317 // make sure it is a resource. The problem is that this results in the type not being
318 // indexed properly. Thus, it cannot be searched with normal lucene queries.
319 // That is why we need to introduce a stringType property
// the type as a proper resource ...
321 d
->repository
->addStatement( Statement( md
->fileUri
,
322 ::Soprano::Vocabulary::RDF::type(),
323 QUrl( QString::fromUtf8( value
.c_str() ) ),
// ... and once more as a literal
// NOTE(review): this URI uses strigi.sourceforge.net while the other URIs in
// this file use www.strigi.org - confirm that this is intended
325 d
->repository
->addStatement( Statement( md
->fileUri
,
326 QUrl( "http://strigi.sourceforge.net/fields#rdf-string-type" ),
327 d
->createLiteraValue( field
->type(), ( unsigned char* )value
.c_str(), value
.length() ),
// any other field: literal typed according to the field's Strigi type
331 d
->repository
->addStatement( Statement( md
->fileUri
,
332 Util::fieldUri( name
),
333 d
->createLiteraValue( field
->type(), ( unsigned char* )value
.c_str(), value
.length() ),
337 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
341 // convenience method for adding unsigned int (or datetime!) fields
// Stores value as a literal for the property Util::fieldUri( fieldname->key() )
// of the file attached to idx. For fields of type datetime the number is
// converted with QDateTime::fromTime_t() and stored as a date/time literal.
// NOTE(review): the declaration of the value parameter and the remaining
// arguments of addStatement() are not part of this excerpt.
342 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
343 const RegisteredField
* fieldname
,
346 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
347 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
// default: the plain number
348 LiteralValue
val( value
);
349 if ( fieldname
->type() == FieldRegister::datetimeType
) {
350 // qDebug() << "(Soprano::IndexWriter) adding datetime value.";
351 val
= QDateTime::fromTime_t( value
);
354 d
->repository
->addStatement( Statement( md
->fileUri
,
355 Util::fieldUri( fieldname
->key() ),
358 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
362 // convenience method for adding int fields
// Stores value as a literal for the property Util::fieldUri( fieldname->key() )
// of the file attached to idx.
// NOTE(review): the declaration of the value parameter and the remaining
// arguments of addStatement() are not part of this excerpt.
363 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
364 const RegisteredField
* fieldname
,
367 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
// record that was attached to idx as writer data
368 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
369 d
->repository
->addStatement( Statement( md
->fileUri
,
370 Util::fieldUri( fieldname
->key() ),
371 LiteralValue( value
),
373 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
377 // convenience method for adding double fields
// Stores value as a literal for the property Util::fieldUri( fieldname->key() )
// of the file attached to idx.
// NOTE(review): the declaration of the value parameter and the remaining
// arguments of addStatement() are not part of this excerpt.
378 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
379 const RegisteredField
* fieldname
,
382 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
// record that was attached to idx as writer data
383 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
384 d
->repository
->addStatement( Statement( md
->fileUri
,
385 Util::fieldUri( fieldname
->key() ),
386 LiteralValue( value
),
388 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
// Adds a single statement to the repository. subject, predicate and object
// are each decoded as UTF-8 and turned into resource nodes (QUrl), i.e. the
// object is never stored as a literal here.
// NOTE(review): the last argument of addStatement() is not part of this
// excerpt.
392 void Strigi::Soprano::IndexWriter::addTriplet( const std::string
& subject
,
393 const std::string
& predicate
, const std::string
& object
)
395 // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
398 // FIXME: create an NRL metadata graph
399 d
->repository
->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject
.c_str() ) ) ),
400 Node( QUrl( QString::fromUtf8( predicate
.c_str() ) ) ),
401 Node( QUrl( QString::fromUtf8( object
.c_str() ) ) ),
406 // called after each indexed file
// Writes the data that is only known once a file has been analysed
// completely: the collected plain text content (if any), the xesam:File
// type of the file and the statements describing the file's graph. The
// writer data of idx is reset to 0 at the end.
//
// NOTE(review): the final argument(s) of the addStatement() calls and the
// statements between the last addStatement() and setWriterData( 0 ) are not
// part of this excerpt - confirm that the FileMetaData record is released.
407 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult
* idx
)
409 // qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
410 FileMetaData
* md
= static_cast<FileMetaData
*>( idx
->writerData() );
// content holds the text collected by addText()
412 if ( md
->content
.length() > 0 ) {
// c_str() is passed without a length, so the text ends at the first 0 byte
413 d
->repository
->addStatement( Statement( md
->fileUri
,
414 Util::fieldUri( FieldRegister::contentFieldName
),
415 LiteralValue( QString::fromUtf8( md
->content
.c_str() ) ),
419 // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
420 // Thus, here we go the easy way and mark each indexed file as a xesam:File.
421 d
->repository
->addStatement( Statement( md
->fileUri
,
422 Vocabulary::RDF::type(),
423 Vocabulary::Xesam::File(),
427 // create the provenance data for the data graph
428 // TODO: add more data at some point when it becomes of interest
429 QUrl metaDataContext
= Util::uniqueUri( "http://www.strigi.org/graphMetaData/", d
->repository
);
// the data graph is an nrl:InstanceBase ...
430 d
->repository
->addStatement( Statement( md
->context
,
431 Vocabulary::RDF::type(),
432 Vocabulary::NRL::InstanceBase(),
// ... with the current time as creation date ...
434 d
->repository
->addStatement( Statement( md
->context
,
435 Vocabulary::NAO::created(),
436 LiteralValue( QDateTime::currentDateTime() ),
// ... and the link that startAnalysis() and deleteEntries() query for
438 d
->repository
->addStatement( Statement( md
->context
,
439 QUrl( "http://www.strigi.org/fields#indexGraphFor" ), // FIXME: put the URI somewhere else
// the metadata graph itself is typed as nrl:GraphMetadata
442 d
->repository
->addStatement( Statement( metaDataContext
,
443 Vocabulary::RDF::type(),
444 Vocabulary::NRL::GraphMetadata(),
449 idx
->setWriterData( 0 );
451 // qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();