Make a branch to make krunner Good Enough For Aaron™.
[kdebase/uwolfer.git] / runtime / nepomuk / strigibackend / sopranoindexwriter.cpp
blob6210da038af1f4e5bb22508ca8d8ec2ed8b0ac3c
1 /*
2 $Id: sourceheader 511311 2006-02-19 14:51:05Z trueg $
4 This file is part of the Strigi project.
5 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of
10 the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this library; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
23 #include "sopranoindexwriter.h"
24 #include "util.h"
26 #include <Soprano/Soprano>
27 #include <Soprano/Index/IndexFilterModel>
28 #include <Soprano/Index/CLuceneIndex>
29 #include <Soprano/Vocabulary/RDF>
31 #include <QtCore/QList>
32 #include <QtCore/QHash>
33 #include <QtCore/QVariant>
34 #include <QtCore/QFileInfo>
35 #include <QtCore/QFile>
36 #include <QtCore/QUrl>
37 #include <QtCore/QDebug>
38 #include <QtCore/QThread>
39 #include <QtCore/QDateTime>
41 #include <sys/stat.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <errno.h>
46 #include <map>
47 #include <sstream>
48 #include <algorithm>
51 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
53 using namespace Soprano;
56 uint qHash( const std::string& s )
58 return qHash( s.c_str() );
61 namespace {
62 QString findArchivePath( const QString& path ) {
63 QString p( path );
64 int i = 0;
65 while ( ( i = p.lastIndexOf( '/' ) ) > 0 ) {
66 p.truncate( i );
67 if ( QFileInfo( p ).isFile() ) {
68 return p;
71 return QString();
74 QUrl createResourceUri( const Strigi::AnalysisResult* idx ) {
75 // HACK: Strigi includes analysers that recurse into tar or zip archives and index
76 // the files therein. In KDE these files could perfectly be handled through kio slaves
77 // such as tar:/ or zip:/
78 // Here we try to use KDE-compatible URIs for these indexed files the best we can
79 // everything else defaults to file:/
80 QString path = QFile::decodeName( idx->path().c_str() );
81 QUrl url = QUrl::fromLocalFile( QFileInfo( path ).absoluteFilePath() );
82 if ( idx->depth() > 0 ) {
83 QString archivePath = findArchivePath( path );
84 if ( QFile::exists( archivePath ) ) {
85 if ( archivePath.endsWith( QLatin1String( ".tar" ) ) ||
86 archivePath.endsWith( QLatin1String( ".tar.gz" ) ) ||
87 archivePath.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
88 url.setScheme( "tar" );
90 else if ( archivePath.endsWith( QLatin1String( ".zip" ) ) ) {
91 url.setScheme( "zip" );
96 // fallback for all
97 if ( url.scheme().isEmpty() ) {
98 url.setScheme( "file" );
101 return url;
104 class FileMetaData
106 public:
107 // caching URIs for little speed improvement
108 QUrl fileUri;
109 QUrl context;
110 std::string content;
115 class Strigi::Soprano::IndexWriter::Private
117 public:
118 Private()
119 : indexTransactionID( 0 ) {
120 literalTypes[FieldRegister::stringType] = QVariant::String;
121 literalTypes[FieldRegister::floatType] = QVariant::Double;
122 literalTypes[FieldRegister::integerType] = QVariant::Int;
123 literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
124 literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
127 QVariant::Type literalType( const std::string& strigiType ) {
128 QHash<std::string, QVariant::Type>::const_iterator it = literalTypes.find( strigiType );
129 if ( it == literalTypes.constEnd() ) {
130 // qDebug() << "Unknown field type: " << strigiType.c_str() << "falling back to string";
131 return QVariant::String;
133 else {
134 return *it;
138 LiteralValue createLiteraValue( const std::string& strigiDataType,
139 const unsigned char* data,
140 uint32_t size ) {
141 QString value = QString::fromUtf8( ( const char* )data, size );
142 QVariant::Type type = literalType( strigiDataType );
143 if ( type == QVariant::DateTime ) {
144 return LiteralValue( QDateTime::fromTime_t( value.toUInt() ) );
146 else {
147 return LiteralValue::fromString( value, type );
151 // ::Soprano::Index::IndexFilterModel* repository;
152 ::Soprano::Model* repository;
153 int indexTransactionID;
155 private:
156 QHash<std::string, QVariant::Type> literalTypes;
160 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model* model )
161 : Strigi::IndexWriter()
163 // qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
164 d = new Private;
165 d->repository = model;
166 // qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
170 Strigi::Soprano::IndexWriter::~IndexWriter()
172 delete d;
176 void Strigi::Soprano::IndexWriter::commit()
181 // delete all indexed data for the files listed in entries
182 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector<std::string>& entries )
184 // qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
186 QString systemLocationUri = Util::fieldUri( FieldRegister::pathFieldName ).toString();
187 for ( unsigned int i = 0; i < entries.size(); ++i ) {
188 QString path = QString::fromUtf8( entries[i].c_str() );
189 // QString path = QString::fromUtf8( entries[i].c_str() );
190 QString query = QString( "select ?g where { ?r <%1> \"%2\"^^<%3> . "
191 "?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" )
192 .arg( systemLocationUri )
193 .arg( path )
194 .arg( Vocabulary::XMLSchema::string().toString() );
196 qDebug() << "deleteEntries query:" << query;
198 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
199 if ( result.next() ) {
200 Node indexGraph = result.binding( "g" );
201 result.close();
203 qDebug() << "Found indexGraph to delete:" << indexGraph;
205 // delete the indexed data
206 d->repository->removeContext( indexGraph );
208 // delete the metadata
209 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
215 void Strigi::Soprano::IndexWriter::deleteAllEntries()
217 // qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
219 // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
220 QString query = QString( "select ?g where { ?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" );
222 qDebug() << "deleteAllEntries query:" << query;
224 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
225 QList<Node> allIndexGraphs = result.iterateBindings( "g" ).allNodes();
226 for ( QList<Node>::const_iterator it = allIndexGraphs.constBegin(); it != allIndexGraphs.constEnd(); ++it ) {
227 Node indexGraph = *it;
229 qDebug() << "Found indexGraph to delete:" << indexGraph;
231 // delete the indexed data
232 d->repository->removeContext( indexGraph );
234 // delete the metadata
235 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
240 // cache the field type mapping in the RegisteredFields
241 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister& )
243 // nothing to do ATM
247 // cleanup field type caching
248 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister& )
250 // nothing to do ATM
254 // called for each indexed file
255 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult* idx )
257 // qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
258 FileMetaData* data = new FileMetaData();
259 data->fileUri = createResourceUri( idx );
261 // let's check if we already have data on the file
262 StatementIterator it = d->repository->listStatements( Statement( Node(),
263 QUrl( "http://www.strigi.org/fields#indexGraphFor" ), // FIXME: put the URI somewhere else
264 data->fileUri ) );
265 if ( it.next() ) {
266 data->context = it.current().subject().uri();
268 else {
269 data->context = Util::uniqueUri( "http://www.strigi.org/contexts/", d->repository );
272 qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
274 idx->setWriterData( data );
278 // plain text accociated with the indexed file but no field name.
279 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult* idx, const char* text, int32_t length )
281 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
282 md->content.append( text, length );
286 // convenience method for adding string fields
287 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
288 const RegisteredField* fieldname,
289 const std::string& value )
291 addValue( idx, fieldname, ( unsigned char* )value.c_str(), value.length() );
295 // the main addValue method
296 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
297 const RegisteredField* fieldname,
298 const unsigned char* data,
299 uint32_t size )
301 addValue( idx, fieldname, fieldname->key(), std::string( ( const char* )data, size ) );
305 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx, const RegisteredField* field,
306 const std::string& name, const std::string& value )
308 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
309 if ( value.length() > 0 ) {
310 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
312 if ( d->literalType( field->type() ) == QVariant::Invalid ) {
313 // FIXME: only save it in the index: binary data (how does strigi handle that anyway??)
315 else if ( QString( name.c_str() ) == ::Soprano::Vocabulary::RDF::type().toString() ) {
316 // Strigi uses rdf:type improperly since it stores the value as a string. We have to
317 // make sure it is a resource. The problem is that this results in the type not being
318 // indexed properly. Thus, it cannot be searched with normal lucene queries.
319 // That is why we need to introduce a stringType property
321 d->repository->addStatement( Statement( md->fileUri,
322 ::Soprano::Vocabulary::RDF::type(),
323 QUrl( QString::fromUtf8( value.c_str() ) ),
324 md->context) );
325 d->repository->addStatement( Statement( md->fileUri,
326 QUrl( "http://strigi.sourceforge.net/fields#rdf-string-type" ),
327 d->createLiteraValue( field->type(), ( unsigned char* )value.c_str(), value.length() ),
328 md->context) );
330 else {
331 d->repository->addStatement( Statement( md->fileUri,
332 Util::fieldUri( name ),
333 d->createLiteraValue( field->type(), ( unsigned char* )value.c_str(), value.length() ),
334 md->context) );
337 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
341 // convenience method for adding unsigned int (or datetime!) fields
342 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
343 const RegisteredField* fieldname,
344 uint32_t value )
346 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
347 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
348 LiteralValue val( value );
349 if ( fieldname->type() == FieldRegister::datetimeType ) {
350 // qDebug() << "(Soprano::IndexWriter) adding datetime value.";
351 val = QDateTime::fromTime_t( value );
354 d->repository->addStatement( Statement( md->fileUri,
355 Util::fieldUri( fieldname->key() ),
356 val,
357 md->context) );
358 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
362 // convenience method for adding int fields
363 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
364 const RegisteredField* fieldname,
365 int32_t value )
367 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
368 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
369 d->repository->addStatement( Statement( md->fileUri,
370 Util::fieldUri( fieldname->key() ),
371 LiteralValue( value ),
372 md->context) );
373 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
377 // convenience method for adding double fields
378 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
379 const RegisteredField* fieldname,
380 double value )
382 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
383 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
384 d->repository->addStatement( Statement( md->fileUri,
385 Util::fieldUri( fieldname->key() ),
386 LiteralValue( value ),
387 md->context) );
388 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
392 void Strigi::Soprano::IndexWriter::addTriplet( const std::string& subject,
393 const std::string& predicate, const std::string& object )
395 // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
396 // next commit()?
398 // FIXME: create an NRL metadata graph
399 d->repository->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject.c_str() ) ) ),
400 Node( QUrl( QString::fromUtf8( predicate.c_str() ) ) ),
401 Node( QUrl( QString::fromUtf8( object.c_str() ) ) ),
402 Node() ) );
406 // called after each indexed file
407 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult* idx )
409 // qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
410 FileMetaData* md = static_cast<FileMetaData*>( idx->writerData() );
412 if ( md->content.length() > 0 ) {
413 d->repository->addStatement( Statement( md->fileUri,
414 Util::fieldUri( FieldRegister::contentFieldName ),
415 LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
416 md->context ) );
419 // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
420 // Thus, here we go the easy way and amrk each indexed file as a xesam:File.
421 d->repository->addStatement( Statement( md->fileUri,
422 Vocabulary::RDF::type(),
423 Vocabulary::Xesam::File(),
424 md->context ) );
427 // create the provedance data for the data graph
428 // TODO: add more data at some point when it becomes of interest
429 QUrl metaDataContext = Util::uniqueUri( "http://www.strigi.org/graphMetaData/", d->repository );
430 d->repository->addStatement( Statement( md->context,
431 Vocabulary::RDF::type(),
432 Vocabulary::NRL::InstanceBase(),
433 metaDataContext ) );
434 d->repository->addStatement( Statement( md->context,
435 Vocabulary::NAO::created(),
436 LiteralValue( QDateTime::currentDateTime() ),
437 metaDataContext ) );
438 d->repository->addStatement( Statement( md->context,
439 QUrl( "http://www.strigi.org/fields#indexGraphFor" ), // FIXME: put the URI somewhere else
440 md->fileUri,
441 metaDataContext ) );
442 d->repository->addStatement( Statement( metaDataContext,
443 Vocabulary::RDF::type(),
444 Vocabulary::NRL::GraphMetadata(),
445 metaDataContext ) );
447 // cleanup
448 delete md;
449 idx->setWriterData( 0 );
451 // qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();