[ci] Fix netbsd job to upgrade existing packages
[xapian.git] / xapian-applications / omega / handler_poppler.cc
blob810d3e58fae78657a587b893a0fbb4a30e9508eb
1 /** @file
2 * @brief Extract text and metadata using poppler.
3 */
4 /* Copyright (C) 2019 Bruno Baruffaldi
5 * Copyright (C) 2022,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
24 #include "handler.h"
25 #include "str.h"
27 #include <poppler/glib/poppler-document.h>
28 #include <poppler/glib/poppler-page.h>
30 using namespace std;
32 static gchar*
33 convert_to_uri(const string& filename, GError** e)
35 #if GLIB_CHECK_VERSION(2,58,0)
36 gchar* abs_filename = g_canonicalize_filename(filename.c_str(), NULL);
37 #else
38 gchar* abs_filename;
39 if (g_path_is_absolute(filename.c_str())) {
40 abs_filename = g_strdup(filename.c_str());
41 } else {
42 gchar* cwd = g_get_current_dir();
43 abs_filename = g_build_filename(cwd, filename.c_str(), NULL);
44 g_free(cwd);
46 #endif
47 gchar* uri = g_filename_to_uri(abs_filename, NULL, e);
48 g_free(abs_filename);
49 return uri;
52 static void
53 send_glib_field(Field field, gchar* data)
55 if (data) {
56 send_field(field, data);
57 g_free(data);
/** Handler start-up hook.
 *
 *  This handler performs no set-up work here, so it always reports success.
 *
 *  @return  Always true.
 */
bool
initialise()
{
    return true;
}
67 void
68 extract(const string& filename, const string&)
70 GError* e = nullptr;
71 gchar* uri = convert_to_uri(filename, &e);
72 if (!uri) {
73 send_field(FIELD_ERROR, "g_filename_to_uri() failed: ");
74 send_field(FIELD_ERROR, e->message);
75 g_error_free(e);
76 return;
79 PopplerDocument* doc = poppler_document_new_from_file(uri, NULL, &e);
80 g_free(uri);
81 if (!doc) {
82 send_field(FIELD_ERROR, "poppler_document_new_from_file() failed: ");
83 send_field(FIELD_ERROR, e->message);
84 g_error_free(e);
85 return;
88 int pages = poppler_document_get_n_pages(doc);
89 send_field_page_count(pages);
90 // Extracting text from PDF file
91 for (int i = 0; i < pages; ++i) {
92 PopplerPage* page = poppler_document_get_page(doc, i);
93 if (!page) {
94 g_object_unref(doc);
95 send_field(FIELD_ERROR, "Failed to get page " + str(i));
96 return;
98 send_field(FIELD_BODY, poppler_page_get_text(page));
99 g_object_unref(page);
102 // Extract PDF metadata.
103 send_glib_field(FIELD_AUTHOR, poppler_document_get_author(doc));
104 send_glib_field(FIELD_TITLE, poppler_document_get_title(doc));
105 send_glib_field(FIELD_KEYWORDS, poppler_document_get_keywords(doc));
106 send_field_created_date(poppler_document_get_creation_date(doc));
108 g_object_unref(doc);