Bug 14187 - DBRev 16.12.00.002
[koha.git] / Koha / Indexer / Utils.pm
blob07e8d230c1026b5864a0a4d9c4aaf0079ae6adf6
1 package Koha::Indexer::Utils;
3 # Copyright (c) 2012 Equinox Software, Inc.
4 # This file is part of Koha.
6 # Koha is free software; you can redistribute it and/or modify it under the
7 # terms of the GNU General Public License as published by the Free Software
8 # Foundation; either version 2 of the License, or (at your option) any later
9 # version.
11 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
12 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
13 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License along with
16 # Koha; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 use strict;
20 use warnings;
21 use 5.010;
23 use XML::LibXML;
25 =head1 Koha::Indexer::Utils
27 Koha::Indexer::Utils - utility functions for managing search indexes
29 =head1 DESCRIPTION
31 This module contains utility functions for managing various aspects
32 of Koha's bibliographic and authority search indexes.
34 =head1 FUNCTIONS
36 =cut
38 =head2 zebra_record_abs_to_dom
40 $dom_config = Koha::Indexer::Utils::zebra_record_abs_to_dom($record_abs_config, $marcflavour);
42 Given a string containing the contents of a records.abs configuration file as
43 used by Zebra's GRS-1 filter, emit an equivalent DOM configuration.
45 =cut
# XML namespace URI used for the root element and for every element
# created while building the DOM index definitions in this module.
our $idxNS = q{http://www.koha-community.org/schemas/index-defs};
sub zebra_record_abs_to_dom {
    my ($grs1_cfg, $marcflavour) = @_;

    # Strip one trailing newline, then split into lines.  The -1 limit
    # keeps trailing empty fields, so empty lines still count towards
    # the line numbers recorded below.
    chomp $grs1_cfg;
    my @cfg_lines = split /\n/, $grs1_cfg, -1;

    # Pass 1: turn every significant records.abs line into a hashref,
    # remembering the original text and its 1-based line number.
    my @parsed_defs;
    my $lineno = 0;
    foreach my $cfg_line (@cfg_lines) {
        $lineno++;
        next if _can_ignore_grs1_cfg_line($cfg_line);

        my $def = _parse_grs1_cfg_line($cfg_line);
        $def->{orig_def} = $cfg_line;
        $def->{lineno}   = $lineno;
        push @parsed_defs, $def;
    }

    # Pass 2: build an XML document whose root <index_defs> element
    # (in the kohaidx namespace) receives one group of nodes per
    # parsed definition.
    my $doc      = XML::LibXML::Document->new('1.0', 'utf-8');
    my $defs_top = $doc->createElement('index_defs');
    $defs_top->setNamespace($idxNS, 'kohaidx');

    foreach my $def (@parsed_defs) {
        _append_grs1_def_to_dom_cfg($doc, $defs_top, $def, $marcflavour);
    }

    # Attach the root and serialise; toString(1) requests indented output.
    $doc->setDocumentElement($defs_top);
    return $doc->toString(1);
}
85 # bunch of utility functions for zebra_record_abs_to_dom
# Decide whether a records.abs line carries no index definition that
# needs converting.  Returns 1 for lines to skip, 0 otherwise.
sub _can_ignore_grs1_cfg_line {
    my ($cfg_line) = @_;

    my @skippable = (
        qr/^\s*$/,    # blank or whitespace-only line
        qr/^#/,       # comment line
        qr/^(encoding|name|attset|esetname|marc|systag|xpath)/,    # directives
        qr/^all/,     # 'all any' lines: the DOM filter automatically
                      # indexes all tokens, so nothing to convert
    );

    foreach my $pattern (@skippable) {
        return 1 if $cfg_line =~ $pattern;
    }
    return 0;
}
# Parse one records.abs line.  Lines of the form "melm <spec>" or
# "xelm /record/<spec>" are handed to _parse_xelm_melm(); for any other
# line undef is returned.
sub _parse_grs1_cfg_line {
    my ($cfg_line) = @_;

    # Try the melm form first, then fall back to the xelm form.  The
    # capture is always defined (possibly empty) when a pattern matches.
    my ($spec) = $cfg_line =~ /^melm\s+(.*)/;
    ($spec) = $cfg_line =~ m!^xelm /record/(.*)! unless defined $spec;

    my $parsed;
    $parsed = _parse_xelm_melm($spec) if defined $spec;
    return $parsed;
}
# Parse the part of a melm/xelm line that follows the keyword, e.g.
# '245$a Title:w,Title:p'.
#
# Returns a hashref with:
#   tag        - the text before the first '$' of the field spec
#   subfield   - the text after the first '$', or undef if there is none
#   index_defs - arrayref with one parsed definition (as returned by
#                _parse_grs1_index_def) per comma-separated entry
#
# A spec without any index list (e.g. just '245') yields an empty
# index_defs array rather than emitting "uninitialized value" warnings.
sub _parse_xelm_melm {
    my $line = shift;

    my ($field, $index_defs) = split /\s+/, $line, 2;

    # split() leaves these undef when the corresponding part is absent;
    # default them so the substitution and splits below stay warning-free.
    $field      //= '';
    $index_defs //= '';

    # munge fixed field range indicators: range(data,OFFSET,LENGTH)
    # becomes OFFSET:LENGTH, i.e. two more colon-separated fields
    $index_defs =~ s/range\(data,(\d+),(\d+)\)/$1:$2/g;

    my ($tag, $subfield) = split /\$/, $field, 2;
    return {
        tag        => $tag,
        subfield   => $subfield,
        index_defs => [ map { _parse_grs1_index_def($_) } split /,/, $index_defs ],
    };
}
# Split a single colon-separated index definition into its components.
# Returns a hashref with the keys name, index_type, offset and length;
# components missing from the definition are undef, except index_type.
sub _parse_grs1_index_def {
    my ($index_def) = @_;

    # The -1 limit preserves trailing empty fields; fields beyond the
    # fourth are discarded by the slice assignment.
    my %parsed;
    @parsed{qw(name index_type offset length)} = split /:/, $index_def, -1;

    # The DOM filter needs an explicit index type, so fall back to 'w'
    # when the original definition did not name one.
    $parsed{index_type} //= 'w';

    return \%parsed;
}
# Append the DOM representation of one parsed records.abs definition
# to $root.
#
# Arguments:
#   $dom_cfg     - document used to create the new nodes
#   $root        - element that receives the new nodes
#   $grs1_def    - hashref with tag, subfield, index_defs, lineno and
#                  orig_def entries
#   $marcflavour - accepted for interface compatibility; not used here
#
# A comment quoting the original line is always added.  After it, the
# kind of definition emitted depends on the tag/subfield:
#   tag and subfield      -> one index_subfields element
#   tag 'leader'          -> handled by _append_grs1_defs_for_leader
#   numeric tag below 10  -> handled by _append_grs1_defs_for_control_field
#   any other tag         -> one index_data_field element
# Definitions without a tag produce only the comment.
sub _append_grs1_def_to_dom_cfg {
    my ($dom_cfg, $root, $grs1_def, $marcflavour) = @_;

    my $comment = $dom_cfg->createComment('record.abs line ' .
                                          $grs1_def->{lineno} . ': ' .
                                          $grs1_def->{orig_def});
    $root->appendChild($comment);

    my $tag = $grs1_def->{tag};
    return unless defined $tag;

    if (defined $grs1_def->{subfield}) {
        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_subfields');
        $dom_def->setAttribute('tag', $tag);
        $dom_def->setAttribute('subfields', $grs1_def->{subfield});
        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
        $root->appendChild($dom_def);
    } elsif ($tag eq 'leader') {
        # we're the leader
        _append_grs1_defs_for_leader($dom_cfg, $root, $grs1_def);
    } elsif ($tag =~ /^\d+$/ && $tag < 10) {
        # we're a control field; the digits-only check keeps a
        # non-numeric tag from numifying to 0 (with a warning) and
        # being mistaken for one
        _append_grs1_defs_for_control_field($dom_cfg, $root, $grs1_def);
    } else {
        # we're indexing an entire variable data field
        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_data_field');
        $dom_def->setAttribute('tag', $tag);
        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
        $root->appendChild($dom_def);
    }
    return;
}
# Add one target_index child to $dom_def for every index definition
# listed in $grs1_def->{index_defs}.
sub _append_target_indexes {
    my ($dom_cfg, $dom_def, $grs1_def) = @_;

    _append_one_target_index($dom_cfg, $dom_def, $_)
        for @{ $grs1_def->{index_defs} };
    return;
}
# Create a target_index element whose text is "<name>:<index_type>"
# and append it to $dom_def.
sub _append_one_target_index {
    my ($dom_cfg, $dom_def, $index_def) = @_;

    my $label  = $index_def->{name} . ':' . $index_def->{index_type};
    my $target = $dom_cfg->createElementNS($idxNS, 'target_index');
    $target->appendText($label);

    $dom_def->appendChild($target);
}
# Emit one index_leader element under $root for each index definition
# attached to a leader line.  The offset/length attributes are written
# only when the definition supplies both.
sub _append_grs1_defs_for_leader {
    my ($dom_cfg, $root, $grs1_def) = @_;

    for my $target (@{ $grs1_def->{index_defs} }) {
        my $leader_el = $dom_cfg->createElementNS($idxNS, 'index_leader');

        my ($offset, $length) = @{$target}{qw(offset length)};
        if (defined $offset and defined $length) {
            $leader_el->setAttribute('offset', $offset);
            $leader_el->setAttribute('length', $length);
        }

        _append_one_target_index($dom_cfg, $leader_el, $target);
        $root->appendChild($leader_el);
    }
    return;
}
# Emit one index_control_field element under $root for each index
# definition attached to a control field line.  Every element carries
# the field's tag; offset/length are written only when the definition
# supplies both.
sub _append_grs1_defs_for_control_field {
    my ($dom_cfg, $root, $grs1_def) = @_;

    for my $target (@{ $grs1_def->{index_defs} }) {
        my $field_el = $dom_cfg->createElementNS($idxNS, 'index_control_field');
        $field_el->setAttribute('tag', $grs1_def->{tag});

        my ($offset, $length) = @{$target}{qw(offset length)};
        if (defined $offset and defined $length) {
            $field_el->setAttribute('offset', $offset);
            $field_el->setAttribute('length', $length);
        }

        _append_one_target_index($dom_cfg, $field_el, $target);
        $root->appendChild($field_el);
    }
    return;
}