Index Visio files using vsd2xhtml
[xapian.git] / xapian-applications / omega / sample.cc
blobb224f4358378d3da9a3c6cbb5485cb3875801863
1 /* sample.cc: generate a sample from a utf-8 string.
3 * Copyright (C) 2007,2013 Olly Betts
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include <config.h>
22 #include <xapian.h>
24 #include "sample.h"
26 #include <string>
28 using namespace std;
30 string
31 generate_sample(const string & input, size_t maxlen,
32 const string & ind, const string & ind2)
34 string output;
36 // Reserve an appropriate amount of space to repeated reallocation as
37 // output grows.
38 if (input.size() <= maxlen) {
39 output.reserve(input.size());
40 } else {
41 // Add 3 to allow for a 4 byte utf-8 sequence being appended when
42 // output is maxlen - 1 bytes long.
43 output.reserve(maxlen + 3);
46 size_t last_word_end = 0;
47 bool in_space = true;
48 Xapian::Utf8Iterator i(input);
49 for ( ; i != Xapian::Utf8Iterator(); ++i) {
50 if (output.size() >= maxlen) {
51 // Need to truncate output.
52 if (last_word_end <= maxlen / 2) {
53 // Fixed when maxlen < ind.size leading to a negative
54 // reference
55 if (maxlen < ind.size()) {
56 output.resize(0);
57 } else {
58 // Monster word! We'll have to just split it.
59 output.replace(maxlen - ind.size(), string::npos, ind);
61 } else {
62 output.replace(last_word_end, string::npos, ind2);
64 break;
67 unsigned ch = *i;
68 if (ch <= ' ' || ch == 0xa0) {
69 // FIXME: if all the whitespace characters between two words are
70 // 0xa0 (non-breaking space) then perhaps we should output 0xa0.
71 if (!in_space) {
72 in_space = true;
73 last_word_end = output.size();
74 output += ' ';
76 continue;
79 Xapian::Unicode::append_utf8(output, ch);
80 in_space = false;
83 return output;