Add basic documentation and examples to PHP4 bindings.
[xapian.git] / xapian-bindings / php4 / docs / examples / simpleindex.php
blobad15d65d37cf6cf6155c83fe5dc72d26c7d3bcc7
1 <?php
/* $Id$
 * Index each paragraph in a textfile as a document
 *
 * ----START-LICENCE----
 * Copyright 2004 James Aylett
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 * -----END-LICENCE-----
 */
/*
 * Upper bound, in bytes, on the length of a term taken from the text.
 * Candidate terms longer than this are skipped by the indexing loop.
 */
define("MAX_PROB_TERM_LENGTH", 64);
/*
 * Predicate: is $c alphanumeric, as decided by ctype_alnum()?
 *
 * Suitable for passing by name to find_p().
 */
function p_alnum($c)
{
    $is_alnum = ctype_alnum($c);

    return $is_alnum;
}
/*
 * Predicate: the negation of ctype_alnum() for $c.
 *
 * Suitable for passing by name to find_p().
 */
function p_notalnum($c)
{
    return ctype_alnum($c) ? false : true;
}
/*
 * Predicate: true when $c is neither '+' nor '-'.
 *
 * Suitable for passing by name to find_p().
 */
function p_notplusminus($c)
{
    return !($c == '+' or $c == '-');
}
/*
 * Scan $string forwards from byte offset $start and return the offset of
 * the first single character for which $predicate returns true.
 *
 * $predicate is the name of a function taking one character.  If no
 * character at or after $start satisfies it, the scan stops at
 * strlen($string); if $start is already at or past the end of the string,
 * $start is returned unchanged.
 */
function find_p($string, $start, $predicate)
{
    $length = strlen($string);

    for ($pos = $start; $pos < $length; $pos++) {
        if ($predicate(substr($string, $pos, 1))) {
            break;
        }
    }

    return $pos;
}
/*
 * Script body.  Expects exactly one command-line argument, the path of the
 * database to create or open, and reads the text to index from stdin.
 * Paragraphs are runs of non-empty lines separated by empty lines; each
 * paragraph is stored as one document.
 */
$args = isset($_SERVER['argv']) ? $_SERVER['argv'] : array();
if (count($args) != 2) {
    /* Fall back to a fixed name when argv is unavailable. */
    $script = count($args) > 0 ? $args[0] : 'simpleindex.php';
    print "usage: {$script} <path to database>\n";
    /* Non-zero status so callers can detect the usage error. */
    exit(1);
}

$database = open_writable($args[1], DB_CREATE_OR_OPEN);
if (!$database) {
    print "Died! :-(\n";
    exit(1);
}

$stemmer = new_Stem("english");
$para = '';
$lines = file("php://stdin");
if ($lines === false) {
    /* Nothing could be read: treat it as empty input. */
    $lines = array();
}
/*
 * Sentinel empty line: a paragraph is only indexed when an empty line is
 * seen, so without this a final paragraph that is not followed by a blank
 * line would be silently dropped.
 */
$lines[] = '';

foreach ($lines as $line) {
    $line = rtrim($line);
    if ($line == "") {
        if ($para != "") {
            $doc = new_Document();
            Document_set_data($doc, $para);
            $pos = 0;
            $len = strlen($para);
            /*
             * At each point, find the next alnum character (i), then
             * find the first non-alnum character after that (j). Find
             * the first non-plusminus character after that (k), and if
             * k is non-alnum (or is off the end of the para), set j=k.
             * The term generation string is [i,j), so len = j-i
             */
            $i = 0;
            $j = 0;
            while ($i < $len) {
                $i = find_p($para, $j, 'p_alnum');
                $j = find_p($para, $i, 'p_notalnum');
                $k = find_p($para, $j, 'p_notplusminus');
                if ($k == $len or !p_alnum(substr($para, $k, 1))) {
                    $j = $k;
                }
                /* Skip empty spans and over-long terms. */
                if ($j - $i <= MAX_PROB_TERM_LENGTH and $j > $i) {
                    $term = stem_stem_word($stemmer, substr($para, $i, $j - $i));
                    Document_add_posting($doc, $term, $pos);
                    $pos++;
                }
                $i = $j;
            }
            WritableDatabase_add_document($database, $doc);
            WritableDatabase_flush($database);
            $para = "";
        }
    } else {
        /* Join the lines of a paragraph with single spaces. */
        if ($para != "") {
            $para .= " ";
        }
        $para .= $line;
    }
}

delete_WritableDatabase($database);