2 * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
4 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5 * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
7 * Permission is hereby granted to use or copy this program
8 * for any purpose, provided the above notices are retained on all copies.
9 * Permission to modify the code and to distribute modified code is granted,
10 * provided the above notices are retained, and a notice that the code was
11 * modified is included with the above copyright notice.
13 * Author: Hans-J. Boehm (boehm@parc.xerox.com)
15 /* Boehm, October 5, 1995 4:20 pm PDT */
18 * Cords are immutable character strings. A number of operations
19 * on long cords are much more efficient than their strings.h counterpart.
20 * In particular, concatenation takes constant time independent of the length
21 * of the arguments. (Cords are represented as trees, with internal
22 * nodes representing concatenation and leaves consisting of either C
23 * strings or a functional description of the string.)
25 * The following are reasonable applications of cords. They would perform
26 * unacceptably if C strings were used:
27 * - A compiler that produces assembly language output by repeatedly
28 * concatenating instructions onto a cord representing the output file.
29 * - A text editor that converts the input file to a cord, and then
30 * performs editing operations by producing a new cord representing
31 * the file after each character change (and keeping the old ones in an edit history).
34 * For optimal performance, cords should be built by
35 * concatenating short sections.
36 * This interface is designed for maximum compatibility with C strings.
37 * ASCII NUL characters may be embedded in cords using CORD_from_fn.
38 * This is handled correctly, but CORD_to_char_star will produce a string
39 * with embedded NULs when given such a cord.
41 * This interface is fairly big, largely for performance reasons.
42 * The most basic constants and functions:
44 * CORD - the type of a cord;
45 * CORD_EMPTY - empty cord;
46 * CORD_len(cord) - length of a cord;
47 * CORD_cat(cord1,cord2) - concatenation of two cords;
48 * CORD_substr(cord, start, len) - substring (or subcord);
49 * CORD_pos i; CORD_FOR(i, cord) { ... CORD_pos_fetch(i) ... } -
50 * examine each character in a cord. CORD_pos_fetch(i) is the char.
51 * CORD_fetch(cord, i) - Retrieve i'th character (slowly).
52 * CORD_cmp(cord1, cord2) - compare two cords.
53 * CORD_from_file(FILE * f) - turn a read-only file into a cord.
54 * CORD_to_char_star(cord) - convert to C string.
55 * (Non-NULL C constant strings are cords.)
56 * CORD_printf (etc.) - cord version of printf. Use %r for cords.
/* Cords have type const char *.  This is cheating quite a bit, and not */
/* 100% portable.  But it means that nonempty character string          */
/* constants may be used as cords directly, provided the string is      */
/* never modified in place.  The empty cord is represented by, and      */
/* can be written as, 0.                                                */
typedef const char * CORD;
71 /* An empty cord is always represented as nil */
/* Is a nonempty cord represented as a C string?  The argument is       */
/* dereferenced, so it must not be the empty cord (0).                  */
#define CORD_IS_STRING(s) (*(s) != '\0')
77 /* Concatenate two cords. If the arguments are C strings, they may */
78 /* not be subsequently altered. */
79 CORD
CORD_cat(CORD x
, CORD y
);
81 /* Concatenate a cord and a C string with known length. Except for the */
82 /* empty string case, this is a special case of CORD_cat. Since the */
83 /* length is known, it can be faster. */
84 /* The string y is shared with the resulting CORD. Hence it should */
85 /* not be altered by the caller. */
86 CORD
CORD_cat_char_star(CORD x
, const char * y
, size_t leny
);
88 /* Compute the length of a cord */
89 size_t CORD_len(CORD x
);
/* Cords may be represented by functions defining the ith character.    */
/* The function receives the character index i and the client_data      */
/* pointer, and returns the character at that index.                    */
typedef char (* CORD_fn)(size_t i, void * client_data);
94 /* Turn a functional description into a cord. */
95 CORD
CORD_from_fn(CORD_fn fn
, void * client_data
, size_t len
);
97 /* Return the substring (subcord really) of x with length at most n, */
98 /* starting at position i. (The initial character has position 0.) */
99 CORD
CORD_substr(CORD x
, size_t i
, size_t n
);
101 /* Return the argument, but rebalanced to allow more efficient */
102 /* character retrieval, substring operations, and comparisons. */
103 /* This is useful only for cords that were built using repeated */
104 /* concatenation. Guarantees log time access to the result, unless */
105 /* x was obtained through a large number of repeated substring ops */
106 /* or the embedded functional descriptions take longer to evaluate. */
107 /* May reallocate significant parts of the cord. The argument is not */
108 /* modified; only the result is balanced. */
109 CORD
CORD_balance(CORD x
);
111 /* The following traverse a cord by applying a function to each */
112 /* character. This is occasionally appropriate, especially where */
113 /* speed is crucial. But, since C doesn't have nested functions, */
114 /* clients of this sort of traversal are clumsy to write. Consider */
115 /* the functions that operate on cord positions instead. */
/* Function to iteratively apply to individual characters in a cord.    */
/* It receives the character c and the client_data pointer supplied to  */
/* the traversal function.                                              */
typedef int (* CORD_iter_fn)(char c, void * client_data);
/* Function to apply to substrings of a cord.  Each substring is a      */
/* C character string, not a general cord.                              */
typedef int (* CORD_batched_iter_fn)(const char * s, void * client_data);

/* Null value of type CORD_batched_iter_fn, meaning "no batched         */
/* function supplied".                                                  */
# define CORD_NO_FN ((CORD_batched_iter_fn)0)
125 /* Apply f1 to each character in the cord, in ascending order, */
126 /* starting at position i. If */
127 /* f2 is not CORD_NO_FN, then multiple calls to f1 may be replaced by */
128 /* a single call to f2. The parameter f2 is provided only to allow */
129 /* some optimization by the client. This terminates when the right */
130 /* end of this string is reached, or when f1 or f2 return != 0. In the */
131 /* latter case CORD_iter returns != 0. Otherwise it returns 0. */
132 /* The specified value of i must be < CORD_len(x). */
133 int CORD_iter5(CORD x
, size_t i
, CORD_iter_fn f1
,
134 CORD_batched_iter_fn f2
, void * client_data
);
136 /* A simpler version that starts at 0, and without f2: */
137 int CORD_iter(CORD x
, CORD_iter_fn f1
, void * client_data
);
138 # define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)
140 /* Similar to CORD_iter5, but end-to-beginning. No provisions for */
141 /* CORD_batched_iter_fn. */
142 int CORD_riter4(CORD x
, size_t i
, CORD_iter_fn f1
, void * client_data
);
144 /* A simpler version that starts at the end: */
145 int CORD_riter(CORD x
, CORD_iter_fn f1
, void * client_data
);
147 /* Functions that operate on cord positions. The easy way to traverse */
148 /* cords. A cord position is logically a pair consisting of a cord */
149 /* and an index into that cord. But it is much faster to retrieve a */
150 /* character based on a position than on an index. Unfortunately, */
151 /* positions are big (order of a few 100 bytes), so allocate them with caution. */
153 /* Things in cord_pos.h should be treated as opaque, except as */
154 /* described below. Also note that */
155 /* CORD_pos_fetch, CORD_next and CORD_prev have both macro and function */
156 /* definitions. The former may evaluate their argument more than once. */
157 # include "private/cord_pos.h"
/*
160 Visible definitions from above:
162 typedef <OPAQUE but fairly big> CORD_pos[1];
164 * Extract the cord from a position:
165 CORD CORD_pos_to_cord(CORD_pos p);
167 * Extract the current index from a position:
168 size_t CORD_pos_to_index(CORD_pos p);
170 * Fetch the character located at the given position:
171 char CORD_pos_fetch(CORD_pos p);
173 * Initialize the position to refer to the given cord and index.
174 * Note that this is the most expensive function on positions:
175 void CORD_set_pos(CORD_pos p, CORD x, size_t i);
177 * Advance the position to the next character.
178 * P must be initialized and valid.
179 * Invalidates p if past end:
180 void CORD_next(CORD_pos p);
182 * Move the position to the preceding character.
183 * P must be initialized and valid.
184 * Invalidates p if past beginning:
185 void CORD_prev(CORD_pos p);
187 * Is the position valid, i.e. inside the cord?
188 int CORD_pos_valid(CORD_pos p);
*/
/* Loop header that visits every position of cord, starting at index 0  */
/* and advancing until the position becomes invalid.  Usage:            */
/*   CORD_pos pos; CORD_FOR(pos, cord) { ... CORD_pos_fetch(pos) ... }  */
# define CORD_FOR(pos, cord) \
    for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))
/* An out of memory handler to call.  May be supplied by client.        */
/* Must not return.                                                     */
extern void (* CORD_oom_fn)(void);
198 /* Dump the representation of x to stdout in an implementation defined */
199 /* manner. Intended for debugging only. */
200 void CORD_dump(CORD x
);
202 /* The following could easily be implemented by the client. They are */
203 /* provided in cordxtra.c for convenience. */
205 /* Concatenate a character to the end of a cord. */
206 CORD
CORD_cat_char(CORD x
, char c
);
208 /* Concatenate n cords. */
209 CORD
CORD_catn(int n
, /* CORD */ ...);
211 /* Return the character in CORD_substr(x, i, 1) */
212 char CORD_fetch(CORD x
, size_t i
);
214 /* Return < 0, 0, or > 0, depending on whether x < y, x = y, x > y */
215 int CORD_cmp(CORD x
, CORD y
);
217 /* A generalization that takes both starting positions for the */
218 /* comparison, and a limit on the number of characters to be compared. */
219 int CORD_ncmp(CORD x
, size_t x_start
, CORD y
, size_t y_start
, size_t len
);
221 /* Find the first occurrence of s in x at position start or later. */
222 /* Return the position of the first character of s in x, or */
223 /* CORD_NOT_FOUND if there is none. */
224 size_t CORD_str(CORD x
, size_t start
, CORD s
);
226 /* Return a cord consisting of i copies of (possibly NUL) c. Dangerous */
227 /* in conjunction with CORD_to_char_star. */
228 /* The resulting representation takes constant space, independent of i. */
229 CORD
CORD_chars(char c
, size_t i
);
230 # define CORD_nul(i) CORD_chars('\0', (i))
232 /* Turn a file into cord. The file must be seekable. Its contents */
233 /* must remain constant. The file may be accessed as an immediate */
234 /* result of this call and/or as a result of subsequent accesses to */
235 /* the cord. Short files are likely to be immediately read, but */
236 /* long files are likely to be read on demand, possibly relying on */
237 /* stdio for buffering. */
238 /* We must have exclusive access to the descriptor f, i.e. we may */
239 /* read it at any time, and expect the file pointer to be */
240 /* where we left it. Normally this should be invoked as */
241 /* CORD_from_file(fopen(...)) */
242 /* CORD_from_file arranges to close the file descriptor when it is no */
243 /* longer needed (e.g. when the result becomes inaccessible). */
244 /* The file f must be such that ftell reflects the actual character */
245 /* position in the file, i.e. the number of characters that can be */
246 /* or were read with fread. On UNIX systems this is always true. On */
247 /* MS Windows systems, f must be opened in binary mode. */
248 CORD
CORD_from_file(FILE * f
);
250 /* Equivalent to the above, except that the entire file will be read */
251 /* and the file pointer will be closed immediately. */
252 /* The binary mode restriction from above does not apply. */
253 CORD
CORD_from_file_eager(FILE * f
);
255 /* Equivalent to the above, except that the file will be read on demand.*/
256 /* The binary mode restriction applies. */
257 CORD
CORD_from_file_lazy(FILE * f
);
259 /* Turn a cord into a C string. The result shares no structure with */
260 /* x, and is thus modifiable. */
261 char * CORD_to_char_star(CORD x
);
263 /* Turn a C string into a CORD. The C string is copied, and so may */
264 /* subsequently be modified. */
265 CORD
CORD_from_char_star(const char *s
);
267 /* Identical to the above, but the result may share structure with */
268 /* the argument and is thus not modifiable. */
269 const char * CORD_to_const_char_star(CORD x
);
271 /* Write a cord to a file, starting at the current position. No */
272 /* trailing NULs are newlines are added. */
273 /* Returns EOF if a write error occurs, 1 otherwise. */
274 int CORD_put(CORD x
, FILE * f
);
/* "Not found" result for the following two functions.  It is the       */
/* largest value representable in size_t.                               */
# define CORD_NOT_FOUND ((size_t)(-1))
279 /* A vague analog of strchr. Returns the position (an integer, not */
280 /* a pointer) of the first occurrence of (char) c inside x at position */
281 /* i or later. The value i must be < CORD_len(x). */
282 size_t CORD_chr(CORD x
, size_t i
, int c
);
284 /* A vague analog of strrchr. Returns index of the last occurrence */
285 /* of (char) c inside x at position i or earlier. The value i */
286 /* must be < CORD_len(x). */
287 size_t CORD_rchr(CORD x
, size_t i
, int c
);
290 /* The following are also not primitive, but are implemented in */
291 /* cordprnt.c. They provide functionality similar to the ANSI C */
292 /* functions with corresponding names, but with the following */
293 /* additions and changes: */
294 /* 1. A %r conversion specification specifies a CORD argument. Field */
295 /* width, precision, etc. have the same semantics as for %s. */
296 /* (Note that %c,%C, and %S were already taken.) */
297 /* 2. The format string is represented as a CORD. */
298 /* 3. CORD_sprintf and CORD_vsprintf assign the result through the 1st */ /* argument. Unlike their ANSI C versions, there is no need to guess */
299 /* the correct buffer size. */
300 /* 4. Most of the conversions are implemented through the native */
301 /* vsprintf. Hence they are usually no faster, and */
302 /* idiosyncrasies of the native printf are preserved. However, */
303 /* CORD arguments to CORD_sprintf and CORD_vsprintf are NOT copied; */
304 /* the result shares the original structure. This may make them */
305 /* very efficient in some unusual applications. */
306 /* The format string is copied. */
307 /* All functions return the number of characters generated or -1 on */
308 /* error. This complies with the ANSI standard, but is inconsistent */
309 /* with some older implementations of sprintf. */
311 /* The implementation of these is probably less portable than the rest */
312 /* of this package. */
318 int CORD_sprintf(CORD
* out
, CORD format
, ...);
319 int CORD_vsprintf(CORD
* out
, CORD format
, va_list args
);
320 int CORD_fprintf(FILE * f
, CORD format
, ...);
321 int CORD_vfprintf(FILE * f
, CORD format
, va_list args
);
322 int CORD_printf(CORD format
, ...);
323 int CORD_vprintf(CORD format
, va_list args
);
325 #endif /* CORD_NO_IO */