2 * Copyright (c) 1993-1994 by Xerox Corporation. All rights reserved.
4 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
5 * OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
7 * Permission is hereby granted to use or copy this program
8 * for any purpose, provided the above notices are retained on all copies.
9 * Permission to modify the code and to distribute modified code is granted,
10 * provided the above notices are retained, and a notice that the code was
11 * modified is included with the above copyright notice.
13 * Author: Hans-J. Boehm (boehm@parc.xerox.com)
16 * These are functions on cords that do not need to understand their
17 * implementation. They serve also serve as example client code for
20 /* Boehm, December 8, 1995 1:53 pm PST */
27 # define I_HIDE_POINTERS /* So we get access to allocation lock. */
28 /* We use this for lazy file reading, */
29 /* so that we remain independent */
30 /* of the threads primitives. */
33 /* For now we assume that pointer reads and writes are atomic, */
34 /* i.e. another thread always sees the state before or after */
35 /* a write. This might be false on a Motorola M68K with */
36 /* pointers that are not 32-bit aligned. But there probably */
37 /* aren't too many threads packages running on those. */
38 # define ATOMIC_WRITE(x,y) (x) = (y)
39 # define ATOMIC_READ(x) (*(x))
41 /* The standard says these are in stdio.h, but they aren't always: */
49 # define BUFSZ 2048 /* Size of stack allocated buffers when */
50 /* we want large buffers. */
52 typedef void (* oom_fn
)(void);
54 # define OUT_OF_MEMORY { if (CORD_oom_fn != (oom_fn) 0) (*CORD_oom_fn)(); \
55 ABORT("Out of memory\n"); }
56 # define ABORT(msg) { fprintf(stderr, "%s\n", msg); abort(); }
58 CORD
CORD_cat_char(CORD x
, char c
)
60 register char * string
;
62 if (c
== '\0') return(CORD_cat(x
, CORD_nul(1)));
63 string
= GC_MALLOC_ATOMIC(2);
64 if (string
== 0) OUT_OF_MEMORY
;
67 return(CORD_cat_char_star(x
, string
, 1));
70 CORD
CORD_catn(int nargs
, ...)
72 register CORD result
= CORD_EMPTY
;
76 va_start(args
, nargs
);
77 for (i
= 0; i
< nargs
; i
++) {
78 register CORD next
= va_arg(args
, CORD
);
79 result
= CORD_cat(result
, next
);
91 int CORD_fill_proc(char c
, void * client_data
)
93 register CORD_fill_data
* d
= (CORD_fill_data
*)client_data
;
94 register size_t count
= d
-> count
;
96 (d
-> buf
)[count
] = c
;
98 if (count
>= d
-> len
) {
105 int CORD_batched_fill_proc(const char * s
, void * client_data
)
107 register CORD_fill_data
* d
= (CORD_fill_data
*)client_data
;
108 register size_t count
= d
-> count
;
109 register size_t max
= d
-> len
;
110 register char * buf
= d
-> buf
;
111 register const char * t
= s
;
113 while((buf
[count
] = *t
++) != '\0') {
124 /* Fill buf with len characters starting at i. */
125 /* Assumes len characters are available. */
126 void CORD_fill_buf(CORD x
, size_t i
, size_t len
, char * buf
)
133 (void)CORD_iter5(x
, i
, CORD_fill_proc
, CORD_batched_fill_proc
, &fd
);
136 int CORD_cmp(CORD x
, CORD y
)
140 register size_t avail
, yavail
;
142 if (y
== CORD_EMPTY
) return(x
!= CORD_EMPTY
);
143 if (x
== CORD_EMPTY
) return(-1);
144 if (CORD_IS_STRING(y
) && CORD_IS_STRING(x
)) return(strcmp(x
,y
));
145 CORD_set_pos(xpos
, x
, 0);
146 CORD_set_pos(ypos
, y
, 0);
148 if (!CORD_pos_valid(xpos
)) {
149 if (CORD_pos_valid(ypos
)) {
155 if (!CORD_pos_valid(ypos
)) {
158 if ((avail
= CORD_pos_chars_left(xpos
)) <= 0
159 || (yavail
= CORD_pos_chars_left(ypos
)) <= 0) {
160 register char xcurrent
= CORD_pos_fetch(xpos
);
161 register char ycurrent
= CORD_pos_fetch(ypos
);
162 if (xcurrent
!= ycurrent
) return(xcurrent
- ycurrent
);
166 /* process as many characters as we can */
169 if (avail
> yavail
) avail
= yavail
;
170 result
= strncmp(CORD_pos_cur_char_addr(xpos
),
171 CORD_pos_cur_char_addr(ypos
), avail
);
172 if (result
!= 0) return(result
);
173 CORD_pos_advance(xpos
, avail
);
174 CORD_pos_advance(ypos
, avail
);
179 int CORD_ncmp(CORD x
, size_t x_start
, CORD y
, size_t y_start
, size_t len
)
183 register size_t count
;
184 register long avail
, yavail
;
186 CORD_set_pos(xpos
, x
, x_start
);
187 CORD_set_pos(ypos
, y
, y_start
);
188 for(count
= 0; count
< len
;) {
189 if (!CORD_pos_valid(xpos
)) {
190 if (CORD_pos_valid(ypos
)) {
196 if (!CORD_pos_valid(ypos
)) {
199 if ((avail
= CORD_pos_chars_left(xpos
)) <= 0
200 || (yavail
= CORD_pos_chars_left(ypos
)) <= 0) {
201 register char xcurrent
= CORD_pos_fetch(xpos
);
202 register char ycurrent
= CORD_pos_fetch(ypos
);
203 if (xcurrent
!= ycurrent
) return(xcurrent
- ycurrent
);
208 /* process as many characters as we can */
211 if (avail
> yavail
) avail
= yavail
;
213 if (count
> len
) avail
-= (count
- len
);
214 result
= strncmp(CORD_pos_cur_char_addr(xpos
),
215 CORD_pos_cur_char_addr(ypos
), (size_t)avail
);
216 if (result
!= 0) return(result
);
217 CORD_pos_advance(xpos
, (size_t)avail
);
218 CORD_pos_advance(ypos
, (size_t)avail
);
224 char * CORD_to_char_star(CORD x
)
226 register size_t len
= CORD_len(x
);
227 char * result
= GC_MALLOC_ATOMIC(len
+ 1);
229 if (result
== 0) OUT_OF_MEMORY
;
230 CORD_fill_buf(x
, 0, len
, result
);
235 CORD
CORD_from_char_star(const char *s
)
238 size_t len
= strlen(s
);
240 if (0 == len
) return(CORD_EMPTY
);
241 result
= GC_MALLOC_ATOMIC(len
+ 1);
242 if (result
== 0) OUT_OF_MEMORY
;
243 memcpy(result
, s
, len
+1);
247 const char * CORD_to_const_char_star(CORD x
)
249 if (x
== 0) return("");
250 if (CORD_IS_STRING(x
)) return((const char *)x
);
251 return(CORD_to_char_star(x
));
254 char CORD_fetch(CORD x
, size_t i
)
258 CORD_set_pos(xpos
, x
, i
);
259 if (!CORD_pos_valid(xpos
)) ABORT("bad index?");
260 return(CORD_pos_fetch(xpos
));
264 int CORD_put_proc(char c
, void * client_data
)
266 register FILE * f
= (FILE *)client_data
;
268 return(putc(c
, f
) == EOF
);
271 int CORD_batched_put_proc(const char * s
, void * client_data
)
273 register FILE * f
= (FILE *)client_data
;
275 return(fputs(s
, f
) == EOF
);
279 int CORD_put(CORD x
, FILE * f
)
281 if (CORD_iter5(x
, 0, CORD_put_proc
, CORD_batched_put_proc
, f
)) {
289 size_t pos
; /* Current position in the cord */
290 char target
; /* Character we're looking for */
293 int CORD_chr_proc(char c
, void * client_data
)
295 register chr_data
* d
= (chr_data
*)client_data
;
297 if (c
== d
-> target
) return(1);
302 int CORD_rchr_proc(char c
, void * client_data
)
304 register chr_data
* d
= (chr_data
*)client_data
;
306 if (c
== d
-> target
) return(1);
311 int CORD_batched_chr_proc(const char *s
, void * client_data
)
313 register chr_data
* d
= (chr_data
*)client_data
;
314 register char * occ
= strchr(s
, d
-> target
);
317 d
-> pos
+= strlen(s
);
325 size_t CORD_chr(CORD x
, size_t i
, int c
)
331 if (CORD_iter5(x
, i
, CORD_chr_proc
, CORD_batched_chr_proc
, &d
)) {
334 return(CORD_NOT_FOUND
);
338 size_t CORD_rchr(CORD x
, size_t i
, int c
)
344 if (CORD_riter4(x
, i
, CORD_rchr_proc
, &d
)) {
347 return(CORD_NOT_FOUND
);
351 /* Find the first occurrence of s in x at position start or later. */
352 /* This uses an asymptotically poor algorithm, which should typically */
353 /* perform acceptably. We compare the first few characters directly, */
354 /* and call CORD_ncmp whenever there is a partial match. */
355 /* This has the advantage that we allocate very little, or not at all. */
356 /* It's very fast if there are few close misses. */
357 size_t CORD_str(CORD x
, size_t start
, CORD s
)
360 size_t xlen
= CORD_len(x
);
362 register size_t start_len
;
363 const char * s_start
;
364 unsigned long s_buf
= 0; /* The first few characters of s */
365 unsigned long x_buf
= 0; /* Start of candidate substring. */
366 /* Initialized only to make compilers */
368 unsigned long mask
= 0;
370 register size_t match_pos
;
372 if (s
== CORD_EMPTY
) return(start
);
373 if (CORD_IS_STRING(s
)) {
377 s_start
= CORD_to_char_star(CORD_substr(s
, 0, sizeof(unsigned long)));
380 if (xlen
< start
|| xlen
- start
< slen
) return(CORD_NOT_FOUND
);
382 if (start_len
> sizeof(unsigned long)) start_len
= sizeof(unsigned long);
383 CORD_set_pos(xpos
, x
, start
);
384 for (i
= 0; i
< start_len
; i
++) {
388 s_buf
|= (unsigned char)s_start
[i
];
390 x_buf
|= (unsigned char)CORD_pos_fetch(xpos
);
393 for (match_pos
= start
; ; match_pos
++) {
394 if ((x_buf
& mask
) == s_buf
) {
395 if (slen
== start_len
||
396 CORD_ncmp(x
, match_pos
+ start_len
,
397 s
, start_len
, slen
- start_len
) == 0) {
401 if ( match_pos
== xlen
- slen
) {
402 return(CORD_NOT_FOUND
);
405 x_buf
|= (unsigned char)CORD_pos_fetch(xpos
);
410 void CORD_ec_flush_buf(CORD_ec x
)
412 register size_t len
= x
[0].ec_bufptr
- x
[0].ec_buf
;
415 if (len
== 0) return;
416 s
= GC_MALLOC_ATOMIC(len
+1);
417 memcpy(s
, x
[0].ec_buf
, len
);
419 x
[0].ec_cord
= CORD_cat_char_star(x
[0].ec_cord
, s
, len
);
420 x
[0].ec_bufptr
= x
[0].ec_buf
;
423 void CORD_ec_append_cord(CORD_ec x
, CORD s
)
425 CORD_ec_flush_buf(x
);
426 x
[0].ec_cord
= CORD_cat(x
[0].ec_cord
, s
);
430 char CORD_nul_func(size_t i
, void * client_data
)
432 return((char)(unsigned long)client_data
);
436 CORD
CORD_chars(char c
, size_t i
)
438 return(CORD_from_fn(CORD_nul_func
, (void *)(unsigned long)c
, i
));
441 CORD
CORD_from_file_eager(FILE * f
)
450 /* Append the right number of NULs */
451 /* Note that any string of NULs is rpresented in 4 words, */
452 /* independent of its length. */
453 register size_t count
= 1;
455 CORD_ec_flush_buf(ecord
);
456 while ((c
= getc(f
)) == 0) count
++;
457 ecord
[0].ec_cord
= CORD_cat(ecord
[0].ec_cord
, CORD_nul(count
));
460 CORD_ec_append(ecord
, c
);
463 return(CORD_balance(CORD_ec_to_cord(ecord
)));
466 /* The state maintained for a lazily read file consists primarily */
467 /* of a large direct-mapped cache of previously read values. */
468 /* We could rely more on stdio buffering. That would have 2 */
470 /* 1) Empirically, not all fseek implementations preserve the */
471 /* buffer whenever they could. */
472 /* 2) It would fail if 2 different sections of a long cord */
473 /* were being read alternately. */
474 /* We do use the stdio buffer for read ahead. */
475 /* To guarantee thread safety in the presence of atomic pointer */
476 /* writes, cache lines are always replaced, and never modified in */
479 # define LOG_CACHE_SZ 14
480 # define CACHE_SZ (1 << LOG_CACHE_SZ)
481 # define LOG_LINE_SZ 9
482 # define LINE_SZ (1 << LOG_LINE_SZ)
487 /* data[i%LINE_SZ] = ith char in file if tag = i/LINE_SZ */
492 size_t lf_current
; /* Current file pointer value */
493 cache_line
* volatile lf_cache
[CACHE_SZ
/LINE_SZ
];
496 # define MOD_CACHE_SZ(n) ((n) & (CACHE_SZ - 1))
497 # define DIV_CACHE_SZ(n) ((n) >> LOG_CACHE_SZ)
498 # define MOD_LINE_SZ(n) ((n) & (LINE_SZ - 1))
499 # define DIV_LINE_SZ(n) ((n) >> LOG_LINE_SZ)
500 # define LINE_START(n) ((n) & ~(LINE_SZ - 1))
504 size_t file_pos
; /* Position of needed character. */
505 cache_line
* new_cache
;
508 /* Executed with allocation lock. */
509 static char refill_cache(client_data
)
510 refill_data
* client_data
;
512 register lf_state
* state
= client_data
-> state
;
513 register size_t file_pos
= client_data
-> file_pos
;
514 FILE *f
= state
-> lf_file
;
515 size_t line_start
= LINE_START(file_pos
);
516 size_t line_no
= DIV_LINE_SZ(MOD_CACHE_SZ(file_pos
));
517 cache_line
* new_cache
= client_data
-> new_cache
;
519 if (line_start
!= state
-> lf_current
520 && fseek(f
, line_start
, SEEK_SET
) != 0) {
521 ABORT("fseek failed");
523 if (fread(new_cache
-> data
, sizeof(char), LINE_SZ
, f
)
524 <= file_pos
- line_start
) {
525 ABORT("fread failed");
527 new_cache
-> tag
= DIV_LINE_SZ(file_pos
);
528 /* Store barrier goes here. */
529 ATOMIC_WRITE(state
-> lf_cache
[line_no
], new_cache
);
530 state
-> lf_current
= line_start
+ LINE_SZ
;
531 return(new_cache
->data
[MOD_LINE_SZ(file_pos
)]);
534 char CORD_lf_func(size_t i
, void * client_data
)
536 register lf_state
* state
= (lf_state
*)client_data
;
537 register cache_line
* volatile * cl_addr
=
538 &(state
-> lf_cache
[DIV_LINE_SZ(MOD_CACHE_SZ(i
))]);
539 register cache_line
* cl
= (cache_line
*)ATOMIC_READ(cl_addr
);
541 if (cl
== 0 || cl
-> tag
!= DIV_LINE_SZ(i
)) {
547 rd
.new_cache
= GC_NEW_ATOMIC(cache_line
);
548 if (rd
.new_cache
== 0) OUT_OF_MEMORY
;
549 return((char)(GC_word
)
550 GC_call_with_alloc_lock((GC_fn_type
) refill_cache
, &rd
));
552 return(cl
-> data
[MOD_LINE_SZ(i
)]);
556 void CORD_lf_close_proc(void * obj
, void * client_data
)
558 if (fclose(((lf_state
*)obj
) -> lf_file
) != 0) {
559 ABORT("CORD_lf_close_proc: fclose failed");
563 CORD
CORD_from_file_lazy_inner(FILE * f
, size_t len
)
565 register lf_state
* state
= GC_NEW(lf_state
);
568 if (state
== 0) OUT_OF_MEMORY
;
570 /* Dummy read to force buffer allocation. */
571 /* This greatly increases the probability */
572 /* of avoiding deadlock if buffer allocation */
573 /* is redirected to GC_malloc and the */
574 /* world is multithreaded. */
577 (void) fread(buf
, 1, 1, f
);
580 state
-> lf_file
= f
;
581 for (i
= 0; i
< CACHE_SZ
/LINE_SZ
; i
++) {
582 state
-> lf_cache
[i
] = 0;
584 state
-> lf_current
= 0;
585 GC_REGISTER_FINALIZER(state
, CORD_lf_close_proc
, 0, 0, 0);
586 return(CORD_from_fn(CORD_lf_func
, state
, len
));
589 CORD
CORD_from_file_lazy(FILE * f
)
593 if (fseek(f
, 0l, SEEK_END
) != 0) {
594 ABORT("Bad fd argument - fseek failed");
596 if ((len
= ftell(f
)) < 0) {
597 ABORT("Bad fd argument - ftell failed");
600 return(CORD_from_file_lazy_inner(f
, (size_t)len
));
603 # define LAZY_THRESHOLD (128*1024 + 1)
605 CORD
CORD_from_file(FILE * f
)
609 if (fseek(f
, 0l, SEEK_END
) != 0) {
610 ABORT("Bad fd argument - fseek failed");
612 if ((len
= ftell(f
)) < 0) {
613 ABORT("Bad fd argument - ftell failed");
616 if (len
< LAZY_THRESHOLD
) {
617 return(CORD_from_file_eager(f
));
619 return(CORD_from_file_lazy_inner(f
, (size_t)len
));