docs/refman.tex.bak

   1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2 %
   3 %  Here are some abbreviations used within this document
   4 %
   5 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   6 \newfont{\bsf}{cmssbx10}
   7 \newfont{\largebsf}{cmssbx12}
   8 \newfont{\nontermfont}{cmssbx10}
   9 %\newcommand{\nontermfont}{\sf}
  10 \newcommand{\codefont}{\tt}
  11 \newcommand{\symfont}{\bf}
  12
  13 \newcommand{\Prop}{{\bsf Prop}}               % typeset in sans serif
  14 \newcommand{\LargeProp}{{\largebsf Prop}}
  15 \newcommand{\plusplus}{{\tt ++}}              % ++
  16 \newcommand{\Cpp}{{C\plusplus}}               % C++ logo
  17 \newcommand{\Gpp}{{G\plusplus}}               % G++ logo
  18 \newcommand{\email}{{\tt leunga@cs.nyu.edu}}  % my email address
  19 \newcommand{\tilda}{\char126}
  20 \newcommand{\web}{{\tt http://valis.cs.nyu.edu:8888/\tilda leunga}} % my web page
  21 \newcommand{\propweb}{\web{\tt/prop.html}}
  22 \newcommand{\Version}{2.3.0}                  % Prop version
  23 \newcommand{\comment}[1]{}
  24
  25 \newcommand{\sectionfont}{}
  26 %\newcommand{\Section}[1]{\section{\sectionfont #1}}
  27 %\newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
  28 %\newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
  29
  30 \newcommand{\Section}[1]{\pagebreak\section{\sectionfont #1}%
  31    \markright{\usebox{\LINE}\large \thesection. \sc #1}
  32    }
  33 \newcommand{\Subsection}[1]{\subsection{\sectionfont #1}}
  34 \newcommand{\Subsubsection}[1]{\subsubsection{\sectionfont #1}}
  35
  36 \newcommand{\N}[1]{\mbox{{\nontermfont #1}}}
  37 \newcommand{\T}[1]{{\codefont #1}}
  38 \newcommand{\MORE}[2]{#1\T{#2} \mbox{$\ldots$} \T{#2} #1}
  39 \newcommand{\ALT}[2]{#1\ \T{#2} \mbox{$\ldots$} \T{#2} #1}
  40 \newcommand{\LIST}[1]{\MORE{#1}{,}}
  41 \newcommand{\SEQ}[1]{\MORE{#1}{;}}
  42 \newcommand{\OPT}[1]{{\it [ #1 ]}}
  43 \newcommand{\C}[1]{\mbox{\rm #1}}
  44 \newcommand{\IS}{{\symfont ::=}}
  45 \newcommand{\OR}{\ \mbox{$\symfont \mid$}\ }
  46 \newcommand{\Id}{\N{Id}\ }
  47 \newcommand{\Pat}{\N{Pat}\ }
  48 \newcommand{\Exp}{\N{Exp}\ }
  49 \newcommand{\TypeExp}{\N{Type\_Exp}\ }
  50 \newenvironment{syntax}{\begin{quotation}\noindent\begin{tabular}{lcll}} %
  51    {\end{tabular}\end{quotation}}
  52
  53 \newcommand{\CLASSDEF}[1]{\T{#1}\index{Classes!{\codefont #1}}}
  54 \newcommand{\INDEX}[1]{\index{#1}}
  55 \newcommand{\NDEF}[1]{\N{#1}\index{Syntax!Non--terminals!{\nontermfont #1}}}
  56 \newcommand{\OPTIONDEF}[1]{{\tt #1}\index{Command line options!{\tt #1}}}
  57 \newcommand{\TDEF}[1]{\T{#1}\index{Syntax!Keywords!{\codefont #1}}}
  58 \newcommand{\INCOMPLETE}{{\em This section is incomplete.}}
  59 \newenvironment{Error}{\begin{center}\begin{tabular}{p{3in}p{3in}}
  60   \bf Error & \bf Explanation}{\\\hline\end{tabular}\end{center}}
  61 \newcommand{\errormsg}[1]{\\\hline {\tt #1}&}
  62 \newenvironment{Tips}{\noindent {\bf Tips:} \quad}{}
  63
  64 \newcommand{\support}{
  65    {\small\noindent This work is supported in part by
  66     an award from Hewlett-Packard Corporation, from IBM corporation, and
  67     an NYU Research Challenge Grant.
  68    }}
  69 \newcommand{\warning}{
  70    {\small\noindent The author will neither assume responsibility for any
  71    damages caused by the use of this product, nor accept warranty or update
  72    claims.  This product is distributed in the hope that it will be useful,
  73    but WITHOUT ANY WARRANTY; without even the implied warranty of
  74    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  75
  76    \small\noindent This product is in the public domain,
  77    and may be freely distributed.
  78
  79    \small\noindent {\sf Prop} is a research prototype.
  80     The information contained in this document is subject
  81     to change without notice.
  82    }}
  83
  84    \makeindex
  85
  86 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  87 %
  88 %   Beginning of the document
  89 %
  90 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  91 \documentclass{article}
  92
  93    \pagestyle{myheadings}
  94
  95    \title{{\Large \bf Prop \\ Language Reference Manual} \\
  96              {\large \bf (for version \Version)}
  97          }
  98    \author{Allen Leung \\
  99            email: \email \\
 100            \web \\
 101            Courant Institute of Mathematical Sciences \\
 102            251 Mercer Street \\
 103            New York, NY 10012 \\
 104           }
 105
 106    \setlength{\textwidth}{6.6in}
 107    \setlength{\evensidemargin}{0in}
 108    \setlength{\oddsidemargin}{0in}
 109    \setlength{\textheight}{8.75in}
 110    \setlength{\topmargin}{-0.5in}
 111    \setlength{\parskip}{1mm}
 112    \setlength{\parindent}{3ex}
 113    \setlength{\itemsep}{0.5ex}
 114
 115    \newsavebox{\LINE}
 116    \savebox{\LINE}[0.0in][l]{\rule[-1ex]{\textwidth}{.01in}}
 117 \begin{document}
 118
 119    \maketitle
 120    \vspace{\fill}
 121    %\support
 122    \warning
 123    \titlepage
 124    \tableofcontents
 125    \bibliographystyle{alpha}
 126
 127 \Section{Introduction} \label{sec:intro}
 128
 129    This is the language reference manual for \Prop, a compiler generator.
 130 \Prop{} can be used to generate lexers, parsers, analysis and
 131 transformation tools using pattern matching and rewriting.
 132 The \Prop{} language is an extension of \Cpp\cite{C++} and
 133 \Cpp{} programs generated from \Prop{} specifications can be compiled
 134 into efficient code.
 135
 136    Version \Version{} of \Prop{} contains the following major features:
 137 \begin{itemize}
 138    \item Algebraic datatypes and pattern matching, and automatic
 139 mapping of algebraic datatypes into \Cpp{} class hierarchies.
 140    \item A LALR(1) parser generator in the tradition of {\it yacc} and
 141 {\it bison}.  Each grammar is encapsulated into a parser class and
 142 it is possible to have multiple instances of a parser.
 143    \item Regular expression string matching and lexer generation.
 144    \item Transformation of algebraic datatypes using bottom-up tree rewriting.
 145 \end{itemize}
 146
 147    In addition, we intend to implement/fix-up the following features in the
 148 near future:
 149 \begin{itemize}
 150    \item Persistence.
 151    \item Pretty printing/reading.
 152    \item Visualization of data structures using {\em vcg}\cite{vcg,vcg-manual}.
 153    \item Automatic mapping of graph structure specifications into
 154 \Cpp{} classes.
 155    \item Graph rewriting system generation.
 156 \end{itemize}
 157
 158 \Subsection{Availability}
 159
 160    The latest release of \Prop{} \Version{} is available from
 161 \begin{quotation}
 162 \propweb
 163 \end{quotation}
 164
 165 \Subsection{Organization of this Reference Manual}
 166    This reference manual is organized as follows.  Section~\ref{sec:general}
 167 overviews the general features of \Prop.
 168 Section~\ref{sec:parser}
 169 describes the lexer and parser specification formalisms.
 170 Section~\ref{sec:datatype} describes the algebraic datatypes and
 171 pattern matching features.  Section~\ref{sec:tree-rewriting} describes
 172 the tree rewriting mechanism.  Finally, section~\ref{sec:usage} describes
 173 the process of running the \Prop{} translator.
 174
 175    We'll use the following extended BNF notation to specify the syntax
 176 of the language.  Terminals are typeset in {\tt typewriter} font,
 177 while non-terminals are typeset in {\nontermfont sans serif} font.
 178 Given a syntactic class \N{S}, we use \LIST{\N{S}} to denote one
 179 or more occurrences of \N{S} separated by commas.  Similarly,
 180 \SEQ{\N{S}} denotes one or more occurrences of \N{S} separated by
 181 semi-colons.  Slanted brackets are used to denote an optional occurrence
 182 of a syntactic class.   We'll combined the two forms when we
 183 want to denote zero or more occurrences of some term.
 184 For example, \OPT{\LIST{\N{S}}} denotes
 185 zero or more occurrences of \N{S}.
 186
 187 \Section{General Concepts} \label{sec:general}
 188
 189 \Subsection{Syntactic Conventions}
 190
 191 The syntax of \Prop{} is an extension of \Cpp.  Most new constructs
 192 of \Prop{} starts with new keywords, listed below.
 193 \begin{center}
 194 \tt
 195 \begin{tabular}{lllllll}
 196 applicative & arb & as & bagof & begin & bitfield & card \\
 197 category & classof & collectable & constraint & dataflow & datatype & declare \\
 198 dequeof & dom & domain & edges: & elsif & end & equiv: \\
 199 exists & expect: & feature & finalizable & forall & fun & function \\
 200 functor & graphrewrite & graphtype & implies: & inference & inherited & inline \\
 201 instantiate & is & law & left: & less & lexeme & listof \\
 202 loop & mapof & match & matchall & matchscan & module & multimapof \\
 203 mutable & nodes: & of & persistent & prec: & printable & priqueueof \\
 204 procedure & queueof & ran & refine & relation & rewrite & right: \\
 205 setof & sharing & signature & space: & syntax & synthesized & then \\
 206 time: & topdown & traced & treeparser & tupleof & unifiable & unique \\
 207 view & where & with & xor: & bottomup: & topdown: & before: \\
 208 after: & cutrewrite \\
 209 \end{tabular}
 210 \end{center}
 211
 212    In addition, the following new multi-character symbols are added:
 213 \begin{verbatim}
 214    <->  <=>  :=  :-  .(   .[
 215    #[   #(   #{  [|  |]   (|  |)  {|  |}
 216 \end{verbatim}
 217
 218 \Subsection{Basic syntactic classes}
 219
 220
 221 We'll use \NDEF{Integer}, \NDEF{Char}, \NDEF{Real}, \NDEF{String}
 222 and \NDEF{Id} to denote the syntactic classes of
 223 integers, characters, real numbers, strings and identifiers, respectively.
 224 As in \Cpp, characters are quoted with the single quotes \verb|'|, while
 225 strings are quoted with double quotes \verb|"|.  An identifier is
 226 any sequence of alphanumeric of characters that begins with a letter.
 227 In addition, we'll use the syntactic class
 228 \NDEF{Stmt} to denote any valid combination of \Cpp{} and \Prop{}
 229 statements.
 230
 231 \Subsection{Literals}
 232
 233   In addition to the usual \Cpp{} literals, \Prop{} introduces two new type
 234 of literals: {\em quark literals} and {\em regular expression literals}.
 235 Quark literals are of type \T{Quark} and are written as a string
 236 prefixed by the \verb|#| symbol.
 237 \begin{syntax}
 238   \NDEF{Quark} & \IS & \T{\#}\N{String} \\
 239 \end{syntax}
 240
 241 Quarks act like atoms in Lisp, in which name equality implies
 242 pointer equality.  In contrast, in \Cpp{} two string literals that are
 243 identical in value do not necessarily reside in the same address.
 244 Given strings \verb|s1| and \verb|s2|, \verb|strcmp(s1,s2) == 0| does
 245 not imply \verb|s1 == s2|.  Quarks are defined in the library include file
 246 \verb|<AD/strings/quark.h>|.
 247
 248 Regular expression literals are like string literals except
 249 that they are quoted by slashes \verb|/|.   We'll discuss
 250 the regular expression in section~\ref{sec:regular-expressions}.
 251
 252 \Subsection{The \Prop{} Language}
 253
 254 The basic \Prop{} language is the same as \Cpp{} with the following
 255 syntactic extensions:
 256 \begin{quotation}
 257 \begin{tabular}{lll}
 258          & \bf Keywords              & \bf Functions \\
 259   \bsf 1 & \T{datatype} \T{refine} \T{instantiate}
 260                                      & Algebraic datatype definitions \\
 261   \bsf 2 & \T{match} \T{matchall}    & Pattern matching \\
 262   \bsf 3 & \T{rewrite}               & Rewriting \\
 263   \bsf 4 & \T{lexeme}                & Lexical category definition \\
 264   \bsf 5 & \T{matchscan}             & Lexical scanning \\
 265   \bsf 6 & \T{syntax}                & Parser specification \\
 266   \bsf 7 & \T{graphtype}             & Graph data structure specification \\
 267   \bsf 8 & \T{graphrewrite}          & Graph rewriting \\
 268 \end{tabular}
 269 \end{quotation}
 270
 271 Like \Cpp, a \Prop{} is typically divided into the {\em specification},
 272 which defines the data structure and its interface, and the
 273 {\em implementation} parts.   \Prop{} specifications should
 274 be placed in a file with a \verb|.ph| suffix, while an implementation
 275 should be placed in a file with a \verb|.pC|\footnote{The suffixes {\tt .pcc},
 276 {\tt .pCpp} etc. can also be used.}.  The translator will convert each
 277 \verb|.p|$xxx$ into a file with the suffix \verb|.|$xxx$.  The translated
 278 output can then be fed to the \Cpp{} compiler for compilation.
 279 \vspace{\fill}
 280
 281 \Section{Lexer and Parser Generation} \label{sec:parser}
 282
 283 \Subsection{Lexer Specification}
 284    Lexical analyzers are specified using the \verb|matchscan|
 285 statement.  This construct is responsible for generating the actual string
 286 matching DFA.  The actual buffering mechanisms are provided by the
 287 classes \verb|LexerBuffer|, \verb|IOLexerBuffer| and \verb|IOLexerStack|.
 288 These classes are part of the support library distributed with \Prop.
 289
 290    We assume that the user is familiar with lexer generator tools like
 291 {\em lex}\cite{lex} or {\em flex}\cite{flex}.
 292
 293 \Subsubsection{Regular expressions} \label{sec:regular-expressions}
 294    Figure~\ref{fig:regexp} describes the set of current
 295 supported regular expressions in \Prop.  The syntax is similar
 296 to what is found in {\em lex}, or {\em egrep}.
 297
 298 \begin{figure}[htbp]
 299 \begin{center}
 300   \begin{tabular}{|l|l|} \hline
 301     $c$                & matches character $c$ if it is not a meta character \\
 302     $e_1e_2$           & matches $e_1$ then $e_2$ \\
 303     \verb|.|           & matches any character except \verb|\n| \\
 304     $\verb|\|c$        & matches escape sequence $c$ \\
 305     \verb|^|$e$        & matches $e$ at the start of the line \\
 306     $e_1\verb.|.e_2$   & matches $e_1$ or $e_2$ \\
 307     $e\verb|*|$        & matches zero or more $e$ \\
 308     $e\verb|+|$        & matches one or more $e$ \\
 309     $e\verb|?|$        & matches zero or one $e$ \\
 310     \verb|(|$e$\verb|)| & grouping \\
 311     \verb|<<|$C_1, C_2 \ldots C_n$\verb|>>|$e$ & matches $e$ only
 312          if we are in a context from one of $C_1, C_2, \ldots C_n$ \\
 313     \verb|{|{\em lexeme}\verb|}| & matches lexeme \\
 314   \hline
 315   \end{tabular}
 316 \end{center}
 317 \caption{\label{fig:regexp} Regular expressions.}
 318 \end{figure}
 319
 320 The symbols \verb%\ [ ] ( ) { } << >> * + . - ? |% are meta characters and are
 321 interpreted non-literally.  The escape character \verb|\|
 322 can be prepended to the meta characters if they occur as literals in context.
 323
 324 Precedence-wise, meta characters \verb|*|, \verb|+| and \verb|?| bind tighter
 325 than juxtaposition.  Thus the regular expression
 326 \verb|ab*| means \verb|a(b*)|.  Parenthesis can be used to override
 327 the default precedence.
 328
 329 Character classes are of the form as found in {\em lex}:
 330 (i) $c_1$\verb|-|$c_2$ denotes the range of characters from
 331 $c_1$ to $c_2$; (ii) single (non-meta) characters denote themselves;
 332 (iii) the meta character \verb|^| can be used to negate the set.
 333 For example, the regular expression \verb|[a-zA-Z][a-zA-Z0-9]*|
 334 specifies an alphanumeric identifier that must starts with a letter.
 335 Similarly, the regular expression \verb|[^ \t\n]| matches any character
 336 except a space, a tab or a newline.
 337
 338 {\em Lexemes} are simply abbreviated names given to a regular expression
 339 pattern.  They act like macros in {\em lex}.
 340
 341 While a lexer is scanning, it may be in one of many {\em contexts}.
 342 Contexts can be used to group a set of related lexical rules; such rules
 343 are only applicable when the contexts are active.  This makes the
 344 lexer behave like a set of DFAs, with the ability to switch between
 345 DFAs under programmer control.
 346
 347
 348 \Subsubsection{Lexeme}
 349
 350 Lexemes are defined using the \T{lexeme} declaration in \Prop.
 351 Its syntax is
 352 \begin{syntax}
 353   \NDEF{Lexeme\_Decl} & \IS & \TDEF{lexeme} \ALT{\N{Lexeme\_Eq}}{|} \T{;} \\
 354   \NDEF{Lexeme\_Eq}   & \IS & \Id \T{=} \N{Regexp} \\
 355 \end{syntax}
 356
 357 For example, the following lexeme definition is used in the \Prop{}
 358 translator to define the lexical structure of common lexical items.
 359
 360 \begin{verbatim}
 361 lexeme digits      = /[0-9]+/
 362      | sign        = /[\+\-]/
 363      | integer     = /{digits}/
 364      | exponent    = /[eE]{sign}?{digits}/
 365      | mantissa    = /({digits}\.{digits}?|\.{digits})/
 366      | real        = /{mantissa}{exponent}?/
 367      | string      = /"([^\\"\n]|\\.)*"/
 368      | character   = /'([^\\'\n]|\\.[0-9a-f]*)'/
 369      | regexp      = /\/([^\/\n*]|\\.)([^\/\n]|\\.)*\//
 370      ;
 371 \end{verbatim}
 372
 373 Note that regular expression literals are written between
 374 slashes: \verb|/|$re$\verb|/|.
 375
 376 \Subsubsection{Lexeme class}
 377
 378 Often we wish to group a set of lexemes together into {\em lexeme classes} if
 379 they logically behave in some uniform manner; for example, if they
 380 act uniformly in a lexical rule.  By grouping related lexemes together
 381 into a class we can refer to them succinctly by their class name.
 382
 383 The syntax of a lexeme class declaration is
 384
 385 \begin{syntax}
 386   \NDEF{Lexeme\_Class\_Decl} & \IS & \TDEF{lexeme class}
 387        \ALT{\N{Lexeme\_Class\_Eq}}{and} \T{;} \\
 388   \NDEF{Lexeme\_Class\_Eq} & \IS & \Id \T{=} \ALT{\N{Lexeme\_Spec}}{|} \\
 389   \NDEF{Lexeme\_Spec} & \IS & \N{String} \\
 390                    & \OR & \Id \N{Regexp} \\
 391 \end{syntax}
 392
 393 The following example,
 394 the lexeme classes \verb|MainKeywords|, \verb|Symbols|, and \verb|Literals|
 395 are defined.
 396 \begin{verbatim}
 397 lexeme class MainKeywords =
 398     "rewrite" | "inference" | "match" | "matchall" | "matchscan"
 399   | "refine"  | "classof" | "type" | "datatype" | "instantiate"
 400   | "lexeme"  | "bitfield" | "begin" | "syntax"
 401   | "dataflow" | "module" | "signature" | "constraint" | "declare"
 402   | "procedure" | "fun" | "function" | "domain"
 403   | "graphtype" | "graphrewrite"
 404
 405 and  Symbols =
 406     ".." | "..." | "<->" | "::" | "&&" | "||" | "++" | "--" | "->"
 407   | "<<" | ">>" | ">=" | "<=" | "+=" | "-=" | "*=" | "/=" | "%=" | "=="
 408   | "!=" | "<<=" | ">>=" | "&=" | "|=" | "^=" | "=>" | "<-" | "<=>"
 409   | ":=" | ":-" | LONG_BAR /\-\-\-\-\-+/
 410
 411 and Literals =
 412     INT_TOK     /{integer}/
 413   | REAL_TOK    /{real}/
 414   | CHAR_TOK    /{character}/
 415   | STRING_TOK  /{string}/
 416   ;
 417 \end{verbatim}
 418
 419 \Subsubsection{Tokens}
 420
 421 Lexical tokens are defined using the \verb|datatype| declaration.
 422 Its syntax is as follows.
 423 \begin{syntax}
 424   \NDEF{Tokens\_Decl} & \IS & \TDEF{datatype} \Id \T{::} \TDEF{lexeme} \T{=} \\
 425       & & \quad \ALT{\N{Token\_Spec}}{|} \T{;} \\
 426   \NDEF{Token\_Spec} & \IS & \TDEF{lexeme class} \Id &
 427        \C{Include a lexeme class} \\
 428                   & \OR & \Id{} \OPT{\N{Regexp}}  &
 429        \C{Single token spec} \\
 430 \end{syntax}
 431
 432 A token datatype is defined by including one of more lexeme classes,
 433 and by defining stand-alone tokens.  If a lexeme class is included,
 434 all the tokens defined within the lexeme class are included.  A \Cpp{}
 435 \T{enum} type of the same name as the token datatype is generated.
 436 If a token is given an identifier name, then the same name is used
 437 as the \T{enum} literal.  On the other hand, if a string is used
 438 to denote a token, then it can be referred to by prefixing the
 439 string with a dot \verb|.|  For example, the token \verb|"=>"| can
 440 be referenced as \verb|."=>"| within a program.
 441
 442 As an example, the following token datatype definition is used within
 443 the \Prop{} translator.  Here, the keywords are first partitioned into
 444 6 different lexeme classes.  In additional, the tokens \T{ID\_TOK},
 445 \T{REGEXP\_TOK}, etc. are defined.
 446
 447 \begin{verbatim}
 448 datatype PropToken :: lexeme =
 449     lexeme class MainKeywords
 450   | lexeme class Keywords
 451   | lexeme class SepKeywords
 452   | lexeme class Symbols
 453   | lexeme class Special
 454   | lexeme class Literals
 455   | ID_TOK       /{patvar}/
 456   | REGEXP_TOK   /{regexp}/
 457   | QUARK_TOK    /#{string}/
 458   | BIGINT_TOK   /#{sign}{integer}/
 459   | PUNCTUATIONS /[\<\>\,\.\;\&\|\^\!\~\+\-\*\/\%\?\=\:\\]/
 460   ;
 461 \end{verbatim}
 462
 463 \Subsubsection{The \T{matchscan} statement}\label{sec:matchscan}
 464
 465 The \T{matchscan} statement is used to perform tokenization.  The user can
 466 specify a set of string pattern matching rules within a \T{matchscan}
 467 construct. Given a object of class \verb|LexerBuffer|, the \T{matchscan}
 468 statement looks for the rule that matches the longest prefix from
 469 the input stream and executes the action associated with the rule.
 470 Ties are broken by the lexical ordering of the rules.
 471
 472 The general syntax is as follows:
 473 \begin{syntax}
 474 \NDEF{Matchscan}
 475       & \IS & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
 476              & variant 1 \\
 477       & &  \T{\{} \ALT{\T{case} \N{Matchscan\_Rule}}{} \T{\}} \\
 478       & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}}
 479             \T{(} \N{Exp} \T{)} & variant 2 \\
 480       & &   \T{\{} \ALT{\N{Matchscan\_Rule}}{|} \T{\}} \\
 481       & \OR & \N{Matchscan\_Mode} \OPT{\N{Context\_Spec}} \T{(} \N{Exp} \T{)}
 482             \T{of} & variant 3 \\
 483       & & \quad \ALT{\N{Matchscan\_Rule}}{|} \\
 484       & & \T{end} \T{matchscan} \T{;}\\
 485 \NDEF{Matchscan\_Mode} & \IS &
 486            \TDEF{matchscan} \OPT{\T{while}} & case sensitive\\
 487    & \OR & \TDEF{matchscan*} \OPT{\T{while}}& case insensitive\\
 488 \NDEF{Context\_Spec} & \IS & \T{[} \LIST{\Id} \T{]} \\
 489 \NDEF{Matchscan\_Rule}
 490       & \IS & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
 491       &     & \quad \T{lexeme} \T{class} \N{Id} \T{:} \N{Matchscan\_Action} \\
 492       & \OR & \OPT{\T{<<} \ALT{\N{Context}}{,} \T{>>}} \\
 493       &    &  \quad \N{Regexp} \T{:} \N{Matchscan\_Action} \\
 494 \NDEF{Matchscan\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
 495           & \OR & \N{Code} & for variant 1 only \\
 496 \end{syntax}
 497
 498 The two different modes of operation are \T{matchscan} and \T{matchscan*},
 499 which respectively match strings case sensitively and insensitively.
 500 The modifier \T{where} may optionally specify that the matching process
 501 should repeat until no rules apply, or the end of stream condition
 502 is reached.
 503
 504 By default, if no rules apply and if the input stream is non-empty,
 505 then an error has occurred.  The \T{matchscan} statement will
 506 invoke the method \verb|error()| of the \T{LexerBuffer} object by default.
 507
 508 For example, the following is part of the \Prop{} lexer specification.
 509
 510 \begin{verbatim}
 511    datatype LexicalContext = NONE | C | PROP | COMMENT | ...;
 512
 513 int PropParser::get_token()
 514 {
 515    matchscan[LexicalContext] while (lexbuf)
 516    {
 517       ...
 518    |  <<C>> /[ \t\\\014]/:            { emit(); }
 519    |  <<C>> /(\/\/.*)?\n/:            { emit(); line++; }
 520    |  <<C>> /^#.*/:                   { emit(); }
 521    |  <<PROP>> lexeme class MainKeywords: { return ?lexeme; }
 522    |  <<PROP>> lexeme class SepKeywords: { return ?lexeme; }
 523    |  <<PROP>> QUARK_TOK:             { return QUARK_TOK; }
 524    |  <<PROP>> BIGINT_TOK:            { return BIGINT_TOK; }
 525    |  <<PROP>> REGEXP_TOK:            { return REGEXP_TOK; }
 526    |  <<PROP>> PUNCTUATIONS:          { return lexbuf[0]; }
 527    |  <<PROP>> /[ \t\014]/:           { /* skip */ }
 528    |  <<PROP>> /(\/\/.*)?\n/:         { line++; }
 529    |  /\/\*/:                         { emit(); set_context(COMMENT); }
 530    |  <<COMMENT>> /\*\//:             { emit(); set_context(PROP); }
 531    |  <<COMMENT>> /\n/:               { emit(); line++; }
 532    |  <<COMMENT>> /./:                { emit(); }
 533    |  /./: { error("%Lillegal character %c\n", lexbuf[0]); }
 534    }
 535 }
 536 \end{verbatim}
 537
 538 Here, the lexer is partitioned in multiple lexical contexts: context
 539 \verb|C| deals with \Cpp{} code while context \verb|PROP| deals with
 540 \Prop{} extensions.  The special context \verb|COMMENT| is used
 541 to parse \verb|/* */| delimited comments.  Contexts are changed
 542 using the \verb|set_context| method defined in class \verb|LexerBuffer|.
 543
 544 The special variable \verb|?lexeme| can be used within a rule
 545 that matches a lexeme class.  For example, within the rule
 546 \begin{verbatim}
 547    |  <<PROP>> lexeme class MainKeywords:    { return ?lexeme; }
 548 \end{verbatim}
 549 the variable \verb|?lexeme| is bound to
 550 the token \verb|."rewrite"| if the string ``rewrite'' is matched;
 551 it is bound to the token \verb|."inference"| if the string
 552 ``inference'' is matched and so on.
 553
 554 \Subsubsection{Class \T{LexerBuffer}}
 555 We'll next describe the class \T{LexerBuffer} and its subclasses.
 556
 557 Class \CLASSDEF{LexerBuffer}
 558 is the base class in the lexical buffer hierarchy.
 559 It is defined in the library include file \verb|<AD/automata/lexerbuf.h>|.
 560 This class is responsible for implementing a string buffer for use
 561 during lexical analysis.
 562
 563 As it stands, it can be used directly if the lexer input is directly
 564 from a string.  Memory management of the buffer is assumed to be handled
 565 by the user.
 566
 567 The class \T{LexerBuffer} has three constructors.  The default
 568 constructor initializes the string buffer to NULL.  The two
 569 other constructors initialize the string buffer to a string given
 570 by the user.  In the case when the length is not supplied, the buffer
 571 is assumed to be \verb|'\0'|-terminated.  The two \verb|set_buffer| methods
 572 can be used to set the current string buffer.  Notice that
 573 all lexical analysis operations are done in place.  The user should
 574 not alter the string buffer directly, but should use the interface
 575 provided by this class instead.
 576
 577 \begin{verbatim}
 578 class LexerBuffer {
 579 public:
 580    LexerBuffer();
 581    LexerBuffer(char *);
 582    LexerBuffer(char *, size_t);
 583    virtual ~LexerBuffer();
 584    virtual void set_buffer (char *, size_t);
 585            void set_buffer (char *);
 586 };
 587 \end{verbatim}
 588
 589 The following methods are used access the string buffer.
 590 Method \T{capacity} returns the size of the buffer.  Method
 591 \T{length} returns the length of the current matched token.
 592 Methods \T{text} can be used to obtain a point to location
 593 of the current matched token.  The string returned is guaranteed
 594 to be \verb|'\0'|-terminated.  Methods \T{operator []} return
 595 the $i$th character of the token.  Finally, method \T{lookahead}
 596 returns the character code of the next character to be matched.
 597
 598 \begin{verbatim}
 599    int capacity () const;
 600    int length   () const;
 601    const char * text () const;
 602          char * text ();
 603    char  operator [] (int i) const;
 604    char& operator [] (int i);
 605    int lookahead () const;
 606    void push_back (int n)
 607 \end{verbatim}
 608
 609 In addition to the string buffer, the class \T{LexerBuffer} keeps
 610 track of two additional types of information: the current context
 611 of the DFA, and whether the next token starts at the
 612 beginning of the line, or in our terminology, whether it is
 613 {\em anchored}.  These are manipulated with the following methods:
 614
 615 \begin{verbatim}
 616    int context    () const;
 617    void set_context (int c = 0);
 618    Bool is_anchored() const;
 619    void set_anchored(Bool a = true);
 620 \end{verbatim}
 621
 622 Finally, the following methods should be redefined by subclasses
 623 to alter the behavior of this class.  By default, the class \T{LexerBuffer}
 624 calls \T{fill\_buffer()} when it reaches the end of the string; subclasses
 625 can use this method to refill the buffer and return the number of
 626 characters read.  Currently, \T{fill\_buffer} is defined to do nothing
 627 and return 0.  When it reaches the end of the file
 628 (i.e. when \T{fill\_buffer()} fails to refill the buffer and the
 629 scanning process finishes), method \T{end\_of\_file} is called.
 630 Currently, this is a no-op.  Finally, the error handling routine
 631 \T{error()} is called with the position of the beginning and the end
 632 of the buffer in which the error occurs.  By default, this routine
 633 prints out the buffer.
 634
 635 \begin{verbatim}
 636 protected:
 637    virtual size_t fill_buffer();
 638    virtual void   end_of_file();
 639    virtual void   error(const char * start, const char * stop);
 640 \end{verbatim}
 641
 642 \Subsubsection{Class \T{IOLexerBuffer}}
 643
 644   Class \CLASSDEF{IOLexerBuffer}
 645 is a derived class of \T{LexerBuffer}.  It
 646 automatically manages an internal buffer and receives input from
 647 an \T{istream}.  It is defined in the file \verb|<AD/automata/iolexerbuf.h>|
 648
 649 This class provides the following additional features:
 650 Constructor \verb|IOLexerBuffer(istream&)| ties the input to a
 651 stream.  If the default constructor is used, then it automatically
 652 ties the input to the stream \verb|cin|.  The method \verb|set_stream|
 653 can be used to set the stream to some other input.
 654
 655 \begin{verbatim}
 656 class IOLexerBuffer : public LexerBuffer {
 657    size_t    buffer_size;  // size of the buffer
 658    istream * input;        // input stream
 659 public:
 660    IOLexerBuffer();
 661    IOLexerBuffer(istream&);
 662    virtual ~IOLexerBuffer();
 663    virtual void set_stream (istream&);
 664 };
 665 \end{verbatim}
 666
 667 By default, class \verb|IOLexerBuffer| reads new data from the input
 668 stream using a line buffering discipline.  This mechanism is suitable
 669 for interactive, but for other applications it may be more efficient
 670 to use block buffered input.  The protected method
 671 \verb|read_buffer| controls this reading behavior; it
 672 is called whenever the data in the string buffer has been
 673 consumed and new input is needed from the stream.  It is
 674 passed the position of the buffer and its remaining capacity,
 675 and it returns the number of new characters that are read.  Subclasses
 676 can redefine this method.
 677
 678 \begin{verbatim}
 679 protected:
 680    virtual size_t read_buffer(char *, size_t);
 681 \end{verbatim}
 682
 683 \Subsubsection{Class \T{IOLexerStack}}
 684
 685 Class \CLASSDEF{IOLexerStack}
 686 is a derived class of \T{LexerStack}.  It provides
 687 a mechanism of reading from a stack of \verb|istream|'s.  Streams
 688 can be pushed and popped from the stack.  The next token is
 689 obtained from the stream from the top of the stack.  The class allows
 690 allows easy implementation of
 691 constructs such as the \verb|#include| file mechanism of the C preprocessor.
 692
 693 The interface of this class is listed below:
 694 \begin{verbatim}
 695 class IOLexerStack : public IOLexerBuffer {
 696 public:
 697    IOLexerStack();
 698    IOLexerStack(istream&);
 699    virtual ~IOLexerStack();
 700
 701    virtual void     push_stream (istream&);
 702    virtual istream& pop_stream  ();
 703 };
 704 \end{verbatim}
 705
 706 \Subsection{Parser Specification}
 707
 708 Parsers are specified as a two phase process:
 709 \begin{enumerate}
 710    \item First a {\em syntax class} is defined.  A syntax class
 711 declaration is like a normal \Cpp{} class declaration and has a similar
 712 syntax, except that \Prop{} will also generate the interface of the parser.
 713 All parsers that \Prop{} generates are encapsulated within a class.  This
 714 makes it easy for the programmer to add additional data for parsing
 715 use, and to have multiple instances of a parser.
 716    \item Secondly, the grammar of the language is defined in a
 717 {\em syntax} declaration as a set of productions,
 718 in a syntax similar to that of {\em yacc}.
 719 \end{enumerate}
 720
 721 We'll describe these two phases.
 722
 723 \Subsubsection{Syntax class}
 724
 725 A syntax class definition specifies an object class that encapsulates
 726 a parser.  Its syntax is the same as the usual \Cpp{} class definition,
 727 except that the prefix \T{syntax} is used:
 728
 729 \begin{syntax}
 730 \NDEF{Syntax\_Class\_Decl} & \IS & \TDEF{syntax class} \Id
 731    \OPT{\T{:} \N{Inherit\_List}} \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
 732 \end{syntax}
 733
 734 This specification automatically generate a class with the following interface
 735 (assuming that the parser class in question is called \T{ParserClass})
 736 \begin{verbatim}
 737 public:
 738    ParserClass();  // constructor
 739 public:
 740    virtual void parse();
 741 \end{verbatim}
 742
 743 The constructor and the method \verb|parse()| will be automatically
 744 generated later.
 745
 746 A parser class expects the following method to be defined:
 747 \begin{verbatim}
 748    int get_token();
 749 \end{verbatim}
 750 \noindent The function of \verb|get_token| is to return the
 751 next token from the lexer stream whenever it is called.
 752 It should return \verb|EOF| if the stream is empty.
 753
 754 \Subsubsection{Syntax declaration}
 755
 756 The grammar of a language is specified in the \T{syntax} declaration.
 757 Its syntax is listed as follows:
 758
 759 \begin{syntax}
 760 \NDEF{Syntax\_Decl} & \IS & \TDEF{syntax} \Id{} \T{\{} \\
 761     & & \quad \OPT{\MORE{\N{Precedence\_Decl}}{}} \\
 762     & & \quad \OPT{\N{Expect\_Decl}} \\
 763     & & \quad \OPT{\MORE{\N{Production\_Rule}}{}} \\
 764     & & \T{\}} \\
 765 \end{syntax}
 766
 767 The name of a \T{syntax} declaration should match that
 768 of a \T{syntax} \T{class}.  A \T{syntax} declaration is divided
 769 into three parts: (i) the operator precedence section, (ii) the
 770 \T{expect:} $n$ section, and (iii) the grammar section.
 771
 772 \Subsubsection{Precedence}
 773
 774 Operator precedences are specified using precedence declarations.
 775 The syntax is:
 776
 777 \begin{syntax}
 778 \NDEF{Precedence\_Decl}
 779     & \IS & \TDEF{left:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
 780           & left associative operators \\
 781     & \OR & \TDEF{right:} \N{Integer} \ALT{\N{Operator}}{} \T{;}
 782     & right associative operators \\
 783 \NDEF{Operator} & \IS & \N{Char}   & single character operator \\
 784              & \OR & \N{String} & multi-character operator \\
 785              & \OR & \N{Cons}   & constructor \\
 786 \end{syntax}
 787
 788 The integer associated with a precedence declaration is the precedence
 789 level of an operator.  The higher the level of an operator, the {\em lower}
 790 its precedence.
 791
 792 For example, the following set of precedences are used in the \Prop{}
 793 language parser:
 794 \begin{verbatim}
 795 left: 23 "as";
 796 left: 22 "::";
 797 left: 21 "||";
 798 left: 20 "equiv:";
 799 left: 19 "xor:";
 800 left: 18 "implies:";
 801 left: 17 "&&" "and";
 802 right: 16 "|=" "&=" "^=" "<<=" ">>=";
 803 right: 15 '=' ":=" "+=" "-=" "*=" "/=" "%=";
 804 left: 14 '|';
 805 left: 13 ':';
 806 left: 12 ';';
 807 left: 11 '^';
 808 left: 10 '&';
 809 left: 9 "==" "!=";
 810 left: 8 '<' '>' ">=" "<=";
 811 left: 7 "<<" ">>";
 812 left: 6 '+' '-' "with" "less";
 813 left: 5 '*' '/' '%';
 814 left: 4 "++" "--";
 815 left: 3 '!' '~' "arb" "card" "dom" "ran";
 816 left: 2 '[' ']' '{' '}' '.' "->" ;
 817 \end{verbatim}
 818
 819 \Subsubsection{{\tt expect:} $n$}
 820
 821 The declaration \T{expect} $n$ specifies that there should be $n$
 822 shift/reduce errors in the grammar.  If the actual grammar deviates
 823 from this number, a warning will be generated.  The alternative
 824 form \T{expect:} \T{\_} suppresses all warnings.  The syntax is:
 825
 826 \begin{syntax}
 827 \NDEF{Expect\_Decl}    & \IS & \TDEF{expect:} \N{Integer} \T{;}
 828     & expects $n$ shift/reduce errors \\
 829                      & \OR & \T{expect:} \T{\_} \T{;}
 830     & expects many shift/reduce errors \\
 831 \end{syntax}
 832
 833 This declaration is optional.  If omitted, warnings will be printed
 834 if parser conflicts are found during parser generation.
 835
 836 \Subsubsection{Production rules}
 837
 838
 839 The syntax of the production rules is as follows:
 840
 841 \begin{syntax}
 842 \NDEF{Production\_Rule} & \IS &
 843   \Id \OPT{\TypeExp} \T{:} \\
 844   & & \quad \OPT{\ALT{\N{One\_Alt}}{|}} \T{;} \\
 845 \NDEF{One\_Alt}  & \IS & \OPT{\ALT{\N{Symbol}}{}} & one alternative \\
 846 \NDEF{Symbol}
 847     & \IS & \Id        & non-terminal \\
 848     & \OR & \N{Cons}   & terminal (a datatype constructor) \\
 849     & \OR & \N{String} & terminal (a datatype constructor) \\
 850     & \OR & \N{Char}   & terminal (from the ASCII character set) \\
 851     & \OR & \T{?}      & the error terminal \\
 852     & \OR & \T{\$}     & the end of file terminal \\
 853     & \OR & \T{\{} \N{Code} \T{\}} & embedded actions \\
 854 \end{syntax}
 855
 856 Each rule specifies a set of productions of the form
 857 \[ \begin{array}{lcl}
 858     lhs &  :  & A_{11} A_{12} \ldots A_{1n_1} \\
 859         & \OR & A_{21} A_{22} \ldots A_{2n_2} \\
 860         & \vdots & \\
 861         & \OR & A_{m1} A_{m2} \ldots A_{mn_m} \\
 862    \end{array}
 863 \]
 864 where $lhs$ is the non-terminal and $A_{ij}$'s are a mixture of
 865 terminals, non-terminals and embedded actions.  If synthesized
 866 attributes are used, then the type of the s-attribute should be
 867 annotated next to the lhs non-terminal.
 868
 869 Terminals can be specified in a few ways: (i) a character literal
 870 is a predefined terminal of the same name, (ii) a string literal
 871 is matched with a \T{datatype} or a \T{lexeme} \T{class} constructor
 872 of the same name, (iii) an identifier is a terminal if there is a
 873 \T{datatype} constructor of the same name.  Any other identifiers
 874 are assumed to be non-terminals.
 875
 876 For example, the following are two sets of production rules used in the
 877 \Prop{} translator itself:
 878 \begin{verbatim}
 879 ty Ty:  simple_ty               { $$ = $1; }
 880 |       ty '=' exp              { $$ = DEFVALty($1,$3); }
 881 |       ty '*'                  { $$ = mkptrty($1); }
 882 |       ty '&'                  { $$ = mkrefty($1); }
 883 |       ty '[' ']'              { $$ = mkptrty($1); }
 884 |       ty '[' exp ']'          { $$ = mkarrayty($1,$3); }
 885 ;
 886 exp Exp:
 887         app_exp                 { $$ = $1; }
 888 |       exp '+' exp             { $$ = BINOPexp("+",$1,$3); }
 889 |       exp '-' exp             { $$ = BINOPexp("-",$1,$3); }
 890 |       exp '*' exp             { $$ = BINOPexp("*",$1,$3); }
 891 |       exp '/' exp             { $$ = BINOPexp("/",$1,$3); }
 892 |       exp '%' exp             { $$ = BINOPexp("%",$1,$3); }
 893 |       exp '^' exp             { $$ = BINOPexp("^",$1,$3); }
 894 |       exp "+=" exp            { $$ = BINOPexp("+=",$1,$3); }
 895 |       exp "-=" exp            { $$ = BINOPexp("-=",$1,$3); }
 896 |       exp "*=" exp            { $$ = BINOPexp("*=",$1,$3); }
 897 |       exp "/=" exp            { $$ = BINOPexp("/=",$1,$3); }
 898 |       exp "%=" exp            { $$ = BINOPexp("%=",$1,$3); }
 899 \end{verbatim}
 900 \noindent {\em etc $\ldots$}
 901
 902 Here, \T{ty} is the non-terminal of this production.  The type of
 903 the synthesized attribute of this rule is \T{Ty}, and it is
 904 written next to the non-terminal.  Like {\em yacc}, the synthesized
 905 attribute of a rule can be referenced using the \verb.$. variables:
 906 variable \TDEF{\$\$} denote the synthesized attribute of the current rule,
 907 while \TDEF{\$$n$} where $n$ is 1, 2, 3, \ldots denote the synthesized
 908 attribute of the $n$th rhs symbol of the current rule.
 909
 910 \Subsubsection{Parser report}
 911
 912 If the command line option \OPTIONDEF{-r} is used, then
 913 in addition to generating the code for the grammar, the translator
 914 will also generate a comprehensive report of the grammar and
 915 the resulting LALR(1) automaton.  The amount of details in this
 916 report can be varied using the the verbose option
 917 \OPTIONDEF{-v}:
 918
 919 \begin{description}
 920    \item[{\tt -v1}]  Print full information for all LR(0) states.
 921    \item[{\tt -v2}]  Also print out the lookahead set of a reduction rule.
 922 This may be useful for tracking down shift/reduce and reduce/reduce conflicts.
 923    \item[{\tt -v3}]  Do both \verb|-v1| and \verb|-v2|.
 924 \end{description}
 925
 926 \Subsubsection{Interfacing with the generated lexer}
 927
 928 The easiest way of interfacing with a lexer is to embed
 929 an object of class \T{LexerBuffer}, \T{IOLexerBuffer}, or \T{IOLexerStack}
 930 within a syntax class\footnote{Alternatively, inheritance may be used.}.
 931 We can then define the tokenization method
 932 \verb|get_token| using the \T{matchscan} statement described
 933 in section~\ref{sec:matchscan}.
 934    One useful methodology for specifying a parser for a language is that
 935
 936 \Section{Algebraic Datatypes and Pattern Matching} \label{sec:datatype}
 937
 938     \Prop{} implements algebraic datatypes and pattern matching in the
 939 style of Standard ML\cite{SML}.  Tree, DAG and even graph structures can be
 940 specified as a set of datatype equations in algebraic datatype specifications.
 941 \Prop{} then proceeds to translate these into concrete \Cpp{} classes.
 942 This makes it very easy to implement complex data structures.
 943
 944    Algebraic datatypes can be manipulated and transformed
 945 using \Prop{} pattern matching constructs, which decompose a datatype
 946 value into its constituents.  Complex patterns involving multiple
 947 objects can be specified.  These are translated into efficient
 948 testing and selection code in \Cpp.
 949
 950 In the following we shall give a brief overview of the pattern matching
 951 features of \Prop.  For most users of modern declarative languages many
 952 of these features are already familiar constructs.
 953
 954 \Subsection{A quick tour of pattern matching}
 955
 956    Algebraic datatypes are specified using \verb|datatype| definitions,
 957 which define the inductive structure of one of more types using
 958 a tree-grammar like syntax.
 959 %In addition, pretty printers,
 960 %lexical scanners, parsers, persistence I/O methods and
 961 %garbage collection inferences can also be specified with
 962 %additional options in the same construct.
 963 When a datatype is declared,
 964 the following operations are implicitly defined by the datatype compiler:
 965 (1) the constructors for all the variants of a type;
 966 (2) the identity test operator \verb|==|, and the assignment
 967     operator \verb|=| for this type; and
 968 (3) the member functions needed to decompose a datatype value during
 969     pattern matching.
 970
 971    We'll select the internals of a compiler for a simplified imperative
 972 language as the running example in this section.
 973 Suppose that in this language an expression is composed of
 974 identifiers, integer constants and the four arithmetic operators.
 975 Then the structure of the abstract syntax tree can be specified as follows:
 976
 977 \begin{verbatim}
 978    datatype Exp = INT (int)
 979                 | ID  (const char *)
 980                 | ADD (Exp, Exp)
 981                 | SUB (Exp, Exp)
 982                 | MUL (Exp, Exp)
 983                 | DIV (Exp, Exp)
 984                 ;
 985 \end{verbatim}
 986
 987 Abstract syntax of an expression such as {\em a * b - 17}
 988 can be constructed directly in a prefix syntax, directly mirroring
 989 that of the definition.  The \Prop{} datatype
 990 compiler will automatically generate a \Cpp{} class hierarchy to represent
 991 the variants of type \verb|Exp|.  Datatype constructor
 992 functions(not to be mistaken with \Cpp's class constructors) will
 993 also be automatically generated using the same names as the variants.
 994
 995 \begin{verbatim}
 996    Exp formula = ADD(MUL(ID("a"),ID("b")),INT(17));
 997 \end{verbatim}
 998
 999 Datatype values can be decomposed using the \verb|match| statement, which
1000 can be seen as a generalization of \C's \verb|switch| construct.  Pattern
1001 matching is a combination of conditional branching and value
1002 binding.  For example, a typical evaluation function for the type \verb|Exp|
1003 can be written as in the following example.  Notice that each arm of
1004 a \verb|case| is in fact a pattern(with optional variables)
1005 mirroring the syntax of a datatype.  The pattern variables(written
1006 with the prefix \verb|?| in the sequel) of a matching
1007 arm is {\em bound} to the value of the matching value, which can be
1008 subsequently referenced and modified in the action of an arm.
1009
1010 \begin{verbatim}
1011 int eval (Exp e, const map<const char *, int>& env)
1012 {  match (e)
1013    {  case INT ?i:        return ?i;
1014       case ID  ?id:       return env[?id];
1015       case ADD (?e1,?e2): return eval(?e1,env) + eval(?e2,env);
1016       case SUB (?e1,?e2): return eval(?e1,env) - eval(?e2,env);
1017       case MUL (?e1,?e2): return eval(?e1,env) * eval(?e2,env);
1018       case DIV (?e1,?e2): return eval(?e1,env) / eval(?e2,env);
1019    }
1020 }
1021 \end{verbatim}
1022
1023 \Subsubsection{Pattern matching versus object-oriented style}
1024
1025   Although a comparable evaluation function can be written
1026 in object oriented style using late binding, as in below, in general pattern
1027 matching is much more powerful than late binding in \Cpp, which only
1028 allows dispatching based on the type of one receiver.
1029
1030 \begin{verbatim}
1031   // Class definitions
1032   class Exp {
1033   public:
1034      virtual int eval(const map<const char *, int>& env) const = 0;
1035   };
1036   class INT : Exp {
1037      int i;
1038   public:
1039      int eval(const map<const char *, int>& env);
1040   };
1041   class ID : Exp {
1042      const char * id
1043   public:
1044      int eval(const map<const char *, int>& env);
1045   };
1046   ...
1047
1048   // Member functions
1049   int INT::eval(const map<const char *, int>& env) const { return i; }
1050   int ID ::eval(const map<const char *, int>& env) const { return id; }
1051   int ADD::eval(const map<const char *, int>& env) const
1052      { return e1->eval(env) + e2->eval(env); }
1053   int SUB::eval(const map<const char *, int>& env) const
1054      { return e1->eval(env) - e2->eval(env); }
1055   int MUL::eval(const map<const char *, int>& env) const
1056      { return e1->eval(env) * e2->eval(env); }
1057   int DIV::eval(const map<const char *, int>& env) const
1058      { return e1->eval(env) / e2->eval(env); }
1059 \end{verbatim}
1060
1061 For example, in the following function we use nested patterns,
1062 non-linear patterns (i.e. patterns with multiple occurrences of
1063 a pattern variable), and guards to perform algebraic simplification
1064 of an expression.  Although the patterns are relatively simple in
1065 this example, in general arbitrarily complex patterns may be used.
1066
1067 \begin{verbatim}
1068 Exp simplify (Exp redex)
1069 {  // recursive traversal code omitted ...
1070
1071    // match while repeats the matching process
1072    // until no more matches are found.
1073    match while (redex)
1074    {  ADD(INT 0,  ?x):     { redex = ?x; }
1075    |  ADD(INT ?x, INT ?y): { redex = INT(?x+?y); }
1076    |  ADD(?x,     INT 0)   { redex = ?x; }
1077    |  SUB(?x,     INT 0):  { redex = ?x; }
1078    |  SUB(?x,     ?x):     { redex = INT(0); }
1079    |  SUB(INT ?x, INT ?y): { redex = INT(?x-?y); }
1080    |  MUL(INT 0,  ?x):     { redex = INT(0); }
1081    |  MUL(?x,     INT 0):  { redex = INT(0); }
1082    |  DIV(?x,     ?x):     { redex = INT(1); }
1083         // don't divide by zero.
1084    |  DIV(INT ?x, INT ?y) | ?y != 0: { redex = INT(?x/?y); }
1085    |  ...
1086    }
1087    return redex;
1088 }
1089 \end{verbatim}
1090
1091 Pattern matching in \Prop{} is not restricted to one datatype at
1092 a time.  In the following example, we use matching on multiple values to
1093 define equality on expressions inductively.  For variety, we'll
1094 use the \verb|fun| variant of \verb|match|, which defines a function
1095 in rule form.  Notice that the last case
1096 of the match set uses {\em wild cards} \verb|_| to catch all the other
1097 non-equal combinations.  Since \Cpp{} does not provide multiple dispatching,
1098 implementing binary (or $n$-ary) matching operations on variant datatypes are
1099 in general cumbersome and verbose in object-oriented style. In contrast,
1100 using an applicative pattern matching style many manipulations and
1101 transformations on variant datatypes with tree-like or graph-like structure
1102 can be expressed succinctly.
1103
1104 \begin{verbatim}
1105 fun equal INT ?i,     INT ?j: bool: { return ?i == ?j; }
1106   | equal ID  ?a,     ID  ?b:       { return strcmp(a,b) == 0; }
1107   | equal ADD(?a,?b), ADD(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1108   | equal SUB(?a,?b), SUB(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1109   | equal MUL(?a,?b), MUL(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1110   | equal DIV(?a,?b), DIV(?c,?d):   { return equal(?a,?c) && equal(?b,?d); }
1111   | equal _,          _:            { return false; }
1112   ;
1113 \end{verbatim}
1114
1115 \Subsubsection{More examples} \label{sec:Wff}
1116
1117   As another example, we can specify the term structure of
1118 {\em well-formed formulas} in proposition calculus as follows.  Notice
1119 that the constructors \verb|F| and \verb|T| are nullary.
1120
1121 \begin{verbatim}
1122    datatype Wff = F
1123                 | T
1124                 | Var     (const char *)
1125                 | And     (Wff, Wff)
1126                 | Or      (Wff, Wff)
1127                 | Not     (Wff)
1128                 | Implies (Wff, Wff)
1129                 ;
1130 \end{verbatim}
1131
1132 Datatypes that are parametrically polymorphic, such as lists and trees, can
1133 be defined by parameterizing them with respect to one of more types.
1134 For example, both lists and tree below are parametric on one type argument
1135 \verb|T|.
1136
1137 \begin{verbatim}
1138    datatype List<T> = nil
1139                     | cons(T, List<T>);
1140    datatype Tree<T> = empty
1141                     | leaf(T)
1142                     | node(Tree<T>, T, Tree<T>);
1143
1144    List<int> primes = cons(2,cons(3,cons(5,cons(7,nil))));
1145    List<int> more_primes = cons(11,cons(13,primes));
1146    Tree<char *> names = node(leaf("Church"),"Godel",empty);
1147 \end{verbatim}
1148
1149 As a programming convenience, \Prop{} has a set of built-in list-like
1150 syntactic forms.  Unlike languages such as ML, however,
1151 these forms are not predefined to be any specific list types.  Instead, it is
1152 possible for the user to use these forms on any datatypes with
1153 a natural binary {\em cons} operator and a nullary {\em nil} constructor.
1154 For instance, the previous list datatype can be redefined as follows:
1155
1156 \begin{verbatim}
1157    datatype List<T> = #[] | #[ T ... List<T> ];
1158    List<int> primes = #[ 2, 3, 5, 7 ];
1159    List<int> more_primes = #[ 11, 13 ... primes ];
1160    List<char *> names = #[ "Church", "Godel", "Turing", "Curry" ];
1161
1162    template <class T>
1163       List<T> append (List<T> a, List<T> b)
1164       {  match (a)
1165          {  case #[]:          return b;
1166             case #[hd ... tl]: return #[hd ... append(tl,b)];
1167          }
1168       }
1169 \end{verbatim}
1170 The empty list is written as \verb|#[]|, while {\em cons(a,b)}
1171 is written as \verb|#[ a ... b ]|.   An expression of the special form
1172 \verb|#[a, b, c]|, for instance, is simple syntactic sugar for
1173 repeated application of the {\em cons} operator, i.e.
1174
1175 \begin{verbatim}
1176     #[a, b, c] == #[a ... #[ b ... #[ c ... #[] ] ] ].
1177     #[a, b, c ... d ] == #[a ... #[ b ... #[ c ... d ] ] ].
1178 \end{verbatim}
1179
1180 List-like special forms are not limited to datatypes with only two variants.
1181 For example, we can define a datatype similar in structure to S-expressions
1182 in {\em Lisp} or {\em scheme}.  Here's how such as datatype may be
1183 defined\footnote{for simplicity, we'll use a string representation for atoms
1184 instead of a more efficient method.}:
1185
1186 \begin{verbatim}
1187    datatype Sexpr = INT    (int)
1188                   | REAL   (double)
1189                   | STRING (char *)
1190                   | ATOM   (const char *)
1191                   | #()
1192                   | #( Sexpr ... Sexpr )
1193    where type Atom = Sexpr  // synonym for Sexpr
1194    ;
1195 \end{verbatim}
1196
1197    With this datatype specification in place, we can construct values
1198 of type \verb|Sexpr| in a syntax close to that of {\em Lisp}.  For example,
1199 we can define lambda expressions corresponding to the combinators $I$, $K$
1200 and $S$ as follows:
1201
1202 \begin{verbatim}
1203    Atom LAMBDA = ATOM("LAMBDA");
1204    Atom f      = ATOM("f");
1205    Atom x      = ATOM("x");
1206    Atom y      = ATOM("y");
1207    Atom NIL    = #();
1208
1209    Sexpr I = #(LAMBDA, #(x), x);
1210    Sepxr K = #(LAMBDA, #(x), #(LAMBDA, #(y), x));
1211    Sepxr S = #(LAMBDA, #(f),
1212                 #(LAMBDA, #(x),
1213                    #(LAMBDA, #(y), #(#(f,x), #(g,x)))));
1214 \end{verbatim}
1215
1216 Similar to list-like forms, vector-like forms are also available.
1217 This addresses one of the flaws of the \Cpp{} language, which lacks
1218 first class array constructors.
1219 Vectors are simply homogeneous arrays whose sizes
1220 are fixed and are determined at creation time\footnote{For efficiency,
1221 bounds checking is {\em not} performed.}.   Random access within
1222 vectors can be done in constant time.  Unlike lists, however, the
1223 prepending operation is not supported.   Vectors literals are delimited
1224 with the composite brackets \verb!(| ... |)!, \verb![| ... |]!, or
1225 \verb!{| ... |}!.  In the following example the datatype \verb|Exp|
1226 uses vectors to represent the cooefficients of the polynomials:
1227
1228 \begin{verbatim}
1229    datatype Vector<T> = (| T |);
1230    datatype Exp = Polynomial (Var, Vector<int>)
1231                 | Functor (Exp, Vector<Exp>)
1232                 | Atom (Var)
1233                 | ...
1234    where type Var = const char *;
1235    Exp formula = Polynomial("X", (| 1, 2, 3 |));
1236 \end{verbatim}
1237
1238 Commonly used patterns can be given synonyms so that they can be readily
1239 reused without undue repetition.  This can be accomplished by defining pseudo
1240 datatype constructors to stand for common patterns using
1241 {\em datatype law} definitions.  For example, the following set of
1242 laws define some commonly used special forms for a {\em Lisp}-like language
1243 using the previously defined \verb|Sexpr| datatype.
1244
1245 \begin{verbatim}
1246    datatype law inline Lambda(x,e)  = #(ATOM "LAMBDA", #(x), e)
1247               | inline Quote(x)     = #(ATOM "QUOTE", x)
1248               | inline If(a,b,c)    = #(ATOM "IF", a, b, c)
1249               | inline Nil          = #()
1250               | inline ProgN(exprs) = #(ATOM "PROGN" ... exprs)
1251               | SpecialForm  = #(ATOM ("LAMBDA" || "IF" ||
1252                                        "PROGN" || "QUOTE") ... _)
1253               | Call(f,args) = ! SpecialForm && #(f ... args)
1254               ;
1255 \end{verbatim}
1256
1257 Note that the pattern \verb|SpecialForm| is meant to match all
1258 special forms in our toy language: i.e. {\em lambdas}, {\em ifs}, {\em progn}'s
1259 and {\em quotes}.  The pattern disjunction connective \verb.||. is used
1260 to link these forms together.  Since we'd like the \verb|Call| pattern
1261 to match only if the S-expression is not a special form, we use
1262 the pattern negation and conjunction operators, \verb|!| and \verb|&&|
1263 are used to screen out special forms.
1264
1265
1266 With these definitions in place,
1267 an interpreter for our language can be written thus:
1268
1269 \begin{verbatim}
1270    Sexpr eval (Sexpr e)
1271    {  match (e)
1272       {  Call(?f,?args):     { /* perform function call */ }
1273       |  Lambda(?x,?e):      { /* construct closure */ }
1274       |  If(?e,?then,?else): { /* branching */ }
1275       |  Quote(?x):          { return ?x; }
1276       |  ...:                { /* others */ }
1277       }
1278    }
1279 \end{verbatim}
1280
1281
1282 As an interesting note, the special form pattern can also be rewritten
1283 using regular expression string matching, as in the following:
1284
1285 \begin{verbatim}
1286  datatype law SpecialForm = #(ATOM /LAMBDA|IF|PROGN|QUOTE/ ... _)
1287 \end{verbatim}
1288
1289 In addition, since the law constructors \verb|Lambda|, \verb|Quote|,
1290 \verb|If|, \verb|Nil| and \verb|ProgN| are defined with the \verb|inline|
1291 keyword, these constructors can be used to within expressions as abbreviates
1292 for their rhs definitions.  For example, writing
1293 \verb|Lambda(x,x)| is the same writing \verb|#(ATOM("Lambda"),#(x),x)|.
1294
1295 \Subsubsection{Variants of match}
1296
1297 Aside from the usual plain pattern matching constructs, a few variants of the
1298 \verb|match| construct are offered.  We'll briefly enumerate a few of these:
1299 \begin{itemize}
1300   \item The \verb|matchall| construct is a variant of \verb|match| that
1301    executes all matching rules(instead of just the first one) in sequence.
1302   \item Each rule of a \verb|match| statement can have associated cost
1303     expressions.  Instead of selecting the first matching rule to
1304     execute as in the default, all matching rules are considered
1305     and the rule with the least cost is executed.  Ties are broken by
1306     choosing the rule that comes first lexically.  For example:
1307
1308    \begin{verbatim}
1309    match (ir)
1310    {  ADD(LOAD(?r0),?r1) \ e1: { ... }
1311    |  ADD(?r0,LOAD(?r0)) \ e2: { ... }
1312    |  ADD(?r0, ?r1)      \ e3: { ... }
1313    |  ...
1314    }
1315    \end{verbatim}
1316   \item A \verb|match| construct can be modified with the \verb|while|
1317      modifier, as in the following example.  A match modified thus is
1318      repeatedly matched until none of the patterns are applicable.
1319      For example, the following routine uses a \verb|match while| statement
1320      to traverse to the leftmost leaf.
1321
1322 \begin{verbatim}
1323    template <class T>
1324    Tree<T> left_most_leaf(Tree<T> tree)
1325    {  match while (tree)
1326       {  case node(t,_,_):  tree = t;
1327       }
1328       return tree;
1329    }
1330 \end{verbatim}
1331   \item Finally, the \verb|matchscan| variant of match can be used to
1332      perform string matching on a stream.  For example, a simple lexical
1333      scanner can be written as follows.
1334
1335      \begin{verbatim}
1336    int lexer (LexerBuffer& in)
1337    {  matchscan while (in)
1338       {  /if/:                     { return IF; }
1339       |  /else/:                   { return ELSE; }
1340       |  /while/:                  { return WHILE; }
1341       |  /for/:                    { return FOR; }
1342       |  /break/:                  { return BREAK; }
1343       |  /[0-9]+/:                 { return INTEGER; }
1344       |  /[a-zA-Z_][a-zA-Z_0-9]*/: { return IDENTIFIER; }
1345       |  /[ \t]+/:                 { /* skip spaces */ }
1346       |  ... // other rules
1347       }
1348    }
1349      \end{verbatim}
1350 \end{itemize}
1351
1352 \Subsection{Algebraic Datatypes} \label{sec:algebraic-datatypes}
1353    Algebraic datatypes are defined using the \T{datatype}
1354 construct.  Its syntax is as follows.
1355
1356 \begin{syntax}
1357 \NDEF{Datatype\_Decl}
1358    & \IS & \TDEF{datatype} \\
1359    &     & \quad \OPT{\ALT{\N{Datatype\_Spec}}{and}} \\
1360    &     & \quad \OPT{\TDEF{where type} \ALT{\N{Type\_Spec}}{and}} \\
1361    &     & \quad \OPT{\TDEF{law} \ALT{\N{Law\_Spec}}{and}} \\
1362    &     & \T{;} \\
1363 \end{syntax}
1364
1365 Each \T{datatype} declaration specifies a set of
1366 (potentially mutually recursive) types, a set of type abbreviations,
1367 and a set of pattern matching abbreviations called {\em laws}.
1368
1369 \begin{syntax}
1370 \NDEF{Datatype\_Spec}
1371    & \IS & \Id \OPT{\T{<} \LIST{\Id} \T{>}} \\
1372    &     & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1373    &     & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1374    &     & \quad \OPT{\T{=} \N{Cons\_Specs}} \\
1375 \NDEF{Cons\_Specs}
1376    & \IS & \ALT{\N{Cons\_Spec}}{|} \\
1377    &     & \quad \OPT{\T{public:} \N{Datatype\_Body}} \\
1378 \NDEF{Cons\_Spec}
1379    & \IS & \N{Simple\_Cons\_Spec}  \\
1380    &     &  \quad \OPT{\T{with} \T{\{} \N{Class\_Decl} \T{\}}} \\
1381 \NDEF{Simple\_Cons\_Spec}
1382    & \IS & \Id \OPT{\N{String}} & unit constructor \\
1383    & \OR & \N{String}              & unit constructor \\
1384    & \OR & \Id \OPT{\T{of}} \TypeExp  & constructor with arguments \\
1385    & \OR & \T{\#[} \T{]}                 & unit list constructor \\
1386    & \OR & \T{\#\{} \T{\}}               & unit list constructor \\
1387    & \OR & \T{\#(} \T{)}                 & unit list constructor \\
1388    & \OR & \T{\#[} \TypeExp \T{...} \TypeExp \T{]}
1389       & list constructor \\
1390    & \OR & \T{\#\{} \TypeExp \T{...} \TypeExp \T{\}}
1391       & list constructor \\
1392    & \OR & \T{\#(} \TypeExp \T{...} \TypeExp \T{)}
1393       & list constructor \\
1394    & \OR & \T{[|} \TypeExp \T{|]} & vector constructor \\
1395    & \OR & \T{(|} \TypeExp \T{|)} & vector constructor \\
1396    & \OR & \T{\{|} \TypeExp \T{|\}} & vector constructor \\
1397 \end{syntax}
1398
1399 Datatypes can be annotated with {\em qualifiers}.  They tell the \Prop{}
1400 translator to generate additional code for each datatype to provide
1401 additional functionality.
1402
1403 \begin{syntax}
1404 \NDEF{Datatype\_Qualifiers}
1405    & \IS & \MORE{\N{Datatype\_Qualifier}}{} \\
1406 \NDEF{Datatype\_Qualifier}
1407    & \IS & \TDEF{collectable} & garbage collectable \\
1408    & \OR & \TDEF{rewrite}     & optimized for tree rewriting \\
1409    & \OR & \TDEF{persistent}  & generate persistence interface \\
1410    & \OR & \TDEF{lexeme}      & used for parser/lexer tokens  \\
1411 \end{syntax}
1412
1413 Type specifications assign abbreviations to commonly used type expressions.
1414 They act like \T{typedef}'s in \Cpp, except that \Prop{} can make use
1415 of the type information provided by these specifications.
1416
1417 \begin{syntax}
1418 \NDEF{Type\_Spec}
1419    & \IS & \Id \T{=} \TypeExp \\
1420 \end{syntax}
1421
1422 Law specifications are used to define abbreviations to datatype
1423 patterns\footnote{Patterns are discussed in
1424 section~\ref{sec:pattern-matching}.}.
1425 They behave like macros in some sense, except that unlike macros, they are
1426 properly type checked by \Prop.  In addition, if the keyword \T{inline} is
1427 used, then \Prop{} will treat the lhs as an expression and generate
1428 a function of the same name that can be used to construct the datatype term.
1429
1430 \begin{syntax}
1431 \NDEF{Law\_Spec}
1432    & \IS & \OPT{\T{inline}} \Id \OPT{\N{Law\_Arg}} \T{=} \Pat \\
1433 \NDEF{Law\_Arg}
1434    & \IS & \Id \\
1435    & \OR & \T{(} \LIST{\Id} \T{)} \\
1436 \end{syntax}
1437
1438 \Prop{} recognizes the following set of type expressions.
1439 \begin{syntax}
1440 \NDEF{Type\_Exp}
1441    & \IS & \N{Type\_Id}                & type name \\
1442    & \OR & \TypeExp \T{*}              & pointer type \\
1443    & \OR & \TypeExp \T{\&}             & reference type \\
1444    & \OR & \N{Type\_Qualifier} \N{Type\_Id}      & qualified type \\
1445    & \OR & \N{Datatype\_Qualifier} \TypeExp       & annotation  \\
1446    & \OR & \T{(} \TypeExp \T{)}          & grouping \\
1447    & \OR & \T{(} \TypeExp \T{,}
1448       \LIST{\TypeExp} \T{)} & tuple type \\
1449    & \OR & \T{(} \TypeExp \T{,}
1450       \LIST{\TypeExp} \T{)} & tuple class \\
1451    & \OR & \T{\{} \LIST{\N{Lab\_Type\_Exp}} \T{\}} & record type \\
1452    & \OR & \TypeExp \T{=} \N{Exp} & default value \\
1453 \NDEF{Lab\_Type\_Exp} & \IS &
1454    \Id \T{:} \TypeExp \\
1455 \NDEF{Type\_Qualifier}
1456    & \IS & \T{class}       & a class type \\
1457    & \OR & \T{const}       & constant type \\
1458    & \OR & \T{rewrite}     & rewritable \\
1459    & \OR & \T{collectable} & garbage collected \\
1460    & \OR & \T{persistent}  & persistent object  \\
1461 \end{syntax}
1462
1463 \Subsubsection{Instantiating a datatype}
1464
1465 Each declared datatype should be {\em instantiated} in some implementation
1466 file using the \TDEF{instantiate datatype} statement.  The translator
1467 will generate additional code to implement the datatype.  The general
1468 syntax is:
1469 \begin{syntax}
1470  \NDEF{Instantiate\_Decl}
1471     & \IS &
1472        \TDEF{instantiate datatype} \LIST{\N{Type\_Spec}} \\
1473     & \OR &
1474        \TDEF{instantiate} \T{extern} \T{datatype} \LIST{\N{Type\_Spec}} \\
1475  \NDEF{Type\_Spec} & \IS & \Id \OPT{ \T{<} \LIST{\TypeExp} \T{>}} \\
1476 \end{syntax}
1477
1478 \Subsection{Pattern Matching} \label{sec:pattern-matching}
1479    The basic pattern matching construct is the \T{match} statement.
1480 A match statement can be seen as a generalization of \Cpp's \T{switch}
1481 statement, in which the expression that follows a \T{case} can be
1482 a general complex pattern.
1483
1484    Its basic syntax is as follows:
1485 \begin{syntax}
1486 \NDEF{Match} & \IS &
1487    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and}  \\
1488         & & \T{\{} \MORE{\T{case} \N{Match\_Rule}}{} \T{\}} & variant 1 \\
1489           & \OR &
1490    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \\
1491         & & \T{\{} \ALT{\N{Match\_Rule}}{|} \T{\}} & variant 2 \\
1492           & \OR &
1493    \N{Match\_Mode} \ALT{\T{(} \N{Exp} \T{)}}{and} \T{of} & variant 3 \\
1494         & & \quad \ALT{\N{Match\_Rule}}{|} \\
1495         & & \T{end} \T{match} \T{;} \\
1496 \NDEF{Match\_Mode} & \IS & \TDEF{match} \OPT{\T{while}} \\
1497                 & \OR & \TDEF{matchall} \OPT{\T{while}} \\
1498 \NDEF{Match\_Rule} & \IS & \Pat{} \OPT{\N{Guard}} \OPT{\N{Cost}}
1499         \T{:} \N{Match\_Action} \\
1500 \NDEF{Guard}       & \IS & \T{if} \N{Exp} \\
1501                 & \OR & \T{where} \N{Exp} \\
1502                 & \OR & \T{|} \N{Exp} \\
1503 \NDEF{Cost} & \IS & \verb|\| \Exp & cost expression \\
1504 \NDEF{Match\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
1505                   & \OR & \N{Code} & for variant 1 only \\
1506 \end{syntax}
1507
1508 The 3 syntactic variants of the match statement are equivalent in meaning.
1509 A \T{match} statement takes a list of match objects and a
1510 set of pattern matching rules.  The first rule that matches the pattern
1511 completely will be executed.  Notice that guards can be placed on each rule,
1512 and a rule is only applicable if the guards evaluate to true.
1513
1514 \paragraph{Cost minimization}
1515 In addition, if cost expressions are specified, then the matching process
1516 will proceed as follows:
1517 \begin{enumerate}
1518    \item Locate all applicable rules.  If no such rule exists then exit.
1519    \item Compute the cost expressions of all applicable rules.
1520    \item Choose the action that has the lowest cost to execute.  If
1521 a tie occurs, resolve by the textual order of the rules.
1522 \end{enumerate}
1523
1524 \paragraph{Match all rules}
1525 The \T{matchall} statement is a variant of \T{match}.  In the matchall
1526 mode, the actions of {\em all} applicable rules are executed in the
1527 textual order.  Caution: since the matching variable binding
1528 process occurs {\em before} the rules are executed, the rhs actions
1529 should not alter the matched value or the pattern variables may
1530 be bound to the wrong value.
1531
1532 \paragraph{Repetition}
1533 In addition, both \T{match} and \T{matchall} can be annotated with the
1534 \TDEF{while} modifier.  It this case the semantics of the matching
1535 construct is to repeat until no more rule is applicable.  For example,
1536 the following code fragment returns the last element of a list
1537 \begin{verbatim}
1538 datatype List = #[] | #[ int ... List ];
1539
1540 int last (List l)
1541 {  match while (l)
1542    { case #[]:         assert(0); // an error
1543      case #[i]:        return i;
1544      case #[h ... t]:  l = t;
1545    }
1546 }
1547 \end{verbatim}
1548
1549 \paragraph{Pattern syntax}
1550 The basic form of a pattern is a literal, which
1551 that matches only a single integer, real number, string, etc.  Complex
1552 patterns can be constructed inductively
1553 using datatype constructors, tuples, lists, and
1554 {\em logical pattern connectives}.
1555
1556 The syntax of the patterns is as follows:
1557 \begin{syntax}
1558 \NDEF{Pat} &
1559    \IS & \N{Integer} & matches integer literal \\
1560  & \OR & \N{Real}    & matches real literal \\
1561  & \OR & \N{Char}    & matches character literal \\
1562  & \OR & \N{Quark}   & matches quark literal \\
1563  & \OR & \N{String}  & matches string literal \\
1564  & \OR & \N{Regexp}  & matches any string that matches {\em regexp}\\
1565  & \OR & \N{Pat\_Var}  & a pattern variable; matches anything \\
1566  & \OR & \T{\_}      & wild card; matches anything \\
1567  & \OR & \N{Cons}     & matches a datatype constructor  \\
1568  & \OR & \N{Cons} \N{PatArg}& a constructor application pattern \\
1569  & \OR & \Id{} \T{as} \Pat & binds a variable to the sub-pattern \\
1570  & \OR & \Pat{} \T{:} \TypeExp & typed pattern \\
1571  & \OR & \Pat{} \T{||} \Pat  & matches either pattern \\
1572  & \OR & \Pat{} \T{\&\&} \Pat  & matches both patterns \\
1573  & \OR & \T{!} \Pat             & matches anything but the pattern \\
1574  & \OR & \T{(} \Pat{} \T{)}       & Grouping \\
1575  & \OR & \T{\#[} \LIST{\Pat} \T{]} & list pattern; exact length match \\
1576  & \OR & \T{\#[} \LIST{\Pat} \T{...} \T{]} & list pattern;
1577      non-exact length match \\
1578  & \OR & \T{[|} \LIST{\Pat} \T{|]} & vector pattern; exact length match \\
1579  & \OR & \T{[|} \LIST{\Pat} \T{...} \T{|]}
1580     & vector pattern; matches to the left \\
1581  & \OR & \T{[|} \T{...} \LIST{\Pat} \T{|]}
1582     & vector pattern; matches to the right \\
1583 \NDEF{PatArg} & \IS & \Pat \\
1584            & \OR & \T{(} \LIST{\Pat} \T{)} \\
1585            & \OR & \T{\{} \LIST{\N{Lab\_Pat}} \OPT{\T{...}} \T{\}} \\
1586 \NDEF{Lab\_Pat} & \IS & \Id \T{=} \Pat \\
1587 \NDEF{Pat\_Var} & \IS & \Id       \\
1588             & \OR & \T{?}\Id  \\
1589 \end{syntax}
1590
1591 \Subsection{Refining a datatype}
1592
1593    A datatype definition can be refined in a few ways:
1594
1595 \begin{enumerate}
1596   \item{\bf inheritance} A datatype can be declared to be inherited from
1597 one or more classes $C_1, C_2, \ldots, C_n$.
1598 The effect is that all variants of a datatype will be inherited from the
1599 classes $C_1, C_2, \ldots C_2$.  In addition
1600
1601   \item{\bf member functions}  Member functions and addition attributes
1602 can be attached to the datatype variants using the keyword \TDEF{with}.
1603 \end{enumerate}
1604
1605 For example, the following datatype \T{Exp} is inherited from some user
1606 defined classes \verb|AST| and \verb|Object|.   Furthermore, a virtual
1607 member function called print is defined.
1608
1609 \begin{verbatim}
1610 datatype Exp : public AST, virtual public Object
1611    = INT int
1612          with { ostream& print(ostream&); }
1613    | PLUS  Exp, Exp
1614          with { ostream& print(ostream&); }
1615    | MINUS Exp, Exp
1616          with { ostream& print(ostream&); }
1617    | MULT  Exp, Exp
1618          with { ostream& print(ostream&); }
1619    | DIV   Exp, Exp
1620          with { ostream& print(ostream&); }
1621    | VAR { name : const char *, typ : Type }
1622          with { ostream& print(ostream&); }
1623    public:
1624    {  virtual ostream& print (ostream&) = 0;
1625       void * operator new (size_t);
1626    }
1627 ;
1628 \end{verbatim}
1629
1630 The allocator and printing methods can be defined as follows:
1631 \begin{verbatim}
1632 void * classof Exp::operator new(size_t) { ... }
1633 ostream& classof INT::print(ostream& s)   { return s << INT; }
1634 ostream& classof PLUS::print(ostream& s)  { return '('<< #1 <<'+'<< #2 <<')'; }
1635 ostream& classof MINUS::print(ostream& s) { return '('<< #1 <<'-'<< #2 <<')'; }
1636 ostream& classof MULT::print(ostream& s)  { return '('<< #1 <<'*'<< #2 <<')'; }
1637 ostream& classof DIV::print(ostream& s)   { return '('<< #1 <<'/'<< #2 <<')'; }
1638 ostream& classof VAR::print(ostream& s)   { return s << name; }
1639 \end{verbatim}
1640
1641 The special type form \T{classof} {\em con} is used to reference
1642 the type of a constructor.  Arguments of a constructor uses the following
1643 naming convention: (i) if the constructor $C$ takes only one argument,
1644 then the name of the argument is $C$; (ii) if $C$ takes two or more
1645 unnamed arguments, then these are named \verb|#1|, \verb|#2|, etc;
1646 (iii) finally, labeled arguments are given the same name as the label.
1647
1648 For readability reasons, it is often useful to separate a datatype definition
1649 from the member functions and inheritance definitions.   The \TDEF{refine}
1650 declaration can be used in this situation.  For example, the above
1651 example can be restated more succinctly as follows:
1652
1653 \begin{verbatim}
1654 //
1655 // Datatype definition section
1656 //
1657 datatype Exp = INT int
1658              | PLUS  Exp, Exp
1659              | MINUS Exp, Exp
1660              | MULT  Exp, Exp
1661              | DIV   Exp, Exp
1662              | VAR { name : const char *, typ : Type }
1663 ;
1664
1665 //
1666 // Refinement section
1667 //
1668 refine Exp : public AST, virtual public Object
1669        { virtual ostream& print (ostream&) = 0;
1670          void * operator new (size_t);
1671        }
1672 and    INT, PLUS, MINUS, MULT, DIV, VAR
1673        { ostream& print(ostream&);
1674        }
1675 ;
1676 \end{verbatim}
1677
1678 The general syntax of the \TDEF{refine} declaration is as follows:
1679 \begin{syntax}
1680 \NDEF{Refine\_Decl}
1681    & \IS & \T{refine} \ALT{\N{Refine\_Spec}}{\T{and}} \T{;} \\
1682 \NDEF{Refine\_Spec}
1683    & \IS & \LIST{\N{Id}} \\
1684    &     & \quad \OPT{\T{:} \N{Inherit\_List}} \\
1685    &     & \quad \OPT{\T{::} \N{Datatype\_Qualifiers}} \\
1686    &     & \quad \OPT{\T{\{} \N{Datatype\_Body} \T{\}}} \\
1687 \end{syntax}
1688 Here, \N{Id} refers to either a constructor name or a datatype name.
1689
1690 \Subsection{Memory management}
1691
1692    By default, a datatype constructor calls the \verb|operator new| to
1693 allocated memory.  There are a few ways to change this behavior: (i)
1694 redefine the \verb|operator new| within the datatype using refinement; or
1695 (ii) inherit from a class that redefines this method.
1696
1697    If a {\em placement} \verb|operator new| is defined in datatype,
1698 the placement constructor syntax can be used invoke this
1699 operator:
1700 \begin{syntax}
1701    \NDEF{Placement\_Cons}
1702    & \IS & \N{Cons} \T{'(} \LIST{\Exp} \T{)} \T{(} \OPT{\LIST{\Exp}} \T{)} \\
1703    & \OR & \N{Cons} \T{'(} \LIST{\Exp} \T{)} \T{\{} \OPT{\LIST{\Id = \Exp}}
1704      \T{\}} \\
1705 \end{syntax}
1706
1707 For example, in the following the extra parameter \verb|mem| will
1708 be passed to placement \verb|new| operator.
1709 \begin{verbatim}
1710   Exp e1 = INT'(mem)(1);
1711   Exp e2 = VAR'(mem){ name = "foo", typ = t };
1712 \end{verbatim}
1713
1714 \Subsubsection{Garbage collection}
1715
1716   The support library contains an implementation\footnote{The implementation
1717 has only been tested on Linux and SunOS 4.1.x.  Please contact the
1718 author for details
1719 if you'd like to port the collector to your particular platform.} of a
1720 conservative garbage collector using two algorithms: one using
1721 a mostly copying scheme\cite{Mostly-copying,Gen-mostly-copying}
1722 and one using mark-sweep.
1723
1724   A datatype can be defined to be garbage collectable with the
1725 qualifier \TDEF{collectable}.  In the following example, the datatype
1726 \T{Exp} is will be allocated from the a garbage collected heap instead
1727 of the default heap.  Since it references another
1728 user defined garbage collectable object
1729 \verb|SymbolTable|, the pointer to that symbol table is also marked
1730 as \T{collectable}.
1731 \begin{verbatim}
1732    // SymbolTable is collectable
1733    class SymbolTable: public GCObject
1734    {
1735    public:
1736       class Id : public GCObject { ... };
1737       ...
1738    };
1739
1740    datatype Exp :: collectable
1741       = INT int
1742       | VAR (collectable SymbolTable::Id, collectable SymbolTable *)
1743       | ADD Exp, Exp
1744       | SUB Exp, Exp
1745       | MUL Exp, Exp
1746       | DIV Exp, Exp
1747       ;
1748 \end{verbatim}
1749
1750 The corresponding \TDEF{instantiate datatype} statement for \T{Exp}
1751 will generate the appropriate tracing methods for garbage collection.
1752
1753 Please see appendix~\ref{appendix:gc} for more details concerning
1754 garbage collection.
1755
1756 \Subsubsection{Persistence}
1757
1758   Datatypes can be interfaced with the persistent object library
1759 supplied with the translator using the \TDEF{persistent} qualifier.
1760 A persistent object defines the protocol for serializing its representation
1761 into an architecture independent format on a persistence stream.
1762
1763 For example, the following datatype declaration defines a persistent
1764 type \verb|Exp|.  Note that a pointer to the symbol table
1765 is annotated with the \T{persisent} qualifier to signify that the object
1766 associated with the pointer should be traversed during the serialization
1767 phase.
1768 \begin{verbatim}
1769    // SymbolTable is persistent
1770    class SymbolTable: public PObject
1771    {
1772    public:
1773       class Id : public PObject { ... };
1774       ...
1775    };
1776
1777    datatype Exp :: persistent
1778       = INT int
1779       | VAR (persistent SymbolTable::Id, persistent SymbolTable *)
1780       | ADD Exp, Exp
1781       | SUB Exp, Exp
1782       | MUL Exp, Exp
1783       | DIV Exp, Exp
1784       ;
1785 \end{verbatim}
1786
1787 The corresponding \TDEF{instantiate datatype} statement will generate
1788 the appropriate methods to communicate with a persistent stream.
1789 \begin{verbatim}
1790   instantiate datatype Exp;
1791 \end{verbatim}
1792
1793 To write an object $e$ to the datafile \verb|"foo"|, we can say:
1794 \begin{verbatim}
1795    #include <AD/persist/postream.h>
1796    ofstream out("foo");
1797    Postream pout(out);
1798    pout << e;
1799    out.close();
1800 \end{verbatim}
1801
1802 To read the object back from the file, we can say:
1803 \begin{verbatim}
1804    #include <AD/persist/pistream.h>
1805    EXP e;
1806    ifstream in("foo");
1807    Pistream pin(in);
1808    e = (EXP)read_object(pin);
1809    in.close();
1810 \end{verbatim}
1811
1812 For more details concerning persistent streams and persist objects,
1813 please see directory \verb|<AD/persist>| for details.
1814
1815 \Section{Inference} \label{sec:inference}
1816
1817    Semantic processing
1818 in compilers and other language processors, such as data flow analysis,
1819 can frequently be specified as in a rule-based,
1820 logical deductive style.  In \Prop, deductive inference using forward
1821 chaining\footnote{
1822 The current
1823 implementation of {\sf Prop} translates inference into very naive code.
1824 This feature will be phased out and replaced by
1825 the more general and much faster graph rewriting mechanism.
1826 }
1827 is provided as a built-in mechanism, orthogonal
1828 to pattern matching and rewriting, for manipulating user-defined
1829 algebraic datatypes.
1830
1831 Similar to syntax classes, {\bf inference classes} may be used
1832 for data encapsulation.  An inference class is a combination of a \Cpp{}
1833 class, a database of inference relations, and a collection of inference rules
1834 of the form {\em lhs \verb|->| rhs}.  The lhs of an inference rule is a set
1835 of patterns in conjunctive form.  During the inference process, a rule is
1836 fired when its lhs condition is satisfied.  A fired rule then executes
1837 the corresponding rhs action, which may assert or retract additional relations
1838 from the database.   Using multiple inheritance, it is possible to combine
1839 a rewriting class with an inference class such that the rewriting process
1840 generates new relations to drive the inference process, or vice versa.
1841
1842 \Subsection{An Example}
1843 Datatype relations are not a distinct kind of data structure but are in fact
1844 simply algebraic datatypes declared to have a \TDEF{relation}
1845 attribute.  For example, in
1846 the following definition three relation types \verb|Person|, \verb|Parent|
1847 and \verb|Gen| are defined.
1848
1849 \begin{verbatim}
1850    datatype Person :: relation = person (const char *)
1851        and  Parent :: relation = parent (const char *, const char *)
1852        and  Gen    :: relation = same_generation (const char *, const char *);
1853
1854    instantiate datatype Person, Parent, Gen;
1855 \end{verbatim}
1856
1857 Using these relations we can define an inference class that computes whether
1858 two persons are in the same generation.   Nine axioms are defined (i.e.
1859 those whose lhs are empty) in the following.  The two inference rules
1860 state that (1) a person is in the same generation as him/herself,
1861 and (2) two persons are in the same generation if
1862 their parents are in the same generation.
1863
1864 \begin{verbatim}
1865    inference class SameGeneration {};
1866
1867    inference SameGeneration
1868    {  -> person("p1") and person("p2") and
1869          person("p3") and person("p4") and
1870          person("p5");
1871
1872       -> parent("p1", "p2") and
1873          parent("p1", "p3") and
1874          parent("p2", "p4") and
1875          parent("p3", "P5");
1876
1877       person ?p -> same_generation (?p, ?p);
1878
1879       parent (?x, ?y) and parent (?z, ?w) and same_generation (?x, ?z)
1880       -> same_generation(?y, ?w);
1881    };
1882 \end{verbatim}
1883
1884 In general, datatypes qualified as \verb|relation|s will inherit
1885 from the base class \verb|Fact|, while a rewrite class definition
1886 implicitly defines two member functions used to assert and retract facts
1887 in the internal database.  For example, in the above example, the following
1888 protocol will be automatically generated by the inference compiler.
1889
1890 \begin{verbatim}
1891    class SameGeneration : ...
1892    {
1893    public:
1894        virtual Rete&    infer       ();       // start the inference process
1895        virtual ReteNet& operator += (Fact *); // assert fact
1896        virtual ReteNet& operator -= (Fact *); // retract fact
1897    };
1898 \end{verbatim}
1899
1900
1901
1902 Using these methods, an application can insert or remove relations
1903 from an inference class.  This will in turn trigger any attached inference
1904 rules of the class.
1905
1906 \Subsubsection{Another example}
1907
1908 Consider the following example, which is used to compute Pythagorean
1909 triangles.  Only one axiom and two rules are used.  The axiom and the
1910 first rule are used to assert the relations \verb|num(1)| to \verb|num(n)|
1911 into the database, where \verb|n| is limited by the term \verb|limit(n)|.
1912 The second inference rule is responsible for printing out only
1913 the appropriate combinations of numbers.
1914
1915 \begin{verbatim}
1916    datatype Number :: relation = num int | limit int;
1917
1918    inference class Triangle {};
1919
1920    inference Triangle
1921    {  ->  num 1;
1922
1923           num m
1924       and limit n | n > m
1925       ->  num (m+1);
1926
1927           num a
1928       and num b
1929       and num c | a < b && b < c && a*a + b*b == c*c
1930       ->  { cout << a << " * " << a << " + "
1931                  << b << " * " << b << " = "
1932                  << c << " * " << c << "\n";
1933           };
1934    };
1935 \end{verbatim}
1936
1937 Now, to print all the triangle identities lying in range of $1$ to $100$,
1938 we only have to create an instance of the inference class, insert the
1939 limit, and start the inference process, as in below:
1940
1941 \begin{verbatim}
1942    Triangle triangle;
1943    triangle += limit(100);
1944    triangle.infer();
1945 \end{verbatim}
1946
1947 \Subsection{Inference Class}
1948 \INCOMPLETE
1949
1950 \Subsection{Inference Rules}
1951 \INCOMPLETE
1952
1953 \Section{Tree Rewriting} \label{sec:tree-rewriting}
1954
1955 \Prop's tree rewriting mechanism let us transform a tree in
1956 algebraic datatype form into another tree according to a set of
1957 {\em rewriting rules}.  Unlike plain pattern matching described
1958 in section~\ref{sec:pattern-matching}, which only apply to the
1959 root of a tree, rewriting rules are applicable to all parts of
1960 the tree.  This allows the user to concentrate on developing the
1961 set of transformations; the actual traversal of the tree object
1962 is handled implicitly by \Prop.  When used properly, this
1963 mechanism has an advantage over plain pattern matching since
1964 rewriting rules remain valid even when a new variant to a datatype
1965 is introduced.
1966
1967 In \Prop, a rewriting system is developed in a similar manner as a parser.
1968 The first phase requires the user to defines a {\em rewrite class}
1969 to encapsulate the rewriting rules.
1970
1971 Frequently, the rewriting mechanism is used to collect information about
1972 a tree structure; furthermore, more transformation rules are {\em conditional}:
1973 i.e. we want them to be applicable only if certain conditions are satisfied.
1974 We can accomplish both of these tasks by defining data members and methods
1975 in the rewriting class.  These are accessible during the rewriting
1976 process.  Information collected during rewriting can be stored within
1977 the data members.
1978
1979 In the rewriting formalism, equational
1980 rules of the form {\em lhs} \verb|->| {\em rhs} are specified by the user.
1981 During processing, each instance of the lhs in a complex tree is replaced
1982 by an instance of the rhs, until no such replacement is possible.
1983 Equational rules can often be used to specify semantics based simplification
1984 (e.g. constant folding and simplification based on simple algebraic
1985 identities) or transformation(e.g. code selection in a compiler
1986 backend\cite{codegen-with-trees}).
1987
1988 Unlike plain pattern matching, however, the structural traversal process
1989 in rewriting is implicitly inferred from the type structure of an
1990 algebraic datatype, as specified in its definition.
1991
1992 There are two main forms of rewriting modes available:
1993 \begin{itemize}
1994    \item The first is {\bf normalization} mode: a given tree is reduced
1995 using the matching rules until no more redexes are available.  There
1996 are two modes of operations available:
1997        \begin{itemize}
1998           \item in {\bf replacement} mode, the redex of a tree will be
1999 physically overwritten.
2000           \item in {\bf applicative} mode, on the other hand, a new
2001 tree corresponding to the replacement value will be constructed.
2002        \end{itemize}
2003      Replacement mode is used as the default since it is usually
2004 the more efficient of the two.
2005    \item The second form is {\bf reduction} and {\bf transformation}.
2006 In this mode a tree parse of the input term is computed.  If cost functions
2007 are attached to the rules, then they will also be used to determine a
2008 minimal cost reduction sequence.  During this process attached actions of
2009 a rule may be invoked to synthesize new data.
2010 \end{itemize}
2011
2012    Each independent set of rewriting rules in \Prop{} is encapsulated
2013 in its own {\bf rewrite class}.  A rewrite class is basically a normal \Cpp{}
2014 class with a set of rewriting rules attached.   During rewriting, the
2015 data members and the member functions are visible according to the normal
2016 \Cpp{} scoping rules.  This makes it is easy to encapsulate additional data
2017 computed as a side effect during the rewriting process.
2018
2019 \Subsection{A rewriting example}
2020
2021    Consider an abbreviated simplifier for the well-formed formula datatype
2022 \verb|Wff| defined in the section~\ref{sec:Wff}.  The rewrite class for this
2023 can be defined as follows.  Since there is no encapsulated data in this
2024 example, only the default constructor for the class needs to be defined.
2025 A rewrite class definition requires the traversal list to be
2026 specified.  This is simply a list of datatypes
2027 involved in the rewriting traversal process.
2028 In this instance only \verb|Wff| is needed.
2029
2030 \begin{verbatim}
2031    rewrite class Simplify (Wff)
2032    {
2033    public:
2034       Simplify() {}
2035    };
2036 \end{verbatim}
2037
2038    The rewrite rules for the simplifier can then be specified succinctly as
2039 follows.  Like the \verb|match| statement, in general the rhs
2040 of a rewrite rule can be any statement.  A special statement
2041 \verb|rewrite(e)| can be used to rewrite the current redex into another form.
2042 If the rhs is of the form \verb|rewrite(e)|, then it can be abbreviated
2043 to \verb|e|, as in below:
2044 \begin{verbatim}
2045    rewrite Simplify
2046    {  And(F,  _):      F
2047    |  And(_,  F):      F
2048    |  And(T,  ?X):     ?X
2049    |  And(?X, T):      ?X
2050    |  Or (T,  _):      T
2051    |  Or (_,  T):      T
2052    |  Or (F,  ?X):     ?X
2053    |  Or (?X, F):      ?X
2054    |  Not(Not(?X)):    ?X
2055    |  Not(And(?X,?Y)): Or(Not(?X), Not(?Y))
2056    |  Not(Or(?X,?Y)):  And(Not(?X), Not(?Y))
2057    |  Implies(?X,?Y):  Or(Not(?X), ?Y)
2058    |  And (?X, ?X):     ?X
2059    |  Or (?X, ?X):      ?X
2060    |  Implies (?X, ?X): ?X
2061    // etc ...
2062    };
2063 \end{verbatim}
2064
2065 The rewrite class definition creates a new class of the same name.  This new
2066 class defines an implicit \verb|operator ()| with the protocol below.  This
2067 member function can be invoked to perform the rewriting in a functional
2068 syntax.
2069
2070 \begin{verbatim}
2071    class Simplify : ... {
2072    {  ...
2073    public:
2074       void operator () (Wff);
2075       // Wff operator () (Wff); // if rewrite class is applicative
2076    };
2077
2078    Wff wff = ...;
2079    Simplify simplify; // create a new instance of the rewrite class
2080    simplify(wff);    // rewrite the term wff
2081 \end{verbatim}
2082
2083
2084 \Subsubsection{Conditional rewriting and actions}
2085
2086    Rewriting rules may be guarded with predicates to limit their
2087 applicability.  In addition, the {\em rhs} of a rewrite rule is not limited
2088 to only a replacement expression: in general, any arbitrarily complex sequence
2089 of code may be used.  For example, in the following set of rewriting
2090 rules we use guards to prevent undesirable replacements to be made during
2091 expression constant folding:
2092
2093 \begin{verbatim}
2094    rewrite ConstantFolding
2095    {  ADD (INT a, INT b):  INT(a+b)
2096    |  SUB (INT a, INT b):  INT(a-b)
2097    |  MUL (INT a, INT b):
2098       {  int c = a * b;                      // silent overflow
2099          if (a == 0 || b == 0 || c / b == a) // no overflow?
2100          {  rewrite(INT(c)); }
2101          else
2102          {  cerr << "Overflow in multiply\n"; }
2103       }
2104    |  DIV (INT a, INT b) | b == 0:  { cerr << "Division by zero\n"; }
2105    |  DIV (INT a, INT b):  INT(a/b)
2106    |  // etc...
2107    };
2108 \end{verbatim}
2109
2110 \Subsection{Rewrite class} \label{sec:rewrite-class}
2111
2112 The syntax of a rewrite class declaration is as follows:
2113
2114 \begin{syntax}
2115 \NDEF{Rewrite\_Class\_Decl} & \IS & \TDEF{rewrite class} \Id
2116             \T{(} \LIST{\TypeExp} \T{)} \\
2117    & & \quad \OPT{\T{:} \N{Inherit\_List}}
2118         \OPT{\T{::} \MORE{\N{Rewrite\_Mode}}{}} \\
2119    & & \quad \T{\{} \N{Class\_Body} \T{\}} \T{;} \\
2120 \NDEF{Rewrite\_Mode} & \IS & \TDEF{treeparser} \\
2121                   & \OR & \TDEF{applicative} \\
2122                   & \OR & \TDEF{topdown} \\
2123 \end{syntax}
2124
2125 This is simply the usual \Cpp{} class declaration syntax
2126 extended to the following information:
2127 \begin{description}
2128    \item[a traversal list] enclosed in parenthesis.  The traversal
2129 list of a rewrite class defines the set of types that rewriting
2130 should traverse.  If a datatype object contains an argument of a type
2131 not listed in the traversal list, its value will not be altered.
2132    \item[rewrite mode] this defines the rewriting mode of the
2133 rewrite class.
2134 \end{description}
2135
2136 \Subsection{Rewriting rules}
2137
2138 Rewriting rules are specified in a rewrite declaration.
2139 The syntax of a rewriting specification is as follows:
2140
2141 \begin{syntax}
2142 \NDEF{Rewrite\_Decl}
2143    & \IS & \TDEF{rewrite} \Id & variant 1 \\
2144    &    & \T{\{} \OPT{\N{Index\_Decl}}
2145         \ALT{\T{case} \N{Rewrite\_Rules}}{} \T{\}} \\
2146    & \OR & \T{rewrite} \Id & variant 2 \\
2147    &    & \T{\{} \OPT{\N{Index\_Decl}} \ALT{\N{Rewrite\_Rules}}{|} \T{\}} \\
2148 \NDEF{Rewrite\_Rule}
2149   & \IS & \OPT{\N{Rewrite\_Modifier}} \OPT{\N{Id} \T{->}} \\
2150   & & \quad \Pat \OPT{\N{Guard}} \OPT{\N{Cost}} \T{:} \N{Rewrite\_Action} \\
2151 \NDEF{Rewrite\_Modifier}
2152   & \IS & \T{bottomup:} & bottom up mode \\
2153   & \OR & \T{topdown:} & top down mode \\
2154   & \OR & \T{before:} & before actions \\
2155   & \OR & \T{after:} & after actions \\
2156 \NDEF{Rewrite\_Action} & \IS & \T{\{} \N{Code} \T{\}} \\
2157                   & \OR & \Exp \\
2158 \end{syntax}
2159
2160 The name of a rewrite declaration should match the name of
2161 a rewrite class.
2162 The two syntactic forms of \T{rewrite} have equivalent semantics.
2163
2164 The special statement \T{rewrite}($e$) may be used
2165 inside the rhs action to rewrite the current redex into $e$.
2166 Note that \T{rewrite}($e$) is a branching statement; statements
2167 after \T{rewrite} are not reachable.
2168
2169 \Subsection{Rewriting modifiers}
2170 In version 2.3.0 onward, rewrite rules can be modified by the
2171 modifiers: \T{bottomup:}, \T{topdown:}, \T{before:} and \T{after}.
2172 These modifiers alter the order in which rewriting rules are applied.
2173
2174 Their meanings are as follows:
2175 \begin{description}
2176 \item[\TDEF{bottomup:}] This specifies the rewriting rules should
2177 be applied in the default bottom-up mode.
2178 \item[\TDEF{topdown:}] This specifies the rewriting rules should be
2179 applied in a topdown mode.  Topdown rules
2180 will be tried during the initial topdown traversal phase of the rewriting
2181 process.  The user can use this mode to specify actions to be performed
2182 before the bottomup traversal, i.e. before redexes are identified.
2183 \item[\TDEF{before:}]  This specifies that the rewriting rules that
2184 should be tried before the topdown phase.   In addition, unlike topdown mode,
2185 if state-caching if used (see
2186 \ref{sec:state-caching}), the rules are only tried once.
2187 Otherwise, these act just like topdown mode.
2188 \item[\TDEF{after:}]  This specifies the rewriting rules should be
2189 tried only after a redex has been reduced to normal form, i.e.
2190 after the bottomup rules.
2191 \end{description}
2192
2193 These modifiers act like delimiters, similar
2194 to the way scoping keywords like \T{public:}, \T{protected:} and
2195 \T{private:} are used to delimit declarations in C++.
2196 Note that all four modes of rules can be used together in a rewriting system
2197 under tree rewriting mode (see \ref{sec:rewriting-modes})
2198 and their executions are intermixed together.
2199
2200 \Subsubsection{Rewriting modifier example}
2201
2202 We'll use the following example, extracted from a query optimizer for
2203 a subset of the relational calculus, to demonstrate the use
2204 of the rewriting modifiers.
2205
2206 The query optimizer transforms the following abstract syntax, defined
2207 as a set of \Prop's datatypes.
2208 \begin{verbatim}
2209 datatype List<T> = #[] | #[ T ... List<T> ];
2210 datatype Literal = INT int
2211                  | STRING const char *
2212                  | BOOL Bool
2213
2214 and      Exp     = OP Id, Exps
2215                  | APP Id, Exps
2216                  | LIT Literal
2217                  | ID Id
2218                  | TUPLE Exps                  // [ E1, E2, ... En ]
2219                  | FORALL Id, Exp, Exp         // forall x in E1 . E2
2220                  | EXISTS Id, Exp, Exp         // exists x in E1 . E2
2221                  | GUARD (Exp, Exp)            //
2222                  | GENERATOR (Ids, Exps, Exp)  // [x_1,...,x_n] in X_1*...*X_n
2223                  | LET  (Id, Exp, Exp)
2224
2225 law inline Int  i    = LIT(INT i)
2226 |   inline Boolean b = LIT(BOOL b)
2227 |          True      = Boolean true
2228 |          False     = Boolean false
2229 |   inline And a,b   = OP("and",#[a,b])
2230 |   inline Or  a,b   = OP("or", #[a,b])
2231 |   inline Not a     = OP("not",#[a])
2232 |   inline Eq a,b    = OP("=",  #[a,b])
2233 |   inline Ne a,b    = OP("/=", #[a,b])
2234 |   inline Gt a,b    = OP(">",  #[a,b])
2235 |   inline Ge a,b    = OP(">=", #[a,b])
2236 |   inline Lt a,b    = OP("<",  #[a,b])
2237 |   inline Le a,b    = OP("<=", #[a,b])
2238 |   inline In a,b    = OP("in", #[a,b])
2239 |   inline Union a,b = OP("union", #[a,b])
2240 |   inline Count a   = OP("#", #[a])
2241
2242 where type Id         = const char *
2243 and        Ids        = List<Id>
2244 and        Exps       = List<Exp>
2245 ;
2246 \end{verbatim}
2247
2248 Note that existential and
2249 universal quantifiers are represented by the constructors \T{FORALL}
2250 and \T{EXISTS} respectively.  For example, $\exists x \in A.p$ is
2251 presented as the term $\T{EXISTS}(x,A,p)$.  Here $x$ is a {\em binding}
2252 occurrence for $x$; note that the variable $x$ may occur within
2253 the predicate $p$.  In addition, a list comprehension
2254 expression such as
2255 \[ \{ e : [x_1,\ldots,x_n] \in X_1
2256 \times \ldots \times X_n\ |\ p \} \]
2257 is represented as the compound term
2258 \[
2259   \T{GENERATOR}(\#[x_1,\ldots,x_n],\#[X_1,\ldots,X_n],\T{GUARD}(p,e))
2260 \]
2261
2262 Suppose we'll like to rename all variables in a query $Q$
2263 so that no two binding occurrences are given the same name.
2264 This can be easily accomplished using a combination
2265 of \T{topdown:} and \T{bottomup:} rules in the following manner:
2266 \begin{enumerate}
2267 \item[(i)] Use topdown rules to check for new variable bindings,
2268 for example in $\T{EXISTS}(x,A,p)$.
2269 For each new binding occurrence for $x$, create
2270 a new name for $x$.
2271 These will be performed before the subterms $A$ and $p$ are traversed.
2272 \item[(ii)] Whenever a variable $x$ is found during the sub-traversal
2273 of $A$ and $p$, rename the variable by looking up its new name.
2274 \item[(iii)] Finally, use bottomup rules to remove the current
2275 variable substitution whenever we're exiting a binding occurrence.
2276 \end{enumerate}
2277
2278 For example, the following set of rewriting rules accomplish this renaming
2279 task for our query language using exactly this method.
2280 \begin{verbatim}
2281 rewrite RemoveDuplicateNames
2282 {
2283    // We use before and after rules to remove duplicates from variable names.
2284    // As each new binding occurrence is found, we enter a new binding
2285    // into the current environment.  Identifiers found within
2286    // the binding occurrence are then renamed.
2287
2288 topdown: // insert new bindings
2289    EXISTS(x,_,_):     { new_binding(x); }
2290 |  FORALL(x,_,_):     { new_binding(x); }
2291 |  GENERATOR(xs,_,_): { new_binding(xs); }
2292 |  LET(x,_,_):        { new_binding(x); }
2293
2294          // rename variables
2295 |  ID x:             { rename(x); }
2296
2297 bottomup: // removes the binding
2298 |  EXISTS(x,A,_):      { old_binding(x); }
2299 |  FORALL(x,A,_):      { old_binding(x); }
2300 |  GENERATOR(xs,As,_): { old_binding(xs); }
2301 |  LET(x,_,_):         { old_binding(x); }
2302 };
2303 \end{verbatim}
2304
2305 Note that we have accomplished a depth first traversal using rewriting
2306 without writing any traversal code!  As a side benefit,
2307 since the traversal is automatically determined by
2308 the structure of the datatypes, we do not have to rewrite this
2309 renaming code even if the abstract syntax of the query language is
2310 extended, as long as no additional binding operators are added.
2311
2312 \Subsection{The \T{rewrite} statement}
2313
2314    While the rewrite class construct provides a very general abstraction
2315 for rewriting, in general its full power is unneeded.  It is often
2316 convenient to be able to perform rewriting on a term without having
2317 to make a new name for a class just for the occasion, especially if member
2318 functions and member data are unneeded.  To accommodate these situations,
2319 the \verb|rewrite| statement is provided
2320 to perform a set rewriting transformations on a term without having
2321 to define a temporary rewrite class.   It is simply syntactic sugar
2322 for the more general rewrite class and rewrite
2323 rules specifications.
2324 For example, a simplify routine for type \verb|Exp| defined above can be
2325 specified as follows:
2326
2327 \begin{verbatim}
2328    Exp simplify (Exp e)
2329    {  // transformations on e before
2330       rewrite (e) type (Exp)
2331       {  ADD (INT a, INT b):  INT(a+b)
2332       |  SUB (INT a, INT b):  INT(a-b)
2333       |  MUL (INT a, INT b):  INT(a*b)
2334       |  ...
2335       }
2336       // transformations on e after
2337       return e;
2338    }
2339 \end{verbatim}
2340
2341 The \verb|rewrite| normally performs the replacement in place.
2342 An applicative version of the same can be written as follows\footnote{The
2343 variable {\tt e} is assigned the new copy.}:
2344
2345 \begin{verbatim}
2346    Exp simplify (Exp e)
2347    {  rewrite (e) => e type (Exp)
2348       {  ADD (INT a, INT b):  INT(a+b)
2349       |  SUB (INT a, INT b):  INT(a+b)
2350       |  MUL (INT a, INT b):  INT(a*b)
2351       |  ...
2352       }
2353       return e;
2354    }
2355 \end{verbatim}
2356
2357 The syntax of the rewrite statement is as follows.  The
2358 traversal list of the set of rewrite rule is listed next to
2359 the keyword \T{type}.
2360
2361 \begin{syntax}
2362 \NDEF{Rewrite\_Stmt}
2363    & \IS & \TDEF{rewrite} \T{(} \Exp \T{)} \OPT{\T{=>} \Exp} \\
2364    &     & \quad \T{type} \T{(} \LIST{\TypeExp} \T{)} \\
2365    &     & \T{\{} \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \T{\}} \\
2366    & \OR & \TDEF{rewrite} \T{(} \Exp \T{)} \OPT{\T{=>} \Exp} \\
2367    &    & \quad \TDEF{type} \T{(} \LIST{\TypeExp} \T{)} \T{of} \\
2368    &    & \quad \OPT{\N{Index\_Decl}} \N{Rewrite\_Rules} \\
2369    &    & \T{end} \T{rewrite} \T{;} \\
2370 \end{syntax}
2371
2372 \Subsubsection{Rewriting modes} \label{sec:rewriting-modes}
2373    There are two basic modes of operations in a rewrite class: (i)
2374 {\em tree rewriting} and (ii) {\em tree parsing}.
2375 Tree rewriting repeatedly looks for redexes in a tree and transforms it into
2376 another tree.   There are two sub-modes of operations: (a) {\em applicative}
2377 and (b) {\em in-place}.  Applicative mode is specified using the
2378 \T{applicative} mode specifier when declaring a rewrite class.
2379 In this mode, a new tree is built from a
2380 bottom-up manner, and the subject tree is left unaltered.  The default
2381 mode is ``in-place.''  In this mode, the subject tree is overwritten
2382 with the transformed expression.
2383
2384    On the other hand, in tree parsing mode, the left hand side
2385 of a rewrite rules specification is treated as a tree grammar.
2386 The tree parser will look for a derivation of given tree object from
2387 the start non-terminal.
2388 A guarded rule will only be used if the guard evaluates to true.
2389
2390    If rules are annotated with cost expressions, then the
2391 tree parser will try to locate a derivation chain with a minimal total
2392 cost.  After a derivation tree is found, the tree can be repeated
2393 traversed.  The rhs actions will be invoked during this traversal process.
2394
2395 \Subsubsection{State caching optimization} \label{sec:state-caching}
2396
2397    The pattern matching process proceeds in a bottom up manner:
2398 for each subtree of the form $l(t_1,t_2,\ldots,t_n)$, $n \ge 0$,
2399 a state number is computed from the current tree label $l$ and the
2400 state numbers of its subtrees $t_1, t_2, \ldots, t_n$.  The set of matched
2401 rules can be directly computed from the current state number.
2402 This means that in the absence of redex replacements, pattern matching
2403 takes time proportional only $O(|t| + r)$, where $|t|$ is the size
2404 of the input tree $t$ and $r$ is time it takes to execute the rhs actions.
2405 That is, the runtime is insensitive to the number of rewrite rules or the
2406 complexity of the patterns.
2407
2408    However, if the input is a DAG rather than a tree\footnote{Rewriting
2409 of a generic graph structure is not recommended with \T{rewrite}.},
2410 or if replacements are performed, then the above analysis may not hold.
2411 Replacements during rewriting often require state information of the
2412 replacement term to be recomputed to further the matching process.
2413 Since computation of state can involve a complete traversal of a term,
2414 replacement can become expensive if the replacement term is large.
2415 For instance, consider the following replacement rule, which replaces
2416 all expressions of the form {\em 2*x} into {\em x+x}:
2417
2418 \begin{verbatim}
2419    rewrite class StrengthReduction
2420    {
2421       MUL (INT 2, ?x):  ADD(?x, ?x)
2422       ...
2423    }
2424 \end{verbatim}
2425
2426 \noindent
2427 Since the subterm \verb|?x| could be arbitrarily large, recomputing the
2428 state encoding for \verb|ADD(?x,?x)| naively takes time in proportion to the
2429 size of \verb|?x|.  However, since the subtree \verb|?x| occurs as part
2430 of the pattern, its state has already been computed, and it would be
2431 wasteful to recompute its state from scratch.
2432
2433 A similar observation can be made to the case when the input is a DAG,
2434 where many subcomponents are shared between nodes.  In order to speed
2435 up the rewriting process, the state of each shared copy of the nodes
2436 should be computed only once.
2437
2438 In order to speedup the state computation process in these situations,
2439 {\em \INDEX{state caching}} can be enabled by
2440 specifying a {\em rewriting index}.
2441 This can be accomplished in two ways: (i) use the qualifier
2442 \TDEF{rewrite} when defining a datatype.  This specifies the
2443 {\em primary index}.  Or alternatively, (ii) use index declarations
2444 to specify one or more {\em secondary indices}.
2445
2446 We'll first describe the first method.
2447 A \TDEF{rewrite} qualifier can be used
2448 in the definition of a datatype.  For instance:
2449
2450 \begin{verbatim}
2451    datatype Exp :: rewrite
2452       = INT (int)
2453       | ID  (const char *)
2454       | ADD (Exp, Exp)
2455       | SUB (Exp, Exp)
2456       | MUL (Exp, Exp)
2457       | DIV (Exp, Exp)
2458       ;
2459 \end{verbatim}
2460
2461 The \T{rewrite} qualifier tells the translator to generate an extra
2462 state field in the implementation of \T{Exp}.  This state field is implicitly
2463 updated during the rewriting process.  When the rewriter encounters a
2464 node with a cached state it can quickly short-circuit all unnecessary
2465 state computation.
2466
2467 {\bf A word of caution:} since each rewriting system computes
2468 its own encoding, rewriting systems should not be mixed if they shard the
2469 same index, i.e. actions of a
2470 rewriting system should not invoke another rewriting
2471 system on a term that is participating in the current rewriting system,
2472 if both systems use the same index.  This limitation can be overcome
2473 by using secondary indices, which we'll discuss next.
2474
2475 \Subsubsection{Specifying secondary indices}
2476
2477 From release 2.3.0 onward, it is possible to specify secondary indices
2478 within a rewriting system.  This makes it possible to invoke other
2479 rewriting systems within the execution of another, while retaining the
2480 state caching optimizations throughout.
2481
2482 There are two forms of secondary index: {\em internal index} and
2483 {\em external index}.  Internal indices are stored directly within
2484 a rewrite term and require the user to pre-allocate space for each term.
2485 In constrast, external indices are stored in a separate data structure
2486 such as a hash table.
2487
2488 The syntax of secondary index declarations is as follows:
2489 \begin{syntax}
2490 \NDEF{Index\_Decl} & \IS & \TDEF{index:} \LIST{\N{Index\_Spec}} \T{;} \\
2491 \NDEF{Index\_Spec} & \IS &
2492    \TypeExp{} \T{=} \T{none} & \C{Disable indexing} \\
2493    & \OR & \TypeExp{} \T{=} \N{Id} & \C{Internal index} \\
2494    & \OR & \TypeExp{} \T{=} \TDEF{extern} \N{Id} & \C{External index} \\
2495 \end{syntax}
2496
2497
2498 \Subsubsection{Using an internal index}
2499
2500 In order to make use of an index, certain member functions have to defined
2501 by the user.  Given an internal index named {\em foo}, the rewriter
2502 invokes two functions named
2503 \begin{quotation}
2504 \noindent
2505    {\tt int get\_{\em foo}\_state() const; } \\
2506    {\tt void set\_{\em foo}\_state(int); }
2507 \end{quotation}
2508 \noindent within the generated code.  The user should provide the
2509 implementations for these functions within the datatypes.
2510
2511 For example, reusing the well-formed formulas example (see
2512 section~\ref{sec:Wff}),
2513 an internal index on the datatype \T{Wff} can be implemented as follows:
2514 \begin{verbatim}
2515 #include <AD/rewrite/burs.h>
2516 class WffIndex {
2517    int state;
2518 public:
2519    WffIndex() : state(BURS::undefined_state) {}
2520    int get_wff_state() const { return state; }
2521    void set_wff_state(int s) { state = s; }
2522 };
2523
2524 datatype Wff : public WffIndex
2525    = F
2526    | T
2527    | Var     (const char *)
2528    | And     (Wff, Wff)
2529    | Or      (Wff, Wff)
2530    | Not     (Wff)
2531    | Implies (Wff, Wff)
2532    ;
2533
2534 ...
2535
2536 rewrite Simplify {
2537    index: Wff = wff;
2538    ...
2539 };
2540 \end{verbatim}
2541
2542 Here, the class \T{WffIndex} provides the internal index interface
2543 functions \verb|get_wff_state| and \verb|set_wff_state| expected by
2544 the rewriter.  Note that each term must be initialized with the
2545 state \verb|BURS::undefined_state|.  This constant is defined within
2546 the library include file \verb|<AD/rewrite/burs.h>|.
2547
2548 To enable the index, in the rewriter
2549 we specify that datatype \T{Wff} should make use of the index named \T{wff},
2550 using the \T{index:} declaration.
2551
2552 \Subsubsection{Using an external index}
2553
2554 External indices are specified in a similar manner.  Given a datatype
2555 {\em Foo} and an external index named {\em bar}, the rewriter invokes
2556 calls to the following functions:
2557 \begin{quotation}
2558 \noindent
2559    {\tt int get\_{\em bar}\_state({\em Foo}) const; } \\
2560    {\tt void set\_{\em bar}\_state({\em Foo}, int); }
2561 \end{quotation}
2562 Typically, these are implemented as member functions in the rewrite class.
2563
2564 The rewriter uses the function {\tt get\_{\em bar}\_state} to lookup
2565 previously cached information set by the function {\tt set\_{\em bar}\_state}.
2566 Typically, the implementation of the two functions can be implemented
2567 as hash tables, using the addresses of the terms as hash functions and
2568 pointer equality comparisons.   Note that caching can be {\em lossy}; i.e.
2569 it is perfectly alright for the cache to eliminate cached information to
2570 keep its size under control.  If no cache information is found,
2571 the function {\tt get\_{\em bar}\_state} should return
2572 \verb|BURS::undefined_state|.
2573
2574 \paragraph{Class \CLASSDEF{RewriteCache}}
2575    To make it easy for the users to implement their own external indices,
2576 two sample external index implementation are provided.  The first
2577 is the class \verb|RewriteCache| defined in the library file
2578 \verb|<AD/rewrite/cache.h>|.
2579
2580    The class \verb|RewriteCache| implements a simple fixed capacity
2581    cache with the following member functions.
2582 \begin{verbatim}
2583    RewriteCache();
2584    RewriteCache(int capacity);
2585   ~RewriteCache();
2586
2587    void clear();                   // after rewriting
2588    void initialize();              // before rewriting
2589    void initialize(int capacity);  // before rewriting/also set capacity
2590
2591    int capacity() const;
2592
2593    void set_state(const void * t, int s);
2594    int get_state(const void * t) const;
2595
2596    void invalidate(const void * t);
2597 \end{verbatim}
2598
2599 The user should call the function \verb|initialize| before rewriting
2600 in order to setup the cache and/or clear out previously cached result.
2601 The functions \verb|set_state| and \verb|get_state| can be used to
2602 implement the necessary state caching functions expected by the rewriter.
2603
2604 Returning to our \T{Wff} example, we have the following possible implementation
2605 of the external index:
2606 \begin{verbatim}
2607    rewrite class Simplify (Wff)
2608    {  RewriteCache cache;
2609    public:
2610       int get_wff_state(Wff t) const { return cache.get_state(t); }
2611       void set_wff_state(Wff t, int s) { cache.set_state(t,s); }
2612    };
2613
2614    rewrite Simplify
2615    {  index: Wff = extern wff;
2616       ...
2617    };
2618 \end{verbatim}
2619
2620 \begin{Tips}
2621 If manual memory management is used,
2622 a term should not be deallocated from memory if a cache
2623 contains a handle to it.  The method \verb|invalidate| can be used to
2624 make sure the entry associated with a term is removed from the cache.
2625 \end{Tips}
2626
2627 \paragraph{Class \CLASSDEF{GCRewriteCache}}
2628
2629 {\em This class has not be completely tested.}
2630
2631 Since a garbage collectable object is not reclaimed by the collector
2632 whenever there exists a reference to it, using the class
2633 \verb|RewriteCache| may create problems since objects are not reclaimed
2634 if it is referenced by the cache.  The template class \verb|GCRewriteCache|
2635 solves this problem by using {\em weak pointers} in its internal
2636 implementation.
2637
2638 The template class \verb|GCRewriteCache<T>| expects \verb|T| to be
2639 a class derived from \verb|GCObject|.
2640 This class implements the following functions:
2641 \begin{verbatim}
2642    GCRewriteCache();
2643    GCRewriteCache(int capacity);
2644   ~GCRewriteCache();
2645
2646    void clear();                   // after rewriting
2647    void initialize();              // before rewriting
2648    void initialize(int capacity);  // before rewriting/also set capacity
2649
2650    int capacity() const;
2651
2652    void set_state(T * t, int s);
2653    int get_state(T * t) const;
2654
2655    void invalidate(T * t);
2656 \end{verbatim}
2657
2658 The usage of these functions are identical to that of class
2659 \verb|RewriteCache|.
2660
2661 \Subsection{Short circuiting rewrite paths with \TDEF{cutrewrite}}
2662
2663    From version 2.3.0 onward, the \T{cutrewrite}($e$)
2664 statement may be used whereever a \T{rewrite}($e)$ statement is expected.
2665 The \T{cutrewrite} statement can be used to selectively ignore certain
2666 subterms during the rewriting process.
2667
2668 The semantics of \T{cutrewrite}($e$) is to replace the current redex
2669 with $e$ but do not bother with looking for other redexes in the replacement
2670 term.  This means that the replacement term will not be retraversed
2671 immediatedly.  Furthermore, the replaced term will not match any other
2672 left hand side sub-patterns except wildcards.
2673
2674    This feature is very useful for preventing the rewriter from
2675 looking inside certain terms.
2676
2677    To demonstrate, consider the simple following example:
2678 \begin{verbatim}
2679    datatype Exp = NUM of int
2680                 | ADD of Exp, Exp
2681                 | SUB of Exp, Exp
2682                 | MUL of Exp, Exp
2683                 | DIV of Exp, Exp
2684                 | FINAL of Exp
2685                 ;
2686    Exp e = ADD(NUM(1),MUL(NUM(2),NUM(3)));
2687 \end{verbatim}
2688
2689 Suppose we want to find all numbers within
2690 the expression \T{e} and increment their values by 1.
2691 The straight forward way of writing:
2692 \begin{verbatim}
2693    rewrite (e) type (Exp) {
2694       NUM x: NUM(x+1)
2695    }
2696 \end{verbatim}
2697 \noindent will not work, since the replacement term is a new redex,
2698 and the rewriting system will not terminate.
2699
2700 We can reimplement the rewriting system as a two stage process.
2701 First, we mark all terms that we do not want to be changed; whenever we
2702 find a marked term, we execute a \T{cutrewrite} to prevent the term
2703 from being changed.  Then we unmark the terms.
2704 \begin{verbatim}
2705    rewrite (e) type (Exp) {
2706       NUM x:   FINAL(NUM(x+1));
2707    topdown:
2708       FINAL _: cutrewrite;
2709    }
2710    rewrite (e) type (Exp) {
2711       FINAL x: x
2712    }
2713 \end{verbatim}
2714
2715 In the first rewrite statement, replacement terms that we
2716 want to stay fixed are enscapsulated
2717 within the auxiliary \T{FINAL} constructor.
2718 Recall that topdown rules are tried before the subterms of a redex are
2719 reduced.  Thus we can make sure all that
2720 all terms that are arguments to a \T{FINAL} term is left unaltered
2721 by adding a \T{topdown} rule that executes a \T{cutrewrite}\footnote{
2722 If an expression argument is not given to a \T{cutrewrite} statement, then
2723 the redex is left unchanged.}
2724 Finally, after the first rewrite statement finishes, we use
2725 the second rewrite statement to remove all \T{FINAL} terms generated
2726 in the first.
2727
2728 \begin{Tips}
2729 Since the semantics of a rewriting system with \T{cutrewrite} can
2730 be difficult to analyze, its use should be avoided as much as possible.
2731 \end{Tips}
2732
2733 \Subsection{Confluence and termination}
2734
2735    Currently no automatic checking is available to make sure that
2736 a set of rewriting rules is confluent and will terminate.  This is currently
2737 the user's responsibility.  Some verification features will be developed
2738 in the future.
2739
2740 \Section{Graph Types and Graph Rewriting} \label{sec:graph}
2741
2742 A typical compiler uses many types of graphs to represent internal program
2743 information.  For instance, control flow graphs,
2744 data dependence graphs, call graphs, def/use chains are
2745 some of the most common examples.  In order to automate the process
2746 of implementing and manipulating these graph structures, \Prop{} provides
2747 a generic formalism-- {\em graph types}--
2748 for specifying multisorted, attributed, hierarchical directed graphs in
2749 very high level manner.  The data structure mapping process of translating
2750 high level graph specifications into concrete \Cpp{} classes is completely
2751 automated.  Thus the user can concentrate on specifying the semantic
2752 relationships between the elements of the domain, instead of the
2753 implementation details.
2754
2755 The graph structure mapping process uses real-time set machine
2756 simulation\cite{real-time-SETL} and subtype analysis\cite{subtype-SETL}
2757 to convert associative accesses involving sets, maps, and multimaps
2758 into hash free, worst-case constant time pointer manipulations.
2759 This optimization is performed transparently by the translator.
2760
2761 Manipulation of graphs structures are done in three ways:
2762 \begin{description}
2763 \item[object-oriented]
2764  Using a standard interface generated from \Prop{} the user can
2765 manipulate a graph in the usual procedural/object-oriented manner.
2766 \item[set formalism]  Using an embedded SETL\cite{SETL}-like sublanguage,
2767 the user can manipulate the graphs using high level set operations
2768 such as set comprehension.
2769 \item[graph rewriting]  At the highest level, analysis and transformation
2770 can be carried out using the graph rewriting formalism.
2771 \end{description}
2772
2773 In this section we'll describe each of these topics in detail.
2774
2775 \Subsection{Graph Types} \label{sec:graph-types}
2776
2777 A \INDEX{graph type} declaration is used to specify a
2778 graph structure
2779 with multiple sorts of labeled nodes and labeled directed edges.
2780 Attributes can be attached to both nodes and edges.
2781 The syntax of \T{graphtype} declarations is as follows:
2782 \begin{syntax}
2783 \NDEF{Graph\_Type}
2784       & \IS & \TDEF{graphtype} \Id{} \\
2785       & & \quad \OPT{\T{:} \N{Inherit\_List}} \\
2786       & & \quad \OPT{\T{::} \MORE{\N{Graph\_Mode}}{}} \\
2787       & & \T{declare} \\
2788       & & \quad \TDEF{node:} \ALT{\N{Node\_Def}}{|} \\
2789       & & \quad \TDEF{edge:} \ALT{\N{Edge\_Def}}{|} \\
2790       & & \T{begin} \\
2791       & & \quad \N{Code} \\
2792       & & \T{end} \T{graphtype} \T{;} \\
2793 \NDEF{Graph\_Mode} & \IS & \\
2794 \NDEF{Node\_Def} & \IS & \Id{} \OPT{\OPT{\T{of}} \TypeExp{}} \\
2795 \NDEF{Edge\_Def}
2796     & \IS & \Id{} \OPT{\T{of}} \TypeExp{} \T{->} \TypeExp \\
2797     & \OR & \Id{} \OPT{\T{of}} \TypeExp{} \T{<->} \TypeExp \\
2798     & \OR & \Id{} \OPT{\T{of}} \TypeExp{} \T{<=>} \TypeExp \\
2799     & \OR & \Id{} \OPT{\T{of}} \TypeExp{} \T{<=>*} \TypeExp \\
2800 \end{syntax}
2801
2802 \Subsection{The Graph Interface} \label{sec:graph-interface}
2803 \INCOMPLETE
2804
2805 \Subsection{Set Formalisms} \label{sec:set-formalisms}
2806 \INCOMPLETE
2807
2808 \Subsection{Graph Rewriting} \label{sec:graph-rewriting}
2809 \INCOMPLETE
2810
2811 \Section{Running the Translator} \label{sec:usage}
2812
2813 The \Prop{} translator is a program called \verb|prop|.  The translator
2814 uses the following command line syntax:
2815 \begin{syntax}
2816   \NDEF{Running\_Prop} & ::= &
2817    \T{prop} \OPT{{\em prop\_options}} \ALT{{\em file}}{} \\
2818 \end{syntax}
2819
2820 Here, \N{File} is a file with suffix \verb|.p|$*$.  By default, the output
2821 file will have the same name with the \verb|p| extension removed.
2822 For example, a file named ``foo.pC'' will be translated into
2823 the file ``foo.C''  Similarly, a file named ``bar.ph'' will be translated
2824 into ``bar.h''
2825
2826 \Subsection{Options}
2827 The available options to the translator are as follows:
2828 \begin{description}
2829 \item[\OPTIONDEF{-G -GNU}]      Use GNU style error messages.
2830 \item[\OPTIONDEF{-I{\em path}}] Use the search path {\em path}.
2831 \Prop{} will search {\em path} to look for \Prop{} files with
2832 the suffix \verb|.ph| inside a \verb|#include| directive.  Multiple
2833 paths can be specified by using multiple \verb|-I| options.
2834 \item[\OPTIONDEF{-l -no\_line\_directives}]   By default, \Prop{} will try
2835 to generate \verb|#line| directives to correlate the source and
2836 the translated output.  This option suppresses this feature.
2837 \item[\OPTIONDEF{-M -make\_depends}]  Similarly to the \verb|-M| option
2838 in \verb|cc| and \verb|cpp|.  This option generates a suitable dependency
2839 list for Makefiles.
2840 \item[\OPTIONDEF{-n -no\_codegen}]
2841 Don't output any code; perform semantic checking
2842 only.
2843 \item[\OPTIONDEF{-N -non\_linear}]
2844 Use non-linear patterns during pattern matching.
2845 A non-linear pattern is one in which a variable occurs more than
2846 once.   By default, \Prop{} consider this an error.
2847 \item[\OPTIONDEF{-o{\em outfile}}]  Send output to {\em outfile} instead of
2848 the default.
2849 \item[\OPTIONDEF{-Ofast\_string\_match}]
2850 Generate (potentially) faster string matching
2851 code by expanding string literal tests into character tests.  By default,
2852 \Prop{} generates string tests using \verb|strcmp| and binary search.
2853 \item[\OPTIONDEF{-Oadaptive\_matching}]  Use the adaptive pattern matching
2854 algorithm\footnote{The implementation may not be effective.}.
2855 \item[\OPTIONDEF{-Oinline\_casts}]
2856 Generate inline type casts instead of function
2857 calls that do the same.  This may be faster for compilation.
2858 \item[\OPTIONDEF{-Otagged\_pointer}]
2859 Use a tagged pointer representation instead
2860 of representing variant tags explicitly in a structure field.  For
2861 example, if there are two variants called \verb|Foo| and
2862 \verb|Bar|  in a datatype.  Then a pointer to \verb|Foo| can be tagged
2863 with a low order bit 0 while a pointer to \verb|Bar| can be tagged with
2864 a low order bit 1.  Accessing the tag thus becomes just a simple bit
2865 test.  This should save space and may be faster\footnote{However, this feature
2866 has not been fully tested.}
2867 \item[\OPTIONDEF{-r -report}]
2868 Generate a report whenever necessary.  Parser, string
2869 matching, and rewriting constructs produce reports.
2870 \item[\OPTIONDEF{-s -strict}]  Use strict semantic checking mode.
2871 All warnings are considered to be errors.
2872 \item[\OPTIONDEF{-S -save\_space}]  Use space saving mode.
2873 Try not to use inline functions if code space can be saved.
2874 Datatype constructors will not be inlined
2875 in this mode.  Instead, datatype constructors will be generated at
2876 where the corresponding \TDEF{instantiate datatype} declaration occurs.
2877 \item[\OPTIONDEF{-t -stdout}]   Send translated program to the standard output.
2878 \item[\OPTIONDEF{-v{\em num}}]  Use verbose mode in report generation.
2879 This will provide more detailed information.
2880 \end{description}
2881
2882 \Subsection{Error Messages}
2883
2884 The following is a canonical list of all error messages generated by
2885 the \Prop{} translator.  Each message is prefixed by the
2886 file name and line number in which the error occurs.    Most of these
2887 errors messages are self explanatory; more detailed explanations are provided
2888 below.
2889
2890 \begin{Error}
2891 \errormsg{unknown option {\em option}}  The translator does not recognize
2892 the command line {\em option}.   Type {\em prop} to see a list of options.
2893 \errormsg{Error in command: {\em command}}  The translator fails when trying
2894 to execution {\em command}.  When multiple files are specified in a command
2895 line, {\em prop} invokes itself on each of the file.
2896 \errormsg{{\em pat} with type {\em type} is not unifiable}  Pattern has not
2897 been declared to be a unifiable type.
2898 \errormsg{Sorry: pattern not supported in rewriting: {\em pat}}  Pattern
2899 {\em pat} is not supported.  Currently, logical pattern connectives
2900 are not supported in rewriting.  You'll have to rewrite them into
2901 non-logical form.
2902 \errormsg{Unknown complexity operator: {\em op}}
2903 \errormsg{accessor is undefined for view pattern: {\em pat}} An accessor
2904 function has not been declared for a constructor.  When using datatype views
2905 \errormsg{arity mismatch (expecting $n$) in pattern: {\em pat}}  Pattern
2906 {\em pat} is expected to have arity $n$.   The arity is the number of
2907 expressions that you're matching concurrently.  If you are matching $n$
2908 objects then there must be $n$ patterns for each rule.
2909 \errormsg{arity mismatch in logical pattern: {\em pat}}  Logical patterns
2910 do not match the arity of a pattern.
2911 \errormsg{bad constructor type {\em type}}
2912 \errormsg{bad view constructor pattern: {\em pat}}
2913 \errormsg{can't find include file {\em file}}  Additional search paths
2914 can be specified with the \verb|-I| option
2915 \errormsg{component \#$i$ not found in constructor {\em con}}
2916 Datatype constructor {\em con} does not have a component named \#$i$.
2917 This can happen when you write a record constructor expression and misspelt
2918 one of the labels.
2919 \errormsg{component $l$ not found in constructor {\em con}}  Datatype
2920 constructor {\em con} does not have a labeled component $l$.
2921 \errormsg{constructor {\em con} already has print formats}
2922 \errormsg{constructor {\em con} is not a class}
2923 \errormsg{constructor {\em con} is not a lexeme}  A constructor is used
2924 as a terminal in the parser but it has not been declared to be a lexeme.
2925 \errormsg{constructor {\em con} doesn't take labeled arguments}  You're trying
2926 to use record constructor syntax on a constructor take does not take
2927 record arguments.
2928 \errormsg{constructor {\em con} is undefined}
2929 \errormsg{cyclic type definition in type {\em id} = {\em type}}
2930 Type abbreviations cannot be cyclic.  For recursive types, use \verb|datatype|
2931 definitions.
2932 \errormsg{datatype {\em T} is not a lexeme type}  {\em T} is used as a
2933 terminal within syntax class when it has not been declared as a lexeme type.
2934 All non-terminals must be a defined using the \verb|lexeme| qualifier.
2935 \errormsg{determinism {\em det} not recognized}
2936 \end{Error}
2937
2938 \begin{Error}
2939 \errormsg{duplicated definition of pattern constructor '{\em con}'}
2940   The constructor {\em con} has already been in another datatype.  {\em Prop}
2941   does not allow overloading of constructor names.
2942 \errormsg{duplicated label '$l$' in expression: {\em exp}}
2943   A record type has one of its labels duplicated.
2944 \errormsg{duplicated label '$l$' in type {\em type}}
2945 \errormsg{duplicated pattern variable '{\em id}'.  Use option -N}
2946    By default,
2947    pattern variables may not be duplicated within a pattern.  Non-linear
2948    patterns are allowed only when the option \verb|-N| is invoked.
2949 \errormsg{edge $e$ is not defined in graphtype {\em id} }
2950 \errormsg{empty type list in rewrite (...) ... }
2951 \errormsg{expecting $c_1$ ... $c_2$ (from line $l$) but found $c_3$ ... $c_4$
2952    instead}  Quotation symbols are not properly balanced.
2953 \errormsg{expecting non-terminal {\em nt} to have type $t_1$ but found $t_2$}
2954    Non-terminal {\em nt} has already been defined to have a
2955    synthesized attribute type of $t_1$ but $t_2$ is found.   This can happen
2956    if you have rules with the same lhs non-terminal previously defined.
2957 \errormsg{expecting type $t_1$ but found $t_2$ in pattern variable '{\em v}'}
2958 {\em Prop} performs type inference on all the lhs patterns to determine
2959 the types of all variables.  Type errors can occur if the patterns are
2960 miswritten.
2961 \errormsg{expecting type $t_1$ but found $t_2$}
2962 \errormsg{flexible vector pattern currently not supported in
2963 rewriting: {\em pat}}
2964 \errormsg{format \#$i$ used on constructor {\em con}}
2965 \errormsg{format \#$l$ used on non-record constructor {\em con}}
2966 \errormsg{function name mismatch: expecting $f$ ...}
2967 \errormsg{illegal character $c$}
2968 \errormsg{illegal context type: {\em type}}  A context type in
2969 a \verb|matchscan| statement must be previously defined to be datatype.
2970 The unit constructors of the datatype can be used as context values.
2971 \errormsg{illegal context(s) in pattern {\em pat}}
2972 \errormsg{illegal format '\_' on constructor {\em con}}
2973 \errormsg{illegal incomplete record pattern: {\em pat}}
2974 \errormsg{illegal print format '$c$' in constructor {\em con}}
2975 \errormsg{illegal record label '$l$' in expression: {\em exp}}
2976 \errormsg{illegal width in bitfield '{\em id} ($n$)'}
2977 \errormsg{inherited attribute '{\em type}' can only be used in treeparser
2978 mode in rewrite class {\em id}}
2979 \end{Error}
2980
2981 \begin{Error}
2982 \errormsg{law {\em id}(...) = {\em pat} is not invertible}   Pattern {\em pat}
2983 cannot be treated as an expression.  It may involve logical patterns and
2984 wild cards.
2985 \errormsg{law '{\em id}': bound variable '$v$' is absent in body {\em exp}}
2986 A parameter variable $v$ must be present within the body of a law.
2987
2988 \errormsg{law '{\em id}': type {\em type} cannot be used in parameter {\em id}}
2989
2990 \errormsg{lexeme {\em id} is undefined}
2991
2992 \errormsg{lexeme class {\em id} is undefined}
2993
2994 \errormsg{lexeme pattern is undefined for constructor {\em con}}
2995
2996 \errormsg{lexeme \{{\em id}\} already defined as {\em regexp}}
2997
2998 \errormsg{lexeme \{{\em id}\} is undefined in {\em regexp}}
2999
3000 \errormsg{missing '\}' in regular expression {\em regexp}}
3001
3002 \errormsg{missing label '$l$' in expression: {\em con} {\em exp}}
3003
3004 \errormsg{missing non-terminal in tree grammar rule: {\em nt}}
3005 \errormsg{missing type {\em type} in the traversal list of rewrite class {\em id}}
3006 Within the rewriting rules, you have used a pattern that involve a constructor
3007 of type {\em  type} directly but no such types are defined in the
3008 rewrite class definition.  You should add the type to the traversal list.
3009
3010 \errormsg{missing type info in expression {\em exp} : {\em type}}
3011    Sometimes {\em prop} fails to infer the type of an expression within
3012 a {\em match} statement.  It such
3013 cases it is necessary to help out the translator by adding additional
3014 type information in the patterns.
3015
3016 \errormsg{missing type info in function {\em f} {\em type}}  Similar to above
3017 \errormsg{missing type info in rule: {\em f} {\em pat} : {\em type}}
3018    Similar to above.
3019 \errormsg{missing view selector for pattern: {\em pat}}
3020 \errormsg{multiple mixed polarity pattern variable '{\em v}'}
3021 \errormsg{multiple unit constructors in polymorphic type {\em id} {\em arg}
3022 is not supported}
3023 \errormsg{negative cost {\em cost} is illegal}
3024 \errormsg{node {\em id} is not defined in graphtype {\em id}}
3025 \errormsg{non lexeme type {\em type} in pattern {\em pat}}
3026 \errormsg{non-relation type {\em type} in pattern: {\em pat}}
3027 \errormsg{pattern is undefined for lexeme {\em l}}
3028 \errormsg{pattern scope stack overflows}
3029 \errormsg{pattern scope stack underflows}
3030 \end{Error}
3031
3032 \begin{Error}
3033 \errormsg{pattern variable '$v$' has no binding at this point}
3034 \errormsg{persist object id is undefined for {\em con}}
3035 \errormsg{persistence pid {\em name} already allocated for type {\em type}}
3036 \errormsg{persistence redefined for type {\em type}}
3037 \errormsg{precedence symbol must be terminal: {\em term}}
3038 \errormsg{predicate {\em p}: expecting type $t_1$ but found $t_2$}
3039 \errormsg{redefinition of bitfield '{\em field}'.}
3040 \errormsg{redefinition of constructor '{\em con}'}
3041 \errormsg{redefinition of datatype {\em id}}
3042 \errormsg{redefinition of lexeme class {\em id}}
3043 \errormsg{replacement not in rewrite class: rewrite {\em exp}}
3044 \errormsg{rewrite class {\em id} has already been defined}
3045 \errormsg{rewrite class {\em id} is undefined}
3046 \errormsg{rule $r$ has incomplete type {\em type}}
3047 \errormsg{rule $r$ is of a non datatype: {\em type}}
3048 \errormsg{syntax class {\em id} has already been defined}
3049 \errormsg{syntax class {\em id} is undefined}
3050 \errormsg{synthesized attribute '{\em type}'
3051 can only be used in treeparser mode in rewrite class {\em id}}
3052 \errormsg{the class representation of constructor {\em id} has been elided}
3053 \errormsg{this rule is never selected: $r$}
3054 \errormsg{this rule is never used: $r$}
3055 \errormsg{too few arguments {\em arg} in instantiation of type scheme {\em type}}
3056 \errormsg{too many arguments {\em arg} in instantiation of type scheme {\em type}}
3057 \errormsg{type {\em type} is not a datatype}
3058 \errormsg{type {\em type} is undefined}
3059 \end{Error}
3060
3061 \begin{Error}
3062 \errormsg{type {\em id} = {\em type} is not a datatype}
3063 \errormsg{type {\em id} has already been defined as {\em type}}
3064 \errormsg{type {\em id} has unknown class: {\em C}}
3065 \errormsg{type '{\em type}' is not rewritable in treeparser mode rewrite class
3066 {\em id}}
3067     All datatypes used within the treeparser mode of rewriting must be defined
3068     with the \verb|rewrite| qualifier.
3069 \errormsg{type error in pattern {\em pat}: {\em type}}
3070 \errormsg{type mismatch between nonterminal {\em nt}(type $t_1$)
3071 and rule $r$(type $t_2$)}
3072 \errormsg{type mismatch in pattern: {\em pat}}
3073 \errormsg{type mismatch in rule $r$}
3074 \errormsg{type mismatch in rule $r$ {\em pat}}
3075 \errormsg{type or constructor {\em con} is undefined}
3076 \errormsg{unable to apply pattern scheme {\em pat}}
3077 \errormsg{unbalanced $c_1$ ... $c_2$ at end of file}
3078 \errormsg{undefined non-terminal {\em nt}}
3079 \errormsg{unification fails occurs check with $t_1$ and $t_2$}
3080 \errormsg{unmatched ending quote $c$}
3081 \errormsg{unrecognized quoted expression `{\em exp}`}
3082 \errormsg{unrecognized quoted pattern `{\em pat}`}
3083 \end{Error}
3084
3085 \pagebreak
3086 \appendix
3087
3088 \Section{Garbage Collection in the {\sf Prop} Library} \label{appendix:gc}
3089    In this appendix
3090 we describe the design and implementation of a garbage collector
3091 based on the Customisable Memory Management framework(CMM)\cite{CMM} in our
3092 \Prop{} \Cpp{} class library.  Like the previous approach, we have
3093 implemented a mostly copying conservative collector based on the work
3094 of Bartlett\cite{Mostly-copying}.
3095 Similar to CMM, our architecture provides a protocol to allow
3096 multiple garbage collectors using different algorithms to coexist in the
3097 same memory space.  A few improvements are made to
3098 improve the performance, the flexibility and the functionality of
3099 our collector: to reduce retention due to false roots identification,
3100 blacklisting\cite{Boehm} is used to identify troublesome heap
3101 addresses; the architecture of the system has been generalized so that it is
3102 now possible to have multiple instantiations of Bartlett-style heaps;
3103 finally, object finalization and weak pointer support
3104 are added.  Our collector has been tested on gcc 2.5.8 under Linux,
3105 and SunOS 4.1.x\footnote{An alpha version also works under Solaris. Thanks
3106 to Paul Dietz for the patch.}
3107
3108 \Subsection{Introduction}
3109    The \Prop{} project involves the design and implementation of
3110 an environment and an extension language based on \Cpp{}
3111 for compiler construction and program transformation.
3112 An outgrowth of this project is the \Prop{}
3113 \Cpp{} class library, which contains an extensive set of support algorithms
3114 and data structures.  Since a typical compiler manipulates many complex tree,
3115 dag and graph objects with extended lifetime, manual memory management using
3116 \Cpp's constructors and destructors, reference counting smart pointers, or
3117 some other techniques is frequently complex and error prone.
3118 Furthermore, with the advent of new algorithms for garbage collection,
3119 manual memory management techniques do not necessarily provide better
3120 performance or space utilization.  To make it possible to make use of garbage
3121 collection as a viable alternative for memory management in
3122 \Cpp\cite{Safe-C++-GC}, we have implemented a garbage collection class
3123 hierarchy as part of the \Prop{} library.  The class library can be used
3124 directly by the users who'd like to program in \Cpp; it can also be
3125 used as part of the runtime system of a highly level language implemented
3126 in \Cpp, as we have done for \Prop.
3127
3128    We have several good reasons to prefer garbage collection over manual
3129 memory management.  The \Prop{} language contains many declarative constructs
3130 and extensions such as algebraic datatypes, pattern matching, rewriting, and
3131 logical inference.  When a user programs in \Prop{} using these constructs,
3132 an applicative style of programming is the most natural paradigm.
3133
3134 \Subsection{The Framework}
3135
3136    We base our design on the work on the Customisable Memory Management(CMM)
3137 system\cite{CMM}.  In this framework, multiple independent heaps(including
3138 the usually non-collectable heap) can coexist with each other.
3139 Bartlett's mostly copying garbage collector is used as the primary collector.
3140 CMM extends the work of Bartlett\cite{Mostly-copying}
3141 by allowing cross heap pointers and unrestricted interior pointers.
3142
3143   However, all collectable objects are required to derive from a base class
3144 and reimplement a tracing method, which identifies the internal pointers of
3145 an object.  This burden is usually quite small and in fact can be
3146 automated from the type definition\footnote{In \Prop, a special keyword
3147 \T{collectable} is used to identify garbage collected classes and types.
3148 The \Prop{} translator uses this type annotation to derive the appropriate
3149 tracing methods.}
3150
3151   One of the major advantages of the CMM framework is that multiple
3152 coorperating collectors can coexist at the same time, which makes it
3153 possible to customize the behavior of each collector with respect to
3154 the lifetime behavior of the objects.  In \cite{CMM}, examples are
3155 given in which the lifetime of certain class of objects exhibit
3156 first-in/first-out behavior.  In this case a special collector can be
3157 written to take full advantage of this property.
3158
3159 \subsection{Our Framework}
3160    Our framework retains all the benefits and flexibilities of CMM,
3161 while extending it in several minor but useful ways:
3162 \begin{itemize}
3163    \item Like CMM, interior pointers(i.e. pointers to the middle of an object)
3164 and crossheap pointers (i.e. complex data structures linking nodes locating
3165 in multiple logical heaps.) are supported.  Pointers to collectable
3166 objects are non-intrusive: i.e. they are normal \Cpp{} pointers
3167 and not encapsulated in templates.
3168    \item Also like CMM, we allow multiple garbage collectors using different
3169 algorithms to coexist.  Unlike CMM, however, we allow multiple
3170 Bartlett-style collectors to be instantiated.  Most of the services
3171 involving low level page management and object marking have been relegated
3172 to a separate heap manager.
3173    \item We provide support for finalization and weakpointers.
3174    \item We have implemented blacklisting\cite{Boehm} to reduce the chance
3175 of false roots identification.
3176 \end{itemize}
3177
3178 \subsection{The Implementation}
3179
3180    Our implementation has been completely written from scratch since
3181 we do not desire to utilize copyrighted code and, more importantly,
3182 we have to make a few important architectural changes:
3183  All low level memory management services, such as management of
3184 the page table and the object bitmap, is now relegated to the class
3185 \CLASSDEF{GCHeapManager}.  Collectors now act as clients as of the heap manager
3186 and the Bartlett-style collector no longer has any special status.
3187
3188 \subsection{Architecture}
3189
3190    The architecture of the memory management system is partitioned into
3191 a few classes, each responsible for providing distinct services:
3192
3193 \begin{itemize}
3194    \item \CLASSDEF{GCHeapManager} --- The heap manager.  The heap manager
3195     manages the heap table, performs page level
3196     allocation and deallocation, and provides miscellaneous services
3197     like blacklisting.  It also manages the object bitmaps.
3198    \item \CLASSDEF{GC} --- The base class for all garbage collectors.
3199     This base class describes the protocol used by all the collector
3200     classes\footnote{This base class is also inherited from class
3201     \CLASSDEF{Mem}, so that it adheres to the \Prop{} memory management
3202     protocol.}
3203    \item \CLASSDEF{CGC} --- The base class for conservative collectors.
3204      This class is inherited from class \CLASSDEF{GC} and implements some
3205      methods for locating the stack, heap, and static data areas.
3206    \item \CLASSDEF{BGC} --- Bartlett-style mostly copying collector.
3207       This class is inherited from class \CLASSDEF{CGC} and implements
3208       the Bartlett mostly copying algorithm.
3209    \item \CLASSDEF{MarkSweepGC} --- Mark/sweep style conservative collector.
3210       This class is inherited from class \CLASSDEF{CGC} and implements
3211       a mark/sweep collection algorithm.
3212    \item \CLASSDEF{WeakPointerManager} --- The weakpointer manager.
3213       This class manages the weak pointer table and provides a few
3214       weak pointer scavenging services for the collectors.
3215 \end{itemize}
3216
3217 \Subsection{The Programmatic Interface}
3218
3219    The programmatic interface to the garbage collector involves two
3220 base classes, \CLASSDEF{GC} and \CLASSDEF{GCObject}.  The base class
3221 \CLASSDEF{GC} provides an uniform interface to all types of collectors
3222 and memory
3223 managers while class {\sf GCObject} provides the interface to all
3224 collectable classes.  Table~\ref{GC-Classes} contains a listing
3225 of the classes in our hierarchy.
3226
3227 \begin{table}
3228    \begin{center}
3229       \begin{tabular}{|l|l|} \hline
3230         Class           & Purpose \\ \hline \hline
3231         \sf GCObject    & Collectable object base class \\
3232         \sf GC          & Garbage collector base class \\
3233         \sf CGC         & Conservative garbage collector base class \\
3234         \sf BGC         & Bartlett style mostly copying collector \\
3235         \sf MarkSweepGC & A mark-sweep collector \\
3236         \sf UserHeap    & A heap for pointerless object \\
3237         \sf GCVerifier  & A heap walker that verifies the integrity of a
3238                           structure \\
3239       \hline
3240       \end{tabular}
3241    \end{center}
3242    \caption{\label{GC-Classes} Garbage Collection Classes.}
3243 \end{table}
3244
3245 Class \CLASSDEF{GCObject} is extremely simple: it redefines the operators
3246 \verb|new| and \verb|delete| to allocate memory from the default collector,
3247 and declares a virtual method ``\verb|trace|'' to be defined by subclasses
3248 (more on this later.)
3249
3250 Memory for a \CLASSDEF{GCObject} is allocated and freed using the usually
3251 \verb|new| and \verb|delete| operators.  Of course, freeing memory explicitly
3252 using \verb|delete| is optional for many subclasses of \CLASSDEF{GC}.
3253
3254 \begin{verbatim}
3255    class GCObject {
3256    public:
3257       inline void * operator new(size_t n, GC& gc = *GC::default_gc)
3258          { return gc.m_alloc(n); }
3259       inline void * operator new(size_t n, size_t N, GC& gc = *GC::default_gc)
3260          { return gc.m_alloc(n > N ? n : N); }
3261       inline void   operator delete(void *);
3262       virtual void trace(GC *) = 0;
3263    };
3264 \end{verbatim}
3265
3266 \subsection{Memory Allocation}
3267
3268 The base class \CLASSDEF{GC} is slightly more complex, as it has to provide
3269 a few different functionalities.  The first service that class \CLASSDEF{GC}
3270 must
3271 provide is of course memory allocation and deallocation.  As a time saving
3272 device we can specify what the default collector is using the methods
3273 \verb|get_default_gc| and \verb|set_default_gc|.  Method \verb|GCObject::new|
3274 will use this collector by default, unless placement syntax is used.
3275 Method \verb|GCObject::delete|, on the other hand, can correctly infer the
3276 proper collector to use by the address of the object.
3277
3278 The low level methods to allocate and deallocate memory are \verb|m_alloc|
3279 and \verb|free| respectively.  The programmers usually do not have to
3280 use these methods directly.
3281
3282 The method to invoke a garbage collection of a specific level is
3283 \verb|collect(int level)|.  This forces an explicit collection.
3284 Method \verb|grow_heap(size_t)| can also be used to explicitly increase
3285 the heap size of a collector.  Depending of the actual behavior
3286 of the subclasses, these methods may have different effects.
3287
3288 \begin{verbatim}
3289    class GC {
3290    protected:
3291       static GC * default_gc;
3292    public:
3293       static GC&  get_default_gc()       { return *default_gc; }
3294       static void set_default_gc(GC& gc) { default_gc = &gc; }
3295       virtual void * m_alloc   (size_t) = 0;
3296       virtual void   free      (void *);
3297       virtual void   collect   (int level = 0) = 0;
3298       virtual void   grow_heap (size_t) = 0;
3299       static  void garbage_collect() { default_gc->collect(); }
3300       virtual void set_gc_ratio(int);
3301       virtual void set_initial_heap_size (size_t);
3302       virtual void set_min_heap_growth   (size_t);
3303    };
3304 \end{verbatim}
3305
3306 \subsection{The GC Protocol}
3307 The collector and collectable objects must cooperate with
3308 each other by abiding to a simple protocol:
3309 \begin{itemize}
3310    \item All objects that are to be garbage collected must be derived
3311 from \CLASSDEF{GCObject}.  The application programmer must also supply a
3312 ``{\bf tracing}'' method.  The purpose of this method is to identify
3313 all internal pointers to other \CLASSDEF{GCObject}'s.   This method is not
3314 used by the application programmer directly but only used internally
3315 by the garbage collectors.
3316    \item The tracing method of each collectable must in turn call
3317     the tracing method in the class \CLASSDEF{GC} with each pointer to
3318     a collectable object that the object owns:
3319 \begin{verbatim}
3320    class GC {
3321    public:
3322       virtual GCObject * trace (GCObject *) = 0;
3323       inline  void trace (GCObject& obj);
3324    };
3325 \end{verbatim}
3326
3327      Briefly, the rules are as follows:
3328    \begin{enumerate}
3329       \item The tracing method of a collectable \T{Foo} has the following
3330          general form:
3331           \begin{verbatim}
3332              void Foo::trace(GC * gc)
3333              {
3334                 ...
3335              }
3336           \end{verbatim}
3337       \item If class \T{Foo} has a member that is a pointer \verb|p|
3338             to a collectable object of type \T{Bar}, then add:
3339            \begin{verbatim}
3340                p = (Bar)gc->trace(p);
3341            \end{verbatim}
3342             to the body of \verb|Foo::trace|.
3343      \item  If class \T{Foo} has a member object
3344             \verb|x| that is a subclass of \T{GCObject}, also add:
3345            \begin{verbatim}
3346                gc->trace(x);
3347            \end{verbatim}
3348             to the body of \verb|Foo::trace|.
3349      \item  If class \T{Foo} is derived from a class \verb|Bar| that
3350             is a subclass of \T{GCObject}, add:
3351            \begin{verbatim}
3352                Bar::trace(gc);
3353            \end{verbatim}
3354             to the body of \verb|Foo::trace|.
3355    \end{enumerate}
3356     Notice that these methods can be arranged in any order.
3357 \end{itemize}
3358
3359 This protocol can be used by both copying and non-copying collectors.
3360 In addition, the class \CLASSDEF{GCVerifier} also uses this protocol to
3361 walk the heap in order to verify the integrity of a garbage collected
3362 data structure.
3363
3364 \subsection{Messages and Statistics}
3365     All garbage collectors use the following protocols for status
3366 reporting and statistics gathering.
3367
3368 \begin{verbatim}
3369    class GC {
3370    public:
3371       enum GCNotify {
3372          gc_no_notify,
3373          gc_notify_minor_collection,
3374          gc_notify_major_collection,
3375          gc_notify_weak_pointer_collection,
3376          gc_print_collection_time,
3377          gc_print_debugging_info
3378       }
3379       virtual int      verbosity() const;
3380       virtual void     set_verbosity(int);
3381       virtual ostream& get_console() const;
3382       virtual void     set_console(ostream&);
3383    };
3384 \end{verbatim}
3385
3386 \begin{verbatim}
3387    class GC {
3388    public:
3389       struct Statistics {
3390          const char *   algorithm;
3391          const char *   version;
3392          size_t         bytes_used;
3393          size_t         bytes_managed;
3394          size_t         bytes_free;
3395          struct timeval gc_user_time;
3396          struct timeval gc_system_time;
3397          struct timeval total_gc_user_time;
3398          struct timeval total_gc_system_time;
3399       }
3400       virtual Statistics statistics();
3401    };
3402 \end{verbatim}
3403
3404     Each collector has a current verbosity level, which can be set
3405 and reset using the methods \verb|set_verbosity(int)| and
3406 \verb|verbosity() const|.  The verbosity level is actually a bit
3407 set containing flags of the type \verb|GC::GCNotify|.  A combination
3408 of these options can be used.
3409 \begin{itemize}
3410   \item \verb|gc_no_notify| --- no messages at all.
3411   \item \verb|gc_notify_minor_collection| --- all minor collections
3412     will be notified by printing a message on the current console.
3413   \item \verb|gc_notify_major_collection| --- similarly for all major
3414     collections.
3415   \item \verb|gc_notify_weak_pointer_collection| --- notify the user
3416   when weak pointers are being collected.
3417   \item \verb|gc_print_collection_time| --- this option will let the
3418     collector to print the time spent during collection (and finalization).
3419   \item \verb|gc_print_debugging_info| --- this option will print additional
3420     information such as stack and heap addresses during garbage collection.
3421 \end{itemize}
3422
3423    The current console of a collector is defaulted to the \Cpp{} stream
3424 \verb|cerr|.  It can be set and inspected with the methods
3425 \verb|set_console(ostream&)| and \verb|ostream& get_console()| respectively.
3426 For example, the user can redirect all garbage collection messages
3427 to a log file \verb|"gc.log"| by executing the following during initialization:
3428
3429 \begin{verbatim}
3430    ofstream gc_log("gc.log");
3431    GC::get_default_gc().set_console(gc_log);
3432 \end{verbatim}
3433
3434 \subsection{The Bartlett style mostly copying collector}
3435
3436    Currently a Bartlett-style mostly copying collector has been implemented.
3437 The current version is non-generational but we expect that a generational
3438 version will be available in the near future.
3439
3440    The Bartlett-style collector is implemented as the concrete class
3441 \CLASSDEF{BGC}.  Aside from all the services provided by class \CLASSDEF{GC},
3442 \CLASSDEF{BGC} also provides the following method:
3443
3444 \begin{verbatim}
3445    class BGC : public CGC {
3446    public:
3447       virtual void set_gc_ratio(int);
3448       virtual void set_initial_heap_size (size_t);
3449       virtual void set_min_heap_growth   (size_t);
3450    }
3451 \end{verbatim}
3452
3453   The gc ratio refers to the ratio of used heap space versus
3454 total heap space.  Class \CLASSDEF{BGC} will invoke the garbage collection
3455 if this is exceeded.  The default gc ratio for \T{BGC} is 75\%.
3456
3457   The initial size of the heap and the minimal number of bytes to
3458 request from the system during a heap expansion can be adjusted using
3459 the methods \verb|set_initial_heap_size| and \verb|set_min_heap_growth|
3460 respectively.  These methods are declared in the \T{GC}
3461 protocol.   By default, the initial heap size for this collector
3462 is 128K and each heap expansion will increase the heap by 512K.
3463
3464 If an application uses a lot garbage collected storage it is a good
3465 idea to set the heap size to larger capacity during initialization.
3466 Otherwise, the collector will have to perform more garbage collection
3467 and heap expansions to reach a stable state.
3468
3469 \subsection{The Mark Sweep collector}
3470
3471    An alternative to the copying collector is the the mark sweep collector.
3472 Like the previous collector, this collector also uses conservative scanning
3473 to locate roots.  Unlikely the Boehm collector, type accurate marking
3474 is used through the user supplied tracing method.
3475
3476    Since the default collector is of the \T{BGC} variety, the user must
3477 create an instance of \T{MarkSweepGC} if the mark sweep collector
3478 is to be used.
3479
3480 \begin{verbatim}
3481    class MarkSweepGC : public CGC {
3482    public:
3483       virtual void set_gc_ratio(int);
3484       virtual void set_initial_heap_size (size_t);
3485       virtual void set_min_heap_growth   (size_t);
3486    }
3487 \end{verbatim}
3488
3489    The gc ratio in class \T{MarkSweepGC} determines whether heap
3490 expansion is to be performed after a collection.  The default gc ratio
3491 is 50\%, thus if after garbage collection the ratio of used versus total
3492 heap space exceeds one half, the heap will be expanded.
3493
3494    For most purposes the two collectors are interchangeable.  Since
3495 all types of garbage collectors under our framework use the same protocol,
3496 applications can be written without fixing a specific garbage collection
3497 algorithm before hand.
3498
3499 \subsection{Finalization}
3500
3501    One common \Cpp{} idiom uses constructor and destructor for
3502 resource allocation: resources (including memory but may include others,
3503 such as file handles, graphical widgets, etc) that are acquired
3504 in a class constructor are released(finalized) in the class'es destructor.
3505
3506    There are, however, two opposing views in how finalization should be handled
3507 in a garbage collector.  The first assumes that garbage collection
3508 simulates an infinite amount of memory and so automatic finalization is
3509 inapproriate.   The second takes a more pragmatic view, and assumes that
3510 automatic finalization should be provided since otherwise explicit resource
3511 tracking must be provided by the user for collectable datatypes, making garbage
3512 collection much less useful than it can be.
3513
3514    We will refrain from participating in any religious and philosophical
3515 wars on which dogma is the ``One True Way''.
3516 nstead, both types of collectors are provided.
3517
3518    By default, all garbage collection classes do
3519 not perform finalization on garbage
3520 collected objects.  If object finalization is desired then the user
3521 can do one of two things:
3522 \begin{itemize}
3523    \item Enable the finalization of the default heap by putting
3524 \begin{verbatim}
3525    GC::get_default_gc().set_finalization(true);
3526 \end{verbatim}
3527    in the initialization code, or
3528    \item Instantiate a different instance of the garbage collector
3529     and allocate objects needing finalization from this collector.
3530     This may be a better method since not all objects need finalization
3531     and those that do not can still be allocated from the default collector.
3532     This way we only have to pay for the performance penalty (if any)
3533     proportional to the usage of finalization.
3534 \end{itemize}
3535
3536    For example, if a collectable class named \T{Foo} should be properly
3537 finalized we can declare it in the following manner:
3538
3539 \begin{verbatim}
3540    extern BGC bgc_f;  // somewhere an instance is defined
3541
3542    class Foo : public GCObject {
3543       Resource r;
3544    public:
3545       Foo() { r.allocate(); /* allocate resource */ }
3546      ~Foo() { r.release(); /* release all resource */ }
3547
3548       // Make object allocate from bgc_f instead
3549       // of the default non-collectable gc.
3550       void * operator new (size_t n) { return bgc_f.m_alloc(n); }
3551    };
3552 \end{verbatim}
3553
3554    When an instance of class \verb|Foo| is identified as garbage, its
3555 destructor will be called.  [Aside: {\it currently issue such as
3556 finalization synchronization is not handled directly by the collector.
3557 So if there is synchronization constraints during finalization it must
3558 be handled by the object destructor.  Future versions of \Prop{} will
3559 provide subclasses with more sophisticated finalization mechanisms.} ]
3560
3561    Notice that the default \verb|delete| operator of all garbage collectable
3562 classes will call the object destructor explicitly.  It is a good idea to use
3563 explicit \verb|delete| if finalization is a rare need since this service
3564 involves an explicit scan of the garbage memory, which may degrade performace
3565 with excessive swapping: without finalization the garbage pages will not
3566 have to be referenced with a copying collector.
3567
3568    It should also be noted that since the collector is conservative, there
3569 is no guarantee that garbage will be identified as such:
3570 there is no guarantee that all garbage resources will be released.
3571 In general, however, the efficacy of the collector is quite good and
3572 so non-critical or plentiful resources can be safely finalized with
3573 this collector.
3574
3575 \Subsubsection{Weak Pointers}
3576
3577    It is frequently very useful to be able to keep track of garbage
3578 collectable objects through the use of ``{\bf weak pointers}.''
3579 Collectors ignore the presence of weak pointers to garbage collected
3580 objects; objects with only referencing weak pointers will still be
3581 collected.  Weak pointers to objects that become garbage will
3582 be automatically reset to null.
3583
3584    Weak pointers are provided through a smart pointer template
3585 \CLASSDEF{WeakPointer}, whose definition is shown below:
3586
3587 \begin{verbatim}
3588 template <class T> class WeakPointer {
3589    public:
3590       inline WeakPointer();
3591       inline WeakPointer(const WeakPointer<T>& wp);
3592       inline WeakPointer(T * ptr);
3593       inline WeakPointer<T>& operator = (const WeakPointer<T>& wp);
3594       inline WeakPointer<T>& operator = (T * ptr);
3595       inline bool is_null() const;
3596       inline operator const T * () const;
3597       inline operator       T * ();
3598       inline const T * operator -> () const;
3599       inline       T * operator -> ();
3600       inline const T&  operator *  () const;
3601       inline       T&  operator *  ();
3602    };
3603 \end{verbatim}
3604
3605 A weakpointer to a garbage collectable
3606 class \verb|A| can be defined as \verb|WeakPointer<A>|.
3607 If a \Prop{} datatype \verb|T| has been defined, a weakpointer to type
3608 \verb|T| can defined using the
3609 \TDEF{classof} keyword.  For example:
3610
3611 \begin{verbatim}
3612    datatype Wff :: collectable = True | False | And(...)  | ...;
3613
3614    WeakPointer<classof Wff> a = And(True,False);
3615 \end{verbatim}
3616
3617 \Subsubsection{The Heap Walker}
3618
3619    There is a simple class called \CLASSDEF{GCVerify} that can be used
3620 to verify the integrity of a complex allocated data structure.
3621 This is frequently useful during the development of a new collector,
3622 or a complex class derive class of \T{GCObject}.
3623
3624    The interface of this class is shown below.
3625
3626 \begin{verbatim}
3627    class GCVerifier : protected GC {
3628    public:
3629       virtual bool is_valid_pointer          (GCObject *);
3630       virtual bool is_valid_interior_pointer (GCObject *);
3631       virtual bool is_valid_structure        (GCObject *);
3632       virtual bool is_valid_pointer          (GCObject *, GC *);
3633       virtual bool is_valid_interior_pointer (GCObject *, GC *);
3634       virtual bool is_valid_structure        (GCObject *, GC *);
3635       size_t number_of_nodes() const;
3636    };
3637 \end{verbatim}
3638
3639    Class \CLASSDEF{GCVerify} is derived from class \CLASSDEF{GC} so that the
3640 same object tracing protocol can be used.  Briefly, three different methods
3641 are provided for verification:
3642 \begin{itemize}
3643    \item Call to \verb|is_valid_pointer(p,gc)| verifies that
3644     \verb|p| is a valid pointer to an object within the collector \verb|gc|.
3645    \item Call to \verb|is_valid_pointer(p,gc)| verifies that
3646     \verb|p| is a valid interior pointer to an object
3647      within the collector \verb|gc|.
3648    \item Call to \verb|is_valid_structure(p,gc)| verifies the entire
3649      structure reachable by \verb|p| is valid.  This method uses
3650      \verb|GCObject::trace| to locate the correct sub-structures.
3651 \end{itemize}
3652
3653    There are alternative versions of the same methods that assumes
3654 the default collector is used.   Furthermore
3655 \begin{itemize}
3656    \item Method \verb|number_of_nodes()| returns the node count of the
3657 last structure traced using \verb|is_valid_structure|, and
3658    \item If \verb|set_verbosity(GC::gc_print_debugging_info)| is used
3659 then error messages will be printed during structure tracing.
3660 \end{itemize}
3661
3662 \bibliography{refman}
3663
3664 \input{refman.ind}
3665 \end{document}