From 7521c3ee91a8c52cc152de6fa8b4c88b44fc3e33 Mon Sep 17 00:00:00 2001
From: Nick Mathewson <nickm@torproject.org>
Date: Mon, 20 Jul 2015 12:05:44 -0400
Subject: [PATCH] Document the torrc format as thoroughly as possible

Closes ticket 2325
---
 changes/ticket2325   |   7 ++
 doc/include.am       |   3 +-
 doc/torrc_format.txt | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/common/util.c    |  33 +-------
 4 files changed, 218 insertions(+), 32 deletions(-)
 create mode 100644 changes/ticket2325
 create mode 100644 doc/torrc_format.txt

diff --git a/changes/ticket2325 b/changes/ticket2325
new file mode 100644
index 0000000000..b96e514ae2
--- /dev/null
+++ b/changes/ticket2325
@@ -0,0 +1,7 @@
+  o Documentation:
+    - Include specific and (hopefully) accurate documentation of the torrc
+      file's meta-format in doc/torrc_format.txt. This is mainly of
+      interest to people writing programs to parse or generate torrc files.
+      This document is not a commitment to long-term compatibility;
+      some aspects of the current format are a bit ridiculous.
+      Closes ticket 2325.
diff --git a/doc/include.am b/doc/include.am
index 783aa95c4e..af99501502 100644
--- a/doc/include.am
+++ b/doc/include.am
@@ -36,7 +36,8 @@ endif
 
 EXTRA_DIST+= doc/HACKING doc/asciidoc-helper.sh			\
              $(html_in) $(man_in) $(txt_in)			\
-             doc/state-contents.txt
+             doc/state-contents.txt				\
+             doc/torrc_format.txt
 
 docdir = @docdir@
 
diff --git a/doc/torrc_format.txt b/doc/torrc_format.txt
new file mode 100644
index 0000000000..3ca187fd29
--- /dev/null
+++ b/doc/torrc_format.txt
@@ -0,0 +1,207 @@
+
+This document specifies the current format and semantics of the torrc
+file, as of July 2015.  Note that we make no guarantee about the
+stability of this format.  If you write something designed for strict
+compatibility with this document, please expect us to break it sooner or
+later.
+
+Yes, some of this is quite stupid.  My goal here is to explain what it
+does, not what it should do.
+
+  - Nick
+
+
+
+1. File Syntax
+
+   # A file is interpreted as every Entry in the file, in order.
+   TorrcFile = Line*
+
+   Line = BlankLine | Entry
+
+   BlankLine = WS* OptComment LF
+             | WS* LF
+
+   OptComment =
+              | Comment
+
+   Comment = '#' NonLF*
+
+   # Each Entry is interpreted as an optional "Magic" flag, a key, and a
+   # value.
+   Entry = SP* OptMagic Key (SP+ | '\\' '\n' SP*)+ Val LF
+         | SP* OptMagic Key (SP* | '\\' '\n' SP*)* LF
+
+   OptMagic =
+            | "+"
+            | "/"
+
+   # Keys are always specified verbatim.  They are case insensitive.  It
+   # is an error to specify a key that Tor does not recognize.
+   Key = KC*
+
+   # Sadly, every kind of value is decoded differently...
+   Val = QuotedVal | ContinuedVal | PlainVal
+
+   # The text of a PlainVal is the text of its PVBody portion,
+   # plus the optional trailing backslash.
+   PlainVal = PVBody* ('\\')? SP* OptComment
+
+   # Note that a PVBody is copied verbatim.  Backslashes are included
+   # verbatim.  No changes are made.  Note that a body may be empty.
+   PVBody = (VC | '\\' NonLF ) *
+
+   # The text of a ContinuedVal is the text of each of its PVBody
+   # sub-elements, in order, concatenated.
+   ContinuedVal = CVal1 CVal2* CVal3
+
+   CVal1 = PVBody '\\' LF
+   CVal2 = PVBody ( '\\' LF | Comment LF )
+   CVal3 = PVBody
+
+   # The text of a QuotedVal is decoded as if it were a C string.
+   QuotedVal = DQ QVBody* DQ SP* OptComment
+
+   QVBody = QC
+          | '\\' ( 'n' | 'r' | 't' | '\\' | '\'' | DQ | 'x' XD XD | OD OD? OD? )
+
+   XD = any hexadecimal digit
+   OD = any octal digit
+
+   NonLF = Any character but '\n'
+   LF = '\n' | EOF
+   WS = ' ' | '\t' | '\r'
+   SP = ' ' | '\t'
+   DQ = '\"'
+   KC = Any character except an isspace() character or '#'
+   VC = Any character except '\\', '\n', or '#'
+   QC = Any character except '\n', '\\', or '\"'
+
+2. Mid-level Semantics
+
+
+   There are four configuration "domains", from lowest to highest priority:
+
+      * Built-in defaults
+      * The "torrc_defaults" file, if any
+      * The "torrc" file, if any
+      * Arguments provided on the command line, if any.
+
+   Normally, values from high-priority domains override low-priority
+   domains, but see 'magic' below.
+
+   Configuration keys fall into three categories: singletons, lists, and
+   groups.
+
+   A singleton key may appear at most once in any domain.  Its
+   corresponding value is equal to its value in the highest-priority
+   domain in which it occurs.
+
+   A list key may appear any number of times in a domain.  By default,
+   its corresponding value is equal to all of the values specified for
+   it in the highest-priority domain in which it appears. (See 'magic'
+   below).
+
+   A group key may appear any number of times in a domain.  It is
+   associated with a number of other keys in the same group.  The
+   relative positions of entries with the keys in a single group
+   matter, but entries with keys not in the group may be freely
+   interspersed.  By default, the group has a value equal to all keys
+   and values it contains, from the highest-priority domain in which any
+   of its keys occurs.
+
+   Magic:
+
+      If the '/' flag is specified for an entry, it sets the value for
+      that entry to an empty list.  (This will cause a higher-priority
+      domain to clear a list from a lower-priority domain, without
+      actually adding any entries.)
+
+      If the '+' flag is specified for the first entry in a list or a
+      group that appears in a given domain, that list or group is
+      appended to the list or group from the next-lowest-priority
+      domain, rather than replacing it.
+
+3. High-level semantics
+
+   There are further constraints on the values that each entry can take.
+   These constraints are out-of-scope for this document.
+
+4. Examples
+
+   (Indentation is removed in this section, to avoid confusion.)
+
+4.1. Syntax examples
+
+# Here is a simple configuration entry.  The key is "Foo"; the value is
+# "Bar"
+
+Foo Bar
+
+# A configuration entry can have spaces in its value, as below. Here the
+# key is "Foo" and the value is "Bar    Baz"
+Foo    Bar    Baz
+
+# This configuration entry has spaces at the end of the line, but those
+# spaces don't count, so the key and value are still "Foo" and "Bar    Baz"
+Foo    Bar    Baz    
+
+# There can be an escaped newline between the key and the value.  This
+# is another way to say  key="Hello", value="World"
+Hello\
+World
+
+# In regular entries of this kind, you can have a comment at the end of
+# the line, either with a space before it or not.  Each of these is a
+# different spelling of key="Hello", value="World"
+
+Hello World   #today
+Hello World#tomorrow
+
+# One way to encode a complex entry is as a C string.  This is the same
+# as key="Hello", value="World!"
+Hello "World!"
+
+# The string can contain the usual set of C escapes.  This entry has
+# key="Hello", and value="\"World\"\nand\nuniverse"
+Hello "\"World\"\nand\nuniverse"
+
+# And now we get to the more-or-less awful part.
+#
+# Multi-line entries ending with a backslash on each line aren't so
+# bad.  The backslash is removed, and everything else is included
+# verbatim. So this entry has key="Hello" and value="Worldandfriends"
+Hello\
+World\
+and\
+friends
+
+# Backslashes in the middle of a line are included as-is.  The key of
+# this one is "Too" and the value is "Many\\Backsl\ashes here" (with
+# backslashes in that last string as-is)
+Too \
+Many\\\
+Backsl\ashes \\
+here
+
+# And here's the really yucky part. If a comment appears in a multi-line
+# entry, the entry is still able to continue on the next line, as in the
+# following, where the key is "This" and the value is
+# "entry   and some        are  silly"
+This entry      \
+ # has comments \
+ and some       \
+ are # generally \
+ silly
+
+# But you can also write that without the backslashes at the end of the
+# comment lines.  That is to say, this entry is exactly the same as the
+# one above!
+This entry      \
+ # has comments
+ and some       \
+ are # generally
+ silly
+
+
+
diff --git a/src/common/util.c b/src/common/util.c
index a5b5488b0a..618e6a1b6a 100644
--- a/src/common/util.c
+++ b/src/common/util.c
@@ -2829,38 +2829,9 @@ parse_config_line_from_str_verbose(const char *line, char **key_out,
                                    char **value_out,
                                    const char **err_out)
 {
-  /* I believe the file format here is supposed to be:
-     FILE = (EMPTYLINE | LINE)* (EMPTYLASTLINE | LASTLINE)?
-
-     EMPTYLASTLINE = SPACE* | COMMENT
-     EMPTYLINE = EMPTYLASTLINE NL
-     SPACE = ' ' | '\r' | '\t'
-     COMMENT = '#' NOT-NL*
-     NOT-NL = Any character except '\n'
-     NL = '\n'
-
-     LASTLINE = SPACE* KEY SPACE* VALUES
-     LINE = LASTLINE NL
-     KEY = KEYCHAR+
-     KEYCHAR = Any character except ' ', '\r', '\n', '\t', '#', "\"
-
-     VALUES = QUOTEDVALUE | NORMALVALUE
-     QUOTEDVALUE = QUOTE QVCHAR* QUOTE EOLSPACE?
-     QUOTE = '"'
-     QVCHAR = KEYCHAR | ESC ('n' | 't' | 'r' | '"' | ESC |'\'' | OCTAL | HEX)
-     ESC = "\\"
-     OCTAL = ODIGIT (ODIGIT ODIGIT?)?
-     HEX = ('x' | 'X') HEXDIGIT HEXDIGIT
-     ODIGIT = '0' .. '7'
-     HEXDIGIT = '0'..'9' | 'a' .. 'f' | 'A' .. 'F'
-     EOLSPACE = SPACE* COMMENT?
-
-     NORMALVALUE = (VALCHAR | ESC ESC_IGNORE | CONTINUATION)* EOLSPACE?
-     VALCHAR = Any character except ESC, '#', and '\n'
-     ESC_IGNORE = Any character except '#' or '\n'
-     CONTINUATION = ESC NL ( COMMENT NL )*
+  /*
+    See torrc_format.txt for a description of the (silly) format this parses.
    */
-
   const char *key, *val, *cp;
   int continuation = 0;
 
-- 
2.11.4.GIT