templates/Lexer.x

   1 {
   2 -----------------------------------------------------------------------------
   3 -- |
   4 -- Module      :  Distribution.Fields.Lexer
   5 -- License     :  BSD3
   6 --
   7 -- Maintainer  :  cabal-devel@haskell.org
   8 -- Portability :  portable
   9 --
  10 -- Lexer for the cabal files.
  11 {-# LANGUAGE CPP #-}
  12 {-# LANGUAGE BangPatterns #-}
  13 #ifdef CABAL_PARSEC_DEBUG
  14 {-# LANGUAGE PatternGuards #-}
  15 #endif
  16 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
  17 module Distribution.Fields.Lexer
  18   (ltest, lexToken, Token(..), LToken(..)
  19   ,bol_section, in_section, in_field_layout, in_field_braces
  20   ,mkLexState) where
  21
  22 -- [Note: bootstrapping parsec parser]
  23 --
  24 -- We manually produce the `Lexer.hs` file from `boot/Lexer.x` (make lexer)
  25 -- because bootstrapping cabal-install would be otherwise tricky.
  26 -- Alex is (atm) tricky package to build, cabal-install has some magic
  27 -- to move bundled generated files in place, so rather we don't depend
  28 -- on it before we can build it ourselves.
  29 -- Therefore there is one thing less to worry in bootstrap.sh, which is a win.
  30 --
  31 -- See also https://github.com/haskell/cabal/issues/4633
  32 --
  33
  34 import Prelude ()
  35 import qualified Prelude as Prelude
  36 import Distribution.Compat.Prelude
  37
  38 import Distribution.Fields.LexerMonad
  39 import Distribution.Parsec.Position (Position (..), incPos, retPos)
  40 import Data.ByteString (ByteString)
  41 import qualified Data.ByteString as B
  42 import qualified Data.ByteString.Char8 as B.Char8
  43 import qualified Data.Word as Word
  44
  45 #ifdef CABAL_PARSEC_DEBUG
  46 import Debug.Trace
  47 import qualified Data.Vector as V
  48 import qualified Data.Text   as T
  49 import qualified Data.Text.Encoding as T
  50 import qualified Data.Text.Encoding.Error as T
  51 #endif
  52
  53 }
  54 -- Various character classes
  55
  56 $space           = \          -- single space char
  57 $ctlchar         = [\x0-\x1f \x7f]
  58 $printable       = \x0-\xff # $ctlchar   -- so no \n \r
  59 $symbol'         = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]
  60 $symbol          = [$symbol' \- \.]
  61 $spacetab        = [$space \t]
  62
  63 $paren           = [ \( \) \[ \] ]
  64 $field_layout    = [$printable \t]
  65 $field_layout'   = [$printable] # [$space]
  66 $field_braces    = [$printable \t] # [\{ \}]
  67 $field_braces'   = [$printable] # [\{ \} $space]
  68 $comment         = [$printable \t]
  69 $namecore        = [$printable] # [$space \: \" \{ \} $paren $symbol']
  70 $instr           = [$printable $space] # [\"]
  71 $instresc        = $printable
  72
  73 @bom          = \xef \xbb \xbf
  74 @nbsp         = \xc2 \xa0
  75 @nbspspacetab = ($spacetab | @nbsp)
  76 @nbspspace    = ($space | @nbsp)
  77 @nl           = \n | \r\n | \r
  78 @name         = $namecore+
  79 @string       = \" ( $instr | \\ $instresc )* \"
  80 @oplike       = $symbol+
  81
  82
  83 tokens :-
  84
  85 <0> {
  86   @bom?  { \_ len _ -> do
  87               when (len /= 0) $ addWarning LexWarningBOM
  88               setStartCode bol_section
  89               lexToken
  90          }
  91 }
  92
  93 <bol_section, bol_field_layout, bol_field_braces> {
  94   @nbspspacetab* @nl         { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
  95   -- no @nl here to allow for comments on last line of the file with no trailing \n
  96   $spacetab* "--" $comment*  ;  -- TODO: check the lack of @nl works here
  97                                 -- including counting line numbers
  98 }
  99
 100 <bol_section> {
 101   @nbspspacetab*   { \pos len inp -> checkLeadingWhitespace len inp >>
 102                                      if B.length inp == len
 103                                        then return (L pos EOF)
 104                                        else setStartCode in_section
 105                                          >> return (L pos (Indent len)) }
 106   $spacetab* \{    { tok  OpenBrace }
 107   $spacetab* \}    { tok  CloseBrace }
 108 }
 109
 110 <in_section> {
 111   $spacetab+   ; --TODO: don't allow tab as leading space
 112
 113   "--" $comment* ;
 114
 115   @name        { toki TokSym }
 116   @string      { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
 117   @oplike      { toki TokOther }
 118   $paren       { toki TokOther }
 119   \:           { tok  Colon }
 120   \{           { tok  OpenBrace }
 121   \}           { tok  CloseBrace }
 122   @nl          { \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexToken }
 123 }
 124
 125 <bol_field_layout> {
 126   @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >>= \len' ->
 127                                   if B.length inp == len
 128                                     then return (L pos EOF)
 129                                     else setStartCode in_field_layout
 130                                       >> return (L pos (Indent len')) }
 131 }
 132
 133 <in_field_layout> {
 134   $spacetab+;
 135   $field_layout' $field_layout*  { toki TokFieldLine }
 136   @nl             { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
 137 }
 138
 139 <bol_field_braces> {
 140    ()                { \_ _ _ -> setStartCode in_field_braces >> lexToken }
 141 }
 142
 143 <in_field_braces> {
 144   $spacetab+;
 145   $field_braces' $field_braces*    { toki TokFieldLine }
 146   \{                { tok  OpenBrace  }
 147   \}                { tok  CloseBrace }
 148   @nl               { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_braces >> lexToken }
 149 }
 150
 151 {
 152
 153 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
 154 data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or operator
 155            | TokStr   !ByteString       -- ^ String in quotes
 156            | TokOther !ByteString       -- ^ Operators and parens
 157            | Indent   !Int              -- ^ Indentation token
 158            | TokFieldLine !ByteString   -- ^ Lines after @:@
 159            | Colon
 160            | OpenBrace
 161            | CloseBrace
 162            | EOF
 163            | LexicalError InputStream --TODO: add separate string lexical error
 164   deriving Show
 165
 166 data LToken = L !Position !Token
 167   deriving Show
 168
 169 toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken
 170 toki t pos  len  input = return $! L pos (t (B.take len input))
 171
 172 tok :: Token -> Position -> Int -> ByteString -> Lex LToken
 173 tok  t pos _len _input = return $! L pos t
 174
 175 checkLeadingWhitespace :: Int -> ByteString -> Lex Int
 176 checkLeadingWhitespace len bs
 177     | B.any (== 9) (B.take len bs) = do
 178         addWarning LexWarningTab
 179         checkWhitespace len bs
 180     | otherwise = checkWhitespace len bs
 181
 182 checkWhitespace :: Int -> ByteString -> Lex Int
 183 checkWhitespace len bs
 184     | B.any (== 194) (B.take len bs) = do
 185         addWarning LexWarningNBSP
 186         return $ len - B.count 194 (B.take len bs)
 187     | otherwise = return len
 188
 189 -- -----------------------------------------------------------------------------
 190 -- The input type
 191
 192 type AlexInput = InputStream
 193
 194 alexInputPrevChar :: AlexInput -> Char
 195 alexInputPrevChar _ = error "alexInputPrevChar not used"
 196
 197 alexGetByte :: AlexInput -> Maybe (Word.Word8,AlexInput)
 198 alexGetByte = B.uncons
 199
 200 lexicalError :: Position -> InputStream -> Lex LToken
 201 lexicalError pos inp = do
 202   setInput B.empty
 203   return $! L pos (LexicalError inp)
 204
 205 lexToken :: Lex LToken
 206 lexToken = do
 207   pos <- getPos
 208   inp <- getInput
 209   st  <- getStartCode
 210   case alexScan inp st of
 211     AlexEOF -> return (L pos EOF)
 212     AlexError inp' ->
 213         let !len_bytes = B.length inp - B.length inp' in
 214             --FIXME: we want len_chars here really
 215             -- need to decode utf8 up to this point
 216         lexicalError (incPos len_bytes pos) inp'
 217     AlexSkip  inp' len_chars -> do
 218         checkPosition pos inp inp' len_chars
 219         adjustPos (incPos len_chars)
 220         setInput inp'
 221         lexToken
 222     AlexToken inp' len_chars action -> do
 223         checkPosition pos inp inp' len_chars
 224         adjustPos (incPos len_chars)
 225         setInput inp'
 226         let !len_bytes = B.length inp - B.length inp'
 227         t <- action pos len_bytes inp
 228         --traceShow t $ return tok
 229         return t
 230
 231
 232 checkPosition :: Position -> ByteString -> ByteString -> Int -> Lex ()
 233 #ifdef CABAL_PARSEC_DEBUG
 234 checkPosition pos@(Position lineno colno) inp inp' len_chars = do
 235     text_lines <- getDbgText
 236     let len_bytes = B.length inp - B.length inp'
 237         pos_txt   | lineno-1 < V.length text_lines = T.take len_chars (T.drop (colno-1) (text_lines V.! (lineno-1)))
 238                   | otherwise = T.empty
 239         real_txt  = B.take len_bytes inp
 240     when (pos_txt /= T.decodeUtf8 real_txt) $
 241       traceShow (pos, pos_txt, T.decodeUtf8 real_txt) $
 242       traceShow (take 3 (V.toList text_lines)) $ return ()
 243   where
 244     getDbgText = Lex $ \s@LexState{ dbgText = txt } -> LexResult s txt
 245 #else
 246 checkPosition _ _ _ _ = return ()
 247 #endif
 248
 249 lexAll :: Lex [LToken]
 250 lexAll = do
 251   t <- lexToken
 252   case t of
 253     L _ EOF -> return [t]
 254     _       -> do ts <- lexAll
 255                   return (t : ts)
 256
 257 ltest :: Int -> String -> Prelude.IO ()
 258 ltest code s =
 259   let (ws, xs) = execLexer (setStartCode code >> lexAll) (B.Char8.pack s)
 260    in traverse_ print ws >> traverse_ print xs
 261
 262
 263 mkLexState :: ByteString -> LexState
 264 mkLexState input = LexState
 265   { curPos   = Position 1 1
 266   , curInput = input
 267   , curCode  = 0
 268   , warnings = []
 269 #ifdef CABAL_PARSEC_DEBUG
 270   , dbgText  = V.fromList . lines' . T.decodeUtf8With T.lenientDecode $ input
 271 #endif
 272   }
 273
 274 #ifdef CABAL_PARSEC_DEBUG
 275 lines' :: T.Text -> [T.Text]
 276 lines' s1
 277   | T.null s1 = []
 278   | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
 279                   (l, s2) | Just (c,s3) <- T.uncons s2
 280                          -> case T.uncons s3 of
 281                               Just ('\n', s4) | c == '\r' -> l `T.snoc` '\r' `T.snoc` '\n' : lines' s4
 282                               _                           -> l `T.snoc` c : lines' s3
 283
 284                           | otherwise
 285                          -> [l]
 286 #endif
 287 }