2 -----------------------------------------------------------------------------
4 -- Module : Distribution.Fields.Lexer
7 -- Maintainer : cabal-devel@haskell.org
8 -- Portability : portable
10 -- Lexer for the cabal files.
12 {-# LANGUAGE BangPatterns #-}
13 #ifdef CABAL_PARSEC_DEBUG
14 {-# LANGUAGE PatternGuards #-}
16 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
17 module Distribution.Fields.Lexer
18 (ltest, lexToken, Token(..), LToken(..)
19 ,bol_section, in_section, in_field_layout, in_field_braces
22 -- [Note: bootstrapping parsec parser]
24 -- We manually produce the `Lexer.hs` file from `boot/Lexer.x` (make lexer)
25 -- because bootstrapping cabal-install would be otherwise tricky.
26 -- Alex is (atm) tricky package to build, cabal-install has some magic
27 -- to move bundled generated files in place, so rather we don't depend
28 -- on it before we can build it ourselves.
29 -- Therefore there is one thing less to worry in bootstrap.sh, which is a win.
31 -- See also https://github.com/haskell/cabal/issues/4633
35 import qualified Prelude as Prelude
36 import Distribution.Compat.Prelude
38 import Distribution.Fields.LexerMonad
39 import Distribution.Parsec.Position (Position (..), incPos, retPos)
40 import Data.ByteString (ByteString)
41 import qualified Data.ByteString as B
42 import qualified Data.ByteString.Char8 as B.Char8
43 import qualified Data.Word as Word
45 #ifdef CABAL_PARSEC_DEBUG
47 import qualified Data.Vector as V
48 import qualified Data.Text as T
49 import qualified Data.Text.Encoding as T
50 import qualified Data.Text.Encoding.Error as T
54 -- Various character classes
56 $space = \ -- single space char
57 $ctlchar = [\x0-\x1f \x7f]
58 $printable = \x0-\xff # $ctlchar -- so no \n \r
59 $symbol' = [ \, \= \< \> \+ \* \& \| \! \$ \% \^ \@ \# \? \/ \\ \~ ]
60 $symbol = [$symbol' \- \.]
61 $spacetab = [$space \t]
63 $paren = [ \( \) \[ \] ]
64 $field_layout = [$printable \t]
65 $field_layout' = [$printable] # [$space]
66 $field_braces = [$printable \t] # [\{ \}]
67 $field_braces' = [$printable] # [\{ \} $space]
68 $comment = [$printable \t]
69 $namecore = [$printable] # [$space \: \" \{ \} $paren $symbol']
70 $instr = [$printable $space] # [\"]
71 $instresc = $printable
75 @nbspspacetab = ($spacetab | @nbsp)
76 @nbspspace = ($space | @nbsp)
79 @string = \" ( $instr | \\ $instresc )* \"
86 @bom? { \_ len _ -> do
87 when (len /= 0) $ addWarning LexWarningBOM
88 setStartCode bol_section
93 <bol_section, bol_field_layout, bol_field_braces> {
94 @nbspspacetab* @nl { \_pos len inp -> checkWhitespace len inp >> adjustPos retPos >> lexToken }
95 -- no @nl here to allow for comments on last line of the file with no trailing \n
96 $spacetab* "--" $comment* ; -- TODO: check the lack of @nl works here
97 -- including counting line numbers
101 @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >>
102 if B.length inp == len
103 then return (L pos EOF)
104 else setStartCode in_section
105 >> return (L pos (Indent len)) }
106 $spacetab* \{ { tok OpenBrace }
107 $spacetab* \} { tok CloseBrace }
111 $spacetab+ ; --TODO: don't allow tab as leading space
115 @name { toki TokSym }
116 @string { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
117 @oplike { toki TokOther }
118 $paren { toki TokOther }
121 \} { tok CloseBrace }
122 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexToken }
126 @nbspspacetab* { \pos len inp -> checkLeadingWhitespace len inp >>= \len' ->
127 if B.length inp == len
128 then return (L pos EOF)
129 else setStartCode in_field_layout
130 >> return (L pos (Indent len')) }
135 $field_layout' $field_layout* { toki TokFieldLine }
136 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
140 () { \_ _ _ -> setStartCode in_field_braces >> lexToken }
145 $field_braces' $field_braces* { toki TokFieldLine }
147 \} { tok CloseBrace }
148 @nl { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_braces >> lexToken }
153 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
154 data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or operator
155 | TokStr !ByteString -- ^ String in quotes
156 | TokOther !ByteString -- ^ Operators and parens
157 | Indent !Int -- ^ Indentation token
158 | TokFieldLine !ByteString -- ^ Lines after @:@
163 | LexicalError InputStream --TODO: add separate string lexical error
166 data LToken = L !Position !Token
169 toki :: (ByteString -> Token) -> Position -> Int -> ByteString -> Lex LToken
170 toki t pos len input = return $! L pos (t (B.take len input))
172 tok :: Token -> Position -> Int -> ByteString -> Lex LToken
173 tok t pos _len _input = return $! L pos t
175 checkLeadingWhitespace :: Int -> ByteString -> Lex Int
176 checkLeadingWhitespace len bs
177 | B.any (== 9) (B.take len bs) = do
178 addWarning LexWarningTab
179 checkWhitespace len bs
180 | otherwise = checkWhitespace len bs
182 checkWhitespace :: Int -> ByteString -> Lex Int
183 checkWhitespace len bs
184 | B.any (== 194) (B.take len bs) = do
185 addWarning LexWarningNBSP
186 return $ len - B.count 194 (B.take len bs)
187 | otherwise = return len
189 -- -----------------------------------------------------------------------------
192 type AlexInput = InputStream
194 alexInputPrevChar :: AlexInput -> Char
195 alexInputPrevChar _ = error "alexInputPrevChar not used"
197 alexGetByte :: AlexInput -> Maybe (Word.Word8,AlexInput)
198 alexGetByte = B.uncons
200 lexicalError :: Position -> InputStream -> Lex LToken
201 lexicalError pos inp = do
203 return $! L pos (LexicalError inp)
205 lexToken :: Lex LToken
210 case alexScan inp st of
211 AlexEOF -> return (L pos EOF)
213 let !len_bytes = B.length inp - B.length inp' in
214 --FIXME: we want len_chars here really
215 -- need to decode utf8 up to this point
216 lexicalError (incPos len_bytes pos) inp'
217 AlexSkip inp' len_chars -> do
218 checkPosition pos inp inp' len_chars
219 adjustPos (incPos len_chars)
222 AlexToken inp' len_chars action -> do
223 checkPosition pos inp inp' len_chars
224 adjustPos (incPos len_chars)
226 let !len_bytes = B.length inp - B.length inp'
227 t <- action pos len_bytes inp
228 --traceShow t $ return tok
232 checkPosition :: Position -> ByteString -> ByteString -> Int -> Lex ()
233 #ifdef CABAL_PARSEC_DEBUG
234 checkPosition pos@(Position lineno colno) inp inp' len_chars = do
235 text_lines <- getDbgText
236 let len_bytes = B.length inp - B.length inp'
237 pos_txt | lineno-1 < V.length text_lines = T.take len_chars (T.drop (colno-1) (text_lines V.! (lineno-1)))
238 | otherwise = T.empty
239 real_txt = B.take len_bytes inp
240 when (pos_txt /= T.decodeUtf8 real_txt) $
241 traceShow (pos, pos_txt, T.decodeUtf8 real_txt) $
242 traceShow (take 3 (V.toList text_lines)) $ return ()
244 getDbgText = Lex $ \s@LexState{ dbgText = txt } -> LexResult s txt
246 checkPosition _ _ _ _ = return ()
249 lexAll :: Lex [LToken]
253 L _ EOF -> return [t]
257 ltest :: Int -> String -> Prelude.IO ()
259 let (ws, xs) = execLexer (setStartCode code >> lexAll) (B.Char8.pack s)
260 in traverse_ print ws >> traverse_ print xs
263 mkLexState :: ByteString -> LexState
264 mkLexState input = LexState
265 { curPos = Position 1 1
269 #ifdef CABAL_PARSEC_DEBUG
270 , dbgText = V.fromList . lines' . T.decodeUtf8With T.lenientDecode $ input
274 #ifdef CABAL_PARSEC_DEBUG
275 lines' :: T.Text -> [T.Text]
278 | otherwise = case T.break (\c -> c == '\r' || c == '\n') s1 of
279 (l, s2) | Just (c,s3) <- T.uncons s2
280 -> case T.uncons s3 of
281 Just ('\n', s4) | c == '\r' -> l `T.snoc` '\r' `T.snoc` '\n' : lines' s4
282 _ -> l `T.snoc` c : lines' s3