diff --git a/src/Data/Bytes/Parser.hs b/src/Data/Bytes/Parser.hs index 5cee4df..0433eff 100644 --- a/src/Data/Bytes/Parser.hs +++ b/src/Data/Bytes/Parser.hs @@ -1,86 +1,104 @@ -{-# language BangPatterns #-} -{-# language BinaryLiterals #-} -{-# language DataKinds #-} -{-# language DerivingStrategies #-} -{-# language GADTSyntax #-} -{-# language LambdaCase #-} -{-# language MagicHash #-} -{-# language MultiWayIf #-} -{-# language PolyKinds #-} -{-# language RankNTypes #-} -{-# language ScopedTypeVariables #-} -{-# language TypeApplications #-} -{-# language UnboxedTuples #-} -{-# language CPP #-} - --- | Parse non-resumable sequence of bytes. To parse a byte sequence --- as text, use the @Ascii@, @Latin@, and @Utf8@ modules instead. --- Functions for parsing decimal-encoded numbers are found in those --- modules. +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BinaryLiterals #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GADTSyntax #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | Parse non-resumable sequence of bytes. To parse a byte sequence +as text, use the @Ascii@, @Latin@, and @Utf8@ modules instead. +Functions for parsing decimal-encoded numbers are found in those +modules. +-} module Data.Bytes.Parser ( -- * Types Parser - , Result(..) - , Slice(..) + , Result (..) + , Slice (..) + -- * Run Parsers + -- ** Result , parseByteArray , parseBytes , parseBytesEffectfully , parseBytesEither , parseBytesMaybe + -- * One Byte , any + -- * Many Bytes , take , takeN , takeUpTo , takeWhile , takeTrailedBy + -- * Skip , skipWhile , skipTrailedBy , skipTrailedBy2 , skipTrailedBy2# , skipTrailedBy3# + -- * Match , byteArray , bytes , satisfy , satisfyWith , cstring + -- * End of Input , endOfInput , isEndOfInput , remaining , peekRemaining + -- * Scanning , scan + -- * Lookahead , peek , peek' + -- * Control Flow , fail , orElse , annotate , () , mapErrorEffectfully + -- * Repetition , replicate + -- * Subparsing , delimit , measure , measure_ , measure_# + -- * Lift Effects , effect + -- * Box Result , boxWord32 , boxIntPair + -- * Unbox Result , unboxWord32 , unboxIntPair + -- * Specialized Bind + -- | Sometimes, GHC ends up building join points in a way that -- boxes arguments unnecessarily. In this situation, special variants -- of monadic @>>=@ can be helpful. If @C#@, @I#@, etc. never @@ -92,26 +110,26 @@ module Data.Bytes.Parser , bindFromCharToIntPair , bindFromMaybeCharToIntPair , bindFromMaybeCharToLifted + -- * Specialized Pure , pureIntPair + -- * Specialized Fail , failIntPair ) where -import Prelude hiding (length,any,fail,takeWhile,take,replicate) +import Prelude hiding (any, fail, length, replicate, take, takeWhile) -import Data.Bytes.Parser.Internal (Parser(..),ST#,unboxBytes) -import Data.Bytes.Parser.Internal (boxBytes,Result#,uneffectful,fail) -import Data.Bytes.Parser.Internal (uneffectful#,uneffectfulInt#) -import Data.Bytes.Parser.Types (Result(Failure,Success),Slice(Slice)) -import Data.Bytes.Parser.Unsafe (unconsume,expose,cursor) -import Data.Bytes.Types (Bytes(..),BytesN(BytesN)) -import Data.Primitive (ByteArray(..)) -import Data.Primitive.Contiguous (Contiguous,Element) +import Data.Bytes.Parser.Internal (Parser (..), Result#, ST#, boxBytes, fail, unboxBytes, uneffectful, uneffectful#, uneffectfulInt#) +import Data.Bytes.Parser.Types (Result (Failure, Success), Slice (Slice)) +import Data.Bytes.Parser.Unsafe (cursor, expose, unconsume) +import Data.Bytes.Types (Bytes (..), BytesN (BytesN)) +import Data.Primitive (ByteArray (..)) +import Data.Primitive.Contiguous (Contiguous, Element) import Foreign.C.String (CString) -import GHC.Exts (Int(I#),Word#,Int#,Char#,runRW#,(+#),(-#),(>=#)) -import GHC.ST (ST(..)) -import GHC.Word (Word32(W32#),Word8) +import GHC.Exts (Char#, Int (I#), Int#, Word#, runRW#, (+#), (-#), (>=#)) +import GHC.ST (ST (..)) +import GHC.Word (Word32 (W32#), Word8) import qualified Arithmetic.Nat as Nat import qualified Arithmetic.Types as Arithmetic @@ -121,49 +139,52 @@ import qualified Data.Primitive as PM import qualified Data.Primitive.Contiguous as C import qualified GHC.Exts as Exts --- | Parse a byte sequence. This can succeed even if the --- entire slice was not consumed by the parser. +{- | Parse a byte sequence. This can succeed even if the +entire slice was not consumed by the parser. +-} parseBytes :: forall e a. (forall s. Parser e s a) -> Bytes -> Result e a -{-# inline parseBytes #-} +{-# INLINE parseBytes #-} parseBytes p !b = runResultST action - where + where action :: forall s. ST# s (Result# e a) action s0 = case p @s of Parser f -> f (unboxBytes b) s0 --- | Variant of 'parseBytesEither' that discards the error message on failure. --- Just like 'parseBytesEither', this does not impose any checks on the length --- of the remaining input. +{- | Variant of 'parseBytesEither' that discards the error message on failure. +Just like 'parseBytesEither', this does not impose any checks on the length +of the remaining input. +-} parseBytesMaybe :: forall e a. (forall s. Parser e s a) -> Bytes -> Maybe a -{-# inline parseBytesMaybe #-} +{-# INLINE parseBytesMaybe #-} parseBytesMaybe p !b = runMaybeST action - where + where action :: forall s. ST# s (Result# e a) action s0 = case p @s of Parser f -> f (unboxBytes b) s0 --- | Variant of 'parseBytes' that discards the new offset and the --- remaining length. This does not, however, require the remaining --- length to be zero. Use 'endOfInput' to accomplish that. +{- | Variant of 'parseBytes' that discards the new offset and the +remaining length. This does not, however, require the remaining +length to be zero. Use 'endOfInput' to accomplish that. +-} parseBytesEither :: forall e a. (forall s. Parser e s a) -> Bytes -> Either e a -{-# inline parseBytesEither #-} +{-# INLINE parseBytesEither #-} parseBytesEither p !b = runEitherST action - where + where action :: forall s. ST# s (Result# e a) action s0 = case p @s of Parser f -> f (unboxBytes b) s0 -- Similar to runResultST runMaybeST :: (forall s. ST# s (Result# e x)) -> Maybe x -{-# inline runMaybeST #-} -runMaybeST f = case (runRW# (\s0 -> case f s0 of { (# _, r #) -> r })) of +{-# INLINE runMaybeST #-} +runMaybeST f = case (runRW# (\s0 -> case f s0 of (# _, r #) -> r)) of (# _ | #) -> Nothing (# | (# x, _, _ #) #) -> Just x -- Similar to runResultST runEitherST :: (forall s. ST# s (Result# e x)) -> Either e x -{-# inline runEitherST #-} -runEitherST f = case (runRW# (\s0 -> case f s0 of { (# _, r #) -> r })) of +{-# INLINE runEitherST #-} +runEitherST f = case (runRW# (\s0 -> case f s0 of (# _, r #) -> r)) of (# e | #) -> Left e (# | (# x, _, _ #) #) -> Right x @@ -173,49 +194,56 @@ runEitherST f = case (runRW# (\s0 -> case f s0 of { (# _, r #) -> r })) of -- it avoids the additional boxing that the Success data -- constructor would normally cause. runResultST :: (forall s. ST# s (Result# e x)) -> Result e x -{-# inline runResultST #-} -runResultST f = case (runRW# (\s0 -> case f s0 of { (# _, r #) -> r })) of +{-# INLINE runResultST #-} +runResultST f = case (runRW# (\s0 -> case f s0 of (# _, r #) -> r)) of (# e | #) -> Failure e (# | (# x, off, len #) #) -> Success (Slice (I# off) (I# len) x) -- | Variant of 'parseBytes' that accepts an unsliced 'ByteArray'. parseByteArray :: (forall s. Parser e s a) -> ByteArray -> Result e a -{-# inline parseByteArray #-} +{-# INLINE parseByteArray #-} parseByteArray p b = parseBytes p (Bytes b 0 (PM.sizeofByteArray b)) --- | Variant of 'parseBytes' that allows the parser to be run --- as part of an existing effectful context. +{- | Variant of 'parseBytes' that allows the parser to be run +as part of an existing effectful context. +-} parseBytesEffectfully :: Parser e s a -> Bytes -> ST s (Result e a) -{-# inline parseBytesEffectfully #-} -parseBytesEffectfully (Parser f) !b = ST - (\s0 -> case f (unboxBytes b) s0 of - (# s1, r #) -> (# s1, boxPublicResult r #) - ) +{-# INLINE parseBytesEffectfully #-} +parseBytesEffectfully (Parser f) !b = + ST + ( \s0 -> case f (unboxBytes b) s0 of + (# s1, r #) -> (# s1, boxPublicResult r #) + ) -- | Lift an effectful computation into a parser. effect :: ST s a -> Parser e s a -{-# inline effect #-} -effect (ST f) = Parser - ( \(# _, off, len #) s0 -> case f s0 of - (# s1, a #) -> (# s1, (# | (# a, off, len #) #) #) - ) +{-# INLINE effect #-} +effect (ST f) = + Parser + ( \(# _, off, len #) s0 -> case f s0 of + (# s1, a #) -> (# s1, (# | (# a, off, len #) #) #) + ) byteArray :: e -> ByteArray -> Parser e s () -{-# inline byteArray #-} +{-# INLINE byteArray #-} byteArray e !expected = bytes e (B.fromByteArray expected) -- | Consume input matching the byte sequence. bytes :: e -> Bytes -> Parser e s () -bytes e !expected = Parser - ( \actual@(# _, off, len #) s -> - let r = if B.isPrefixOf expected (boxBytes actual) - then let !(I# movement) = length expected in - (# | (# (), off +# movement, len -# movement #) #) - else (# e | #) - in (# s, r #) - ) - +bytes e !expected = + Parser + ( \actual@(# _, off, len #) s -> + let r = + if B.isPrefixOf expected (boxBytes actual) + then + let !(I# movement) = length expected + in (# | (# (), off +# movement, len -# movement #) #) + else (# e | #) + in (# s, r #) + ) + +{- FOURMOLU_DISABLE -} -- | Consume input matching the @NUL@-terminated C String. cstring :: e -> CString -> Parser e s () cstring e (Exts.Ptr ptr0) = Parser @@ -237,6 +265,7 @@ cstring e (Exts.Ptr ptr0) = Parser _ -> (# s, (# e | #) #) in go ptr0 off0 len0 ) +{- FOURMOLU_ENABLE -} infix 0 @@ -244,57 +273,65 @@ infix 0 () :: Parser x s a -> e -> Parser e s a () = annotate --- | Annotate a parser. If the parser fails, the error will --- be returned. +{- | Annotate a parser. If the parser fails, the error will + be returned. +-} annotate :: Parser x s a -> e -> Parser e s a annotate p e = p `orElse` fail e --- | Consumes and returns the next byte in the input. --- Fails if no characters are left. +{- | Consumes and returns the next byte in the input. +Fails if no characters are left. +-} any :: e -> Parser e s Word8 -{-# inline any #-} -any e = uneffectful $ \chunk -> if length chunk > 0 - then - let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 - in Internal.Success w (offset chunk + 1) (length chunk - 1) - else Internal.Failure e - --- | Match any byte, to perform lookahead. Returns 'Nothing' if --- end of input has been reached. Does not consume any input. --- --- /Note/: Because this parser does not fail, do not use it --- with combinators such as 'many', because such as 'many', --- because such parsers loop until a failure occurs. Careless --- use will thus result in an infinite loop. +{-# INLINE any #-} +any e = uneffectful $ \chunk -> + if length chunk > 0 + then + let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 + in Internal.Success w (offset chunk + 1) (length chunk - 1) + else Internal.Failure e + +{- | Match any byte, to perform lookahead. Returns 'Nothing' if + end of input has been reached. Does not consume any input. + + /Note/: Because this parser does not fail, do not use it + with combinators such as 'many', because such as 'many', + because such parsers loop until a failure occurs. Careless + use will thus result in an infinite loop. +-} peek :: Parser e s (Maybe Word8) -{-# inline peek #-} +{-# INLINE peek #-} peek = uneffectful $ \chunk -> - let v = if length chunk > 0 - then Just (B.unsafeIndex chunk 0) - else Nothing - in Internal.Success v (offset chunk) (length chunk) - --- | Match any byte, to perform lookahead. Does not consume any --- input, but will fail if end of input has been reached. + let v = + if length chunk > 0 + then Just (B.unsafeIndex chunk 0) + else Nothing + in Internal.Success v (offset chunk) (length chunk) + +{- | Match any byte, to perform lookahead. Does not consume any + input, but will fail if end of input has been reached. +-} peek' :: e -> Parser e s Word8 -{-# inline peek' #-} -peek' e = uneffectful $ \chunk -> if length chunk > 0 - then Internal.Success (B.unsafeIndex chunk 0) (offset chunk) (length chunk) - else Internal.Failure e - --- | A stateful scanner. The predicate consumes and transforms a --- state argument, and each transformed state is passed to --- successive invocations of the predicate on each byte of the input --- until one returns 'Nothing' or the input ends. --- --- This parser does not fail. It will return the initial state --- if the predicate returns 'Nothing' on the first byte of input. --- --- /Note/: Because this parser does not fail, do not use it with --- combinators such a 'many', because such parsers loop until a --- failure occurs. Careless use will thus result in an infinite loop. +{-# INLINE peek' #-} +peek' e = uneffectful $ \chunk -> + if length chunk > 0 + then Internal.Success (B.unsafeIndex chunk 0) (offset chunk) (length chunk) + else Internal.Failure e + +{- | A stateful scanner. The predicate consumes and transforms a + state argument, and each transformed state is passed to + successive invocations of the predicate on each byte of the input + until one returns 'Nothing' or the input ends. + + This parser does not fail. It will return the initial state + if the predicate returns 'Nothing' on the first byte of input. + + /Note/: Because this parser does not fail, do not use it with + combinators such a 'many', because such parsers loop until a + failure occurs. Careless use will thus result in an infinite loop. +-} scan :: state -> (state -> Word8 -> Maybe state) -> Parser e s state -{-# inline scan #-} +{-# INLINE scan #-} scan s0 t = do let go s = do mw <- peek @@ -309,26 +346,28 @@ scan s0 t = do -- Does not check to see if any characters are left. This -- is not exported. anyUnsafe :: Parser e s Word8 -{-# inline anyUnsafe #-} +{-# INLINE anyUnsafe #-} anyUnsafe = uneffectful $ \chunk -> let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 in Internal.Success w (offset chunk + 1) (length chunk - 1) --- | Take while the predicate is matched. This is always inlined. This --- always succeeds. +{- | Take while the predicate is matched. This is always inlined. This +always succeeds. +-} takeWhile :: (Word8 -> Bool) -> Parser e s Bytes -{-# inline takeWhile #-} +{-# INLINE takeWhile #-} takeWhile f = uneffectful $ \chunk -> case B.takeWhile f chunk of bs -> Internal.Success bs (offset chunk + length bs) (length chunk - length bs) --- | Take bytes until the specified byte is encountered. Consumes --- the matched byte as well. Fails if the byte is not present. --- Visually, the cursor advancement and resulting @Bytes@ for --- @takeTrailedBy 0x19@ look like this: --- --- > 0x10 0x13 0x08 0x15 0x19 0x23 0x17 | input --- > |---->---->---->---->----| | cursor --- > {----*----*----*----} | result bytes +{- | Take bytes until the specified byte is encountered. Consumes +the matched byte as well. Fails if the byte is not present. +Visually, the cursor advancement and resulting @Bytes@ for +@takeTrailedBy 0x19@ look like this: + +> 0x10 0x13 0x08 0x15 0x19 0x23 0x17 | input +> |---->---->---->---->----| | cursor +> {\----*----*----*----\} | result bytes +-} takeTrailedBy :: e -> Word8 -> Parser e s Bytes takeTrailedBy e !w = do !start <- cursor @@ -337,107 +376,129 @@ takeTrailedBy e !w = do !arr <- expose pure (Bytes arr start (end - (start + 1))) --- | Skip all characters until the character from the is encountered --- and then consume the matching byte as well. +{- | Skip all characters until the character from the is encountered +and then consume the matching byte as well. +-} skipTrailedBy :: e -> Word8 -> Parser e s () -{-# inline skipTrailedBy #-} +{-# INLINE skipTrailedBy #-} skipTrailedBy e !w = uneffectful# (\c -> skipUntilConsumeByteLoop e w c) skipUntilConsumeByteLoop :: - e -- Error message - -> Word8 -- byte to match - -> Bytes -- Chunk - -> Result# e () -skipUntilConsumeByteLoop e !w !c = if length c > 0 - then if PM.indexByteArray (array c) (offset c) /= (w :: Word8) - then skipUntilConsumeByteLoop e w (B.unsafeDrop 1 c) - else (# | (# (), unI (offset c + 1), unI (length c - 1) #) #) - else (# e | #) - --- | Skip all bytes until either of the bytes in encountered. Then, --- consume the matched byte. @True@ indicates that the first argument --- byte was encountered. @False@ indicates that the second argument --- byte was encountered. + e -> -- Error message + Word8 -> -- byte to match + Bytes -> -- Chunk + Result# e () +skipUntilConsumeByteLoop e !w !c = + if length c > 0 + then + if PM.indexByteArray (array c) (offset c) /= (w :: Word8) + then skipUntilConsumeByteLoop e w (B.unsafeDrop 1 c) + else (# | (# (), unI (offset c + 1), unI (length c - 1) #) #) + else (# e | #) + +{- | Skip all bytes until either of the bytes in encountered. Then, +consume the matched byte. @True@ indicates that the first argument +byte was encountered. @False@ indicates that the second argument +byte was encountered. +-} skipTrailedBy2 :: - e -- ^ Error message - -> Word8 -- ^ First trailer, @False@ indicates that this was encountered - -> Word8 -- ^ Second trailer, @True@ indicates that this was encountered - -> Parser e s Bool -{-# inline skipTrailedBy2 #-} + -- | Error message + e -> + -- | First trailer, @False@ indicates that this was encountered + Word8 -> + -- | Second trailer, @True@ indicates that this was encountered + Word8 -> + Parser e s Bool +{-# INLINE skipTrailedBy2 #-} skipTrailedBy2 e !wa !wb = boxBool (skipTrailedBy2# e wa wb) skipTrailedBy2# :: - e -- ^ Error message - -> Word8 -- ^ First trailer, 0 indicates that this was encountered - -> Word8 -- ^ Second trailer, 1 indicates that this was encountered - -> Parser e s Int# -{-# inline skipTrailedBy2# #-} + -- | Error message + e -> + -- | First trailer, 0 indicates that this was encountered + Word8 -> + -- | Second trailer, 1 indicates that this was encountered + Word8 -> + Parser e s Int# +{-# INLINE skipTrailedBy2# #-} skipTrailedBy2# e !wa !wb = uneffectfulInt# (\c -> skipUntilConsumeByteEitherLoop e wa wb c) skipTrailedBy3# :: - e -- ^ Error message - -> Word8 -- ^ First trailer, 0 indicates that this was encountered - -> Word8 -- ^ Second trailer, 1 indicates that this was encountered - -> Word8 -- ^ Third trailer, 2 indicates that this was encountered - -> Parser e s Int# -{-# inline skipTrailedBy3# #-} + -- | Error message + e -> + -- | First trailer, 0 indicates that this was encountered + Word8 -> + -- | Second trailer, 1 indicates that this was encountered + Word8 -> + -- | Third trailer, 2 indicates that this was encountered + Word8 -> + Parser e s Int# +{-# INLINE skipTrailedBy3# #-} skipTrailedBy3# e !wa !wb !wc = uneffectfulInt# (\c -> skipUntilConsumeByte3Loop e wa wb wc c) skipUntilConsumeByteEitherLoop :: - e -- Error message - -> Word8 -- first trailer - -> Word8 -- second trailer - -> Bytes -- Chunk - -> Result# e Int# -skipUntilConsumeByteEitherLoop e !wa !wb !c = if length c > 0 - then let byte = PM.indexByteArray (array c) (offset c) in - if | byte == wa -> (# | (# 0#, unI (offset c + 1), unI (length c - 1) #) #) - | byte == wb -> (# | (# 1#, unI (offset c + 1), unI (length c - 1) #) #) - | otherwise -> skipUntilConsumeByteEitherLoop e wa wb (B.unsafeDrop 1 c) - else (# e | #) + e -> -- Error message + Word8 -> -- first trailer + Word8 -> -- second trailer + Bytes -> -- Chunk + Result# e Int# +skipUntilConsumeByteEitherLoop e !wa !wb !c = + if length c > 0 + then + let byte = PM.indexByteArray (array c) (offset c) + in if + | byte == wa -> (# | (# 0#, unI (offset c + 1), unI (length c - 1) #) #) + | byte == wb -> (# | (# 1#, unI (offset c + 1), unI (length c - 1) #) #) + | otherwise -> skipUntilConsumeByteEitherLoop e wa wb (B.unsafeDrop 1 c) + else (# e | #) skipUntilConsumeByte3Loop :: - e -- Error message - -> Word8 -- first trailer - -> Word8 -- second trailer - -> Word8 -- third trailer - -> Bytes -- Chunk - -> Result# e Int# -skipUntilConsumeByte3Loop e !wa !wb !wc !c = if length c > 0 - then let byte = PM.indexByteArray (array c) (offset c) in - if | byte == wa -> (# | (# 0#, unI (offset c + 1), unI (length c - 1) #) #) - | byte == wb -> (# | (# 1#, unI (offset c + 1), unI (length c - 1) #) #) - | byte == wc -> (# | (# 2#, unI (offset c + 1), unI (length c - 1) #) #) - | otherwise -> skipUntilConsumeByte3Loop e wa wb wc (B.unsafeDrop 1 c) - else (# e | #) - --- | Take the given number of bytes. Fails if there is not enough --- remaining input. + e -> -- Error message + Word8 -> -- first trailer + Word8 -> -- second trailer + Word8 -> -- third trailer + Bytes -> -- Chunk + Result# e Int# +skipUntilConsumeByte3Loop e !wa !wb !wc !c = + if length c > 0 + then + let byte = PM.indexByteArray (array c) (offset c) + in if + | byte == wa -> (# | (# 0#, unI (offset c + 1), unI (length c - 1) #) #) + | byte == wb -> (# | (# 1#, unI (offset c + 1), unI (length c - 1) #) #) + | byte == wc -> (# | (# 2#, unI (offset c + 1), unI (length c - 1) #) #) + | otherwise -> skipUntilConsumeByte3Loop e wa wb wc (B.unsafeDrop 1 c) + else (# e | #) + +{- | Take the given number of bytes. Fails if there is not enough + remaining input. +-} take :: e -> Int -> Parser e s Bytes -{-# inline take #-} -take e n = uneffectful $ \chunk -> if n <= B.length chunk - then case B.unsafeTake n chunk of - bs -> Internal.Success bs (offset chunk + n) (length chunk - n) - else Internal.Failure e +{-# INLINE take #-} +take e n = uneffectful $ \chunk -> + if n <= B.length chunk + then case B.unsafeTake n chunk of + bs -> Internal.Success bs (offset chunk + n) (length chunk - n) + else Internal.Failure e -- | Variant of 'take' that tracks the length of the result in the result type. takeN :: e -> Arithmetic.Nat n -> Parser e s (BytesN n) -takeN e n0 = uneffectful $ \chunk -> if n <= B.length chunk - then case B.unsafeTake n chunk of - Bytes theChunk theOff _ -> Internal.Success (BytesN theChunk theOff) (offset chunk + n) (length chunk - n) - else Internal.Failure e - where +takeN e n0 = uneffectful $ \chunk -> + if n <= B.length chunk + then case B.unsafeTake n chunk of + Bytes theChunk theOff _ -> Internal.Success (BytesN theChunk theOff) (offset chunk + n) (length chunk - n) + else Internal.Failure e + where !n = Nat.demote n0 - - --- | Take at most the given number of bytes. This is greedy. It will --- consume as many bytes as there are available until it has consumed --- @n@ bytes. This never fails. +{- | Take at most the given number of bytes. This is greedy. It will + consume as many bytes as there are available until it has consumed + @n@ bytes. This never fails. +-} takeUpTo :: Int -> Parser e s Bytes -{-# inline takeUpTo #-} +{-# INLINE takeUpTo #-} takeUpTo n = uneffectful $ \chunk -> let m = min n (B.length chunk) in case B.unsafeTake m chunk of @@ -445,68 +506,76 @@ takeUpTo n = uneffectful $ \chunk -> -- | Consume all remaining bytes in the input. remaining :: Parser e s Bytes -{-# inline remaining #-} +{-# INLINE remaining #-} remaining = uneffectful $ \chunk -> Internal.Success chunk (offset chunk + length chunk) 0 -- | Return all remaining bytes in the input without consuming them. peekRemaining :: Parser e s Bytes -{-# inline peekRemaining #-} +{-# INLINE peekRemaining #-} peekRemaining = uneffectful $ \b@(Bytes _ off len) -> Internal.Success b off len -- | Skip while the predicate is matched. This is always inlined. skipWhile :: (Word8 -> Bool) -> Parser e s () -{-# inline skipWhile #-} -skipWhile f = go where - go = isEndOfInput >>= \case - True -> pure () - False -> do - w <- anyUnsafe - if f w - then go - else unconsume 1 - --- | The parser @satisfy p@ succeeds for any byte for which the --- predicate @p@ returns 'True'. Returns the byte that is --- actually parsed. +{-# INLINE skipWhile #-} +skipWhile f = go + where + go = + isEndOfInput >>= \case + True -> pure () + False -> do + w <- anyUnsafe + if f w + then go + else unconsume 1 + +{- | The parser @satisfy p@ succeeds for any byte for which the + predicate @p@ returns 'True'. Returns the byte that is + actually parsed. +-} satisfy :: e -> (Word8 -> Bool) -> Parser e s Word8 satisfy e p = satisfyWith e id p -{-# inline satisfy #-} +{-# INLINE satisfy #-} --- | The parser @satisfyWith f p@ transforms a byte, and succeeds --- if the predicate @p@ returns 'True' on the transformed value. --- The parser returns the transformed byte that was parsed. +{- | The parser @satisfyWith f p@ transforms a byte, and succeeds + if the predicate @p@ returns 'True' on the transformed value. + The parser returns the transformed byte that was parsed. +-} satisfyWith :: e -> (Word8 -> a) -> (a -> Bool) -> Parser e s a -{-# inline satisfyWith #-} -satisfyWith e f p = uneffectful $ \chunk -> if length chunk > 0 - then case B.unsafeIndex chunk 0 of - w -> - let v = f w - in if p v - then Internal.Success v (offset chunk + 1) (length chunk - 1) - else Internal.Failure e - else Internal.Failure e +{-# INLINE satisfyWith #-} +satisfyWith e f p = uneffectful $ \chunk -> + if length chunk > 0 + then case B.unsafeIndex chunk 0 of + w -> + let v = f w + in if p v + then Internal.Success v (offset chunk + 1) (length chunk - 1) + else Internal.Failure e + else Internal.Failure e -- | Fails if there is still more input remaining. endOfInput :: e -> Parser e s () -{-# inline endOfInput #-} -endOfInput e = uneffectful $ \chunk -> if length chunk == 0 - then Internal.Success () (offset chunk) 0 - else Internal.Failure e - --- | Returns true if there are no more bytes in the input. Returns --- false otherwise. Always succeeds. +{-# INLINE endOfInput #-} +endOfInput e = uneffectful $ \chunk -> + if length chunk == 0 + then Internal.Success () (offset chunk) 0 + else Internal.Failure e + +{- | Returns true if there are no more bytes in the input. Returns +false otherwise. Always succeeds. +-} isEndOfInput :: Parser e s Bool -{-# inline isEndOfInput #-} +{-# INLINE isEndOfInput #-} isEndOfInput = uneffectful $ \chunk -> Internal.Success (length chunk == 0) (offset chunk) (length chunk) boxPublicResult :: Result# e a -> Result e a -{-# inline boxPublicResult #-} +{-# INLINE boxPublicResult #-} boxPublicResult (# | (# a, b, c #) #) = Success (Slice (I# b) (I# c) a) boxPublicResult (# e | #) = Failure e +{- FOURMOLU_DISABLE -} -- | Convert a 'Word32' parser to a 'Word#' parser. unboxWord32 :: Parser e s Word32 -> Parser e s Word# {-# inline unboxWord32 #-} @@ -520,17 +589,20 @@ unboxWord32 (Parser f) = Parser #endif a, b, c #) #) #) ) +{- FOURMOLU_ENABLE -} -- | Convert a @(Int,Int)@ parser to a @(# Int#, Int# #)@ parser. -unboxIntPair :: Parser e s (Int,Int) -> Parser e s (# Int#, Int# #) -{-# inline unboxIntPair #-} -unboxIntPair (Parser f) = Parser - (\x s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# (I# y, I# z), b, c #) #) -> (# s1, (# | (# (# y, z #), b, c #) #) #) - ) - +unboxIntPair :: Parser e s (Int, Int) -> Parser e s (# Int#, Int# #) +{-# INLINE unboxIntPair #-} +unboxIntPair (Parser f) = + Parser + ( \x s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# (I# y, I# z), b, c #) #) -> (# s1, (# | (# (# y, z #), b, c #) #) #) + ) + +{- FOURMOLU_DISABLE -} -- | Convert a 'Word#' parser to a 'Word32' parser. Precondition: -- the argument parser only returns words less than 4294967296. boxWord32 :: Parser e s Word# -> Parser e s Word32 @@ -545,233 +617,263 @@ boxWord32 (Parser f) = Parser #endif a), b, c #) #) #) ) +{- FOURMOLU_ENABLE -} -- | Convert a @(# Int#, Int# #)@ parser to a @(Int,Int)@ parser. boxInt :: Parser e s Int# -> Parser e s Int -{-# inline boxInt #-} -boxInt (Parser f) = Parser - (\x s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> (# s1, (# | (# I# y, b, c #) #) #) - ) +{-# INLINE boxInt #-} +boxInt (Parser f) = + Parser + ( \x s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> (# s1, (# | (# I# y, b, c #) #) #) + ) -- | Convert a @(# Int#, Int# #)@ parser to a @(Int,Int)@ parser. boxBool :: Parser e s Int# -> Parser e s Bool -{-# inline boxBool #-} -boxBool (Parser f) = Parser - (\x s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> (# s1, (# | (# case y of {1# -> True; _ -> False}, b, c #) #) #) - ) +{-# INLINE boxBool #-} +boxBool (Parser f) = + Parser + ( \x s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> (# s1, (# | (# case y of 1# -> True; _ -> False, b, c #) #) #) + ) -- | Convert a @(# Int#, Int# #)@ parser to a @(Int,Int)@ parser. -boxIntPair :: Parser e s (# Int#, Int# #) -> Parser e s (Int,Int) -{-# inline boxIntPair #-} -boxIntPair (Parser f) = Parser - (\x s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# (# y, z #), b, c #) #) -> (# s1, (# | (# (I# y, I# z), b, c #) #) #) - ) - - --- | There is a law-abiding instance of 'Alternative' for 'Parser'. --- However, it is not terribly useful since error messages seldom --- have a 'Monoid' instance. This function is a variant of @\<|\>@ --- that is right-biased in its treatment of error messages. --- Consequently, @orElse@ lacks an identity. --- See --- for more discussion of this topic. +boxIntPair :: Parser e s (# Int#, Int# #) -> Parser e s (Int, Int) +{-# INLINE boxIntPair #-} +boxIntPair (Parser f) = + Parser + ( \x s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# (# y, z #), b, c #) #) -> (# s1, (# | (# (I# y, I# z), b, c #) #) #) + ) + +{- | There is a law-abiding instance of 'Alternative' for 'Parser'. +However, it is not terribly useful since error messages seldom +have a 'Monoid' instance. This function is a variant of @\<|\>@ +that is right-biased in its treatment of error messages. +Consequently, @orElse@ lacks an identity. +See +for more discussion of this topic. +-} infixl 3 `orElse` + orElse :: Parser x s a -> Parser e s a -> Parser e s a -{-# inline orElse #-} -orElse (Parser f) (Parser g) = Parser - (\x s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# _ | #) -> g x s1 - (# | r #) -> (# s1, (# | r #) #) - ) +{-# INLINE orElse #-} +orElse (Parser f) (Parser g) = + Parser + ( \x s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# _ | #) -> g x s1 + (# | r #) -> (# s1, (# | r #) #) + ) -- | Effectfully adjusts the error message if an error occurs. mapErrorEffectfully :: (e1 -> ST s e2) -> Parser e1 s a -> Parser e2 s a -{-# inline mapErrorEffectfully #-} -mapErrorEffectfully f (Parser g) = Parser - (\x s0 -> case g x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> case f e of - ST h -> case h s1 of - (# s2, e' #) -> (# s2, (# e' | #) #) - (# | r #) -> (# s1, (# | r #) #) - ) +{-# INLINE mapErrorEffectfully #-} +mapErrorEffectfully f (Parser g) = + Parser + ( \x s0 -> case g x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> case f e of + ST h -> case h s1 of + (# s2, e' #) -> (# s2, (# e' | #) #) + (# | r #) -> (# s1, (# | r #) #) + ) bindFromCharToLifted :: Parser s e Char# -> (Char# -> Parser s e a) -> Parser s e a -{-# inline bindFromCharToLifted #-} -bindFromCharToLifted (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) +{-# INLINE bindFromCharToLifted #-} +bindFromCharToLifted (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromCharToIntPair :: Parser s e Char# -> (Char# -> Parser s e (# Int#, Int# #)) -> Parser s e (# Int#, Int# #) -{-# inline bindFromCharToIntPair #-} -bindFromCharToIntPair (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) +{-# INLINE bindFromCharToIntPair #-} +bindFromCharToIntPair (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromLiftedToInt :: Parser s e a -> (a -> Parser s e Int#) -> Parser s e Int# -{-# inline bindFromLiftedToInt #-} -bindFromLiftedToInt (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) +{-# INLINE bindFromLiftedToInt #-} +bindFromLiftedToInt (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromLiftedToIntPair :: Parser s e a -> (a -> Parser s e (# Int#, Int# #)) -> Parser s e (# Int#, Int# #) -{-# inline bindFromLiftedToIntPair #-} -bindFromLiftedToIntPair (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) +{-# INLINE bindFromLiftedToIntPair #-} +bindFromLiftedToIntPair (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromIntToIntPair :: Parser s e Int# -> (Int# -> Parser s e (# Int#, Int# #)) -> Parser s e (# Int#, Int# #) -{-# inline bindFromIntToIntPair #-} -bindFromIntToIntPair (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) +{-# INLINE bindFromIntToIntPair #-} +bindFromIntToIntPair (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromMaybeCharToIntPair :: - Parser s e (# (# #) | Char# #) - -> ((# (# #) | Char# #) -> Parser s e (# Int#, Int# #)) - -> Parser s e (# Int#, Int# #) -{-# inline bindFromMaybeCharToIntPair #-} -bindFromMaybeCharToIntPair (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) + Parser s e (# (# #) | Char# #) -> + ((# (# #) | Char# #) -> Parser s e (# Int#, Int# #)) -> + Parser s e (# Int#, Int# #) +{-# INLINE bindFromMaybeCharToIntPair #-} +bindFromMaybeCharToIntPair (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) bindFromMaybeCharToLifted :: - Parser s e (# (# #) | Char# #) - -> ((# (# #) | Char# #) -> Parser s e a) - -> Parser s e a -{-# inline bindFromMaybeCharToLifted #-} -bindFromMaybeCharToLifted (Parser f) g = Parser - (\x@(# arr, _, _ #) s0 -> case f x s0 of - (# s1, r0 #) -> case r0 of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, b, c #) #) -> - runParser (g y) (# arr, b, c #) s1 - ) + Parser s e (# (# #) | Char# #) -> + ((# (# #) | Char# #) -> Parser s e a) -> + Parser s e a +{-# INLINE bindFromMaybeCharToLifted #-} +bindFromMaybeCharToLifted (Parser f) g = + Parser + ( \x@(# arr, _, _ #) s0 -> case f x s0 of + (# s1, r0 #) -> case r0 of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, b, c #) #) -> + runParser (g y) (# arr, b, c #) s1 + ) pureIntPair :: - (# Int#, Int# #) - -> Parser s e (# Int#, Int# #) -{-# inline pureIntPair #-} -pureIntPair a = Parser - (\(# _, b, c #) s -> (# s, (# | (# a, b, c #) #) #)) + (# Int#, Int# #) -> + Parser s e (# Int#, Int# #) +{-# INLINE pureIntPair #-} +pureIntPair a = + Parser + (\(# _, b, c #) s -> (# s, (# | (# a, b, c #) #) #)) failIntPair :: e -> Parser e s (# Int#, Int# #) -{-# inline failIntPair #-} -failIntPair e = Parser - (\(# _, _, _ #) s -> (# s, (# e | #) #)) - --- | Augment a parser with the number of bytes that were consume while --- it executed. -measure :: Parser e s a -> Parser e s (Int,a) -{-# inline measure #-} -measure (Parser f) = Parser - (\x@(# _, pre, _ #) s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# y, post, c #) #) -> (# s1, (# | (# (I# (post -# pre), y),post,c #) #) #) - ) - --- | Run a parser and discard the result, returning instead the number --- of bytes that the parser consumed. +{-# INLINE failIntPair #-} +failIntPair e = + Parser + (\(# _, _, _ #) s -> (# s, (# e | #) #)) + +{- | Augment a parser with the number of bytes that were consume while +it executed. +-} +measure :: Parser e s a -> Parser e s (Int, a) +{-# INLINE measure #-} +measure (Parser f) = + Parser + ( \x@(# _, pre, _ #) s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# y, post, c #) #) -> (# s1, (# | (# (I# (post -# pre), y), post, c #) #) #) + ) + +{- | Run a parser and discard the result, returning instead the number +of bytes that the parser consumed. +-} measure_ :: Parser e s a -> Parser e s Int -{-# inline measure_ #-} +{-# INLINE measure_ #-} measure_ p = boxInt (measure_# p) -- | Variant of 'measure_' with an unboxed result. measure_# :: Parser e s a -> Parser e s Int# -{-# inline measure_# #-} -measure_# (Parser f) = Parser - (\x@(# _, pre, _ #) s0 -> case f x s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# _, post, c #) #) -> (# s1, (# | (# post -# pre,post,c #) #) #) - ) - - - --- | Run a parser in a delimited context, failing if the requested number --- of bytes are not available or if the delimited parser does not --- consume all input. This combinator can be understood as a composition --- of 'take', 'effect', 'parseBytesEffectfully', and 'endOfInput'. It is --- provided as a single combinator because for convenience and because it is --- easy to make mistakes when manually assembling the aforementioned parsers. --- The pattern of prefixing an encoding with its length is common. --- This is discussed more in --- . --- --- > delimit e1 e2 n remaining === take e1 n +{-# INLINE measure_# #-} +measure_# (Parser f) = + Parser + ( \x@(# _, pre, _ #) s0 -> case f x s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# _, post, c #) #) -> (# s1, (# | (# post -# pre, post, c #) #) #) + ) + +{- | Run a parser in a delimited context, failing if the requested number +of bytes are not available or if the delimited parser does not +consume all input. This combinator can be understood as a composition +of 'take', 'effect', 'parseBytesEffectfully', and 'endOfInput'. It is +provided as a single combinator because for convenience and because it is +easy to make mistakes when manually assembling the aforementioned parsers. +The pattern of prefixing an encoding with its length is common. +This is discussed more in +. + +> delimit e1 e2 n remaining === take e1 n +-} delimit :: - e -- ^ Error message when not enough bytes are present - -> e -- ^ Error message when delimited parser does not consume all input - -> Int -- ^ Exact number of bytes delimited parser is expected to consume - -> Parser e s a -- ^ Parser to execute in delimited context - -> Parser e s a -{-# inline delimit #-} -delimit esz eleftovers (I# n) (Parser f) = Parser - ( \(# arr, off, len #) s0 -> case len >=# n of - 1# -> case f (# arr, off, n #) s0 of - (# s1, r #) -> case r of - (# e | #) -> (# s1, (# e | #) #) - (# | (# a, newOff, leftovers #) #) -> case leftovers of - 0# -> (# s1, (# | (# a, newOff, len -# n #) #) #) - _ -> (# s1, (# eleftovers | #) #) - _ -> (# s0, (# esz | #) #) - ) - --- | Replicate a parser @n@ times, writing the results into --- an array of length @n@. For @Array@ and @SmallArray@, this --- is lazy in the elements, so be sure the they result of the --- parser is evaluated appropriately to avoid unwanted thunks. -replicate :: forall arr e s a. (Contiguous arr, Element arr a) - => Int -- ^ Number of times to run the parser - -> Parser e s a -- ^ Parser - -> Parser e s (arr a) -{-# inline replicate #-} + -- | Error message when not enough bytes are present + e -> + -- | Error message when delimited parser does not consume all input + e -> + -- | Exact number of bytes delimited parser is expected to consume + Int -> + -- | Parser to execute in delimited context + Parser e s a -> + Parser e s a +{-# INLINE delimit #-} +delimit esz eleftovers (I# n) (Parser f) = + Parser + ( \(# arr, off, len #) s0 -> case len >=# n of + 1# -> case f (# arr, off, n #) s0 of + (# s1, r #) -> case r of + (# e | #) -> (# s1, (# e | #) #) + (# | (# a, newOff, leftovers #) #) -> case leftovers of + 0# -> (# s1, (# | (# a, newOff, len -# n #) #) #) + _ -> (# s1, (# eleftovers | #) #) + _ -> (# s0, (# esz | #) #) + ) + +{- | Replicate a parser @n@ times, writing the results into +an array of length @n@. For @Array@ and @SmallArray@, this +is lazy in the elements, so be sure the they result of the +parser is evaluated appropriately to avoid unwanted thunks. +-} +replicate :: + forall arr e s a. + (Contiguous arr, Element arr a) => + -- | Number of times to run the parser + Int -> + -- | Parser + Parser e s a -> + Parser e s (arr a) +{-# INLINE replicate #-} replicate !len p = do marr <- effect (C.new len) let go :: Int -> Parser e s (arr a) - go !ix = if ix < len - then do - a <- p - effect (C.write marr ix a) - go (ix + 1) - else effect (C.unsafeFreeze marr) + go !ix = + if ix < len + then do + a <- p + effect (C.write marr ix a) + go (ix + 1) + else effect (C.unsafeFreeze marr) go 0 unI :: Int -> Int# -{-# inline unI #-} +{-# INLINE unI #-} unI (I# w) = w diff --git a/src/Data/Bytes/Parser/Latin.hs b/src/Data/Bytes/Parser/Latin.hs index 7e10f0d..f967da9 100644 --- a/src/Data/Bytes/Parser/Latin.hs +++ b/src/Data/Bytes/Parser/Latin.hs @@ -1,27 +1,29 @@ -{-# language BangPatterns #-} -{-# language BinaryLiterals #-} -{-# language DataKinds #-} -{-# language DeriveFunctor #-} -{-# language DerivingStrategies #-} -{-# language GADTSyntax #-} -{-# language KindSignatures #-} -{-# language LambdaCase #-} -{-# language MagicHash #-} -{-# language MultiWayIf #-} -{-# language PolyKinds #-} -{-# language RankNTypes #-} -{-# language ScopedTypeVariables #-} -{-# language StandaloneDeriving #-} -{-# language TypeApplications #-} -{-# language UnboxedSums #-} -{-# language UnboxedTuples #-} -{-# language CPP #-} - --- | Parse input as though it were text encoded by --- ISO 8859-1 (Latin-1). All byte sequences are valid --- text under ISO 8859-1. +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BinaryLiterals #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GADTSyntax #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE UnboxedSums #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | Parse input as though it were text encoded by +ISO 8859-1 (Latin-1). All byte sequences are valid +text under ISO 8859-1. +-} module Data.Bytes.Parser.Latin ( -- * Matching + -- ** Required char , char2 @@ -35,18 +37,23 @@ module Data.Bytes.Parser.Latin , char10 , char11 , char12 + -- ** Try , trySatisfy , trySatisfyThen + -- * One Character , any , opt , opt# + -- * Many Characters , takeTrailedBy + -- * Lookahead , peek , peek' + -- * Skip , skipDigits , skipDigits1 @@ -55,17 +62,22 @@ module Data.Bytes.Parser.Latin , skipTrailedBy , skipUntil , skipWhile + -- * End of Input , endOfInput , isEndOfInput + -- * Numbers + -- ** Decimal + -- *** Unsigned , decWord , decWord8 , decWord16 , decWord32 , decWord64 + -- *** Signed , decUnsignedInt , decUnsignedInt# @@ -76,11 +88,14 @@ module Data.Bytes.Parser.Latin , decSignedInteger , decUnsignedInteger , decTrailingInteger + -- ** Hexadecimal + -- *** Variable Length , hexWord8 , hexWord16 , hexWord32 + -- *** Fixed Length , hexFixedWord8 , hexFixedWord16 @@ -88,6 +103,7 @@ module Data.Bytes.Parser.Latin , hexFixedWord64 , hexFixedWord128 , hexFixedWord256 + -- *** Digit , hexNibbleLower , tryHexNibbleLower @@ -95,33 +111,29 @@ module Data.Bytes.Parser.Latin , tryHexNibble ) where -import Prelude hiding (length,any,fail,takeWhile) +import Prelude hiding (any, fail, length, takeWhile) import Data.Bits ((.|.)) -import Data.Bytes.Types (Bytes(..)) -import Data.Bytes.Parser.Internal (InternalStep(..),unfailing) -import Data.Bytes.Parser (bindFromLiftedToInt,isEndOfInput,endOfInput) -import Data.Bytes.Parser.Internal (Parser(..),ST#,uneffectful,Result#,uneffectful#) -import Data.Bytes.Parser.Internal (Result(..),indexLatinCharArray,upcastUnitSuccess) -import Data.Bytes.Parser.Internal (boxBytes) -import Data.Bytes.Parser.Unsafe (expose,cursor,unconsume) +import Data.Bytes.Parser (bindFromLiftedToInt, endOfInput, isEndOfInput) +import Data.Bytes.Parser.Internal (InternalStep (..), Parser (..), Result (..), Result#, ST#, boxBytes, indexLatinCharArray, uneffectful, uneffectful#, unfailing, upcastUnitSuccess) +import Data.Bytes.Parser.Unsafe (cursor, expose, unconsume) +import Data.Bytes.Types (Bytes (..)) import Data.Char (ord) import Data.Kind (Type) -import Data.WideWord (Word256(Word256),Word128(Word128)) +import Data.WideWord (Word128 (Word128), Word256 (Word256)) import Data.Word (Word8) -import GHC.Exts (Int(I#),Char(C#),Word#,Int#,Char#,(+#),(-#),indexCharArray#) -import GHC.Exts (TYPE,RuntimeRep,int2Word#,or#) -import GHC.Exts (ltWord#,gtWord#,notI#) -import GHC.Word (Word(W#),Word8(W8#),Word16(W16#),Word32(W32#),Word64(W64#)) +import GHC.Exts (Char (C#), Char#, Int (I#), Int#, RuntimeRep, TYPE, Word#, gtWord#, indexCharArray#, int2Word#, ltWord#, notI#, or#, (+#), (-#)) +import GHC.Word (Word (W#), Word16 (W16#), Word32 (W32#), Word64 (W64#), Word8 (W8#)) -import qualified GHC.Exts as Exts import qualified Data.Bytes as Bytes import qualified Data.Primitive as PM +import qualified GHC.Exts as Exts --- | Runs the predicate on the next character in the input. If the --- predicate is matched, this consumes the character. Otherwise, --- the character is not consumed. This returns @False@ if the end --- of the input has been reached. This never fails. +{- | Runs the predicate on the next character in the input. If the +predicate is matched, this consumes the character. Otherwise, +the character is not consumed. This returns @False@ if the end +of the input has been reached. This never fails. +-} trySatisfy :: (Char -> Bool) -> Parser e s Bool trySatisfy f = uneffectful $ \chunk -> case length chunk of 0 -> Success False (offset chunk) (length chunk) @@ -129,448 +141,561 @@ trySatisfy f = uneffectful $ \chunk -> case length chunk of True -> Success True (offset chunk + 1) (length chunk - 1) False -> Success False (offset chunk) (length chunk) --- | Runs the function on the next character in the input. If the --- function returns @Just@, this consumes the character and then --- runs the parser on the remaining input. If the function returns --- @Nothing@, this does not consume the tested character, and it --- runs the default parser on the input (which includes the tested --- character). If there is no input remaining, this also runs the --- default parser. This combinator never fails. -trySatisfyThen :: forall (r :: RuntimeRep) (e :: Type) (s :: Type) (a :: TYPE r). - Parser e s a -- ^ Default parser. Runs on @Nothing@ or end of input. - -> (Char -> Maybe (Parser e s a)) -- ^ Parser-selecting predicate - -> Parser e s a -{-# inline trySatisfyThen #-} -trySatisfyThen (Parser g) f = Parser - (\input@(# arr,off0,len0 #) s0 -> case len0 of - 0# -> g input s0 - _ -> case f (C# (indexCharArray# arr off0)) of - Nothing -> g input s0 - Just (Parser p) -> p (# arr, off0 +# 1#, len0 -# 1# #) s0 - ) +{- | Runs the function on the next character in the input. If the +function returns @Just@, this consumes the character and then +runs the parser on the remaining input. If the function returns +@Nothing@, this does not consume the tested character, and it +runs the default parser on the input (which includes the tested +character). If there is no input remaining, this also runs the +default parser. This combinator never fails. +-} +trySatisfyThen :: + forall (r :: RuntimeRep) (e :: Type) (s :: Type) (a :: TYPE r). + -- | Default parser. Runs on @Nothing@ or end of input. + Parser e s a -> + -- | Parser-selecting predicate + (Char -> Maybe (Parser e s a)) -> + Parser e s a +{-# INLINE trySatisfyThen #-} +trySatisfyThen (Parser g) f = + Parser + ( \input@(# arr, off0, len0 #) s0 -> case len0 of + 0# -> g input s0 + _ -> case f (C# (indexCharArray# arr off0)) of + Nothing -> g input s0 + Just (Parser p) -> p (# arr, off0 +# 1#, len0 -# 1# #) s0 + ) --- | Consume the next character, failing if it does not --- match the expected value or if there is no more input. +{- | Consume the next character, failing if it does not +match the expected value or if there is no more input. +-} char :: e -> Char -> Parser e s () -{-# inline char #-} -char e !c = uneffectful $ \chunk -> if length chunk > 0 - then if indexLatinCharArray (array chunk) (offset chunk) == c - then Success () (offset chunk + 1) (length chunk - 1) +{-# INLINE char #-} +char e !c = uneffectful $ \chunk -> + if length chunk > 0 + then + if indexLatinCharArray (array chunk) (offset chunk) == c + then Success () (offset chunk + 1) (length chunk - 1) + else Failure e else Failure e - else Failure e --- | Consume the next two characters, failing if they do --- not match the expected values. --- --- > char2 e a b === char e a *> char e b +{- | Consume the next two characters, failing if they do +not match the expected values. + +> char2 e a b === char e a *> char e b +-} char2 :: e -> Char -> Char -> Parser e s () -{-# inline char2 #-} +{-# INLINE char2 #-} char2 e !c0 !c1 = uneffectful $ \chunk -> - if | length chunk > 1 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - -> Success () (offset chunk + 2) (length chunk - 2) - | otherwise -> Failure e - --- | Consume three characters, failing if they do --- not match the expected values. --- --- > char3 e a b c === char e a *> char e b *> char e c + if + | length chunk > 1 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 -> + Success () (offset chunk + 2) (length chunk - 2) + | otherwise -> Failure e + +{- | Consume three characters, failing if they do +not match the expected values. + +> char3 e a b c === char e a *> char e b *> char e c +-} char3 :: e -> Char -> Char -> Char -> Parser e s () -{-# inline char3 #-} +{-# INLINE char3 #-} char3 e !c0 !c1 !c2 = uneffectful $ \chunk -> - if | length chunk > 2 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - -> Success () (offset chunk + 3) (length chunk - 3) - | otherwise -> Failure e - --- | Consume four characters, failing if they do --- not match the expected values. --- --- > char4 e a b c d === char e a *> char e b *> char e c *> char e d + if + | length chunk > 2 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 -> + Success () (offset chunk + 3) (length chunk - 3) + | otherwise -> Failure e + +{- | Consume four characters, failing if they do +not match the expected values. + +> char4 e a b c d === char e a *> char e b *> char e c *> char e d +-} char4 :: e -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char4 #-} +{-# INLINE char4 #-} char4 e !c0 !c1 !c2 !c3 = uneffectful $ \chunk -> - if | length chunk > 3 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - -> Success () (offset chunk + 4) (length chunk - 4) - | otherwise -> Failure e - --- | Consume five characters, failing if they do --- not match the expected values. + if + | length chunk > 3 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 -> + Success () (offset chunk + 4) (length chunk - 4) + | otherwise -> Failure e + +{- | Consume five characters, failing if they do +not match the expected values. +-} char5 :: e -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char5 #-} +{-# INLINE char5 #-} char5 e !c0 !c1 !c2 !c3 !c4 = uneffectful $ \chunk -> - if | length chunk > 4 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - -> Success () (offset chunk + 5) (length chunk - 5) - | otherwise -> Failure e - --- | Consume six characters, failing if they do --- not match the expected values. + if + | length chunk > 4 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 -> + Success () (offset chunk + 5) (length chunk - 5) + | otherwise -> Failure e + +{- | Consume six characters, failing if they do +not match the expected values. +-} char6 :: e -> Char -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char6 #-} +{-# INLINE char6 #-} char6 e !c0 !c1 !c2 !c3 !c4 !c5 = uneffectful $ \chunk -> - if | length chunk > 5 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - -> Success () (offset chunk + 6) (length chunk - 6) - | otherwise -> Failure e - --- | Consume seven characters, failing if they do --- not match the expected values. + if + | length chunk > 5 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 -> + Success () (offset chunk + 6) (length chunk - 6) + | otherwise -> Failure e + +{- | Consume seven characters, failing if they do +not match the expected values. +-} char7 :: e -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char7 #-} +{-# INLINE char7 #-} char7 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 = uneffectful $ \chunk -> - if | length chunk > 6 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - -> Success () (offset chunk + 7) (length chunk - 7) - | otherwise -> Failure e - --- | Consume eight characters, failing if they do --- not match the expected values. + if + | length chunk > 6 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 -> + Success () (offset chunk + 7) (length chunk - 7) + | otherwise -> Failure e + +{- | Consume eight characters, failing if they do +not match the expected values. +-} char8 :: e -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char8 #-} +{-# INLINE char8 #-} char8 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 = uneffectful $ \chunk -> - if | length chunk > 7 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 - -> Success () (offset chunk + 8) (length chunk - 8) - | otherwise -> Failure e - --- | Consume nine characters, failing if they do --- not match the expected values. -char9 :: e -> Char -> Char -> Char -> Char - -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char9 #-} + if + | length chunk > 7 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 + , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 -> + Success () (offset chunk + 8) (length chunk - 8) + | otherwise -> Failure e + +{- | Consume nine characters, failing if they do +not match the expected values. +-} +char9 :: + e -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Parser e s () +{-# INLINE char9 #-} char9 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 = uneffectful $ \chunk -> - if | length chunk > 8 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 - , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 - -> Success () (offset chunk + 9) (length chunk - 9) - | otherwise -> Failure e - --- | Consume ten characters, failing if they do --- not match the expected values. -char10 :: e -> Char -> Char -> Char -> Char -> Char - -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char10 #-} + if + | length chunk > 8 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 + , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 + , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 -> + Success () (offset chunk + 9) (length chunk - 9) + | otherwise -> Failure e + +{- | Consume ten characters, failing if they do +not match the expected values. +-} +char10 :: + e -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Parser e s () +{-# INLINE char10 #-} char10 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 = uneffectful $ \chunk -> - if | length chunk > 9 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 - , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 - , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 - -> Success () (offset chunk + 10) (length chunk - 10) - | otherwise -> Failure e - --- | Consume eleven characters, failing if they do --- not match the expected values. -char11 :: e -> Char -> Char -> Char -> Char -> Char -> Char - -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char11 #-} + if + | length chunk > 9 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 + , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 + , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 + , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 -> + Success () (offset chunk + 10) (length chunk - 10) + | otherwise -> Failure e + +{- | Consume eleven characters, failing if they do +not match the expected values. +-} +char11 :: + e -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Parser e s () +{-# INLINE char11 #-} char11 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 = uneffectful $ \chunk -> - if | length chunk > 10 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 - , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 - , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 - , indexLatinCharArray (array chunk) (offset chunk + 10) == c10 - -> Success () (offset chunk + 11) (length chunk - 11) - | otherwise -> Failure e - --- | Consume twelve characters, failing if they do --- not match the expected values. -char12 :: e -> Char -> Char -> Char -> Char -> Char -> Char - -> Char -> Char -> Char -> Char -> Char -> Char -> Parser e s () -{-# inline char12 #-} + if + | length chunk > 10 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 + , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 + , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 + , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 + , indexLatinCharArray (array chunk) (offset chunk + 10) == c10 -> + Success () (offset chunk + 11) (length chunk - 11) + | otherwise -> Failure e + +{- | Consume twelve characters, failing if they do +not match the expected values. +-} +char12 :: + e -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Char -> + Parser e s () +{-# INLINE char12 #-} char12 e !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 = uneffectful $ \chunk -> - if | length chunk > 11 - , indexLatinCharArray (array chunk) (offset chunk) == c0 - , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 - , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 - , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 - , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 - , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 - , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 - , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 - , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 - , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 - , indexLatinCharArray (array chunk) (offset chunk + 10) == c10 - , indexLatinCharArray (array chunk) (offset chunk + 11) == c11 - -> Success () (offset chunk + 12) (length chunk - 12) - | otherwise -> Failure e + if + | length chunk > 11 + , indexLatinCharArray (array chunk) (offset chunk) == c0 + , indexLatinCharArray (array chunk) (offset chunk + 1) == c1 + , indexLatinCharArray (array chunk) (offset chunk + 2) == c2 + , indexLatinCharArray (array chunk) (offset chunk + 3) == c3 + , indexLatinCharArray (array chunk) (offset chunk + 4) == c4 + , indexLatinCharArray (array chunk) (offset chunk + 5) == c5 + , indexLatinCharArray (array chunk) (offset chunk + 6) == c6 + , indexLatinCharArray (array chunk) (offset chunk + 7) == c7 + , indexLatinCharArray (array chunk) (offset chunk + 8) == c8 + , indexLatinCharArray (array chunk) (offset chunk + 9) == c9 + , indexLatinCharArray (array chunk) (offset chunk + 10) == c10 + , indexLatinCharArray (array chunk) (offset chunk + 11) == c11 -> + Success () (offset chunk + 12) (length chunk - 12) + | otherwise -> Failure e -- | Consumes and returns the next character in the input. any :: e -> Parser e s Char -{-# inline any #-} -any e = uneffectful $ \chunk -> if length chunk > 0 - then - let c = indexLatinCharArray (array chunk) (offset chunk) - in Success c (offset chunk + 1) (length chunk - 1) - else Failure e +{-# INLINE any #-} +any e = uneffectful $ \chunk -> + if length chunk > 0 + then + let c = indexLatinCharArray (array chunk) (offset chunk) + in Success c (offset chunk + 1) (length chunk - 1) + else Failure e --- | Consume a character from the input or return @Nothing@ if --- end of the stream has been reached. Since ISO 8859-1 maps every --- bytes to a character, this parser never fails. +{- | Consume a character from the input or return @Nothing@ if +end of the stream has been reached. Since ISO 8859-1 maps every +bytes to a character, this parser never fails. +-} opt :: Parser e s (Maybe Char) -{-# inline opt #-} +{-# INLINE opt #-} opt = uneffectful $ \chunk -> case length chunk of 0 -> Success Nothing (offset chunk) (length chunk) - _ -> Success - (Just (indexLatinCharArray (array chunk) (offset chunk))) - (offset chunk + 1) (length chunk - 1) + _ -> + Success + (Just (indexLatinCharArray (array chunk) (offset chunk))) + (offset chunk + 1) + (length chunk - 1) -- | Variant of @opt@ with unboxed result. opt# :: Parser e s (# (# #) | Char# #) -{-# inline opt# #-} -opt# = Parser - (\(# arr, off, len #) s0 -> case len of - 0# -> (# s0, (# | (# (# (# #) | #), off, len #) #) #) - _ -> (# s0, (# | (# (# | indexCharArray# arr off #), off +# 1#, len -# 1# #) #) #) - ) +{-# INLINE opt# #-} +opt# = + Parser + ( \(# arr, off, len #) s0 -> case len of + 0# -> (# s0, (# | (# (# (# #) | #), off, len #) #) #) + _ -> (# s0, (# | (# (# | indexCharArray# arr off #), off +# 1#, len -# 1# #) #) #) + ) skipDigitsAsciiLoop :: - Bytes -- Chunk - -> (# Int#, Int# #) -skipDigitsAsciiLoop !c = if length c > 0 - then - let w = indexLatinCharArray (array c) (offset c) - in if w >= '0' && w <= '9' - then skipDigitsAsciiLoop (Bytes.unsafeDrop 1 c) - else (# unI (offset c), unI (length c) #) - else (# unI (offset c), unI (length c) #) + Bytes -> -- Chunk + (# Int#, Int# #) +skipDigitsAsciiLoop !c = + if length c > 0 + then + let w = indexLatinCharArray (array c) (offset c) + in if w >= '0' && w <= '9' + then skipDigitsAsciiLoop (Bytes.unsafeDrop 1 c) + else (# unI (offset c), unI (length c) #) + else (# unI (offset c), unI (length c) #) skipDigitsAscii1LoopStart :: - e - -> Bytes -- chunk - -> Result# e () -skipDigitsAscii1LoopStart e !c = if length c > 0 - then - let w = indexLatinCharArray (array c) (offset c) - in if w >= '0' && w <= '9' - then upcastUnitSuccess (skipDigitsAsciiLoop (Bytes.unsafeDrop 1 c)) - else (# e | #) - else (# e | #) + e -> + Bytes -> -- chunk + Result# e () +skipDigitsAscii1LoopStart e !c = + if length c > 0 + then + let w = indexLatinCharArray (array c) (offset c) + in if w >= '0' && w <= '9' + then upcastUnitSuccess (skipDigitsAsciiLoop (Bytes.unsafeDrop 1 c)) + else (# e | #) + else (# e | #) --- | Variant of 'skipDigits' that requires at least one digit --- to be present. +{- | Variant of 'skipDigits' that requires at least one digit +to be present. +-} skipDigits1 :: e -> Parser e s () -{-# inline skipDigits1 #-} +{-# INLINE skipDigits1 #-} skipDigits1 e = uneffectful# $ \c -> skipDigitsAscii1LoopStart e c --- | Skip the characters @0-9@ until a non-digit is encountered. --- This parser does not fail. +{- | Skip the characters @0-9@ until a non-digit is encountered. +This parser does not fail. +-} skipDigits :: Parser e s () skipDigits = uneffectful# $ \c -> upcastUnitSuccess (skipDigitsAsciiLoop c) unI :: Int -> Int# -{-# inline unI #-} +{-# INLINE unI #-} unI (I# w) = w --- | Skip the character any number of times. This succeeds --- even if the character was not present. +{- | Skip the character any number of times. This succeeds +even if the character was not present. +-} skipChar :: Char -> Parser e s () -{-# inline skipChar #-} +{-# INLINE skipChar #-} skipChar !w = uneffectful# $ \c -> upcastUnitSuccess (skipLoop w c) --- | Skip the character any number of times. It must occur --- at least once or else this will fail. +{- | Skip the character any number of times. It must occur +at least once or else this will fail. +-} skipChar1 :: e -> Char -> Parser e s () -{-# inline skipChar1 #-} +{-# INLINE skipChar1 #-} skipChar1 e !w = uneffectful# $ \c -> skipLoop1Start e w c skipLoop :: - Char -- byte to match - -> Bytes -- Chunk - -> (# Int#, Int# #) -skipLoop !w !c = if length c > 0 - then if indexLatinCharArray (array c) (offset c) == w - then skipLoop w (Bytes.unsafeDrop 1 c) + Char -> -- byte to match + Bytes -> -- Chunk + (# Int#, Int# #) +skipLoop !w !c = + if length c > 0 + then + if indexLatinCharArray (array c) (offset c) == w + then skipLoop w (Bytes.unsafeDrop 1 c) + else (# unI (offset c), unI (length c) #) else (# unI (offset c), unI (length c) #) - else (# unI (offset c), unI (length c) #) skipLoop1Start :: - e - -> Char -- byte to match - -> Bytes -- chunk - -> Result# e () -skipLoop1Start e !w !chunk0 = if length chunk0 > 0 - then if indexLatinCharArray (array chunk0) (offset chunk0) == w - then upcastUnitSuccess (skipLoop w (Bytes.unsafeDrop 1 chunk0)) + e -> + Char -> -- byte to match + Bytes -> -- chunk + Result# e () +skipLoop1Start e !w !chunk0 = + if length chunk0 > 0 + then + if indexLatinCharArray (array chunk0) (offset chunk0) == w + then upcastUnitSuccess (skipLoop w (Bytes.unsafeDrop 1 chunk0)) + else (# e | #) else (# e | #) - else (# e | #) --- | Parse a decimal-encoded 8-bit word. If the number is larger --- than 255, this parser fails. +{- | Parse a decimal-encoded 8-bit word. If the number is larger +than 255, this parser fails. +-} decWord8 :: e -> Parser e s Word8 -decWord8 e = Parser - (\chunk0 s0 -> case decSmallWordStart e 256 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord8Result r #) - ) +decWord8 e = + Parser + ( \chunk0 s0 -> case decSmallWordStart e 256 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord8Result r #) + ) --- | Parse a hexadecimal-encoded 8-bit word. If the number is larger --- than 255, this parser fails. This allows leading zeroes and is --- insensitive to case. For example, @00A@, @0a@ and @A@ would all --- be accepted as the same number. +{- | Parse a hexadecimal-encoded 8-bit word. If the number is larger +than 255, this parser fails. This allows leading zeroes and is +insensitive to case. For example, @00A@, @0a@ and @A@ would all +be accepted as the same number. +-} hexWord8 :: e -> Parser e s Word8 -hexWord8 e = Parser - (\chunk0 s0 -> case hexSmallWordStart e 256 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord8Result r #) - ) +hexWord8 e = + Parser + ( \chunk0 s0 -> case hexSmallWordStart e 256 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord8Result r #) + ) --- | Parse a hexadecimal-encoded 16-bit word. If the number is larger --- than 65535, this parser fails. This allows leading zeroes and is --- insensitive to case. For example, @0100a@ and @100A@ would both --- be accepted as the same number. +{- | Parse a hexadecimal-encoded 16-bit word. If the number is larger +than 65535, this parser fails. This allows leading zeroes and is +insensitive to case. For example, @0100a@ and @100A@ would both +be accepted as the same number. +-} hexWord16 :: e -> Parser e s Word16 -hexWord16 e = Parser - (\chunk0 s0 -> case hexSmallWordStart e 65536 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord16Result r #) - ) +hexWord16 e = + Parser + ( \chunk0 s0 -> case hexSmallWordStart e 65536 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord16Result r #) + ) hexWord32 :: e -> Parser e s Word32 -hexWord32 e = Parser - (\chunk0 s0 -> case hexSmallWordStart e 4294967296 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord32Result r #) - ) +hexWord32 e = + Parser + ( \chunk0 s0 -> case hexSmallWordStart e 4294967296 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord32Result r #) + ) --- | Parse a decimal-encoded 16-bit word. If the number is larger --- than 65535, this parser fails. +{- | Parse a decimal-encoded 16-bit word. If the number is larger +than 65535, this parser fails. +-} decWord16 :: e -> Parser e s Word16 -decWord16 e = Parser - (\chunk0 s0 -> case decSmallWordStart e 65536 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord16Result r #) - ) +decWord16 e = + Parser + ( \chunk0 s0 -> case decSmallWordStart e 65536 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord16Result r #) + ) --- | Parse a decimal-encoded 32-bit word. If the number is larger --- than 4294967295, this parser fails. +{- | Parse a decimal-encoded 32-bit word. If the number is larger +than 4294967295, this parser fails. +-} decWord32 :: e -> Parser e s Word32 -- This will not work on 32-bit platforms. -decWord32 e = Parser - (\chunk0 s0 -> case decSmallWordStart e 4294967296 (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord32Result r #) - ) +decWord32 e = + Parser + ( \chunk0 s0 -> case decSmallWordStart e 4294967296 (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord32Result r #) + ) --- | Parse a decimal-encoded number. If the number is too large to be --- represented by a machine word, this fails with the provided --- error message. This accepts any number of leading zeroes. +{- | Parse a decimal-encoded number. If the number is too large to be +represented by a machine word, this fails with the provided +error message. This accepts any number of leading zeroes. +-} decWord :: e -> Parser e s Word -decWord e = Parser - (\chunk0 s0 -> case decWordStart e (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWordResult r #) - ) +decWord e = + Parser + ( \chunk0 s0 -> case decWordStart e (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWordResult r #) + ) --- | Parse a decimal-encoded unsigned number. If the number is --- too large to be represented by a 64-bit word, this fails with --- the provided error message. This accepts any number of leading --- zeroes. +{- | Parse a decimal-encoded unsigned number. If the number is +too large to be represented by a 64-bit word, this fails with +the provided error message. This accepts any number of leading +zeroes. +-} decWord64 :: e -> Parser e s Word64 -decWord64 e = Parser - (\chunk0 s0 -> case decWordStart e (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastWord64Result r #) - ) +decWord64 e = + Parser + ( \chunk0 s0 -> case decWordStart e (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastWord64Result r #) + ) hexSmallWordStart :: - e -- Error message - -> Word -- Upper Bound - -> Bytes -- Chunk - -> ST# s (Result# e Word# ) -hexSmallWordStart e !limit !chunk0 s0 = if length chunk0 > 0 - then case oneHexMaybe (PM.indexByteArray (array chunk0) (offset chunk0)) of - Nothing -> (# s0, (# e | #) #) - Just w -> (# s0, hexSmallWordMore e w limit (Bytes.unsafeDrop 1 chunk0) #) - else (# s0, (# e | #) #) + e -> -- Error message + Word -> -- Upper Bound + Bytes -> -- Chunk + ST# s (Result# e Word#) +hexSmallWordStart e !limit !chunk0 s0 = + if length chunk0 > 0 + then case oneHexMaybe (PM.indexByteArray (array chunk0) (offset chunk0)) of + Nothing -> (# s0, (# e | #) #) + Just w -> (# s0, hexSmallWordMore e w limit (Bytes.unsafeDrop 1 chunk0) #) + else (# s0, (# e | #) #) decSmallWordStart :: - e -- Error message - -> Word -- Upper Bound - -> Bytes -- Chunk - -> ST# s (Result# e Word# ) -decSmallWordStart e !limit !chunk0 s0 = if length chunk0 > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then (# s0, decSmallWordMore e w limit (Bytes.unsafeDrop 1 chunk0) #) - else (# s0, (# e | #) #) - else (# s0, (# e | #) #) + e -> -- Error message + Word -> -- Upper Bound + Bytes -> -- Chunk + ST# s (Result# e Word#) +decSmallWordStart e !limit !chunk0 s0 = + if length chunk0 > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then (# s0, decSmallWordMore e w limit (Bytes.unsafeDrop 1 chunk0) #) + else (# s0, (# e | #) #) + else (# s0, (# e | #) #) -- This will not inline since it is recursive, but worker -- wrapper will still happen. decWordMore :: - e -- Error message - -> Word -- Accumulator - -> Bytes -- Chunk - -> Result# e Word# + e -> -- Error message + Word -> -- Accumulator + Bytes -> -- Chunk + Result# e Word# decWordMore e !acc !chunk0 = case len of 0 -> (# | (# unW (fromIntegral acc), unI (offset chunk0), 0# #) #) _ -> - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 in if w < 10 then - let (overflow,acc') = unsignedPushBase10 acc w + let (overflow, acc') = unsignedPushBase10 acc w in if overflow - then (# e | #) - else decWordMore e acc' (Bytes.unsafeDrop 1 chunk0) + then (# e | #) + else decWordMore e acc' (Bytes.unsafeDrop 1 chunk0) else (# | (# unW (fromIntegral acc), unI (offset chunk0), len# #) #) - where - !len@(I# len# ) = length chunk0 + where + !len@(I# len#) = length chunk0 upcastWordResult :: Result# e Word# -> Result# e Word -{-# inline upcastWordResult #-} +{-# INLINE upcastWordResult #-} upcastWordResult (# e | #) = (# e | #) upcastWordResult (# | (# a, b, c #) #) = (# | (# W# a, b, c #) #) +{- FOURMOLU_DISABLE -} -- This only works on 64-bit platforms. upcastWord64Result :: Result# e Word# -> Result# e Word64 {-# inline upcastWord64Result #-} @@ -582,58 +707,68 @@ upcastWord64Result (# | (# a, b, c #) #) = (# | (# W64# ( a #endif ), b, c #) #) +{- FOURMOLU_ENABLE -} hexSmallWordMore :: - e -- Error message - -> Word -- Accumulator - -> Word -- Upper Bound - -> Bytes -- Chunk - -> Result# e Word# -hexSmallWordMore e !acc !limit !chunk0 = if length chunk0 > 0 - then case oneHexMaybe (PM.indexByteArray (array chunk0) (offset chunk0)) of - Nothing -> (# | (# unW acc, unI (offset chunk0), unI (length chunk0) #) #) - Just w -> let w' = acc * 16 + w in - if w' < limit - then hexSmallWordMore e w' limit (Bytes.unsafeDrop 1 chunk0) - else (# e | #) - else (# | (# unW acc, unI (offset chunk0), 0# #) #) + e -> -- Error message + Word -> -- Accumulator + Word -> -- Upper Bound + Bytes -> -- Chunk + Result# e Word# +hexSmallWordMore e !acc !limit !chunk0 = + if length chunk0 > 0 + then case oneHexMaybe (PM.indexByteArray (array chunk0) (offset chunk0)) of + Nothing -> (# | (# unW acc, unI (offset chunk0), unI (length chunk0) #) #) + Just w -> + let w' = acc * 16 + w + in if w' < limit + then hexSmallWordMore e w' limit (Bytes.unsafeDrop 1 chunk0) + else (# e | #) + else (# | (# unW acc, unI (offset chunk0), 0# #) #) decSmallWordMore :: - e -- Error message - -> Word -- Accumulator - -> Word -- Upper Bound - -> Bytes -- Chunk - -> Result# e Word# -decSmallWordMore e !acc !limit !chunk0 = if length chunk0 > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then - let w' = acc * 10 + w - in if w' < limit - then decSmallWordMore e w' limit (Bytes.unsafeDrop 1 chunk0) - else (# e | #) - else (# | (# unW acc, unI (offset chunk0), unI (length chunk0) #) #) - else (# | (# unW acc, unI (offset chunk0), 0# #) #) + e -> -- Error message + Word -> -- Accumulator + Word -> -- Upper Bound + Bytes -> -- Chunk + Result# e Word# +decSmallWordMore e !acc !limit !chunk0 = + if length chunk0 > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then + let w' = acc * 10 + w + in if w' < limit + then decSmallWordMore e w' limit (Bytes.unsafeDrop 1 chunk0) + else (# e | #) + else (# | (# unW acc, unI (offset chunk0), unI (length chunk0) #) #) + else (# | (# unW acc, unI (offset chunk0), 0# #) #) unW :: Word -> Word# -{-# inline unW #-} +{-# INLINE unW #-} unW (W# w) = w decWordStart :: - e -- Error message - -> Bytes -- Chunk - -> ST# s (Result# e Word# ) -decWordStart e !chunk0 s0 = if length chunk0 > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then (# s0, decWordMore e w (Bytes.unsafeDrop 1 chunk0) #) - else (# s0, (# e | #) #) - else (# s0, (# e | #) #) - + e -> -- Error message + Bytes -> -- Chunk + ST# s (Result# e Word#) +decWordStart e !chunk0 s0 = + if length chunk0 > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then (# s0, decWordMore e w (Bytes.unsafeDrop 1 chunk0) #) + else (# s0, (# e | #) #) + else (# s0, (# e | #) #) + +{- FOURMOLU_DISABLE -} -- Precondition: the word is small enough upcastWord16Result :: Result# e Word# -> Result# e Word16 {-# inline upcastWord16Result #-} @@ -663,219 +798,256 @@ upcastWord8Result (# | (# a, b, c #) #) = (# | (# W8# ( Exts.wordToWord8# #endif a), b, c #) #) - --- | Parse a decimal-encoded number. If the number is too large to be --- represented by a machine integer, this fails with the provided --- error message. This rejects input with that is preceeded by plus --- or minus. Consequently, it does not parse negative numbers. Use --- 'decStandardInt' or 'decSignedInt' for that purpose. On a 64-bit --- platform 'decWord' will successfully parse 9223372036854775808 --- (i.e. @2 ^ 63@), but 'decUnsignedInt' will fail. This parser allows --- leading zeroes. +{- FOURMOLU_ENABLE -} + +{- | Parse a decimal-encoded number. If the number is too large to be +represented by a machine integer, this fails with the provided +error message. This rejects input with that is preceeded by plus +or minus. Consequently, it does not parse negative numbers. Use +'decStandardInt' or 'decSignedInt' for that purpose. On a 64-bit +platform 'decWord' will successfully parse 9223372036854775808 +(i.e. @2 ^ 63@), but 'decUnsignedInt' will fail. This parser allows +leading zeroes. +-} decUnsignedInt :: e -> Parser e s Int -decUnsignedInt e = Parser - (\chunk0 s0 -> case decPosIntStart e (boxBytes chunk0) s0 of - (# s1, r #) -> (# s1, upcastIntResult r #) - ) +decUnsignedInt e = + Parser + ( \chunk0 s0 -> case decPosIntStart e (boxBytes chunk0) s0 of + (# s1, r #) -> (# s1, upcastIntResult r #) + ) -- | Variant of 'decUnsignedInt' with an unboxed result. decUnsignedInt# :: e -> Parser e s Int# -decUnsignedInt# e = Parser - (\chunk0 s0 -> decPosIntStart e (boxBytes chunk0) s0) - --- | Parse a decimal-encoded number. If the number is too large to be --- represented by a machine integer, this fails with the provided --- error message. This allows the number to optionally be prefixed --- by plus or minus. If the sign prefix is not present, the number --- is interpreted as positive. This allows leading zeroes. +decUnsignedInt# e = + Parser + (\chunk0 s0 -> decPosIntStart e (boxBytes chunk0) s0) + +{- | Parse a decimal-encoded number. If the number is too large to be +represented by a machine integer, this fails with the provided +error message. This allows the number to optionally be prefixed +by plus or minus. If the sign prefix is not present, the number +is interpreted as positive. This allows leading zeroes. +-} decSignedInt :: e -> Parser e s Int -decSignedInt e = Parser - (\chunk0 s0 -> case runParser (decSignedInt# e) chunk0 s0 of - (# s1, r #) -> (# s1, upcastIntResult r #) - ) +decSignedInt e = + Parser + ( \chunk0 s0 -> case runParser (decSignedInt# e) chunk0 s0 of + (# s1, r #) -> (# s1, upcastIntResult r #) + ) --- | Variant of 'decUnsignedInt' that lets the caller supply a leading --- digit. This is useful when parsing formats like JSON where integers with --- leading zeroes are considered invalid. The calling context must --- consume the first digit before calling this parser. Results are --- always positive numbers. +{- | Variant of 'decUnsignedInt' that lets the caller supply a leading +digit. This is useful when parsing formats like JSON where integers with +leading zeroes are considered invalid. The calling context must +consume the first digit before calling this parser. Results are +always positive numbers. +-} decTrailingInt :: - e -- ^ Error message - -> Int -- ^ Leading digit, should be between @0@ and @9@. - -> Parser e s Int -decTrailingInt e (I# w) = Parser - (\chunk0 s0 -> case runParser (decTrailingInt# e w) chunk0 s0 of - (# s1, r #) -> (# s1, upcastIntResult r #) - ) + -- | Error message + e -> + -- | Leading digit, should be between @0@ and @9@. + Int -> + Parser e s Int +decTrailingInt e (I# w) = + Parser + ( \chunk0 s0 -> case runParser (decTrailingInt# e w) chunk0 s0 of + (# s1, r #) -> (# s1, upcastIntResult r #) + ) decTrailingInt# :: - e -- Error message - -> Int# -- Leading digit, should be between @0@ and @9@. - -> Parser e s Int# + e -> -- Error message + Int# -> -- Leading digit, should be between @0@ and @9@. + Parser e s Int# decTrailingInt# e !w = Parser (\chunk0 s0 -> (# s0, decPosIntMore e (W# (int2Word# w)) maxIntAsWord (boxBytes chunk0) #)) maxIntAsWord :: Word maxIntAsWord = fromIntegral (maxBound :: Int) --- | Parse a decimal-encoded number. If the number is too large to be --- represented by a machine integer, this fails with the provided --- error message. This allows the number to optionally be prefixed --- by minus. If the minus prefix is not present, the number --- is interpreted as positive. The disallows a leading plus sign. --- For example, 'decStandardInt' rejects @+42@, but 'decSignedInt' --- allows it. +{- | Parse a decimal-encoded number. If the number is too large to be +represented by a machine integer, this fails with the provided +error message. This allows the number to optionally be prefixed +by minus. If the minus prefix is not present, the number +is interpreted as positive. The disallows a leading plus sign. +For example, 'decStandardInt' rejects @+42@, but 'decSignedInt' +allows it. +-} decStandardInt :: e -> Parser e s Int -decStandardInt e = Parser - (\chunk0 s0 -> case runParser (decStandardInt# e) chunk0 s0 of - (# s1, r #) -> (# s1, upcastIntResult r #) - ) +decStandardInt e = + Parser + ( \chunk0 s0 -> case runParser (decStandardInt# e) chunk0 s0 of + (# s1, r #) -> (# s1, upcastIntResult r #) + ) decSignedInt# :: e -> Parser e s Int# -{-# noinline decSignedInt# #-} -decSignedInt# e = any e `bindFromLiftedToInt` \c -> case c of - '+' -> Parser -- plus sign - (\chunk0 s0 -> decPosIntStart e (boxBytes chunk0) s0) - '-' -> Parser -- minus sign - (\chunk0 s0 -> decNegIntStart e (boxBytes chunk0) s0) - _ -> Parser -- no sign, there should be a digit here - (\chunk0 s0 -> - let !w = char2Word c - 48 - in if w < 10 - then (# s0, decPosIntMore e w maxIntAsWord (boxBytes chunk0) #) - else (# s0, (# e | #) #) - ) +{-# NOINLINE decSignedInt# #-} +decSignedInt# e = + any e `bindFromLiftedToInt` \c -> case c of + '+' -> + Parser -- plus sign + (\chunk0 s0 -> decPosIntStart e (boxBytes chunk0) s0) + '-' -> + Parser -- minus sign + (\chunk0 s0 -> decNegIntStart e (boxBytes chunk0) s0) + _ -> + Parser -- no sign, there should be a digit here + ( \chunk0 s0 -> + let !w = char2Word c - 48 + in if w < 10 + then (# s0, decPosIntMore e w maxIntAsWord (boxBytes chunk0) #) + else (# s0, (# e | #) #) + ) -- This is the same as decSignedInt except that we disallow -- a leading plus sign. decStandardInt# :: e -> Parser e s Int# -{-# noinline decStandardInt# #-} -decStandardInt# e = any e `bindFromLiftedToInt` \c -> case c of - '-' -> Parser -- minus sign - (\chunk0 s0 -> decNegIntStart e (boxBytes chunk0) s0) - _ -> Parser -- no sign, there should be a digit here - (\chunk0 s0 -> - let !w = char2Word c - 48 - in if w < 10 - then (# s0, decPosIntMore e w maxIntAsWord (boxBytes chunk0) #) - else (# s0, (# e | #) #) - ) - --- | Variant of 'decUnsignedInteger' that lets the caller supply a leading --- digit. This is useful when parsing formats like JSON where integers with --- leading zeroes are considered invalid. The calling context must --- consume the first digit before calling this parser. Results are --- always positive numbers. +{-# NOINLINE decStandardInt# #-} +decStandardInt# e = + any e `bindFromLiftedToInt` \c -> case c of + '-' -> + Parser -- minus sign + (\chunk0 s0 -> decNegIntStart e (boxBytes chunk0) s0) + _ -> + Parser -- no sign, there should be a digit here + ( \chunk0 s0 -> + let !w = char2Word c - 48 + in if w < 10 + then (# s0, decPosIntMore e w maxIntAsWord (boxBytes chunk0) #) + else (# s0, (# e | #) #) + ) + +{- | Variant of 'decUnsignedInteger' that lets the caller supply a leading +digit. This is useful when parsing formats like JSON where integers with +leading zeroes are considered invalid. The calling context must +consume the first digit before calling this parser. Results are +always positive numbers. +-} decTrailingInteger :: - Int -- ^ Leading digit, should be between @0@ and @9@. - -> Parser e s Integer + -- | Leading digit, should be between @0@ and @9@. + Int -> + Parser e s Integer decTrailingInteger (I# w) = Parser (\chunk0 s0 -> (# s0, (# | decIntegerChunks (I# w) 10 0 (boxBytes chunk0) #) #)) --- | Parse a decimal-encoded positive integer of arbitrary --- size. This rejects input that begins with a plus or minus --- sign. +{- | Parse a decimal-encoded positive integer of arbitrary +size. This rejects input that begins with a plus or minus +sign. +-} decUnsignedInteger :: e -> Parser e s Integer -decUnsignedInteger e = Parser - (\chunk0 s0 -> decUnsignedIntegerStart e (boxBytes chunk0) s0) - --- | Parse a decimal-encoded integer of arbitrary size. --- This accepts input that begins with a plus or minus sign. --- Input without a sign prefix is interpreted as positive. +decUnsignedInteger e = + Parser + (\chunk0 s0 -> decUnsignedIntegerStart e (boxBytes chunk0) s0) + +{- | Parse a decimal-encoded integer of arbitrary size. +This accepts input that begins with a plus or minus sign. +Input without a sign prefix is interpreted as positive. +-} decSignedInteger :: e -> Parser e s Integer -{-# noinline decSignedInteger #-} -decSignedInteger e = any e >>= \c -> case c of - '+' -> do - decUnsignedInteger e - '-' -> do - x <- decUnsignedInteger e - pure $! negate x - _ -> Parser -- no sign, there should be a digit here - (\chunk0 s0 -> - let !w = char2Word c - 48 in - if w < 10 - then - let !r = decIntegerChunks - (fromIntegral @Word @Int w) - 10 - 0 - (boxBytes chunk0) - in (# s0, (# | r #) #) - else (# s0, (# e | #) #) - ) +{-# NOINLINE decSignedInteger #-} +decSignedInteger e = + any e >>= \c -> case c of + '+' -> do + decUnsignedInteger e + '-' -> do + x <- decUnsignedInteger e + pure $! negate x + _ -> + Parser -- no sign, there should be a digit here + ( \chunk0 s0 -> + let !w = char2Word c - 48 + in if w < 10 + then + let !r = + decIntegerChunks + (fromIntegral @Word @Int w) + 10 + 0 + (boxBytes chunk0) + in (# s0, (# | r #) #) + else (# s0, (# e | #) #) + ) decPosIntStart :: - e -- Error message - -> Bytes -- Chunk - -> ST# s (Result# e Int# ) -decPosIntStart e !chunk0 s0 = if length chunk0 > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then (# s0, decPosIntMore e w maxIntAsWord (Bytes.unsafeDrop 1 chunk0) #) - else (# s0, (# e | #) #) - else (# s0, (# e | #) #) + e -> -- Error message + Bytes -> -- Chunk + ST# s (Result# e Int#) +decPosIntStart e !chunk0 s0 = + if length chunk0 > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then (# s0, decPosIntMore e w maxIntAsWord (Bytes.unsafeDrop 1 chunk0) #) + else (# s0, (# e | #) #) + else (# s0, (# e | #) #) decNegIntStart :: - e -- Error message - -> Bytes -- Chunk - -> ST# s (Result# e Int# ) -decNegIntStart e !chunk0 s0 = if length chunk0 > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then - case decPosIntMore e w (maxIntAsWord + 1) (Bytes.unsafeDrop 1 chunk0) of - (# | (# x, y, z #) #) -> - (# s0, (# | (# (notI# x +# 1# ), y, z #) #) #) - (# err | #) -> - (# s0, (# err | #) #) - else (# s0, (# e | #) #) - else (# s0, (# e | #) #) + e -> -- Error message + Bytes -> -- Chunk + ST# s (Result# e Int#) +decNegIntStart e !chunk0 s0 = + if length chunk0 > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then case decPosIntMore e w (maxIntAsWord + 1) (Bytes.unsafeDrop 1 chunk0) of + (# | (# x, y, z #) #) -> + (# s0, (# | (# (notI# x +# 1#), y, z #) #) #) + (# err | #) -> + (# s0, (# err | #) #) + else (# s0, (# e | #) #) + else (# s0, (# e | #) #) decUnsignedIntegerStart :: - e - -> Bytes - -> ST# s (Result# e Integer) -decUnsignedIntegerStart e !chunk0 s0 = if length chunk0 > 0 - then - let !w = (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < (10 :: Word8) - then - let !r = decIntegerChunks - (fromIntegral @Word8 @Int w) - 10 - 0 - (Bytes.unsafeDrop 1 chunk0) - in (# s0, (# | r #) #) - else (# s0, (# e | #) #) - else (# s0, (# e | #) #) + e -> + Bytes -> + ST# s (Result# e Integer) +decUnsignedIntegerStart e !chunk0 s0 = + if length chunk0 > 0 + then + let !w = (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 + in if w < (10 :: Word8) + then + let !r = + decIntegerChunks + (fromIntegral @Word8 @Int w) + 10 + 0 + (Bytes.unsafeDrop 1 chunk0) + in (# s0, (# | r #) #) + else (# s0, (# e | #) #) + else (# s0, (# e | #) #) -- This will not inline since it is recursive, but worker -- wrapper will still happen. Fails if the accumulator -- exceeds the upper bound. decPosIntMore :: - e -- Error message - -> Word -- Accumulator, precondition: less than or equal to bound - -> Word -- Inclusive Upper Bound, either (2^63 - 1) or 2^63 - -> Bytes -- Chunk - -> Result# e Int# -decPosIntMore e !acc !upper !chunk0 = if len > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then - let (overflow,acc') = positivePushBase10 acc w upper - in if overflow - then (# e | #) - else decPosIntMore e acc' upper (Bytes.unsafeDrop 1 chunk0) - else (# | (# unI (fromIntegral acc), unI (offset chunk0), len# #) #) - else (# | (# unI (fromIntegral acc), unI (offset chunk0), 0# #) #) - where - !len@(I# len# ) = length chunk0 + e -> -- Error message + Word -> -- Accumulator, precondition: less than or equal to bound + Word -> -- Inclusive Upper Bound, either (2^63 - 1) or 2^63 + Bytes -> -- Chunk + Result# e Int# +decPosIntMore e !acc !upper !chunk0 = + if len > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then + let (overflow, acc') = positivePushBase10 acc w upper + in if overflow + then (# e | #) + else decPosIntMore e acc' upper (Bytes.unsafeDrop 1 chunk0) + else (# | (# unI (fromIntegral acc), unI (offset chunk0), len# #) #) + else (# | (# unI (fromIntegral acc), unI (offset chunk0), 0# #) #) + where + !len@(I# len#) = length chunk0 -- This will not inline since it is recursive, but worker -- wrapper will still happen. When the accumulator @@ -888,40 +1060,48 @@ decPosIntMore e !acc !upper !chunk0 = if len > 0 -- Because of how we track overflow, we are able to use the -- same function for both positive and negative numbers. decIntegerChunks :: - Int -- Chunk accumulator (e.g. 236) - -> Int -- Chunk base-ten bound (e.g. 1000) - -> Integer -- Accumulator - -> Bytes -- Chunk - -> (# Integer, Int#, Int# #) -decIntegerChunks !nAcc !eAcc acc !chunk0 = if len > 0 - then - let !w = fromIntegral @Word8 @Word - (PM.indexByteArray (array chunk0) (offset chunk0)) - 48 - in if w < 10 - then let !eAcc' = eAcc * 10 in - if eAcc' >= eAcc - then decIntegerChunks - (nAcc * 10 + fromIntegral @Word @Int w) - eAcc' - acc - (Bytes.unsafeDrop 1 chunk0) - else - -- In this case, notice that we deliberately - -- unconsume the digit that would have caused - -- an overflow. - let !r = (acc * fromIntegral @Int @Integer eAcc) - + (fromIntegral @Int @Integer nAcc) - in decIntegerChunks 0 1 r chunk0 - else - let !r = (acc * fromIntegral @Int @Integer eAcc) - + (fromIntegral @Int @Integer nAcc) - in (# r, unI (offset chunk0), len# #) - else - let !r = (acc * fromIntegral @Int @Integer eAcc) - + (fromIntegral @Int @Integer nAcc) - in (# r, unI (offset chunk0), 0# #) - where - !len@(I# len# ) = length chunk0 + Int -> -- Chunk accumulator (e.g. 236) + Int -> -- Chunk base-ten bound (e.g. 1000) + Integer -> -- Accumulator + Bytes -> -- Chunk + (# Integer, Int#, Int# #) +decIntegerChunks !nAcc !eAcc acc !chunk0 = + if len > 0 + then + let !w = + fromIntegral @Word8 @Word + (PM.indexByteArray (array chunk0) (offset chunk0)) + - 48 + in if w < 10 + then + let !eAcc' = eAcc * 10 + in if eAcc' >= eAcc + then + decIntegerChunks + (nAcc * 10 + fromIntegral @Word @Int w) + eAcc' + acc + (Bytes.unsafeDrop 1 chunk0) + else -- In this case, notice that we deliberately + -- unconsume the digit that would have caused + -- an overflow. + + let !r = + (acc * fromIntegral @Int @Integer eAcc) + + (fromIntegral @Int @Integer nAcc) + in decIntegerChunks 0 1 r chunk0 + else + let !r = + (acc * fromIntegral @Int @Integer eAcc) + + (fromIntegral @Int @Integer nAcc) + in (# r, unI (offset chunk0), len# #) + else + let !r = + (acc * fromIntegral @Int @Integer eAcc) + + (fromIntegral @Int @Integer nAcc) + in (# r, unI (offset chunk0), 0# #) + where + !len@(I# len#) = length chunk0 upcastIntResult :: Result# e Int# -> Result# e Int upcastIntResult (# e | #) = (# e | #) @@ -930,14 +1110,15 @@ upcastIntResult (# | (# a, b, c #) #) = (# | (# I# a, b, c #) #) char2Word :: Char -> Word char2Word = fromIntegral . ord --- | Take characters until the specified character is encountered. --- Consumes the matched character as well. Fails if the character --- is not present. Visually, the cursor advancement and resulting --- @Bytes@ for @takeTrailedBy \'D\'@ look like this: --- --- > A B C D E F | input --- > |->->->-| | cursor --- > {-*-*-} | result bytes +{- | Take characters until the specified character is encountered. +Consumes the matched character as well. Fails if the character +is not present. Visually, the cursor advancement and resulting +@Bytes@ for @takeTrailedBy \'D\'@ look like this: + +> A B C D E F | input +> |->->->-| | cursor +> {\-*-*-\} | result bytes +-} takeTrailedBy :: e -> Char -> Parser e s Bytes takeTrailedBy e !w = do !start <- cursor @@ -946,53 +1127,57 @@ takeTrailedBy e !w = do !arr <- expose pure (Bytes arr start (end - (start + 1))) --- | Skip all characters until the terminator is encountered --- and then consume the matching character as well. Visually, --- @skipTrailedBy \'C\'@ advances the cursor like this: --- --- > A Z B Y C X C W --- > |->->->->-| --- --- This fails if it reaches the end of input without encountering --- the character. +{- | Skip all characters until the terminator is encountered +and then consume the matching character as well. Visually, +@skipTrailedBy \'C\'@ advances the cursor like this: + +> A Z B Y C X C W +> |->->->->-| + +This fails if it reaches the end of input without encountering +the character. +-} skipTrailedBy :: e -> Char -> Parser e s () skipTrailedBy e !w = uneffectful# $ \c -> skipUntilConsumeLoop e w c --- | Skip all characters until the terminator is encountered. --- This does not consume the terminator. Visually, @skipUntil \'C\'@ --- advances the cursor like this: --- --- > A Z B Y C X C W --- > |->->->-| --- --- This succeeds if it reaches the end of the input without --- encountering the terminator. It never fails. +{- | Skip all characters until the terminator is encountered. +This does not consume the terminator. Visually, @skipUntil \'C\'@ +advances the cursor like this: + +> A Z B Y C X C W +> |->->->-| + +This succeeds if it reaches the end of the input without +encountering the terminator. It never fails. +-} skipUntil :: Char -> Parser e s () skipUntil !w = uneffectful# $ \c -> skipUntilLoop w c skipUntilLoop :: - Char -- byte to match - -> Bytes -- Chunk - -> Result# e () + Char -> -- byte to match + Bytes -> -- Chunk + Result# e () skipUntilLoop !w !c = case length c of 0 -> (# | (# (), unI (offset c), 0# #) #) - _ -> if indexLatinCharArray (array c) (offset c) /= w - then skipUntilLoop w (Bytes.unsafeDrop 1 c) - else (# | (# (), unI (offset c), unI (length c) #) #) + _ -> + if indexLatinCharArray (array c) (offset c) /= w + then skipUntilLoop w (Bytes.unsafeDrop 1 c) + else (# | (# (), unI (offset c), unI (length c) #) #) skipUntilConsumeLoop :: - e -- Error message - -> Char -- byte to match - -> Bytes -- Chunk - -> Result# e () + e -> -- Error message + Char -> -- byte to match + Bytes -> -- Chunk + Result# e () skipUntilConsumeLoop e !w !c = case length c of 0 -> (# e | #) - _ -> if indexLatinCharArray (array c) (offset c) /= w - then skipUntilConsumeLoop e w (Bytes.unsafeDrop 1 c) - else (# | (# (), unI (offset c + 1), unI (length c - 1) #) #) - + _ -> + if indexLatinCharArray (array c) (offset c) /= w + then skipUntilConsumeLoop e w (Bytes.unsafeDrop 1 c) + else (# | (# (), unI (offset c + 1), unI (length c - 1) #) #) +{- FOURMOLU_DISABLE -} -- | Parse exactly eight ASCII-encoded characters, interpreting them as the -- hexadecimal encoding of a 32-bit number. Note that this rejects a sequence -- such as @BC5A9@, requiring @000BC5A9@ instead. This is insensitive to case. @@ -1008,34 +1193,41 @@ hexFixedWord32 e = Parser #endif a), b, c #) #) #) ) +{- FOURMOLU_ENABLE -} hexFixedWord32# :: e -> Parser e s Word# -{-# noinline hexFixedWord32# #-} -hexFixedWord32# e = uneffectfulWord# $ \chunk -> if length chunk >= 8 - then - let !w0@(W# n0) = oneHex $ PM.indexByteArray (array chunk) (offset chunk) - !w1@(W# n1) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 1) - !w2@(W# n2) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 2) - !w3@(W# n3) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 3) - !w4@(W# n4) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 4) - !w5@(W# n5) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 5) - !w6@(W# n6) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 6) - !w7@(W# n7) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 7) - in if | w0 .|. w1 .|. w2 .|. w3 .|. w4 .|. w5 .|. w6 .|. w7 /= maxBound -> - (# | - (# (n0 `Exts.timesWord#` 268435456##) `Exts.plusWord#` - (n1 `Exts.timesWord#` 16777216##) `Exts.plusWord#` - (n2 `Exts.timesWord#` 1048576##) `Exts.plusWord#` - (n3 `Exts.timesWord#` 65536##) `Exts.plusWord#` - (n4 `Exts.timesWord#` 4096##) `Exts.plusWord#` - (n5 `Exts.timesWord#` 256##) `Exts.plusWord#` - (n6 `Exts.timesWord#` 16##) `Exts.plusWord#` - n7 - , unI (offset chunk) +# 8# - , unI (length chunk) -# 8# #) #) - | otherwise -> (# e | #) - else (# e | #) +{-# NOINLINE hexFixedWord32# #-} +hexFixedWord32# e = uneffectfulWord# $ \chunk -> + if length chunk >= 8 + then + let !w0@(W# n0) = oneHex $ PM.indexByteArray (array chunk) (offset chunk) + !w1@(W# n1) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 1) + !w2@(W# n2) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 2) + !w3@(W# n3) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 3) + !w4@(W# n4) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 4) + !w5@(W# n5) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 5) + !w6@(W# n6) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 6) + !w7@(W# n7) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 7) + in if + | w0 .|. w1 .|. w2 .|. w3 .|. w4 .|. w5 .|. w6 .|. w7 /= maxBound -> + (# + | (# + (n0 `Exts.timesWord#` 268435456##) + `Exts.plusWord#` (n1 `Exts.timesWord#` 16777216##) + `Exts.plusWord#` (n2 `Exts.timesWord#` 1048576##) + `Exts.plusWord#` (n3 `Exts.timesWord#` 65536##) + `Exts.plusWord#` (n4 `Exts.timesWord#` 4096##) + `Exts.plusWord#` (n5 `Exts.timesWord#` 256##) + `Exts.plusWord#` (n6 `Exts.timesWord#` 16##) + `Exts.plusWord#` n7 + , unI (offset chunk) +# 8# + , unI (length chunk) -# 8# + #) + #) + | otherwise -> (# e | #) + else (# e | #) +{- FOURMOLU_DISABLE -} -- | Parse exactly 16 ASCII-encoded characters, interpreting them as the -- hexadecimal encoding of a 64-bit number. Note that this rejects a sequence -- such as @BC5A9@, requiring @00000000000BC5A9@ instead. This is insensitive @@ -1055,35 +1247,44 @@ hexFixedWord64 e = Parser #endif ), b, c #) #) #) ) +{- FOURMOLU_ENABLE -} hexFixedWord128 :: e -> Parser e s Word128 -hexFixedWord128 e = Word128 - <$> hexFixedWord64 e - <*> hexFixedWord64 e +hexFixedWord128 e = + Word128 + <$> hexFixedWord64 e + <*> hexFixedWord64 e hexFixedWord256 :: e -> Parser e s Word256 -hexFixedWord256 e = Word256 - <$> hexFixedWord64 e - <*> hexFixedWord64 e - <*> hexFixedWord64 e - <*> hexFixedWord64 e +hexFixedWord256 e = + Word256 + <$> hexFixedWord64 e + <*> hexFixedWord64 e + <*> hexFixedWord64 e + <*> hexFixedWord64 e hexFixedWord64# :: e -> Parser e s Word# -{-# noinline hexFixedWord64# #-} -hexFixedWord64# e = uneffectfulWord# $ \chunk -> if length chunk >= 16 - then - let go !off !len !acc = case len of - 0 -> case acc of - W# r -> - (# | (# r - , unI off - , unI (length chunk) -# 16# #) #) - _ -> case oneHexMaybe (PM.indexByteArray (array chunk) off) of - Nothing -> (# e | #) - Just w -> go (off + 1) (len - 1) ((acc * 16) + w) - in go (offset chunk) (16 :: Int) (0 :: Word) - else (# e | #) +{-# NOINLINE hexFixedWord64# #-} +hexFixedWord64# e = uneffectfulWord# $ \chunk -> + if length chunk >= 16 + then + let go !off !len !acc = case len of + 0 -> case acc of + W# r -> + (# + | (# + r + , unI off + , unI (length chunk) -# 16# + #) + #) + _ -> case oneHexMaybe (PM.indexByteArray (array chunk) off) of + Nothing -> (# e | #) + Just w -> go (off + 1) (len - 1) ((acc * 16) + w) + in go (offset chunk) (16 :: Int) (0 :: Word) + else (# e | #) +{- FOURMOLU_DISABLE -} -- | Parse exactly four ASCII-encoded characters, interpreting -- them as the hexadecimal encoding of a 16-bit number. Note that -- this rejects a sequence such as @5A9@, requiring @05A9@ instead. @@ -1138,78 +1339,92 @@ hexFixedWord8 e = Parser #endif a), b, c #) #) #) ) +{- FOURMOLU_ENABLE -} hexFixedWord8# :: e -> Parser e s Word# -{-# noinline hexFixedWord8# #-} -hexFixedWord8# e = uneffectfulWord# $ \chunk -> if length chunk >= 2 - then - let !w0@(W# n0) = oneHex $ PM.indexByteArray (array chunk) (offset chunk) - !w1@(W# n1) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 1) - in if | w0 .|. w1 /= maxBound -> - (# | - (# (n0 `Exts.timesWord#` 16##) `Exts.plusWord#` - n1 - , unI (offset chunk) +# 2# - , unI (length chunk) -# 2# #) #) - | otherwise -> (# e | #) - else (# e | #) +{-# NOINLINE hexFixedWord8# #-} +hexFixedWord8# e = uneffectfulWord# $ \chunk -> + if length chunk >= 2 + then + let !w0@(W# n0) = oneHex $ PM.indexByteArray (array chunk) (offset chunk) + !w1@(W# n1) = oneHex $ PM.indexByteArray (array chunk) (offset chunk + 1) + in if + | w0 .|. w1 /= maxBound -> + (# + | (# + (n0 `Exts.timesWord#` 16##) + `Exts.plusWord#` n1 + , unI (offset chunk) +# 2# + , unI (length chunk) -# 2# + #) + #) + | otherwise -> (# e | #) + else (# e | #) --- | Consume a single character that is the lowercase hexadecimal --- encoding of a 4-bit word. Fails if the character is not in the class --- @[a-f0-9]@. +{- | Consume a single character that is the lowercase hexadecimal +encoding of a 4-bit word. Fails if the character is not in the class +@[a-f0-9]@. +-} hexNibbleLower :: e -> Parser e s Word hexNibbleLower e = uneffectful $ \chunk -> case length chunk of 0 -> Failure e _ -> - let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 in - if | w >= 48 && w < 58 -> Success (fromIntegral w - 48) (offset chunk + 1) (length chunk - 1) - | w >= 97 && w < 103 -> Success (fromIntegral w - 87) (offset chunk + 1) (length chunk - 1) - | otherwise -> Failure e - --- | Consume a single character that is the case-insensitive hexadecimal --- encoding of a 4-bit word. Fails if the character is not in the class --- @[a-fA-F0-9]@. + let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 + in if + | w >= 48 && w < 58 -> Success (fromIntegral w - 48) (offset chunk + 1) (length chunk - 1) + | w >= 97 && w < 103 -> Success (fromIntegral w - 87) (offset chunk + 1) (length chunk - 1) + | otherwise -> Failure e + +{- | Consume a single character that is the case-insensitive hexadecimal +encoding of a 4-bit word. Fails if the character is not in the class +@[a-fA-F0-9]@. +-} hexNibble :: e -> Parser e s Word hexNibble e = uneffectful $ \chunk -> case length chunk of 0 -> Failure e _ -> - let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 in - if | w >= 48 && w < 58 -> Success (fromIntegral w - 48) (offset chunk + 1) (length chunk - 1) - | w >= 65 && w < 71 -> Success (fromIntegral w - 55) (offset chunk + 1) (length chunk - 1) - | w >= 97 && w < 103 -> Success (fromIntegral w - 87) (offset chunk + 1) (length chunk - 1) - | otherwise -> Failure e - --- | Consume a single character that is the lowercase hexadecimal --- encoding of a 4-bit word. Returns @Nothing@ without consuming --- the character if it is not in the class @[a-f0-9]@. The parser --- never fails. + let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 + in if + | w >= 48 && w < 58 -> Success (fromIntegral w - 48) (offset chunk + 1) (length chunk - 1) + | w >= 65 && w < 71 -> Success (fromIntegral w - 55) (offset chunk + 1) (length chunk - 1) + | w >= 97 && w < 103 -> Success (fromIntegral w - 87) (offset chunk + 1) (length chunk - 1) + | otherwise -> Failure e + +{- | Consume a single character that is the lowercase hexadecimal +encoding of a 4-bit word. Returns @Nothing@ without consuming +the character if it is not in the class @[a-f0-9]@. The parser +never fails. +-} tryHexNibbleLower :: Parser e s (Maybe Word) tryHexNibbleLower = unfailing $ \chunk -> case length chunk of 0 -> InternalStep Nothing (offset chunk) (length chunk) _ -> - let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 in - if | w >= 48 && w < 58 -> InternalStep (Just (fromIntegral w - 48)) (offset chunk + 1) (length chunk - 1) - | w >= 97 && w < 103 -> InternalStep (Just (fromIntegral w - 87)) (offset chunk + 1) (length chunk - 1) - | otherwise -> InternalStep Nothing (offset chunk) (length chunk) - --- | Consume a single character that is the case-insensitive hexadecimal --- encoding of a 4-bit word. Returns @Nothing@ without consuming --- the character if it is not in the class @[a-fA-F0-9]@. This parser --- never fails. + let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 + in if + | w >= 48 && w < 58 -> InternalStep (Just (fromIntegral w - 48)) (offset chunk + 1) (length chunk - 1) + | w >= 97 && w < 103 -> InternalStep (Just (fromIntegral w - 87)) (offset chunk + 1) (length chunk - 1) + | otherwise -> InternalStep Nothing (offset chunk) (length chunk) + +{- | Consume a single character that is the case-insensitive hexadecimal +encoding of a 4-bit word. Returns @Nothing@ without consuming +the character if it is not in the class @[a-fA-F0-9]@. This parser +never fails. +-} tryHexNibble :: Parser e s (Maybe Word) tryHexNibble = unfailing $ \chunk -> case length chunk of 0 -> InternalStep Nothing (offset chunk) (length chunk) _ -> - let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 in - if | w >= 48 && w < 58 -> InternalStep (Just (fromIntegral w - 48)) (offset chunk + 1) (length chunk - 1) - | w >= 65 && w < 71 -> InternalStep (Just (fromIntegral w - 55)) (offset chunk + 1) (length chunk - 1) - | w >= 97 && w < 103 -> InternalStep (Just (fromIntegral w - 87)) (offset chunk + 1) (length chunk - 1) - | otherwise -> InternalStep Nothing (offset chunk) (length chunk) + let w = PM.indexByteArray (array chunk) (offset chunk) :: Word8 + in if + | w >= 48 && w < 58 -> InternalStep (Just (fromIntegral w - 48)) (offset chunk + 1) (length chunk - 1) + | w >= 65 && w < 71 -> InternalStep (Just (fromIntegral w - 55)) (offset chunk + 1) (length chunk - 1) + | w >= 97 && w < 103 -> InternalStep (Just (fromIntegral w - 87)) (offset chunk + 1) (length chunk - 1) + | otherwise -> InternalStep Nothing (offset chunk) (length chunk) -- Returns the maximum machine word if the argument is not -- the ASCII encoding of a hexadecimal digit. oneHex :: Word8 -> Word -{-# inline oneHex #-} +{-# INLINE oneHex #-} oneHex w | w >= 48 && w < 58 = (fromIntegral w - 48) | w >= 65 && w < 71 = (fromIntegral w - 55) @@ -1217,7 +1432,7 @@ oneHex w | otherwise = maxBound oneHexMaybe :: Word8 -> Maybe Word -{-# inline oneHexMaybe #-} +{-# INLINE oneHexMaybe #-} oneHexMaybe w | w >= 48 && w < 58 = Just (fromIntegral w - 48) | w >= 65 && w < 71 = Just (fromIntegral w - 55) @@ -1225,78 +1440,85 @@ oneHexMaybe w | otherwise = Nothing uneffectfulWord# :: (Bytes -> Result# e Word#) -> Parser e s Word# -{-# inline uneffectfulWord# #-} -uneffectfulWord# f = Parser - ( \b s0 -> (# s0, (f (boxBytes b)) #) ) +{-# INLINE uneffectfulWord# #-} +uneffectfulWord# f = + Parser + (\b s0 -> (# s0, (f (boxBytes b)) #)) -- Precondition: the arguments are non-negative. Boolean is -- true when overflow happens. Performs: a * 10 + b -- Postcondition: when overflow is false, the resulting -- word is less than or equal to the upper bound -positivePushBase10 :: Word -> Word -> Word -> (Bool,Word) -{-# inline positivePushBase10 #-} +positivePushBase10 :: Word -> Word -> Word -> (Bool, Word) +{-# INLINE positivePushBase10 #-} positivePushBase10 (W# a) (W# b) (W# upper) = let !(# ca, r0 #) = Exts.timesWord2# a 10## !r1 = Exts.plusWord# r0 b !cb = int2Word# (gtWord# r1 upper) !cc = int2Word# (ltWord# r1 0##) !c = ca `or#` cb `or#` cc - in (case c of { 0## -> False; _ -> True }, W# r1) + in (case c of 0## -> False; _ -> True, W# r1) -unsignedPushBase10 :: Word -> Word -> (Bool,Word) -{-# inline unsignedPushBase10 #-} +unsignedPushBase10 :: Word -> Word -> (Bool, Word) +{-# INLINE unsignedPushBase10 #-} unsignedPushBase10 (W# a) (W# b) = let !(# ca, r0 #) = Exts.timesWord2# a 10## !r1 = Exts.plusWord# r0 b !cb = int2Word# (ltWord# r1 r0) !c = ca `or#` cb - in (case c of { 0## -> False; _ -> True }, W# r1) + in (case c of 0## -> False; _ -> True, W# r1) -- | Skip while the predicate is matched. This is always inlined. skipWhile :: (Char -> Bool) -> Parser e s () -{-# inline skipWhile #-} -skipWhile f = go where - go = isEndOfInput >>= \case - True -> pure () - False -> do - w <- anyUnsafe - if f w - then go - else unconsume 1 +{-# INLINE skipWhile #-} +skipWhile f = go + where + go = + isEndOfInput >>= \case + True -> pure () + False -> do + w <- anyUnsafe + if f w + then go + else unconsume 1 -- Interpret the next byte as an Latin1-encoded character. -- Does not check to see if any characters are left. This -- is not exported. anyUnsafe :: Parser e s Char -{-# inline anyUnsafe #-} +{-# INLINE anyUnsafe #-} anyUnsafe = uneffectful $ \chunk -> let w = indexCharArray (array chunk) (offset chunk) :: Char in Success w (offset chunk + 1) (length chunk - 1) -- Reads one byte and interprets it as Latin1-encoded character. indexCharArray :: PM.ByteArray -> Int -> Char -{-# inline indexCharArray #-} +{-# INLINE indexCharArray #-} indexCharArray (PM.ByteArray x) (I# i) = C# (indexCharArray# x i) --- | Match any character, to perform lookahead. Returns 'Nothing' if --- end of input has been reached. Does not consume any input. --- --- /Note/: Because this parser does not fail, do not use it --- with combinators such as 'many', because such as 'many', --- because such parsers loop until a failure occurs. Careless --- use will thus result in an infinite loop. +{- | Match any character, to perform lookahead. Returns 'Nothing' if + end of input has been reached. Does not consume any input. + + /Note/: Because this parser does not fail, do not use it + with combinators such as 'many', because such as 'many', + because such parsers loop until a failure occurs. Careless + use will thus result in an infinite loop. +-} peek :: Parser e s (Maybe Char) -{-# inline peek #-} +{-# INLINE peek #-} peek = uneffectful $ \(Bytes arr off len) -> - let v = if len > 0 - then Just (indexCharArray arr off) - else Nothing - in Success v off len - --- | Match any byte, to perform lookahead. Does not consume any --- input, but will fail if end of input has been reached. + let v = + if len > 0 + then Just (indexCharArray arr off) + else Nothing + in Success v off len + +{- | Match any byte, to perform lookahead. Does not consume any + input, but will fail if end of input has been reached. +-} peek' :: e -> Parser e s Char -{-# inline peek' #-} -peek' e = uneffectful $ \(Bytes arr off len) -> if len > 0 - then Success (indexCharArray arr off) off len - else Failure e +{-# INLINE peek' #-} +peek' e = uneffectful $ \(Bytes arr off len) -> + if len > 0 + then Success (indexCharArray arr off) off len + else Failure e diff --git a/src/Data/Bytes/Parser/Utf8.hs b/src/Data/Bytes/Parser/Utf8.hs index d182636..2e35dee 100644 --- a/src/Data/Bytes/Parser/Utf8.hs +++ b/src/Data/Bytes/Parser/Utf8.hs @@ -1,32 +1,33 @@ -{-# language BangPatterns #-} -{-# language BinaryLiterals #-} -{-# language DataKinds #-} -{-# language DerivingStrategies #-} -{-# language GADTSyntax #-} -{-# language MagicHash #-} -{-# language MultiWayIf #-} -{-# language PolyKinds #-} -{-# language RankNTypes #-} -{-# language ScopedTypeVariables #-} -{-# language TypeApplications #-} -{-# language UnboxedTuples #-} -{-# language CPP #-} - --- | Parse input as UTF-8-encoded text. Parsers in this module will --- fail if they encounter a byte above @0x7F@. +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BinaryLiterals #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GADTSyntax #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | Parse input as UTF-8-encoded text. Parsers in this module will +fail if they encounter a byte above @0x7F@. +-} module Data.Bytes.Parser.Utf8 ( -- * Get Character any# , shortText ) where -import Prelude hiding (length,any,fail,takeWhile) +import Prelude hiding (any, fail, length, takeWhile) -import Data.Bits ((.&.),(.|.),unsafeShiftL,xor) -import Data.Bytes.Parser.Internal (Parser(..)) +import Data.Bits (unsafeShiftL, xor, (.&.), (.|.)) +import Data.Bytes.Parser.Internal (Parser (..)) import Data.Text.Short (ShortText) -import GHC.Exts (Int(I#),Char(C#),Int#,Char#,(-#),(+#),(>#),chr#) -import GHC.Word (Word8(W8#)) +import GHC.Exts (Char (C#), Char#, Int (I#), Int#, chr#, (+#), (-#), (>#)) +import GHC.Word (Word8 (W8#)) import qualified Data.ByteString.Short.Internal as BSS import qualified Data.Bytes.Parser as Parser @@ -34,6 +35,7 @@ import qualified Data.Primitive as PM import qualified Data.Text.Short as TS import qualified GHC.Exts as Exts +{- FOURMOLU_DISABLE -} -- | Interpret the next one to four bytes as a UTF-8-encoded character. -- Fails if the decoded codepoint is in the range U+D800 through U+DFFF. any# :: e -> Parser e s Char# @@ -76,39 +78,46 @@ any# e = Parser | otherwise -> (# s0, (# e | #) #) _ -> (# s0, (# e | #) #) ) +{- FOURMOLU_ENABLE -} codepointFromFourBytes :: Word8 -> Word8 -> Word8 -> Word8 -> Char -codepointFromFourBytes w1 w2 w3 w4 = C# - ( chr# - ( unI $ fromIntegral - ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 18 .|. - unsafeShiftL (word8ToWord w2 .&. 0b00111111) 12 .|. - unsafeShiftL (word8ToWord w3 .&. 0b00111111) 6 .|. - (word8ToWord w4 .&. 0b00111111) - ) +codepointFromFourBytes w1 w2 w3 w4 = + C# + ( chr# + ( unI $ + fromIntegral + ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 18 + .|. unsafeShiftL (word8ToWord w2 .&. 0b00111111) 12 + .|. unsafeShiftL (word8ToWord w3 .&. 0b00111111) 6 + .|. (word8ToWord w4 .&. 0b00111111) + ) + ) ) - ) codepointFromThreeBytes :: Word8 -> Word8 -> Word8 -> Char -codepointFromThreeBytes w1 w2 w3 = C# - ( chr# - ( unI $ fromIntegral - ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 12 .|. - unsafeShiftL (word8ToWord w2 .&. 0b00111111) 6 .|. - (word8ToWord w3 .&. 0b00111111) - ) +codepointFromThreeBytes w1 w2 w3 = + C# + ( chr# + ( unI $ + fromIntegral + ( unsafeShiftL (word8ToWord w1 .&. 0b00001111) 12 + .|. unsafeShiftL (word8ToWord w2 .&. 0b00111111) 6 + .|. (word8ToWord w3 .&. 0b00111111) + ) + ) ) - ) codepointFromTwoBytes :: Word8 -> Word8 -> Char -codepointFromTwoBytes w1 w2 = C# - ( chr# - ( unI $ fromIntegral @Word @Int - ( unsafeShiftL (word8ToWord w1 .&. 0b00011111) 6 .|. - (word8ToWord w2 .&. 0b00111111) - ) +codepointFromTwoBytes w1 w2 = + C# + ( chr# + ( unI $ + fromIntegral @Word @Int + ( unsafeShiftL (word8ToWord w1 .&. 0b00011111) 6 + .|. (word8ToWord w2 .&. 0b00111111) + ) + ) ) - ) oneByteChar :: Word8 -> Bool oneByteChar !w = w .&. 0b10000000 == 0 @@ -131,13 +140,16 @@ word8ToWord = fromIntegral unI :: Int -> Int# unI (I# w) = w --- | Consume input that matches the argument. Fails if the --- input does not match. +{- | Consume input that matches the argument. Fails if the +input does not match. +-} shortText :: e -> ShortText -> Parser e s () -shortText e !t = Parser.byteArray e - (shortByteStringToByteArray (TS.toShortByteString t)) +shortText e !t = + Parser.byteArray + e + (shortByteStringToByteArray (TS.toShortByteString t)) shortByteStringToByteArray :: - BSS.ShortByteString - -> PM.ByteArray + BSS.ShortByteString -> + PM.ByteArray shortByteStringToByteArray (BSS.SBS x) = PM.ByteArray x