Dhall-haskell: Improve parser performance

Created on 26 Aug 2017  Â·  24Comments  Â·  Source: dhall-lang/dhall-haskell

Parsing is a huge performance bottleneck for the dhall compiler right now

For example, the following ~4 KB file about 2 seconds the dhall executable to process:

λ(xs : List { cores : Natural, host : Text, key : Text, mandatoryFeatures : 
List Text, platforms : List < AArch64_Linux : {} | ARMv5tel_Linux : {} | 
ARMv7l_Linux : {} | I686_Cygwin : {} | I686_Linux : {} | MIPS64el_Linux : {} | 
PowerPC_Linux : {} | X86_64_Cygwin : {} | X86_64_Darwin : {} | X86_64_FreeBSD : 
{} | X86_64_Linux : {} | X86_64_Solaris : {} >, speedFactor : Natural, 
supportedFeatures : List Text, user : Optional Text }) → List/fold { cores : 
Natural, host : Text, key : Text, mandatoryFeatures : List Text, platforms : 
List < AArch64_Linux : {} | ARMv5tel_Linux : {} | ARMv7l_Linux : {} | 
I686_Cygwin : {} | I686_Linux : {} | MIPS64el_Linux : {} | PowerPC_Linux : {} | 
X86_64_Cygwin : {} | X86_64_Darwin : {} | X86_64_FreeBSD : {} | X86_64_Linux : 
{} | X86_64_Solaris : {} >, speedFactor : Natural, supportedFeatures : List 
Text, user : Optional Text } xs Text (λ(x : { cores : Natural, host : Text, 
key : Text, mandatoryFeatures : List Text, platforms : List < AArch64_Linux : 
{} | ARMv5tel_Linux : {} | ARMv7l_Linux : {} | I686_Cygwin : {} | I686_Linux : 
{} | MIPS64el_Linux : {} | PowerPC_Linux : {} | X86_64_Cygwin : {} | 
X86_64_Darwin : {} | X86_64_FreeBSD : {} | X86_64_Linux : {} | X86_64_Solaris : 
{} >, speedFactor : Natural, supportedFeatures : List Text, user : Optional 
Text }) → λ(y : Text) → (Optional/fold Text x.user Text (λ(user : Text) 
→ user ++ "@" ++ x.host ++ "") x.host ++ " " ++ (merge { Empty = λ(_ : {}) 
→ "", NonEmpty = λ(result : Text) → result } (List/fold < AArch64_Linux : 
{} | ARMv5tel_Linux : {} | ARMv7l_Linux : {} | I686_Cygwin : {} | I686_Linux : 
{} | MIPS64el_Linux : {} | PowerPC_Linux : {} | X86_64_Cygwin : {} | 
X86_64_Darwin : {} | X86_64_FreeBSD : {} | X86_64_Linux : {} | X86_64_Solaris : 
{} > x.platforms < Empty : {} | NonEmpty : Text > (λ(element : < AArch64_Linux 
: {} | ARMv5tel_Linux : {} | ARMv7l_Linux : {} | I686_Cygwin : {} | I686_Linux 
: {} | MIPS64el_Linux : {} | PowerPC_Linux : {} | X86_64_Cygwin : {} | 
X86_64_Darwin : {} | X86_64_FreeBSD : {} | X86_64_Linux : {} | X86_64_Solaris : 
{} >) → λ(status : < Empty : {} | NonEmpty : Text >) → merge { Empty = 
λ(_ : {}) → < NonEmpty = merge { AArch64_Linux = λ(_ : {}) → 
"aarch64-linux", ARMv5tel_Linux = λ(_ : {}) → "armv5tel-linux", ARMv7l_Linux 
= λ(_ : {}) → "armv7l-linux", I686_Cygwin = λ(_ : {}) → "i686-cygwin", 
I686_Linux = λ(_ : {}) → "i686-linux", MIPS64el_Linux = λ(_ : {}) → 
"mips64el-linux", PowerPC_Linux = λ(_ : {}) → "powerpc-linux", X86_64_Cygwin 
= λ(_ : {}) → "x86_64-cygwin", X86_64_Darwin = λ(_ : {}) → 
"x86_64-darwin", X86_64_FreeBSD = λ(_ : {}) → "x86_64-freebsd", X86_64_Linux 
= λ(_ : {}) → "x86_64-linux", X86_64_Solaris = λ(_ : {}) → 
"x86_64-solaris" } element | Empty : {} >, NonEmpty = λ(result : Text) → < 
NonEmpty = (merge { AArch64_Linux = λ(_ : {}) → "aarch64-linux", 
ARMv5tel_Linux = λ(_ : {}) → "armv5tel-linux", ARMv7l_Linux = λ(_ : {}) → 
"armv7l-linux", I686_Cygwin = λ(_ : {}) → "i686-cygwin", I686_Linux = λ(_ : 
{}) → "i686-linux", MIPS64el_Linux = λ(_ : {}) → "mips64el-linux", 
PowerPC_Linux = λ(_ : {}) → "powerpc-linux", X86_64_Cygwin = λ(_ : {}) → 
"x86_64-cygwin", X86_64_Darwin = λ(_ : {}) → "x86_64-darwin", X86_64_FreeBSD 
= λ(_ : {}) → "x86_64-freebsd", X86_64_Linux = λ(_ : {}) → 
"x86_64-linux", X86_64_Solaris = λ(_ : {}) → "x86_64-solaris" } element) ++ 
"," ++ result | Empty : {} > } status : < Empty : {} | NonEmpty : Text >) < 
Empty = {=} | NonEmpty : Text >) : Text) ++ " " ++ x.key ++ " " ++ Integer/show 
(Natural/toInteger x.cores) ++ " " ++ Integer/show (Natural/toInteger 
x.speedFactor) ++ " " ++ (merge { Empty = λ(_ : {}) → "", NonEmpty = 
λ(result : Text) → result } (List/fold Text x.supportedFeatures < Empty : {} 
| NonEmpty : Text > (λ(element : Text) → λ(status : < Empty : {} | NonEmpty 
: Text >) → merge { Empty = λ(_ : {}) → < NonEmpty = element | Empty : {} 
>, NonEmpty = λ(result : Text) → < NonEmpty = element ++ "," ++ result | 
Empty : {} > } status : < Empty : {} | NonEmpty : Text >) < Empty = {=} | 
NonEmpty : Text >) : Text) ++ " " ++ (merge { Empty = λ(_ : {}) → "", 
NonEmpty = λ(result : Text) → result } (List/fold Text x.mandatoryFeatures < 
Empty : {} | NonEmpty : Text > (λ(element : Text) → λ(status : < Empty : {} 
| NonEmpty : Text >) → merge { Empty = λ(_ : {}) → < NonEmpty = element | 
Empty : {} >, NonEmpty = λ(result : Text) → < NonEmpty = element ++ "," ++ 
result | Empty : {} > } status : < Empty : {} | NonEmpty : Text >) < Empty = 
{=} | NonEmpty : Text >) : Text) ++ "\n") ++ y) ""

When I benchmark the dhall program running on the above file, all of the bottlenecks are parsing-related:

    Sat Aug 26 11:42 2017 Time and Allocation Profiling Report  (Final)

       dhall +RTS -p -RTS

    total time  =        6.90 secs   (6897 ticks @ 1000 us, 1 processor)
    total alloc = 3,784,992,880 bytes  (excludes profiling overheads)

COST CENTRE            MODULE                  SRC                                              %time %alloc

noted                  Dhall.Parser            src/Dhall/Parser.hs:(120,1)-(124,44)              23.6   20.8
token                  Dhall.Parser            src/Dhall/Parser.hs:105:5-57                      21.7   21.9
buildSomeSpaceParser   Text.Parser.Token.Style src/Text/Parser/Token/Style.hs:(110,1)-(132,26)    5.4    5.4
reserve                Dhall.Parser            src/Dhall/Parser.hs:(141,1)-(143,13)               5.3    5.2
exprC.chain            Dhall.Parser            src/Dhall/Parser.hs:(485,5)-(487,79)               5.0    4.5
symbol                 Dhall.Parser            src/Dhall/Parser.hs:(146,1)-(148,13)               4.1    4.3
uncons                 Data.ByteString.UTF8    Data/ByteString/UTF8.hs:(166,1)-(167,38)           3.5    5.6
exprF                  Dhall.Parser            src/Dhall/Parser.hs:(527,1)-(725,16)               2.2    1.7
exprF.exprParseDouble  Dhall.Parser            src/Dhall/Parser.hs:(698,5)-(703,35)               2.0    1.4
decode                 Data.ByteString.UTF8    Data/ByteString/UTF8.hs:(68,1)-(124,32)            1.9    6.6
manyAccum              Text.Trifecta.Parser    src/Text/Trifecta/Parser.hs:(183,1)-(187,47)       1.7    1.0

I know that we can do better because that parsing speed is two bytes per millisecond, which is definitely too slow

performance

Most helpful comment

Also, just in case people are curious, here is the new heap profile:

screen shot 2018-09-27 at 11 11 38 am

It's still roughly the same maximum residency, but now compressed in time

Also, the new profiling indicates that parsing is no longer the overwhelming bottleneck. We now see pretty-printing showing up in the profile from a dhall format invocation:

········Thu Sep 27 11:13 2018 Time and Allocation Profiling Report  (Final)

········   dhall +RTS -p -RTS format

········total time  =        0.05 secs   (46 ticks @ 1000 us, 1 processor)
········total alloc =  45,344,376 bytes  (excludes profiling overheads)

COST CENTRE                                  MODULE                                             SRC                                                                          %time %alloc

>>=                                          Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:95:5-48                                       43.5    8.6
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(191,29)-(194,23)   13.0    1.8
<|>                                          Text.Megaparsec.Internal                           Text/Megaparsec/Internal.hs:195:3-16                                           4.3   10.5
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(195,36)-(200,23)    4.3    2.9
fmap                                         Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:69:5-41                                        4.3    6.1
pure                                         Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:76:5-24                                        4.3    4.8
getParserState                               Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:156:5-58                                       4.3    3.7
take1_                                       Text.Megaparsec.Stream                             Text/Megaparsec/Stream.hs:262:3-19                                             2.2    3.6
*>                                           Text.Megaparsec.Internal                           Text/Megaparsec/Internal.hs:172:3-32                                           2.2    3.2
mplus                                        Options.Applicative.Internal                       Options/Applicative/Internal.hs:(218,3)-(222,52)                               2.2    0.0
layoutWadlerLeijen.best                      Data.Text.Prettyprint.Doc.Internal                 src/Data/Text/Prettyprint/Doc/Internal.hs:(1707,5)-(1724,76)                   2.2    2.6
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(188,31)-(190,23)    2.2    1.0
reservedChar                                 Dhall.Parser.Token                                 src/Dhall/Parser/Token.hs:498:1-60                                             2.2    0.0
token                                        Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:146:5-50                                       2.2    2.1
label                                        Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:130:5-59                                       2.2    5.0

All 24 comments

This issue caught my eye. Having this year made like five language parsers for different projects, and recently made a parser that parses a 10MB file of MS SQL procedures in 9 seconds (so about 1MB/s) with Parsec, I think I can share some ideas:

  • Personally, I write all my lexers/tokenizers in Attoparsec. You probably won't want to do this at this stage because you've already written your code, but it does help for things like e.g. tokenizing a string. In your parser you're using combinators like this which would be great in a PEG parser but in Parsec are slow.
  • In Attoparsec, you can write a string tokenizer as a scan (and runScanner below it), even complicated strings with interpolation. Using attoparsec I tokenized the 10MB file in 2 seconds. Admittedly I parsed it as ByteString because all keyword tokens are ASCII. Here is my comment lexer not particularly clearly written, but it demonstrates the point.
  • In Duet my parser doesn't actually use any try. In this case I wasn't aiming for speed or anything (this is the one lexer I just wrote up in Parsec for easy access to location information), it just turned out that way. Its error messages are (in my opinion) very good partly due to this. So I think it's definitely practical to write the parser without any try.

I'm probably a bit out of date because I don't use Trifecta or Megaparsec, but in your place I'd probably just throw this parser away and write a fast one from scratch, assuming you have a nice set of parser tests built up. If I end up using Dhall I might contribute a faster parser. The Liquid Haskell group also have trouble with their parser. I think our parser combinator libraries are much stupider than people expect them to be. It's convenient up front but then you end up spending time refactoring the parser later anyway because its errors are bad or it's super slow.

@chrisdone: The fundamental problem here is that Dhall's support for string interpolation and nested block comments makes it difficult (impossible?) to write a correct lexer

However, if you would like to take a stab at it, here is the ABNF grammar for Dhall: https://github.com/dhall-lang/dhall-lang/blob/master/standard/dhall.abnf

The next thing I was likely to try was switching to megaparsec since, unlike trifecta, it supports high-efficiency bulk parsers (i.e. takeWhile) but still provides nice error messages

So I took a quick stab at implementing the bare minimum attoparsec lexer to tokenize the above example like @chrisdone suggested and it is pretty fast (about 150 ns per character) so this is pretty promising.

@Gabriel439 I saw this and thought I'd throw together a criterion benchmark suite for the parser in dhall here: https://github.com/FintanH/dhall-haskell/blob/dev/benchmark/benchmark/parser/Main.hs

For the example above I got back:

benchmarking Issue #108/108
time                 1.024 s    (867.7 ms .. 1.123 s)
                     0.997 R²   (0.992 R² .. 1.000 R²)
mean                 1.080 s    (1.072 s .. 1.086 s)
std dev              8.451 ms   (0.0 s .. 9.660 ms)
variance introduced by outliers: 19% (moderately inflated)

If this seems useful I can make a PR :)

@FintanH: Yes, please do! Also, thank you :)

Further follow up, I tried converting the Lazy Text to Strict and this gave us some improvements.

I compared these two branches for benchmarking:

And the results were:

total time  =      248.35 secs   (248347 ticks @ 1000 us, 1 processor)

vs.

total time  =      294.33 secs   (294332 ticks @ 1000 us, 1 processor)

With the text hotspot I mentioned in the comment above disappearing.

Also mentioned in #415 that we had a non-terminating (i.e. was running for more than 45minutes). This now terminates using strict Text.

One idea we discussed at ZuriHac was to take advantage of the generic parsers interface to reuse the same parsing logic for both attoparsec and megaparsec. Then we could improve performance by first trying to use attoparsec to parse the program on the fast path and if that failed then falling back on using megaparsec.

Some work that's been already done on the attoparsec front is done here: https://github.com/FintanH/dhall-haskell/tree/fintan/attoparsec

So I've recently been working on minimizing the original pathological example to a much smaller example which eliminates a lot of conflating factors, which is this newly added benchmark:

https://github.com/dhall-lang/dhall-haskell/pull/526/files#diff-7d7876f2897ca672302d9b91c78a786dR52

If you just try to parse a file with x applied to itself a large number of times (i.e. x x x x x x x ...) you take a significant hit to performance. Specifically it parses 20,000 characters / 141.6 ms or ~ 1 character / 7 microseconds. It's not quite as slow as the original example (about ~1 character / 73 microseconds currently on the same machine), but it's still slow enough that the performance is not acceptable and is a very clear test case to optimize.

One of the things that I observed was that for both examples is that noted is a significant bottleneck (specifically getPosition) and setting noted the identity function almost doubles performance. So I'm going to be working on an upcoming pull request to first try to parse with noted = id and then reparse with the real noted if the parse fails for better error messages.

However, even doubling performance is still not good enough, so I'm still profiling. I still agree that using attoparsec to tokenize the input is probably going to give us the biggest wins and I just want to confirm that via profiling first once I've fixed all the low-hanging fruit.

I also spent some time trying to take @FintanH's branch and use attoparsec for the happy path and megaparsec for the slow path if parsing fails. I was only able to get a performance speedup (about 4x) when using only attoparsec, but when I made the parsers polymorphic to take advantage of both parsing libraries it killed performance again (and INLINABLE and SPECIALIZE pragmas didn't appear to help things).

Also, I want to clarify that if we do switch to attoparsec to tokenize the input that doesn't imply that we need to fix the grammar to remove nested block comments or string interpolation. We can still use attoparsec to produce a stream of tokens that is mostly linear but supports nesting when there is no alternative, in like string interpolation:

data Token = OpenParen | CloseParen | TextLit [(Text, [Token])] Text | ...

This is something that @chrisdone had already pointed out before that I just wanted to highlight

Also, here are some numbers showing the impact of setting noted = id:

Before:

benchmarking Issue #108
time                 350.2 ms   (318.8 ms .. 364.8 ms)
                     0.999 R²   (0.998 R² .. 1.000 R²)
mean                 356.6 ms   (351.8 ms .. 359.6 ms)
std dev              4.614 ms   (1.575 ms .. 6.094 ms)
variance introduced by outliers: 19% (moderately inflated)

benchmarking Large number of function arguments
time                 141.8 ms   (133.2 ms .. 148.8 ms)
                     0.996 R²   (0.985 R² .. 1.000 R²)
mean                 156.9 ms   (150.9 ms .. 170.5 ms)
std dev              12.42 ms   (5.603 ms .. 18.08 ms)
variance introduced by outliers: 14% (moderately inflated)

After:

benchmarking Issue #108
time                 201.9 ms   (200.1 ms .. 204.3 ms)
                     1.000 R²   (1.000 R² .. 1.000 R²)
mean                 201.5 ms   (200.7 ms .. 202.2 ms)
std dev              963.4 μs   (629.4 μs .. 1.333 ms)
variance introduced by outliers: 14% (moderately inflated)

benchmarking Large number of function arguments
time                 59.92 ms   (58.39 ms .. 61.13 ms)
                     0.999 R²   (0.997 R² .. 1.000 R²)
mean                 61.52 ms   (60.72 ms .. 62.30 ms)
std dev              1.497 ms   (1.230 ms .. 1.761 ms)

I am not an expert in parsers and maybe it's a no-sense, but could we make string interpolation and nested comments optional and use a faster parser (like happy?) in this case?

@jneira: I think we can get this fast enough while still using megaparsec. There are a lot of inefficient idioms in the parser code that are due to previously using parsers (which didn't support high-efficiency primitives and I'm not sure why) and then translating that code directly to megaparsec without taking advantage of the high-efficiency primitives that megaparsec does provide. You can see an example of that here where we finally used takeWhile f instead of many (satisfy f), which produced a significant performance improvement for parsing identifiers:

https://github.com/dhall-lang/dhall-haskell/commit/5571f4d783d14eb0aa3abfb2bf33d877f922adcd#diff-a2e8d90f856595d56498893dab34ed36R159

The main reason this is slow is that this just hasn't gotten enough attention yet because I've been working on completing and fixing the standard to unblock alternative implementations of Dhall (such as the Clojure implementation). I don't think we need to disable string interpolation to get the performance we need. megaparsec is actually a pretty fast library and we just aren't taking advantage of it correctly.

There's another low-hanging fruit, which is not computing source spans at all if everything succeeds (since source spans are one of the big performance bottlenecks), and then re-parsing with source spans if type-checking fails to get a correctly localized error message. That way you get fast parsing performance on the happy path at the expense of slower performance on the unhappy path, which I think people would be fine with.

Thanks for the explanation, i'll try to take a look at what you've pointed out

I recently contributed some low-hanging improvements that improve parsing of various bulk elements (i.e. string literals, whitespace, identifiers) which improve the performance alot for token-sparse code, but the parser is still not where it needs to to be for token-dense code.

Based on some performance tests when parsing a minimalist lambda calculus using both attoparsec and megaparsec, the speed limit for parsing "pathological" code (i.e. code that is token-dense) is on the order of < 0.5 microseconds / byte (or 2 MB/s) on my development laptop. Here are the two example test files I used when benchmarking the two libraries:

{-# LANGUAGE OverloadedStrings #-}

import Control.Applicative (many, (<|>))
import Data.Attoparsec.Text (Parser)
import Data.Monoid ((<>))
import Data.Text (Text)

import qualified Data.Attoparsec.Text as Attoparsec
import qualified Data.Text

input :: Text
input = "x" <> result
  where
    result = Data.Text.unfoldrN 1000000 step False

    step False = Just (' ', True )
    step True  = Just ('x', False)

data Expression
    = Var Text
    | Lam Text Expression
    | App Expression Expression
    deriving (Show)

whiteSpace :: Parser ()
whiteSpace = do
    _ <- Attoparsec.takeWhile (== ' ')

    return ()

identifier :: Parser Text
identifier = do
    bytes <- Attoparsec.takeWhile1 (== 'x')
    whiteSpace
    return bytes

lambda :: Parser ()
lambda = do
    Attoparsec.string "\\"
    whiteSpace

arrow :: Parser ()
arrow = do
    Attoparsec.string "->"
    whiteSpace

openParens :: Parser ()
openParens = do
    Attoparsec.string "("
    whiteSpace

closeParens :: Parser ()
closeParens = do
    Attoparsec.string ")"
    whiteSpace

parseExpression :: Parser Expression
parseExpression = parseLam <|> parseApplication
  where
    parseLam = do
        lambda

        x <- identifier

        arrow

        e <- parseExpression

        return (Lam x e)

parseApplication :: Parser Expression
parseApplication = parseApp <|> parseVariable
  where
    parseApp = do
        f  <- parseVariable

        xs <- many parseVariable

        return (foldl App f xs)

parseVariable :: Parser Expression
parseVariable = parseVar <|> parseParens
  where
    parseVar = do
        x <- identifier

        return (Var x)

    parseParens = do
        openParens

        e <- parseExpression

        closeParens

        return e

main :: IO ()
main = print (Attoparsec.parseOnly parseExpression input)
{-# LANGUAGE OverloadedStrings #-}

import Control.Applicative (many, (<|>))
import Data.Monoid ((<>))
import Data.Text (Text)
import Data.Void (Void)

import qualified Text.Megaparsec      as Megaparsec
import qualified Text.Megaparsec.Char as Megaparsec
import qualified Data.Char
import qualified Data.Text

type Parser = Megaparsec.Parsec Void Text

input :: Text
input = "x" <> result
  where
    result = Data.Text.unfoldrN 1000000 step False

    step False = Just (' ', True )
    step True  = Just ('x', False)

data Expression
    = Var Text
    | Lam Text Expression
    | App Expression Expression
    deriving (Show)

whiteSpace :: Parser ()
whiteSpace = do
    _ <- Megaparsec.takeWhileP Nothing Data.Char.isSpace

    return ()

identifier :: Parser Text
identifier = do
    bytes <- Megaparsec.takeWhile1P Nothing Data.Char.isAlpha
    whiteSpace
    return bytes

lambda :: Parser ()
lambda = do
    Megaparsec.string "\\"
    whiteSpace

arrow :: Parser ()
arrow = do
    Megaparsec.string "->"
    whiteSpace

openParens :: Parser ()
openParens = do
    Megaparsec.string "("
    whiteSpace

closeParens :: Parser ()
closeParens = do
    Megaparsec.string ")"
    whiteSpace

parseExpression :: Parser Expression
parseExpression = parseLam <|> parseApplication
  where
    parseLam = do
        lambda

        x <- identifier

        arrow

        e <- parseExpression

        return (Lam x e)

parseApplication :: Parser Expression
parseApplication = parseApp <|> parseVariable
  where
    parseApp = do
        f  <- parseVariable

        xs <- many parseVariable

        return (foldl App f xs)

parseVariable :: Parser Expression
parseVariable = parseVar <|> parseParens
  where
    parseVar = do
        x <- identifier

        return (Var x)

    parseParens = do
        openParens

        e <- parseExpression

        closeParens

        return e

main :: IO ()
main = Megaparsec.parseTest parseExpression input

The performance difference between the two libraries on that idealized example is small.

dhall's parser is still about 2 orders of magnitude slower than that (~50 microseconds / byte) when you set noted = id. That's 10x faster than the speed when I first opened this issue (~500 microseconds / bytes), but there's still quite a way to go for token-dense code.

Obviously, I don't expect Dhall to be as easy to parse as a minimal lambda calculus, but I think at least another order of magnitude performance improvement should be possible.

Also, here is what I think may still be contributing to the performance issues on token-dense code and how to fix them:

  • Excessive use of (<|>)/choice

    There are many cases in the code where we could significantly cut down on the amount of
    backtracking by just looking ahead one character and then caseing on that. For example:
    when parsing built-in functions we can narrow down the possibilities quite rapidly by just
    looking at one character instead of doing a test matching against the entire function name for
    every built-in.

    For people who have stared at the profiling output, I believe this change would make the large
    amount of work attributed both to (<|>) and reserved go down

  • Inefficient algorithm for parsing operators and their precedence

    This is the whole makeOperatorExpression section of the parser. Every time we parse a token
    we have to traverse that entire tree of parsers dedicated to operators, which is inefficient. There's
    probably a better way to parse all of the operators while still respecting their precedence.

    This may dovetail with the previous point: we can often determine which operator (if any) that
    we're looking at by just looking ahead by one character

  • Parametrizing every parser on embedded which may be interfering with optimization

    I may end up dropping support for customizable parsing of the Embed constructor if I can
    produce evidence that this is harming performance by interfering with specialization and/or
    inlining.

Also, anything we can do to optimize the performance of the whitespace parser on certain
common paths will speed up token-dense code since that sort of code is punctuated heavily by
whitespace.

I've just run into the most bizarre result. I was trying to test the hypothesis that matching against all the possible built-in functions was slowing down parsing performance. However, when I completely removed support for parsing built-ins and just treated them as ordinary non-reserved identifiers the parser got 3x slower. Here's the diff:

https://github.com/dhall-lang/dhall-haskell/compare/gabriel/no_builtin

This result makes no sense to me. My intuition was that the parser should under all possible cases be doing less work, but there is a very consistent 3x slowdown when running:

$ dhall format < './benchmark/examples/issue108.dhall'

The time to run increases from ~350 ms to ~1.1 s when you apply the above diff.

I feel like this weird result might be a clue to why the parsing performance is so slow if I could only understand why.

So I did both runtime profiling and heap profiling before and after the change and the most notable difference is that removing support for parsing builtins leads to 4x the heap usage (5 MB heap size before and 20 MB after the change)

Before:

screen shot 2018-09-25 at 6 49 30 pm

After:

screen shot 2018-09-25 at 6 50 00 pm

Given that the original file is only about 4 KB, this heap usage seems excessive

This might also partially explain why parsing all built-ins as identifiers slowed things down, because instead of being converted to argument-free Expr constructors they are being converted to Var storing (possibly lazy) Text and Integers. It's still doesn't entirely explain the giant growth in the heap, though since the file only has about 882 tokens, so even if every single token were encoded as a Var instead of a built-in that would imply an additional ~15 KB of memory being used per token, which still doesn't make sense.

Also, one thing that both profiles point to is that most of the memory is being held onto by the makeOperatorExpression function.

https://github.com/dhall-lang/dhall-haskell/pull/602 improves the heap utilization dramatically. Here's the new heap profile:

screen shot 2018-09-26 at 5 36 06 pm

... and the corresponding performance gain is modest (~18%)

I discovered that parentheses absolutely KILL performance. Every pair of parentheses doubles the cost of parsing the expression in its interior. For example, this tiny expression:

((((((((((((((((((((x))))))))))))))))))))

... takes over a minute to format. I'm still investigating why this occurs.

I found the root cause of that. It's due to excessive backtracking induced by the try used when parsing function types here:

https://github.com/dhall-lang/dhall-haskell/blob/master/src/Dhall/Parser/Expression.hs#L106

If I comment out that alternative the parsing performance on this example (which doesn't use any function types) improves by 100x, which brings us exactly to the speed limit for parsing token-dense code. So that suggests that all remaining effort should be focused on finding a way to avoid having to backtrack there

The fix is up here: https://github.com/dhall-lang/dhall-haskell/pull/606

The original file now takes only 5 ms to parse after the fix

Also, just in case people are curious, here is the new heap profile:

screen shot 2018-09-27 at 11 11 38 am

It's still roughly the same maximum residency, but now compressed in time

Also, the new profiling indicates that parsing is no longer the overwhelming bottleneck. We now see pretty-printing showing up in the profile from a dhall format invocation:

········Thu Sep 27 11:13 2018 Time and Allocation Profiling Report  (Final)

········   dhall +RTS -p -RTS format

········total time  =        0.05 secs   (46 ticks @ 1000 us, 1 processor)
········total alloc =  45,344,376 bytes  (excludes profiling overheads)

COST CENTRE                                  MODULE                                             SRC                                                                          %time %alloc

>>=                                          Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:95:5-48                                       43.5    8.6
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(191,29)-(194,23)   13.0    1.8
<|>                                          Text.Megaparsec.Internal                           Text/Megaparsec/Internal.hs:195:3-16                                           4.3   10.5
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(195,36)-(200,23)    4.3    2.9
fmap                                         Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:69:5-41                                        4.3    6.1
pure                                         Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:76:5-24                                        4.3    4.8
getParserState                               Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:156:5-58                                       4.3    3.7
take1_                                       Text.Megaparsec.Stream                             Text/Megaparsec/Stream.hs:262:3-19                                             2.2    3.6
*>                                           Text.Megaparsec.Internal                           Text/Megaparsec/Internal.hs:172:3-32                                           2.2    3.2
mplus                                        Options.Applicative.Internal                       Options/Applicative/Internal.hs:(218,3)-(222,52)                               2.2    0.0
layoutWadlerLeijen.best                      Data.Text.Prettyprint.Doc.Internal                 src/Data/Text/Prettyprint/Doc/Internal.hs:(1707,5)-(1724,76)                   2.2    2.6
renderIO.go.\                                Data.Text.Prettyprint.Doc.Render.Terminal.Internal src/Data/Text/Prettyprint/Doc/Render/Terminal/Internal.hs:(188,31)-(190,23)    2.2    1.0
reservedChar                                 Dhall.Parser.Token                                 src/Dhall/Parser/Token.hs:498:1-60                                             2.2    0.0
token                                        Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:146:5-50                                       2.2    2.1
label                                        Dhall.Parser.Combinators                           src/Dhall/Parser/Combinators.hs:130:5-59                                       2.2    5.0
Was this page helpful?
0 / 5 - 0 ratings

Related issues

mmhat picture mmhat  Â·  23Comments

jneira picture jneira  Â·  17Comments

ocharles picture ocharles  Â·  20Comments

quasicomputational picture quasicomputational  Â·  19Comments

sjakobi picture sjakobi  Â·  18Comments