Moving parsing preprocessing into ghc-parser

bfb0af88 · Andrew Gibiansky · 806aa5cf · bfb0af88 · bfb0af88 · bfb0af88
Commit bfb0af88 authored Feb 07, 2014 by Andrew Gibiansky
5 changed files
--- a/ghc-parser/Language/Haskell/GHC/Parser.hs
+++ b/ghc-parser/Language/Haskell/GHC/Parser.hs
+{-# LANGUAGE DeriveFunctor #-}
 module Language.Haskell.GHC.Parser (
  -- Parser handling
  runParser,
@@ -7,6 +8,7 @@ module Language.Haskell.GHC.Parser (
  StringLoc(..),
  ParseOutput(..),
  Parser,
+  Located(..),
  -- Different parsers
  parserStatement,
@@ -21,18 +23,22 @@ module Language.Haskell.GHC.Parser (
  partialTypeSignature,
  partialModule,
  partialExpression,
+  -- Haskell string preprocessing.
+  removeComments,
+  layoutChunks,
  ) where
-import Data.List (intercalate)
+import Data.List (intercalate, findIndex)
 import Bag
 import ErrUtils hiding (ErrMsg)
 import FastString
-import GHC
+import GHC hiding (Located)
 import Lexer
 import OrdList
 import Outputable hiding ((<>))
-import SrcLoc
+import SrcLoc hiding (Located)
 import StringBuffer
 import qualified Language.Haskell.GHC.HappyParser as Parse
@@ -57,6 +63,13 @@ data ParseOutput a
    deriving (Eq, Show)                  -- Auxiliary strings say what part of the
                                  -- input string was used and what
                                  -- part is remaining.
+                                  --
+-- | Store locations along with a value.
+data Located a = Located {
+    line :: LineNumber, -- Where this element is located.
+    unloc :: a          -- Located element. 
+  } deriving (Eq, Show, Functor)
 data ParserType = FullParser | PartialParser
 data Parser a = Parser ParserType (P a)
@@ -128,3 +141,79 @@ splitAtLoc line col string =
 -- Not the same as 'unlines', due to trailing \n
 joinLines :: [String] -> String
 joinLines = intercalate "\n"
+-- | Split an input string into chunks based on indentation.
+-- A chunk is a line and all lines immediately following that are indented
+-- beyond the indentation of the first line. This parses Haskell layout
+-- rules properly, and allows using multiline expressions via indentation. 
+layoutChunks :: String -> [Located String]
+layoutChunks = go 1
+  where
+    go :: LineNumber -> String -> [Located String]
+    go line = filter (not . null . unloc) . map (fmap strip) . layoutLines line . lines
+    -- drop spaces on left and right
+    strip = dropRight . dropLeft
+      where
+        dropLeft = dropWhile (`elem` whitespace)
+        dropRight = reverse . dropWhile (`elem` whitespace) . reverse
+        whitespace = " \t\n"
+    layoutLines :: LineNumber -> [String] -> [Located String]
+    -- Empty string case.  If there's no input, output is empty.
+    layoutLines _ [] = []
+    -- Use the indent of the first line to find the end of the first block.
+    layoutLines lineIdx all@(firstLine:rest) = 
+      let firstIndent = indentLevel firstLine
+          blockEnded line = indentLevel line <= firstIndent in
+        case findIndex blockEnded rest of
+          -- If the first block doesn't end, return the whole string, since
+          -- that just means the block takes up the entire string. 
+          Nothing -> [Located lineIdx $ intercalate "\n" all]
+          -- We found the end of the block. Split this bit out and recurse.
+          Just idx -> 
+            let (before, after) = splitAt idx rest in
+              Located lineIdx (joinLines $ firstLine:before) : go (lineIdx + idx + 1) (joinLines after)
+    -- Compute indent level of a string as number of leading spaces.
+    indentLevel :: String -> Int
+    indentLevel (' ':str) = 1 + indentLevel str
+    -- Count a tab as two spaces.
+    indentLevel ('\t':str) = 2 + indentLevel str
+    -- Count empty lines as a large indent level, so they're always with the previous expression.
+    indentLevel "" = 100000
+    indentLevel _ = 0
+-- | Drop comments from Haskell source.
+-- Simply gets rid of them, does not replace them in any way.
+removeComments :: String -> String
+removeComments = removeOneLineComments . removeMultilineComments 0
+  where
+    removeOneLineComments str =
+      case str of
+        -- Don't remove comments after cmd directives
+        ':':'!':remaining ->":!" ++ takeLine remaining ++ dropLine remaining
+        '-':'-':remaining -> dropLine remaining
+        x:xs -> x:removeOneLineComments xs
+        [] -> []
+      where
+        dropLine = removeOneLineComments . dropWhile (/= '\n')
+        takeLine = takeWhile (/= '\n')
+    removeMultilineComments nesting str =
+      case str of
+        '{':'-':remaining -> removeMultilineComments (nesting + 1) remaining
+        '-':'}':remaining -> 
+          if nesting > 0
+          then removeMultilineComments (nesting - 1) remaining
+          else '-':'}':removeMultilineComments nesting remaining
+        x:xs -> 
+          if nesting > 0
+          then removeMultilineComments nesting xs
+          else x:removeMultilineComments nesting xs
+        [] -> []
--- a/ghc-parser/ghc-parser.cabal
+++ b/ghc-parser/ghc-parser.cabal
@@ -2,7 +2,7 @@
 -- documentation, see http://haskell.org/cabal/users-guide/
 name:                ghc-parser
-version:             0.1.0.0
+version:             0.1.1.0
 synopsis:            Haskell source parser from GHC.
 -- description:         
 homepage:            https://github.com/gibiansky/IHaskell

--- a/ihaskell.cabal
+++ b/ihaskell.cabal
@@ -64,7 +64,7 @@ library
                       directory            -any,
                       filepath             -any,
                       ghc                  ==7.6.*,
-                       ghc-parser           -any,
+                       ghc-parser           >=0.1.1,
                       ghc-paths            ==0.1.*,
                       haskeline            -any,
                       here                 -any,

--- a/src/IHaskell/Eval/Lint.hs
+++ b/src/IHaskell/Eval/Lint.hs
@@ -17,7 +17,7 @@ import Data.Monoid
 import IHaskell.Types
 import IHaskell.Display
 import IHaskell.IPython
-import IHaskell.Eval.Parser
+import IHaskell.Eval.Parser hiding (line)
 data LintSeverity = LintWarning | LintError deriving (Eq, Show)

--- a/src/IHaskell/Eval/Parser.hs
+++ b/src/IHaskell/Eval/Parser.hs
@@ -50,11 +50,6 @@ data CodeBlock
  | ParseError StringLoc ErrMsg        -- ^ An error indicating that parsing the code block failed.
  deriving (Show, Eq)
-- | Store locations along with a value.
-data Located a = Located LineNumber a deriving (Eq, Show)
-instance Functor Located where
-  fmap f (Located line a) = Located line $ f a
 -- | Directive types. Each directive is associated with a string in the
 -- directive code block.
 data DirectiveType
@@ -70,14 +65,6 @@ data DirectiveType
  | GetDoc          -- ^ Get documentation for an identifier via Hoogle.
  deriving (Show, Eq)
-- | Unlocate something - drop the position.
-unloc :: Located a -> a
-unloc (Located _ a) = a
-- | Get the line number of a located element.
-line :: Located a -> LineNumber
-line (Located l _) = l
 -- | Parse a string into code blocks.
 parseString :: GhcMonad m => String -> m [Located CodeBlock]
 parseString codeString = do
@@ -88,7 +75,7 @@ parseString codeString = do
    Parsed {} -> return [Located 1 $ Module codeString]
    Failure {} -> do
      -- Split input into chunks based on indentation.
-      let chunks = layoutChunks $ dropComments codeString
+      let chunks = layoutChunks $ removeComments codeString
      result <- joinFunctions <$> processChunks [] chunks
      -- Return to previous flags. When parsing, flags can be set to make
@@ -268,69 +255,6 @@ parseDirective (':':directive) line = case find rightDirective directives of
      ]
 parseDirective _ _ = error "Directive must start with colon!"
-- | Split an input string into chunks based on indentation.
-- A chunk is a line and all lines immediately following that are indented
-- beyond the indentation of the first line. This parses Haskell layout
-- rules properly, and allows using multiline expressions via indentation. 
-layoutChunks :: String -> [Located String]
-layoutChunks = go 1
-  where
-    go :: LineNumber -> String -> [Located String]
-    go line = filter (not . null . unloc) . map (fmap strip) . layoutLines line . lines
-    layoutLines :: LineNumber -> [String] -> [Located String]
-    -- Empty string case.  If there's no input, output is empty.
-    layoutLines _ [] = []
-    -- Use the indent of the first line to find the end of the first block.
-    layoutLines lineIdx all@(firstLine:rest) = 
-      let firstIndent = indentLevel firstLine
-          blockEnded line = indentLevel line <= firstIndent in
-        case findIndex blockEnded rest of
-          -- If the first block doesn't end, return the whole string, since
-          -- that just means the block takes up the entire string. 
-          Nothing -> [Located lineIdx $ intercalate "\n" all]
-          -- We found the end of the block. Split this bit out and recurse.
-          Just idx -> 
-            let (before, after) = splitAt idx rest in
-              Located lineIdx (joinLines $ firstLine:before) : go (lineIdx + idx + 1) (joinLines after)
-    -- Compute indent level of a string as number of leading spaces.
-    indentLevel :: String -> Int
-    indentLevel (' ':str) = 1 + indentLevel str
-    -- Count a tab as two spaces.
-    indentLevel ('\t':str) = 2 + indentLevel str
-    -- Count empty lines as a large indent level, so they're always with the previous expression.
-    indentLevel "" = 100000
-    indentLevel _ = 0
-- Not the same as 'unlines', due to trailing \n
-joinLines :: [String] -> String
-joinLines = intercalate "\n"
-- | Drop comments from Haskell source.
-dropComments :: String -> String
-dropComments = removeOneLineComments . removeMultilineComments
-  where
-    -- Don't remove comments after cmd directives
-    removeOneLineComments (':':'!':remaining) = ":!" ++ takeWhile (/= '\n') remaining ++ 
-      removeOneLineComments (dropWhile (/= '\n') remaining)
-    removeOneLineComments ('-':'-':remaining) = removeOneLineComments (dropWhile (/= '\n') remaining)
-    removeOneLineComments (x:xs) = x:removeOneLineComments xs
-    removeOneLineComments x = x
-    removeMultilineComments ('{':'-':remaining) = 
-      case subIndex "-}" remaining of
-        Nothing -> ""
-        Just idx -> removeMultilineComments $ drop (2 + idx) remaining
-    removeMultilineComments (x:xs) = x:removeMultilineComments xs
-    removeMultilineComments x = x
 -- | Parse a module and return the name declared in the 'module X where'
 -- line. That line is required, and if it does not exist, this will error.
 -- Names with periods in them are returned piece y piece.
@@ -344,3 +268,7 @@ getModuleName moduleSrc = do
      case unLoc <$> hsmodName (unLoc mod) of
        Nothing -> error "Module must have a name."
        Just name -> return $ split "." $ moduleNameString name
+-- Not the same as 'unlines', due to trailing \n
+joinLines :: [String] -> String
+joinLines = intercalate "\n"