diff --git a/app/CleanCsvCorpus.hs b/bin/gargantext-cli/CleanCsvCorpus.hs
similarity index 100%
rename from app/CleanCsvCorpus.hs
rename to bin/gargantext-cli/CleanCsvCorpus.hs
diff --git a/app/Main.hs b/bin/gargantext-server/Main.hs
similarity index 100%
rename from app/Main.hs
rename to bin/gargantext-server/Main.hs
diff --git a/notes/foldFinal.hs b/notes/foldFinal.hs
deleted file mode 100644
index d267f9ac6873a878fc9673fdd651c4c02c52e319..0000000000000000000000000000000000000000
--- a/notes/foldFinal.hs
+++ /dev/null
@@ -1,25 +0,0 @@
-{-# LANGUAGE ExistentialQuantification #-}
-{-# LANGUAGE RankNTypes #-}
-
-import Control.Lens (Getting, foldMapOf)
-
-data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
-
-
-instance Functor (Fold i) where
-    fmap k (Fold tally summarize) = Fold tally (k . summarize)
-
-instance Applicative (Fold i) where
-    pure o = Fold (\_ -> ()) (\_ -> o)
-
-    Fold tallyF summarizeF <*> Fold tallyX summarizeX = Fold tally summarize
-        where
-            tally i = (tallyF i, tallyX i)
-            summarize (nF, nX) = summarizeF nF (summarizeX nX)
-
-focus :: (forall m . Monoid m => Getting m b a) -> Fold a o -> Fold b o
-focus lens (Fold tally summarize) = Fold (foldMapOf lens tally) summarize
-
-
-
-
diff --git a/notes/folds.hs b/notes/folds.hs
deleted file mode 100644
index 1bc98b547f0594747ac93fbdaaa95dafbd2fbbad..0000000000000000000000000000000000000000
--- a/notes/folds.hs
+++ /dev/null
@@ -1,56 +0,0 @@
-{-# LANGUAGE ExistentialQuantification #-}
-
--- | Thanks to Gabriel Gonzales and his beautiful folds
-
-import Data.Monoid
-import Prelude hiding (head, last, all, any, sum, product, length)
-
-import qualified Data.Foldable
-
-data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
-
-fold :: Fold i o -> [i] -> o
-fold (Fold tally summarize) is = summarize (reduce (map tally is))
-    where
-        reduce = Data.Foldable.foldl' (<>) mempty
-
---
-head :: Fold a (Maybe a)
-head = Fold (First . Just) getFirst
-
-last :: Fold a (Maybe a)
-last = Fold (Last . Just) getLast
---
-all :: (a -> Bool) -> Fold a Bool
-all predicate = Fold (All . predicate) getAll
-
-any :: (a -> Bool) -> Fold a Bool
-any predicate = Fold (Any . predicate) getAny
---
-sum :: Num n => Fold n n
-sum = Fold Sum getSum
-
-product :: Num n => Fold n n
-product = Fold Product getProduct
-
-length :: Num n => Fold i n
-length = Fold (\_ -> Sum 1) getSum
-
---
-{-# LANGUAGE BangPatterns #-}
-
-data Average a = Average { numerator :: !a, denominator :: !Int }
-
-instance Num a => Monoid (Average a) where
-    mempty = Average 0 0
-    mappend (Average xL nL) (Average xR nR) = Average (xL + xR) (nL + nR)
-
-average :: Fractional a => Fold a a
-average = Fold tally summarize
-    where
-        tally x = Average x 1
-        summarize (Average numerator denominator) =
-            numerator / fromIntegral denominator
-
-
-
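The two note files removed above implement the "beautiful folds" pattern: a Fold pairs a per-element tally into a Monoid with a final summarize step, and the Applicative instance in foldFinal.hs combines the intermediate monoids pairwise. As a point of reference only (none of this is part of the changeset; the names sumF, lengthF and averageF are invented here), a minimal self-contained sketch of how those pieces fit together, computing an average in a single traversal:

{-# LANGUAGE ExistentialQuantification #-}

import Data.Monoid (Sum (..))
import qualified Data.Foldable as F

-- Same shape as the deleted notes: tally into a Monoid, then summarize.
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)

fold :: Fold i o -> [i] -> o
fold (Fold tally summarize) = summarize . F.foldl' (<>) mempty . map tally

instance Functor (Fold i) where
  fmap k (Fold tally summarize) = Fold tally (k . summarize)

instance Applicative (Fold i) where
  pure o = Fold (\_ -> ()) (\_ -> o)
  Fold tallyF summarizeF <*> Fold tallyX summarizeX =
    Fold (\i -> (tallyF i, tallyX i))                 -- pair the monoids
         (\(mF, mX) -> summarizeF mF (summarizeX mX)) -- apply the summaries

sumF :: Num n => Fold n n
sumF = Fold Sum getSum

lengthF :: Num n => Fold i n
lengthF = Fold (\_ -> Sum (1 :: Int)) (fromIntegral . getSum)

-- Two folds fused into one fold that still walks the input only once.
averageF :: Fractional a => Fold a a
averageF = (/) <$> sumF <*> lengthF

main :: IO ()
main = print (fold averageF [1, 2, 3, 4 :: Double])  -- prints 2.5

This is the point of the Applicative instance: composed folds traverse the list once instead of once per statistic.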
diff --git a/package.yaml b/package.yaml
index 1ceb401bf92ecb8229e0f48fc867980a1ed75954..bc7f35b26c09a1c6236fb1b0a167836fe19b440c 100644
--- a/package.yaml
+++ b/package.yaml
@@ -124,9 +124,29 @@ library:
   #  - utc
 
 executables:
-  gargantext:
+  gargantext-server:
     main: Main.hs
-    source-dirs: app
+    source-dirs: bin/gargantext-server
+    ghc-options:
+    - -threaded
+    - -rtsopts
+    - -with-rtsopts=-N
+    - -O2
+    - -Wmissing-signatures
+    dependencies:
+    - base
+    - containers
+    - gargantext
+    - vector
+    - cassava
+    - ini
+    - optparse-generic
+    - unordered-containers
+    - full-text-search
+
+  gargantext-cli:
+    main: Main.hs
+    source-dirs: bin/gargantext-cli
     ghc-options:
     - -threaded
     - -rtsopts
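The executables section now defines two binaries instead of one: gargantext-server (built from bin/gargantext-server, the former app/Main.hs) and gargantext-cli (built from bin/gargantext-cli, which also receives the relocated CleanCsvCorpus.hs). Assuming the usual hpack/stack workflow for a package.yaml-based project, they would be built and started with something like stack build followed by stack exec gargantext-server or stack exec gargantext-cli; the exact invocation is not part of this changeset.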
diff --git a/src/Gargantext/Text/Parsers.hs b/src/Gargantext/Text/Parsers.hs
index c94da34eabdd3d7660ffcc095f606ad65e04e5f8..dbddbbd644b359b7813ec36b1a69bd24cfec4a24 100644
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -23,20 +23,31 @@ please follow the types.
 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
     where
 
-import Gargantext.Prelude
+import System.FilePath (FilePath(), takeExtension)
+import Codec.Archive.Zip (withArchive, getEntry, getEntries)
 
-import System.FilePath (FilePath())
+import Data.Either.Extra (partitionEithers)
+import Data.List (concat)
 import qualified Data.Map as DM
+import qualified Data.ByteString as DB
 import Data.Ord()
 import Data.String()
+import Data.Either(Either(..))
+import Data.Attoparsec.ByteString (parseOnly, Parser)
 import Data.Text (Text)
 import qualified Data.Text as DT
 
 -- | Activate Async for to parse in parallel
---import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Concurrent.Async as CCA (mapConcurrently)
+import Data.Text.Encoding (decodeUtf8)
 import Data.String (String())
+------------------------------------------------------------------------
+import Gargantext.Prelude
+import Gargantext.Text.Parsers.WOS (wosParser)
+------------------------------------------------------------------------
+
 
 type ParseError = String
 
 type Field = Text
 
@@ -60,38 +71,37 @@ data FileFormat = WOS -- Implemented (ISI Format)
 
 -- TODO: to debug maybe add the filepath in error message
 
---parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
---parse format path = do
---    files <- case takeExtension path of
---        ".zip" -> openZip              path
---        _      -> pure <$> DB.readFile path
---    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
---    pure (as, map toText $ concat bs)
---    where
---      -- TODO : decode with bayesian inference on encodings
---      toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
---
---
----- | withParser:
----- According the format of the text, choosing the right parser.
----- TODO withParser :: FileFormat -> Parser [Document]
---withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
---withParser WOS = wosParser
-----withParser DOC = docParser
-----withParser ODT = odtParser
-----withParser XML = xmlParser
-----withParser _   = error "[ERROR] Parser not implemented yet"
---
---runParser :: FileFormat -> DB.ByteString
---          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
---runParser format text = pure $ parseOnly (withParser format) text
---
---openZip :: FilePath -> IO [DB.ByteString]
---openZip fp = do
---    path    <- resolveFile' fp
---    entries <- withArchive path (DM.keys <$> getEntries)
---    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
---    pure bs
+parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
+parse format path = do
+    files <- case takeExtension path of
+        ".zip" -> openZip              path
+        _      -> pure <$> DB.readFile path
+    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
+    pure (as, map toText $ concat bs)
+    where
+      -- TODO : decode with bayesian inference on encodings
+      toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
+
+
+-- | withParser:
+-- According the format of the text, choosing the right parser.
+-- TODO withParser :: FileFormat -> Parser [Document]
+withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
+withParser WOS = wosParser
+--withParser DOC = docParser
+--withParser ODT = odtParser
+--withParser XML = xmlParser
+--withParser _   = error "[ERROR] Parser not implemented yet"
+
+runParser :: FileFormat -> DB.ByteString
+          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
+runParser format text = pure $ parseOnly (withParser format) text
+
+openZip :: FilePath -> IO [DB.ByteString]
+openZip fp = do
+    entries <- withArchive fp (DM.keys <$> getEntries)
+    bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
+    pure bs
 
 clean :: Text -> Text
 clean txt = DT.map clean' txt
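For context, a hypothetical caller of the parse function re-enabled above (not part of this diff; the file path is invented): parse returns the accumulated parse errors alongside the parsed documents, each document being a list of (field, value) Text pairs. Note that withParser only covers the WOS constructor in this change, so other FileFormat values would fail until their parsers are implemented.

import Gargantext.Text.Parsers (FileFormat (WOS), parse)

main :: IO ()
main = do
  -- A .zip path goes through openZip and is parsed entry by entry;
  -- any other extension is read as a single file.
  (errs, docs) <- parse WOS "corpus/wos-export.zip"  -- hypothetical path
  mapM_ putStrLn errs                                -- ParseError is String
  print (length docs)                                -- number of parsed documents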