Commit 6492bff3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM] gargantext-import, too much RAM used

parent b6b50641
Pipeline #288 failed with stage
...@@ -22,7 +22,7 @@ module Main where ...@@ -22,7 +22,7 @@ module Main where
import Control.Exception (finally) import Control.Exception (finally)
import Servant (ServantErr) import Servant (ServantErr)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Flow (FlowCmdM, flowCorpus) import Gargantext.Database.Flow (FlowCmdM, flowCorpus'')
import Gargantext.Text.Parsers (FileFormat(CsvHalFormat)) import Gargantext.Text.Parsers (FileFormat(CsvHalFormat))
import Gargantext.Database.Utils (Cmd, ) import Gargantext.Database.Utils (Cmd, )
import Gargantext.Database.Types.Node (CorpusId) import Gargantext.Database.Types.Node (CorpusId)
...@@ -32,6 +32,9 @@ import Gargantext.Core (Lang(..)) ...@@ -32,6 +32,9 @@ import Gargantext.Core (Lang(..))
import Gargantext.API.Node () -- instances import Gargantext.API.Node () -- instances
import Gargantext.API.Settings (newDevEnvWith, runCmdDev, DevEnv) import Gargantext.API.Settings (newDevEnvWith, runCmdDev, DevEnv)
import System.Environment (getArgs) import System.Environment (getArgs)
import Gargantext.Text.Parsers.GrandDebat (readFile, GrandDebatReference(..))
import qualified Data.Text as Text
import Control.Monad.IO.Class (liftIO)
main :: IO () main :: IO ()
main = do main = do
...@@ -39,10 +42,15 @@ main = do ...@@ -39,10 +42,15 @@ main = do
{-let createUsers :: Cmd ServantErr Int64 {-let createUsers :: Cmd ServantErr Int64
createUsers = insertUsers [gargantuaUser,simpleUser] createUsers = insertUsers [gargantuaUser,simpleUser]
-}
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
cmdCorpus = flowCorpus (cs user) (cs name) (Mono EN) CsvHalFormat corpusPath cmdCorpus = flowCorpus (cs user) (cs name) (Mono EN) CsvHalFormat corpusPath
-}
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m [CorpusId]
cmdCorpus = do
docs <- liftIO (splitEvery 1000 <$> take 5000 <$> readFile corpusPath :: IO [[GrandDebatReference ]])
ids <- flowCorpus'' (Text.pack user) (Text.pack name) (Mono FR) docs
pure ids
-- cmd = {-createUsers >>-} cmdCorpus -- cmd = {-createUsers >>-} cmdCorpus
......
...@@ -116,6 +116,7 @@ library: ...@@ -116,6 +116,7 @@ library:
- insert-ordered-containers - insert-ordered-containers
- jose-jwt - jose-jwt
# - kmeans-vector # - kmeans-vector
- json-stream
- KMP - KMP
- lens - lens
- located-base - located-base
......
...@@ -80,6 +80,12 @@ flowCorpus :: FlowCmdM env ServantErr m ...@@ -80,6 +80,12 @@ flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId => Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
flowCorpus u cn la ff fp = liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs flowCorpus u cn la ff fp = liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
--{-
flowCorpus'' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m [CorpusId]
flowCorpus'' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
--}
flowCorpus' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a) flowCorpus' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [a] -> m CorpusId => Username -> CorpusName -> TermType Lang -> [a] -> m CorpusId
flowCorpus' u cn la docs = do flowCorpus' u cn la docs = do
......
...@@ -20,8 +20,10 @@ module Gargantext.Text.Parsers.GrandDebat ...@@ -20,8 +20,10 @@ module Gargantext.Text.Parsers.GrandDebat
where where
import GHC.IO (FilePath) import GHC.IO (FilePath)
import Data.Aeson (ToJSON, FromJSON, decode) import Data.Aeson (ToJSON, FromJSON)
import Data.Maybe (Maybe(), maybe) import Data.JsonStream.Parser (eitherDecode)
import Data.Either (either)
import Data.Maybe (Maybe())
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
...@@ -76,13 +78,13 @@ instance ToHyperdataDocument GrandDebatReference ...@@ -76,13 +78,13 @@ instance ToHyperdataDocument GrandDebatReference
responses') = responses') =
HyperdataDocument (Just "GrandDebat") id' HyperdataDocument (Just "GrandDebat") id'
Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
title' authorType' authorZipCode' authorZipCode' title' authorType' authorType' authorZipCode'
(toAbstract <$> responses') (toAbstract <$> responses')
publishedAt' publishedAt'
Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing
(Just $ Text.pack $ show FR) (Just $ Text.pack $ show FR)
where where
toAbstract = (Text.intercalate " . ") . (map toSentence) toAbstract = (Text.intercalate " . ") . ((filter (/= "")) . (map toSentence))
toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of
Nothing -> "" Nothing -> ""
Just r' -> case Text.length r' > 10 of Just r' -> case Text.length r' > 10 of
...@@ -95,4 +97,5 @@ class ReadFile a ...@@ -95,4 +97,5 @@ class ReadFile a
instance ReadFile [GrandDebatReference] instance ReadFile [GrandDebatReference]
where where
readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp --readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
readFile fp = either (panic . Text.pack) identity <$> eitherDecode <$> DBL.readFile fp
...@@ -35,6 +35,7 @@ extra-deps: ...@@ -35,6 +35,7 @@ extra-deps:
- probable-0.1.3 - probable-0.1.3
- rake-0.0.1 - rake-0.0.1
- rdf4h-3.1.1 - rdf4h-3.1.1
- json-stream-0.4.2.4 # Text.Parsers (JSON)
- serialise-0.2.0.0 - serialise-0.2.0.0
- servant-flatten-0.2 - servant-flatten-0.2
- servant-multipart-0.11.2 - servant-multipart-0.11.2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment