Commit 6492bff3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM] gargantext-import, too much RAM used

parent b6b50641
Pipeline #288 failed with stage
......@@ -22,7 +22,7 @@ module Main where
import Control.Exception (finally)
import Servant (ServantErr)
import Gargantext.Prelude
import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
import Gargantext.Database.Flow (FlowCmdM, flowCorpus'')
import Gargantext.Text.Parsers (FileFormat(CsvHalFormat))
import Gargantext.Database.Utils (Cmd, )
import Gargantext.Database.Types.Node (CorpusId)
......@@ -32,6 +32,9 @@ import Gargantext.Core (Lang(..))
import Gargantext.API.Node () -- instances
import Gargantext.API.Settings (newDevEnvWith, runCmdDev, DevEnv)
import System.Environment (getArgs)
import Gargantext.Text.Parsers.GrandDebat (readFile, GrandDebatReference(..))
import qualified Data.Text as Text
import Control.Monad.IO.Class (liftIO)
main :: IO ()
main = do
......@@ -39,10 +42,15 @@ main = do
{-let createUsers :: Cmd ServantErr Int64
createUsers = insertUsers [gargantuaUser,simpleUser]
-}
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
cmdCorpus = flowCorpus (cs user) (cs name) (Mono EN) CsvHalFormat corpusPath
-}
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m [CorpusId]
cmdCorpus = do
docs <- liftIO (splitEvery 1000 <$> take 5000 <$> readFile corpusPath :: IO [[GrandDebatReference ]])
ids <- flowCorpus'' (Text.pack user) (Text.pack name) (Mono FR) docs
pure ids
-- cmd = {-createUsers >>-} cmdCorpus
......
......@@ -116,6 +116,7 @@ library:
- insert-ordered-containers
- jose-jwt
# - kmeans-vector
- json-stream
- KMP
- lens
- located-base
......
......@@ -80,6 +80,12 @@ flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
flowCorpus u cn la ff fp = liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
--{-
flowCorpus'' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m [CorpusId]
flowCorpus'' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
--}
flowCorpus' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [a] -> m CorpusId
flowCorpus' u cn la docs = do
......
......@@ -20,8 +20,10 @@ module Gargantext.Text.Parsers.GrandDebat
where
import GHC.IO (FilePath)
import Data.Aeson (ToJSON, FromJSON, decode)
import Data.Maybe (Maybe(), maybe)
import Data.Aeson (ToJSON, FromJSON)
import Data.JsonStream.Parser (eitherDecode)
import Data.Either (either)
import Data.Maybe (Maybe())
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.ByteString.Lazy as DBL
......@@ -76,13 +78,13 @@ instance ToHyperdataDocument GrandDebatReference
responses') =
HyperdataDocument (Just "GrandDebat") id'
Nothing Nothing Nothing Nothing
title' authorType' authorZipCode' authorZipCode'
title' authorType' authorType' authorZipCode'
(toAbstract <$> responses')
publishedAt'
Nothing Nothing Nothing Nothing Nothing Nothing
(Just $ Text.pack $ show FR)
where
toAbstract = (Text.intercalate " . ") . (map toSentence)
toAbstract = (Text.intercalate " . ") . ((filter (/= "")) . (map toSentence))
toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of
Nothing -> ""
Just r' -> case Text.length r' > 10 of
......@@ -95,4 +97,5 @@ class ReadFile a
instance ReadFile [GrandDebatReference]
where
readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
readFile fp = either (panic . Text.pack) identity <$> eitherDecode <$> DBL.readFile fp
......@@ -35,6 +35,7 @@ extra-deps:
- probable-0.1.3
- rake-0.0.1
- rdf4h-3.1.1
- json-stream-0.4.2.4 # Text.Parsers (JSON)
- serialise-0.2.0.0
- servant-flatten-0.2
- servant-multipart-0.11.2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment