Commit 7592f5a8 authored by Alexandre Delanoë

Merge branch '111-dev-refactor-text-corpus-api-with-conduit-alp' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext into dev
parents 1d721ba3 7c64de1e
@@ -17,10 +17,11 @@ module Main where
 import Control.Exception (finally)
 import Data.Either
+import Data.Maybe (Maybe(..))
 import Data.Text (Text)
-import Prelude (read)
 import System.Environment (getArgs)
 import qualified Data.Text as Text
+import Text.Read (readMaybe)
 import Gargantext.API.Dev (withDevEnv, runCmdDev)
 import Gargantext.API.Admin.EnvTypes (DevEnv(..))
@@ -46,11 +47,14 @@ main = do
       --tt = (Unsupervised EN 6 0 Nothing)
       tt = (Multi EN)
       format = CsvGargV3 -- CsvHal --WOS
+      limit' = case (readMaybe limit :: Maybe Int) of
+        Nothing -> panic $ "Cannot read limit: " <> (Text.pack limit)
+        Just l -> l
       corpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
-      corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) (read limit :: Int) tt format corpusPath Nothing (\_ -> pure ())
+      corpus = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt format corpusPath Nothing (\_ -> pure ())
       corpusCsvHal :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
-      corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) (read limit :: Int) tt CsvHal corpusPath Nothing (\_ -> pure ())
+      corpusCsvHal = flowCorpusFile (UserName $ cs user) (Left (cs name :: Text)) limit' tt CsvHal corpusPath Nothing (\_ -> pure ())
       annuaire :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
       annuaire = flowAnnuaire (UserName $ cs user) (Left "Annuaire") (Multi EN) corpusPath (\_ -> pure ())
...
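Note on the hunk above: replacing the partial `read` with `readMaybe` turns a malformed `limit` argument into an explicit error instead of a `Prelude.read: no parse` crash. A minimal standalone sketch of the same pattern, assuming only `base` and `text` (the `parseLimit` name and the `main` driver are illustrative, not part of the patch):

```haskell
import Data.Text (Text)
import qualified Data.Text as Text
import Text.Read (readMaybe)

-- Parse a numeric CLI argument, reporting a readable error instead of
-- crashing with "Prelude.read: no parse".
parseLimit :: String -> Either Text Int
parseLimit raw =
  case readMaybe raw :: Maybe Int of
    Nothing -> Left $ Text.pack ("Cannot read limit: " <> raw)
    Just l  -> Right l

main :: IO ()
main = print (parseLimit "500", parseLimit "oops")
```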
@@ -22,6 +22,7 @@ Portability : POSIX
 module Gargantext.API.Node.Contact
   where
+import Conduit
 import Data.Aeson
 import Data.Either (Either(Right))
 import Data.Maybe (Maybe(..))
@@ -93,7 +94,7 @@ addContact u nId (AddContactParams fn ln) logStatus = do
                     , _scst_remaining = Just 1
                     , _scst_events = Just []
                     }
-  _ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing [[hyperdataContact fn ln]] logStatus
+  _ <- flow (Nothing :: Maybe HyperdataAnnuaire) u (Right [nId]) (Multi EN) Nothing (Just 1, yield $ hyperdataContact fn ln) logStatus
   pure JobLog { _scst_succeeded = Just 2
               , _scst_failed = Just 0
...
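The `flow` call above now receives a `(Maybe Integer, ConduitT () a m ())` pair, an optional total count plus a document stream, in place of the old `[[a]]` batches; a single contact becomes `(Just 1, yield ...)`. A minimal sketch of that shape using the conduit package (the `Doc` type and the `singleDocSource` helper are illustrative):

```haskell
import Conduit

newtype Doc = Doc String
  deriving Show

-- A one-element source paired with its known length, mirroring the
-- (Just 1, yield $ hyperdataContact fn ln) argument passed to flow above.
singleDocSource :: Monad m => Doc -> (Maybe Integer, ConduitT () Doc m ())
singleDocSource d = (Just 1, yield d)

main :: IO ()
main = do
  let (mLen, src) = singleDocSource (Doc "contact")
  docs <- runConduit $ src .| sinkList
  print (mLen, docs)
```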
@@ -18,6 +18,8 @@ New corpus means either:
 module Gargantext.API.Node.Corpus.New
   where
+import Conduit
 import Control.Lens hiding (elements, Empty)
 import Data.Aeson
 import Data.Aeson.TH (deriveJSON)
@@ -39,7 +41,7 @@ import Gargantext.Prelude
 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
 import Gargantext.API.Admin.Types (HasSettings)
-import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
+import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
 import Gargantext.API.Node.Corpus.New.File
 import Gargantext.API.Node.Corpus.Searx
 import Gargantext.API.Node.Corpus.Types
@@ -213,15 +215,20 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
   -- TODO if cid is folder -> create Corpus
   -- if cid is corpus -> add to corpus
   -- if cid is root -> create corpus in Private
-  txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
+  eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
+  let lTxts = lefts eTxts
-  logStatus JobLog { _scst_succeeded = Just 2
+  case lTxts of
+    [] -> do
+      let txts = rights eTxts
+      -- TODO Sum lenghts of each txt elements
+      logStatus $ JobLog { _scst_succeeded = Just 2
                          , _scst_failed = Just 0
                          , _scst_remaining = Just $ 1 + length txts
                          , _scst_events = Just []
                          }
-  cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
+      cids <- mapM (\txt -> do
+                      flowDataText user txt (Multi l) cid Nothing logStatus) txts
      printDebug "corpus id" cids
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user
@@ -232,6 +239,14 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
        , _scst_events = Just []
        }
+    (err:_) -> do
+      pure $ addEvent "ERROR" (T.pack $ show err) $
+        JobLog { _scst_succeeded = Just 2
+               , _scst_failed = Just 1
+               , _scst_remaining = Just 0
+               , _scst_events = Just []
+               }
 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
     :> "corpus"
@@ -268,15 +283,16 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
         _ -> cs d
   eDocs <- liftBase $ parse data'
   case eDocs of
-    Right docs' -> do
+    Right docs -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action
      limit' <- view $ hasConfig . gc_max_docs_parsers
      let limit = fromIntegral limit'
-      if length docs' > limit then do
-        printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
+      if length docs > limit then do
+        printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs)
        let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
-                        , show $ length docs'
+                        , show $ length docs
                        , ") exceeds the MAX_DOCS_PARSERS limit ("
                        , show limit
                        , ")" ]
@@ -285,7 +301,6 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
        panic panicMsg
      else
        pure ()
-      let docs = splitEvery 500 $ take limit docs'
      printDebug "Parsing corpus finished : " cid
      logStatus jobLog2
@@ -296,7 +311,8 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
           (Right [cid])
           (Multi $ fromMaybe EN l)
           Nothing
-          (map (map toHyperdataDocument) docs)
+          (Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument)
+          --(map (map toHyperdataDocument) docs)
           logStatus
      printDebug "Extraction finished : " cid
...
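In the form-upload path above, the pre-chunking with `splitEvery 500` is gone: the parsed list is handed to `flow` as a stream plus its length, and the conversion to `HyperdataDocument` happens inside the pipeline via `mapC`. A small sketch of that `yieldMany ... .| mapC ...` shape, with illustrative placeholder types:

```haskell
import Conduit

newtype Parsed    = Parsed String
newtype Hyperdata = Hyperdata String
  deriving Show

-- Stand-in for toHyperdataDocument: a pure conversion of one parsed record.
toHyper :: Parsed -> Hyperdata
toHyper (Parsed s) = Hyperdata s

-- Package a fully parsed list as the (length, stream) pair that flow expects,
-- converting elements on the fly instead of building a second list.
asStream :: Monad m => [Parsed] -> (Maybe Integer, ConduitT () Hyperdata m ())
asStream docs =
  ( Just $ fromIntegral $ length docs
  , yieldMany docs .| mapC toHyper )

main :: IO ()
main = do
  let (n, src) = asStream [Parsed "a", Parsed "b"]
  out <- runConduit $ src .| sinkList
  print (n, out)
```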
@@ -16,6 +16,7 @@ Portability : POSIX
 module Gargantext.API.Node.DocumentsFromWriteNodes
   where
+import Conduit
 import Control.Lens ((^.))
 import Data.Aeson
 import Data.Either (Either(..), rights)
@@ -100,7 +101,7 @@ documentsFromWriteNodes uId nId _p logStatus = do
   let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
   let parsed = rights parsedE
-  _ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) cId Nothing logStatus
+  _ <- flowDataText (RootId (NodeId uId)) (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed)) (Multi EN) cId Nothing logStatus
   pure $ jobLogSuccess jobLog
 ------------------------------------------------------------------------
...
@@ -18,6 +18,8 @@ module Gargantext.Core.Text.Corpus.API
   )
   where
+import Conduit
+import Data.Either (Either(..))
 import Data.Maybe
 import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
 import Gargantext.Core (Lang(..))
@@ -27,6 +29,7 @@ import qualified Gargantext.Core.Text.Corpus.API.Hal as HAL
 import qualified Gargantext.Core.Text.Corpus.API.Isidore as ISIDORE
 import qualified Gargantext.Core.Text.Corpus.API.Istex as ISTEX
 import qualified Gargantext.Core.Text.Corpus.API.Pubmed as PUBMED
+import Servant.Client (ClientError)
 -- | TODO put in gargantext.init
 default_limit :: Maybe Integer
@@ -37,11 +40,18 @@ get :: ExternalAPIs
     -> Lang
     -> Query
     -> Maybe Limit
-    -> IO [HyperdataDocument]
+    -- -> IO [HyperdataDocument]
+    -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
-get PubMed _la q _l = PUBMED.get q default_limit -- EN only by default
-get HAL la q _l = HAL.get la q default_limit
-get IsTex la q _l = ISTEX.get la q default_limit
-get Isidore la q _l = ISIDORE.get la (fromIntegral <$> default_limit) (Just q) Nothing
+get PubMed _la q _l = PUBMED.get q Nothing
+  --docs <- PUBMED.get q default_limit -- EN only by default
+  --pure (Just $ fromIntegral $ length docs, yieldMany docs)
+get HAL la q _l = HAL.getC la q Nothing
+get IsTex la q _l = do
+  docs <- ISTEX.get la q default_limit
+  pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
+get Isidore la q _l = do
+  docs <- ISIDORE.get la (fromIntegral <$> default_limit) (Just q) Nothing
+  pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
 get _ _ _ _ = undefined
 -- | Some Sugar for the documentation
...
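The dispatcher above now returns `Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ())` for every backend; connectors that still fetch plain lists (ISTEX, Isidore) are adapted by reporting the list length and replaying it with `yieldMany`. A hedged sketch of that adapter, assuming the servant-client package for `ClientError`, with a fake `fetchAll` and a placeholder `Doc` type standing in for the real client calls:

```haskell
import Conduit
import Servant.Client (ClientError)

newtype Doc = Doc String
  deriving Show

-- Stand-in for a legacy connector that fetches everything into a list
-- (the way ISTEX.get and ISIDORE.get still do).
fetchAll :: IO [Doc]
fetchAll = pure [Doc "doc-1", Doc "doc-2"]

-- Adapt the list-based fetcher to the streaming interface: declare the
-- length up front and replay the list as a conduit source.
getC :: IO (Either ClientError (Maybe Integer, ConduitT () Doc IO ()))
getC = do
  docs <- fetchAll
  pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)

main :: IO ()
main = do
  eRes <- getC
  case eRes of
    Left err -> print err
    Right (mLen, src) -> do
      docs <- runConduit $ src .| sinkList
      print (mLen, docs)
```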
@@ -12,8 +12,11 @@ Portability : POSIX
 module Gargantext.Core.Text.Corpus.API.Hal
   where
+import Conduit
+import Data.Either
 import Data.Maybe
 import Data.Text (Text, pack, intercalate)
+import Servant.Client (ClientError)
 import Gargantext.Core (Lang(..))
 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
@@ -25,8 +28,16 @@ import qualified HAL.Doc.Corpus as HAL
 get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
 get la q ml = do
-  docs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml)
-  either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) docs
+  eDocs <- HAL.getMetadataWith q (Just 0) ml
+  either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
+getC :: Lang -> Text -> Maybe Integer -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
+getC la q ml = do
+  eRes <- HAL.getMetadataWithC q (Just 0) ml
+  pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
+  -- case eRes of
+  --   Left err -> panic $ pack $ show err
+  --   Right (len, docsC) -> pure (len, docsC .| mapMC (toDoc' la))
 toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
 toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
...
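`getC` above pipes the crawler's stream through `mapMC (toDoc' la)` because the per-document conversion runs in IO. A tiny self-contained example of `mapMC` doing an effectful transformation inside a conduit (the `enrich` function is illustrative):

```haskell
import Conduit

-- Effectful per-element conversion, standing in for toDoc', which needs IO
-- to finish building each document.
enrich :: Int -> IO String
enrich n = do
  putStrLn ("converting " <> show n)
  pure ("doc-" <> show n)

main :: IO ()
main = do
  out <- runConduit $ yieldMany [1 .. 3 :: Int] .| mapMC enrich .| sinkList
  print out
```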
@@ -13,9 +13,12 @@ Portability : POSIX
 module Gargantext.Core.Text.Corpus.API.Pubmed
   where
+import Conduit
+import Data.Either (Either)
 import Data.Maybe
 import Data.Text (Text)
 import qualified Data.Text as Text
+import Servant.Client (ClientError)
 import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
@@ -31,17 +34,21 @@ type Limit = PubMed.Limit
 -- | TODO put default pubmed query in gargantext.ini
 -- by default: 10K docs
-get :: Query -> Maybe Limit -> IO [HyperdataDocument]
-get q l = either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
-          <$> PubMed.getMetadataWith q l
+get :: Query -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
+get q l = do
+  eRes <- PubMed.getMetadataWithC q l
+  pure $ (\(len, docsC) -> (len, docsC .| mapC (toDoc EN))) <$> eRes
+  --either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
+  -- <$> PubMed.getMetadataWithC q l
 toDoc :: Lang -> PubMedDoc.PubMed -> HyperdataDocument
-toDoc l (PubMedDoc.PubMed (PubMedDoc.PubMedArticle t j as aus)
-                          (PubMedDoc.PubMedDate a y m d)
+toDoc l (PubMedDoc.PubMed { pubmed_id
+                          , pubmed_article = PubMedDoc.PubMedArticle t j as aus
+                          , pubmed_date = PubMedDoc.PubMedDate a y m d }
         ) = HyperdataDocument { _hd_bdd = Just "PubMed"
                               , _hd_doi = Nothing
                               , _hd_url = Nothing
-                              , _hd_uniqId = Nothing
+                              , _hd_uniqId = Just $ Text.pack $ show pubmed_id
                               , _hd_uniqIdBdd = Nothing
                               , _hd_page = Nothing
                               , _hd_title = t
...
@@ -34,7 +34,7 @@ import Gargantext.Prelude
 import Servant.API (FromHttpApiData(..), ToHttpApiData(..))
 import Test.QuickCheck (elements)
 import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
-import Text.Read (read)
+import Text.Read (readMaybe)
 type CorpusName = Text
 ------------------------------------------------------------------------
@@ -74,7 +74,11 @@ instance Semigroup ListType
 instance FromHttpApiData ListType where
-  parseUrlPiece = Right . read . unpack
+  parseUrlPiece s = Right s'
+    where
+      s' = case (readMaybe $ unpack s) of
+        Nothing -> panic $ "Cannot read url piece: " <> s
+        Just s'' -> s''
 instance ToHttpApiData ListType where
   toUrlPiece = pack . show
...
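The new `FromHttpApiData ListType` instance above still panics on unparsable input. Since `parseUrlPiece` already returns `Either Text a`, a fully total variant is also possible; the following is an alternative sketch, not what the patch does, with a standalone enum standing in for `ListType` (its constructors are illustrative):

```haskell
import Data.Text (Text, pack, unpack)
import Text.Read (readMaybe)

-- Standalone enum standing in for ListType; constructor names are illustrative.
data TermList = StopTerm | CandidateTerm | MapTerm
  deriving (Show, Read, Eq)

-- Total parser in the parseUrlPiece style: malformed input becomes a Left
-- that servant can turn into a 400 response, instead of a runtime panic.
parseTermList :: Text -> Either Text TermList
parseTermList s =
  case readMaybe (unpack s) of
    Nothing -> Left (pack "Cannot read url piece: " <> s)
    Just v  -> Right v

main :: IO ()
main = mapM_ (print . parseTermList . pack) ["MapTerm", "bogus"]
```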
@@ -46,8 +46,10 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
   )
     where
+import Conduit
 import Control.Lens ((^.), view, _Just, makeLenses)
 import Data.Aeson.TH (deriveJSON)
+import Data.Conduit.Internal (zipSources)
 import Data.Either
 import Data.HashMap.Strict (HashMap)
 import Data.Hashable (Hashable)
@@ -60,6 +62,7 @@ import qualified Data.Text as T
 import Data.Traversable (traverse)
 import Data.Tuple.Extra (first, second)
 import GHC.Generics (Generic)
+import Servant.Client (ClientError)
 import System.FilePath (FilePath)
 import qualified Data.HashMap.Strict as HashMap
 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
@@ -103,6 +106,7 @@ import Gargantext.Prelude
 import Gargantext.Prelude.Crypto.Hash (Hash)
 import qualified Gargantext.Core.Text.Corpus.API as API
 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
+import qualified Prelude as Prelude
 ------------------------------------------------------------------------
 -- Imports for upgrade function
@@ -127,7 +131,8 @@ allDataOrigins = map InternalOrigin API.externalAPIs
 ---------------
 data DataText = DataOld ![NodeId]
-              | DataNew ![[HyperdataDocument]]
+              | DataNew !(Maybe Integer, ConduitT () HyperdataDocument IO ())
+              -- | DataNew ![[HyperdataDocument]]
 -- TODO use the split parameter in config file
 getDataText :: FlowCmdM env err m
@@ -135,10 +140,10 @@ getDataText :: FlowCmdM env err m
             -> TermType Lang
             -> API.Query
             -> Maybe API.Limit
-            -> m DataText
+            -> m (Either ClientError DataText)
-getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
-                                         <$> splitEvery 500
-                                         <$> API.get api (_tt_lang la) q li
+getDataText (ExternalOrigin api) la q li = liftBase $ do
+  eRes <- API.get api (_tt_lang la) q li
+  pure $ DataNew <$> eRes
 getDataText (InternalOrigin _) _la q _li = do
   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
@@ -146,10 +151,11 @@ getDataText (InternalOrigin _) _la q _li = do
     (Left "")
     (Nothing :: Maybe HyperdataCorpus)
   ids <- map fst <$> searchDocInDatabase cId (stemIt q)
-  pure $ DataOld ids
+  pure $ Right $ DataOld ids
 -------------------------------------------------------------------------------
-flowDataText :: ( FlowCmdM env err m
+flowDataText :: forall env err m.
+                ( FlowCmdM env err m
                 )
                 => User
                 -> DataText
@@ -161,7 +167,8 @@ flowDataText :: ( FlowCmdM env err m
 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
   where
     corpusType = (Nothing :: Maybe HyperdataCorpus)
-flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
+flowDataText u (DataNew (mLen, txtC)) tt cid mfslw logStatus =
+  flowCorpus u (Right [cid]) tt mfslw (mLen, (transPipe liftBase txtC)) logStatus
 ------------------------------------------------------------------------
 -- TODO use proxy
@@ -173,8 +180,9 @@ flowAnnuaire :: (FlowCmdM env err m)
              -> (JobLog -> m ())
             -> m AnnuaireId
 flowAnnuaire u n l filePath logStatus = do
-  docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
-  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
+  -- TODO Conduit for file
+  docs <- liftBase $ ((readFile_Annuaire filePath) :: IO [HyperdataContact])
+  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing (Just $ fromIntegral $ length docs, yieldMany docs) logStatus
 ------------------------------------------------------------------------
 flowCorpusFile :: (FlowCmdM env err m)
@@ -185,12 +193,13 @@ flowCorpusFile :: (FlowCmdM env err m)
                -> Maybe FlowSocialListWith
                -> (JobLog -> m ())
                -> m CorpusId
-flowCorpusFile u n l la ff fp mfslw logStatus = do
+flowCorpusFile u n _l la ff fp mfslw logStatus = do
   eParsed <- liftBase $ parseFile ff fp
   case eParsed of
     Right parsed -> do
-      let docs = splitEvery 500 $ take l parsed
-      flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
+      flowCorpus u n la mfslw (Just $ fromIntegral $ length parsed, yieldMany parsed .| mapC toHyperdataDocument) logStatus
+      --let docs = splitEvery 500 $ take l parsed
+      --flowCorpus u n la mfslw (yieldMany $ map (map toHyperdataDocument) docs) logStatus
     Left e -> panic $ "Error: " <> (T.pack e)
 ------------------------------------------------------------------------
@@ -201,13 +210,14 @@ flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
           -> TermType Lang
          -> Maybe FlowSocialListWith
-          -> [[a]]
+          -> (Maybe Integer, ConduitT () a m ())
          -> (JobLog -> m ())
          -> m CorpusId
 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-flow :: ( FlowCmdM env err m
+flow :: forall env err m a c.
+        ( FlowCmdM env err m
         , FlowCorpus a
        , MkCorpus c
        )
@@ -216,22 +226,39 @@ flow :: ( FlowCmdM env err m
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
-    -> [[a]]
+    -> (Maybe Integer, ConduitT () a m ())
     -> (JobLog -> m ())
     -> m CorpusId
-flow c u cn la mfslw docs logStatus = do
+flow c u cn la mfslw (mLength, docsC) logStatus = do
   -- TODO if public insertMasterDocs else insertUserDocs
-  ids <- traverse (\(idx, doc) -> do
-                      id <- insertMasterDocs c la doc
-                      logStatus JobLog { _scst_succeeded = Just $ 1 + idx
-                                       , _scst_failed = Just 0
-                                       , _scst_remaining = Just $ length docs - idx
-                                       , _scst_events = Just []
-                                       }
-                      pure id
-                  ) (zip [1..] docs)
-  flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
+  ids <- runConduit $
+      zipSources (yieldMany [1..]) docsC
+      .| mapMC insertDoc
+      .| sinkList
+  -- ids <- traverse (\(idx, doc) -> do
+  --                     id <- insertMasterDocs c la doc
+  --                     logStatus JobLog { _scst_succeeded = Just $ 1 + idx
+  --                                      , _scst_failed = Just 0
+  --                                      , _scst_remaining = Just $ length docs - idx
+  --                                      , _scst_events = Just []
+  --                                      }
+  --                     pure id
+  --                 ) (zip [1..] docs)
+  flowCorpusUser (la ^. tt_lang) u cn c ids mfslw
+  where
+    insertDoc :: (Integer, a) -> m NodeId
+    insertDoc (idx, doc) = do
+      id <- insertMasterDocs c la [doc]
+      case mLength of
+        Nothing -> pure ()
+        Just len -> do
+          logStatus JobLog { _scst_succeeded = Just $ fromIntegral $ 1 + idx
+                           , _scst_failed = Just 0
+                           , _scst_remaining = Just $ fromIntegral $ len - idx
+                           , _scst_events = Just []
+                           }
+      pure $ Prelude.head id
@@ -250,7 +277,7 @@ flowCorpusUser l user corpusName ctype ids mfslw = do
   -- User Flow
   (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
   -- NodeTexts is first
-  _tId <- insertDefaultNode NodeTexts userCorpusId userId
+  _tId <- insertDefaultNodeIfNotExists NodeTexts userCorpusId userId
   -- printDebug "NodeTexts: " tId
   -- NodeList is second
@@ -276,8 +303,8 @@ flowCorpusUser l user corpusName ctype ids mfslw = do
   -- _ <- insertOccsUpdates userCorpusId mastListId
   -- printDebug "userListId" userListId
   -- User Graph Flow
-  _ <- insertDefaultNode NodeDashboard userCorpusId userId
-  _ <- insertDefaultNode NodeGraph userCorpusId userId
+  _ <- insertDefaultNodeIfNotExists NodeDashboard userCorpusId userId
+  _ <- insertDefaultNodeIfNotExists NodeGraph userCorpusId userId
   --_ <- mkPhylo  userCorpusId userId
   -- Annuaire Flow
   -- _ <- mkAnnuaire  rootUserId userId
@@ -322,7 +349,7 @@ saveDocNgramsWith :: ( FlowCmdM env err m)
                   -> m ()
 saveDocNgramsWith lId mapNgramsDocs' = do
   terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
-  printDebug "terms2id" terms2id
+  --printDebug "terms2id" terms2id
   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
@@ -331,7 +358,7 @@ saveDocNgramsWith lId mapNgramsDocs' = do
                        $ map (first _ngramsTerms . second Map.keys)
                        $ HashMap.toList mapNgramsDocs
-  printDebug "saveDocNgramsWith" mapCgramsId
+  --printDebug "saveDocNgramsWith" mapCgramsId
   -- insertDocNgrams
   _return <- insertContextNodeNgrams2
            $ catMaybes [ ContextNodeNgrams2 <$> Just nId
@@ -504,5 +531,3 @@ extractInsert docs = do
     documentsWithId
   _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
   pure ()
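The heart of the refactor is the new `flow` body above: it zips an index source onto the incoming document stream, inserts documents one by one through `mapMC`, and reports progress against the optional declared length. A reduced, runnable sketch of that wiring, with insertion and job logging simulated by prints (names are illustrative):

```haskell
import Conduit
import Data.Conduit.Internal (zipSources)

-- Simulated insertion plus progress logging, standing in for insertMasterDocs
-- and logStatus in the real flow.
insertDoc :: Maybe Integer -> (Integer, String) -> IO Integer
insertDoc mLen (idx, doc) = do
  putStrLn ("inserted " <> doc)
  case mLen of
    Nothing  -> pure ()
    Just len -> putStrLn ("progress: " <> show idx <> "/" <> show len)
  pure idx  -- stand-in for the NodeId returned per document

main :: IO ()
main = do
  let docs = ["a", "b", "c"]
      mLen = Just (fromIntegral (length docs))
  ids <- runConduit $
    zipSources (yieldMany [1 ..]) (yieldMany docs)
      .| mapMC (insertDoc mLen)
      .| sinkList
  print ids
```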
@@ -24,7 +24,7 @@ import Data.ByteString.Char8 (hPutStrLn)
 import Data.Either.Extra (Either)
 import Data.Pool (Pool, withResource)
 import Data.Profunctor.Product.Default (Default)
-import Data.Text (unpack, Text)
+import Data.Text (pack, unpack, Text)
 import Data.Word (Word16)
 import Database.PostgreSQL.Simple (Connection, connect)
 import Database.PostgreSQL.Simple.FromField ( Conversion, ResultError(ConversionFailed), fromField, returnError)
@@ -36,7 +36,7 @@ import Opaleye (Unpackspec, showSql, FromFields, Select, runSelect, SqlJsonb, De...
 import Opaleye.Aggregate (countRows)
 import System.IO (FilePath)
 import System.IO (stderr)
-import Text.Read (read)
+import Text.Read (readMaybe)
 import qualified Data.ByteString as DB
 import qualified Data.List as DL
 import qualified Database.PostgreSQL.Simple as PGS
@@ -176,9 +176,13 @@ databaseParameters :: FilePath -> IO PGS.ConnectInfo
 databaseParameters fp = do
   ini <- readIniFile' fp
   let val' key = unpack $ val ini "database" key
+  let dbPortRaw = val' "DB_PORT"
+  let dbPort = case (readMaybe dbPortRaw :: Maybe Word16) of
+        Nothing -> panic $ "DB_PORT incorrect: " <> (pack dbPortRaw)
+        Just d -> d
   pure $ PGS.ConnectInfo { PGS.connectHost = val' "DB_HOST"
-                         , PGS.connectPort = read (val' "DB_PORT") :: Word16
+                         , PGS.connectPort = dbPort
                          , PGS.connectUser = val' "DB_USER"
                          , PGS.connectPassword = val' "DB_PASS"
                          , PGS.connectDatabase = val' "DB_NAME"
...
@@ -255,6 +255,14 @@ insertDefaultNode :: HasDBid NodeType
   => NodeType -> ParentId -> UserId -> Cmd err [NodeId]
 insertDefaultNode nt p u = insertNode nt Nothing Nothing p u
+insertDefaultNodeIfNotExists :: HasDBid NodeType
+  => NodeType -> ParentId -> UserId -> Cmd err [NodeId]
+insertDefaultNodeIfNotExists nt p u = do
+  children <- getChildrenByType p nt
+  case children of
+    [] -> insertDefaultNode nt p u
+    xs -> pure xs
 insertNode :: HasDBid NodeType
   => NodeType -> Maybe Name -> Maybe DefaultHyperdata -> ParentId -> UserId -> Cmd err [NodeId]
 insertNode nt n h p u = insertNodesR [nodeW nt n h p u]
...
@@ -69,11 +69,11 @@ extra-deps:
 # External Data API connectors
 - git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed.git
-  commit: 9cdba6423decad5acfacb0f274212fd8723ce734
+  commit: bfbdc993675ec647af400f2491a17468f2c4e045
 - git: https://gitlab.iscpif.fr/gargantext/crawlers/istex.git
   commit: daeae80365250c4bd539f0a65e271f9aa37f731f
 - git: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
-  commit: 020f5f9b308f5c23c925aedf5fb11f8b4728fb19
+  commit: 2fc233399af29ead31d4020c445810721435d2e6
 - git: https://gitlab.iscpif.fr/gargantext/crawlers/isidore.git
   commit: 3db385e767d2100d8abe900833c6e7de3ac55e1b
...