Verified Commit 1af015a5 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 551-dev-graphql-contexts-ngrams

parents 1c01593e 6e57d503
Pipeline #4197 passed with stages
in 123 minutes and 53 seconds
## Version 0.0.6.9.9.6.3
* [BACK][FIX][Boolean Query Parser (#182)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/182)
* [BACK][FEAT] Gitlab Issue Parser, Welcome Christian
## Version 0.0.6.9.9.6.2
* [BACK][FIX][Ngrams Status change (#217)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/217)
......
......@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9.6.2
version: 0.0.6.9.9.6.3
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -53,6 +53,9 @@ library
Gargantext.Core.Text
Gargantext.Core.Text.Context
Gargantext.Core.Text.Corpus.API
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Query
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
......@@ -185,15 +188,14 @@ library
Gargantext.Core.NodeStoryFile
Gargantext.Core.Statistics
Gargantext.Core.Text.Convert
Gargantext.Core.Text.Corpus.API.Arxiv
Gargantext.Core.Text.Corpus.API.Hal
Gargantext.Core.Text.Corpus.API.Isidore
Gargantext.Core.Text.Corpus.API.Istex
Gargantext.Core.Text.Corpus.API.Pubmed
Gargantext.Core.Text.Corpus.Parsers.Book
Gargantext.Core.Text.Corpus.Parsers.Date
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Gargantext.Core.Text.Corpus.Parsers.Gitlab
Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
......@@ -382,6 +384,7 @@ library
, blaze-html
, blaze-markup
, blaze-svg
, boolexpr
, bytestring
, case-insensitive
, cassava
......@@ -846,6 +849,7 @@ test-suite garg-test
main-is: Main.hs
other-modules:
Core.Text
Core.Text.Corpus.Query
Core.Text.Examples
Core.Text.Flow
Core.Utils
......@@ -863,6 +867,7 @@ test-suite garg-test
Parsers.Types
Parsers.WOS
Utils.Crypto
Utils.Jobs
Paths_gargantext
hs-source-dirs:
src-test
......@@ -893,61 +898,33 @@ test-suite garg-test
build-depends:
QuickCheck
, aeson
, async
, base
, boolexpr
, bytestring
, conduit
, containers
, crawlerArxiv
, duckling
, extra
, gargantext
, gargantext-prelude
, hspec
, http-client
, http-client-tls
, mtl
, parsec
, patches-class
, patches-map
, quickcheck-instances
, servant-job
, stm
, tasty
, tasty-hspec
, tasty-hunit
, tasty-quickcheck
, text
, time
, unordered-containers
, validity
default-language: Haskell2010
test-suite jobqueue-test
type: exitcode-stdio-1.0
main-is: Main.hs
other-modules:
Paths_gargantext
hs-source-dirs:
tests/queue
default-extensions:
DataKinds
DeriveGeneric
FlexibleContexts
FlexibleInstances
GADTs
GeneralizedNewtypeDeriving
MultiParamTypeClasses
NamedFieldPuns
NoImplicitPrelude
OverloadedStrings
RankNTypes
RecordWildCards
StrictData
ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N
build-depends:
aeson
, async
, base
, containers
, extra
, gargantext
, hspec
, http-client
, http-client-tls
, mtl
, servant-job
, stm
, text
, time
default-language: Haskell2010
......@@ -6,7 +6,7 @@ name: gargantext
# | | | +----- Layers * : New versions with API additions
# | | | | +--- Layers * : New versions without API breaking changes
# | | | | |
version: '0.0.6.9.9.6.2'
version: '0.0.6.9.9.6.3'
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -81,6 +81,9 @@ library:
- Gargantext.Core.Text
- Gargantext.Core.Text.Context
- Gargantext.Core.Text.Corpus.API
- Gargantext.Core.Text.Corpus.API.Arxiv
- Gargantext.Core.Text.Corpus.API.Pubmed
- Gargantext.Core.Text.Corpus.Query
- Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
......@@ -166,6 +169,7 @@ library:
- blaze-html
- blaze-markup
- blaze-svg
- boolexpr
- bytestring
- case-insensitive
- cassava
......@@ -492,7 +496,6 @@ executables:
- aeson
- serialise
tests:
garg-test:
main: Main.hs
......@@ -512,45 +515,37 @@ tests:
- -rtsopts
- -with-rtsopts=-N
dependencies:
- QuickCheck
- aeson
- async
- base
- boolexpr
- bytestring
- conduit
- containers
- crawlerArxiv
- duckling
- gargantext
- gargantext-prelude
- hspec
- QuickCheck
- quickcheck-instances
- time
- http-client
- http-client-tls
- mtl
- parsec
- patches-class
- patches-map
- duckling
- quickcheck-instances
- servant-job
- stm
- tasty
- tasty-hspec
- tasty-hunit
- tasty-quickcheck
- text
- time
- unordered-containers
- validity
jobqueue-test:
main: Main.hs
source-dirs: tests/queue
ghc-options:
- -threaded
- -rtsopts
- -with-rtsopts=-N
dependencies:
- aeson
- async
- base
- containers
- gargantext
- mtl
- hspec
- http-client
- http-client-tls
- servant-job
- stm
- time
# garg-doctest:
# main: Main.hs
# source-dirs: src-doctest
......
This diff is collapsed.
......@@ -18,8 +18,8 @@ import Gargantext.Prelude
import Gargantext.Core.Utils
-- | Core.Utils tests
test :: IO ()
test = hspec $ do
test :: Spec
test = do
describe "check if groupWithCounts works" $ do
it "simple integer array" $ do
(groupWithCounts [1, 2, 3, 1, 2, 3]) `shouldBe` [(1, 2), (2, 2), (3, 2)]
......
......@@ -30,8 +30,8 @@ myCooc = HashMap.fromList [((NgramsTerm {unNgramsTerm = "gev au"},NgramsTerm {un
test :: IO ()
test = hspec $ do
test :: Spec
test = do
describe "Cross" $ do
let
(distanceMap,_,_) = doSimilarityMap Conditional 0 Weak myCooc
......
......@@ -11,6 +11,7 @@ Portability : POSIX
import Gargantext.Prelude
import qualified Core.Text.Corpus.Query as CorpusQuery
import qualified Core.Utils as Utils
--import qualified Ngrams.Lang.Fr as Fr
--import qualified Ngrams.Lang as Lang
......@@ -20,17 +21,32 @@ import qualified Parsers.Date as PD
-- import qualified Graph.Distance as GD
import qualified Graph.Clustering as Graph
import qualified Utils.Crypto as Crypto
import qualified Utils.Jobs as Jobs
import Test.Tasty
import Test.Tasty.Hspec
main :: IO ()
main = do
Utils.test
utilSpec <- testSpec "Utils" Utils.test
clusteringSpec <- testSpec "Graph Clustering" Graph.test
dateParserSpec <- testSpec "Date Parsing" PD.testFromRFC3339
cryptoSpec <- testSpec "Crypto" Crypto.test
nlpSpec <- testSpec "NLP" NLP.test
jobsSpec <- testSpec "Jobs" Jobs.test
defaultMain $ testGroup "Gargantext"
[ utilSpec
, clusteringSpec
, dateParserSpec
, cryptoSpec
, nlpSpec
, jobsSpec
, NgramsQuery.tests
, CorpusQuery.tests
]
-- Occ.parsersTest
-- Lang.ngramsExtractionTest FR
-- Lang.ngramsExtractionTest EN
-- Metrics.main
Graph.test
PD.testFromRFC3339
-- GD.test
Crypto.test
NLP.main
NgramsQuery.main
......@@ -19,8 +19,8 @@ import Gargantext.Prelude
import Gargantext.Core.Text.Terms.Multi
main :: IO ()
main = hspec $ do
test :: Spec
test = do
describe "Text that should be cleaned before sending it to NLP tools as micro-services." $ do
let text = "This is a url http://cnrs.gargantext.org to be remove and another one www.gargantext.org and digits 343242-2332 to be remove and some to keep: 232 231 33." :: Text
let result = "This is a url to be remove and another one and digits to be remove and some to keep: 232 231 33."
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}
module Ngrams.Query where
module Ngrams.Query (tests) where
import Control.Monad
import Data.Coerce
......@@ -21,9 +21,6 @@ import Test.Tasty
import Test.Tasty.HUnit
main :: IO ()
main = defaultMain tests
tests :: TestTree
tests = testGroup "Ngrams" [unitTests]
......
......@@ -36,8 +36,8 @@ fromRFC3339Inv :: Either ParseError ZonedTime -> Text
fromRFC3339Inv (Right z) = toRFC3339 z
fromRFC3339Inv (Left pe) = panic . pack $ show pe
testFromRFC3339 :: IO ()
testFromRFC3339 = hspec $ do
testFromRFC3339 :: Spec
testFromRFC3339 = do
describe "Test fromRFC3339: " $ do
it "is the inverse of Duckling's toRFC3339" $ property $
((==) <*> (fromRFC3339 . fromRFC3339Inv)) . Right . looseZonedTimePrecision
......
......@@ -16,11 +16,10 @@ import Test.Hspec
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash
import Gargantext.Prelude.Utils
-- | Crypto Hash tests
test :: IO ()
test = hspec $ do
test :: Spec
test = do
describe "Hash String with frontend works" $ do
let text = "To hash with backend" :: Text
let hashed = "8a69a94d164279af2b7d1443ce08da6184b3d7e815406076e148159c284b53c3" :: Hash
......
......@@ -3,7 +3,7 @@
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE NumericUnderscores #-}
module Main where
module Utils.Jobs (test) where
import Control.Concurrent
import qualified Control.Concurrent.Async as Async
......@@ -17,7 +17,6 @@ import Data.Either
import Data.List
import Data.Sequence (Seq, (|>), fromList)
import Data.Time
import GHC.Stack
import Prelude
import System.IO.Unsafe
import Network.HTTP.Client.TLS (newTlsManager)
......@@ -54,16 +53,6 @@ addJobToSchedule jobt mvar = do
data Counts = Counts { countAs :: Int, countBs :: Int }
deriving (Eq, Show)
inc, dec :: JobT -> Counts -> Counts
inc A cs = cs { countAs = countAs cs + 1 }
inc B cs = cs { countBs = countBs cs + 1 }
inc C cs = cs
inc D cs = cs
dec A cs = cs { countAs = countAs cs - 1 }
dec B cs = cs { countBs = countBs cs - 1 }
dec C cs = cs
dec D cs = cs
jobDuration, initialDelay :: Int
jobDuration = 100000
initialDelay = 20000
......@@ -194,9 +183,6 @@ testTlsManager :: Manager
testTlsManager = unsafePerformIO newTlsManager
{-# NOINLINE testTlsManager #-}
shouldBeE :: (MonadIO m, HasCallStack, Show a, Eq a) => a -> a -> m ()
shouldBeE a b = liftIO (shouldBe a b)
withJob :: Env
-> (JobHandle MyDummyMonad -> () -> MyDummyMonad ())
-> IO (SJ.JobStatus 'SJ.Safe JobLog)
......@@ -362,8 +348,8 @@ testMarkProgress = do
]
}
main :: IO ()
main = hspec $ do
test :: Spec
test = do
describe "job queue" $ do
it "respects max runners limit" $
testMaxRunners
......
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Admin.Orchestrator.Types
where
import Control.Lens hiding (elements)
import Control.Monad.Reader (MonadReader)
import Data.Aeson
import Data.Morpheus.Types
( GQLType
......@@ -24,9 +23,7 @@ import Test.QuickCheck.Arbitrary
import qualified Gargantext.API.GraphQL.Utils as GQLU
import Gargantext.Core.Types (TODO(..))
import Gargantext.Database.Prelude (HasConfig(..))
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_pubmed_api_key)
------------------------------------------------------------------------
instance Arbitrary a => Arbitrary (JobStatus 'Safe a) where
......@@ -37,39 +34,24 @@ instance Arbitrary a => Arbitrary (JobOutput a) where
-- | Main Types
-- TODO IsidoreAuth
data ExternalAPIs = All
| PubMed { mAPIKey :: Maybe Text }
data ExternalAPIs = PubMed
| Arxiv
| HAL
| IsTex
| Isidore
deriving (Show, Eq, Generic)
deriving (Show, Eq, Generic, Enum, Bounded)
-- | Main Instances
instance FromJSON ExternalAPIs
instance ToJSON ExternalAPIs
externalAPIs :: ( MonadReader env m
, HasConfig env) => m [ExternalAPIs]
externalAPIs = do
pubmed_api_key <- view $ hasConfig . gc_pubmed_api_key
pure [ All
, PubMed { mAPIKey = Just pubmed_api_key }
, Arxiv
, HAL
, IsTex
, Isidore ]
externalAPIs :: [ExternalAPIs]
externalAPIs = [minBound .. maxBound]
instance Arbitrary ExternalAPIs
where
arbitrary = elements [ All
, PubMed { mAPIKey = Nothing }
, Arxiv
, HAL
, IsTex
, Isidore ]
arbitrary = arbitraryBoundedEnum
instance ToSchema ExternalAPIs where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
......
......@@ -54,17 +54,18 @@ import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..))
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith, updateCorpusPubmedAPIKey)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import Gargantext.Prelude.Config (gc_max_docs_parsers, gc_pubmed_api_key)
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
......@@ -130,16 +131,13 @@ deriveJSON (unPrefix "") 'ApiInfo
instance ToSchema ApiInfo
info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = do
ext <- API.externalAPIs
pure $ ApiInfo ext
info :: ApiInfo
info = ApiInfo API.externalAPIs
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
{ _wq_query :: !Text
{ _wq_query :: !API.RawQuery
, _wq_databases :: !Database
, _wq_datafield :: !(Maybe Datafield)
, _wq_lang :: !Lang
......@@ -185,7 +183,7 @@ addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> WithQuery
-> Maybe Integer
-> Maybe API.Limit
-> JobHandle m
-> m ()
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
......@@ -210,7 +208,8 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
_ -> do
case datafield of
Just (External (PubMed { _api_key })) -> do
Just (External PubMed) -> do
_api_key <- view $ hasConfig . gc_pubmed_api_key
printDebug "[addToCorpusWithQuery] pubmed api key" _api_key
_ <- updateCorpusPubmedAPIKey cid _api_key
pure ()
......@@ -222,7 +221,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
-- printDebug "[G.A.N.C.New] getDataText with query" q
db <- database2origin dbs
let db = database2origin dbs
eTxt <- getDataText db (Multi l) q maybeLimit
-- printDebug "[G.A.N.C.New] lTxts" lTxts
......
......@@ -150,7 +150,7 @@ insertSearxResponse user cId listId l (Right (SearxResponse { _srs_results })) =
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m, MonadJobStatus m)
=> User
-> CorpusId
-> API.Query
-> API.RawQuery
-> Lang
-> JobHandle m
-> m ()
......@@ -183,7 +183,7 @@ triggerSearxSearch user cId q l jobHandle = do
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_language = l
, _fsp_manager = manager
, _fsp_pageno = page
, _fsp_query = q
, _fsp_query = API.getRawQuery q
, _fsp_url = surl }
insertSearxResponse user cId listId l res
......
......@@ -3,44 +3,37 @@
module Gargantext.API.Node.Corpus.Types where
import Control.Lens hiding (elements, Empty)
import Control.Monad.Reader (MonadReader)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Monoid (mempty)
import Data.Swagger
import GHC.Generics (Generic)
import qualified PUBMED.Types as PUBMED
import Gargantext.Prelude
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import qualified Gargantext.API.Admin.Orchestrator.Types as Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (DataOrigin(..))
import Gargantext.Database.Prelude (HasConfig(..))
data Database = Empty
| PubMed { _api_key :: Maybe PUBMED.APIKey }
| PubMed
| Arxiv
| HAL
| IsTex
| Isidore
deriving (Eq, Show, Generic)
deriving (Eq, Show, Generic, Enum, Bounded)
deriveJSON (unPrefix "") ''Database
instance ToSchema Database where
declareNamedSchema = genericDeclareNamedSchemaUnrestricted defaultSchemaOptions
database2origin :: ( MonadReader env m
, HasConfig env ) => Database -> m DataOrigin
database2origin Empty = pure $ InternalOrigin T.IsTex
database2origin (PubMed { _api_key }) = do
-- pubmed_api_key <- view $ hasConfig . gc_pubmed_api_key
pure $ ExternalOrigin $ T.PubMed { mAPIKey = _api_key }
database2origin Arxiv = pure $ ExternalOrigin T.Arxiv
database2origin HAL = pure $ ExternalOrigin T.HAL
database2origin IsTex = pure $ ExternalOrigin T.IsTex
database2origin Isidore = pure $ ExternalOrigin T.Isidore
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin Types.IsTex
database2origin PubMed = ExternalOrigin Types.PubMed
database2origin Arxiv = ExternalOrigin Types.Arxiv
database2origin HAL = ExternalOrigin Types.HAL
database2origin IsTex = ExternalOrigin Types.IsTex
database2origin Isidore = ExternalOrigin Types.Isidore
------------------------------------------------------------------------
data Datafield = Gargantext
......
......@@ -290,7 +290,7 @@ addCorpusWithQuery :: User -> ServerT New.AddWithQuery (GargM Env GargError)
addCorpusWithQuery user cid =
serveJobsAPI AddCorpusQueryJob $ \jHandle q -> do
limit <- view $ hasConfig . gc_max_docs_scrapers
New.addToCorpusWithQuery user cid q (Just limit) jHandle
New.addToCorpusWithQuery user cid q (Just $ fromIntegral limit) jHandle
{- let log' x = do
printDebug "addToCorpusWithQuery" x
liftBase $ log x
......
......@@ -11,46 +11,58 @@ Portability : POSIX
module Gargantext.Core.Text.Corpus.API
( ExternalAPIs(..)
, Query
, Limit
, Corpus.RawQuery(..)
, Corpus.Limit(..)
, GetCorpusError(..)
, get
, externalAPIs
) where
import Conduit
import Control.Lens ((^.))
import Data.Bifunctor
import Data.Either (Either(..))
import Data.Maybe
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (ExternalAPIs(..), externalAPIs)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig, gc_pubmed_api_key)
import qualified Gargantext.Core.Text.Corpus.API.Arxiv as Arxiv
import qualified Gargantext.Core.Text.Corpus.API.Hal as HAL
import qualified Gargantext.Core.Text.Corpus.API.Isidore as ISIDORE
import qualified Gargantext.Core.Text.Corpus.API.Istex as ISTEX
import qualified Gargantext.Core.Text.Corpus.API.Pubmed as PUBMED
import qualified Gargantext.Core.Text.Corpus.Query as Corpus
import Servant.Client (ClientError)
data GetCorpusError
= -- | We couldn't parse the user input query into something meaningful.
InvalidInputQuery !Corpus.RawQuery !T.Text
-- | The external service returned an error.
| ExternalAPIError !ClientError
deriving (Show, Eq)
-- | Get External API metadata main function
get :: ExternalAPIs
get :: GargConfig
-> ExternalAPIs
-> Lang
-> Query
-> Maybe Limit
-> Corpus.RawQuery
-> Maybe Corpus.Limit
-- -> IO [HyperdataDocument]
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get PubMed { mAPIKey = mAPIKey } _la q limit = PUBMED.get mAPIKey q limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
get Arxiv la q limit = Arxiv.get la q (fromIntegral <$> limit)
get HAL la q limit = HAL.getC la q limit
get IsTex la q limit = do
docs <- ISTEX.get la q limit
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
get Isidore la q limit = do
docs <- ISIDORE.get la (fromIntegral <$> limit) (Just q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
get externalApi _ _ _ = panic $ "[G.C.T.Corpus.API] This options are note taken into account: " <> (cs $ show externalApi)
-- | Some Sugar for the documentation
type Query = PUBMED.Query
type Limit = PUBMED.Limit
-> IO (Either GetCorpusError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get cfg externalAPI la q limit = do
case Corpus.parseQuery q of
Left err -> pure $ Left $ InvalidInputQuery q (T.pack err)
Right corpusQuery -> case externalAPI of
PubMed -> first ExternalAPIError <$>
PUBMED.get (cfg ^. gc_pubmed_api_key) corpusQuery limit
--docs <- PUBMED.get q default_limit -- EN only by default
--pure (Just $ fromIntegral $ length docs, yieldMany docs)
Arxiv -> Right <$> Arxiv.get la corpusQuery limit
HAL -> first ExternalAPIError <$> HAL.getC la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
IsTex -> do docs <- ISTEX.get la (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
Isidore -> do docs <- ISIDORE.get la (Corpus.getLimit <$> limit) (Just $ Corpus.getRawQuery q) Nothing
pure $ Right (Just $ fromIntegral $ length docs, yieldMany docs)
......@@ -10,34 +10,81 @@ Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans -fno-warn-unused-top-binds #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Corpus.API.Arxiv
where
( get
-- * Internals for testing
, convertQuery
) where
import Conduit
import Data.Either (Either(..))
import Data.Maybe
import Data.Text (Text)
import Data.Text (Text, unpack)
import qualified Data.Text as Text
import Servant.Client (ClientError)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import qualified Arxiv as Arxiv
import qualified Network.Api.Arxiv as Ax
type Query = Text
type Limit = Arxiv.Limit
-- | Converts a Gargantext's generic boolean query into an Arxiv Query.
convertQuery :: Corpus.Query -> Ax.Query
convertQuery q = mkQuery (interpretQuery q transformAST)
where
mkQuery :: Maybe Ax.Expression -> Ax.Query
mkQuery mb_exp = Ax.Query { Ax.qExp = mb_exp
, Ax.qIds = []
, Ax.qStart = 0
, Ax.qItems = Arxiv.batchSize }
-- Converts a 'BoolExpr' with 'Term's on the leaves into an Arxiv's expression.
-- It yields 'Nothing' if the AST cannot be converted into a meaningful expression.
transformAST :: BoolExpr Term -> Maybe Ax.Expression
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> Ax.AndNot <$> (transformAST sub) <*> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> Ax.AndNot <$> transformAST term1 <*> transformAST term2
BAnd sub1 sub2
-> Ax.And <$> transformAST sub1 <*> transformAST sub2
BOr sub1 sub2
-> Ax.Or <$> transformAST sub1 <*> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
-- We can handle negatives via `ANDNOT` with itself.
BNot sub
-> Ax.AndNot <$> transformAST sub <*> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> Nothing
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> Nothing
BConst (Positive (Term term))
-> Just $ Ax.Exp $ Ax.Abs [unpack term]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term))
-> Just $ Ax.AndNot (Ax.Exp $ Ax.Abs [unpack term]) (Ax.Exp $ Ax.Abs [unpack term])
-- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs
get :: Lang -> Query -> Maybe Limit -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get la q _l = do
(cnt, resC) <- Arxiv.apiSimpleC Nothing [Text.unpack q]
pure $ Right (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
get :: Lang
-> Corpus.Query
-> Maybe Corpus.Limit
-> IO (Maybe Integer, ConduitT () HyperdataDocument IO ())
get la (convertQuery -> query) (fmap getLimit -> limit) = do
(cnt, resC) <- case limit of
Nothing -> Arxiv.searchAxv' query
(Just l) -> do (cnt, res) <- Arxiv.searchAxv' query
pure (cnt, res .| takeC l)
pure $ (Just $ fromIntegral cnt, resC .| mapC (toDoc la))
toDoc :: Lang -> Arxiv.Result -> HyperdataDocument
toDoc l (Arxiv.Result { abstract
......
......@@ -27,14 +27,14 @@ import qualified HAL as HAL
import qualified HAL.Client as HAL
import qualified HAL.Doc.Corpus as HAL
get :: Lang -> Text -> Maybe Integer -> IO [HyperdataDocument]
get :: Lang -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
eDocs <- HAL.getMetadataWith q (Just 0) ml
eDocs <- HAL.getMetadataWith q (Just 0) (fromIntegral <$> ml)
either (panic . pack . show) (\d -> mapM (toDoc' la) $ HAL._docs d) eDocs
getC :: Lang -> Text -> Maybe Integer -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC :: Lang -> Text -> Maybe Int -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
eRes <- HAL.getMetadataWithC q (Just 0) ml
eRes <- HAL.getMetadataWithC q (Just 0) (fromIntegral <$> ml)
pure $ (\(len, docsC) -> (len, docsC .| mapMC (toDoc' la))) <$> eRes
-- case eRes of
-- Left err -> panic $ pack $ show err
......
......@@ -29,7 +29,7 @@ import qualified ISTEX as ISTEX
import qualified ISTEX.Client as ISTEX
type Query = Text
type MaxResults = Maybe Integer
type MaxResults = Maybe Int
get :: Lang -> Query -> MaxResults -> IO [HyperdataDocument]
get la query' maxResults = do
......@@ -57,7 +57,7 @@ get la query' maxResults = do
-- Complex queries of IsTex needs parameters using ":" so we leave the query as it is
-- in that case we suppose user is knowing what s.he is doing
eDocs <- ISTEX.getMetadataWith query (fromIntegral <$> maxResults)
eDocs <- ISTEX.getMetadataWith query maxResults
-- printDebug "[Istex.get] will print length" (0 :: Int)
case eDocs of
Left _ -> pure ()
......
......@@ -9,20 +9,33 @@ Portability : POSIX
-}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Text.Corpus.API.Pubmed
( get
-- * Internals for testing
, ESearch(..)
, convertQuery
, getESearch
)
where
import Conduit
import Control.Monad.Reader (runReaderT)
import Data.Either (Either)
import Data.Maybe
import Data.Semigroup
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as TE
import Network.HTTP.Types.URI (EscapeItem(..), renderQueryPartialEscape)
import Servant.Client (ClientError)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Query as Corpus
import Gargantext.Core.Types (Term(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import qualified PUBMED as PubMed
......@@ -30,24 +43,72 @@ import qualified PUBMED.Parser as PubMedDoc
import PUBMED.Types (Config(..))
type Query = Text
type Limit = Integer
-- | A pubmed query.
-- See: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
newtype ESearch = ESearch { _ESearch :: [EscapeItem] }
deriving stock (Show, Eq)
deriving newtype (Semigroup, Monoid)
-- | Returns an /url encoded/ query ready to be sent to pubmed.
getESearch :: ESearch -> Text
getESearch (ESearch items) =
Text.replace "term=" "" . TE.decodeUtf8 . renderQueryPartialEscape False $ [
("term", items)
]
convertQuery :: Corpus.Query -> ESearch
convertQuery q = ESearch (interpretQuery q transformAST)
where
transformAST :: BoolExpr Term -> [EscapeItem]
transformAST ast = case ast of
BAnd sub (BConst (Negative term))
-- The second term become positive, so that it can be translated.
-> (transformAST sub) <> [QN "+AND+NOT+"] <> transformAST (BConst (Positive term))
BAnd term1 (BNot term2)
-> transformAST term1 <> [QN "+AND+NOT+"] <> transformAST term2
BAnd sub1 sub2
-> transformAST sub1 <> [QN "+AND+"] <> transformAST sub2
BOr sub1 sub2
-> transformAST sub1 <> [QN "+OR+"] <> transformAST sub2
BNot (BConst (Negative term))
-> transformAST (BConst (Positive term)) -- double negation
BNot sub
-> [QN "NOT+"] <> transformAST sub
-- BTrue cannot happen is the query parser doesn't support parsing 'TRUE' alone.
BTrue
-> mempty
-- BTrue cannot happen is the query parser doesn't support parsing 'FALSE' alone.
BFalse
-> mempty
BConst (Positive (Term term))
-> [QE (TE.encodeUtf8 term)]
-- We can handle negatives via `ANDNOT` with itself.
BConst (Negative (Term term))
-> [QN "NOT+", QE (TE.encodeUtf8 term)]
-- | TODO put default pubmed query in gargantext.ini
-- by default: 10K docs
get :: Maybe Text
-> Query
get :: Text
-> Corpus.Query
-> Maybe Limit
-> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
get mAPIKey q l = do
eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = mAPIKey
, query = q
get apiKey q l = do
-- The documentation for PUBMED says:
-- Values for query keys may also be provided in term if they are preceeded by a
-- '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch,
-- any number of query keys can be combined in term. Also, if query keys are provided in term,
-- they can be combined with OR or NOT in addition to AND.
-- Example:
-- esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
--
-- Therefore, we can pretty-print our 'Query' back into something that PubMed could understand.
eRes <- runReaderT PubMed.getMetadataWithC (Config { apiKey = Just apiKey
, query = getESearch $ convertQuery q
, perPage = Just 200
, mWebEnv = Nothing })
let takeLimit = case l of
Nothing -> mapC identity
Just l' -> takeC $ fromIntegral l'
Just l' -> takeC $ getLimit l'
pure $ (\(len, docsC) -> (len, docsC .| takeLimit .| mapC (toDoc EN))) <$> eRes
--either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
-- <$> PubMed.getMetadataWithC q l
......
module Gargantext.Core.Text.Corpus.Parsers.Gitlab (
Issue(..), gitlabIssue2hyperdataDocument, readFile_Issues, readFile_IssuesAsDocs
) where
import Data.Aeson
import Data.Time
import qualified Data.Text as DT
import qualified Data.ByteString.Lazy as DBL
import System.FilePath (FilePath)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
data Issue = Issue { _issue_id :: !Int
, _issue_title :: !DT.Text
, _issue_content :: !DT.Text
, _issue_created :: !LocalTime
, _issue_closed :: !(Maybe UTCTime)
}
deriving (Show)
instance FromJSON Issue where
parseJSON = withObject "Issue" $ \v -> Issue
<$> v .: "c0" -- id
<*> v .: "c1" -- title
<*> v .: "c2" -- content
<*> v .: "c3" -- creation time
<*> v .:? "c4" -- close time
gitlabIssue2hyperdataDocument :: Issue -> HyperdataDocument
gitlabIssue2hyperdataDocument issue = HyperdataDocument
{ _hd_bdd = Nothing
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just (_issue_title issue)
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Nothing
, _hd_abstract = Just (_issue_content issue)
, _hd_publication_date = Just $ DT.pack $ show date
, _hd_publication_year = Just $ fromIntegral year
, _hd_publication_month = Just month
, _hd_publication_day = Just day
, _hd_publication_hour = Just (todHour tod)
, _hd_publication_minute = Just (todMin tod)
, _hd_publication_second = Just (round $ todSec tod)
, _hd_language_iso2 = Just $ (DT.pack . show) lang
}
where lang = EN
date = _issue_created issue
(year, month, day) = toGregorian $ localDay date
tod = localTimeOfDay date
readFile_Issues :: FilePath -> IO [Issue]
readFile_Issues fp = do
raw <- DBL.readFile fp
let mayIssues = decode raw
case mayIssues of
Just is -> pure is
Nothing -> pure []
readFile_IssuesAsDocs :: FilePath -> IO [HyperdataDocument]
readFile_IssuesAsDocs = fmap (fmap gitlabIssue2hyperdataDocument) . readFile_Issues
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Text.Corpus.Query (
Query -- * opaque
, RawQuery(..)
, Limit(..)
, getQuery
, parseQuery
, renderQuery
, interpretQuery
, ExternalAPIs(..)
, module BoolExpr
-- * Useful for testing
, unsafeMkQuery
) where
import Data.Bifunctor
import Data.String
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.Core.Types
import Prelude
import Text.ParserCombinators.Parsec
import qualified Data.Aeson as Aeson
import Data.BoolExpr as BoolExpr
import Data.BoolExpr.Parser as BoolExpr
import Data.BoolExpr.Printer as BoolExpr
import qualified Data.Swagger as Swagger
import qualified Data.Text as T
import qualified Servant.API as Servant
import qualified Text.Parsec as P
-- | A raw query, as typed by the user from the frontend.
newtype RawQuery = RawQuery { getRawQuery :: T.Text }
deriving newtype ( Show, Eq, IsString
, Servant.FromHttpApiData, Servant.ToHttpApiData
, Aeson.FromJSON, Aeson.ToJSON
, Swagger.ToParamSchema, Swagger.ToSchema)
-- | A limit to the number of results we want to retrieve.
newtype Limit = Limit { getLimit :: Int }
deriving newtype ( Show, Eq, Num
, Servant.FromHttpApiData, Servant.ToHttpApiData
, Aeson.FromJSON, Aeson.ToJSON
, Swagger.ToParamSchema, Swagger.ToSchema)
-- | An opaque wrapper around a 'Query' type which can be parsed from a boolean
-- expression like (a AND b) OR c, and which can be interpreted in many ways
-- according to the particular service we are targeting.
newtype Query = Query { getQuery :: (BoolExpr.CNF Term) }
deriving Show
interpretQuery :: Query -> (BoolExpr.BoolExpr Term -> ast) -> ast
interpretQuery (Query q) transform = transform . simplify . BoolExpr.fromCNF $ q
simplify :: BoolExpr.BoolExpr a -> BoolExpr.BoolExpr a
simplify expr = case expr of
BAnd sub BTrue -> simplify sub
BAnd BTrue sub -> simplify sub
BAnd BFalse _ -> BFalse
BAnd _ BFalse -> BFalse
BAnd sub1 sub2 -> BAnd (simplify sub1) (simplify sub2)
BOr _ BTrue -> BTrue
BOr BTrue _ -> BTrue
BOr sub BFalse -> simplify sub
BOr BFalse sub -> simplify sub
BOr sub1 sub2 -> BOr (simplify sub1) (simplify sub2)
BNot BTrue -> BFalse
BNot BFalse -> BTrue
BNot (BNot sub) -> simplify sub
BNot sub -> BNot (simplify sub)
BTrue -> BTrue
BFalse -> BFalse
BConst signed -> BConst signed
unsafeMkQuery :: BoolExpr.BoolExpr Term -> Query
unsafeMkQuery = Query . BoolExpr.boolTreeToCNF
termToken :: CharParser st Term
termToken = Term <$> (try (T.pack <$> BoolExpr.identifier) <|> (between dubQuote dubQuote multipleTerms))
where
dubQuote = BoolExpr.symbol "\""
multipleTerms = T.intercalate " " . map T.pack <$> sepBy BoolExpr.identifier BoolExpr.whiteSpace
-- | Parses an input 'Text' into a 'Query', reporting an error if it fails.
parseQuery :: RawQuery -> Either String Query
parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $
P.runParser (BoolExpr.parseBoolExpr termToken) () "Corpus.Query" (T.unpack txt)
renderQuery :: Query -> RawQuery
renderQuery (Query cnf) = RawQuery . T.pack $ BoolExpr.boolExprPrinter (showsPrec 0) (BoolExpr.fromCNF cnf) ""
......@@ -13,11 +13,12 @@ commentary with @some markup@.
------------------------------------------------------------------------
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DerivingStrategies #-}
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Database.Admin.Types.Node
, DebugMode(..), withDebugMode
, Term, Terms(..), TermsCount, TermsWithCount
, Term(..), Terms(..), TermsCount, TermsWithCount
, TokenTag(..), POS(..), NER(..)
, Label, Stems
, HasInvalidError(..), assertValid
......@@ -38,6 +39,7 @@ import Data.Maybe
import Data.Monoid
import Data.Semigroup
import Data.Set (Set, empty)
import Data.String
import Data.Swagger (ToParamSchema)
import Data.Swagger (ToSchema(..))
import Data.Text (Text, unpack)
......@@ -63,7 +65,10 @@ data Ordering = Down | Up
------------------------------------------------------------------------
type Name = Text
type Term = Text
newtype Term = Term { getTerm :: Text }
deriving newtype (Eq, Ord, IsString, Show)
type Stems = Set Text
type Label = [Text]
......
......@@ -50,7 +50,6 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
import Conduit
import Control.Lens ((^.), view, _Just, makeLenses, over, traverse)
import Control.Monad.Reader (MonadReader)
import Data.Aeson.TH (deriveJSON)
import Data.Conduit.Internal (zipSources)
import qualified Data.Conduit.List as CList
......@@ -65,7 +64,6 @@ import Data.Swagger
import qualified Data.Text as T
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import Servant.Client (ClientError)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
......@@ -133,13 +131,8 @@ deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
allDataOrigins :: ( MonadReader env m
, HasConfig env) => m [DataOrigin]
allDataOrigins = do
ext <- API.externalAPIs
pure $ map InternalOrigin ext
<> map ExternalOrigin ext
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs <> map ExternalOrigin API.externalAPIs
---------------
data DataText = DataOld ![NodeId]
......@@ -157,11 +150,12 @@ printDataText (DataNew (maybeInt, conduitData)) = do
getDataText :: FlowCmdM env err m
=> DataOrigin
-> TermType Lang
-> API.Query
-> API.RawQuery
-> Maybe API.Limit
-> m (Either ClientError DataText)
getDataText (ExternalOrigin api) la q li = liftBase $ do
eRes <- API.get api (_tt_lang la) q li
-> m (Either API.GetCorpusError DataText)
getDataText (ExternalOrigin api) la q li = do
cfg <- view $ hasConfig
eRes <- liftBase $ API.get cfg api (_tt_lang la) q li
pure $ DataNew <$> eRes
getDataText (InternalOrigin _) _la q _li = do
......@@ -169,13 +163,13 @@ getDataText (InternalOrigin _) _la q _li = do
(UserName userMaster)
(Left "")
(Nothing :: Maybe HyperdataCorpus)
ids <- map fst <$> searchDocInDatabase cId (stemIt q)
ids <- map fst <$> searchDocInDatabase cId (stemIt $ API.getRawQuery q)
pure $ Right $ DataOld ids
getDataText_Debug :: FlowCmdM env err m
=> DataOrigin
-> TermType Lang
-> API.Query
-> API.RawQuery
-> Maybe API.Limit
-> m ()
getDataText_Debug a l q li = do
......
......@@ -342,8 +342,8 @@ getCorpusPubmedAPIKey cId = do
|]
params = PGS.Only cId
updateCorpusPubmedAPIKey :: NodeId -> Maybe PUBMED.APIKey -> Cmd err Int64
updateCorpusPubmedAPIKey cId mAPIKey =
updateCorpusPubmedAPIKey :: NodeId -> PUBMED.APIKey -> Cmd err Int64
updateCorpusPubmedAPIKey cId apiKey =
execPGSQuery query params
where
query :: PGS.Query
......@@ -352,7 +352,7 @@ updateCorpusPubmedAPIKey cId mAPIKey =
SET hyperdata = hyperdata || ?
WHERE id = ?
|]
params = (encode $ object [ "pubmed_api_key" .= mAPIKey ], cId)
params = (encode $ object [ "pubmed_api_key" .= apiKey ], cId)
------------------------------------------------------------------------
-- TODO
-- currently this function removes the child relation
......
......@@ -37,6 +37,7 @@ extra-deps:
- HSvm-0.1.1.3.22
- hsparql-0.3.8
- ghc-clippy-plugin-0.0.0.1
- boolexpr-0.2
#- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
# commit: 791c2a7046a3760f8ae5fabdbd708f61caa63741
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment