gargantext / haskell-gargantext · Commits

Commit 9dd874c4
Authored Sep 22, 2025 by Alfredo Di Napoli

    Barebone but buggy prototype

Parent: 0d1d0886
Pipeline #7921 passed with stages in 57 minutes and 11 seconds

Showing 6 changed files with 41 additions and 39 deletions (+41, -39)
src/Gargantext/API/Node/Corpus/New.hs        +1  -1
src/Gargantext/API/Node/Update.hs            +6  -10
src/Gargantext/Core/Text/Corpus/API.hs       +5  -3
src/Gargantext/Core/Text/Corpus/API/Hal.hs   +12 -13
src/Gargantext/Core/Worker/Jobs/Types.hs     +2  -2
src/Gargantext/Database/Action/Flow.hs       +15 -10
src/Gargantext/API/Node/Corpus/New.hs

@@ -146,7 +146,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
       let db = datafield2origin datafield
       -- mPubmedAPIKey <- getUserPubmedAPIKey user
       -- printDebug "[addToCorpusWithQuery] mPubmedAPIKey" mPubmedAPIKey
-      eTxt <- getDataText db (Multi l) q maybeLimit
+      eTxt <- getDataText cid db (Multi l) q maybeLimit
       -- printDebug "[G.A.N.C.New] lTxts" lTxts
       case eTxt of
src/Gargantext/API/Node/Update.hs

@@ -57,12 +57,10 @@ import Gargantext.System.Logging.Types (LogLevel(..))
 import Gargantext.Utils.Jobs.Monad ( MonadJobStatus(..) )
 import Gargantext.Utils.UTCTime ( timeMeasured )
 import HAL qualified
+import HAL.Types qualified as HAL
 import Servant.Server.Generic ( AsServerT )
-import Data.Conduit
-import Data.Conduit.List hiding (mapM_)
 import Gargantext.Core
 import Gargantext.Core.Text.Corpus.API.Hal ( toDoc' )
-import Conduit ( mapMC )
 import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus )

 ------------------------------------------------------------------------

@@ -234,21 +232,19 @@ fetchHALDocuments :: ( IsDBCmd env err m
                      , MonadLogger m
                      , MonadCatch m )
-                  => Maybe CorpusId
+                  => CorpusId
                   -> Maybe ISO639.ISO639_1
                   -> Text
                   -> Int
                   -> Int
                   -> m ()
-fetchHALDocuments Nothing _query _lang _offset _limit = do
-  $(logLocM) ERROR $ "fetchHALDocuments failed because no corpusId was provided."
-fetchHALDocuments (Just corpusId) lang query offset limit = do
-  docs_e <- liftBase $ HAL.getMetadataWithOptsC HAL.defaultHalOptions [query] (Just offset) (Just $ fromIntegral limit) lang
+fetchHALDocuments corpusId lang query offset limit = do
+  docs_e <- liftBase $ HAL.getMetadataWith [query] (Just offset) (Just $ fromIntegral limit) lang
   case docs_e of
     Left err -> do
       $(logLocM) ERROR $ T.pack (show err)
-    Right (_len, docsC) -> do
-      docs <- liftBase $ runConduit $ (docsC .| mapMC (toDoc' lang) .| consume)
+    Right HAL.Response { HAL._docs } -> do
+      docs <- mapM (liftBase . toDoc' lang) _docs
       -- FIXME(adn) How can we pass the TermType correctly in a serialised fashion?
       void $ addDocumentsToHyperCorpus @_ @_ @_ @_ @HyperdataCorpus Nothing (Mono $ fromISOLang lang) corpusId docs
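The change above makes the `CorpusId` argument of `fetchHALDocuments` mandatory instead of `Maybe CorpusId`, which removes the "no corpusId was provided" error branch from the worker. A minimal, self-contained sketch of that same move (all types and names below are illustrative stand-ins, not the actual gargantext definitions):

```haskell
module Main where

newtype CorpusId = CorpusId Int deriving Show

-- Before: the handler has to invent behaviour for a Nothing that should
-- never occur once the job has been enqueued correctly.
fetchOld :: Maybe CorpusId -> String -> IO ()
fetchOld Nothing         _query = putStrLn "error: no corpusId was provided"
fetchOld (Just corpusId) query  =
  putStrLn $ "fetching " <> query <> " into " <> show corpusId

-- After: the corpus id is mandatory, so the error branch disappears and
-- callers are forced to supply one at compile time.
fetchNew :: CorpusId -> String -> IO ()
fetchNew corpusId query =
  putStrLn $ "fetching " <> query <> " into " <> show corpusId

main :: IO ()
main = do
  fetchOld (Just (CorpusId 42)) "haskell"
  fetchNew (CorpusId 42) "haskell"
```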
src/Gargantext/Core/Text/Corpus/API.hs

@@ -38,6 +38,7 @@ import Gargantext.Prelude hiding (get)
 import Gargantext.Utils.Jobs.Error
 import Servant.Client ( ClientError )
 import Gargantext.Database.Action.Flow.Types
+import Gargantext.Core.Types ( CorpusId )

 data GetCorpusError
   = -- | We couldn't parse the user input query into something meaningful.

@@ -54,7 +55,8 @@ instance ToHumanFriendlyError GetCorpusError where
     "There was a network problem while contacting the " <> T.pack (show api) <> " API provider. Please try again later or contact your network administrator."

 -- | Get External API metadata main function
-get :: ExternalAPIs
+get :: CorpusId
+    -> ExternalAPIs
     -> Lang
     -- ^ A user-selected language in which documents needs to be retrieved.
     -- If the provider doesn't support the search filtered by language, or if the language

@@ -64,7 +66,7 @@ get :: ExternalAPIs
     -> Maybe Corpus.Limit
     -- -> IO [HyperdataDocument]
     -> IO (Either GetCorpusError (ResultsCount, DataProducer IO HyperdataDocument))
-get externalAPI lang q epoAPIUrl limit = do
+get corpusId externalAPI lang q epoAPIUrl limit = do
   -- For PUBMED, HAL, IsTex, Isidore and OpenAlex, we want to send the query as-it.
   -- For Arxiv we parse the query into a structured boolean query we submit over.
   case externalAPI of

@@ -80,7 +82,7 @@ get externalAPI lang q epoAPIUrl limit = do
       -- let's create a data producer that spins out separate jobs, and process batches
       -- of 25 documents at the time.
       first (ExternalAPIError externalAPI) <$>
-        HAL.getDataProducer (Just $ toISO639 lang) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
+        HAL.getDataProducer corpusId (Just $ toISO639 lang) (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
     IsTex -> do
       docs <- ISTEX.get lang (Corpus.getRawQuery q) (Corpus.getLimit <$> limit)
       pure $ Right $ toConduitProducer (Just $ fromIntegral $ length docs, yieldMany docs)
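In the hunk above, `get` returns a `(ResultsCount, DataProducer IO HyperdataDocument)` pair: for HAL it builds a producer made of asynchronous batch jobs, while for IsTex it wraps the eagerly fetched documents via `toConduitProducer`. A toy sketch of that two-shape producer idea (the real `DataProducer` and `toConduitProducer` live elsewhere in the codebase; everything below is a simplified, hypothetical stand-in):

```haskell
module Main where

-- Hypothetical stand-in for the two producer shapes returned by `get`:
-- HAL hands back serialised batch jobs for the worker queue, IsTex hands
-- back documents it already fetched.
data Producer doc
  = AsyncBatchProducer [String]   -- batch jobs to be enqueued (HAL)
  | InMemoryProducer   [doc]      -- eagerly fetched documents (IsTex)

-- A toy counterpart of a producer runner: either enqueue the jobs or
-- process the documents directly.
runProducer :: Show doc => Producer doc -> IO ()
runProducer (AsyncBatchProducer jobs) = mapM_ (putStrLn . ("enqueue " <>)) jobs
runProducer (InMemoryProducer docs)   = mapM_ (putStrLn . ("process " <>) . show) docs

main :: IO ()
main = do
  runProducer (AsyncBatchProducer ["hal-batch-0", "hal-batch-1"] :: Producer String)
  runProducer (InMemoryProducer ["istex-doc-1", "istex-doc-2" :: String])
```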
src/Gargantext/Core/Text/Corpus/API/Hal.hs

@@ -27,8 +27,8 @@ import HAL qualified
 import HAL.Doc.Document qualified as HAL
 import HAL.Types qualified as HAL
 import Servant.Client ( ClientError(..) )
-import System.IO.Error ( userError )
 import Gargantext.Core.Worker.Jobs.Types ( FetchDocumentsHALPayload(..), Job(..) )
+import Gargantext.Core.Types ( CorpusId )

 get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
 get la q ml = do

@@ -74,26 +74,25 @@ toDoc' la (HAL.Document { .. }) = do
 -- A Simple ExceptT to make working with network requests a bit more pleasant.
 type HALMonad a = ExceptT ClientError IO a

-getDataProducer :: Maybe ISO639.ISO639_1
+getDataProducer :: CorpusId
+                -> Maybe ISO639.ISO639_1
                 -> Text
                 -> Maybe Int
                 -> IO (Either ClientError (ResultsCount, DataProducer IO HyperdataDocument))
-getDataProducer la q _mb_limit = runExceptT $ do
+getDataProducer corpusId la q _mb_limit = runExceptT $ do
   -- First of all, make a trivial query to fetch the full number of documents. Then, split the
   -- total requests into suitable batches and turn them into Jobs.
-  (mb_docs, _) <- ExceptT $ HAL.getMetadataWithC [q] (Just 0) (Just 1) la
-  case mb_docs of
-    Nothing    -> throwError $ ConnectionError (toException $ userError "impossible, hal didn't return numDocs in the response.")
-    Just total -> do
-      let (batches, finalBatchSize) = (fromInteger total) `divMod` halBatchSize
-      pure (ResultsCount total, DataAsyncBatchProducer $ mkBatches batches finalBatchSize 0)
+  total <- ExceptT $ HAL.countResultsOpts' (HAL.defaultHalOptions { HAL._hco_batchSize = 1 }) q la
+  putStrLn $ "Found " <> show total <> " documents matching the query." ++ ""
+  let (batches, finalBatchSize) = (fromInteger total) `divMod` halBatchSize
+  pure (ResultsCount total, DataAsyncBatchProducer $ mkBatches (max 0 batches) finalBatchSize 0)
   where
-    mkBatches 1 finalBatchSize offset = [FetchDocumentsHAL (FetchDocumentsHALPayload Nothing q la offset finalBatchSize)]
+    mkBatches 0 finalBatchSize offset = [FetchDocumentsHAL (FetchDocumentsHALPayload corpusId q la offset finalBatchSize)]
     mkBatches curBatch finalBatchSize offset =
-      FetchDocumentsHAL (FetchDocumentsHALPayload Nothing q la offset halBatchSize)
+      FetchDocumentsHAL (FetchDocumentsHALPayload corpusId q la offset halBatchSize)
       : mkBatches (curBatch - 1) finalBatchSize (offset + halBatchSize)

 -- | The size of a single batch.
 halBatchSize :: Int
-halBatchSize = 500
+halBatchSize = 100
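`getDataProducer` now counts the results up front and splits the total into fixed-size batches: `divMod` against `halBatchSize` gives the number of full batches plus the size of a trailing remainder batch, and `mkBatches` walks the offsets. A standalone sketch of just that arithmetic (pure, with illustrative names; the real code produces `FetchDocumentsHALPayload` values rather than tuples):

```haskell
module Main where

-- Batch size used for this sketch; the commit lowers the real
-- halBatchSize from 500 to 100.
batchSize :: Int
batchSize = 100

-- Split a total document count into (offset, size) pairs: full batches
-- of 'batchSize' followed by one final, possibly smaller, batch.
splitBatches :: Int -> [(Int, Int)]
splitBatches total = go (max 0 batches) 0
  where
    (batches, finalBatchSize) = total `divMod` batchSize

    go 0 offset = [(offset, finalBatchSize)]
    go n offset = (offset, batchSize) : go (n - 1) (offset + batchSize)

main :: IO ()
main =
  -- 230 documents -> [(0,100),(100,100),(200,30)]
  print (splitBatches 230)
```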
src/Gargantext/Core/Worker/Jobs/Types.hs

@@ -37,7 +37,7 @@ import qualified Data.LanguageCodes as ISO639
 data FetchDocumentsHALPayload =
   FetchDocumentsHALPayload
-    { _fdhp_corpus_id :: Maybe CorpusId
+    { _fdhp_corpus_id :: CorpusId
     , _fdhp_query :: Text
     , _fdhp_lang :: Maybe ISO639.ISO639_1
     , _fdhp_offset :: Int

@@ -345,4 +345,4 @@ getWorkerMNodeId (UpdateNode { _un_node_id }) = Just _un_node_id
 getWorkerMNodeId (UploadDocument { _ud_node_id }) = Just _ud_node_id
 getWorkerMNodeId (ImportRemoteDocuments (ImportRemoteDocumentsPayload _ _ corpusId _ _)) = Just corpusId
 getWorkerMNodeId (ImportRemoteTerms (ImportRemoteTermsPayload listId _)) = Just listId
-getWorkerMNodeId (FetchDocumentsHAL (FetchDocumentsHALPayload { _fdhp_corpus_id })) = _fdhp_corpus_id
+getWorkerMNodeId (FetchDocumentsHAL (FetchDocumentsHALPayload { _fdhp_corpus_id })) = Just _fdhp_corpus_id
src/Gargantext/Database/Action/Flow.hs

@@ -98,7 +98,7 @@ import Gargantext.Database.Action.Search (searchDocInDatabase)
 import Gargantext.Database.Admin.Types.Hyperdata.Contact ( HyperdataContact )
 import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataAnnuaire, HyperdataCorpus(_hc_lang) )
 import Gargantext.Database.Admin.Types.Hyperdata.Document ( ToHyperdataDocument(toHyperdataDocument), HyperdataDocument )
-import Gargantext.Database.Admin.Types.Node hiding (ERROR, DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
+import Gargantext.Database.Admin.Types.Node hiding (INFO, ERROR, DEBUG) -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
 import Gargantext.Database.Prelude
 import Gargantext.Database.Query.Table.ContextNodeNgrams2 ( ContextNodeNgrams2Poly(..), insertContextNodeNgrams2 )
 import Gargantext.Database.Query.Table.Node ( MkCorpus, insertDefaultNodeIfNotExists, getOrMkList, getNodeWith )

@@ -112,7 +112,7 @@ import Gargantext.Database.Schema.Ngrams ( indexNgrams, NgramsId )
 import Gargantext.Database.Schema.Node
 import Gargantext.Database.Types
 import Gargantext.Prelude hiding ( catch, onException, to )
-import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, ERROR), MonadLogger )
+import Gargantext.System.Logging ( logLocM, LogLevel(DEBUG, INFO, ERROR), MonadLogger )
 import Gargantext.Utils.Jobs.Monad ( JobHandle, MonadJobStatus(..), markFailureNoErr )
 import Servant.Client.Core ( ClientError )

@@ -147,16 +147,17 @@ printDataProducer = \case
 -- TODO use the split parameter in config file
 getDataText :: (HasNodeError err)
-            => DataOrigin
+            => CorpusId
+            -> DataOrigin
             -> TermType Lang
             -> API.RawQuery
             -> Maybe API.Limit
             -> DBCmdWithEnv env err (Either API.GetCorpusError (DataText IO))
-getDataText (ExternalOrigin api) la q li = do
+getDataText corpusId (ExternalOrigin api) la q li = do
   cfg <- view hasConfig
-  eRes <- liftBase $ API.get api (_tt_lang la) q (_ac_epo_api_url $ _gc_apis cfg) li
+  eRes <- liftBase $ API.get corpusId api (_tt_lang la) q (_ac_epo_api_url $ _gc_apis cfg) li
   pure $ uncurry DataNew <$> eRes

-getDataText (InternalOrigin _) la q _li = do
+getDataText _ (InternalOrigin _) la q _li = do
   cfg <- view hasConfig
   runDBTx $ do
     (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus cfg MkCorpusUserMaster (Nothing :: Maybe HyperdataCorpus)

@@ -164,13 +165,14 @@ getDataText (InternalOrigin _) la q _li = do
   pure $ Right $ DataOld ids

 getDataText_Debug :: (HasNodeError err)
-                  => DataOrigin
+                  => CorpusId
+                  -> DataOrigin
                   -> TermType Lang
                   -> API.RawQuery
                   -> Maybe API.Limit
                   -> DBCmdWithEnv env err ()
-getDataText_Debug a l q li = do
-  result <- getDataText a l q li
+getDataText_Debug cid a l q li = do
+  result <- getDataText cid a l q li
   case result of
     Left err  -> liftBase $ putText $ show err
     Right res -> liftBase $ printDataText res

@@ -381,7 +383,9 @@ runDataProducer jobHandle processData = \case
       ]
-  DataAsyncBatchProducer jobs -> forM_ jobs sendJob
+  DataAsyncBatchProducer jobs -> do
+    addMoreSteps (fromIntegral $ length jobs) jobHandle
+    forM_ jobs sendJob

 -- | Given a list of corpus documents and a 'NodeId' identifying the 'CorpusId', adds
 -- the given documents to the corpus. Returns the Ids of the inserted documents.

@@ -400,6 +404,7 @@ addDocumentsToHyperCorpus :: ( IsDBCmd env err m
                           -> [document]
                           -> m [DocId]
 addDocumentsToHyperCorpus mb_hyper la corpusId docs = do
+  $(logLocM) INFO $ "Adding " <> T.pack (show $ length docs) <> " to the hyper corpus " <> T.pack (show corpusId)
   cfg <- view hasConfig
   nlp <- view (nlpServerGet $ _tt_lang la)
   -- First extract all the ngrams for the input documents via the nlp server,
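Taken together, the commit threads the target corpus id from the corpus-creation endpoint (`getDataText`) through `API.get` and `HAL.getDataProducer` into every `FetchDocumentsHAL` job payload, so each worker batch knows which corpus to insert its documents into. A heavily simplified, self-contained sketch of that threading (all names below are illustrative stand-ins for the real gargantext types and functions):

```haskell
module Main where

newtype CorpusId = CorpusId Int deriving Show

-- Stand-in for FetchDocumentsHALPayload: the corpus id is now a
-- mandatory field of every batched job.
data JobPayload = JobPayload
  { jpCorpusId :: CorpusId
  , jpQuery    :: String
  , jpOffset   :: Int
  , jpLimit    :: Int
  } deriving Show

-- Stand-in for HAL.getDataProducer: turns a query into batched jobs,
-- tagging each one with the corpus id it is meant to fill.
mkJobs :: CorpusId -> String -> Int -> [JobPayload]
mkJobs cid query total =
  [ JobPayload cid query off (min batch (total - off))
  | off <- [0, batch .. total - 1] ]
  where batch = 100

-- Stand-in for getDataText / API.get: the endpoint receives the corpus id
-- and passes it straight down, instead of each job carrying a Maybe.
getDataText :: CorpusId -> String -> IO [JobPayload]
getDataText cid query = pure (mkJobs cid query 230)  -- pretend 230 results

main :: IO ()
main = getDataText (CorpusId 7) "haskell" >>= mapM_ print
```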