Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
b2f7a9a8
Commit
b2f7a9a8
authored
Jan 13, 2025
by
Alfredo Di Napoli
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Chunks the insertion of remote docs
parent
0d4e0554
Pipeline
#7208
passed with stages
in 54 minutes and 17 seconds
Changes
5
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
46 additions
and
15 deletions
+46
-15
gargantext.cabal
gargantext.cabal
+1
-0
DocumentUpload.hs
src/Gargantext/API/Node/DocumentUpload.hs
+7
-5
Remote.hs
src/Gargantext/API/Server/Named/Remote.hs
+11
-3
Worker.hs
src/Gargantext/Core/Worker.hs
+2
-2
Types.hs
src/Gargantext/Core/Worker/Jobs/Types.hs
+25
-5
No files found.
gargantext.cabal
View file @
b2f7a9a8
...
@@ -613,6 +613,7 @@ library
...
@@ -613,6 +613,7 @@ library
, singletons ^>= 3.0.2
, singletons ^>= 3.0.2
, singletons-th >= 3.1 && < 3.2
, singletons-th >= 3.1 && < 3.2
, smtp-mail >= 0.3.0.0
, smtp-mail >= 0.3.0.0
, split >= 0.2.0
, stemmer == 0.5.2
, stemmer == 0.5.2
, stm >= 2.5.1.0 && < 2.6
, stm >= 2.5.1.0 && < 2.6
, stm-containers >= 1.2.0.3 && < 1.3
, stm-containers >= 1.2.0.3 && < 1.3
...
...
src/Gargantext/API/Node/DocumentUpload.hs
View file @
b2f7a9a8
...
@@ -22,7 +22,6 @@ import Gargantext.API.Admin.EnvTypes (Env)
...
@@ -22,7 +22,6 @@ import Gargantext.API.Admin.EnvTypes (Env)
import
Gargantext.API.Errors.Types
(
BackendInternalError
(
..
)
)
import
Gargantext.API.Errors.Types
(
BackendInternalError
(
..
)
)
import
Gargantext.API.Node.Corpus.New
(
commitCorpus
)
import
Gargantext.API.Node.Corpus.New
(
commitCorpus
)
import
Gargantext.API.Node.Document.Export.Types
(
Document
(
..
))
import
Gargantext.API.Node.Document.Export.Types
(
Document
(
..
))
import
Gargantext.API.Node.Document.Export.Types
(
DocumentExport
(
..
))
import
Gargantext.API.Node.DocumentUpload.Types
import
Gargantext.API.Node.DocumentUpload.Types
import
Gargantext.API.Prelude
(
GargM
)
import
Gargantext.API.Prelude
(
GargM
)
import
Gargantext.API.Routes.Named.Document
qualified
as
Named
import
Gargantext.API.Routes.Named.Document
qualified
as
Named
...
@@ -34,6 +33,7 @@ import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
...
@@ -34,6 +33,7 @@ import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import
Gargantext.Core.Text.Terms
(
TermType
(
..
))
import
Gargantext.Core.Text.Terms
(
TermType
(
..
))
import
Gargantext.Core.Types.Individu
(
User
(
..
))
import
Gargantext.Core.Types.Individu
(
User
(
..
))
import
Gargantext.Core.Worker.Jobs.Types
qualified
as
Jobs
import
Gargantext.Core.Worker.Jobs.Types
qualified
as
Jobs
import
Gargantext.Core.Worker.Jobs.Types
(
WorkSplit
(
..
))
import
Gargantext.Database.Action.Flow
(
addDocumentsToHyperCorpus
)
import
Gargantext.Database.Action.Flow
(
addDocumentsToHyperCorpus
)
import
Gargantext.Database.Action.Flow.Types
(
FlowCmdM
)
import
Gargantext.Database.Action.Flow.Types
(
FlowCmdM
)
import
Gargantext.Database.Admin.Types.Hyperdata.Corpus
(
HyperdataCorpus
)
import
Gargantext.Database.Admin.Types.Hyperdata.Corpus
(
HyperdataCorpus
)
...
@@ -116,13 +116,15 @@ remoteImportDocuments :: ( HasNodeError err
...
@@ -116,13 +116,15 @@ remoteImportDocuments :: ( HasNodeError err
=>
AuthenticatedUser
=>
AuthenticatedUser
->
ParentId
->
ParentId
->
NodeId
->
NodeId
->
DocumentExport
->
WorkSplit
->
[
Document
]
-- ^ Total docs
->
m
[
NodeId
]
->
m
[
NodeId
]
remoteImportDocuments
loggedInUser
corpusId
nodeId
(
DocumentExport
documents
_gargVersion
)
=
do
remoteImportDocuments
loggedInUser
corpusId
nodeId
WorkSplit
{
..
}
documents
=
do
let
la
=
Multi
EN
let
la
=
Multi
EN
nlpServerConfig
<-
view
$
nlpServerGet
(
_tt_lang
la
)
nlpServerConfig
<-
view
$
nlpServerGet
(
_tt_lang
la
)
$
(
logLocM
)
INFO
$
"Importing "
<>
T
.
pack
(
show
$
length
documents
)
<>
" documents for corpus node "
<>
T
.
pack
(
show
nodeId
)
$
(
logLocM
)
INFO
$
"Importing "
<>
T
.
pack
(
show
_ws_current
)
<>
"/"
<>
T
.
pack
(
show
_ws_total
)
<>
" documents for corpus node "
<>
T
.
pack
(
show
nodeId
)
docs
<-
addDocumentsToHyperCorpus
nlpServerConfig
(
Nothing
::
Maybe
HyperdataCorpus
)
la
corpusId
(
map
(
_node_hyperdata
.
_d_document
)
documents
)
docs
<-
addDocumentsToHyperCorpus
nlpServerConfig
(
Nothing
::
Maybe
HyperdataCorpus
)
la
corpusId
(
map
(
_node_hyperdata
.
_d_document
)
documents
)
_versioned
<-
commitCorpus
corpusId
(
RootId
$
_auth_node_id
loggedInUser
)
_versioned
<-
commitCorpus
corpusId
(
RootId
$
_auth_node_id
loggedInUser
)
$
(
logLocM
)
INFO
$
"Done importing "
<>
T
.
pack
(
show
$
length
documents
)
<>
" documents for corpus node "
<>
T
.
pack
(
show
nodeId
)
$
(
logLocM
)
INFO
$
"Done importing "
<>
T
.
pack
(
show
_ws_current
)
<>
"/"
<>
T
.
pack
(
show
_ws_total
)
<>
" documents for corpus node "
<>
T
.
pack
(
show
nodeId
)
pure
docs
pure
docs
src/Gargantext/API/Server/Named/Remote.hs
View file @
b2f7a9a8
...
@@ -16,13 +16,14 @@ import Control.Exception.Safe qualified as Safe
...
@@ -16,13 +16,14 @@ import Control.Exception.Safe qualified as Safe
import
Control.Exception
(
toException
)
import
Control.Exception
(
toException
)
import
Control.Lens
(
view
,
(
#
))
import
Control.Lens
(
view
,
(
#
))
import
Control.Monad.Except
(
throwError
,
MonadError
)
import
Control.Monad.Except
(
throwError
,
MonadError
)
import
Control.Monad
(
void
,
liftM2
)
import
Control.Monad
(
void
,
liftM2
,
forM_
)
import
Data.Aeson
qualified
as
JSON
import
Data.Aeson
qualified
as
JSON
import
Data.ByteString.Builder
qualified
as
B
import
Data.ByteString.Builder
qualified
as
B
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Conduit.Combinators
qualified
as
C
import
Data.Conduit.Combinators
qualified
as
C
import
Data.Conduit.List
qualified
as
CL
import
Data.Conduit.List
qualified
as
CL
import
Data.Foldable
(
for_
,
foldlM
)
import
Data.Foldable
(
for_
,
foldlM
)
import
Data.List.Split
qualified
as
Split
import
Data.Text
qualified
as
T
import
Data.Text
qualified
as
T
import
Gargantext.API.Admin.Auth
import
Gargantext.API.Admin.Auth
import
Gargantext.API.Admin.Auth.Types
(
AuthenticatedUser
(
..
))
import
Gargantext.API.Admin.Auth.Types
(
AuthenticatedUser
(
..
))
...
@@ -118,8 +119,15 @@ remoteImportHandler loggedInUser c = do
...
@@ -118,8 +119,15 @@ remoteImportHandler loggedInUser c = do
$
(
logLocM
)
INFO
$
"Created a new node "
<>
T
.
pack
(
show
$
new_node
)
<>
" of type "
<>
T
.
pack
(
show
ty
)
$
(
logLocM
)
INFO
$
"Created a new node "
<>
T
.
pack
(
show
$
new_node
)
<>
" of type "
<>
T
.
pack
(
show
ty
)
for_
(
liftM2
(,)
mb_docs
mb_parent
)
$
\
(
docsList
,
parentId
)
->
do
for_
(
liftM2
(,)
mb_docs
mb_parent
)
$
\
(
docsList
,
parentId
)
->
do
$
(
logLocM
)
INFO
$
"Found document list to import..."
$
(
logLocM
)
INFO
$
"Found document list to import..."
let
payload
=
Jobs
.
ImportRemoteDocumentsPayload
loggedInUser
parentId
new_node
docsList
let
totalDocs
=
_de_documents
docsList
void
$
sendJob
$
Jobs
.
ImportRemoteDocuments
payload
let
chunks
=
Split
.
chunksOf
100
totalDocs
forM_
(
zip
[
1
..
]
chunks
)
$
\
(
local_ix
,
chunk
)
->
do
let
ws
=
Jobs
.
WorkSplit
{
Jobs
.
_ws_current
=
min
(
length
totalDocs
)
(((
local_ix
-
1
)
*
length
chunk
)
+
length
chunk
)
,
Jobs
.
_ws_total
=
length
totalDocs
}
let
payload
=
Jobs
.
ImportRemoteDocumentsPayload
loggedInUser
parentId
new_node
chunk
ws
void
$
sendJob
$
Jobs
.
ImportRemoteDocuments
payload
for_
mb_terms
$
\
ngramsList
->
do
for_
mb_terms
$
\
ngramsList
->
do
$
(
logLocM
)
INFO
$
"Found ngrams list to import..."
$
(
logLocM
)
INFO
$
"Found ngrams list to import..."
void
$
sendJob
$
Jobs
.
ImportRemoteTerms
$
Jobs
.
ImportRemoteTermsPayload
new_node
ngramsList
void
$
sendJob
$
Jobs
.
ImportRemoteTerms
$
Jobs
.
ImportRemoteTermsPayload
new_node
ngramsList
...
...
src/Gargantext/Core/Worker.hs
View file @
b2f7a9a8
...
@@ -309,7 +309,7 @@ performAction env _state bm = do
...
@@ -309,7 +309,7 @@ performAction env _state bm = do
$
(
logLocM
)
DEBUG
$
"Done updating node scores for corpus node "
<>
T
.
pack
(
show
list_id
)
$
(
logLocM
)
DEBUG
$
"Done updating node scores for corpus node "
<>
T
.
pack
(
show
list_id
)
-- | Remotely import documents
-- | Remotely import documents
ImportRemoteDocuments
(
ImportRemoteDocumentsPayload
loggedInUser
parentId
corpusId
docs
)
ImportRemoteDocuments
(
ImportRemoteDocumentsPayload
loggedInUser
parentId
corpusId
docs
workSplit
)
->
runWorkerMonad
env
$
do
->
runWorkerMonad
env
$
do
$
(
logLocM
)
DEBUG
$
"[performAction] import remote documents"
$
(
logLocM
)
DEBUG
$
"[performAction] import remote documents"
void
$
remoteImportDocuments
loggedInUser
parentId
corpusId
docs
void
$
remoteImportDocuments
loggedInUser
parentId
corpusId
workSplit
docs
src/Gargantext/Core/Worker/Jobs/Types.hs
View file @
b2f7a9a8
...
@@ -23,7 +23,7 @@ import Gargantext.API.Admin.Auth.Types (AuthenticatedUser, ForgotPasswordAsyncPa
...
@@ -23,7 +23,7 @@ import Gargantext.API.Admin.Auth.Types (AuthenticatedUser, ForgotPasswordAsyncPa
import
Gargantext.API.Ngrams.Types
(
NgramsList
,
UpdateTableNgramsCharts
(
_utn_list_id
))
import
Gargantext.API.Ngrams.Types
(
NgramsList
,
UpdateTableNgramsCharts
(
_utn_list_id
))
import
Gargantext.API.Node.Contact.Types
(
AddContactParams
)
import
Gargantext.API.Node.Contact.Types
(
AddContactParams
)
import
Gargantext.API.Node.Corpus.Annuaire
(
AnnuaireWithForm
)
import
Gargantext.API.Node.Corpus.Annuaire
(
AnnuaireWithForm
)
import
Gargantext.API.Node.Document.Export.Types
(
Document
Export
)
import
Gargantext.API.Node.Document.Export.Types
(
Document
)
import
Gargantext.API.Node.DocumentsFromWriteNodes.Types
qualified
as
DFWN
import
Gargantext.API.Node.DocumentsFromWriteNodes.Types
qualified
as
DFWN
import
Gargantext.API.Node.DocumentUpload.Types
(
DocumentUpload
)
import
Gargantext.API.Node.DocumentUpload.Types
(
DocumentUpload
)
import
Gargantext.API.Node.FrameCalcUpload.Types
(
FrameCalcUpload
)
import
Gargantext.API.Node.FrameCalcUpload.Types
(
FrameCalcUpload
)
...
@@ -52,12 +52,30 @@ instance FromJSON ImportRemoteTermsPayload where
...
@@ -52,12 +52,30 @@ instance FromJSON ImportRemoteTermsPayload where
_irtp_ngrams_list
<-
o
.:
"ngrams_list"
_irtp_ngrams_list
<-
o
.:
"ngrams_list"
pure
ImportRemoteTermsPayload
{
..
}
pure
ImportRemoteTermsPayload
{
..
}
data
WorkSplit
=
WorkSplit
{
_ws_current
::
Int
,
_ws_total
::
Int
}
deriving
(
Show
,
Eq
)
instance
ToJSON
WorkSplit
where
toJSON
WorkSplit
{
..
}
=
object
[
"current"
.=
_ws_current
,
"total"
.=
_ws_total
]
instance
FromJSON
WorkSplit
where
parseJSON
=
withObject
"WorkSplit"
$
\
o
->
do
_ws_current
<-
o
.:
"current"
_ws_total
<-
o
.:
"total"
pure
WorkSplit
{
..
}
data
ImportRemoteDocumentsPayload
data
ImportRemoteDocumentsPayload
=
ImportRemoteDocumentsPayload
=
ImportRemoteDocumentsPayload
{
_irdp_user
::
AuthenticatedUser
{
_irdp_user
::
AuthenticatedUser
,
_irdp_parent_id
::
ParentId
,
_irdp_parent_id
::
ParentId
,
_irdp_corpus_id
::
NodeId
,
_irdp_corpus_id
::
NodeId
,
_irdp_document_export
::
DocumentExport
,
_irdp_documents
::
[
Document
]
-- | Useful to compute total progress in logs.
,
_irdp_work_split
::
WorkSplit
}
deriving
(
Show
,
Eq
)
}
deriving
(
Show
,
Eq
)
instance
ToJSON
ImportRemoteDocumentsPayload
where
instance
ToJSON
ImportRemoteDocumentsPayload
where
...
@@ -65,7 +83,8 @@ instance ToJSON ImportRemoteDocumentsPayload where
...
@@ -65,7 +83,8 @@ instance ToJSON ImportRemoteDocumentsPayload where
object
[
"user"
.=
_irdp_user
object
[
"user"
.=
_irdp_user
,
"corpus_id"
.=
_irdp_corpus_id
,
"corpus_id"
.=
_irdp_corpus_id
,
"parent_id"
.=
_irdp_parent_id
,
"parent_id"
.=
_irdp_parent_id
,
"document_export"
.=
_irdp_document_export
,
"documents"
.=
_irdp_documents
,
"work_split"
.=
_irdp_work_split
]
]
instance
FromJSON
ImportRemoteDocumentsPayload
where
instance
FromJSON
ImportRemoteDocumentsPayload
where
...
@@ -73,7 +92,8 @@ instance FromJSON ImportRemoteDocumentsPayload where
...
@@ -73,7 +92,8 @@ instance FromJSON ImportRemoteDocumentsPayload where
_irdp_user
<-
o
.:
"user"
_irdp_user
<-
o
.:
"user"
_irdp_parent_id
<-
o
.:
"parent_id"
_irdp_parent_id
<-
o
.:
"parent_id"
_irdp_corpus_id
<-
o
.:
"corpus_id"
_irdp_corpus_id
<-
o
.:
"corpus_id"
_irdp_document_export
<-
o
.:
"document_export"
_irdp_documents
<-
o
.:
"documents"
_irdp_work_split
<-
o
.:
"work_split"
pure
ImportRemoteDocumentsPayload
{
..
}
pure
ImportRemoteDocumentsPayload
{
..
}
data
Job
=
data
Job
=
...
@@ -283,5 +303,5 @@ getWorkerMNodeId (PostNodeAsync { _pna_node_id }) = Just _pna_node_id
...
@@ -283,5 +303,5 @@ getWorkerMNodeId (PostNodeAsync { _pna_node_id }) = Just _pna_node_id
getWorkerMNodeId
(
RecomputeGraph
{
_rg_node_id
})
=
Just
_rg_node_id
getWorkerMNodeId
(
RecomputeGraph
{
_rg_node_id
})
=
Just
_rg_node_id
getWorkerMNodeId
(
UpdateNode
{
_un_node_id
})
=
Just
_un_node_id
getWorkerMNodeId
(
UpdateNode
{
_un_node_id
})
=
Just
_un_node_id
getWorkerMNodeId
(
UploadDocument
{
_ud_node_id
})
=
Just
_ud_node_id
getWorkerMNodeId
(
UploadDocument
{
_ud_node_id
})
=
Just
_ud_node_id
getWorkerMNodeId
(
ImportRemoteDocuments
(
ImportRemoteDocumentsPayload
_
_
corpusId
_
))
=
Just
corpusId
getWorkerMNodeId
(
ImportRemoteDocuments
(
ImportRemoteDocumentsPayload
_
_
corpusId
_
_
))
=
Just
corpusId
getWorkerMNodeId
(
ImportRemoteTerms
(
ImportRemoteTermsPayload
listId
_
))
=
Just
listId
getWorkerMNodeId
(
ImportRemoteTerms
(
ImportRemoteTermsPayload
listId
_
))
=
Just
listId
Przemyslaw Kaminski
@cgenie
mentioned in commit
942e663f
·
Jan 29, 2025
mentioned in commit
942e663f
mentioned in commit 942e663f539b287b4cc0469fe2bcf735813b4ff2
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment