Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
ee32691f
Commit
ee32691f
authored
Apr 15, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[API|Query] WIP need to fit query with frontend
parent
2b13b734
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
143 additions
and
125 deletions
+143
-125
Types.hs
src/Gargantext/API/Admin/Orchestrator/Types.hs
+3
-9
New.hs
src/Gargantext/API/Corpus/New.hs
+9
-8
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+105
-83
API.hs
src/Gargantext/Text/Corpus/API.hs
+10
-13
Terms.hs
src/Gargantext/Text/Terms.hs
+16
-12
No files found.
src/Gargantext/API/Admin/Orchestrator/Types.hs
View file @
ee32691f
...
...
@@ -36,15 +36,9 @@ instance Arbitrary a => Arbitrary (JobOutput a) where
-- | Main Types
data
ExternalAPIs
=
All
|
PubMed
|
HAL_EN
|
HAL_FR
|
IsTex_EN
|
IsTex_FR
|
Isidore_EN
|
Isidore_FR
|
HAL
|
IsTex
|
Isidore
-- | IsidoreAuth
deriving
(
Show
,
Eq
,
Enum
,
Bounded
,
Generic
)
...
...
src/Gargantext/API/Corpus/New.hs
View file @
ee32691f
...
...
@@ -38,10 +38,9 @@ import Gargantext.API.Corpus.New.File
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Types.Individu
(
UserId
,
User
(
..
))
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Database.Action.Flow
(
FlowCmdM
,
flowCorpus
,
flowCorpusSearchInDatabase
)
import
Gargantext.Database.Action.Flow
(
FlowCmdM
,
flowCorpus
,
getDataText
,
flowDataText
,
TermType
(
..
),
DataOrigin
(
..
)
)
import
Gargantext.Database.Admin.Types.Node
(
CorpusId
,
ToHyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Servant
import
Servant.API.Flatten
(
Flat
)
import
Servant.Job.Core
...
...
@@ -65,8 +64,9 @@ deriveJSON (unPrefix "query_") 'Query
instance
Arbitrary
Query
where
arbitrary
=
elements
[
Query
q
n
fs
|
q
<-
[
"honeybee* AND collopase"
,
"covid 19"
]
|
q
<-
[
"honeybee* AND collapse"
,
"covid 19"
]
,
n
<-
[
0
..
10
]
,
fs
<-
take
3
$
repeat
API
.
externalAPIs
]
...
...
@@ -119,8 +119,8 @@ info _u = pure $ ApiInfo API.externalAPIs
------------------------------------------------------------------------
data
WithQuery
=
WithQuery
{
_wq_query
::
!
Text
,
_wq_databases
::
!
[
ExternalAPIs
]
,
_wq_lang
::
!
(
Maybe
Lang
)
,
_wq_databases
::
!
[
DataOrigin
]
,
_wq_lang
::
!
(
Maybe
(
TermType
Lang
)
)
}
deriving
Generic
...
...
@@ -192,7 +192,7 @@ addToCorpusWithQuery :: FlowCmdM env err m
->
WithQuery
->
(
ScraperStatus
->
m
()
)
->
m
ScraperStatus
addToCorpusWithQuery
u
cid
(
WithQuery
q
_
dbs
l
)
logStatus
=
do
addToCorpusWithQuery
u
cid
(
WithQuery
q
dbs
l
)
logStatus
=
do
-- TODO ...
logStatus
ScraperStatus
{
_scst_succeeded
=
Just
10
,
_scst_failed
=
Just
2
...
...
@@ -204,7 +204,8 @@ addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
cids
<-
flowCorpusSearchInDatabase
u
(
maybe
EN
identity
l
)
q
txts
<-
mapM
(
\
db
->
getDataText
db
(
fromMaybe
(
Multi
EN
)
l
)
q
(
Just
10000
))
dbs
cids
<-
mapM
(
\
txt
->
flowDataText
u
txt
(
fromMaybe
(
Multi
EN
)
l
)
cid
)
txts
printDebug
"corpus id"
cids
-- TODO ...
pure
ScraperStatus
{
_scst_succeeded
=
Just
137
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
ee32691f
...
...
@@ -26,24 +26,32 @@ Portability : POSIX
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Database.Action.Flow
-- (flowDatabase, ngrams2list)
(
FlowCmdM
,
getDataText
,
flowDataText
,
flowCorpusFile
,
flowCorpus
,
flowCorpusSearchInDatabase
,
flowAnnuaire
,
getOrMkRoot
,
getOrMk_RootWithCorpus
,
flowAnnuaire
,
TermType
(
..
)
,
DataOrigin
(
..
)
)
where
import
Control.Lens
((
^.
),
view
,
_Just
)
import
Control.Lens
((
^.
),
view
,
_Just
,
makeLenses
)
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Either
import
Data.List
(
concat
)
import
Data.Map
(
Map
,
lookup
)
import
Data.Maybe
(
Maybe
(
..
),
catMaybes
)
import
Data.Monoid
import
Data.Swagger
import
Data.Text
(
Text
,
splitOn
,
intercalate
)
import
Data.Traversable
(
traverse
)
import
Data.Tuple.Extra
(
first
,
second
)
...
...
@@ -69,46 +77,91 @@ import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsInd
import
Gargantext.Database.Schema.NodeNgrams
(
listInsertDb
,
getCgramsId
)
import
Gargantext.Database.Schema.NodeNodeNgrams2
-- (NodeNodeNgrams2, insertNodeNodeNgrams2)
import
Gargantext.Ext.IMT
(
toSchoolName
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Ext.IMTUser
(
deserialiseImtUsersFromFile
)
import
Gargantext.Prelude
import
Gargantext.Text.Corpus.Parsers
(
parseFile
,
FileFormat
)
import
Gargantext.Text.List
(
buildNgramsLists
,
StopSize
(
..
))
import
Gargantext.Text.Terms
(
TermType
(
..
),
tt_lang
,
extractTerms
,
uniText
)
import
qualified
Gargantext.Text.Terms
as
GTT
(
TermType
(
..
),
tt_lang
,
extractTerms
,
uniText
)
import
Gargantext.Text.Terms.Eleve
(
buildTries
,
toToken
)
import
Gargantext.Text.Terms.Mono.Stem.En
(
stemIt
)
import
GHC.Generics
(
Generic
)
import
Prelude
(
String
)
import
System.FilePath
(
FilePath
)
import
qualified
Data.List
as
List
import
qualified
Data.Map
as
Map
import
qualified
Data.Text
as
Text
import
qualified
Gargantext.Database.Action.Query.Node.Document.Add
as
Doc
(
add
)
import
qualified
Gargantext.Text.Corpus.API
.Isidore
as
Isidore
import
qualified
Gargantext.Text.Corpus.API
as
API
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data
DataOrigin
=
Internal
Gargantext
|
External
API
.
ExternalAPIs
-- TODO Web
data
ApiQuery
=
ApiIsidoreQuery
Text
|
ApiIsidoreAuth
Text
-- | APIs
-- TODO instances
getDataApi
::
Lang
->
Maybe
Limit
->
ApiQuery
->
IO
[
HyperdataDocument
]
getDataApi
lang
limit
(
ApiIsidoreQuery
q
)
=
Isidore
.
get
lang
limit
(
Just
q
)
Nothing
getDataApi
lang
limit
(
ApiIsidoreAuth
q
)
=
Isidore
.
get
lang
limit
Nothing
(
Just
q
)
data
DataText
=
DataOld
!
[
NodeId
]
|
DataNew
!
[[
HyperdataDocument
]]
--
UNUSED
_flowCorpusApi
::
(
FlowCmdM
env
err
m
)
=>
User
->
Either
CorpusName
[
CorpusId
]
--
TODO use the split parameter in config file
getDataText
::
FlowCmdM
env
err
m
=>
DataOrigin
->
TermType
Lang
->
Maybe
Limit
->
ApiQuery
->
API
.
Query
->
Maybe
API
.
Limit
->
m
DataText
getDataText
(
External
api
)
la
q
li
=
liftBase
$
DataNew
<$>
splitEvery
500
<$>
API
.
get
api
(
_tt_lang
la
)
q
li
getDataText
Gargantext
la
q
li
=
do
(
_masterUserId
,
_masterRootId
,
cId
)
<-
getOrMk_RootWithCorpus
(
UserName
userMaster
)
(
Left
""
)
(
Nothing
::
Maybe
HyperdataCorpus
)
ids
<-
map
fst
<$>
searchInDatabase
cId
(
stemIt
q
)
pure
$
DataOld
ids
-------------------------------------------------------------------------------
-- API for termType
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
|
Multi
{
_tt_lang
::
lang
}
|
MonoMulti
{
_tt_lang
::
lang
}
|
Unsupervised
{
_tt_lang
::
lang
,
_tt_windowSize
::
Int
,
_tt_ngramsSize
::
Int
}
deriving
Generic
-- | GTT.TermType as a complex type in Unsupervised configuration that is not needed
-- for the API use
tta2tt
::
TermType
lang
->
GTT
.
TermType
lang
tta2tt
(
Mono
l
)
=
GTT
.
Mono
l
tta2tt
(
Multi
l
)
=
GTT
.
Multi
l
tta2tt
(
MonoMulti
l
)
=
GTT
.
MonoMulti
l
tta2tt
(
Unsupervised
la
w
ng
)
=
GTT
.
Unsupervised
la
w
ng
Nothing
makeLenses
''
T
ermType
deriveJSON
(
unPrefix
"_tt_"
)
''
T
ermType
instance
(
ToSchema
a
)
=>
ToSchema
(
TermType
a
)
where
declareNamedSchema
=
genericDeclareNamedSchema
(
unPrefixSwagger
"_tta_"
)
flowDataText
::
FlowCmdM
env
err
m
=>
User
->
DataText
->
TermType
Lang
->
CorpusId
->
m
CorpusId
_flowCorpusApi
u
n
tt
l
q
=
do
docs
<-
liftBase
$
splitEvery
500
<$>
getDataApi
(
_tt_lang
tt
)
l
q
flowCorpus
u
n
tt
docs
flowDataText
u
(
DataOld
ids
)
tt
cid
=
flowCorpusUser
(
_tt_lang
tt
)
u
(
Right
[
cid
])
corpusType
ids
where
corpusType
=
(
Nothing
::
Maybe
HyperdataCorpus
)
flowDataText
u
(
DataNew
txt
)
tt
cid
=
flowCorpus
u
(
Right
[
cid
])
tt
txt
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire
::
FlowCmdM
env
err
m
=>
User
->
Either
CorpusName
[
CorpusId
]
...
...
@@ -118,10 +171,11 @@ flowAnnuaire :: FlowCmdM env err m
flowAnnuaire
u
n
l
filePath
=
do
docs
<-
liftBase
$
((
splitEvery
500
<$>
deserialiseImtUsersFromFile
filePath
)
::
IO
[[
HyperdataContact
]])
flow
(
Nothing
::
Maybe
HyperdataAnnuaire
)
u
n
l
docs
------------------------------------------------------------------------
------------------------------------------------------------------------
flowCorpusFile
::
FlowCmdM
env
err
m
=>
User
->
Either
CorpusName
[
CorpusId
]
=>
User
->
Either
CorpusName
[
CorpusId
]
->
Limit
-- Limit the number of docs (for dev purpose)
->
TermType
Lang
->
FileFormat
->
FilePath
->
m
CorpusId
...
...
@@ -132,43 +186,17 @@ flowCorpusFile u n l la ff fp = do
)
flowCorpus
u
n
la
(
map
(
map
toHyperdataDocument
)
docs
)
-- TODO query with complex query
flowCorpusSearchInDatabase
::
FlowCmdM
env
err
m
=>
User
->
Lang
->
Text
->
m
CorpusId
flowCorpusSearchInDatabase
u
la
q
=
do
(
_masterUserId
,
_masterRootId
,
cId
)
<-
getOrMk_RootWithCorpus
(
UserName
userMaster
)
(
Left
""
)
(
Nothing
::
Maybe
HyperdataCorpus
)
ids
<-
map
fst
<$>
searchInDatabase
cId
(
stemIt
q
)
flowCorpusUser
la
u
(
Left
q
)
(
Nothing
::
Maybe
HyperdataCorpus
)
ids
-- UNUSED
_flowCorpusSearchInDatabaseApi
::
FlowCmdM
env
err
m
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus
::
(
FlowCmdM
env
err
m
,
FlowCorpus
a
)
=>
User
->
Lang
->
Text
->
Either
CorpusName
[
CorpusId
]
->
TermType
Lang
->
[[
a
]]
->
m
CorpusId
_flowCorpusSearchInDatabaseApi
u
la
q
=
do
(
_masterUserId
,
_masterRootId
,
cId
)
<-
getOrMk_RootWithCorpus
(
UserName
userMaster
)
(
Left
""
)
(
Nothing
::
Maybe
HyperdataCorpus
)
ids
<-
map
fst
<$>
searchInDatabase
cId
(
stemIt
q
)
flowCorpusUser
la
u
(
Left
q
)
(
Nothing
::
Maybe
HyperdataCorpus
)
ids
flowCorpus
=
flow
(
Nothing
::
Maybe
HyperdataCorpus
)
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
{- UNUSED
data UserInfo = Username Text
| UserId NodeId
data CorpusInfo = CorpusName Lang Text
| CorpusId Lang NodeId
-}
flow
::
(
FlowCmdM
env
err
m
,
FlowCorpus
a
,
MkCorpus
c
)
=>
Maybe
c
...
...
@@ -178,16 +206,9 @@ flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
->
[[
a
]]
->
m
CorpusId
flow
c
u
cn
la
docs
=
do
ids
<-
traverse
(
insertMasterDocs
c
la
)
docs
flowCorpusUser
(
la
^.
tt_lang
)
u
cn
c
(
concat
ids
)
flowCorpus
::
(
FlowCmdM
env
err
m
,
FlowCorpus
a
)
=>
User
->
Either
CorpusName
[
CorpusId
]
->
TermType
Lang
->
[[
a
]]
->
m
CorpusId
flowCorpus
=
flow
(
Nothing
::
Maybe
HyperdataCorpus
)
let
la'
=
tta2tt
la
ids
<-
traverse
(
insertMasterDocs
c
la'
)
docs
flowCorpusUser
(
la'
^.
GTT
.
tt_lang
)
u
cn
c
(
concat
ids
)
------------------------------------------------------------------------
flowCorpusUser
::
(
FlowCmdM
env
err
m
,
MkCorpus
c
)
...
...
@@ -230,7 +251,7 @@ insertMasterDocs :: ( FlowCmdM env err m
,
MkCorpus
c
)
=>
Maybe
c
->
TermType
Lang
->
GTT
.
TermType
Lang
->
[
a
]
->
m
[
DocId
]
insertMasterDocs
c
lang
hs
=
do
...
...
@@ -278,15 +299,16 @@ insertMasterDocs c lang hs = do
pure
ids'
withLang
::
HasText
a
=>
TermType
Lang
withLang
::
HasText
a
=>
GTT
.
TermType
Lang
->
[
DocumentWithId
a
]
->
TermType
Lang
withLang
(
Unsupervised
l
n
s
m
)
ns
=
Unsupervised
l
n
s
m'
->
GTT
.
TermType
Lang
withLang
(
GTT
.
Unsupervised
l
n
s
m
)
ns
=
GTT
.
Unsupervised
l
n
s
m'
where
m'
=
case
m
of
Nothing
->
trace
(
"buildTries here"
::
String
)
$
Just
$
buildTries
n
(
fmap
toToken
$
uniText
$
buildTries
n
(
fmap
toToken
$
GTT
.
uniText
$
Text
.
intercalate
" . "
$
List
.
concat
$
map
hasText
ns
...
...
@@ -329,7 +351,7 @@ instance ExtractNgramsT HyperdataContact
where
extractNgramsT
l
hc
=
filterNgramsT
255
<$>
extract
l
hc
where
extract
::
TermType
Lang
->
HyperdataContact
extract
::
GTT
.
TermType
Lang
->
HyperdataContact
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extract
_l
hc'
=
do
let
authors
=
map
text2ngrams
...
...
@@ -346,12 +368,12 @@ instance HasText HyperdataDocument
instance
ExtractNgramsT
HyperdataDocument
where
extractNgramsT
::
TermType
Lang
extractNgramsT
::
GTT
.
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT
lang
hd
=
filterNgramsT
255
<$>
extractNgramsT'
lang
hd
where
extractNgramsT'
::
TermType
Lang
extractNgramsT'
::
GTT
.
TermType
Lang
->
HyperdataDocument
->
Cmd
err
(
Map
Ngrams
(
Map
NgramsType
Int
))
extractNgramsT'
lang'
doc
=
do
...
...
@@ -370,7 +392,7 @@ instance ExtractNgramsT HyperdataDocument
terms'
<-
map
text2ngrams
<$>
map
(
intercalate
" "
.
_terms_label
)
<$>
concat
<$>
liftBase
(
extractTerms
lang'
$
hasText
doc
)
<$>
liftBase
(
GTT
.
extractTerms
lang'
$
hasText
doc
)
pure
$
Map
.
fromList
$
[(
source
,
Map
.
singleton
Sources
1
)]
<>
[(
i'
,
Map
.
singleton
Institutes
1
)
|
i'
<-
institutes
]
...
...
src/Gargantext/Text/Corpus/API.hs
View file @
ee32691f
...
...
@@ -34,20 +34,17 @@ import qualified Gargantext.Text.Corpus.API.Istex as ISTEX
import
qualified
Gargantext.Text.Corpus.API.Pubmed
as
PUBMED
-- | Get External API metadata main function
get
::
ExternalAPIs
->
Query
->
Maybe
Limit
->
IO
[
HyperdataDocument
]
get
::
ExternalAPIs
->
Lang
->
Query
->
Maybe
Limit
->
IO
[
HyperdataDocument
]
get
PubMed
_la
q
l
=
PUBMED
.
get
q
l
-- EN only by default
get
HAL
la
q
l
=
HAL
.
get
la
q
l
get
IsTex
la
q
l
=
ISTEX
.
get
la
q
l
get
Isidore
la
q
l
=
ISIDORE
.
get
la
(
fromIntegral
<$>
l
)
(
Just
q
)
Nothing
get
_
_
_
_
=
undefined
get
PubMed
q
l
=
PUBMED
.
get
q
l
get
HAL_EN
q
l
=
HAL
.
get
EN
q
l
get
HAL_FR
q
l
=
HAL
.
get
FR
q
l
get
IsTex_EN
q
l
=
ISTEX
.
get
EN
q
l
get
IsTex_FR
q
l
=
ISTEX
.
get
FR
q
l
get
Isidore_EN
q
l
=
ISIDORE
.
get
EN
(
fromIntegral
<$>
l
)
(
Just
q
)
Nothing
get
Isidore_FR
q
l
=
ISIDORE
.
get
FR
(
fromIntegral
<$>
l
)
(
Just
q
)
Nothing
get
_
_
_
=
undefined
-- | Some Sugar for the documentation
type
Query
=
PUBMED
.
Query
type
Limit
=
PUBMED
.
Limit
...
...
src/Gargantext/Text/Terms.hs
View file @
ee32691f
...
...
@@ -30,41 +30,45 @@ compute graph
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-}
module
Gargantext.Text.Terms
where
import
Control.Lens
import
Data.Aeson.TH
(
deriveJSON
)
import
Data.Swagger
import
Data.Text
(
Text
)
import
Data.Traversable
import
GHC.Base
(
String
)
import
Gargantext.
Prelude
import
GHC.Generics
(
Generic
)
import
Gargantext.
Core.Utils.Prefix
(
unPrefix
,
unPrefixSwagger
)
import
Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Prelude
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Eleve
(
mainEleveWith
,
Tries
,
Token
,
buildTries
,
toToken
)
import
Gargantext.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Text.Terms.Mono.Stem
(
stem
)
import
qualified
Data.Set
as
Set
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Multi
(
multiterms
)
import
qualified
Data.List
as
List
import
qualified
Data.Set
as
Set
import
qualified
Data.Text
as
Text
import
Gargantext.Text
(
sentences
)
import
Gargantext.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Text.Terms.Eleve
(
mainEleveWith
,
Tries
,
Token
,
buildTries
,
toToken
)
data
TermType
lang
=
Mono
{
_tt_lang
::
lang
}
|
Multi
{
_tt_lang
::
lang
}
|
MonoMulti
{
_tt_lang
::
lang
}
|
Unsupervised
{
_tt_lang
::
lang
,
_tt_windo
Size
::
Int
,
_tt_windo
wSize
::
Int
,
_tt_ngramsSize
::
Int
,
_tt_model
::
Maybe
(
Tries
Token
()
)
}
makeLenses
''
T
ermType
deriving
Generic
makeLenses
''
T
ermType
--group :: [Text] -> [Text]
--group = undefined
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment