gargantext / haskell-gargantext
Commit ee32691f, authored Apr 15, 2020 by Alexandre Delanoë

[API|Query] WIP need to fit query with frontend

Parent: 2b13b734
Pipeline #823: failed
Showing 5 changed files with 143 additions and 125 deletions (+143 -125)
Types.hs   (src/Gargantext/API/Admin/Orchestrator/Types.hs)    +3  -9
New.hs     (src/Gargantext/API/Corpus/New.hs)                  +9  -8
Flow.hs    (src/Gargantext/Database/Action/Flow.hs)          +105 -83
API.hs     (src/Gargantext/Text/Corpus/API.hs)                +10 -13
Terms.hs   (src/Gargantext/Text/Terms.hs)                     +16 -12
src/Gargantext/API/Admin/Orchestrator/Types.hs

@@ -36,15 +36,9 @@ instance Arbitrary a => Arbitrary (JobOutput a) where

 -- | Main Types
 data ExternalAPIs = All
                   | PubMed
-                  | HAL_EN
-                  | HAL_FR
-                  | IsTex_EN
-                  | IsTex_FR
-                  | Isidore_EN
-                  | Isidore_FR
+                  | HAL
+                  | IsTex
+                  | Isidore
                   -- | IsidoreAuth
   deriving (Show, Eq, Enum, Bounded, Generic)
...
...
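Aside for readers following the refactor: collapsing the per-language constructors into plain HAL / IsTex / Isidore keeps the type small, and the derived Enum and Bounded instances are what let callers enumerate every source (the API.externalAPIs list used in New.hs is most likely built that way; its definition is not part of this diff, so that is an assumption). A minimal standalone sketch of the pattern, using a local stand-in type rather than the Gargantext module:

-- Standalone sketch (not part of the commit): a local stand-in for the
-- simplified ExternalAPIs type.
data ExternalAPIs = All | PubMed | HAL | IsTex | Isidore
  deriving (Show, Eq, Enum, Bounded)

-- Enum + Bounded give the full list of sources for free, so adding a
-- constructor automatically extends every listing built this way.
externalAPIs :: [ExternalAPIs]
externalAPIs = [minBound .. maxBound]

main :: IO ()
main = print externalAPIs   -- [All,PubMed,HAL,IsTex,Isidore]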
src/Gargantext/API/Corpus/New.hs
@@ -38,10 +38,9 @@ import Gargantext.API.Corpus.New.File
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Types.Individu (UserId, User(..))
 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
-import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, flowCorpusSearchInDatabase)
+import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..))
 import Gargantext.Database.Admin.Types.Node (CorpusId, ToHyperdataDocument(..))
 import Gargantext.Prelude
-import Gargantext.Text.Terms (TermType(..))
 import Servant
 import Servant.API.Flatten (Flat)
 import Servant.Job.Core
...
...
@@ -65,8 +64,9 @@ deriveJSON (unPrefix "query_") 'Query
 instance Arbitrary Query where
     arbitrary = elements [ Query q n fs
-                         | q  <- ["honeybee* AND collopase", "covid 19"]
+                         | q  <- ["honeybee* AND collapse", "covid 19"]
                          , n  <- [0..10]
                          , fs <- take 3 $ repeat API.externalAPIs
                          ]
...
...
@@ -119,8 +119,8 @@ info _u = pure $ ApiInfo API.externalAPIs
 ------------------------------------------------------------------------
 data WithQuery = WithQuery
   { _wq_query     :: !Text
-  , _wq_databases :: ![ExternalAPIs]
-  , _wq_lang      :: !(Maybe Lang)
+  , _wq_databases :: ![DataOrigin]
+  , _wq_lang      :: !(Maybe (TermType Lang))
   }
   deriving Generic
...
...
@@ -192,7 +192,7 @@ addToCorpusWithQuery :: FlowCmdM env err m
            -> WithQuery
            -> (ScraperStatus -> m ())
            -> m ScraperStatus
-addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
+addToCorpusWithQuery u cid (WithQuery q dbs l) logStatus = do
   -- TODO ...
   logStatus ScraperStatus { _scst_succeeded = Just 10
                           , _scst_failed    = Just 2
...
...
@@ -204,7 +204,8 @@ addToCorpusWithQuery u cid (WithQuery q _dbs l) logStatus = do
   -- TODO if cid is folder -> create Corpus
   --      if cid is corpus -> add to corpus
   --      if cid is root   -> create corpus in Private
-  cids <- flowCorpusSearchInDatabase u (maybe EN identity l) q
+  txts <- mapM (\db -> getDataText db (fromMaybe (Multi EN) l) q (Just 10000)) dbs
+  cids <- mapM (\txt -> flowDataText u txt (fromMaybe (Multi EN) l) cid) txts
   printDebug "corpus id" cids
   -- TODO ...
   pure ScraperStatus { _scst_succeeded = Just 137
...
...
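Since the commit message notes the query still has to be fitted to the frontend, here is a standalone sketch of the JSON shape a WithQuery-like record produces when the "_wq_" prefix is stripped from the field names. The real module's Aeson options are not shown in this diff, so the prefix handling (and the stand-in types) below are assumptions for illustration only:

{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson
import qualified Data.ByteString.Lazy.Char8 as BL
import GHC.Generics (Generic)

-- Standalone stand-ins for the types referenced by WithQuery.
data DataOrigin = InternalOrigin | ExternalOrigin String
  deriving (Show, Generic)
instance ToJSON DataOrigin

data TermType lang = Mono lang | Multi lang
  deriving (Show, Generic)
instance ToJSON lang => ToJSON (TermType lang)

data Lang = EN | FR deriving (Show, Generic)
instance ToJSON Lang

-- Mirror of the new WithQuery shape, with the "_wq_" prefix stripped from
-- the JSON field names (assumed to match what the frontend is expected to send).
data WithQuery = WithQuery
  { _wq_query     :: String
  , _wq_databases :: [DataOrigin]
  , _wq_lang      :: Maybe (TermType Lang)
  } deriving (Show, Generic)

instance ToJSON WithQuery where
  toJSON = genericToJSON defaultOptions { fieldLabelModifier = drop 4 }

main :: IO ()
main = BL.putStrLn $ encode $
  WithQuery "honeybee* AND collapse" [ExternalOrigin "PubMed"] (Just (Multi EN))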
src/Gargantext/Database/Action/Flow.hs
@@ -26,24 +26,32 @@ Portability : POSIX
 {-# LANGUAGE InstanceSigs      #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE TemplateHaskell   #-}

 module Gargantext.Database.Action.Flow
 -- (flowDatabase, ngrams2list)
   ( FlowCmdM
+  , getDataText
+  , flowDataText
   , flowCorpusFile
   , flowCorpus
-  , flowCorpusSearchInDatabase
-  , flowAnnuaire
   , getOrMkRoot
   , getOrMk_RootWithCorpus
+  , flowAnnuaire
+  , TermType(..)
+  , DataOrigin(..)
   )
     where

-import Control.Lens ((^.), view, _Just)
+import Control.Lens ((^.), view, _Just, makeLenses)
 import Data.Aeson.TH (deriveJSON)
 import Data.Either
 import Data.List (concat)
 import Data.Map (Map, lookup)
 import Data.Maybe (Maybe(..), catMaybes)
 import Data.Monoid
 import Data.Swagger
 import Data.Text (Text, splitOn, intercalate)
 import Data.Traversable (traverse)
 import Data.Tuple.Extra (first, second)
...
...
@@ -69,46 +77,91 @@ import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsInd
 import Gargantext.Database.Schema.NodeNgrams (listInsertDb, getCgramsId)
 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
 import Gargantext.Ext.IMT (toSchoolName)
 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
 import Gargantext.Prelude
 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
 import Gargantext.Text.List (buildNgramsLists, StopSize(..))
-import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
+import qualified Gargantext.Text.Terms as GTT (TermType(..), tt_lang, extractTerms, uniText)
 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
 import GHC.Generics (Generic)
 import Prelude (String)
 import System.FilePath (FilePath)
 import qualified Data.List as List
 import qualified Data.Map as Map
 import qualified Data.Text as Text
 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
 import qualified Gargantext.Text.Corpus.API as API
 ------------------------------------------------------------------------
 -- TODO use internal with API name (could be old data)
 data DataOrigin = Internal Gargantext
                 | External API.ExternalAPIs
                 -- TODO Web

 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
 -- | APIs
 -- TODO instances
 getDataApi :: Lang
            -> Maybe Limit
            -> ApiQuery
            -> IO [HyperdataDocument]
 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
 getDataApi lang limit (ApiIsidoreAuth  q) = Isidore.get lang limit Nothing  (Just q)

 data DataText = DataOld ![NodeId]
               | DataNew ![[HyperdataDocument]]
 -- UNUSED
 _flowCorpusApi :: (FlowCmdM env err m)
                => User -> Either CorpusName [CorpusId]

 -- TODO use the split parameter in config file
 getDataText :: FlowCmdM env err m
             => DataOrigin
             -> TermType Lang
-            -> Maybe Limit
-            -> ApiQuery
+            -> API.Query
+            -> Maybe API.Limit
             -> m DataText
 getDataText (External api) la q li = liftBase $ DataNew
                                   <$> splitEvery 500
                                   <$> API.get api (_tt_lang la) q li

 getDataText Gargantext la q li = do
   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                            (UserName userMaster)
                                            (Left "")
                                            (Nothing :: Maybe HyperdataCorpus)
   ids <- map fst <$> searchInDatabase cId (stemIt q)
   pure $ DataOld ids
 -------------------------------------------------------------------------------
 -- API for termType
 data TermType lang = Mono      { _tt_lang :: lang }
                    | Multi     { _tt_lang :: lang }
                    | MonoMulti { _tt_lang :: lang }
                    | Unsupervised { _tt_lang       :: lang
                                   , _tt_windowSize :: Int
                                   , _tt_ngramsSize :: Int
                                   }
   deriving Generic

 -- | GTT.TermType as a complex type in Unsupervised configuration that is not needed
 -- for the API use
 tta2tt :: TermType lang -> GTT.TermType lang
 tta2tt (Mono l)                 = GTT.Mono l
 tta2tt (Multi l)                = GTT.Multi l
 tta2tt (MonoMulti l)            = GTT.MonoMulti l
 tta2tt (Unsupervised la w ng)   = GTT.Unsupervised la w ng Nothing

 makeLenses ''TermType
 deriveJSON (unPrefix "_tt_") ''TermType

 instance (ToSchema a) => ToSchema (TermType a) where
   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_tta_")
 flowDataText :: FlowCmdM env err m
              => User
              -> DataText
              -> TermType Lang
              -> CorpusId
              -> m CorpusId

 _flowCorpusApi u n tt l q = do
   docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
   flowCorpus u n tt docs

 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
   where
     corpusType = (Nothing :: Maybe HyperdataCorpus)
 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
 ------------------------------------------------------------------------
 -- TODO use proxy
 flowAnnuaire :: FlowCmdM env err m
              => User
              -> Either CorpusName [CorpusId]
...
...
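The core of this hunk is that Flow.hs now carries its own, lighter TermType for the API and converts it to the internal GTT.TermType (which also carries the Unsupervised model) via tta2tt, defaulting the model to Nothing. A standalone sketch of that API-type-to-internal-type conversion, with local stand-ins instead of the Gargantext types:

-- Standalone sketch (not the Gargantext modules themselves): an API-facing
-- type without the heavy Unsupervised model, converted into the internal
-- type that carries it.

-- Placeholder for the Eleve tries carried by the internal type.
data Tries = Tries deriving Show

-- API-facing term type: no model field.
data ApiTermType lang
  = ApiMono lang
  | ApiMulti lang
  | ApiUnsupervised lang Int Int
  deriving Show

-- Internal term type: Unsupervised additionally carries an optional model.
data TermType lang
  = Mono lang
  | Multi lang
  | Unsupervised lang Int Int (Maybe Tries)
  deriving Show

-- The conversion defaults the model to Nothing; it is filled in later
-- (cf. withLang in Flow.hs) only when actually needed.
toInternal :: ApiTermType lang -> TermType lang
toInternal (ApiMono l)              = Mono l
toInternal (ApiMulti l)             = Multi l
toInternal (ApiUnsupervised l w ng) = Unsupervised l w ng Nothing

main :: IO ()
main = print (toInternal (ApiUnsupervised "EN" 10 3))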
@@ -118,10 +171,11 @@ flowAnnuaire :: FlowCmdM env err m
 flowAnnuaire u n l filePath = do
   docs <- liftBase $ ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
   flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs

 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 flowCorpusFile :: FlowCmdM env err m
                => User
                -> Either CorpusName [CorpusId]
                -> Limit -- Limit the number of docs (for dev purpose)
                -> TermType Lang
                -> FileFormat
                -> FilePath
                -> m CorpusId
...
...
@@ -132,43 +186,17 @@ flowCorpusFile u n l la ff fp = do
                        )
   flowCorpus u n la (map (map toHyperdataDocument) docs)

 -- TODO query with complex query
 flowCorpusSearchInDatabase :: FlowCmdM env err m
                            => User
                            -> Lang
                            -> Text
                            -> m CorpusId
 flowCorpusSearchInDatabase u la q = do
   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                            (UserName userMaster)
                                            (Left "")
                                            (Nothing :: Maybe HyperdataCorpus)
   ids <- map fst <$> searchInDatabase cId (stemIt q)
   flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids

 -- UNUSED
 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
                                => User
                                -> Lang
                                -> Text
                                -> m CorpusId
 _flowCorpusSearchInDatabaseApi u la q = do
   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                            (UserName userMaster)
                                            (Left "")
                                            (Nothing :: Maybe HyperdataCorpus)
   ids <- map fst <$> searchInDatabase cId (stemIt q)
   flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids

 ------------------------------------------------------------------------
 -- | TODO improve the needed type to create/update a corpus
 --   (For now, Either is enough)
 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
            => User
            -> Either CorpusName [CorpusId]
            -> TermType Lang
            -> [[a]]
            -> m CorpusId
 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)

 ------------------------------------------------------------------------
 -- | TODO improve the needed type to create/update a corpus
 {- UNUSED
 data UserInfo = Username Text
               | UserId NodeId
 data CorpusInfo = CorpusName Lang Text
                 | CorpusId Lang NodeId
 -}
 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
      => Maybe c
...
...
@@ -178,16 +206,9 @@ flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
      -> [[a]]
      -> m CorpusId
 flow c u cn la docs = do
-  ids <- traverse (insertMasterDocs c la) docs
-  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
-
-flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
-           => User
-           -> Either CorpusName [CorpusId]
-           -> TermType Lang
-           -> [[a]]
-           -> m CorpusId
-flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
+  let la' = tta2tt la
+  ids <- traverse (insertMasterDocs c la') docs
+  flowCorpusUser (la' ^. GTT.tt_lang) u cn c (concat ids)
------------------------------------------------------------------------
 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
...
...
@@ -230,7 +251,7 @@ insertMasterDocs :: ( FlowCmdM env err m
                     , MkCorpus c
                     )
                  => Maybe c
-                 -> TermType Lang
+                 -> GTT.TermType Lang
                  -> [a]
                  -> m [DocId]
 insertMasterDocs c lang hs = do
...
...
@@ -278,15 +299,16 @@ insertMasterDocs c lang hs = do
   pure ids'

-withLang :: HasText a => TermType Lang
+withLang :: HasText a => GTT.TermType Lang
          -> [DocumentWithId a]
-         -> TermType Lang
-withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
+         -> GTT.TermType Lang
+withLang (GTT.Unsupervised l n s m) ns = GTT.Unsupervised l n s m'
   where
     m' = case m of
       Nothing -> trace ("buildTries here" :: String)
                $ Just
-               $ buildTries n ( fmap toToken
-                              $ uniText
+               $ buildTries n ( fmap toToken
+                              $ GTT.uniText
                               $ Text.intercalate " . "
                               $ List.concat
                               $ map hasText ns
...
...
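withLang's job in this hunk is unchanged: when the Unsupervised mode arrives without a model, build one from the documents themselves (now through the qualified GTT functions). A standalone sketch of that fill-in-the-missing-model step; Model, buildModel and the document representation are stand-ins, not the Eleve tries used by the real code:

-- Standalone sketch of the withLang pattern: build the expensive default
-- only when the caller did not supply one.

newtype Model = Model [String] deriving Show

-- Placeholder for buildTries/toToken/uniText: here the "model" is just the
-- distinct words of the corpus.
buildModel :: [String] -> Model
buildModel docs =
  Model (foldr (\w acc -> if w `elem` acc then acc else w : acc) [] (concatMap words docs))

data TermType
  = Mono
  | Unsupervised { windowSize :: Int, model :: Maybe Model }
  deriving Show

withDocs :: TermType -> [String] -> TermType
withDocs (Unsupervised n Nothing) docs = Unsupervised n (Just (buildModel docs))
withDocs t _ = t  -- a model was given (or none is needed): keep the TermType as-is

main :: IO ()
main = print $ withDocs (Unsupervised 5 Nothing) ["bees collapse", "covid 19"]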
@@ -329,7 +351,7 @@ instance ExtractNgramsT HyperdataContact
   where
     extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
       where
-        extract :: TermType Lang -> HyperdataContact
+        extract :: GTT.TermType Lang -> HyperdataContact
                 -> Cmd err (Map Ngrams (Map NgramsType Int))
         extract _l hc' = do
           let authors = map text2ngrams
...
...
@@ -346,12 +368,12 @@ instance HasText HyperdataDocument
 instance ExtractNgramsT HyperdataDocument
   where
-    extractNgramsT :: TermType Lang
+    extractNgramsT :: GTT.TermType Lang
                    -> HyperdataDocument
                    -> Cmd err (Map Ngrams (Map NgramsType Int))
     extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
       where
-        extractNgramsT' :: TermType Lang
+        extractNgramsT' :: GTT.TermType Lang
                         -> HyperdataDocument
                         -> Cmd err (Map Ngrams (Map NgramsType Int))
         extractNgramsT' lang' doc = do
...
...
@@ -370,7 +392,7 @@ instance ExtractNgramsT HyperdataDocument
           terms' <- map text2ngrams
                  <$> map (intercalate " " . _terms_label)
                  <$> concat
-                 <$> liftBase (extractTerms lang' $ hasText doc)
+                 <$> liftBase (GTT.extractTerms lang' $ hasText doc)

           pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                              <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
...
...
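Taken together, Flow.hs now splits the work that flowCorpusApi / flowCorpusSearchInDatabase used to do into two phases: getDataText fetches (or looks up) documents per DataOrigin, and flowDataText flows the result into a corpus, which is what the new addToCorpusWithQuery in New.hs maps over. A standalone sketch of that two-phase shape, with stand-in types and IO in place of the FlowCmdM monad:

-- Standalone sketch of the new two-phase flow: fetch per origin, then flow
-- each result into the corpus. All types and functions are stand-ins.

data DataOrigin = Internal | External String deriving Show

-- Either previously indexed documents (by id) or freshly fetched ones.
data DataText = DataOld [Int] | DataNew [[String]] deriving Show

type CorpusId = Int

-- Phase 1: fetch, without touching the corpus.
getDataText :: DataOrigin -> String -> IO DataText
getDataText Internal       _q = pure (DataOld [1, 2, 3])           -- e.g. a stemmed DB search
getDataText (External api)  q = pure (DataNew [[api ++ ": " ++ q]]) -- e.g. an HTTP call

-- Phase 2: flow whatever phase 1 produced into an existing corpus.
flowDataText :: CorpusId -> DataText -> IO CorpusId
flowDataText cid (DataOld ids)  = do putStrLn ("linking old docs " ++ show ids); pure cid
flowDataText cid (DataNew docs) = do putStrLn ("inserting " ++ show (length docs) ++ " batch(es)"); pure cid

main :: IO ()
main = do
  let origins = [Internal, External "PubMed"]
  txts <- mapM (\o -> getDataText o "honeybee* AND collapse") origins
  cids <- mapM (flowDataText 42) txts
  print cids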
src/Gargantext/Text/Corpus/API.hs
@@ -34,20 +34,17 @@ import qualified Gargantext.Text.Corpus.API.Istex as ISTEX
 import qualified Gargantext.Text.Corpus.API.Pubmed as PUBMED

 -- | Get External API metadata main function
-get :: ExternalAPIs -> Query -> Maybe Limit -> IO [HyperdataDocument]
+get :: ExternalAPIs -> Lang -> Query -> Maybe Limit -> IO [HyperdataDocument]
+get PubMed  _la q l = PUBMED.get q l
+-- EN only by default
+get HAL      la q l = HAL.get    la q l
+get IsTex    la q l = ISTEX.get  la q l
+get Isidore  la q l = ISIDORE.get la (fromIntegral <$> l) (Just q) Nothing
+get _ _ _ _ = undefined
-get PubMed     q l = PUBMED.get      q l
-get HAL_EN     q l = HAL.get      EN q l
-get HAL_FR     q l = HAL.get      FR q l
-get IsTex_EN   q l = ISTEX.get    EN q l
-get IsTex_FR   q l = ISTEX.get    FR q l
-get Isidore_EN q l = ISIDORE.get  EN (fromIntegral <$> l) (Just q) Nothing
-get Isidore_FR q l = ISIDORE.get  FR (fromIntegral <$> l) (Just q) Nothing
-get _ _ _ = undefined

 -- | Some Sugar for the documentation
 type Query = PUBMED.Query
 type Limit = PUBMED.Limit
...
...
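Collapsing the *_EN / *_FR constructors into a single Lang argument means each backend now decides what to do with the language. A standalone sketch of that dispatch, with stand-in backend functions in place of PUBMED.get / HAL.get / ISTEX.get:

-- Standalone sketch of the new dispatch: one constructor per source, with
-- the language passed alongside instead of baked into the constructor.

data Lang = EN | FR deriving Show
data ExternalAPIs = PubMed | HAL | IsTex deriving Show

type Query = String
type Limit = Int

pubmedGet ::         Query -> Maybe Limit -> IO [String]
pubmedGet q _ = pure ["pubmed:" ++ q]               -- EN only, ignores the language

halGet    :: Lang -> Query -> Maybe Limit -> IO [String]
halGet la q _ = pure ["hal(" ++ show la ++ "):" ++ q]

istexGet  :: Lang -> Query -> Maybe Limit -> IO [String]
istexGet la q _ = pure ["istex(" ++ show la ++ "):" ++ q]

get :: ExternalAPIs -> Lang -> Query -> Maybe Limit -> IO [String]
get PubMed _la q l = pubmedGet   q l   -- EN only by default
get HAL     la q l = halGet   la q l
get IsTex   la q l = istexGet la q l

main :: IO ()
main = mapM_ (\api -> get api FR "covid 19" (Just 10) >>= print) [PubMed, HAL, IsTex]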
src/Gargantext/Text/Terms.hs
@@ -30,41 +30,45 @@ compute graph
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE DeriveGeneric     #-}
 {-# LANGUAGE TemplateHaskell   #-}

 module Gargantext.Text.Terms
   where

 import Control.Lens
+import Data.Aeson.TH (deriveJSON)
+import Data.Swagger
 import Data.Text (Text)
 import Data.Traversable
 import GHC.Base (String)
-import Gargantext.Prelude
+import GHC.Generics (Generic)
+import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
 import Gargantext.Core
 import Gargantext.Core.Types
-import Gargantext.Text.Terms.Multi (multiterms)
+import Gargantext.Prelude
+import Gargantext.Text (sentences)
+import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
 import Gargantext.Text.Terms.Mono (monoTerms)
 import Gargantext.Text.Terms.Mono.Stem (stem)
-import qualified Data.Set as Set
+import Gargantext.Text.Terms.Mono.Token.En (tokenize)
+import Gargantext.Text.Terms.Multi (multiterms)
 import qualified Data.List as List
+import qualified Data.Set as Set
 import qualified Data.Text as Text
-import Gargantext.Text (sentences)
-import Gargantext.Text.Terms.Mono.Token.En (tokenize)
-import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)

 data TermType lang = Mono      { _tt_lang :: lang }
                    | Multi     { _tt_lang :: lang }
                    | MonoMulti { _tt_lang :: lang }
                    | Unsupervised { _tt_lang       :: lang
                                   , _tt_windowSize :: Int
                                   , _tt_ngramsSize :: Int
                                   , _tt_model      :: Maybe (Tries Token ())
                                   }
-makeLenses ''TermType
+  deriving Generic
+
+makeLenses ''TermType

 --group :: [Text] -> [Text]
 --group = undefined
...
...
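Flow.hs reads the language back out with la' ^. GTT.tt_lang, i.e. through the lenses that makeLenses ''TermType generates here. A standalone sketch of what makeLenses produces, using a single-constructor stand-in; in the real multi-constructor TermType, fields present in every constructor (such as _tt_lang) still get lenses, while constructor-specific fields (such as _tt_model) become traversals:

{-# LANGUAGE TemplateHaskell #-}

import Control.Lens (makeLenses, (^.), (.~), (&))

-- Standalone stand-in record; fields follow the "_tt_" naming convention so
-- makeLenses derives the lens names by dropping the leading underscore.
data TermType lang = TermType
  { _tt_lang       :: lang
  , _tt_windowSize :: Int
  } deriving Show

makeLenses ''TermType
-- generates tt_lang and tt_windowSize

main :: IO ()
main = do
  let tt = TermType "EN" 10
  print (tt ^. tt_lang)              -- read through the generated lens
  print (tt & tt_windowSize .~ 5)    -- update through it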