gargantext / haskell-gargantext · Commits
Commit 3d9b4e21, authored Dec 13, 2024 by Grégoire Locqueville

Removed dead code in `Core.Text...`

Parent: 91c73c45
Showing 15 changed files with 17 additions and 534 deletions (+17 −534).
Changed files:

  gargantext.cabal                                        +0   −4
  src/Gargantext/Core/Text.hs                             +0   −13
  src/Gargantext/Core/Text/Context.hs                     +0   −29
  src/Gargantext/Core/Text/Corpus.hs                      +2   −19
  src/Gargantext/Core/Text/Corpus/API/Hal.hs              +0   −5
  src/Gargantext/Core/Text/Corpus/API/Isidore.hs          +3   −14
  src/Gargantext/Core/Text/Corpus/API/Pubmed.hs           +0   −2
  src/Gargantext/Core/Text/Corpus/Parsers.hs              +0   −40
  src/Gargantext/Core/Text/Corpus/Parsers/Date.hs         +1   −12
  src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs   +0   −73
  src/Gargantext/Core/Text/Corpus/Parsers/Iramuteq.hs     +5   −6
  src/Gargantext/Core/Text/Corpus/Parsers/Isidore.hs      +0   −141
  src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs          +1   −172
  src/Gargantext/Core/Text/Corpus/Query.hs                +0   −4
  weeder.toml                                             +5   −0
gargantext.cabal

@@ -362,7 +362,6 @@ library
       Gargantext.Core.Text.Corpus.Parsers.FrameWrite
       Gargantext.Core.Text.Corpus.Parsers.GrandDebat
       Gargantext.Core.Text.Corpus.Parsers.Iramuteq
-      Gargantext.Core.Text.Corpus.Parsers.Isidore
       Gargantext.Core.Text.Corpus.Parsers.JSON
       Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
       Gargantext.Core.Text.Corpus.Parsers.RIS

@@ -530,7 +529,6 @@ library
     , haskell-throttle
     , hlcm ^>= 0.2.2
     , hsinfomap ^>= 0.1
-    , hsparql ^>= 0.3.8
     , hstatistics ^>= 0.3.1
     , http-api-data >= 0.5 && < 0.6
     , http-client ^>= 0.7.14

@@ -573,7 +571,6 @@ library
     , quickcheck-instances ^>= 0.3.25.2
     , rake ^>= 0.0.1
     , random ^>= 1.2.1
-    , rdf4h ^>= 3.1.1
     , regex
     , replace-attoparsec ^>= 1.5.0.0
     , resource-pool >= 0.4.0.0 && < 0.5

@@ -604,7 +601,6 @@ library
     , stm-containers >= 1.2.0.3 && < 1.3
     , stringsearch >= 0.3.6.6
     , swagger2 ^>= 2.8.7
-    , tagsoup ^>= 0.14.8
     , template-haskell ^>= 2.19.0.0
     , temporary ^>= 1.3
     , text ^>= 2.0.2
src/Gargantext/Core/Text.hs

@@ -15,7 +15,6 @@ Text gathers terms in unit of contexts.

module Gargantext.Core.Text
  where

import Data.Text ( split )
import Data.Text qualified as DT
import Gargantext.Prelude hiding ( filter )
import NLP.FullStop ( segment )

@@ -85,18 +84,6 @@ instance Collage MultiTerme Mot where

sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt

sentences' :: Text -> [Text]
sentences' txt = split isCharStop txt

isCharStop :: Char -> Bool
isCharStop c = c `elem` ['.', '?', '!']

unsentences :: [Text] -> Text
unsentences txts = DT.unwords txts

-- | Ngrams size
size :: Text -> Int
size t = 1 + DT.count " " t
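
The trailing `size` helper (shown as context above) measures an ngram by counting spaces. A standalone sketch of that behaviour, outside the repo, for reference:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import qualified Data.Text as DT

-- Ngram size: a term with n words contains n - 1 spaces.
size :: Text -> Int
size t = 1 + DT.count " " t

main :: IO ()
main = print (size "natural language processing") -- prints 3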
src/Gargantext/Core/Text/Context.hs

@@ -23,10 +23,7 @@ How to split contexts is describes in this module.

module Gargantext.Core.Text.Context
  where

import Data.Text ( pack, unpack )
import Gargantext.Core.Text
import Gargantext.Prelude hiding ( length )
import Text.HTML.TagSoup ( parseTags, isTagText, Tag(..) )

------------------------------------------------------------------------
type Term = Text

@@ -38,31 +35,5 @@ type TermList = [(Label, [MultiTerm])]

type Sentence a = [a] -- or a nominal group
type Corpus   a = [Sentence a] -- a list of sentences

-- type ConText a = [Sentence a]
-- type Corpus  a = [ConText a]

------------------------------------------------------------------------
-- | Contexts definition to build/unbuild contexts.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int

-- | splitBy contexts of Chars or Sentences or Paragraphs
-- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Core.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy (Chars     n)  = map pack . chunkAlong (n+1) 1 . unpack
splitBy (Sentences n)  = map unsentences . chunkAlong (n+1) 1 . sentences
splitBy (Paragraphs _) = map unTag . filter isTagText . parseTags
  where
    unTag :: IsString p => Tag p -> p
    unTag (TagText x) = x
    unTag _           = ""
src/Gargantext/Core/Text/Corpus.hs

-module Gargantext.Core.Text.Corpus
-  ( makeSubcorpusFromQuery
-  , subcorpusEasy )
-  where
+module Gargantext.Core.Text.Corpus
+  ( makeSubcorpusFromQuery )
+  where

import Control.Lens ( view )
import Data.Set.Internal qualified as Set ( singleton )
import Data.Text qualified as T
import Gargantext.API.Dev ( runCmdReplEasy )
import Gargantext.API.Errors.Types ( BackendInternalError(InternalNodeError) )
import Gargantext.Core ( Lang(EN) )
import Gargantext.Core.NodeStory.Types ( HasNodeStoryEnv )

@@ -18,7 +16,7 @@ import Gargantext.Database.Action.Metrics (updateContextScore, updateNgramsOccur

import Gargantext.Database.Action.Search ( searchInCorpus )
import Gargantext.Database.Action.User ( getUserId )
import Gargantext.Database.Admin.Types.Hyperdata.Corpus ( HyperdataCorpus, hc_lang )
-import Gargantext.Database.Admin.Types.Node ( CorpusId, NodeId(UnsafeMkNodeId), NodeType(..), nodeId2ContextId )
+import Gargantext.Database.Admin.Types.Node ( CorpusId, NodeType(..), nodeId2ContextId )
import Gargantext.Database.Prelude ( DBCmdWithEnv )
import Gargantext.Database.Query.Facet.Types ( facetDoc_id )
import Gargantext.Database.Query.Table.Node ( insertDefaultNode, copyNodeStories, defaultList, getNodeWithType )

@@ -28,21 +26,6 @@ import Gargantext.Database.Schema.Node (node_hyperdata)

import Gargantext.Prelude

--- | A version of the below function for use in the REPL (so you don't need to
--- manually import tons of constructors etc.)
-subcorpusEasy :: Text -- ^ Username
-              -> Int  -- ^ Original corpus ID
-              -> Text -- ^ Search string
-              -> Bool -- ^ Whether to reuse the parent term list (True) or recompute one from scratch (False)
-              -> IO ()
-subcorpusEasy username cId rawQuery reuseParentList = do
-  let eitherQuery = Q.parseQuery $ Q.RawQuery rawQuery
-  case eitherQuery of
-    Left msg    -> print $ "Error parsing query \"" <> rawQuery <> "\": " <> T.pack msg
-    Right query -> void $ runCmdReplEasy $ makeSubcorpusFromQuery
-      (UserName username)
-      (UnsafeMkNodeId cId)
-      query
-      reuseParentList

-- | Given a "parent" corpus and a query, search for all docs in the parent
-- that match the query, and create a corpus from those. The created corpus
-- is inserted in the tree as a child of the parent corpus.
src/Gargantext/Core/Text/Corpus/API/Hal.hs

@@ -23,13 +23,8 @@ import Gargantext.Defaults qualified as Defaults

import Gargantext.Prelude hiding ( intercalate )
import HAL qualified
import HAL.Doc.Document qualified as HAL
import HAL.Types qualified as HAL
import Servant.Client ( ClientError )

get :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int -> IO [HyperdataDocument]
get la q ml = do
  eDocs <- HAL.getMetadataWith [q] (Just 0) (fromIntegral <$> ml) la
  either (panicTrace . pack . show) (mapM (toDoc' la) . HAL._docs) eDocs

getC :: Maybe ISO639.ISO639_1 -> Text -> Maybe Int
     -> IO (Either ClientError (Maybe Integer, ConduitT () HyperdataDocument IO ()))
getC la q ml = do
src/Gargantext/Core/Text/Corpus/API/Isidore.hs

@@ -11,22 +11,18 @@ Portability : POSIX

{-# LANGUAGE ScopedTypeVariables #-}

-module Gargantext.Core.Text.Corpus.API.Isidore
-  ( get
-  -- * Internals (possibly unused?)
-  , isidore2tsvFile
-  ) where
+module Gargantext.Core.Text.Corpus.API.Isidore
+  ( get
+  ) where

import Data.Text qualified as Text
import Gargantext.Core ( Lang(..) )
import Gargantext.Core.Text.Corpus.Parsers ( cleanText )
-import Gargantext.Core.Text.Corpus.Parsers.TSV ( writeDocs2Tsv )
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding ( get )
-import Isidore qualified as Isidore
+import Isidore qualified
import Isidore.Client
import Servant.Client ( ClientError(DecodeFailure) )

@@ -50,13 +46,6 @@ get lang l q a = do

  hDocs <- mapM (isidoreToDoc lang) (toIsidoreDocs iDocs)
  pure hDocs

-isidore2tsvFile :: FilePath -> Lang -> Maybe Isidore.Limit
-                -> Maybe Isidore.TextQuery -> Maybe Isidore.AuthorQuery
-                -> IO ()
-isidore2tsvFile fp lang li tq aq = do
-  hdocs <- get lang li tq aq
-  writeDocs2Tsv fp hdocs

isidoreToDoc :: Lang -> IsidoreDoc -> IO HyperdataDocument
isidoreToDoc lang (IsidoreDoc t a d u s as) = do
  let
src/Gargantext/Core/Text/Corpus/API/Pubmed.hs

@@ -10,12 +10,10 @@ Portability : POSIX
-}

{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}

module Gargantext.Core.Text.Corpus.API.Pubmed
  ( get
  -- * Internals for testing
  , ESearch(..)
  , convertQuery
  , getESearch
  )
src/Gargantext/Core/Text/Corpus/Parsers.hs

@@ -29,7 +29,6 @@ module Gargantext.Core.Text.Corpus.Parsers (
    , cleanText
    , parseFormatC
    , splitOn
-   , etale
    ) where

-- import Gargantext.Core.Text.Learn (detectLangDefault)

@@ -49,7 +48,6 @@ import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))

import Gargantext.Core ( Lang(..) )
import Gargantext.Core.Text.Corpus.Parsers.TSV ( parseHal, parseTsv, parseTsvC )
import Gargantext.Core.Text.Corpus.Parsers.Date qualified as Date
-import Gargantext.Core.Text.Corpus.Parsers.FrameWrite ( text2titleParagraphs )
import Gargantext.Core.Text.Corpus.Parsers.Iramuteq qualified as Iramuteq
import Gargantext.Core.Text.Corpus.Parsers.JSON ( parseJSONC, parseIstex )
import Gargantext.Core.Text.Corpus.Parsers.RIS qualified as RIS

@@ -178,44 +176,6 @@ filterZIPFileNameP Istex f = (takeExtension (unEntrySelector f) == ".json") &&

filterZIPFileNameP _ _ = True

-etale :: [HyperdataDocument] -> [HyperdataDocument]
-etale = concatMap etale'
-  where
-    etale' :: HyperdataDocument -> [HyperdataDocument]
-    etale' h = map (\t -> h { _hd_abstract = Just t })
-             $ map snd
-             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
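
The removed `etale` fanned each document out into one document per seven-sentence paragraph of its abstract. A self-contained sketch of that idea, with a hypothetical `Doc` record standing in for `HyperdataDocument` and naive sentence chunking standing in for `text2titleParagraphs`:

{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import qualified Data.Text as T

-- Hypothetical stand-in for HyperdataDocument.
newtype Doc = Doc { abstract :: Maybe Text } deriving Show

-- One output Doc per chunk of n "sentences" from the input's abstract.
etaleSketch :: Int -> [Doc] -> [Doc]
etaleSketch n = concatMap $ \d ->
  [ d { abstract = Just chunk }
  | chunk <- chunks n (maybe "" id (abstract d)) ]
  where
    -- Naive chunking on ". ", standing in for text2titleParagraphs.
    chunks k = map (T.intercalate ". ") . groupsOf k . T.splitOn ". "
    groupsOf _ [] = []
    groupsOf k xs = take k xs : groupsOf k (drop k xs)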
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat TsvGargV3 bs = pure $ parseTsv' $ DBL.fromStrict bs
-- parseFormat TsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> snd
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
--   pure $ Right docs
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> snd
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
--   pure $ Right docs
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined

-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileType -> FileFormat -> FilePath
src/Gargantext/Core/Text/Corpus/Parsers/Date.hs

@@ -21,16 +21,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"

module Gargantext.Core.Text.Corpus.Parsers.Date
  ( dateSplit
  , mDateSplit
-  , defaultDay
-  , defaultUTCTime
  , split'
  ) where

import Data.List qualified as List
import Data.Text ( unpack, splitOn, replace )
import Data.Time ( defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian )
import Data.Time.Calendar qualified as DTC
-import Data.Time.Clock ( UTCTime(..), secondsToDiffTime )
+import Data.Time.Clock ( UTCTime(..) )
import Gargantext.Prelude hiding ( replace )

------------------------------------------------------------------------

@@ -113,11 +110,3 @@ readDate txt = do

  --let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S")
  let format = cs $ iso8601DateFormat Nothing
  parseTimeM True defaultTimeLocale (unpack format) (cs txt)

-defaultDay :: DTC.Day
-defaultDay = DTC.fromGregorian 1 1 1
-
-defaultUTCTime :: UTCTime
-defaultUTCTime = UTCTime { utctDay     = defaultDay
-                         , utctDayTime = secondsToDiffTime 0 }
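
`readDate` above leans on time's classic format strings. A minimal standalone sketch of the same parse, assuming only `Data.Time` (note `iso8601DateFormat` is deprecated but still available):

import Data.Time (Day, defaultTimeLocale, iso8601DateFormat, parseTimeM)

-- Parse an ISO-8601 date, e.g. "2010-04-12", into a Day.
readDay :: String -> Maybe Day
readDay = parseTimeM True defaultTimeLocale (iso8601DateFormat Nothing)

main :: IO ()
main = print (readDay "2010-04-12") -- Just 2010-04-12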
src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs

@@ -35,39 +35,6 @@ import Text.Parsec.String

-- par défaut: un doc == 1 NodeWrite
-- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.

sample :: Text
sample = unlines [ "title1"
                 -- , "title2"
                 -- , "=="
                 -- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
                 , "date: 2021-09-10"
                 , "source: someSource"
                 , "document contents 1"
                 , "document contents 2"
                 ]

sampleUnordered :: Text
sampleUnordered = unlines [ "title1"
                          , "title2"
                          , "=="
                          , "document contents 1"
                          , "date: 2021-09-10"
                          , "authors: FirstName1, LastName1; FirstName2, LastName2"
                          , "source: someSource"
                          , "document contents 2"
                          ]

-- parseSample = parse documentP "sample" (unpack sample)
-- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)

parseLinesSample :: Either ParseError Parsed
parseLinesSample = parseLines sample

parseLinesSampleUnordered :: Either ParseError Parsed
parseLinesSampleUnordered = parseLines sampleUnordered

data Author = Author { firstName :: Text
                     , lastName  :: Text }

@@ -114,14 +81,6 @@ parseLines text = foldl f emptyParsed <$> lst

    f (Parsed { .. }) (LSource s) = Parsed { source = Just s, .. }
    f (Parsed { .. }) (LTitle  t) = Parsed { title = t, .. }

-- Source should be the name of the node
-- First line of each Context should be the title.
documentLinesP :: Parser [Line]
documentLinesP = do
  t  <- titleP
  ls <- lineP `sepBy` newline
  pure $ [LTitle $ pack t] ++ ls

documentLines :: Parser [Line]
documentLines = do
  ls <- lineP `sepBy` newline

@@ -157,27 +116,6 @@ contentsLineP = do

--------------------
-- documentP = do
--   t <- titleP
--   a <- optionMaybe authorsP
--   d <- optionMaybe dateP
--   s <- optionMaybe sourceP
--   c <- contentsP
--   pure $ Parsed { title = pack t
--                 , authors = fromMaybe [] a
--                 , date = pack <$> d
--                 , source = pack <$> s
--                 , contents = pack c }

titleDelimiterP :: Parser ()
titleDelimiterP = do
  _ <- newline
  -- _ <- try (string "==")
  pure ()

titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP)

authorsPrefixP :: Parser [Char]
authorsPrefixP = do
  _ <- string "authors:"

@@ -225,12 +163,6 @@ sourceP = try sourcePrefixP

  _ <- string "source:"
  many (char ' ')

-- contentsP :: Parser String
-- contentsP = many anyChar

tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof

--- MISC Tools
-- Using ChunkAlong here enable redundancies in short corpora of texts
-- maybe use splitEvery or chunkAlong depending on the size of the whole text

@@ -249,8 +181,3 @@ text2titleParagraphs n = catMaybes

    doTitle :: [Text] -> Maybe (Text, Text)
    doTitle (t:ts) = Just (t, DT.unwords ts)
    doTitle []     = Nothing

clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
src/Gargantext/Core/Text/Corpus/Parsers/Iramuteq.hs

@@ -12,19 +12,18 @@ commentary with @some markup@.
-}

-module Gargantext.Core.Text.Corpus.Parsers.Iramuteq ( parseIramuteqFile, parser, keys ) where
+module Gargantext.Core.Text.Corpus.Parsers.Iramuteq ( parser, keys ) where

import Control.Applicative
import Data.Attoparsec.ByteString ( Parser, takeTill, parseOnly )
import Data.Attoparsec.ByteString.Char8 ( isEndOfLine, takeWhile, endOfLine )
import Data.ByteString ( ByteString )
import Prelude hiding ( takeWhile, take, concat, readFile, lines, concat )
import qualified Data.ByteString as DB

-parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
-parseIramuteqFile fp = do
-  txts <- DB.readFile fp
-  pure $ parseOnly parser txts

-------------------------------------------------------------
parser :: Parser [[(ByteString, ByteString)]]
src/Gargantext/Core/Text/Corpus/Parsers/Isidore.hs (deleted, 100644 → 0)

{-|
Module      : Gargantext.Core.Text.Corpus.Parsers.Isidore
Description : To query French Humanities publication database
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

TODO:
- put endpoint in configuration file
- more flexible fields of research
- type database name
- use more ontologies to help building corpora
-}

{-# OPTIONS_GHC -fno-warn-deprecations #-}
{-# LANGUAGE ScopedTypeVariables #-}

module Gargantext.Core.Text.Corpus.Parsers.Isidore where

import Data.ByteString.Lazy ( ByteString )
import Data.RDF ( Node(LNode, UNode), LValue(PlainLL, TypedL, PlainL) )
import Data.Text qualified as T
import Database.HSparql.Connection ( BindingValue(..), EndPoint, structureContent )
import Database.HSparql.QueryGenerator
import Gargantext.Core ( Lang )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding ( ByteString )
import Network.Wreq ( getWith, Response, defaults, header, param, responseStatus, responseBody )
import Prelude qualified

route :: EndPoint
route = "https://isidore.science/sparql/"

selectQueryRaw' :: Prelude.String -> Prelude.String -> IO (Response ByteString)
selectQueryRaw' uri q = getWith opts uri
  where
    opts = defaults & header "Accept"     .~ ["application/sparql-results+xml"]
                    & header "User-Agent" .~ ["gargantext-hsparql-client"]
                    & param  "query"      .~ [T.pack q]

isidoreGet :: Lang -> Int -> Text -> IO (Maybe [HyperdataDocument])
isidoreGet la li q = do
  bindingValues <- isidoreGet' li q
  case bindingValues of
    Nothing -> pure Nothing
    Just dv -> pure $ Just $ map (bind2doc la) dv

isidoreGet' :: Int -> Text -> IO (Maybe [[BindingValue]])
isidoreGet' l q = do
  let s = createSelectQuery $ isidoreSelect l q
  putStrLn s
  r <- selectQueryRaw' route s
  putStrLn (show $ r ^. responseStatus :: Text)
  pure $ structureContent $ r ^. responseBody
  -- res <- selectQuery route $ simpleSelect q
  -- pure res

isidoreSelect :: Int -> Text -> Query SelectQuery
isidoreSelect lim q = do
  -- See Predefined Namespace Prefixes:
  -- https://isidore.science/sparql?nsdecl
  isidore <- prefix "isidore" (iriRef "http://isidore.science/class/")
  rdf     <- prefix "rdf"     (iriRef "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
  dcterms <- prefix "dcterms" (iriRef "http://purl.org/dc/terms/")
  dc      <- prefix "dc"      (iriRef "http://purl.org/dc/elements/1.1/")
  --iso <- prefix "fra" (iriRef "http://lexvo.org/id/iso639-3/")
  --ore <- prefix "ore" (iriRef "http://www.openarchives.org/ore/terms/")
  --bif <- prefix "bif" (iriRef "bif:")

  link'     <- var
  title     <- var
  date      <- var
  abstract  <- var
  authors   <- var
  source    <- var
  langDoc   <- var
  publisher <- var
  --agg     <- var

  triple_ link' (rdf     .:. "type")        (isidore .:. "Document")
  triple_ link' (dcterms .:. "title")       title
  triple_ link' (dcterms .:. "date")        date
  triple_ link' (dcterms .:. "creator")     authors
  --triple_ link (dcterms .:. "language") langDoc
  triple_ link' (dc      .:. "description") abstract
  --triple_ link (ore .:. "isAggregatedBy") agg
  --triple_ agg (dcterms .:. "title") title

  optional_ $ triple_ link' (dcterms .:. "source")    source
  optional_ $ triple_ link' (dcterms .:. "publisher") publisher

  -- TODO FIX BUG with (.||.) operator
  --filterExpr_ $ (.||.) (contains title q) (contains abstract q)
  --filterExpr_ (containsWith authors q) -- (contains abstract q)
  --filterExpr_ (containsWith title q) -- (contains abstract q)
  --filterExpr_ $ (.||.) (containsWith title q) (contains abstract q)
  filterExpr_ (containsWith title q)

  -- TODO FIX filter with lang
  --filterExpr_ $ langMatches title (str ("fra" :: Text))
  --filterExpr_ $ (.==.) langDoc (str ("http://lexvo.org/id/iso639-3/fra" :: Text))

  orderNextDesc date
  limit_ lim
  distinct_
  selectVars [link', date, langDoc, authors, source, publisher, title, abstract]

-- | TODO : check if all cases are taken into account
unbound :: Lang -> BindingValue -> Maybe Text
unbound _ Unbound                        = Nothing
unbound _ (Bound (UNode x))              = Just x
unbound _ (Bound (LNode (TypedL x _)))   = Just x
unbound _ (Bound (LNode (PlainL x)))     = Just x
unbound l (Bound (LNode (PlainLL x l'))) =
  if l' == T.toLower (show l) then Just x else Nothing
unbound _ _ = Nothing

bind2doc :: Lang -> [BindingValue] -> HyperdataDocument
bind2doc l [link', date, langDoc, authors, _source, publisher, title, abstract] =
  HyperdataDocument { _hd_bdd                = Just "Isidore"
                    , _hd_doi                = Nothing
                    , _hd_url                = unbound l link'
                    , _hd_page               = Nothing
                    , _hd_title              = unbound l title
                    , _hd_authors            = unbound l authors
                    , _hd_institutes         = Nothing
                    , _hd_source             = unbound l publisher
                    , _hd_abstract           = unbound l abstract
                    , _hd_publication_date   = unbound l date
                    , _hd_publication_year   = Nothing
                    , _hd_publication_month  = Nothing
                    , _hd_publication_day    = Nothing
                    , _hd_publication_hour   = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2      = unbound l langDoc
                    , _hd_institutes_tree    = Nothing
                    }
bind2doc _ _ = undefined
src/Gargantext/Core/Text/Corpus/Parsers/TSV.hs

@@ -15,7 +15,6 @@ TSV parser for Gargantext corpus files.

module Gargantext.Core.Text.Corpus.Parsers.TSV
  where

import Conduit ( ConduitT, (.|), yieldMany, mapC )
import Data.ByteString qualified as BS
import Data.ByteString.Lazy qualified as BL
import Data.Csv
import Data.Text ( pack )

@@ -26,23 +25,11 @@ import Data.Text.Read qualified as DTR

import Data.Time.Segment ( jour )
import Data.Vector ( Vector )
import Data.Vector qualified as V
import Gargantext.Core.Text ( sentences, unsentences )
import Gargantext.Core.Text.Context ( splitBy, SplitContext(..) )
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude hiding ( length, show )
import Protolude

---------------------------------------------------------------
headerTsvGargV3 :: Header
headerTsvGargV3 = header [ "title"
                         , "source"
                         , "publication_year"
                         , "publication_month"
                         , "publication_day"
                         , "abstract"
                         , "authors"
                         ]

---------------------------------------------------------------
data TsvGargV3 = TsvGargV3
  { d_docId :: !Int

@@ -55,92 +42,10 @@ data TsvGargV3 = TsvGargV3

  , d_authors :: !Text
  }
  deriving (Show)

---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
toDoc :: TsvGargV3 -> HyperdataDocument
toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
  HyperdataDocument { _hd_bdd                = Just "TSV"
                    , _hd_doi                = Just . pack . show $ did
                    , _hd_url                = Nothing
                    , _hd_page               = Nothing
                    , _hd_title              = Just dt
                    , _hd_authors            = Nothing
                    , _hd_institutes         = Just dau
                    , _hd_source             = Just dab
                    , _hd_abstract           = Nothing
                    , _hd_publication_date   = Nothing
                    , _hd_publication_year   = Just dpy
                    , _hd_publication_month  = Just dpm
                    , _hd_publication_day    = Just dpd
                    , _hd_publication_hour   = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2      = Nothing
                    , _hd_institutes_tree    = Nothing
                    }

---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector TsvDoc -> [TsvGargV3]
toDocs v = V.toList
  $ V.zipWith (\nId (TsvDoc {..}) -- (TsvDoc t s mPy pm pd abst auth)
                -> TsvGargV3 { d_docId             = nId
                             , d_title             = tsv_title
                             , d_source            = tsv_source
                             , d_publication_year  = fromMIntOrDec defaultYear tsv_publication_year
                             , d_publication_month = fromMaybe defaultMonth tsv_publication_month
                             , d_publication_day   = fromMaybe defaultDay tsv_publication_day
                             , d_abstract          = tsv_abstract
                             , d_authors           = tsv_authors })
              (V.enumFromN 1 (V.length v'')) v''
  where
    v''  = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]

---------------------------------------------------------------
fromDocs :: Vector TsvGargV3 -> Vector TsvDoc
fromDocs = V.map fromDocs'
  where
    fromDocs' (TsvGargV3 {..}) = TsvDoc { tsv_title             = d_title
                                        , tsv_source            = d_source
                                        , tsv_publication_year  = Just $ IntOrDec d_publication_year
                                        , tsv_publication_month = Just d_publication_month
                                        , tsv_publication_day   = Just d_publication_day
                                        , tsv_abstract          = d_abstract
                                        , tsv_authors           = d_authors }

---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> TsvDoc -> Vector TsvDoc
splitDoc m splt doc =
  let docSize = (T.length $ tsv_abstract doc) in
  if (docSize > 1000) && (mod (round m) docSize >= 10)
    then splitDoc' splt doc
    else V.fromList [doc]
  where
    splitDoc' :: SplitContext -> TsvDoc -> Vector TsvDoc
    splitDoc' contextSize (TsvDoc {..}) = V.fromList $ [firstDoc] <> nextDocs
      where
        firstDoc      = TsvDoc { tsv_abstract = firstAbstract, .. }
        firstAbstract = head' "splitDoc'1" abstracts
        nextDocs = map (\txt -> TsvDoc { tsv_title    = head' "splitDoc'2" $ sentences txt
                                       , tsv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
                                       , .. })
                       (tail' "splitDoc'2" abstracts)
        abstracts = (splitBy $ contextSize) tsv_abstract

---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double

docsSize :: Vector TsvDoc -> Mean
docsSize tsvDoc = mean ls
  where
    ls = V.toList $ V.map (fromIntegral . T.length . tsv_abstract) tsvDoc

---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

@@ -195,28 +100,11 @@ instance ToNamedRecord TsvDoc where

    , "authors" .= tsv_authors
    ]

hyperdataDocument2tsvDoc :: HyperdataDocument -> TsvDoc
hyperdataDocument2tsvDoc h = TsvDoc { tsv_title             = m $ _hd_title h
                                    , tsv_source            = m $ _hd_source h
                                    , tsv_publication_year  = Just $ IntOrDec $ mI $ _hd_publication_year h
                                    , tsv_publication_month = Just $ mI $ _hd_publication_month h
                                    , tsv_publication_day   = Just $ mI $ _hd_publication_day h
                                    , tsv_abstract          = m $ _hd_abstract h
                                    , tsv_authors           = m $ _hd_authors h }
  where
    m  = maybe "" identity
    mI = maybe 0 identity

data Delimiter = Tab | Comma | Line
  deriving (Eq, Show)

tsvDecodeOptions :: Delimiter -> DecodeOptions
tsvDecodeOptions d = defaultDecodeOptions { decDelimiter = delimiter d }

tsvEncodeOptions :: Delimiter -> EncodeOptions
tsvEncodeOptions d = defaultEncodeOptions { encDelimiter = delimiter d }

delimiter :: Delimiter -> Word8
delimiter Tab   = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
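
The decode options above plug directly into cassava's named-record decoder. A minimal sketch, assuming only the cassava API already used in this file:

import Data.Char (ord)
import qualified Data.ByteString.Lazy as BL
import Data.Csv (DecodeOptions(..), FromNamedRecord, decodeByNameWith, defaultDecodeOptions)
import Data.Vector (Vector)

-- Decode headered, tab-separated input into any FromNamedRecord type,
-- dropping the Header just as readByteStringLazy's callers do with snd.
decodeTsv :: FromNamedRecord a => BL.ByteString -> Either String (Vector a)
decodeTsv = fmap snd . decodeByNameWith opts
  where
    opts = defaultDecodeOptions { decDelimiter = fromIntegral (ord '\t') }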
@@ -368,35 +256,6 @@ getHeaders bl del = do

------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Text (Header, Vector a))
readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile

readFileStrict :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Text (Header, Vector a))
readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile

readByteStringLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> BL.ByteString -> Either Text (Header, Vector a)
readByteStringLazy _f d bs = first pack $ decodeByNameWith (tsvDecodeOptions d) bs

readByteStringStrict :: (FromNamedRecord a) => proxy a -> Delimiter -> BS.ByteString -> Either Text (Header, Vector a)
readByteStringStrict d ff = readByteStringLazy d ff . BL.fromStrict

------------------------------------------------------------------------
-- | TODO use readFileLazy
readTSVFile :: FilePath -> IO (Either Text (Header, Vector TsvDoc))
readTSVFile fp = do

@@ -424,20 +283,6 @@ readTsvHal fp = do

readTsvHalLazyBS :: BL.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalLazyBS bs = first pack $ decodeByNameWith (tsvDecodeOptions Tab) bs

readTsvHalBSStrict :: BS.ByteString -> Either Text (Header, Vector TsvHal)
readTsvHalBSStrict bs = readTsvHalLazyBS $ BL.fromStrict bs

------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector TsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $ encodeByNameWith (tsvEncodeOptions Tab) h (V.toList vs)

writeDocs2Tsv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Tsv fp hs = BL.writeFile fp $ hyperdataDocument2tsv hs

hyperdataDocument2tsv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2tsv hs = encodeByNameWith (tsvEncodeOptions Tab) headerTsvGargV3 (map hyperdataDocument2tsvDoc hs)

------------------------------------------------------------------------
-- Hal Format
data TsvHal = TsvHal

@@ -575,27 +420,11 @@ parseHal fp = do

  r <- readTsvHal fp
  pure $ V.toList . V.map tsvHal2doc . snd <$> r

parseHal' :: BL.ByteString -> Either Text [HyperdataDocument]
parseHal' bs = V.toList . V.map tsvHal2doc . snd <$> readTsvHalLazyBS bs

------------------------------------------------------------------------
parseTsv :: FilePath -> IO (Either Text [HyperdataDocument])
parseTsv fp = fmap (V.toList . V.map tsv2doc . snd) <$> readTSVFile fp

{-
parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = (V.toList . V.map tsv2doc . snd) <$> readTsvLazyBS Comma bs
-}

parseTsv' :: BL.ByteString -> Either Text [HyperdataDocument]
parseTsv' bs = do
  let result = case (testCorrectFile bs) of
        Left _err -> Left _err
        Right del -> readTsvLazyBS del bs
  V.toList . V.map tsv2doc . snd <$> result

parseTsvC :: BL.ByteString -> Either Text (Integer, ConduitT () HyperdataDocument Identity ())
parseTsvC bs =
src/Gargantext/Core/Text/Corpus/Query.hs

@@ -8,7 +8,6 @@ module Gargantext.Core.Text.Corpus.Query (

  , QueryTerm(..)
  , getQuery
  , parseQuery
  , mapQuery
-  , renderQuery
  , renderQueryTerm
  , interpretQuery

@@ -130,6 +129,3 @@ parseQuery (RawQuery txt) = bimap show (Query . BoolExpr.boolTreeToCNF) $

-renderQuery :: Query -> RawQuery
-renderQuery (Query cnf) = RawQuery . T.pack $ BoolExpr.boolExprPrinter (showsPrec 0) (BoolExpr.fromCNF cnf) ""

mapQuery :: (QueryTerm -> QueryTerm) -> Query -> Query
mapQuery f = Query . fmap (map f) . getQuery
weeder.toml

@@ -22,6 +22,11 @@ roots = [ '^Main\.main$'

          , '^Gargantext\.API\.Ngrams\.List\.importTsvFile$'
+         # Used by the tests
+         , '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.convertQuery$'
+         , '^Gargantext\.Core\.Text\.Corpus\.API\.Pubmed\.getESearch$'
          # Template Haskell
          # Weeder is not smart enough to know what functions will be used by