Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
aa7e1142
Commit
aa7e1142
authored
May 12, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/203-dev-corpus-json-import' into dev
parents
19c6f5ad
65fa2834
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
101 additions
and
11 deletions
+101
-11
package.yaml
package.yaml
+1
-0
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+2
-1
Types.hs
src/Gargantext/API/Node/Corpus/New/Types.hs
+3
-1
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+13
-6
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+3
-3
JSON.hs
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
+79
-0
No files found.
package.yaml
View file @
aa7e1142
...
...
@@ -82,6 +82,7 @@ library:
-
Gargantext.Core.Text.Corpus.Parsers
-
Gargantext.Core.Text.Corpus.Parsers.CSV
-
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
-
Gargantext.Core.Text.Corpus.Parsers.JSON
-
Gargantext.Core.Text.List.Formats.CSV
-
Gargantext.Core.Text.Metrics
-
Gargantext.Core.Text.Metrics.CharByChar
...
...
src/Gargantext/API/Node/Corpus/New.hs
View file @
aa7e1142
...
...
@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
markProgress
1
jobHandle
void
$
flowDataText
user
txt
(
Multi
l
)
cid
(
Just
flw
)
jobHandle
-- printDebug "corpus id" cids
-- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail
user
-- TODO ...
...
...
@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
WOS
->
Parser
.
parseFormatC
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormatC
Parser
.
RisPresse
Iramuteq
->
Parser
.
parseFormatC
Parser
.
Iramuteq
JSON
->
Parser
.
parseFormatC
Parser
.
JSON
-- TODO granularity of the logStatus
let
data
'
=
case
ff
of
...
...
src/Gargantext/API/Node/Corpus/New/Types.hs
View file @
aa7e1142
...
...
@@ -15,6 +15,7 @@ data FileType = CSV
|
PresseRIS
|
WOS
|
Iramuteq
|
JSON
deriving
(
Eq
,
Show
,
Generic
)
instance
ToSchema
FileType
instance
Arbitrary
FileType
where
arbitrary
=
elements
[
CSV
,
PresseRIS
]
...
...
@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where
parseUrlPiece
"PresseRis"
=
pure
PresseRIS
parseUrlPiece
"WOS"
=
pure
WOS
parseUrlPiece
"Iramuteq"
=
pure
Iramuteq
parseUrlPiece
_
=
panic
"[G.A.A.Node.Corpus.New] File Type not implemented (yet)"
parseUrlPiece
"JSON"
=
pure
JSON
parseUrlPiece
s
=
panic
$
"[G.A.A.Node.Corpus.New] File Type not implemented (yet): "
<>
s
instance
ToHttpApiData
FileType
where
toUrlPiece
=
pack
.
show
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
aa7e1142
...
...
@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second)
import
Gargantext.API.Node.Corpus.New.Types
(
FileFormat
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseCsv
,
parseCsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.JSON
(
parseJSONC
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Query.Table.Ngrams
(
NgramsType
(
..
))
...
...
@@ -79,7 +80,8 @@ data FileType = WOS
|
CsvGargV3
|
CsvHal
|
Iramuteq
deriving
(
Show
)
|
JSON
deriving
(
Show
,
Eq
)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
...
...
@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do
)
<$>
eDocs
parseFormatC
JSON
Plain
bs
=
do
let
eParsedC
=
parseJSONC
$
DBL
.
fromStrict
bs
case
eParsedC
of
Left
err
->
pure
$
Left
err
Right
(
mLen
,
parsedC
)
->
pure
$
Right
(
mLen
,
transPipe
(
pure
.
runIdentity
)
parsedC
)
parseFormatC
ft
ZIP
bs
=
do
path
<-
liftBase
$
emptySystemTempFile
"parsed-zip"
liftBase
$
DB
.
writeFile
path
bs
...
...
@@ -154,7 +162,7 @@ parseFormatC ft ZIP bs = do
pure
$
Right
(
Just
totalLength
,
sequenceConduits
contents'
>>
pure
()
)
-- .| mapM_C (printDebug "[parseFormatC] doc")
_
->
pure
$
Left
$
unpack
$
intercalate
"
\n
"
$
pack
<$>
errs
parseFormatC
_
_
_
=
undefined
...
...
@@ -211,7 +219,7 @@ parseFile WOS Plain p = do
parseFile
Iramuteq
Plain
p
=
do
docs
<-
join
$
mapM
((
toDoc
Iramuteq
)
.
(
map
(
second
(
Text
.
replace
"_"
" "
))))
<$>
snd
<$>
enrichWith
Iramuteq
<$>
enrichWith
Iramuteq
<$>
readFileWith
Iramuteq
p
pure
$
Right
docs
...
...
@@ -226,7 +234,7 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d
let
lang
=
EN
-- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let
dateToParse
=
DT
.
replace
" "
""
<$>
lookup
"PY"
d
-- <> Just " " <> lookup "publication_date" d
let
dateToParse
=
DT
.
replace
" "
""
<$>
lookup
"PY"
d
-- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
Date
.
dateSplit
lang
dateToParse
...
...
@@ -314,9 +322,8 @@ clean txt = DBC.map clean' txt
clean'
';'
=
'.'
clean'
c
=
c
--
--
splitOn
::
NgramsType
->
Maybe
Text
->
Text
->
[
Text
]
splitOn
Authors
(
Just
"WOS"
)
=
(
DT
.
splitOn
"; "
)
splitOn
_
_
=
(
DT
.
splitOn
", "
)
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
aa7e1142
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.CSV
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -378,7 +378,7 @@ instance ToNamedRecord CsvHal where
,
"instStructId_i"
.=
csvHal_instStructId_i
,
"deptStructId_i"
.=
csvHal_deptStructId_i
,
"labStructId_i"
.=
csvHal_labStructId_i
,
"rteamStructId_i"
.=
csvHal_rteamStructId_i
,
"docType_s"
.=
csvHal_docType_s
]
...
...
@@ -472,7 +472,7 @@ parseCsvC bs = do
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
$
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
$
snd
r
,
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
)
Right
r
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
$
snd
r
,
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
)
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
0 → 100644
View file @
aa7e1142
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.JSON
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser for Gargantext corpus files.
-}
{-# LANGUAGE DuplicateRecordFields #-}
module
Gargantext.Core.Text.Corpus.Parsers.JSON
where
import
Conduit
import
Data.Aeson
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Either
(
Either
(
..
))
import
Data.Text
import
GHC.Generics
import
qualified
Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import
Gargantext.Prelude
hiding
(
length
)
data
JSONStruct
=
JSONStruct
{
documents
::
[
JSONStructDocument
]
,
garg_version
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONStruct
data
JSONStructDocument
=
JSONStructDocument
{
document
::
JSONDocument
,
ngrams
::
JSONNgrams
,
hash
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONStructDocument
data
JSONDocument
=
JSONDocument
{
id
::
Int
,
hash_id
::
Maybe
Text
,
typename
::
Int
,
user_id
::
Int
,
parent_id
::
Maybe
Int
,
name
::
Text
,
date
::
Text
,
hyperdata
::
HyperdataDocument
}
deriving
(
Generic
)
instance
FromJSON
JSONDocument
data
JSONNgrams
=
JSONNgrams
{
ngrams
::
[
Text
]
,
hash
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
JSONStruct
readJSONLazyBS
bs
=
eitherDecode
bs
parseJSONC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseJSONC
bs
=
do
case
readJSONLazyBS
bs
of
Left
err
->
Left
err
Right
(
JSONStruct
{
documents
})
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
documents
,
yieldMany
documents
.|
mapC
doc2hyperdoc
)
doc2hyperdoc
::
JSONStructDocument
->
HyperdataDocument
doc2hyperdoc
(
JSONStructDocument
{
document
=
JSONDocument
{
hyperdata
}
})
=
hyperdata
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment