Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
aa7e1142
Commit
aa7e1142
authored
May 12, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge remote-tracking branch 'origin/203-dev-corpus-json-import' into dev
parents
19c6f5ad
65fa2834
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
101 additions
and
11 deletions
+101
-11
package.yaml
package.yaml
+1
-0
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+2
-1
Types.hs
src/Gargantext/API/Node/Corpus/New/Types.hs
+3
-1
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+13
-6
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+3
-3
JSON.hs
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
+79
-0
No files found.
package.yaml
View file @
aa7e1142
...
...
@@ -82,6 +82,7 @@ library:
-
Gargantext.Core.Text.Corpus.Parsers
-
Gargantext.Core.Text.Corpus.Parsers.CSV
-
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
-
Gargantext.Core.Text.Corpus.Parsers.JSON
-
Gargantext.Core.Text.List.Formats.CSV
-
Gargantext.Core.Text.Metrics
-
Gargantext.Core.Text.Metrics.CharByChar
...
...
src/Gargantext/API/Node/Corpus/New.hs
View file @
aa7e1142
...
...
@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
markProgress
1
jobHandle
void
$
flowDataText
user
txt
(
Multi
l
)
cid
(
Just
flw
)
jobHandle
-- printDebug "corpus id" cids
-- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail
user
-- TODO ...
...
...
@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
WOS
->
Parser
.
parseFormatC
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormatC
Parser
.
RisPresse
Iramuteq
->
Parser
.
parseFormatC
Parser
.
Iramuteq
JSON
->
Parser
.
parseFormatC
Parser
.
JSON
-- TODO granularity of the logStatus
let
data
'
=
case
ff
of
...
...
src/Gargantext/API/Node/Corpus/New/Types.hs
View file @
aa7e1142
...
...
@@ -15,6 +15,7 @@ data FileType = CSV
|
PresseRIS
|
WOS
|
Iramuteq
|
JSON
deriving
(
Eq
,
Show
,
Generic
)
instance
ToSchema
FileType
instance
Arbitrary
FileType
where
arbitrary
=
elements
[
CSV
,
PresseRIS
]
...
...
@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where
parseUrlPiece
"PresseRis"
=
pure
PresseRIS
parseUrlPiece
"WOS"
=
pure
WOS
parseUrlPiece
"Iramuteq"
=
pure
Iramuteq
parseUrlPiece
_
=
panic
"[G.A.A.Node.Corpus.New] File Type not implemented (yet)"
parseUrlPiece
"JSON"
=
pure
JSON
parseUrlPiece
s
=
panic
$
"[G.A.A.Node.Corpus.New] File Type not implemented (yet): "
<>
s
instance
ToHttpApiData
FileType
where
toUrlPiece
=
pack
.
show
...
...
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
aa7e1142
...
...
@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second)
import
Gargantext.API.Node.Corpus.New.Types
(
FileFormat
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseCsv
,
parseCsvC
)
import
Gargantext.Core.Text.Corpus.Parsers.JSON
(
parseJSONC
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Query.Table.Ngrams
(
NgramsType
(
..
))
...
...
@@ -79,7 +80,8 @@ data FileType = WOS
|
CsvGargV3
|
CsvHal
|
Iramuteq
deriving
(
Show
)
|
JSON
deriving
(
Show
,
Eq
)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
...
...
@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do
)
<$>
eDocs
parseFormatC
JSON
Plain
bs
=
do
let
eParsedC
=
parseJSONC
$
DBL
.
fromStrict
bs
case
eParsedC
of
Left
err
->
pure
$
Left
err
Right
(
mLen
,
parsedC
)
->
pure
$
Right
(
mLen
,
transPipe
(
pure
.
runIdentity
)
parsedC
)
parseFormatC
ft
ZIP
bs
=
do
path
<-
liftBase
$
emptySystemTempFile
"parsed-zip"
liftBase
$
DB
.
writeFile
path
bs
...
...
@@ -154,7 +162,7 @@ parseFormatC ft ZIP bs = do
pure
$
Right
(
Just
totalLength
,
sequenceConduits
contents'
>>
pure
()
)
-- .| mapM_C (printDebug "[parseFormatC] doc")
_
->
pure
$
Left
$
unpack
$
intercalate
"
\n
"
$
pack
<$>
errs
parseFormatC
_
_
_
=
undefined
...
...
@@ -211,7 +219,7 @@ parseFile WOS Plain p = do
parseFile
Iramuteq
Plain
p
=
do
docs
<-
join
$
mapM
((
toDoc
Iramuteq
)
.
(
map
(
second
(
Text
.
replace
"_"
" "
))))
<$>
snd
<$>
enrichWith
Iramuteq
<$>
enrichWith
Iramuteq
<$>
readFileWith
Iramuteq
p
pure
$
Right
docs
...
...
@@ -226,7 +234,7 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d
let
lang
=
EN
-- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let
dateToParse
=
DT
.
replace
" "
""
<$>
lookup
"PY"
d
-- <> Just " " <> lookup "publication_date" d
let
dateToParse
=
DT
.
replace
" "
""
<$>
lookup
"PY"
d
-- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
Date
.
dateSplit
lang
dateToParse
...
...
@@ -314,9 +322,8 @@ clean txt = DBC.map clean' txt
clean'
';'
=
'.'
clean'
c
=
c
--
--
splitOn
::
NgramsType
->
Maybe
Text
->
Text
->
[
Text
]
splitOn
Authors
(
Just
"WOS"
)
=
(
DT
.
splitOn
"; "
)
splitOn
_
_
=
(
DT
.
splitOn
", "
)
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
aa7e1142
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.CSV
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
...
...
@@ -378,7 +378,7 @@ instance ToNamedRecord CsvHal where
,
"instStructId_i"
.=
csvHal_instStructId_i
,
"deptStructId_i"
.=
csvHal_deptStructId_i
,
"labStructId_i"
.=
csvHal_labStructId_i
,
"rteamStructId_i"
.=
csvHal_rteamStructId_i
,
"docType_s"
.=
csvHal_docType_s
]
...
...
@@ -472,7 +472,7 @@ parseCsvC bs = do
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
$
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
$
snd
r
,
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
)
Right
r
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
$
snd
r
,
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
)
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
0 → 100644
View file @
aa7e1142
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.JSON
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser for Gargantext corpus files.
-}
{-# LANGUAGE DuplicateRecordFields #-}
module
Gargantext.Core.Text.Corpus.Parsers.JSON
where
import
Conduit
import
Data.Aeson
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Either
(
Either
(
..
))
import
Data.Text
import
GHC.Generics
import
qualified
Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import
Gargantext.Prelude
hiding
(
length
)
data
JSONStruct
=
JSONStruct
{
documents
::
[
JSONStructDocument
]
,
garg_version
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONStruct
data
JSONStructDocument
=
JSONStructDocument
{
document
::
JSONDocument
,
ngrams
::
JSONNgrams
,
hash
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONStructDocument
data
JSONDocument
=
JSONDocument
{
id
::
Int
,
hash_id
::
Maybe
Text
,
typename
::
Int
,
user_id
::
Int
,
parent_id
::
Maybe
Int
,
name
::
Text
,
date
::
Text
,
hyperdata
::
HyperdataDocument
}
deriving
(
Generic
)
instance
FromJSON
JSONDocument
data
JSONNgrams
=
JSONNgrams
{
ngrams
::
[
Text
]
,
hash
::
Text
}
deriving
(
Generic
)
instance
FromJSON
JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
JSONStruct
readJSONLazyBS
bs
=
eitherDecode
bs
parseJSONC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseJSONC
bs
=
do
case
readJSONLazyBS
bs
of
Left
err
->
Left
err
Right
(
JSONStruct
{
documents
})
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
documents
,
yieldMany
documents
.|
mapC
doc2hyperdoc
)
doc2hyperdoc
::
JSONStructDocument
->
HyperdataDocument
doc2hyperdoc
(
JSONStructDocument
{
document
=
JSONDocument
{
hyperdata
}
})
=
hyperdata
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment