Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
3853062d
Commit
3853062d
authored
Jan 10, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] FrameWrite Corpus improvement
parent
a3b5c3c5
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
21 deletions
+52
-21
DocumentsFromWriteNodes.hs
src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
+11
-7
FrameWrite.hs
src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
+41
-14
No files found.
src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
View file @
3853062d
...
...
@@ -38,6 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Query.Table.Node
(
getChildrenByType
,
getClosestParentIdByType'
,
getNodeWith
)
import
Gargantext.Database.Schema.Node
(
node_hyperdata
)
import
qualified
Data.List
as
List
import
qualified
Gargantext.Defaults
as
Defaults
import
Gargantext.Prelude
import
Gargantext.Utils.Jobs
(
serveJobsAPI
)
...
...
@@ -97,18 +98,19 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure
(
node
,
contents
)
)
frameWrites
let
parsedE
=
(
\
(
node
,
contents
)
->
hyperdataDocumentFromFrameWrite
(
node
^.
node_hyperdata
,
contents
))
<$>
frameWritesWithContents
let
parsed
=
rights
parsedE
let
parsedE
=
(
\
(
node
,
contents
)
->
hyperdataDocumentFromFrameWrite
7
(
node
^.
node_hyperdata
,
contents
))
<$>
frameWritesWithContents
-- TODO hard coded param should be take
let
parsed
=
List
.
concat
$
rights
parsedE
_
<-
flowDataText
(
RootId
(
NodeId
uId
))
(
DataNew
(
Just
$
fromIntegral
$
length
parsed
,
yieldMany
parsed
))
(
Multi
EN
)
cId
Nothing
logStatus
pure
$
jobLogSuccess
jobLog
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite
::
(
HyperdataFrame
,
T
.
Text
)
->
Either
T
.
Text
HyperdataDocument
hyperdataDocumentFromFrameWrite
(
HyperdataFrame
{
_hf_base
,
_hf_frame_id
},
contents
)
=
hyperdataDocumentFromFrameWrite
::
Int
->
(
HyperdataFrame
,
T
.
Text
)
->
Either
T
.
Text
[
HyperdataDocument
]
hyperdataDocumentFromFrameWrite
paragraphSize
(
HyperdataFrame
{
_hf_base
,
_hf_frame_id
},
contents
)
=
case
parseLines
contents
of
Left
_
->
Left
"Error parsing node"
Right
(
Parsed
{
authors
,
contents
=
c
,
date
,
source
,
title
=
t
})
->
Right
(
Parsed
{
authors
,
contents
=
c
txts
,
date
,
source
,
title
=
t
})
->
let
authorJoinSingle
(
Author
{
firstName
,
lastName
})
=
T
.
concat
[
lastName
,
", "
,
firstName
]
authors'
=
T
.
concat
$
authorJoinSingle
<$>
authors
date'
=
(
\
(
Date
{
year
,
month
,
day
})
->
T
.
concat
[
T
.
pack
$
show
year
,
"-"
...
...
@@ -117,7 +119,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
year'
=
fromIntegral
$
maybe
Defaults
.
year
(
\
(
Date
{
year
})
->
year
)
date
month'
=
maybe
Defaults
.
month
(
\
(
Date
{
month
})
->
fromIntegral
month
)
date
day'
=
maybe
Defaults
.
day
(
\
(
Date
{
day
})
->
fromIntegral
day
)
date
in
Right
HyperdataDocument
{
_hd_bdd
=
Just
"FrameWrite"
Right
(
List
.
map
(
\
ctxt
->
HyperdataDocument
{
_hd_bdd
=
Just
"FrameWrite"
,
_hd_doi
=
Nothing
,
_hd_url
=
Nothing
,
_hd_uniqId
=
Nothing
...
...
@@ -127,7 +129,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
,
_hd_authors
=
Just
authors'
,
_hd_institutes
=
Nothing
,
_hd_source
=
source
,
_hd_abstract
=
Just
c
,
_hd_abstract
=
Just
c
txt
,
_hd_publication_date
=
date'
,
_hd_publication_year
=
Just
year'
,
_hd_publication_month
=
Just
month'
...
...
@@ -136,3 +138,5 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Just
$
T
.
pack
$
show
EN
}
)
(
text2paragraphs
paragraphSize
ctxts
)
)
src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
View file @
3853062d
module
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
import
Control.Applicative
((
*>
))
import
Control.Monad
(
void
)
import
Data.Either
import
Data.Maybe
import
Data.Text
hiding
(
foldl
)
import
Gargantext.Core.Text
(
sentences
)
import
Gargantext.Prelude
import
Prelude
((
++
),
read
)
import
Text.Parsec
hiding
(
Line
)
import
Text.Parsec.String
import
qualified
Data.Text
as
DT
import
qualified
Data.List
as
List
-- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
...
...
@@ -26,11 +41,11 @@ sample :: Text
sample
=
unlines
[
"title1"
,
"title2"
,
"=="
,
"^@@authors: FirstName1, LastName1; FirstName2, LastName2"
,
"
^@@
date: 2021-09-10"
,
"
^@@
source: someSource"
--
, "title2"
--
, "=="
--
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
,
"date: 2021-09-10"
,
"source: someSource"
,
"document contents 1"
,
"document contents 2"
]
...
...
@@ -42,9 +57,9 @@ sampleUnordered =
,
"title2"
,
"=="
,
"document contents 1"
,
"
^@@
date: 2021-09-10"
,
"
^@@
authors: FirstName1, LastName1; FirstName2, LastName2"
,
"
^@@
source: someSource"
,
"date: 2021-09-10"
,
"authors: FirstName1, LastName1; FirstName2, LastName2"
,
"source: someSource"
,
"document contents 2"
]
...
...
@@ -150,14 +165,14 @@ contentsLineP = do
titleDelimiterP
::
Parser
()
titleDelimiterP
=
do
_
<-
newline
_
<-
string
"=="
tokenEnd
-- _ <- try (string "==")
pure
()
titleP
::
Parser
[
Char
]
titleP
=
manyTill
anyChar
(
try
titleDelimiterP
)
authorsPrefixP
::
Parser
[
Char
]
authorsPrefixP
=
do
_
<-
string
"
^@@
authors:"
_
<-
string
"authors:"
many
(
char
' '
)
authorsP
::
Parser
[
Author
]
authorsP
=
try
authorsPrefixP
*>
sepBy
authorP
(
char
';'
)
...
...
@@ -173,7 +188,7 @@ authorP = do
datePrefixP
::
Parser
[
Char
]
datePrefixP
=
do
_
<-
string
"
^@@
date:"
_
<-
string
"date:"
many
(
char
' '
)
dateP
::
Parser
Date
dateP
=
try
datePrefixP
...
...
@@ -195,7 +210,7 @@ dateISOP = do
sourcePrefixP
::
Parser
[
Char
]
sourcePrefixP
=
do
_
<-
string
"
^@@
source:"
_
<-
string
"source:"
many
(
char
' '
)
sourceP
::
Parser
[
Char
]
sourceP
=
try
sourcePrefixP
...
...
@@ -206,3 +221,15 @@ sourceP = try sourcePrefixP
tokenEnd
::
Parser
()
tokenEnd
=
void
(
char
'
\n
'
)
<|>
eof
--- MISC Tools
text2paragraphs
::
Int
->
Text
->
[
Text
]
text2paragraphs
n
=
List
.
map
DT
.
concat
.
splitEvery
n
.
List
.
map
clean
.
sentences
.
DT
.
concat
.
DT
.
lines
clean
::
Text
->
Text
clean
=
DT
.
unwords
.
List
.
filter
(
\
w
->
DT
.
length
w
<
25
)
.
DT
.
words
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment