Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
3853062d
Commit
3853062d
authored
Jan 10, 2023
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] FrameWrite Corpus improvement
parent
a3b5c3c5
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
21 deletions
+52
-21
DocumentsFromWriteNodes.hs
src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
+11
-7
FrameWrite.hs
src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
+41
-14
No files found.
src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
View file @
3853062d
...
...
@@ -38,6 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame
import
Gargantext.Database.Admin.Types.Node
import
Gargantext.Database.Query.Table.Node
(
getChildrenByType
,
getClosestParentIdByType'
,
getNodeWith
)
import
Gargantext.Database.Schema.Node
(
node_hyperdata
)
import
qualified
Data.List
as
List
import
qualified
Gargantext.Defaults
as
Defaults
import
Gargantext.Prelude
import
Gargantext.Utils.Jobs
(
serveJobsAPI
)
...
...
@@ -97,18 +98,19 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure
(
node
,
contents
)
)
frameWrites
let
parsedE
=
(
\
(
node
,
contents
)
->
hyperdataDocumentFromFrameWrite
(
node
^.
node_hyperdata
,
contents
))
<$>
frameWritesWithContents
let
parsed
=
rights
parsedE
let
parsedE
=
(
\
(
node
,
contents
)
->
hyperdataDocumentFromFrameWrite
7
(
node
^.
node_hyperdata
,
contents
))
<$>
frameWritesWithContents
-- TODO hard coded param should be take
let
parsed
=
List
.
concat
$
rights
parsedE
_
<-
flowDataText
(
RootId
(
NodeId
uId
))
(
DataNew
(
Just
$
fromIntegral
$
length
parsed
,
yieldMany
parsed
))
(
Multi
EN
)
cId
Nothing
logStatus
pure
$
jobLogSuccess
jobLog
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite
::
(
HyperdataFrame
,
T
.
Text
)
->
Either
T
.
Text
HyperdataDocument
hyperdataDocumentFromFrameWrite
(
HyperdataFrame
{
_hf_base
,
_hf_frame_id
},
contents
)
=
hyperdataDocumentFromFrameWrite
::
Int
->
(
HyperdataFrame
,
T
.
Text
)
->
Either
T
.
Text
[
HyperdataDocument
]
hyperdataDocumentFromFrameWrite
paragraphSize
(
HyperdataFrame
{
_hf_base
,
_hf_frame_id
},
contents
)
=
case
parseLines
contents
of
Left
_
->
Left
"Error parsing node"
Right
(
Parsed
{
authors
,
contents
=
c
,
date
,
source
,
title
=
t
})
->
Right
(
Parsed
{
authors
,
contents
=
c
txts
,
date
,
source
,
title
=
t
})
->
let
authorJoinSingle
(
Author
{
firstName
,
lastName
})
=
T
.
concat
[
lastName
,
", "
,
firstName
]
authors'
=
T
.
concat
$
authorJoinSingle
<$>
authors
date'
=
(
\
(
Date
{
year
,
month
,
day
})
->
T
.
concat
[
T
.
pack
$
show
year
,
"-"
...
...
@@ -117,7 +119,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
year'
=
fromIntegral
$
maybe
Defaults
.
year
(
\
(
Date
{
year
})
->
year
)
date
month'
=
maybe
Defaults
.
month
(
\
(
Date
{
month
})
->
fromIntegral
month
)
date
day'
=
maybe
Defaults
.
day
(
\
(
Date
{
day
})
->
fromIntegral
day
)
date
in
Right
HyperdataDocument
{
_hd_bdd
=
Just
"FrameWrite"
Right
(
List
.
map
(
\
ctxt
->
HyperdataDocument
{
_hd_bdd
=
Just
"FrameWrite"
,
_hd_doi
=
Nothing
,
_hd_url
=
Nothing
,
_hd_uniqId
=
Nothing
...
...
@@ -127,7 +129,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
,
_hd_authors
=
Just
authors'
,
_hd_institutes
=
Nothing
,
_hd_source
=
source
,
_hd_abstract
=
Just
c
,
_hd_abstract
=
Just
c
txt
,
_hd_publication_date
=
date'
,
_hd_publication_year
=
Just
year'
,
_hd_publication_month
=
Just
month'
...
...
@@ -136,3 +138,5 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Just
$
T
.
pack
$
show
EN
}
)
(
text2paragraphs
paragraphSize
ctxts
)
)
src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
View file @
3853062d
module
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
import
Control.Applicative
((
*>
))
import
Control.Monad
(
void
)
import
Data.Either
import
Data.Maybe
import
Data.Text
hiding
(
foldl
)
import
Gargantext.Core.Text
(
sentences
)
import
Gargantext.Prelude
import
Prelude
((
++
),
read
)
import
Text.Parsec
hiding
(
Line
)
import
Text.Parsec.String
import
qualified
Data.Text
as
DT
import
qualified
Data.List
as
List
-- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
...
...
@@ -26,11 +41,11 @@ sample :: Text
sample
=
unlines
[
"title1"
,
"title2"
,
"=="
,
"^@@authors: FirstName1, LastName1; FirstName2, LastName2"
,
"
^@@
date: 2021-09-10"
,
"
^@@
source: someSource"
--
, "title2"
--
, "=="
--
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
,
"date: 2021-09-10"
,
"source: someSource"
,
"document contents 1"
,
"document contents 2"
]
...
...
@@ -42,9 +57,9 @@ sampleUnordered =
,
"title2"
,
"=="
,
"document contents 1"
,
"
^@@
date: 2021-09-10"
,
"
^@@
authors: FirstName1, LastName1; FirstName2, LastName2"
,
"
^@@
source: someSource"
,
"date: 2021-09-10"
,
"authors: FirstName1, LastName1; FirstName2, LastName2"
,
"source: someSource"
,
"document contents 2"
]
...
...
@@ -150,14 +165,14 @@ contentsLineP = do
titleDelimiterP
::
Parser
()
titleDelimiterP
=
do
_
<-
newline
_
<-
string
"=="
tokenEnd
-- _ <- try (string "==")
pure
()
titleP
::
Parser
[
Char
]
titleP
=
manyTill
anyChar
(
try
titleDelimiterP
)
authorsPrefixP
::
Parser
[
Char
]
authorsPrefixP
=
do
_
<-
string
"
^@@
authors:"
_
<-
string
"authors:"
many
(
char
' '
)
authorsP
::
Parser
[
Author
]
authorsP
=
try
authorsPrefixP
*>
sepBy
authorP
(
char
';'
)
...
...
@@ -173,7 +188,7 @@ authorP = do
datePrefixP
::
Parser
[
Char
]
datePrefixP
=
do
_
<-
string
"
^@@
date:"
_
<-
string
"date:"
many
(
char
' '
)
dateP
::
Parser
Date
dateP
=
try
datePrefixP
...
...
@@ -195,7 +210,7 @@ dateISOP = do
sourcePrefixP
::
Parser
[
Char
]
sourcePrefixP
=
do
_
<-
string
"
^@@
source:"
_
<-
string
"source:"
many
(
char
' '
)
sourceP
::
Parser
[
Char
]
sourceP
=
try
sourcePrefixP
...
...
@@ -206,3 +221,15 @@ sourceP = try sourcePrefixP
tokenEnd
::
Parser
()
tokenEnd
=
void
(
char
'
\n
'
)
<|>
eof
--- MISC Tools
text2paragraphs
::
Int
->
Text
->
[
Text
]
text2paragraphs
n
=
List
.
map
DT
.
concat
.
splitEvery
n
.
List
.
map
clean
.
sentences
.
DT
.
concat
.
DT
.
lines
clean
::
Text
->
Text
clean
=
DT
.
unwords
.
List
.
filter
(
\
w
->
DT
.
length
w
<
25
)
.
DT
.
words
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment