Commit 3853062d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] FrameWrite Corpus improvement

parent a3b5c3c5
...@@ -38,6 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame ...@@ -38,6 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith) import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Data.List as List
import qualified Gargantext.Defaults as Defaults import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI) import Gargantext.Utils.Jobs (serveJobsAPI)
...@@ -97,18 +98,19 @@ documentsFromWriteNodes uId nId _p logStatus = do ...@@ -97,18 +98,19 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure (node, contents) pure (node, contents)
) frameWrites ) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsed = rights parsedE -- TODO hard coded param should be take
let parsed = List.concat $ rights parsedE
_ <- flowDataText (RootId (NodeId uId)) (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed)) (Multi EN) cId Nothing logStatus _ <- flowDataText (RootId (NodeId uId)) (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed)) (Multi EN) cId Nothing logStatus
pure $ jobLogSuccess jobLog pure $ jobLogSuccess jobLog
------------------------------------------------------------------------ ------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument hyperdataDocumentFromFrameWrite :: Int -> (HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) = hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
case parseLines contents of case parseLines contents of
Left _ -> Left "Error parsing node" Left _ -> Left "Error parsing node"
Right (Parsed { authors, contents = c, date, source, title = t }) -> Right (Parsed { authors, contents = ctxts, date, source, title = t }) ->
let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
authors' = T.concat $ authorJoinSingle <$> authors authors' = T.concat $ authorJoinSingle <$> authors
date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-" date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
...@@ -117,7 +119,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont ...@@ -117,7 +119,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
Right HyperdataDocument { _hd_bdd = Just "FrameWrite" Right (List.map (\ctxt -> HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing , _hd_doi = Nothing
, _hd_url = Nothing , _hd_url = Nothing
, _hd_uniqId = Nothing , _hd_uniqId = Nothing
...@@ -127,7 +129,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont ...@@ -127,7 +129,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
, _hd_authors = Just authors' , _hd_authors = Just authors'
, _hd_institutes = Nothing , _hd_institutes = Nothing
, _hd_source = source , _hd_source = source
, _hd_abstract = Just c , _hd_abstract = Just ctxt
, _hd_publication_date = date' , _hd_publication_date = date'
, _hd_publication_year = Just year' , _hd_publication_year = Just year'
, _hd_publication_month = Just month' , _hd_publication_month = Just month'
...@@ -136,3 +138,5 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont ...@@ -136,3 +138,5 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
, _hd_publication_minute = Nothing , _hd_publication_minute = Nothing
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show EN } , _hd_language_iso2 = Just $ T.pack $ show EN }
) (text2paragraphs paragraphSize ctxts)
)
module Gargantext.Core.Text.Corpus.Parsers.FrameWrite where {-|
Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
import Control.Applicative ((*>)) import Control.Applicative ((*>))
import Control.Monad (void) import Control.Monad (void)
import Data.Either import Data.Either
import Data.Maybe import Data.Maybe
import Data.Text hiding (foldl) import Data.Text hiding (foldl)
import Gargantext.Core.Text (sentences)
import Gargantext.Prelude import Gargantext.Prelude
import Prelude ((++), read) import Prelude ((++), read)
import Text.Parsec hiding (Line) import Text.Parsec hiding (Line)
import Text.Parsec.String import Text.Parsec.String
import qualified Data.Text as DT
import qualified Data.List as List
-- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
...@@ -26,11 +41,11 @@ sample :: Text ...@@ -26,11 +41,11 @@ sample :: Text
sample = sample =
unlines unlines
[ "title1" [ "title1"
, "title2" -- , "title2"
, "==" -- , "=="
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2" -- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "^@@date: 2021-09-10" , "date: 2021-09-10"
, "^@@source: someSource" , "source: someSource"
, "document contents 1" , "document contents 1"
, "document contents 2" , "document contents 2"
] ]
...@@ -42,9 +57,9 @@ sampleUnordered = ...@@ -42,9 +57,9 @@ sampleUnordered =
, "title2" , "title2"
, "==" , "=="
, "document contents 1" , "document contents 1"
, "^@@date: 2021-09-10" , "date: 2021-09-10"
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2" , "authors: FirstName1, LastName1; FirstName2, LastName2"
, "^@@source: someSource" , "source: someSource"
, "document contents 2" , "document contents 2"
] ]
...@@ -150,14 +165,14 @@ contentsLineP = do ...@@ -150,14 +165,14 @@ contentsLineP = do
titleDelimiterP :: Parser () titleDelimiterP :: Parser ()
titleDelimiterP = do titleDelimiterP = do
_ <- newline _ <- newline
_ <- string "==" -- _ <- try (string "==")
tokenEnd pure ()
titleP :: Parser [Char] titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP) titleP = manyTill anyChar (try titleDelimiterP)
authorsPrefixP :: Parser [Char] authorsPrefixP :: Parser [Char]
authorsPrefixP = do authorsPrefixP = do
_ <- string "^@@authors:" _ <- string "authors:"
many (char ' ') many (char ' ')
authorsP :: Parser [Author] authorsP :: Parser [Author]
authorsP = try authorsPrefixP *> sepBy authorP (char ';') authorsP = try authorsPrefixP *> sepBy authorP (char ';')
...@@ -173,7 +188,7 @@ authorP = do ...@@ -173,7 +188,7 @@ authorP = do
datePrefixP :: Parser [Char] datePrefixP :: Parser [Char]
datePrefixP = do datePrefixP = do
_ <- string "^@@date:" _ <- string "date:"
many (char ' ') many (char ' ')
dateP :: Parser Date dateP :: Parser Date
dateP = try datePrefixP dateP = try datePrefixP
...@@ -195,7 +210,7 @@ dateISOP = do ...@@ -195,7 +210,7 @@ dateISOP = do
sourcePrefixP :: Parser [Char] sourcePrefixP :: Parser [Char]
sourcePrefixP = do sourcePrefixP = do
_ <- string "^@@source:" _ <- string "source:"
many (char ' ') many (char ' ')
sourceP :: Parser [Char] sourceP :: Parser [Char]
sourceP = try sourcePrefixP sourceP = try sourcePrefixP
...@@ -206,3 +221,15 @@ sourceP = try sourcePrefixP ...@@ -206,3 +221,15 @@ sourceP = try sourcePrefixP
tokenEnd :: Parser () tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof tokenEnd = void (char '\n') <|> eof
--- MISC Tools
text2paragraphs :: Int -> Text -> [Text]
text2paragraphs n = List.map DT.concat
. splitEvery n . List.map clean
. sentences . DT.concat . DT.lines
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment