Commit 3853062d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] FrameWrite Corpus improvement

parent a3b5c3c5
......@@ -38,6 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Data.List as List
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI)
......@@ -97,18 +98,19 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure (node, contents)
) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsed = rights parsedE
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
-- TODO hard coded param should be take
let parsed = List.concat $ rights parsedE
_ <- flowDataText (RootId (NodeId uId)) (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed)) (Multi EN) cId Nothing logStatus
pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
hyperdataDocumentFromFrameWrite :: Int -> (HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
case parseLines contents of
Left _ -> Left "Error parsing node"
Right (Parsed { authors, contents = c, date, source, title = t }) ->
Right (Parsed { authors, contents = ctxts, date, source, title = t }) ->
let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
authors' = T.concat $ authorJoinSingle <$> authors
date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
......@@ -117,7 +119,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
Right (List.map (\ctxt -> HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
......@@ -127,7 +129,7 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
, _hd_authors = Just authors'
, _hd_institutes = Nothing
, _hd_source = source
, _hd_abstract = Just c
, _hd_abstract = Just ctxt
, _hd_publication_date = date'
, _hd_publication_year = Just year'
, _hd_publication_month = Just month'
......@@ -136,3 +138,5 @@ hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, cont
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show EN }
) (text2paragraphs paragraphSize ctxts)
)
module Gargantext.Core.Text.Corpus.Parsers.FrameWrite where
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
where
import Control.Applicative ((*>))
import Control.Monad (void)
import Data.Either
import Data.Maybe
import Data.Text hiding (foldl)
import Gargantext.Core.Text (sentences)
import Gargantext.Prelude
import Prelude ((++), read)
import Text.Parsec hiding (Line)
import Text.Parsec.String
import qualified Data.Text as DT
import qualified Data.List as List
-- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
......@@ -26,11 +41,11 @@ sample :: Text
sample =
unlines
[ "title1"
, "title2"
, "=="
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "^@@date: 2021-09-10"
, "^@@source: someSource"
-- , "title2"
-- , "=="
-- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "date: 2021-09-10"
, "source: someSource"
, "document contents 1"
, "document contents 2"
]
......@@ -42,9 +57,9 @@ sampleUnordered =
, "title2"
, "=="
, "document contents 1"
, "^@@date: 2021-09-10"
, "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
, "^@@source: someSource"
, "date: 2021-09-10"
, "authors: FirstName1, LastName1; FirstName2, LastName2"
, "source: someSource"
, "document contents 2"
]
......@@ -150,14 +165,14 @@ contentsLineP = do
titleDelimiterP :: Parser ()
titleDelimiterP = do
_ <- newline
_ <- string "=="
tokenEnd
-- _ <- try (string "==")
pure ()
titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP)
authorsPrefixP :: Parser [Char]
authorsPrefixP = do
_ <- string "^@@authors:"
_ <- string "authors:"
many (char ' ')
authorsP :: Parser [Author]
authorsP = try authorsPrefixP *> sepBy authorP (char ';')
......@@ -173,7 +188,7 @@ authorP = do
datePrefixP :: Parser [Char]
datePrefixP = do
_ <- string "^@@date:"
_ <- string "date:"
many (char ' ')
dateP :: Parser Date
dateP = try datePrefixP
......@@ -195,7 +210,7 @@ dateISOP = do
sourcePrefixP :: Parser [Char]
sourcePrefixP = do
_ <- string "^@@source:"
_ <- string "source:"
many (char ' ')
sourceP :: Parser [Char]
sourceP = try sourcePrefixP
......@@ -206,3 +221,15 @@ sourceP = try sourcePrefixP
tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof
--- MISC Tools
text2paragraphs :: Int -> Text -> [Text]
text2paragraphs n = List.map DT.concat
. splitEvery n . List.map clean
. sentences . DT.concat . DT.lines
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment