Commit 59fc9cfd authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Improving NodeWriteParsing

parent d41791b9
......@@ -38,7 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata, node_name)
import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI)
import Servant
......@@ -99,7 +99,7 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure (node, contents)
) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsedE = map (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node, contents)) frameWritesWithContents
-- TODO hard coded param should be taken from user
let parsed = List.concat $ rights parsedE
......@@ -108,34 +108,15 @@ documentsFromWriteNodes uId nId _p logStatus = do
(Multi EN) cId Nothing logStatus
pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
{-
-- extractFrameWrites :: (HasSettings env, FlowCmdM env err m) => NodeId -> m [Node T.Text]
extractFrameWrites nId = do
mcId <- getClosestParentIdByType' nId NodeCorpus
frameWriteIds <- getChildrenByType (fromMaybe (panic "[G.A.N.DocumentsFromWriteNodes] No parent found") mcId) NodeFrameWrite
-- https://write.frame.gargantext.org/<frame_id>/download
frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
frameWritesWithContents <- liftBase $
mapM (\node -> do
contents <- getHyperdataFrameContents (node ^. node_hyperdata)
pure (node, contents)
) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsed = List.concat $ rights parsedE
pure parsed
-}
------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: Int -> (HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
hyperdataDocumentFromFrameWrite :: Int -> (Node HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
hyperdataDocumentFromFrameWrite paragraphSize (node, contents) =
case parseLines contents of
Left _ -> Left "Error parsing node"
Right (Parsed { authors, contents = ctxts, date, source, title = t }) ->
let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
Right (Parsed { authors, contents = ctxts, date }) ->
let HyperdataFrame { _hf_base, _hf_frame_id } = node ^. node_hyperdata
authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
authors' = T.concat $ authorJoinSingle <$> authors
date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
, T.pack $ show month, "-"
......@@ -143,7 +124,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
Right (List.map (\ctxt -> HyperdataDocument { _hd_bdd = Just "FrameWrite"
Right (List.map (\(t, ctxt) -> HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
......@@ -152,7 +133,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
, _hd_title = Just t
, _hd_authors = Just authors'
, _hd_institutes = Nothing
, _hd_source = source
, _hd_source = Just $ node ^. node_name
, _hd_abstract = Just ctxt
, _hd_publication_date = date'
, _hd_publication_year = Just year'
......@@ -162,5 +143,5 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show EN }
) (text2paragraphs paragraphSize ctxts)
) (text2titleParagraphs paragraphSize ctxts)
)
......@@ -108,25 +108,35 @@ data Line =
parseLines :: Text -> Either ParseError Parsed
parseLines text = foldl f emptyParsed <$> lst
where
lst = parse documentLinesP "" (unpack text)
lst = parse documentLines "" (unpack text)
f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. }
f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. }
f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
-- Source should be the name of the node
-- First line of each Context should be the title.
documentLinesP :: Parser [Line]
documentLinesP = do
t <- titleP
ls <- lineP `sepBy` newline
pure $ [LTitle $ pack t] ++ ls
documentLines :: Parser [Line]
documentLines = do
ls <- lineP `sepBy` newline
pure ls
lineP :: Parser Line
lineP = do
choice [ try authorsLineP
, try dateLineP
, try sourceLineP
, contentsLineP ]
, contentsLineP
]
authorsLineP :: Parser Line
authorsLineP = do
......@@ -167,6 +177,7 @@ titleDelimiterP = do
_ <- newline
-- _ <- try (string "==")
pure ()
titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP)
......@@ -223,11 +234,14 @@ tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof
--- MISC Tools
text2paragraphs :: Int -> Text -> [Text]
text2paragraphs n = List.map DT.concat
. splitEvery n . List.map clean
. sentences . DT.concat . DT.lines
text2titleParagraphs :: Int -> Text -> [(Text, Text)]
text2titleParagraphs n = catMaybes . List.map doTitle
. splitEvery n . List.map clean
. sentences . DT.concat . DT.lines
doTitle :: [Text] -> Maybe (Text, Text)
doTitle (t:ts) = Just (t, DT.concat ts)
doTitle [] = Nothing
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment