Commit 59fc9cfd authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Improving NodeWriteParsing

parent d41791b9
...@@ -38,7 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..) ...@@ -38,7 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)
import Gargantext.Database.Admin.Types.Hyperdata.Frame import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith) import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata) import Gargantext.Database.Schema.Node (node_hyperdata, node_name)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI) import Gargantext.Utils.Jobs (serveJobsAPI)
import Servant import Servant
...@@ -99,7 +99,7 @@ documentsFromWriteNodes uId nId _p logStatus = do ...@@ -99,7 +99,7 @@ documentsFromWriteNodes uId nId _p logStatus = do
pure (node, contents) pure (node, contents)
) frameWrites ) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents let parsedE = map (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node, contents)) frameWritesWithContents
-- TODO hard coded param should be taken from user -- TODO hard coded param should be taken from user
let parsed = List.concat $ rights parsedE let parsed = List.concat $ rights parsedE
...@@ -108,34 +108,15 @@ documentsFromWriteNodes uId nId _p logStatus = do ...@@ -108,34 +108,15 @@ documentsFromWriteNodes uId nId _p logStatus = do
(Multi EN) cId Nothing logStatus (Multi EN) cId Nothing logStatus
pure $ jobLogSuccess jobLog pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
{-
-- extractFrameWrites :: (HasSettings env, FlowCmdM env err m) => NodeId -> m [Node T.Text]
extractFrameWrites nId = do
mcId <- getClosestParentIdByType' nId NodeCorpus
frameWriteIds <- getChildrenByType (fromMaybe (panic "[G.A.N.DocumentsFromWriteNodes] No parent found") mcId) NodeFrameWrite
-- https://write.frame.gargantext.org/<frame_id>/download
frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
frameWritesWithContents <- liftBase $
mapM (\node -> do
contents <- getHyperdataFrameContents (node ^. node_hyperdata)
pure (node, contents)
) frameWrites
let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
let parsed = List.concat $ rights parsedE
pure parsed
-}
------------------------------------------------------------------------ ------------------------------------------------------------------------
hyperdataDocumentFromFrameWrite :: Int -> (HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument] hyperdataDocumentFromFrameWrite :: Int -> (Node HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_frame_id }, contents) = hyperdataDocumentFromFrameWrite paragraphSize (node, contents) =
case parseLines contents of case parseLines contents of
Left _ -> Left "Error parsing node" Left _ -> Left "Error parsing node"
Right (Parsed { authors, contents = ctxts, date, source, title = t }) -> Right (Parsed { authors, contents = ctxts, date }) ->
let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ] let HyperdataFrame { _hf_base, _hf_frame_id } = node ^. node_hyperdata
authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
authors' = T.concat $ authorJoinSingle <$> authors authors' = T.concat $ authorJoinSingle <$> authors
date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-" date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
, T.pack $ show month, "-" , T.pack $ show month, "-"
...@@ -143,7 +124,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr ...@@ -143,7 +124,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
Right (List.map (\ctxt -> HyperdataDocument { _hd_bdd = Just "FrameWrite" Right (List.map (\(t, ctxt) -> HyperdataDocument { _hd_bdd = Just "FrameWrite"
, _hd_doi = Nothing , _hd_doi = Nothing
, _hd_url = Nothing , _hd_url = Nothing
, _hd_uniqId = Nothing , _hd_uniqId = Nothing
...@@ -152,7 +133,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr ...@@ -152,7 +133,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
, _hd_title = Just t , _hd_title = Just t
, _hd_authors = Just authors' , _hd_authors = Just authors'
, _hd_institutes = Nothing , _hd_institutes = Nothing
, _hd_source = source , _hd_source = Just $ node ^. node_name
, _hd_abstract = Just ctxt , _hd_abstract = Just ctxt
, _hd_publication_date = date' , _hd_publication_date = date'
, _hd_publication_year = Just year' , _hd_publication_year = Just year'
...@@ -162,5 +143,5 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr ...@@ -162,5 +143,5 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
, _hd_publication_minute = Nothing , _hd_publication_minute = Nothing
, _hd_publication_second = Nothing , _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show EN } , _hd_language_iso2 = Just $ T.pack $ show EN }
) (text2paragraphs paragraphSize ctxts) ) (text2titleParagraphs paragraphSize ctxts)
) )
...@@ -108,25 +108,35 @@ data Line = ...@@ -108,25 +108,35 @@ data Line =
parseLines :: Text -> Either ParseError Parsed parseLines :: Text -> Either ParseError Parsed
parseLines text = foldl f emptyParsed <$> lst parseLines text = foldl f emptyParsed <$> lst
where where
lst = parse documentLinesP "" (unpack text) lst = parse documentLines "" (unpack text)
f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. } f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. } f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. } f (Parsed { .. }) (LDate d ) = Parsed { date = Just d, .. }
f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. } f (Parsed { .. }) (LSource s ) = Parsed { source = Just s, .. }
f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. } f (Parsed { .. }) (LTitle t ) = Parsed { title = t, .. }
-- Source should be the name of the node
-- First line of each Context should be the title.
documentLinesP :: Parser [Line] documentLinesP :: Parser [Line]
documentLinesP = do documentLinesP = do
t <- titleP t <- titleP
ls <- lineP `sepBy` newline ls <- lineP `sepBy` newline
pure $ [LTitle $ pack t] ++ ls pure $ [LTitle $ pack t] ++ ls
documentLines :: Parser [Line]
documentLines = do
ls <- lineP `sepBy` newline
pure ls
lineP :: Parser Line lineP :: Parser Line
lineP = do lineP = do
choice [ try authorsLineP choice [ try authorsLineP
, try dateLineP , try dateLineP
, try sourceLineP , try sourceLineP
, contentsLineP ] , contentsLineP
]
authorsLineP :: Parser Line authorsLineP :: Parser Line
authorsLineP = do authorsLineP = do
...@@ -167,6 +177,7 @@ titleDelimiterP = do ...@@ -167,6 +177,7 @@ titleDelimiterP = do
_ <- newline _ <- newline
-- _ <- try (string "==") -- _ <- try (string "==")
pure () pure ()
titleP :: Parser [Char] titleP :: Parser [Char]
titleP = manyTill anyChar (try titleDelimiterP) titleP = manyTill anyChar (try titleDelimiterP)
...@@ -223,11 +234,14 @@ tokenEnd :: Parser () ...@@ -223,11 +234,14 @@ tokenEnd :: Parser ()
tokenEnd = void (char '\n') <|> eof tokenEnd = void (char '\n') <|> eof
--- MISC Tools --- MISC Tools
text2titleParagraphs :: Int -> Text -> [(Text, Text)]
text2paragraphs :: Int -> Text -> [Text] text2titleParagraphs n = catMaybes . List.map doTitle
text2paragraphs n = List.map DT.concat . splitEvery n . List.map clean
. splitEvery n . List.map clean . sentences . DT.concat . DT.lines
. sentences . DT.concat . DT.lines
doTitle :: [Text] -> Maybe (Text, Text)
doTitle (t:ts) = Just (t, DT.concat ts)
doTitle [] = Nothing
clean :: Text -> Text clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment