[FEAT] Improving NodeWriteParsing

59fc9cfd · Alexandre Delanoë · d41791b9 · 59fc9cfd · 59fc9cfd
Commit 59fc9cfd authored Jan 25, 2023 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 31 additions and 36 deletions

DocumentsFromWriteNodes.hs src/Gargantext/API/Node/DocumentsFromWriteNodes.hs +10 -29

FrameWrite.hs src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs +21 -7

No files found.
--- a/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
+++ b/src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
@@ -38,7 +38,7 @@ import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)
 import Gargantext.Database.Admin.Types.Hyperdata.Frame
 import Gargantext.Database.Admin.Types.Node
 import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
-import Gargantext.Database.Schema.Node (node_hyperdata)
+import Gargantext.Database.Schema.Node (node_hyperdata, node_name)
 import Gargantext.Prelude
 import Gargantext.Utils.Jobs (serveJobsAPI)
 import Servant
@@ -99,7 +99,7 @@ documentsFromWriteNodes uId nId _p logStatus = do
             pure (node, contents)
         ) frameWrites
-  let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
+  let parsedE = map (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node, contents)) frameWritesWithContents
                                              -- TODO hard coded param should be taken from user
  let parsed = List.concat $ rights parsedE
@@ -108,34 +108,15 @@ documentsFromWriteNodes uId nId _p logStatus = do
                    (Multi EN) cId Nothing logStatus
  pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
-{-
-- extractFrameWrites :: (HasSettings env, FlowCmdM env err m) => NodeId -> m [Node T.Text]
-extractFrameWrites nId = do
-  mcId <- getClosestParentIdByType' nId NodeCorpus
-  frameWriteIds <- getChildrenByType (fromMaybe (panic "[G.A.N.DocumentsFromWriteNodes] No parent found") mcId) NodeFrameWrite
-  -- https://write.frame.gargantext.org/<frame_id>/download
-  frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
-  frameWritesWithContents <- liftBase $
-    mapM (\node -> do
-             contents <- getHyperdataFrameContents (node ^. node_hyperdata)
-             pure (node, contents)
-         ) frameWrites
-  let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite 7 (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
-  let parsed = List.concat $ rights parsedE
-  pure parsed
-}
 ------------------------------------------------------------------------
-hyperdataDocumentFromFrameWrite :: Int -> (HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
+hyperdataDocumentFromFrameWrite :: Int -> (Node HyperdataFrame, T.Text) -> Either T.Text [HyperdataDocument]
-hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
+hyperdataDocumentFromFrameWrite paragraphSize (node, contents) =
  case parseLines contents of
    Left _ -> Left "Error parsing node"
-    Right (Parsed { authors, contents = ctxts, date, source, title = t }) ->
+    Right (Parsed { authors, contents = ctxts, date }) ->
-      let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
+      let HyperdataFrame { _hf_base, _hf_frame_id } = node ^. node_hyperdata
+          authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
          authors' = T.concat $ authorJoinSingle <$> authors 
          date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
                                                            , T.pack $ show month, "-"
@@ -143,7 +124,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
          year' = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
          month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
          day' = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
-      Right (List.map (\ctxt ->  HyperdataDocument { _hd_bdd = Just "FrameWrite"
+      Right (List.map (\(t, ctxt) ->  HyperdataDocument { _hd_bdd = Just "FrameWrite"
                              , _hd_doi = Nothing
                              , _hd_url = Nothing
                              , _hd_uniqId = Nothing
@@ -152,7 +133,7 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
                              , _hd_title = Just t
                              , _hd_authors = Just authors'
                              , _hd_institutes = Nothing
-                              , _hd_source = source
+                              , _hd_source = Just $ node ^. node_name
                              , _hd_abstract = Just ctxt
                              , _hd_publication_date = date'
                              , _hd_publication_year = Just year'
@@ -162,5 +143,5 @@ hyperdataDocumentFromFrameWrite paragraphSize (HyperdataFrame { _hf_base, _hf_fr
                              , _hd_publication_minute = Nothing
                              , _hd_publication_second = Nothing
                              , _hd_language_iso2 = Just $ T.pack $ show EN }
-                      ) (text2paragraphs paragraphSize ctxts)
+                      ) (text2titleParagraphs paragraphSize ctxts)
                  )
--- a/src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
@@ -108,25 +108,35 @@ data Line =
 parseLines :: Text -> Either ParseError Parsed
 parseLines text = foldl f emptyParsed <$> lst
  where
-    lst = parse documentLinesP "" (unpack text)
+    lst = parse documentLines "" (unpack text)
    f (Parsed { .. }) (LAuthors as) = Parsed { authors = as, .. }
    f (Parsed { .. }) (LContents c) = Parsed { contents = concat [contents, c], .. }
    f (Parsed { .. }) (LDate    d ) = Parsed { date = Just d, .. }
    f (Parsed { .. }) (LSource  s ) = Parsed { source = Just s, .. }
    f (Parsed { .. }) (LTitle   t ) = Parsed { title = t, .. }
+-- Source should be the name of the node
+-- First line of each Context should be the title.
 documentLinesP :: Parser [Line]
 documentLinesP = do
  t <- titleP
  ls <- lineP `sepBy` newline
  pure $ [LTitle $ pack t] ++ ls
+documentLines :: Parser [Line]
+documentLines = do
+  ls <- lineP `sepBy` newline
+  pure ls
 lineP :: Parser Line
 lineP = do
  choice [ try authorsLineP
         , try dateLineP
         , try sourceLineP
-         , contentsLineP ]
+         , contentsLineP
+         ]
 authorsLineP :: Parser Line
 authorsLineP = do
@@ -167,6 +177,7 @@ titleDelimiterP = do
  _ <- newline
  -- _ <- try (string "==")
  pure ()
 titleP :: Parser [Char]
 titleP = manyTill anyChar (try titleDelimiterP)
@@ -223,11 +234,14 @@ tokenEnd :: Parser ()
 tokenEnd = void (char '\n') <|> eof
 --- MISC Tools
+text2titleParagraphs :: Int -> Text -> [(Text, Text)]
-text2paragraphs :: Int -> Text -> [Text]
+text2titleParagraphs n = catMaybes . List.map doTitle
-text2paragraphs n = List.map DT.concat
+                       . splitEvery n . List.map clean
-                  . splitEvery n . List.map clean
+                       . sentences . DT.concat . DT.lines
-                  . sentences . DT.concat . DT.lines
+doTitle :: [Text] -> Maybe (Text, Text)
+doTitle (t:ts) = Just (t, DT.concat ts)
+doTitle [] = Nothing
 clean :: Text -> Text
 clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words