Commit 34fbf078 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FLOW][Database] ngrams extraction and insertion.

parent b2ef5a09
...@@ -21,15 +21,18 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list) ...@@ -21,15 +21,18 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
--import Gargantext.Database.Node.Contact (HyperdataContact(..)) --import Gargantext.Database.Node.Contact (HyperdataContact(..))
import Data.Map (Map, lookup) import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes) import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text, splitOn) import Data.Text (Text, splitOn, intercalate)
import Data.Tuple.Extra (both, second) import Data.Tuple.Extra (both, second)
import Data.List (concat)
import GHC.Show (Show) import GHC.Show (Show)
import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId) import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
import Gargantext.Core.Types.Individu (Username) import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
import Gargantext.Core (Lang(..))
import Gargantext.Database.Bashql (runCmd') -- , del) import Gargantext.Database.Bashql (runCmd') -- , del)
import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName) import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertToNodeNgrams) import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Database.Node.Document.Add (add) import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..)) import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRootCmd) import Gargantext.Database.Root (getRootCmd)
...@@ -41,6 +44,7 @@ import Gargantext.Database.Schema.User (getUser, UserLight(..)) ...@@ -41,6 +44,7 @@ import Gargantext.Database.Schema.User (getUser, UserLight(..))
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.Types.Node (NodeType(..), NodeId) import Gargantext.Database.Types.Node (NodeType(..), NodeId)
import Gargantext.Database.Utils (Cmd(..)) import Gargantext.Database.Utils (Cmd(..))
import Gargantext.Text.Terms (TermType(..))
import Gargantext.Ext.IMT (toSchoolName) import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile) import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude import Gargantext.Prelude
...@@ -102,7 +106,7 @@ flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, user ...@@ -102,7 +106,7 @@ flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, user
let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments) let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
-- printDebug "documentsWithId" documentsWithId -- printDebug "documentsWithId" documentsWithId
let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
-- printDebug "docsWithNgrams" docsWithNgrams -- printDebug "docsWithNgrams" docsWithNgrams
let maps = mapNodeIdNgrams docsWithNgrams let maps = mapNodeIdNgrams docsWithNgrams
...@@ -228,22 +232,31 @@ data DocumentIdWithNgrams = ...@@ -228,22 +232,31 @@ data DocumentIdWithNgrams =
-- TODO add Terms (Title + Abstract) -- TODO add Terms (Title + Abstract)
-- add f :: Text -> Text -- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text -- newtype Ngrams = Ngrams Text
extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int -- TODO group terms
extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)] extractNgramsT :: HyperdataDocument -> IO (Map (NgramsT Ngrams) Int)
<> [(NgramsT Institutes i' , 1)| i' <- institutes ] extractNgramsT doc = do
<> [(NgramsT Authors a' , 1)| a' <- authors ]
where let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
-- TODO group terms terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> extractTerms (Multi EN) leText
pure $ DM.fromList $ [(NgramsT Sources source, 1)]
<> [(NgramsT Institutes i' , 1)| i' <- institutes ]
<> [(NgramsT Authors a' , 1)| a' <- authors ]
<> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
-> [DocumentWithId] -> [DocumentIdWithNgrams] documentIdWithNgrams :: (HyperdataDocument -> IO (Map (NgramsT Ngrams) Int))
documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d)) -> [DocumentWithId] -> IO [DocumentIdWithNgrams]
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ documentData d
pure $ DocumentIdWithNgrams d e
-- | TODO check optimization -- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int) mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
......
...@@ -108,7 +108,7 @@ insertNodeNgramW nns = ...@@ -108,7 +108,7 @@ insertNodeNgramW nns =
insertNothing = (Insert { iTable = nodeNgramTable insertNothing = (Insert { iTable = nodeNgramTable
, iRows = nns , iRows = nns
, iReturning = rCount , iReturning = rCount
, iOnConflict = Nothing , iOnConflict = (Just DoNothing)
}) })
type NgramsText = Text type NgramsText = Text
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment