Commit 30aae72b authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FLOW][DB][Ngrams] group function.

parent 0ee7da5d
...@@ -21,7 +21,6 @@ Thanks @yannEsposito for this. ...@@ -21,7 +21,6 @@ Thanks @yannEsposito for this.
{-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
...@@ -317,7 +316,7 @@ type Desc t n = Description (AppendSymbol (TypeName t) (AppendSymbol " | " n)) ...@@ -317,7 +316,7 @@ type Desc t n = Description (AppendSymbol (TypeName t) (AppendSymbol " | " n))
swaggerDoc :: Swagger swaggerDoc :: Swagger
swaggerDoc = toSwagger (Proxy :: Proxy GargAPI) swaggerDoc = toSwagger (Proxy :: Proxy GargAPI)
& info.title .~ "Gargantext" & info.title .~ "Gargantext"
& info.version .~ "0.1.0" & info.version .~ "4.0.2" -- TODO same version as Gargantext
-- & info.base_url ?~ (URL "http://gargantext.org/") -- & info.base_url ?~ (URL "http://gargantext.org/")
& info.description ?~ "REST API specifications" & info.description ?~ "REST API specifications"
-- & tags .~ Set.fromList [Tag "Garg" (Just "Main perations") Nothing] -- & tags .~ Set.fromList [Tag "Garg" (Just "Main perations") Nothing]
......
...@@ -25,56 +25,70 @@ authors ...@@ -25,56 +25,70 @@ authors
module Gargantext.Database.Flow module Gargantext.Database.Flow
where where
import System.FilePath (FilePath) import System.FilePath (FilePath)
import Data.Maybe (Maybe(..)) import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text) import Data.Text (Text)
import Data.Map (Map) import Data.Map (Map)
import Data.Tuple.Extra (both, second) import Data.Tuple.Extra (both, second)
import qualified Data.Map as DM import qualified Data.Map as DM
import Gargantext.Core.Types (NodePoly(..), ListType(..), listId) import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
import Gargantext.Prelude
import Gargantext.Database.Bashql (runCmd', del) import Gargantext.Database.Bashql (runCmd', del)
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList) import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.Node.Document.Add (add) import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams) import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew) import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS)) import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
type UserId = Int type UserId = Int
type RootId = Int type RootId = Int
type CorpusId = Int type CorpusId = Int
flow :: FilePath -> IO Int flow :: FilePath -> CorpusName -> IO [Int]
flow fp = do flow fp cName = do
(masterUserId, _, corpusId) <- subFlow "gargantua" -- Corus Flow
(masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
docs <- map addUniqIds <$> parseDocs WOS fp
ids <- runCmd' $ insertDocuments masterUserId corpusId docs -- Documents Flow
printDebug "Docs IDs : " ids hyperdataDocuments <- map addUniqIds <$> parseDocs WOS fp
ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs printDebug "Docs IDs : " (length ids)
printDebug "Docs IDs : " idsRepeat idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
(_, _, corpusId2) <- subFlow "alexandre" -- Ngrams Flow
let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams
indexedNgrams <- runCmd' $ indexNgrams maps
_ <- runCmd' $ insertToNodeNgrams indexedNgrams
-- List Flow
listId <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
printDebug "list id:" listId
printDebug "Docs IDs : " (length idsRepeat)
(_, _, corpusId2) <- subFlow "alexandre" cName
inserted <- runCmd' $ add corpusId2 (map reId ids) inserted <- runCmd' $ add corpusId2 (map reId ids)
printDebug "Inserted : " inserted printDebug "Inserted : " (length inserted)
runCmd' $ del [corpusId2, corpusId] pure [corpusId2, corpusId]
--runCmd' $ del [corpusId2, corpusId]
subFlow :: Username -> IO (UserId, RootId, CorpusId) type CorpusName = Text
subFlow username = do
subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
subFlow username cName = do
maybeUserId <- runCmd' (getUser username) maybeUserId <- runCmd' (getUser username)
let userId = case maybeUserId of let userId = case maybeUserId of
Nothing -> panic "Error: User does not exist (yet)" Nothing -> panic "Error: User does not exist (yet)"
-- mk NodeUser gargantua_id "Node Gargantua" -- mk NodeUser gargantua_id "Node Gargantua"
Just user -> userLight_id user Just user -> userLight_id user
...@@ -82,24 +96,24 @@ subFlow username = do ...@@ -82,24 +96,24 @@ subFlow username = do
rootId'' <- case rootId' of rootId'' <- case rootId' of
[] -> runCmd' (mkRoot userId) [] -> runCmd' (mkRoot userId)
un -> case length un >= 2 of n -> case length n >= 2 of
True -> panic "Error: more than 1 userNode / user" True -> panic "Error: more than 1 userNode / user"
False -> pure rootId' False -> pure rootId'
let rootId = maybe (panic "error rootId") identity (head rootId'') let rootId = maybe (panic "error rootId") identity (head rootId'')
corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
let corpusId = maybe (panic "error corpusId") identity (head corpusId') let corpusId = maybe (panic "error corpusId") identity (head corpusId')
printDebug "(username, userId, rootId, corpusId" printDebug "(username, userId, rootId, corpusId)"
(username, userId, rootId, corpusId) (username, userId, rootId, corpusId)
pure (userId, rootId, corpusId) pure (userId, rootId, corpusId)
---------------------------------------------------------------- ------------------------------------------------------------------------
type HashId = Text type HashId = Text
type NodeId = Int type NodeId = Int
type ListId = Int type ListId = Int
type ToInsert = Map HashId HyperdataDocument
type Inserted = Map HashId ReturnId
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d)) toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
...@@ -111,24 +125,33 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) ) ...@@ -111,24 +125,33 @@ toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
$ filter (\r -> reInserted r == True) rs $ filter (\r -> reInserted r == True) rs
data DocumentWithId = data DocumentWithId =
DocumentWithId DocumentWithId { documentId :: NodeId
{ documentId :: NodeId , documentData :: HyperdataDocument
, documentData :: HyperdataDocument }
}
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId] mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
where where
lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs) lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
------------------------------------------------------------------------
data DocumentIdWithNgrams = data DocumentIdWithNgrams =
DocumentIdWithNgrams DocumentIdWithNgrams
{ documentWithId :: DocumentWithId { documentWithId :: DocumentWithId
, document_ngrams :: Map (NgramsT Ngrams) Int , document_ngrams :: Map (NgramsT Ngrams) Int
} }
-- TODO add Authors and Terms (Title + Abstract)
-- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text
extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
extractNgramsT doc = DM.fromList $ [(NgramsT Sources ngrams, 1)]
where
ngrams = text2ngrams $ maybe "Nothing" identity maybeNgrams
maybeNgrams = _hyperdataDocument_source doc
documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int) documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
-> [DocumentWithId] -> [DocumentIdWithNgrams] -> [DocumentWithId] -> [DocumentIdWithNgrams]
documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d)) documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
...@@ -149,22 +172,41 @@ indexNgrams ng2nId = do ...@@ -149,22 +172,41 @@ indexNgrams ng2nId = do
insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng) insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
(fromIntegral n) ((ngramsTypeId . _ngramsType) ng) (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
| (ng, nId2int) <- DM.toList m
| (ng, nId2int) <- DM.toList m , (nId, n) <- DM.toList nId2int
, (nId, n) <- DM.toList nId2int
] ]
------------------------------------------------------------------------
------------------------------------------------------------------------
listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
listFlow uId cId ng = do
lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
-- TODO add stemming equivalence of 2 ngrams
let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ng
_ <- insertGroups lId groupEd
-- compute Candidate / Map
let lists = ngrams2list ng
_ <- insertLists lId lists
pure lId
------------------------------------------------------------------------ ------------------------------------------------------------------------
groupNgramsBy :: (Ngrams -> Ngrams -> Bool)
groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
-> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
-> Map NgramsIndexed NgramsIndexed -> Map NgramsIndexed NgramsIndexed
groupNgramsBy = undefined groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
-- TODO check: do not insert duplicates
insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
insertGroups lId ngrs = insertGroups lId ngrs =
insertNodeNgramsNgramsNew $ [ NodeNgramsNgrams lId ng1 ng2 (Just 1) insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
| (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
] ]
------------------------------------------------------------------------ ------------------------------------------------------------------------
ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed
...@@ -177,18 +219,6 @@ insertLists lId list2ngrams = ...@@ -177,18 +219,6 @@ insertLists lId list2ngrams =
| (l,ngr) <- map (second _ngramsId) $ DM.toList list2ngrams | (l,ngr) <- map (second _ngramsId) $ DM.toList list2ngrams
] ]
------------------------------------------------------------------------
listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
listFlow uId cId ng = do
lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
-- TODO add stemming equivalence of 2 ngrams
let groupEd = groupNgramsBy (==) ng
_ <- insertGroups lId groupEd
-- compute Candidate / Map
let lists = ngrams2list ng
_ <- insertLists lId lists
pure lId
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -27,7 +27,7 @@ module Gargantext.Database.Ngrams where ...@@ -27,7 +27,7 @@ module Gargantext.Database.Ngrams where
import Control.Lens (makeLenses) import Control.Lens (makeLenses)
import Data.ByteString.Internal (ByteString) import Data.ByteString.Internal (ByteString)
import Data.Map (Map, fromList, lookup) import Data.Map (Map, fromList, lookup)
import Data.Text (Text) import Data.Text (Text, splitOn)
import Database.PostgreSQL.Simple.FromRow (fromRow, field) import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ (sql) import Database.PostgreSQL.Simple.SqlQQ (sql)
import Database.PostgreSQL.Simple.ToField (toField) import Database.PostgreSQL.Simple.ToField (toField)
...@@ -77,6 +77,7 @@ import qualified Database.PostgreSQL.Simple as DPS ...@@ -77,6 +77,7 @@ import qualified Database.PostgreSQL.Simple as DPS
-- ngrams in authors field of document has Authors Type -- ngrams in authors field of document has Authors Type
-- ngrams in text (title or abstract) of documents has Terms Type -- ngrams in text (title or abstract) of documents has Terms Type
data NgramsType = Sources | Authors | Terms data NgramsType = Sources | Authors | Terms
deriving (Eq)
ngramsTypeId :: NgramsType -> Int ngramsTypeId :: NgramsType -> Int
ngramsTypeId Terms = 1 ngramsTypeId Terms = 1
...@@ -100,6 +101,9 @@ makeLenses ''Ngrams ...@@ -100,6 +101,9 @@ makeLenses ''Ngrams
instance DPS.ToRow Ngrams where instance DPS.ToRow Ngrams where
toRow (Ngrams t s) = [toField t, toField s] toRow (Ngrams t s) = [toField t, toField s]
text2ngrams :: Text -> Ngrams
text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
------------------------------------------------------------------------- -------------------------------------------------------------------------
-- | TODO put it in Gargantext.Text.Ngrams -- | TODO put it in Gargantext.Text.Ngrams
-- Named entity are typed ngrams of Terms Ngrams -- Named entity are typed ngrams of Terms Ngrams
...@@ -108,9 +112,16 @@ data NgramsT a = ...@@ -108,9 +112,16 @@ data NgramsT a =
, _ngramsT :: a , _ngramsT :: a
} deriving (Generic) } deriving (Generic)
instance Eq (NgramsT a) where (==) = (==) instance Eq (NgramsT a)
where (==) = (==)
-- where NgramsT
-- t1 == t2
-- n1 == n2
instance Ord (NgramsT a) where compare = compare instance Ord (NgramsT a) where compare = compare
makeLenses ''NgramsT makeLenses ''NgramsT
----------------------------------------------------------------------- -----------------------------------------------------------------------
data NgramsIndexed = data NgramsIndexed =
NgramsIndexed NgramsIndexed
......
...@@ -59,15 +59,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id) ...@@ -59,15 +59,15 @@ import Data.Graph.Clustering.Louvain.CplusPlus (cLouvain, l_community_id)
contextText :: [T.Text] contextText :: [T.Text]
contextText = map T.pack ["The dog is an animal." contextText = map T.pack ["The dog is an animal."
,"The bird is a animal." ,"The bird is an animal."
,"The dog is a animal." ,"The dog is an animal."
,"The animal is a bird or a dog ?" ,"The animal is a bird or a dog ?"
,"The table is a object." ,"The table is an object."
,"The pen is a object." ,"The pen is an object."
,"The object is a pen or a table ?" ,"The object is a pen or a table ?"
,"The girl is human body." ,"The girl is a human."
,"The boy is human body." ,"The boy is a human."
,"The boy or the girl are human body." ,"The boy or the girl are human."
] ]
...@@ -75,7 +75,7 @@ contextText = map T.pack ["The dog is an animal." ...@@ -75,7 +75,7 @@ contextText = map T.pack ["The dog is an animal."
data TextFlow = CSV FilePath data TextFlow = CSV FilePath
| FullText FilePath | FullText FilePath
| Contexts [T.Text] | Contexts [T.Text]
| DB Connection CorpusId | DBV3 Connection CorpusId
| Query T.Text | Query T.Text
...@@ -85,7 +85,7 @@ textFlow termType workType = do ...@@ -85,7 +85,7 @@ textFlow termType workType = do
FullText path -> splitBy (Sentences 5) <$> readFile path FullText path -> splitBy (Sentences 5) <$> readFile path
CSV path -> readCsvOn [csv_title, csv_abstract] path CSV path -> readCsvOn [csv_title, csv_abstract] path
Contexts ctxt -> pure ctxt Contexts ctxt -> pure ctxt
DB con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId DBV3 con corpusId -> catMaybes <$> map (\n -> hyperdataDocumentV3_title (_node_hyperdata n) <> hyperdataDocumentV3_abstract (_node_hyperdata n))<$> getDocumentsV3WithParentId con corpusId
_ -> undefined -- TODO Query not supported _ -> undefined -- TODO Query not supported
textFlow' termType contexts textFlow' termType contexts
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment