Commit 8f0fcd75 authored by Alexandre Delanoë

[OPTIM] HashMap Ngrams ...

parent aaf4b338
@@ -35,15 +35,17 @@ module Gargantext.Core.Text.Terms
   where

 import Control.Lens
+import Data.HashMap.Strict (HashMap)
 import Data.Map (Map)
-import qualified Data.Map as Map
 import Data.Text (Text)
 import Data.Traversable
+import GHC.Base (String)
+import GHC.Generics (Generic)
 import qualified Data.List as List
+import qualified Data.Map as Map
 import qualified Data.Set as Set
 import qualified Data.Text as Text
-import GHC.Base (String)
-import GHC.Generics (Generic)
+import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
 import Gargantext.Core
 import Gargantext.Core.Flow.Types
@@ -114,17 +116,17 @@ class ExtractNgramsT h
     extractNgramsT :: HasText h
                    => TermType Lang
                    -> h
-                   -> Cmd err (Map Ngrams (Map NgramsType Int))
+                   -> Cmd err (HashMap Ngrams (Map NgramsType Int))

-filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-              -> Map Ngrams (Map NgramsType Int)
-filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
+filterNgrams :: Int -> HashMap Ngrams (Map NgramsType Int)
+             -> HashMap Ngrams (Map NgramsType Int)
+filterNgrams s = HashMap.mapKeys filter
   where
-    filter' (ng,y)
-      | Text.length (ng ^. ngramsTerms) < s = (ng,y)
-      | otherwise = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
+    filter ng
+      | Text.length (ng ^. ngramsTerms) < s = ng
+      | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))

 -- =======================================================
...
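Note on the new filterNgrams: truncating a key can make two distinct Ngrams collide on the same term, and the mapKeys helper (built on fromList) then keeps only one of the colliding values rather than merging them. A minimal sketch of that behaviour, not part of the commit, using plain Text keys instead of Ngrams and a hypothetical truncateKeys helper:

{-# LANGUAGE OverloadedStrings #-}
import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           Data.Text (Text)
import qualified Data.Text as Text

-- Cut every key down to at most s characters and rebuild the map.
truncateKeys :: Int -> HashMap Text Int -> HashMap Text Int
truncateKeys s = HashMap.fromList
               . map (\(k, v) -> (Text.take s k, v))
               . HashMap.toList

main :: IO ()
main = print (truncateKeys 5 (HashMap.fromList [("transcription", 2), ("transcript", 3), ("gene", 1)]))
-- "transcription" and "transcript" both collapse to "trans"; fromList keeps
-- one of the two counts (it does not sum them), mirroring the mapKeys helper.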
@@ -22,6 +22,10 @@ partition p m = (HashMap.filter p m, HashMap.filter (not . p) m)
 partitionWithKey :: (Ord a, Hashable k) => (k -> a -> Bool) -> HashMap k a -> (HashMap k a, HashMap k a)
 partitionWithKey p m = (HashMap.filterWithKey p m, HashMap.filterWithKey (\k -> not . p k) m)

+mapKeys :: (Ord k2, Hashable k2) => (k1->k2) -> HashMap k1 a -> HashMap k2 a
+mapKeys f = HashMap.fromList . HashMap.foldrWithKey (\k x xs -> (f k, x) : xs) []
+
 ------------------------------------------------------------------------
 -- getKeyWithMaxValue :: Hashable k => HashMap k a -> Maybe k
 getKeysOrderedByValueMaxFirst :: (Ord k, Hashable k, Ord a) => HashMap k a -> [k]
...
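The helper rebuilds the map by way of a list, presumably because the unordered-containers version in use does not ship a mapKeys for HashMap. A small usage sketch, assuming Gargantext.Data.HashMap.Strict.Utils exports mapKeys as defined above (byLength is a hypothetical example, not project code):

import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           Gargantext.Data.HashMap.Strict.Utils (mapKeys)

-- Re-key a map by applying a function to every key.
byLength :: HashMap Int [String]
byLength = mapKeys length (HashMap.fromList [("foo", ["a"]), ("quux", ["b"])])
-- fromList [(3,["a"]),(4,["b"])]
-- If the key function sends two keys to the same new key, only one of the
-- values survives (fromList semantics); values are not combined.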
@@ -47,8 +47,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
 import Control.Lens ((^.), view, _Just, makeLenses)
 import Data.Aeson.TH (deriveJSON)
 import Data.Either
+import Data.HashMap.Strict (HashMap)
+import Data.Hashable (Hashable)
 import Data.List (concat)
-import qualified Data.Map as Map
 import Data.Map (Map, lookup)
 import Data.Maybe (catMaybes)
 import Data.Monoid
@@ -58,6 +59,9 @@ import Data.Traversable (traverse)
 import Data.Tuple.Extra (first, second)
 import GHC.Generics (Generic)
 import System.FilePath (FilePath)
+import qualified Data.HashMap.Strict as HashMap
+import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
+import qualified Data.Map as Map
 import Gargantext.Core (Lang(..))
 import Gargantext.Core.Ext.IMT (toSchoolName)
@@ -259,27 +263,27 @@ insertMasterDocs c lang hs = do
   -- this will enable global database monitoring

   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-  mapNgramsDocs :: Map Ngrams (Map NgramsType (Map NodeId Int))
+  mapNgramsDocs :: HashMap Ngrams (Map NgramsType (Map NodeId Int))
                 <- mapNodeIdNgrams
                 <$> documentIdWithNgrams
                     (extractNgramsT $ withLang lang documentsWithId)
                     documentsWithId

-  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
+  terms2id <- insertNgrams $ HashMap.keys mapNgramsDocs
   -- to be removed
-  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
+  let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

   -- new
   lId      <- getOrMkList masterCorpusId masterUserId
   mapCgramsId <- listInsertDb lId toNodeNgramsW'
                $ map (first _ngramsTerms . second Map.keys)
-               $ Map.toList mapNgramsDocs
+               $ HashMap.toList mapNgramsDocs
   -- insertDocNgrams
   _return <- insertNodeNodeNgrams2
            $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                          <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                          <*> Just (fromIntegral w :: Double)
-                       | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
+                       | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                        , (nId, w) <- Map.toList mapNodeIdWeight
                        ]
@@ -339,40 +343,57 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 ------------------------------------------------------------------------
 documentIdWithNgrams :: HasNodeError err
                      => (a
-                          -> Cmd err (Map Ngrams (Map NgramsType Int)))
+                          -> Cmd err (HashMap b (Map NgramsType Int)))
                      -> [Indexed NodeId a]
-                     -> Cmd err [DocumentIdWithNgrams a]
+                     -> Cmd err [DocumentIdWithNgrams a b]
 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
   where
     toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e

+-- | TODO check optimization
+mapNodeIdNgrams :: (Ord b, Hashable b)
+                => [DocumentIdWithNgrams a b]
+                -> HashMap b
+                       (Map NgramsType
+                            (Map NodeId Int)
+                       )
+mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
+  where
+    f :: DocumentIdWithNgrams a b
+      -> HashMap b (Map NgramsType (Map NodeId Int))
+    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
+      where
+        nId = _index $ documentWithId d
+
 ------------------------------------------------------------------------
 instance ExtractNgramsT HyperdataContact
   where
-    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
+    extractNgramsT l hc = filterNgrams 255 <$> extract l hc
       where
         extract :: TermType Lang -> HyperdataContact
-                -> Cmd err (Map Ngrams (Map NgramsType Int))
+                -> Cmd err (HashMap Ngrams (Map NgramsType Int))
         extract _l hc' = do
           let authors = map text2ngrams
                       $ maybe ["Nothing"] (\a -> [a])
                       $ view (hc_who . _Just . cw_lastName) hc'

-          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
+          pure $ HashMap.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]

 instance ExtractNgramsT HyperdataDocument
   where
     extractNgramsT :: TermType Lang
                    -> HyperdataDocument
-                   -> Cmd err (Map Ngrams (Map NgramsType Int))
-    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
+                   -> Cmd err (HashMap Ngrams (Map NgramsType Int))
+    extractNgramsT lang hd = filterNgrams 255 <$> extractNgramsT' lang hd
       where
         extractNgramsT' :: TermType Lang
                         -> HyperdataDocument
-                        -> Cmd err (Map Ngrams (Map NgramsType Int))
+                        -> Cmd err (HashMap Ngrams (Map NgramsType Int))
         extractNgramsT' lang' doc = do
           let source = text2ngrams
                      $ maybe "Nothing" identity
@@ -391,7 +412,7 @@ instance ExtractNgramsT HyperdataDocument
                       <$> concat
                       <$> liftBase (extractTerms lang' $ hasText doc)

-          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
+          pure $ HashMap.fromList $ [(source, Map.singleton Sources 1)]
                              <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
                              <> [(a', Map.singleton Authors 1)    | a' <- authors    ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]
...
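For reference, a minimal self-contained sketch, not from the commit, of the aggregation pattern behind the relocated mapNodeIdNgrams: each document's ngrams counts are tagged with that document's node id, and the per-document maps are merged, summing counts at the innermost level. NodeId' and NgramsType' are stand-ins for the real types, and the explicit foldl' replaces the project's unionsWith helper:

import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           Data.List (foldl')
import           Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map

type NodeId'     = Int     -- stand-in for the real NodeId
type NgramsType' = String  -- stand-in for the real NgramsType

-- Tag each per-document map with its document id, then merge all documents,
-- summing the per-node counts at the innermost level.
aggregate :: [(NodeId', HashMap String (Map NgramsType' Int))]
          -> HashMap String (Map NgramsType' (Map NodeId' Int))
aggregate = foldl' (HashMap.unionWith (Map.unionWith (Map.unionWith (+)))) HashMap.empty
          . map (\(nId, ngs) -> fmap (fmap (Map.singleton nId)) ngs)

-- aggregate [ (1, HashMap.fromList [("cell", Map.fromList [("Terms", 2)])])
--           , (2, HashMap.fromList [("cell", Map.fromList [("Terms", 1)])]) ]
--   == HashMap.fromList [("cell", Map.fromList [("Terms", Map.fromList [(1,2),(2,1)])])]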
@@ -82,20 +82,6 @@ flowList_Tficf' u m nt f = do
 -}

--- | TODO check optimization
-mapNodeIdNgrams :: [DocumentIdWithNgrams a]
-                -> Map Ngrams
-                       (Map NgramsType
-                            (Map NodeId Int)
-                       )
-mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
-  where
-    f :: DocumentIdWithNgrams a
-      -> Map Ngrams (Map NgramsType (Map NodeId Int))
-    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
-      where
-        nId = _index $ documentWithId d
-
 ------------------------------------------------------------------------
 flowList_DbRepo :: FlowCmdM env err m
                 => ListId
...
@@ -14,6 +14,7 @@ module Gargantext.Database.Action.Flow.Utils
   where

 import Data.Map (Map)
+import Data.HashMap.Strict (HashMap)
 import Gargantext.Database.Admin.Types.Node
 import Gargantext.Database.Prelude (Cmd)
 import Gargantext.Database.Query.Table.NodeNodeNgrams
@@ -21,16 +22,15 @@ import Gargantext.Database.Schema.Ngrams
 import Gargantext.Database.Types
 import Gargantext.Prelude
 import qualified Data.Map as DM
+import qualified Data.HashMap.Strict as HashMap

-type DocumentWithId a = Indexed NodeId a
-
-data DocumentIdWithNgrams a =
+data DocumentIdWithNgrams a b =
   DocumentIdWithNgrams
-  { documentWithId :: DocumentWithId a
-  , documentNgrams :: Map Ngrams (Map NgramsType Int)
+  { documentWithId :: Indexed NodeId a
+  , documentNgrams :: HashMap b (Map NgramsType Int)
   } deriving (Show)

 docNgrams2nodeNodeNgrams :: CorpusId
                          -> DocNgrams
                          -> NodeNodeNgrams
@@ -51,11 +51,11 @@ insertDocNgramsOn cId dn =
   $ (map (docNgrams2nodeNodeNgrams cId) dn)

 insertDocNgrams :: CorpusId
-                -> Map (Indexed Int Ngrams) (Map NgramsType (Map NodeId Int))
+                -> HashMap (Indexed Int Ngrams) (Map NgramsType (Map NodeId Int))
                 -> Cmd err Int
 insertDocNgrams cId m =
   insertDocNgramsOn cId [ DocNgrams n (_index ng) (ngramsTypeId t) (fromIntegral i)
-                        | (ng, t2n2i) <- DM.toList m
+                        | (ng, t2n2i) <- HashMap.toList m
                         , (t, n2i) <- DM.toList t2n2i
                         , (n, i) <- DM.toList n2i
                         ]
...
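A minimal sketch, not from the commit, of the flattening that insertDocNgrams performs over the new HashMap: the nested ngrams -> type -> node -> count structure becomes one row per triple, ready for a bulk insert (String and Int stand in for Ngrams, NgramsType and NodeId):

import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map

-- One row per (ngrams, ngrams type, node) triple.
flattenRows :: HashMap String (Map String (Map Int Int))  -- ngrams -> type -> node -> count
            -> [(String, String, Int, Int)]               -- (ngrams, type, node, count)
flattenRows m = [ (ng, t, n, i)
                | (ng, t2n2i) <- HashMap.toList m
                , (t,  n2i)   <- Map.toList t2n2i
                , (n,  i)     <- Map.toList n2i
                ]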
@@ -146,6 +146,8 @@ data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
                            }
   deriving (Generic, Show, Eq, Ord)

+instance Hashable Ngrams
+
 makeLenses ''Ngrams
 instance PGS.ToRow Ngrams where
   toRow (UnsafeNgrams t s) = [toField t, toField s]
...
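The empty instance compiles because Ngrams derives Generic and the hashable package fills in the methods from the Generic representation. A hypothetical stand-in type (MyNgrams, not project code) showing the same pattern:

{-# LANGUAGE DeriveGeneric #-}
import Data.Hashable (Hashable)
import Data.Text (Text)
import GHC.Generics (Generic)

-- Stand-in for Ngrams, just to show the Generic-derived Hashable pattern.
data MyNgrams = MyNgrams { myTerms :: Text
                         , mySize  :: Int
                         } deriving (Generic, Show, Eq, Ord)

-- No method definitions needed: hashWithSalt comes from the Generic default.
instance Hashable MyNgrams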
@@ -14,6 +14,7 @@ Portability : POSIX
 module Gargantext.Database.Types
   where

+import Data.Hashable (Hashable)
 import Gargantext.Core.Text (HasText(..))
 import Gargantext.Database.Schema.Prelude
 import Gargantext.Prelude
@@ -38,3 +39,4 @@ instance HasText a => HasText (Indexed i a)
   where
     hasText (Indexed _ a) = hasText a

+instance (Hashable a, Hashable b) => Hashable (Indexed a b)
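This instance is what lets Indexed values, such as the Indexed Int Ngrams keys taken by insertDocNgrams, serve as HashMap keys. A hypothetical, self-contained stand-in (record accessors of the real Indexed omitted) illustrating the idea:

{-# LANGUAGE DeriveGeneric #-}
import           Data.Hashable (Hashable)
import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HashMap
import           GHC.Generics (Generic)

-- Stand-in mirroring the shape of Gargantext.Database.Types.Indexed.
data Indexed i a = Indexed i a
  deriving (Generic, Show, Eq, Ord)

instance (Hashable i, Hashable a) => Hashable (Indexed i a)

-- With the instance in place, Indexed values can be HashMap keys.
demo :: HashMap (Indexed Int String) Double
demo = HashMap.singleton (Indexed 42 "candidate") 1.0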