Commit 8f0fcd75 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[OPTIM] HashMap Ngrams ...

parent aaf4b338
Pipeline #1332 failed with stage
......@@ -35,15 +35,17 @@ module Gargantext.Core.Text.Terms
where
import Control.Lens
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import GHC.Generics (Generic)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import GHC.Base (String)
import GHC.Generics (Generic)
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import Gargantext.Core
import Gargantext.Core.Flow.Types
......@@ -114,17 +116,17 @@ class ExtractNgramsT h
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (Map Ngrams (Map NgramsType Int))
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
filterNgrams :: Int -> HashMap Ngrams (Map NgramsType Int)
-> HashMap Ngrams (Map NgramsType Int)
filterNgrams s = HashMap.mapKeys filter
where
filter' (ng,y)
| Text.length (ng ^. ngramsTerms) < s = (ng,y)
| otherwise = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
filter ng
| Text.length (ng ^. ngramsTerms) < s = ng
| otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
-- =======================================================
......
......@@ -22,6 +22,10 @@ partition p m = (HashMap.filter p m, HashMap.filter (not . p) m)
partitionWithKey :: (Ord a, Hashable k) => (k -> a -> Bool) -> HashMap k a -> (HashMap k a, HashMap k a)
partitionWithKey p m = (HashMap.filterWithKey p m, HashMap.filterWithKey (\k -> not . p k) m)
mapKeys :: (Ord k2, Hashable k2) => (k1->k2) -> HashMap k1 a -> HashMap k2 a
mapKeys f = HashMap.fromList . HashMap.foldrWithKey (\k x xs -> (f k, x) : xs) []
------------------------------------------------------------------------
-- getKeyWithMaxValue :: Hashable k => HashMap k a -> Maybe k
getKeysOrderedByValueMaxFirst :: (Ord k, Hashable k, Ord a) => HashMap k a -> [k]
......
......@@ -47,8 +47,9 @@ module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import qualified Data.Map as Map
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
......@@ -58,6 +59,9 @@ import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
import Gargantext.Core (Lang(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
......@@ -259,27 +263,27 @@ insertMasterDocs c lang hs = do
-- this will enable global database monitoring
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs :: Map Ngrams (Map NgramsType (Map NodeId Int))
mapNgramsDocs :: HashMap Ngrams (Map NgramsType (Map NodeId Int))
<- mapNodeIdNgrams
<$> documentIdWithNgrams
(extractNgramsT $ withLang lang documentsWithId)
documentsWithId
terms2id <- insertNgrams $ Map.keys mapNgramsDocs
terms2id <- insertNgrams $ HashMap.keys mapNgramsDocs
-- to be removed
let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- new
lId <- getOrMkList masterCorpusId masterUserId
mapCgramsId <- listInsertDb lId toNodeNgramsW'
$ map (first _ngramsTerms . second Map.keys)
$ Map.toList mapNgramsDocs
$ HashMap.toList mapNgramsDocs
-- insertDocNgrams
_return <- insertNodeNodeNgrams2
$ catMaybes [ NodeNodeNgrams2 <$> Just nId
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
| (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
| (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
]
......@@ -339,40 +343,57 @@ mergeData rs = catMaybes . map toDocumentWithId . Map.toList
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (Map Ngrams (Map NgramsType Int)))
-> Cmd err (HashMap b (Map NgramsType Int)))
-> [Indexed NodeId a]
-> Cmd err [DocumentIdWithNgrams a]
-> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e
-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
=> [DocumentIdWithNgrams a b]
-> HashMap b
(Map NgramsType
(Map NodeId Int)
)
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
where
f :: DocumentIdWithNgrams a b
-> HashMap b (Map NgramsType (Map NodeId Int))
f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
where
nId = _index $ documentWithId d
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
extractNgramsT l hc = filterNgrams 255 <$> extract l hc
where
extract :: TermType Lang -> HyperdataContact
-> Cmd err (Map Ngrams (Map NgramsType Int))
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
extract _l hc' = do
let authors = map text2ngrams
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
pure $ HashMap.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT :: TermType Lang
-> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgrams 255 <$> extractNgramsT' lang hd
where
extractNgramsT' :: TermType Lang
-> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
-> Cmd err (HashMap Ngrams (Map NgramsType Int))
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
......@@ -391,7 +412,7 @@ instance ExtractNgramsT HyperdataDocument
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
pure $ HashMap.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
......
......@@ -82,20 +82,6 @@ flowList_Tficf' u m nt f = do
-}
-- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams a]
-> Map Ngrams
(Map NgramsType
(Map NodeId Int)
)
mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
where
f :: DocumentIdWithNgrams a
-> Map Ngrams (Map NgramsType (Map NodeId Int))
f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
where
nId = _index $ documentWithId d
------------------------------------------------------------------------
flowList_DbRepo :: FlowCmdM env err m
=> ListId
......
......@@ -14,6 +14,7 @@ module Gargantext.Database.Action.Flow.Utils
where
import Data.Map (Map)
import Data.HashMap.Strict (HashMap)
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Query.Table.NodeNodeNgrams
......@@ -21,16 +22,15 @@ import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Types
import Gargantext.Prelude
import qualified Data.Map as DM
import qualified Data.HashMap.Strict as HashMap
type DocumentWithId a = Indexed NodeId a
data DocumentIdWithNgrams a =
data DocumentIdWithNgrams a b =
DocumentIdWithNgrams
{ documentWithId :: DocumentWithId a
, documentNgrams :: Map Ngrams (Map NgramsType Int)
{ documentWithId :: Indexed NodeId a
, documentNgrams :: HashMap b (Map NgramsType Int)
} deriving (Show)
docNgrams2nodeNodeNgrams :: CorpusId
-> DocNgrams
-> NodeNodeNgrams
......@@ -51,11 +51,11 @@ insertDocNgramsOn cId dn =
$ (map (docNgrams2nodeNodeNgrams cId) dn)
insertDocNgrams :: CorpusId
-> Map (Indexed Int Ngrams) (Map NgramsType (Map NodeId Int))
-> HashMap (Indexed Int Ngrams) (Map NgramsType (Map NodeId Int))
-> Cmd err Int
insertDocNgrams cId m =
insertDocNgramsOn cId [ DocNgrams n (_index ng) (ngramsTypeId t) (fromIntegral i)
| (ng, t2n2i) <- DM.toList m
| (ng, t2n2i) <- HashMap.toList m
, (t, n2i) <- DM.toList t2n2i
, (n, i) <- DM.toList n2i
]
......
......@@ -146,6 +146,8 @@ data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
}
deriving (Generic, Show, Eq, Ord)
instance Hashable Ngrams
makeLenses ''Ngrams
instance PGS.ToRow Ngrams where
toRow (UnsafeNgrams t s) = [toField t, toField s]
......
......@@ -14,6 +14,7 @@ Portability : POSIX
module Gargantext.Database.Types
where
import Data.Hashable (Hashable)
import Gargantext.Core.Text (HasText(..))
import Gargantext.Database.Schema.Prelude
import Gargantext.Prelude
......@@ -38,3 +39,4 @@ instance HasText a => HasText (Indexed i a)
where
hasText (Indexed _ a) = hasText a
instance (Hashable a, Hashable b) => Hashable (Indexed a b)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment