[FLOW/FIX] extraction of ngrams ok for big corpora.

d2e8a845 · Alexandre Delanoë · 38ebe154 · d2e8a845 · d2e8a845 · d2e8a845
Commit d2e8a845 authored Mar 20, 2019 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 66 additions and 64 deletions

Main.hs bin/gargantext-import/Main.hs +6 -4

Flow.hs src/Gargantext/Database/Flow.hs +59 -60

GrandDebat.hs src/Gargantext/Text/Parsers/GrandDebat.hs +1 -0

No files found.
--- a/bin/gargantext-import/Main.hs
+++ b/bin/gargantext-import/Main.hs
@@ -49,9 +49,12 @@ main = do
  --}
  let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
      cmdCorpus = do
-        docs <- liftIO (splitEvery 500 <$> readFile corpusPath :: IO [[GrandDebatReference ]])
-        ids <- flowCorpus (Text.pack user) (Text.pack name) (Mono FR) docs
-        pure ids
+        docs <- liftIO ( splitEvery 500
+                       <$> take 10000
+                       <$> readFile corpusPath
+                       :: IO [[GrandDebatReference ]]
+                       )
+        flowCorpus (Text.pack user) (Text.pack name) (Multi FR) docs

     -- cmd = {-createUsers >>-} cmdCorpus

@@ -63,4 +66,3 @@ main = do
  _ <- runCmdDev env cmdCorpus
  pure ()

-
--- a/src/Gargantext/Database/Flow.hs
+++ b/src/Gargantext/Database/Flow.hs
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
 --import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
 --import Gargantext.Text.Metrics.TFICF (Tficf(..))
+--import Debug.Trace (trace)
 import Control.Monad (mapM_)
 import Control.Monad.IO.Class (liftIO)
 import Data.List (concat)
@@ -58,7 +59,7 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
 import Gargantext.Database.Utils (Cmd, CmdM)
 import Gargantext.Ext.IMT (toSchoolName)
 import Gargantext.Prelude
--import Gargantext.Text.List
+import Gargantext.Text.List (buildNgramsLists)
 --import Gargantext.Text.Parsers (parseDocs, FileFormat)
 import Gargantext.Text.Terms (TermType(..))
 import Gargantext.Text.Terms (extractTerms)
@@ -89,8 +90,8 @@ flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
 flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
           => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
 flowCorpus u cn la docs = do
-  ids <- mapM (\doc -> flowCorpusMaster la (map toHyperdataDocument doc)) docs
-  flowCorpusUser u cn $ concat ids
+  ids <- mapM ((insertMasterDocs la) . (map toHyperdataDocument)) docs
+  flowCorpusUser u cn (concat ids)


 -- TODO query with complex query
@@ -102,23 +103,19 @@ flowCorpusSearchInDatabase u q = do
  flowCorpusUser u q ids


-- TODO uniformize language of corpus
-flowCorpusMaster :: FlowCmdM env ServantErr m => TermType Lang -> [HyperdataDocument] -> m [NodeId]
-flowCorpusMaster la hd = insertMasterDocs la hd
-
-
-flowCorpusUser :: FlowCmdM env ServantErr m => Username -> CorpusName -> [NodeId] -> m CorpusId
+flowCorpusUser :: FlowCmdM env ServantErr m
+               => Username -> CorpusName -> [NodeId] -> m CorpusId
 flowCorpusUser userName corpusName ids = do
  -- User Flow
-  (_userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
+  (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  -- User List Flow
-  --(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-  --ngs         <- buildNgramsLists userCorpusId masterCorpusId
-  --userListId  <- flowList userId userCorpusId ngs
-  --printDebug "userListId" userListId
+  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
+  ngs         <- buildNgramsLists userCorpusId masterCorpusId
+  userListId  <- flowList userId userCorpusId ngs
+  printDebug "userListId" userListId

  -- User Graph Flow
  --_ <- mkGraph     userCorpusId userId
@@ -134,20 +131,23 @@ flowCorpusUser userName corpusName ids = do
 insertMasterDocs :: FlowCmdM env ServantErr m
                => TermType Lang -> [HyperdataDocument] -> m [DocId]
 insertMasterDocs lang hs  =  do
-  let hs' = map addUniqIdsDoc hs
-  let hyperdataDocuments' = map (\h -> ToDbDocument h) hs'
-
  (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName
-  ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'

-  let documentsWithId = mergeData (toInserted ids) (toInsert hs)
+  -- TODO Type NodeDocumentUnicised
+  let hs' = map addUniqIdsDoc hs
+  ids <- insertDocuments masterUserId masterCorpusId NodeDocument
+                         $ map ToDbDocument hs'
+                         --      ^ TODO Type class to insert Doc
+                         --         ^ TODO Type Class AddUnicity where unicity = addUnicity
+
+  let documentsWithId = mergeData (toInserted ids) (toInsert hs')
  docsWithNgrams     <- documentIdWithNgrams (extractNgramsT lang) documentsWithId

  let maps            = mapNodeIdNgrams docsWithNgrams

  terms2id <- insertNgrams $ DM.keys maps
  let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
-  _             <- insertToNodeNgrams indexedNgrams
+  _                <- insertToNodeNgrams indexedNgrams
  pure $ map reId ids


@@ -220,51 +220,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
  , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
  } deriving (Show)

-
+-- TODO extractNgrams according to Type of Data
 extractNgramsT :: HasNodeError err
               => TermType Lang -> HyperdataDocument
               -> Cmd err (Map Ngrams (Map NgramsType Int))
 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
-
-
-extractNgramsT' :: HasNodeError err
-               => TermType Lang -> HyperdataDocument
-               -> Cmd err (Map Ngrams (Map NgramsType Int))
-extractNgramsT' lang doc = do
-  let source    = text2ngrams
-                $ maybe "Nothing" identity
-                $ _hyperdataDocument_source doc
-
-      institutes = map text2ngrams
-                 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
-                 $ _hyperdataDocument_institutes doc
-
-      authors    = map text2ngrams
-                 $ maybe ["Nothing"] (splitOn ", ")
-                 $ _hyperdataDocument_authors doc
-
-      leText = catMaybes [ _hyperdataDocument_title    doc
-                         , _hyperdataDocument_abstract doc
-                         ]
-
-  terms' <- map text2ngrams
-         <$> map (intercalate " " . _terms_label)
-         <$> concat
-         <$> liftIO (extractTerms lang leText)
-
-  pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
-                     <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
-                     <> [(a', DM.singleton Authors     1) | a' <- authors    ]
-                     <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]
-
-
-filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-                     -> Map Ngrams (Map NgramsType Int)
-filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
  where
-    filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
-          True  -> (ng,y)
-          False -> (Ngrams (Text.take s' t) n , y)
+    extractNgramsT' :: HasNodeError err
+                   => TermType Lang -> HyperdataDocument
+                   -> Cmd err (Map Ngrams (Map NgramsType Int))
+    extractNgramsT' lang' doc = do
+      let source    = text2ngrams
+                    $ maybe "Nothing" identity
+                    $ _hyperdataDocument_source doc
+
+          institutes = map text2ngrams
+                     $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+                     $ _hyperdataDocument_institutes doc
+
+          authors    = map text2ngrams
+                     $ maybe ["Nothing"] (splitOn ", ")
+                     $ _hyperdataDocument_authors doc
+
+          leText = catMaybes [ _hyperdataDocument_title    doc
+                             , _hyperdataDocument_abstract doc
+                             ]
+
+      terms' <- map text2ngrams
+             <$> map (intercalate " " . _terms_label)
+             <$> concat
+             <$> liftIO (extractTerms lang' leText)
+
+      pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
+                         <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
+                         <> [(a', DM.singleton Authors     1) | a' <- authors    ]
+                         <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]
+
+
+    filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
+                         -> Map Ngrams (Map NgramsType Int)
+    filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
+      where
+        filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
+              True  -> (ng,y)
+              False -> (Ngrams (Text.take s' t) n , y)


 documentIdWithNgrams :: HasNodeError err

--- a/src/Gargantext/Text/Parsers/GrandDebat.hs
+++ b/src/Gargantext/Text/Parsers/GrandDebat.hs
@@ -97,6 +97,7 @@ class ReadFile a

 instance ReadFile [GrandDebatReference]
  where
+    -- | read json: 3 version below are working but with increased optimization
    --readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
    --readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
    readFile fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp