Commit 05aa3d7e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[INDEXING] WIP

parent 44be4e4c
...@@ -103,18 +103,9 @@ getJson path = L.readFile path ...@@ -103,18 +103,9 @@ getJson path = L.readFile path
-- | Parse | -- -- | Parse | --
--------------- ---------------
-- | To filter the Ngrams of a document based on the termList -- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text]) filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y,termsInText patterns d) filterTerms patterns (y,d) = (y,termsInText patterns d)
where
--------------------------------------
termsInText :: Patterns -> Text -> [Text]
termsInText pats txt = DL.nub
$ DL.concat
$ map (map unwords)
$ extractTermsWithList pats txt
--------------------------------------
-- | To transform a Csv nfile into a readable corpus -- | To transform a Csv nfile into a readable corpus
......
...@@ -10,8 +10,6 @@ Portability : POSIX ...@@ -10,8 +10,6 @@ Portability : POSIX
Count API part of Gargantext. Count API part of Gargantext.
-} -}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveAnyClass #-}
...@@ -19,23 +17,18 @@ Count API part of Gargantext. ...@@ -19,23 +17,18 @@ Count API part of Gargantext.
module Gargantext.API.Count module Gargantext.API.Count
where where
import GHC.Generics (Generic)
import Data.Aeson hiding (Error) import Data.Aeson hiding (Error)
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Either import Data.Either
import Data.List (permutations) import Data.List (permutations)
import Data.Swagger import Data.Swagger
import Data.Text (Text, pack) import Data.Text (Text, pack)
import GHC.Generics (Generic)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Prelude
import Servant import Servant
import Test.QuickCheck.Arbitrary
import Test.QuickCheck (elements) import Test.QuickCheck (elements)
-- import Control.Applicative ((<*>)) import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- TODO-ACCESS: CanCount -- TODO-ACCESS: CanCount
......
-- | {-|
Module : Gargantext.API.Dev
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
-- Use only for dev/repl -- Use only for dev/repl
module Gargantext.API.Dev where module Gargantext.API.Dev where
...@@ -17,7 +26,6 @@ import Gargantext.Prelude ...@@ -17,7 +26,6 @@ import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig(..), readConfig) import Gargantext.Prelude.Config (GargConfig(..), readConfig)
------------------------------------------------------------------- -------------------------------------------------------------------
withDevEnv :: IniPath -> (DevEnv -> IO a) -> IO a withDevEnv :: IniPath -> (DevEnv -> IO a) -> IO a
withDevEnv iniPath k = do withDevEnv iniPath k = do
env <- newDevEnv env <- newDevEnv
...@@ -38,7 +46,6 @@ withDevEnv iniPath k = do ...@@ -38,7 +46,6 @@ withDevEnv iniPath k = do
} }
-- | Run Cmd Sugar for the Repl (GHCI) -- | Run Cmd Sugar for the Repl (GHCI)
runCmdRepl :: Show err => Cmd'' DevEnv err a -> IO a runCmdRepl :: Show err => Cmd'' DevEnv err a -> IO a
runCmdRepl f = withDevEnv "gargantext.ini" $ \env -> runCmdDev env f runCmdRepl f = withDevEnv "gargantext.ini" $ \env -> runCmdDev env f
...@@ -62,4 +69,4 @@ runCmdDevServantErr :: DevEnv -> Cmd' DevEnv ServerError a -> IO a ...@@ -62,4 +69,4 @@ runCmdDevServantErr :: DevEnv -> Cmd' DevEnv ServerError a -> IO a
runCmdDevServantErr = runCmdDev runCmdDevServantErr = runCmdDev
runCmdReplEasy :: Cmd'' DevEnv GargError a -> IO a runCmdReplEasy :: Cmd'' DevEnv GargError a -> IO a
runCmdReplEasy f = withDevEnv "gargantext.ini" $ \env -> runCmdDev env f runCmdReplEasy f = withDevEnv "gargantext.ini" $ \env -> runCmdDev env f
\ No newline at end of file
...@@ -514,7 +514,9 @@ getTableNgrams _nType nId tabType listId limit_ offset ...@@ -514,7 +514,9 @@ getTableNgrams _nType nId tabType listId limit_ offset
filteredNodes :: Map NgramsTerm NgramsElement -> [NgramsElement] filteredNodes :: Map NgramsTerm NgramsElement -> [NgramsElement]
filteredNodes tableMap = rootOf <$> list & filter selected_node filteredNodes tableMap = rootOf <$> list & filter selected_node
where where
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. at r)) rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root")
(tableMap ^. at r)
)
(ne ^. ne_root) (ne ^. ne_root)
list = tableMap ^.. each list = tableMap ^.. each
...@@ -523,7 +525,9 @@ getTableNgrams _nType nId tabType listId limit_ offset ...@@ -523,7 +525,9 @@ getTableNgrams _nType nId tabType listId limit_ offset
selectAndPaginate tableMap = roots <> inners selectAndPaginate tableMap = roots <> inners
where where
list = tableMap ^.. each list = tableMap ^.. each
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. at r)) rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root")
(tableMap ^. at r)
)
(ne ^. ne_root) (ne ^. ne_root)
selected_nodes = list & take limit_ selected_nodes = list & take limit_
. drop offset' . drop offset'
...@@ -777,3 +781,7 @@ listNgramsChangedSince listId ngramsType version ...@@ -777,3 +781,7 @@ listNgramsChangedSince listId ngramsType version
Versioned <$> currentVersion <*> pure True Versioned <$> currentVersion <*> pure True
| otherwise = | otherwise =
tableNgramsPull listId ngramsType version & mapped . v_data %~ (== mempty) tableNgramsPull listId ngramsType version & mapped . v_data %~ (== mempty)
...@@ -15,6 +15,7 @@ Portability : POSIX ...@@ -15,6 +15,7 @@ Portability : POSIX
module Gargantext.API.Ngrams.List module Gargantext.API.Ngrams.List
where where
import Data.Maybe (catMaybes)
import Control.Lens hiding (elements) import Control.Lens hiding (elements)
import Data.Aeson import Data.Aeson
import Data.Map (toList, fromList) import Data.Map (toList, fromList)
...@@ -22,21 +23,27 @@ import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema) ...@@ -22,21 +23,27 @@ import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
import Data.Text (Text, concat, pack) import Data.Text (Text, concat, pack)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams) import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams, NgramsTerm)
import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList) import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
import Gargantext.API.Node.Corpus.New.File (FileType(..)) import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Prelude (GargServer) import Gargantext.API.Prelude (GargServer, GargNoServer)
import Gargantext.Core.Utils.Prefix (unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Node
import Gargantext.Database.Admin.Types.Node import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Schema.Ngrams (ngramsTypes) import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Schema.Ngrams (ngramsTypes, NgramsType(..))
import Gargantext.Database.Query.Table.Node (getDocumentsWithParentId)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Network.HTTP.Media ((//), (/:)) import Network.HTTP.Media ((//), (/:))
import Servant import Servant
import Servant.Job.Async import Servant.Job.Async
import Servant.Job.Utils (jsonOptions) import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded (FromForm) import Web.FormUrlEncoded (FromForm)
------------------------------------------------------------------------ import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
------------------------------------------------------------------------ ------------------------------------------------------------------------
type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList) type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
...@@ -83,6 +90,41 @@ post l m = do ...@@ -83,6 +90,41 @@ post l m = do
-- TODO reindex -- TODO reindex
pure True pure True
-----------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
reIndexWith :: CorpusId
-> ListId
-> NgramsType
-> [NgramsTerm]
-> GargNoServer ()
reIndexWith cId lId nt ts = do
docs <- getDocumentsWithParentId cId
-- Taking the ngrams with 0 occurrences only (orphans)
orphans <- map (\k -> ([unNgramsTerm k], []))
<$> HashMap.keys
<$> HashMap.filter (==0)
<$> getOccByNgramsOnlyFast' cId lId nt ts
-- Checking Text documents where orphans match
let
docMatched =
map (\doc -> ( doc ^. node_id
, termsInText (buildPatterns orphans)
( Text.unlines
$ catMaybes
[ doc ^. node_hyperdata . hd_title
, doc ^. node_hyperdata . hd_abstract
]
)
)
) docs
-- Saving the indexation in database
pure ()
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
type PostAPI = Summary "Update List" type PostAPI = Summary "Update List"
......
...@@ -23,12 +23,12 @@ import GHC.IO (FilePath) ...@@ -23,12 +23,12 @@ import GHC.IO (FilePath)
import Gargantext.Core.Types (CorpusId) import Gargantext.Core.Types (CorpusId)
{- {-
____ _ _ ____ _____ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_ / ___| __ _ _ __ __ _ __ _ _ _|_ _|____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __| | | _ / _` | '__/ _` |/ _` | '_ \| |/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_ | |_| | (_| | | | (_| | (_| | | | | | __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__| \____|\__,_|_| \__, |\__,_|_| |_|_|\___/_/\_\\__|
|___/ |___/
-} -}
......
...@@ -17,14 +17,14 @@ module Gargantext.Core.Text.Terms.WithList where ...@@ -17,14 +17,14 @@ module Gargantext.Core.Text.Terms.WithList where
import Data.List (null) import Data.List (null)
import Data.Ord import Data.Ord
import Data.Text (Text, concat) import Data.Text (Text, concat, unwords)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Prelude (error) import Prelude (error)
import qualified Data.Algorithms.KMP as KMP import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Pattern = Pattern data Pattern = Pattern
...@@ -67,6 +67,19 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern ...@@ -67,6 +67,19 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
Pattern (KMP.build alt) (length alt) label Pattern (KMP.build alt) (length alt) label
--(Terms label $ Set.empty) -- TODO check stems --(Terms label $ Set.empty) -- TODO check stems
--------------------------------------------------------------------------
-- Utils
type BlockText = Text
type MatchedText = Text
termsInText :: Patterns -> BlockText -> [MatchedText]
termsInText pats txt = List.nub
$ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text] extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
...@@ -78,7 +91,9 @@ extractTermsWithList' :: Patterns -> Text -> [Text] ...@@ -78,7 +91,9 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats) extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
. monoTextsBySentence . monoTextsBySentence
--------------------------------------------------------------------------
{- | Not used
filterWith :: TermList filterWith :: TermList
-> (a -> Text) -> (a -> Text)
-> [a] -> [a]
...@@ -96,4 +111,4 @@ filterWith' termList f f' xs = f' xs ...@@ -96,4 +111,4 @@ filterWith' termList f f' xs = f' xs
$ map f xs $ map f xs
where where
pats = buildPatterns termList pats = buildPatterns termList
-}
{-|
Module : Gargantext.Database.Action.Index
Description : Indexation tools
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Main Database functions for Gargantext.API.Node.Update
UpdateNodeParamsTexts { methodTexts :: Granularity }
data Granularity = NewNgrams | NewTexts | Both
deriving (Generic, Eq, Ord, Enum, Bounded)
-- TODO add option for type of ngrams
-}
module Gargantext.Database.Action.Index
where
import Data.List (nub)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, filterTerms, termsInText)
index :: CorpusId -> Granularity -> Cmd err [Int]
index cId NewNgrams = do
ngrams <- get ngrams with zero count
texts <- get all text to index
indexSave text (buildPatterns ngrams)
index cId NewTexts = do
ngrams <- get all ngrams
texts <- get text with zero count
indexSave text (buildPatterns ngrams)
index cId Both = do
r1 <- index cId NewNgrams
r2 <- index cId NewTexts
pure $ r1 <> r2
indexSave :: [Document] -> Pattern -> Cmd err [Int]
indexSave corpus p = do
indexedDoc <- map (filterTerms patterns) corpus
saveIndexDoc ngramsTextId
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment