Commit 05aa3d7e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[INDEXING] WIP

parent 44be4e4c
......@@ -103,18 +103,9 @@ getJson path = L.readFile path
-- | Parse | --
---------------
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (a, Text) -> (a, [Text])
filterTerms patterns (y,d) = (y,termsInText patterns d)
where
--------------------------------------
termsInText :: Patterns -> Text -> [Text]
termsInText pats txt = DL.nub
$ DL.concat
$ map (map unwords)
$ extractTermsWithList pats txt
--------------------------------------
-- | To transform a Csv nfile into a readable corpus
......
......@@ -10,8 +10,6 @@ Portability : POSIX
Count API part of Gargantext.
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE DeriveAnyClass #-}
......@@ -19,23 +17,18 @@ Count API part of Gargantext.
module Gargantext.API.Count
where
import GHC.Generics (Generic)
import Data.Aeson hiding (Error)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.List (permutations)
import Data.Swagger
import Data.Text (Text, pack)
import GHC.Generics (Generic)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Prelude
import Servant
import Test.QuickCheck.Arbitrary
import Test.QuickCheck (elements)
-- import Control.Applicative ((<*>))
import Gargantext.Prelude
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Test.QuickCheck.Arbitrary
-----------------------------------------------------------------------
-- TODO-ACCESS: CanCount
......
-- |
{-|
Module : Gargantext.API.Dev
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
-- Use only for dev/repl
module Gargantext.API.Dev where
......@@ -17,7 +26,6 @@ import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig(..), readConfig)
-------------------------------------------------------------------
withDevEnv :: IniPath -> (DevEnv -> IO a) -> IO a
withDevEnv iniPath k = do
env <- newDevEnv
......@@ -38,7 +46,6 @@ withDevEnv iniPath k = do
}
-- | Run Cmd Sugar for the Repl (GHCI)
runCmdRepl :: Show err => Cmd'' DevEnv err a -> IO a
runCmdRepl f = withDevEnv "gargantext.ini" $ \env -> runCmdDev env f
......
......@@ -514,7 +514,9 @@ getTableNgrams _nType nId tabType listId limit_ offset
filteredNodes :: Map NgramsTerm NgramsElement -> [NgramsElement]
filteredNodes tableMap = rootOf <$> list & filter selected_node
where
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. at r))
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root")
(tableMap ^. at r)
)
(ne ^. ne_root)
list = tableMap ^.. each
......@@ -523,7 +525,9 @@ getTableNgrams _nType nId tabType listId limit_ offset
selectAndPaginate tableMap = roots <> inners
where
list = tableMap ^.. each
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. at r))
rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root")
(tableMap ^. at r)
)
(ne ^. ne_root)
selected_nodes = list & take limit_
. drop offset'
......@@ -777,3 +781,7 @@ listNgramsChangedSince listId ngramsType version
Versioned <$> currentVersion <*> pure True
| otherwise =
tableNgramsPull listId ngramsType version & mapped . v_data %~ (== mempty)
......@@ -15,6 +15,7 @@ Portability : POSIX
module Gargantext.API.Ngrams.List
where
import Data.Maybe (catMaybes)
import Control.Lens hiding (elements)
import Data.Aeson
import Data.Map (toList, fromList)
......@@ -22,21 +23,27 @@ import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
import Data.Text (Text, concat, pack)
import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList)
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams, NgramsTerm)
import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Prelude (GargServer)
import Gargantext.API.Prelude (GargServer, GargNoServer)
import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Node
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Schema.Ngrams (ngramsTypes)
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Schema.Ngrams (ngramsTypes, NgramsType(..))
import Gargantext.Database.Query.Table.Node (getDocumentsWithParentId)
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Network.HTTP.Media ((//), (/:))
import Servant
import Servant.Job.Async
import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded (FromForm)
------------------------------------------------------------------------
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
------------------------------------------------------------------------
type API = Get '[JSON, HTML] (Headers '[Header "Content-Disposition" Text] NgramsList)
......@@ -83,6 +90,41 @@ post l m = do
-- TODO reindex
pure True
-----------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
reIndexWith :: CorpusId
-> ListId
-> NgramsType
-> [NgramsTerm]
-> GargNoServer ()
reIndexWith cId lId nt ts = do
docs <- getDocumentsWithParentId cId
-- Taking the ngrams with 0 occurrences only (orphans)
orphans <- map (\k -> ([unNgramsTerm k], []))
<$> HashMap.keys
<$> HashMap.filter (==0)
<$> getOccByNgramsOnlyFast' cId lId nt ts
-- Checking Text documents where orphans match
let
docMatched =
map (\doc -> ( doc ^. node_id
, termsInText (buildPatterns orphans)
( Text.unlines
$ catMaybes
[ doc ^. node_hyperdata . hd_title
, doc ^. node_hyperdata . hd_abstract
]
)
)
) docs
-- Saving the indexation in database
pure ()
------------------------------------------------------------------------
------------------------------------------------------------------------
type PostAPI = Summary "Update List"
......
......@@ -23,11 +23,11 @@ import GHC.IO (FilePath)
import Gargantext.Core.Types (CorpusId)
{-
____ _ _
/ ___| __ _ _ __ __ _ __ _ _ __ | |_ _____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| __/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | || __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|\__\___/_/\_\\__|
____ _____ _
/ ___| __ _ _ __ __ _ __ _ _ _|_ _|____ _| |_
| | _ / _` | '__/ _` |/ _` | '_ \| |/ _ \ \/ / __|
| |_| | (_| | | | (_| | (_| | | | | | __/> <| |_
\____|\__,_|_| \__, |\__,_|_| |_|_|\___/_/\_\\__|
|___/
-}
......
......@@ -17,14 +17,14 @@ module Gargantext.Core.Text.Terms.WithList where
import Data.List (null)
import Data.Ord
import Data.Text (Text, concat)
import Data.Text (Text, concat, unwords)
import Gargantext.Prelude
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Prelude (error)
import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List
------------------------------------------------------------------------
data Pattern = Pattern
......@@ -67,6 +67,19 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
Pattern (KMP.build alt) (length alt) label
--(Terms label $ Set.empty) -- TODO check stems
--------------------------------------------------------------------------
-- Utils
type BlockText = Text
type MatchedText = Text
termsInText :: Patterns -> BlockText -> [MatchedText]
termsInText pats txt = List.nub
$ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
......@@ -78,7 +91,9 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
. monoTextsBySentence
--------------------------------------------------------------------------
{- | Not used
filterWith :: TermList
-> (a -> Text)
-> [a]
......@@ -96,4 +111,4 @@ filterWith' termList f f' xs = f' xs
$ map f xs
where
pats = buildPatterns termList
-}
{-|
Module : Gargantext.Database.Action.Index
Description : Indexation tools
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Main Database functions for Gargantext.API.Node.Update
UpdateNodeParamsTexts { methodTexts :: Granularity }
data Granularity = NewNgrams | NewTexts | Both
deriving (Generic, Eq, Ord, Enum, Bounded)
-- TODO add option for type of ngrams
-}
module Gargantext.Database.Action.Index
where
import Data.List (nub)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, filterTerms, termsInText)
index :: CorpusId -> Granularity -> Cmd err [Int]
index cId NewNgrams = do
ngrams <- get ngrams with zero count
texts <- get all text to index
indexSave text (buildPatterns ngrams)
index cId NewTexts = do
ngrams <- get all ngrams
texts <- get text with zero count
indexSave text (buildPatterns ngrams)
index cId Both = do
r1 <- index cId NewNgrams
r2 <- index cId NewTexts
pure $ r1 <> r2
indexSave :: [Document] -> Pattern -> Cmd err [Int]
indexSave corpus p = do
indexedDoc <- map (filterTerms patterns) corpus
saveIndexDoc ngramsTextId
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment