Commit 9f5df649 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Indexation function to test

parent 05aa3d7e
Pipeline #1454 canceled with stage
...@@ -15,34 +15,43 @@ Portability : POSIX ...@@ -15,34 +15,43 @@ Portability : POSIX
module Gargantext.API.Ngrams.List module Gargantext.API.Ngrams.List
where where
import Data.Maybe (catMaybes) import Control.Lens hiding (elements, Indexed)
import Control.Lens hiding (elements)
import Data.Aeson import Data.Aeson
import Data.HashMap.Strict (HashMap)
import Data.Map (toList, fromList) import Data.Map (toList, fromList)
import Data.Maybe (catMaybes)
import Data.Set (Set)
import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema) import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
import Data.Text (Text, concat, pack) import Data.Text (Text, concat, pack)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams, NgramsTerm) import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..)) import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
import Gargantext.API.Node.Corpus.New.File (FileType(..)) import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Prelude (GargServer, GargNoServer) import Gargantext.API.Prelude (GargServer, GargNoServer)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Utils.Prefix (unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast') import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Node
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Admin.Types.Hyperdata.Document import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Schema.Ngrams (ngramsTypes, NgramsType(..)) import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.Node (getDocumentsWithParentId) import Gargantext.Database.Query.Table.Node (getDocumentsWithParentId)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Network.HTTP.Media ((//), (/:)) import Network.HTTP.Media ((//), (/:))
import Servant import Servant
import Servant.Job.Async import Servant.Job.Async
import Servant.Job.Utils (jsonOptions) import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded (FromForm) import Web.FormUrlEncoded (FromForm)
import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text import qualified Data.Text as Text
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -64,9 +73,9 @@ get :: RepoCmdM env err m => ...@@ -64,9 +73,9 @@ get :: RepoCmdM env err m =>
ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList) ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
get lId = do get lId = do
lst <- get' lId lst <- get' lId
let (NodeId id) = lId let (NodeId id') = lId
return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-" return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
, pack $ show id , pack $ show id'
, ".json" , ".json"
] ]
) lst ) lst
...@@ -96,35 +105,58 @@ post l m = do ...@@ -96,35 +105,58 @@ post l m = do
reIndexWith :: CorpusId reIndexWith :: CorpusId
-> ListId -> ListId
-> NgramsType -> NgramsType
-> [NgramsTerm] -> Set ListType
-> GargNoServer () -> GargNoServer ()
reIndexWith cId lId nt ts = do reIndexWith cId lId nt lts = do
docs <- getDocumentsWithParentId cId -- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- printDebug "ts" ts
-- Taking the ngrams with 0 occurrences only (orphans) -- Taking the ngrams with 0 occurrences only (orphans)
orphans <- map (\k -> ([unNgramsTerm k], [])) orphans <- HashMap.keys
<$> HashMap.keys
<$> HashMap.filter (==0) <$> HashMap.filter (==0)
<$> getOccByNgramsOnlyFast' cId lId nt ts <$> getOccByNgramsOnlyFast' cId lId nt ts
-- Getting the Id of orphan ngrams
mapTextNgramsId <- insertNgrams (map (text2ngrams . unNgramsTerm) orphans)
printDebug "orphans" orphans
-- Get all documents of the corpus
docs <- getDocumentsWithParentId cId
-- Checking Text documents where orphans match -- Checking Text documents where orphans match
-- TODO Tests here
let let
docMatched = ngramsByDoc = List.concat
map (\doc -> ( doc ^. node_id $ map (\doc -> List.zip
, termsInText (buildPatterns orphans) (termsInText (buildPatterns $ map (\k -> ([unNgramsTerm k], [])) orphans)
( Text.unlines $ Text.unlines $ catMaybes
$ catMaybes [ doc ^. node_hyperdata . hd_title
[ doc ^. node_hyperdata . hd_title , doc ^. node_hyperdata . hd_abstract
, doc ^. node_hyperdata . hd_abstract ]
]
) )
) (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
) docs ) docs
-- Saving the indexation in database -- Saving the indexation in database
_ <- insertDocNgrams lId ( HashMap.fromList
$ catMaybes
$ map (\(t,d) -> (,) <$> toIndexedNgrams mapTextNgramsId t
<*> Just d ) ngramsByDoc
)
pure () pure ()
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
where
i = HashMap.lookup t m
n = Just (text2ngrams t)
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
type PostAPI = Summary "Update List" type PostAPI = Summary "Update List"
......
...@@ -26,8 +26,8 @@ import Gargantext.Core.Types (ListType(..), NodeId, ListId) ...@@ -26,8 +26,8 @@ import Gargantext.Core.Types (ListType(..), NodeId, ListId)
import Gargantext.Database.Schema.Ngrams (NgramsType) import Gargantext.Database.Schema.Ngrams (NgramsType)
import Gargantext.Prelude import Gargantext.Prelude
import qualified Data.HashMap.Strict as HM import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as Map import qualified Data.Map.Strict as Map
import qualified Data.Set as Set import qualified Data.Set as Set
mergeNgramsElement :: NgramsRepoElement -> NgramsRepoElement -> NgramsRepoElement mergeNgramsElement :: NgramsRepoElement -> NgramsRepoElement -> NgramsRepoElement
mergeNgramsElement _neOld neNew = neNew mergeNgramsElement _neOld neNew = neNew
...@@ -50,7 +50,6 @@ listNgramsFromRepo nodeIds ngramsType repo = ngrams ...@@ -50,7 +50,6 @@ listNgramsFromRepo nodeIds ngramsType repo = ngrams
[ ngramsMap ^. at nodeId . _Just | nodeId <- nodeIds ] [ ngramsMap ^. at nodeId . _Just | nodeId <- nodeIds ]
-- TODO-ACCESS: We want to do the security check before entering here. -- TODO-ACCESS: We want to do the security check before entering here.
-- Add a static capability parameter would be nice. -- Add a static capability parameter would be nice.
-- Ideally this is the access to `repoVar` which needs to -- Ideally this is the access to `repoVar` which needs to
...@@ -62,12 +61,12 @@ getListNgrams nodeIds ngramsType = listNgramsFromRepo nodeIds ngramsType <$> get ...@@ -62,12 +61,12 @@ getListNgrams nodeIds ngramsType = listNgramsFromRepo nodeIds ngramsType <$> get
getTermsWith :: (RepoCmdM env err m, Eq a, Hashable a) getTermsWith :: (RepoCmdM env err m, Eq a, Hashable a)
=> (NgramsTerm -> a) -> [ListId] => (NgramsTerm -> a) -> [ListId]
-> NgramsType -> ListType -> NgramsType -> Set ListType
-> m (HashMap a [a]) -> m (HashMap a [a])
getTermsWith f ls ngt lt = HM.fromListWith (<>) getTermsWith f ls ngt lts = HM.fromListWith (<>)
<$> map toTreeWith <$> map toTreeWith
<$> HM.toList <$> HM.toList
<$> HM.filter (\f' -> fst f' == lt) <$> HM.filter (\f' -> Set.member (fst f') lts)
<$> mapTermListRoot ls ngt <$> mapTermListRoot ls ngt
<$> getRepo <$> getRepo
where where
......
...@@ -150,7 +150,6 @@ getGroupParams gp _ = pure gp ...@@ -150,7 +150,6 @@ getGroupParams gp _ = pure gp
-- TODO use ListIds -- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err buildNgramsTermsList :: ( HasNodeError err
, CmdM env err m , CmdM env err m
......
...@@ -144,7 +144,6 @@ extracted2ngrams :: ExtractedNgrams -> Ngrams ...@@ -144,7 +144,6 @@ extracted2ngrams :: ExtractedNgrams -> Ngrams
extracted2ngrams (SimpleNgrams ng) = ng extracted2ngrams (SimpleNgrams ng) = ng
extracted2ngrams (EnrichedNgrams ng) = view np_form ng extracted2ngrams (EnrichedNgrams ng) = view np_form ng
--------------------------- ---------------------------
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId) insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams ngs = do insertExtractedNgrams ngs = do
...@@ -216,6 +215,6 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt) ...@@ -216,6 +215,6 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure) isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String) <$> ("!?(),;.:" :: String)
...@@ -41,7 +41,8 @@ import Gargantext.Core.Types ...@@ -41,7 +41,8 @@ import Gargantext.Core.Types
-- import Gargantext.Core.Viz.Phylo.View.Export -- import Gargantext.Core.Viz.Phylo.View.Export
-- import Gargantext.Core.Viz.Phylo.View.ViewMaker -- TODO Just Maker is fine -- import Gargantext.Core.Viz.Phylo.View.ViewMaker -- TODO Just Maker is fine
import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text import qualified Data.Set as Set
import qualified Data.Text as Text
type MinSizeBranch = Int type MinSizeBranch = Int
...@@ -51,7 +52,7 @@ flowPhylo :: FlowCmdM env err m ...@@ -51,7 +52,7 @@ flowPhylo :: FlowCmdM env err m
flowPhylo cId = do flowPhylo cId = do
list <- defaultList cId list <- defaultList cId
termList <- HashMap.toList <$> getTermsWith (Text.words . unNgramsTerm) [list] NgramsTerms MapTerm termList <- HashMap.toList <$> getTermsWith (Text.words . unNgramsTerm) [list] NgramsTerms (Set.singleton MapTerm)
docs' <- catMaybes docs' <- catMaybes
<$> map (\h -> (,) <$> _hd_publication_year h <$> map (\h -> (,) <$> _hd_publication_year h
......
...@@ -236,7 +236,8 @@ flowCorpusUser l user corpusName ctype ids = do ...@@ -236,7 +236,8 @@ flowCorpusUser l user corpusName ctype ids = do
-- printDebug "Node Text Ids:" tId -- printDebug "Node Text Ids:" tId
-- User List Flow -- User List Flow
(masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype (masterUserId, _masterRootId, masterCorpusId)
<- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
--let gp = (GroupParams l 2 3 (StopSize 3)) --let gp = (GroupParams l 2 3 (StopSize 3))
let gp = GroupWithPosTag l CoreNLP HashMap.empty let gp = GroupWithPosTag l CoreNLP HashMap.empty
......
...@@ -37,10 +37,10 @@ docNgrams2nodeNodeNgrams :: CorpusId ...@@ -37,10 +37,10 @@ docNgrams2nodeNodeNgrams :: CorpusId
docNgrams2nodeNodeNgrams cId (DocNgrams d n nt w) = docNgrams2nodeNodeNgrams cId (DocNgrams d n nt w) =
NodeNodeNgrams cId d n nt w NodeNodeNgrams cId d n nt w
data DocNgrams = DocNgrams { dn_doc_id :: DocId data DocNgrams = DocNgrams { dn_doc_id :: DocId
, dn_ngrams_id :: Int , dn_ngrams_id :: Int
, dn_ngrams_type :: NgramsTypeId , dn_ngrams_type :: NgramsTypeId
, dn_weight :: Double , dn_weight :: Double
} }
insertDocNgramsOn :: CorpusId insertDocNgramsOn :: CorpusId
......
...@@ -20,6 +20,7 @@ data Granularity = NewNgrams | NewTexts | Both ...@@ -20,6 +20,7 @@ data Granularity = NewNgrams | NewTexts | Both
module Gargantext.Database.Action.Index module Gargantext.Database.Action.Index
where where
{-
import Data.List (nub) import Data.List (nub)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, filterTerms, termsInText) import Gargantext.Core.Text.Terms.WithList (buildPatterns, filterTerms, termsInText)
...@@ -44,7 +45,7 @@ indexSave :: [Document] -> Pattern -> Cmd err [Int] ...@@ -44,7 +45,7 @@ indexSave :: [Document] -> Pattern -> Cmd err [Int]
indexSave corpus p = do indexSave corpus p = do
indexedDoc <- map (filterTerms patterns) corpus indexedDoc <- map (filterTerms patterns) corpus
saveIndexDoc ngramsTextId saveIndexDoc ngramsTextId
-}
......
...@@ -101,5 +101,3 @@ queryInsertNgrams = [sql| ...@@ -101,5 +101,3 @@ queryInsertNgrams = [sql|
FROM input_rows FROM input_rows
JOIN ngrams c USING (terms); -- columns of unique index JOIN ngrams c USING (terms); -- columns of unique index
|] |]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment