Commit 9f5df649 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Indexation function to test

parent 05aa3d7e
Pipeline #1454 canceled with stage
......@@ -15,34 +15,43 @@ Portability : POSIX
module Gargantext.API.Ngrams.List
where
import Data.Maybe (catMaybes)
import Control.Lens hiding (elements)
import Control.Lens hiding (elements, Indexed)
import Data.Aeson
import Data.HashMap.Strict (HashMap)
import Data.Map (toList, fromList)
import Data.Maybe (catMaybes)
import Data.Set (Set)
import Data.Swagger (ToSchema, declareNamedSchema, genericDeclareNamedSchema)
import Data.Text (Text, concat, pack)
import GHC.Generics (Generic)
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams, NgramsTerm)
import Gargantext.API.Ngrams (getNgramsTableMap, setListNgrams)
import Gargantext.API.Ngrams.Tools (getTermsWith)
import Gargantext.API.Ngrams.Types (RepoCmdM, Versioned(..), NgramsList, NgramsTerm(..))
import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Prelude (GargServer, GargNoServer)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
import Gargantext.Database.Schema.Node
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Schema.Ngrams (ngramsTypes, NgramsType(..))
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
import Gargantext.Database.Query.Table.Node (getDocumentsWithParentId)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node
import Gargantext.Database.Types (Indexed(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Terms.WithList (buildPatterns, termsInText)
import Network.HTTP.Media ((//), (/:))
import Servant
import Servant.Job.Async
import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded (FromForm)
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
------------------------------------------------------------------------
......@@ -64,9 +73,9 @@ get :: RepoCmdM env err m =>
ListId -> m (Headers '[Header "Content-Disposition" Text] NgramsList)
get lId = do
lst <- get' lId
let (NodeId id) = lId
let (NodeId id') = lId
return $ addHeader (concat [ "attachment; filename=GarganText_NgramsList-"
, pack $ show id
, pack $ show id'
, ".json"
]
) lst
......@@ -96,35 +105,58 @@ post l m = do
reIndexWith :: CorpusId
-> ListId
-> NgramsType
-> [NgramsTerm]
-> Set ListType
-> GargNoServer ()
reIndexWith cId lId nt ts = do
docs <- getDocumentsWithParentId cId
reIndexWith cId lId nt lts = do
-- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- printDebug "ts" ts
-- Taking the ngrams with 0 occurrences only (orphans)
orphans <- map (\k -> ([unNgramsTerm k], []))
<$> HashMap.keys
orphans <- HashMap.keys
<$> HashMap.filter (==0)
<$> getOccByNgramsOnlyFast' cId lId nt ts
-- Getting the Id of orphan ngrams
mapTextNgramsId <- insertNgrams (map (text2ngrams . unNgramsTerm) orphans)
printDebug "orphans" orphans
-- Get all documents of the corpus
docs <- getDocumentsWithParentId cId
-- Checking Text documents where orphans match
-- TODO Tests here
let
docMatched =
map (\doc -> ( doc ^. node_id
, termsInText (buildPatterns orphans)
( Text.unlines
$ catMaybes
[ doc ^. node_hyperdata . hd_title
, doc ^. node_hyperdata . hd_abstract
]
ngramsByDoc = List.concat
$ map (\doc -> List.zip
(termsInText (buildPatterns $ map (\k -> ([unNgramsTerm k], [])) orphans)
$ Text.unlines $ catMaybes
[ doc ^. node_hyperdata . hd_title
, doc ^. node_hyperdata . hd_abstract
]
)
)
) docs
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. node_id) 1 )]])
) docs
-- Saving the indexation in database
_ <- insertDocNgrams lId ( HashMap.fromList
$ catMaybes
$ map (\(t,d) -> (,) <$> toIndexedNgrams mapTextNgramsId t
<*> Just d ) ngramsByDoc
)
pure ()
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
where
i = HashMap.lookup t m
n = Just (text2ngrams t)
------------------------------------------------------------------------
------------------------------------------------------------------------
type PostAPI = Summary "Update List"
......
......@@ -26,8 +26,8 @@ import Gargantext.Core.Types (ListType(..), NodeId, ListId)
import Gargantext.Database.Schema.Ngrams (NgramsType)
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
mergeNgramsElement :: NgramsRepoElement -> NgramsRepoElement -> NgramsRepoElement
mergeNgramsElement _neOld neNew = neNew
......@@ -50,7 +50,6 @@ listNgramsFromRepo nodeIds ngramsType repo = ngrams
[ ngramsMap ^. at nodeId . _Just | nodeId <- nodeIds ]
-- TODO-ACCESS: We want to do the security check before entering here.
-- Add a static capability parameter would be nice.
-- Ideally this is the access to `repoVar` which needs to
......@@ -62,12 +61,12 @@ getListNgrams nodeIds ngramsType = listNgramsFromRepo nodeIds ngramsType <$> get
getTermsWith :: (RepoCmdM env err m, Eq a, Hashable a)
=> (NgramsTerm -> a) -> [ListId]
-> NgramsType -> ListType
-> NgramsType -> Set ListType
-> m (HashMap a [a])
getTermsWith f ls ngt lt = HM.fromListWith (<>)
getTermsWith f ls ngt lts = HM.fromListWith (<>)
<$> map toTreeWith
<$> HM.toList
<$> HM.filter (\f' -> fst f' == lt)
<$> HM.filter (\f' -> Set.member (fst f') lts)
<$> mapTermListRoot ls ngt
<$> getRepo
where
......
......@@ -150,7 +150,6 @@ getGroupParams gp _ = pure gp
-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
, CmdM env err m
......
......@@ -144,7 +144,6 @@ extracted2ngrams :: ExtractedNgrams -> Ngrams
extracted2ngrams (SimpleNgrams ng) = ng
extracted2ngrams (EnrichedNgrams ng) = view np_form ng
---------------------------
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams ngs = do
......@@ -216,6 +215,6 @@ text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $ (Text.pack . pure)
<$> ("!?(),;." :: String)
<$> ("!?(),;.:" :: String)
......@@ -41,7 +41,8 @@ import Gargantext.Core.Types
-- import Gargantext.Core.Viz.Phylo.View.Export
-- import Gargantext.Core.Viz.Phylo.View.ViewMaker -- TODO Just Maker is fine
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
import qualified Data.Set as Set
import qualified Data.Text as Text
type MinSizeBranch = Int
......@@ -51,7 +52,7 @@ flowPhylo :: FlowCmdM env err m
flowPhylo cId = do
list <- defaultList cId
termList <- HashMap.toList <$> getTermsWith (Text.words . unNgramsTerm) [list] NgramsTerms MapTerm
termList <- HashMap.toList <$> getTermsWith (Text.words . unNgramsTerm) [list] NgramsTerms (Set.singleton MapTerm)
docs' <- catMaybes
<$> map (\h -> (,) <$> _hd_publication_year h
......
......@@ -236,7 +236,8 @@ flowCorpusUser l user corpusName ctype ids = do
-- printDebug "Node Text Ids:" tId
-- User List Flow
(masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
(masterUserId, _masterRootId, masterCorpusId)
<- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
--let gp = (GroupParams l 2 3 (StopSize 3))
let gp = GroupWithPosTag l CoreNLP HashMap.empty
......
......@@ -37,10 +37,10 @@ docNgrams2nodeNodeNgrams :: CorpusId
docNgrams2nodeNodeNgrams cId (DocNgrams d n nt w) =
NodeNodeNgrams cId d n nt w
data DocNgrams = DocNgrams { dn_doc_id :: DocId
, dn_ngrams_id :: Int
data DocNgrams = DocNgrams { dn_doc_id :: DocId
, dn_ngrams_id :: Int
, dn_ngrams_type :: NgramsTypeId
, dn_weight :: Double
, dn_weight :: Double
}
insertDocNgramsOn :: CorpusId
......
......@@ -20,6 +20,7 @@ data Granularity = NewNgrams | NewTexts | Both
module Gargantext.Database.Action.Index
where
{-
import Data.List (nub)
import Gargantext.Core.Text.Terms.WithList (buildPatterns, filterTerms, termsInText)
......@@ -44,7 +45,7 @@ indexSave :: [Document] -> Pattern -> Cmd err [Int]
indexSave corpus p = do
indexedDoc <- map (filterTerms patterns) corpus
saveIndexDoc ngramsTextId
-}
......
......@@ -101,5 +101,3 @@ queryInsertNgrams = [sql|
FROM input_rows
JOIN ngrams c USING (terms); -- columns of unique index
|]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment