Commit 80fbde18 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FACTORING] G.Text.Terms.

parent 269eba92
Pipeline #828 failed with stage
...@@ -20,39 +20,17 @@ module Gargantext.Core.Flow.Types where ...@@ -20,39 +20,17 @@ module Gargantext.Core.Flow.Types where
import Control.Lens (Lens') import Control.Lens (Lens')
import Data.Map (Map) import Data.Map (Map)
import Data.Maybe (Maybe) import Data.Maybe (Maybe)
import Data.Text (Text) import Gargantext.Text (HasText(..))
import Gargantext.Core (Lang)
import Gargantext.Core.Types.Main (HashId) import Gargantext.Core.Types.Main (HashId)
import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..)) import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..))
import Gargantext.Database.Action.Query.Node.Document.Insert (AddUniqId, InsertDb)
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..))
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams (Ngrams, NgramsType) import Gargantext.Database.Schema.Ngrams (Ngrams, NgramsType)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Terms (TermType)
type FlowCorpus a = ( AddUniqId a
, UniqId a
, InsertDb a
, ExtractNgramsT a
, HasText a
)
class UniqId a class UniqId a
where where
uniqId :: Lens' a (Maybe HashId) uniqId :: Lens' a (Maybe HashId)
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (Map Ngrams (Map NgramsType Int))
class HasText h
where
hasText :: h -> [Text]
instance UniqId HyperdataDocument instance UniqId HyperdataDocument
where where
uniqId = hyperdataDocument_uniqId uniqId = hyperdataDocument_uniqId
...@@ -60,3 +38,18 @@ instance UniqId HyperdataDocument ...@@ -60,3 +38,18 @@ instance UniqId HyperdataDocument
instance UniqId HyperdataContact instance UniqId HyperdataContact
where where
uniqId = hc_uniqId uniqId = hc_uniqId
data DocumentIdWithNgrams a = DocumentIdWithNgrams
{ documentWithId :: !(DocumentWithId a)
, document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show)
data DocumentWithId a = DocumentWithId
{ documentId :: !NodeId
, documentData :: !a
} deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
...@@ -58,7 +58,6 @@ import Data.Swagger ...@@ -58,7 +58,6 @@ import Data.Swagger
import Data.Text (splitOn, intercalate) import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse) import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second) import Data.Tuple.Extra (first, second)
import Debug.Trace (trace)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..)) import Gargantext.Core.Types (Terms(..))
...@@ -82,18 +81,15 @@ import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNod ...@@ -82,18 +81,15 @@ import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNod
import Gargantext.Ext.IMT (toSchoolName) import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile) import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Text
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat) import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Text.List (buildNgramsLists,StopSize(..)) import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Text.Terms
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Prelude (String)
import System.FilePath (FilePath) import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add) import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.API as API import qualified Gargantext.Text.Corpus.API as API
...@@ -272,9 +268,9 @@ insertMasterDocs c lang hs = do ...@@ -272,9 +268,9 @@ insertMasterDocs c lang hs = do
-- insertDocNgrams -- insertDocNgrams
_return <- insertNodeNodeNgrams2 _return <- insertNodeNodeNgrams2
$ catMaybes [ NodeNodeNgrams2 <$> Just nId $ catMaybes [ NodeNodeNgrams2 <$> Just nId
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms) <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double) <*> Just (fromIntegral w :: Double)
| (terms, mapNgramsTypes) <- Map.toList maps | (terms'', mapNgramsTypes) <- Map.toList maps
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight , (nId, w) <- Map.toList mapNodeIdWeight
] ]
...@@ -287,22 +283,8 @@ insertMasterDocs c lang hs = do ...@@ -287,22 +283,8 @@ insertMasterDocs c lang hs = do
pure ids' pure ids'
withLang :: HasText a ------------------------------------------------------------------------
=> TermType Lang
-> [DocumentWithId a]
-> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
where
m' = case m of
Nothing -> trace ("buildTries here" :: String)
$ Just
$ buildTries n ( fmap toToken $ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
)
just_m -> just_m
withLang l _ = l
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -335,6 +317,24 @@ instance HasText HyperdataContact ...@@ -335,6 +317,24 @@ instance HasText HyperdataContact
where where
hasText = undefined hasText = undefined
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (Map Ngrams (Map NgramsType Int)))
-> [DocumentWithId a]
-> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ documentData d
pure $ DocumentIdWithNgrams d e
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact instance ExtractNgramsT HyperdataContact
where where
extractNgramsT l hc = filterNgramsT 255 <$> extract l hc extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
...@@ -387,23 +387,4 @@ instance ExtractNgramsT HyperdataDocument ...@@ -387,23 +387,4 @@ instance ExtractNgramsT HyperdataDocument
<> [(a', Map.singleton Authors 1) | a' <- authors ] <> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ] <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
False -> (Ngrams (Text.take s' t) n , y)
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (Map Ngrams (Map NgramsType Int)))
-> [DocumentWithId a]
-> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ documentData d
pure $ DocumentIdWithNgrams d e
...@@ -30,6 +30,7 @@ import Data.Maybe (Maybe(..), catMaybes) ...@@ -30,6 +30,7 @@ import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text) import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams) import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams)
import Gargantext.Core.Types.Main (ListType(CandidateTerm)) import Gargantext.Core.Types.Main (ListType(CandidateTerm))
import Gargantext.Core.Flow.Types
import Gargantext.Database.Action.Flow.Types import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId) import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId) import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
......
...@@ -24,14 +24,13 @@ Portability : POSIX ...@@ -24,14 +24,13 @@ Portability : POSIX
module Gargantext.Database.Action.Flow.Types module Gargantext.Database.Action.Flow.Types
where where
import Data.Map (Map)
import Gargantext.Prelude
import Gargantext.Core.Flow.Types import Gargantext.Core.Flow.Types
import Gargantext.Text
import Gargantext.Text.Terms
import Gargantext.API.Ngrams (HasRepoVar, RepoCmdM) import Gargantext.API.Ngrams (HasRepoVar, RepoCmdM)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Admin.Types.Errors (HasNodeError) import Gargantext.Database.Admin.Types.Errors (HasNodeError)
import Gargantext.Database.Admin.Utils (CmdM) import Gargantext.Database.Admin.Utils (CmdM)
import Gargantext.Database.Action.Query.Node.Document.Insert
type FlowCmdM env err m = type FlowCmdM env err m =
( CmdM env err m ( CmdM env err m
...@@ -40,18 +39,10 @@ type FlowCmdM env err m = ...@@ -40,18 +39,10 @@ type FlowCmdM env err m =
, HasRepoVar env , HasRepoVar env
) )
data DocumentIdWithNgrams a = DocumentIdWithNgrams type FlowCorpus a = ( AddUniqId a
{ documentWithId :: !(DocumentWithId a) , UniqId a
, document_ngrams :: !(Map Ngrams (Map NgramsType Int)) , InsertDb a
} deriving (Show) , ExtractNgramsT a
, HasText a
data DocumentWithId a = DocumentWithId )
{ documentId :: !NodeId
, documentData :: !a
} deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
...@@ -24,16 +24,13 @@ import NLP.FullStop (segment) ...@@ -24,16 +24,13 @@ import NLP.FullStop (segment)
import qualified Data.Text as DT import qualified Data.Text as DT
----------------------------------------------------------------- -----------------------------------------------------------------
-- | Why not use data ?
data Niveau = NiveauTexte Texte class HasText h
| NiveauParagraphe Paragraphe where
| NiveauPhrase Phrase hasText :: h -> [Text]
| NiveauMultiTerme MultiTerme
| NiveauMot Mot -----------------------------------------------------------------
| NiveauLettre Lettre -- French words to distinguish contexts
deriving (Show)
-- | Why use newtype ?
newtype Texte = Texte Text newtype Texte = Texte Text
newtype Paragraphe = Paragraphe Text newtype Paragraphe = Paragraphe Text
newtype Phrase = Phrase Text newtype Phrase = Phrase Text
...@@ -43,6 +40,7 @@ newtype Lettre = Lettre Text ...@@ -43,6 +40,7 @@ newtype Lettre = Lettre Text
-- | Type syn seems obvious -- | Type syn seems obvious
type Titre = Phrase type Titre = Phrase
----------------------------------------------------------------- -----------------------------------------------------------------
instance Show Texte where instance Show Texte where
...@@ -85,14 +83,6 @@ instance Collage MultiTerme Mot where ...@@ -85,14 +83,6 @@ instance Collage MultiTerme Mot where
dec (MultiTerme mt) = map Mot $ DT.words mt dec (MultiTerme mt) = map Mot $ DT.words mt
inc = MultiTerme . DT.intercalate " " . map (\(Mot m) -> m) inc = MultiTerme . DT.intercalate " " . map (\(Mot m) -> m)
-- | We could use Type Classes but we lose the Sum Type classification
toMultiTerme :: Niveau -> [MultiTerme]
toMultiTerme (NiveauTexte (Texte _t)) = undefined
toMultiTerme (NiveauPhrase p) = dec p
toMultiTerme (NiveauMultiTerme mt) = [mt]
toMultiTerme (NiveauMot _m) = undefined
toMultiTerme _ = undefined
------------------------------------------------------------------- -------------------------------------------------------------------
-- Contexts of text -- Contexts of text
sentences :: Text -> [Text] sentences :: Text -> [Text]
......
...@@ -32,21 +32,29 @@ compute graph ...@@ -32,21 +32,29 @@ compute graph
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ConstrainedClassMethods #-}
module Gargantext.Text.Terms module Gargantext.Text.Terms
where where
import Control.Lens import Control.Lens
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable import Data.Traversable
import GHC.Base (String) import GHC.Base (String)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Flow.Types
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text (sentences) import Gargantext.Text (sentences, HasText(..))
import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken) import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
import Gargantext.Text.Terms.Mono (monoTerms) import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Text.Terms.Mono.Stem (stem) import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
...@@ -55,13 +63,13 @@ import qualified Data.Set as Set ...@@ -55,13 +63,13 @@ import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
data TermType lang data TermType lang
= Mono { _tt_lang :: lang } = Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: lang } | Multi { _tt_lang :: !lang }
| MonoMulti { _tt_lang :: lang } | MonoMulti { _tt_lang :: !lang }
| Unsupervised { _tt_lang :: lang | Unsupervised { _tt_lang :: !lang
, _tt_windowSize :: Int , _tt_windowSize :: !Int
, _tt_ngramsSize :: Int , _tt_ngramsSize :: !Int
, _tt_model :: Maybe (Tries Token ()) , _tt_model :: !(Maybe (Tries Token ()))
} }
deriving Generic deriving Generic
...@@ -84,7 +92,45 @@ extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m ...@@ -84,7 +92,45 @@ extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------
withLang :: HasText a
=> TermType Lang
-> [DocumentWithId a]
-> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
where
m' = case m of
Nothing -> -- trace ("buildTries here" :: String)
Just $ buildTries n ( fmap toToken
$ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
)
just_m -> just_m
withLang l _ = l
------------------------------------------------------------------------ ------------------------------------------------------------------------
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (Map Ngrams (Map NgramsType Int))
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
False -> (Ngrams (Text.take s' t) n , y)
-- =======================================================
-- | Terms from Text -- | Terms from Text
-- Mono : mono terms -- Mono : mono terms
-- Multi : multi terms -- Multi : multi terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment