Commit 80fbde18 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FACTORING] G.Text.Terms.

parent 269eba92
Pipeline #828 failed with stage
......@@ -20,39 +20,17 @@ module Gargantext.Core.Flow.Types where
import Control.Lens (Lens')
import Data.Map (Map)
import Data.Maybe (Maybe)
import Data.Text (Text)
import Gargantext.Core (Lang)
import Gargantext.Text (HasText(..))
import Gargantext.Core.Types.Main (HashId)
import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..))
import Gargantext.Database.Action.Query.Node.Document.Insert (AddUniqId, InsertDb)
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..))
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams (Ngrams, NgramsType)
import Gargantext.Prelude
import Gargantext.Text.Terms (TermType)
type FlowCorpus a = ( AddUniqId a
, UniqId a
, InsertDb a
, ExtractNgramsT a
, HasText a
)
class UniqId a
where
uniqId :: Lens' a (Maybe HashId)
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (Map Ngrams (Map NgramsType Int))
class HasText h
where
hasText :: h -> [Text]
instance UniqId HyperdataDocument
where
uniqId = hyperdataDocument_uniqId
......@@ -60,3 +38,18 @@ instance UniqId HyperdataDocument
instance UniqId HyperdataContact
where
uniqId = hc_uniqId
data DocumentIdWithNgrams a = DocumentIdWithNgrams
{ documentWithId :: !(DocumentWithId a)
, document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show)
data DocumentWithId a = DocumentWithId
{ documentId :: !NodeId
, documentData :: !a
} deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
......@@ -58,7 +58,6 @@ import Data.Swagger
import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import Debug.Trace (trace)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
......@@ -82,18 +81,15 @@ import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNod
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Text
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Text.Terms
import GHC.Generics (Generic)
import Prelude (String)
import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.API as API
......@@ -272,9 +268,9 @@ insertMasterDocs c lang hs = do
-- insertDocNgrams
_return <- insertNodeNodeNgrams2
$ catMaybes [ NodeNodeNgrams2 <$> Just nId
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
| (terms, mapNgramsTypes) <- Map.toList maps
| (terms'', mapNgramsTypes) <- Map.toList maps
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
, (nId, w) <- Map.toList mapNodeIdWeight
]
......@@ -287,22 +283,8 @@ insertMasterDocs c lang hs = do
pure ids'
withLang :: HasText a
=> TermType Lang
-> [DocumentWithId a]
-> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
where
m' = case m of
Nothing -> trace ("buildTries here" :: String)
$ Just
$ buildTries n ( fmap toToken $ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
)
just_m -> just_m
withLang l _ = l
------------------------------------------------------------------------
------------------------------------------------------------------------
......@@ -335,6 +317,24 @@ instance HasText HyperdataContact
where
hasText = undefined
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (Map Ngrams (Map NgramsType Int)))
-> [DocumentWithId a]
-> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ documentData d
pure $ DocumentIdWithNgrams d e
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
where
extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
......@@ -387,23 +387,4 @@ instance ExtractNgramsT HyperdataDocument
<> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
False -> (Ngrams (Text.take s' t) n , y)
documentIdWithNgrams :: HasNodeError err
=> (a
-> Cmd err (Map Ngrams (Map NgramsType Int)))
-> [DocumentWithId a]
-> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ documentData d
pure $ DocumentIdWithNgrams d e
......@@ -30,6 +30,7 @@ import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams)
import Gargantext.Core.Types.Main (ListType(CandidateTerm))
import Gargantext.Core.Flow.Types
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
......
......@@ -24,14 +24,13 @@ Portability : POSIX
module Gargantext.Database.Action.Flow.Types
where
import Data.Map (Map)
import Gargantext.Prelude
import Gargantext.Core.Flow.Types
import Gargantext.Text
import Gargantext.Text.Terms
import Gargantext.API.Ngrams (HasRepoVar, RepoCmdM)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Admin.Types.Errors (HasNodeError)
import Gargantext.Database.Admin.Utils (CmdM)
import Gargantext.Database.Action.Query.Node.Document.Insert
type FlowCmdM env err m =
( CmdM env err m
......@@ -40,18 +39,10 @@ type FlowCmdM env err m =
, HasRepoVar env
)
data DocumentIdWithNgrams a = DocumentIdWithNgrams
{ documentWithId :: !(DocumentWithId a)
, document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show)
data DocumentWithId a = DocumentWithId
{ documentId :: !NodeId
, documentData :: !a
} deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
type FlowCorpus a = ( AddUniqId a
, UniqId a
, InsertDb a
, ExtractNgramsT a
, HasText a
)
......@@ -24,16 +24,13 @@ import NLP.FullStop (segment)
import qualified Data.Text as DT
-----------------------------------------------------------------
-- | Why not use data ?
data Niveau = NiveauTexte Texte
| NiveauParagraphe Paragraphe
| NiveauPhrase Phrase
| NiveauMultiTerme MultiTerme
| NiveauMot Mot
| NiveauLettre Lettre
deriving (Show)
-- | Why use newtype ?
class HasText h
where
hasText :: h -> [Text]
-----------------------------------------------------------------
-- French words to distinguish contexts
newtype Texte = Texte Text
newtype Paragraphe = Paragraphe Text
newtype Phrase = Phrase Text
......@@ -43,6 +40,7 @@ newtype Lettre = Lettre Text
-- | Type syn seems obvious
type Titre = Phrase
-----------------------------------------------------------------
instance Show Texte where
......@@ -85,14 +83,6 @@ instance Collage MultiTerme Mot where
dec (MultiTerme mt) = map Mot $ DT.words mt
inc = MultiTerme . DT.intercalate " " . map (\(Mot m) -> m)
-- | We could use Type Classes but we lose the Sum Type classification
toMultiTerme :: Niveau -> [MultiTerme]
toMultiTerme (NiveauTexte (Texte _t)) = undefined
toMultiTerme (NiveauPhrase p) = dec p
toMultiTerme (NiveauMultiTerme mt) = [mt]
toMultiTerme (NiveauMot _m) = undefined
toMultiTerme _ = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
......
......@@ -32,21 +32,29 @@ compute graph
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ConstrainedClassMethods #-}
module Gargantext.Text.Terms
where
import Control.Lens
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Text (Text)
import Data.Traversable
import GHC.Base (String)
import GHC.Generics (Generic)
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Core.Flow.Types
import Gargantext.Prelude
import Gargantext.Text (sentences)
import Gargantext.Text (sentences, HasText(..))
import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Text.Terms.Multi (multiterms)
......@@ -55,13 +63,13 @@ import qualified Data.Set as Set
import qualified Data.Text as Text
data TermType lang
= Mono { _tt_lang :: lang }
| Multi { _tt_lang :: lang }
| MonoMulti { _tt_lang :: lang }
| Unsupervised { _tt_lang :: lang
, _tt_windowSize :: Int
, _tt_ngramsSize :: Int
, _tt_model :: Maybe (Tries Token ())
= Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang }
| MonoMulti { _tt_lang :: !lang }
| Unsupervised { _tt_lang :: !lang
, _tt_windowSize :: !Int
, _tt_ngramsSize :: !Int
, _tt_model :: !(Maybe (Tries Token ()))
}
deriving Generic
......@@ -84,7 +92,45 @@ extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------
withLang :: HasText a
=> TermType Lang
-> [DocumentWithId a]
-> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
where
m' = case m of
Nothing -> -- trace ("buildTries here" :: String)
Just $ buildTries n ( fmap toToken
$ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
)
just_m -> just_m
withLang l _ = l
------------------------------------------------------------------------
class ExtractNgramsT h
where
extractNgramsT :: HasText h
=> TermType Lang
-> h
-> Cmd err (Map Ngrams (Map NgramsType Int))
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
False -> (Ngrams (Text.take s' t) n , y)
-- =======================================================
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment