[ngrams] rewrite NgramsElement type to use NgramsRepoElement

Also, Tokenize.hs added for future endpoint for tokenization
parent 22d25fbe
......@@ -275,6 +275,7 @@ library
Gargantext.Core.Text.Terms.Multi.Group
Gargantext.Core.Text.Terms.Multi.CoreNLP
Gargantext.Core.Text.Terms.Multi.PosTagging.Types
Gargantext.Core.Text.Terms.Tokenize
Gargantext.Core.Text.Upload
Gargantext.Core.Types.Search
Gargantext.Core.Utils.DateUtils
......
-- |
{-|
Module : Gargantext.API.Admin.EnvTypes
Description : Env
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
......@@ -106,10 +115,11 @@ data GargJob
| RecomputeGraphJob
deriving (Show, Eq, Ord, Enum, Bounded)
-- Do /not/ treat the data types of this type as strict, because it's convenient
-- to be able to partially initialise things like an 'Env' during tests, without
-- having to specify /everything/. This means that when we /construct/ an 'Env',
-- we need to remember to force the fields to WHNF at that point.
-- | Do /not/ treat the data types of this type as strict, because
-- it's convenient to be able to partially initialise things like an
-- 'Env' during tests, without having to specify /everything/. This
-- means that when we /construct/ an 'Env', we need to remember to
-- force the fields to WHNF at that point.
data Env = Env
{ _env_settings :: ~Settings
, _env_logger :: ~(Logger (GargM Env BackendInternalError))
......
......@@ -547,13 +547,13 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
-- /CAREFUL/: The root we select might /not/ have the same 'listType' we are
-- filtering for, in which case we have to change its type to match, if needed.
rootOf :: Map NgramsTerm NgramsElement -> NgramsElement -> NgramsElement
rootOf tblMap ne = case ne ^. ne_root of
rootOf tblMap ne = case ne ^. ne_nre . nre_root of
Nothing -> ne
Just rootKey
| Just r <- tblMap ^. at rootKey
-- NOTE(adinapoli) It's unclear what is the correct behaviour here: should
-- we override the type or we filter out the node altogether?
-> over ne_list (\oldList -> fromMaybe oldList _nsq_listType) r
-> over (ne_nre . nre_list) (\oldList -> fromMaybe oldList _nsq_listType) r
| otherwise
-> ne
......@@ -561,7 +561,7 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
-- mandated by 'NgramsSearchQuery'.
matchingNode :: NgramsElement -> Bool
matchingNode inputNode =
let nodeSize = inputNode ^. ne_size
let nodeSize = inputNode ^. ne_nre . nre_size
matchesListType = maybe (const True) (==) _nsq_listType
respectsMinSize = maybe (const True) (<=) (getMinSize <$> _nsq_minSize)
respectsMaxSize = maybe (const True) (>=) (getMaxSize <$> _nsq_maxSize)
......@@ -569,7 +569,7 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
in respectsMinSize nodeSize
&& respectsMaxSize nodeSize
&& _nsq_searchQuery (inputNode ^. ne_ngrams)
&& matchesListType (inputNode ^. ne_list)
&& matchesListType (inputNode ^. ne_nre . nre_list)
sortOnOrder :: Maybe OrderBy -> ([NgramsElement] -> [NgramsElement])
sortOnOrder Nothing = sortOnOrder (Just ScoreDesc)
......@@ -593,7 +593,7 @@ searchTableNgrams versionedTableMap NgramsSearchQuery{..} =
where
addSubitemsOccurrences :: NgramsElement -> NgramsElement
addSubitemsOccurrences e =
e { _ne_occurrences = foldl' alterOccurrences (e ^. ne_occurrences) (e ^. ne_children) }
e { _ne_occurrences = foldl' alterOccurrences (e ^. ne_occurrences) (e ^. ne_nre . nre_children) }
alterOccurrences :: Set ContextId -> NgramsTerm -> Set ContextId
alterOccurrences occs t = case Map.lookup t tblMap of
......
......@@ -171,12 +171,8 @@ instance Serialise (MSet NgramsTerm)
data NgramsElement =
NgramsElement { _ne_ngrams :: NgramsTerm
, _ne_size :: Int
, _ne_list :: ListType
, _ne_occurrences :: Set ContextId
, _ne_root :: Maybe NgramsTerm
, _ne_parent :: Maybe NgramsTerm
, _ne_children :: MSet NgramsTerm
, _ne_nre :: NgramsRepoElement
}
deriving (Ord, Eq, Show, Generic)
......@@ -189,7 +185,15 @@ mkNgramsElement :: NgramsTerm
-> MSet NgramsTerm
-> NgramsElement
mkNgramsElement ngrams list' rp children =
NgramsElement ngrams (size (unNgramsTerm ngrams)) list' mempty (_rp_root <$> rp) (_rp_parent <$> rp) children
NgramsElement { _ne_ngrams = ngrams
, _ne_occurrences = mempty
, _ne_nre = nre }
where
nre = NgramsRepoElement { _nre_size = size $ unNgramsTerm ngrams
, _nre_list = list'
, _nre_root = _rp_root <$> rp
, _nre_parent = _rp_parent <$> rp
, _nre_children = children }
newNgramsElement :: Maybe ListType -> NgramsTerm -> NgramsElement
newNgramsElement mayList ngrams =
......@@ -587,36 +591,11 @@ instance Applicable NgramsTablePatch (Maybe NgramsTableMap) where
ngramsElementToRepo :: NgramsElement -> NgramsRepoElement
ngramsElementToRepo
(NgramsElement { _ne_size = s
, _ne_list = l
, _ne_root = r
, _ne_parent = p
, _ne_children = c
}) =
NgramsRepoElement
{ _nre_size = s
, _nre_list = l
, _nre_parent = p
, _nre_root = r
, _nre_children = c
}
ngramsElementToRepo (NgramsElement { _ne_nre = nre }) = nre
ngramsElementFromRepo :: NgramsTerm -> NgramsRepoElement -> NgramsElement
ngramsElementFromRepo
ngrams
(NgramsRepoElement
{ _nre_size = s
, _nre_list = l
, _nre_parent = p
, _nre_root = r
, _nre_children = c
}) =
NgramsElement { _ne_size = s
, _ne_list = l
, _ne_root = r
, _ne_parent = p
, _ne_children = c
ngramsElementFromRepo ngrams nre =
NgramsElement { _ne_nre = nre
, _ne_ngrams = ngrams
, _ne_occurrences = mempty -- panic $ "API.Ngrams.Types._ne_occurrences"
{-
......
{-|
Module : Gargantext.Core.Text.Terms.Tokenize
Description : String tokenization
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Terms.Tokenize
where
import Control.Lens (view)
import Control.Monad.Trans.Control (MonadBaseControl)
import Gargantext.Core (Lang)
import Gargantext.Core.NLP (nlpServerGet, HasNLPServer)
import Gargantext.Core.Text.Terms.Multi (tokenTags)
import Gargantext.Core.Types (TokenTag(..))
import Gargantext.Prelude
tokenize :: ( HasNLPServer env
, MonadReader env m
, MonadBaseControl IO m) => Lang -> Text -> m [TokenTag]
tokenize lang txt = do
nlp <- view (nlpServerGet lang)
liftBase $ concat <$> tokenTags nlp lang txt
......@@ -157,6 +157,7 @@ data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_offset_begin :: Int
, _my_token_offset_end :: Int
} deriving (Show)
$(deriveJSON (unPrefix "_my_token_") ''TokenTag)
-- | NOTE: Combining 'TokenTag' doesn't make much sense to me. And
-- lemma combining is just wrong. You can't just "cat" <> "woman" to
......
......@@ -125,12 +125,14 @@ toNodeNgramsW l ngs = List.concat $ map (toNodeNgramsW'' l) ngs
, _nng_ngrams_class = Nothing
, _nng_ngrams_weight = 0 } |
(NgramsElement { _ne_ngrams = NgramsTerm ngrams_terms'
, _ne_size = _size
, _ne_list = list_type
, _ne_occurrences = _occ
, _ne_root = _root
, _ne_parent = _parent
, _ne_children = _children
-- , _ne_occurrences = _occ
, _ne_nre = NgramsRepoElement {
_nre_list = list_type
-- , _nre_size = _size
-- , _nre_root = _root
-- , _nre_parent = _parent
-- , _nre_children = _children
}
}
) <- elms
]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment