[BREANKING ngrams] this should fix #471 multi-ngrams terms indexing

However, tests don't pass and I'm not sure if other functionality
doesn't break.
parent bd92fece
Pipeline #7646 failed with stages
in 40 minutes and 5 seconds
......@@ -18,9 +18,9 @@ module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentenc
import Data.List qualified as L
import Data.Set qualified as S
import Data.Text qualified as T
import Gargantext.Core
import Gargantext.Core ( Lang )
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types
import Gargantext.Core.Types ( TermsWithCount, Terms(..) )
import Gargantext.Prelude hiding (words)
import Prelude (String)
--import Data.Char (isAlphaNum, isSpace)
......@@ -43,7 +43,8 @@ monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt)
monoText2term lang txt = Terms { _terms_label = [txt]
, _terms_stem = S.singleton $ stem lang PorterAlgorithm txt }
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words
......
......@@ -11,7 +11,6 @@ Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList (
......@@ -26,6 +25,10 @@ module Gargantext.Core.Text.Terms.WithList (
-- * Properties
, prop_patterns_internal_consistency
-- * For debugging
, ReplaceTerms(..)
, replaceTerms
) where
import Prelude (show)
......@@ -39,7 +42,7 @@ import Data.Text qualified as T
import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Context ( Corpus, TermList, Label, MultiTerm )
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts)
......@@ -58,7 +61,7 @@ data Pattern = Pattern
}
instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")"
show Pattern{..} = "Pattern { _pat_length = " <> Prelude.show _pat_length <> ", _pat_terms = " <> Prelude.show _pat_terms <> "}"
type Patterns = [Pattern]
......@@ -66,25 +69,44 @@ type Patterns = [Pattern]
data ReplaceTerms = KeepAll | LongestOnly
-- | Given a 'ReplaceTerms' strategy, patterns and a split text,
-- return matching terms according to strategy. This function is
-- usually applied to words in the whole sentence (i.e. 'terms'
-- variable contains a tokenized sentence, coming from
-- 'monoTextsBySentence').
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0
replaceTerms rTerms pats terms =
List.concat (
mapMaybe (\(ix, _t) ->
case IntMap.lookup ix m of
Nothing -> Nothing
-- lst :: [(Int, [Text])]
-- snd <$> lst :: [[Text]]
Just lst -> Just (snd <$> lst)) $ zip [0..] terms
)
--replaceTerms rTerms pats terms = go 0
where
terms_len = length terms
go ix | ix >= terms_len = []
| otherwise =
case IntMap.lookup ix m of
Nothing -> go (ix + 1)
Just (len, term) ->
term : go (ix + len)
-- termsLen :: Int
-- termsLen = length terms
-- go :: Int -> [[Text]]
-- go ix | ix >= termsLen = []
-- | otherwise =
-- case IntMap.lookup ix m of
-- Nothing -> go (ix + 1)
-- Just (len, term) ->
-- term : go (ix + len)
m :: IntMap [(Int, [Text])]
m = toMap
[ (ix, (len, term))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ]
toMap = case rplaceTerms of
KeepAll -> IntMap.fromList
LongestOnly -> IntMap.fromListWith merge
[ (ix, (_pat_length, _pat_terms))
| Pattern { .. } <- pats
, ix <- KMP.match _pat_table terms ]
toMap :: [(IntMap.Key, (Int, [Text]))] -> IntMap [(Int, [Text])]
toMap kv = case rTerms of
KeepAll -> IntMap.fromListWith (<>) (second (:[]) <$> kv)
LongestOnly -> IntMap.map (:[]) $ IntMap.fromListWith merge kv
where
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
......@@ -97,7 +119,7 @@ buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
where
buildPattern :: Label -> [MultiTerm] -> [Pattern]
buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts)
buildPattern label alts = mapMaybe (mkPattern label) $ map (filter (/= "")) (label : alts)
mkPattern :: Label -> [Text] -> Maybe Pattern
mkPattern label alt
......@@ -109,7 +131,9 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
-- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label)
Pattern { _pat_table = KMP.build $ map T.toLower alt
, _pat_length = length alt
, _pat_terms = map T.toLower label }
--(Terms label $ Set.empty) -- TODO check stems
......@@ -119,9 +143,7 @@ type MatchedText = Text
termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText lang pats (manipulateText lang -> txt) =
groupWithCounts $ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
groupWithCounts $ List.concatMap (map unwords) (extractTermsWithList pats txt)
-- | Manipulates the input 'Text' before passing it to 'termsInText'.
-- In particular, if the language is Chinese (ZH), we add spaces.
......@@ -135,12 +157,11 @@ extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentenc
-- | Extract terms
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence
extractTermsWithList' pats txt = map concat $ List.concat $ extractTermsWithList pats txt
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = T.unwords . (T.chunksOf 1)
addSpaces = T.intersperse ' '
--------------------------------------------------------------------------
......
......@@ -120,6 +120,14 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
where
--------
-- 2) find the local maxima in the quality distribution
-- TODO (seeg, #471) head throws errors when list is too short.
-- I propose this implementation, but I'm not sure of the length of the list
-- maxima = if List.length qua' > 1 then
-- [snd (List.head qua') > snd (List.head $ List.tail qua')] ++
-- (findMaxima qua') ++
-- [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
-- else
-- [True, True]
maxima :: [Bool]
maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
--------
......
......@@ -11,6 +11,7 @@ Portability : POSIX
module Gargantext.Database.Action.Flow.Utils
( docNgrams
, docNgrams'
, documentIdWithNgrams
, insertDocNgrams
, insertDocs
......@@ -68,21 +69,29 @@ insertDocNgrams lId m = do
-- Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with counts.
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS).
-- | Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with
-- counts. This is a pure function (doesn't use corenlp nor
-- PostgreSQL FTS).
docNgrams :: Lang
-> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument
-> [(MatchedText, TermsCount)]
docNgrams lang ts doc =
(
termsInText lang (buildPatternsWith lang ts)
docNgrams' lang ts
$ T.unlines $ catMaybes
[ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract
]
)
-- | Given language, ngrams type, a list of terms and a text, return
-- ngrams that are in this text, with counts.
docNgrams' :: Lang
-> [NT.NgramsTerm]
-> Text
-> [(MatchedText, TermsCount)]
docNgrams' lang ts txt =
termsInText lang (buildPatternsWith lang ts) txt
documentIdWithNgrams :: HasNodeError err
......@@ -94,7 +103,8 @@ documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e
pure $ DocumentIdWithNgrams { documentWithId = d
, documentNgrams = e }
-- | TODO check optimization
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment