[BREANKING ngrams] this should fix #471 multi-ngrams terms indexing

However, tests don't pass and I'm not sure if other functionality
doesn't break.
parent bd92fece
Pipeline #7646 failed with stages
in 40 minutes and 5 seconds
...@@ -18,9 +18,9 @@ module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentenc ...@@ -18,9 +18,9 @@ module Gargantext.Core.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentenc
import Data.List qualified as L import Data.List qualified as L
import Data.Set qualified as S import Data.Set qualified as S
import Data.Text qualified as T import Data.Text qualified as T
import Gargantext.Core import Gargantext.Core ( Lang )
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..)) import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types import Gargantext.Core.Types ( TermsWithCount, Terms(..) )
import Gargantext.Prelude hiding (words) import Gargantext.Prelude hiding (words)
import Prelude (String) import Prelude (String)
--import Data.Char (isAlphaNum, isSpace) --import Data.Char (isAlphaNum, isSpace)
...@@ -43,7 +43,8 @@ monoTexts = L.concat . monoTextsBySentence ...@@ -43,7 +43,8 @@ monoTexts = L.concat . monoTextsBySentence
-- | TODO use text2term only -- | TODO use text2term only
monoText2term :: Lang -> Text -> Terms monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang PorterAlgorithm txt) monoText2term lang txt = Terms { _terms_label = [txt]
, _terms_stem = S.singleton $ stem lang PorterAlgorithm txt }
monoTextsBySentence :: Text -> [[Text]] monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map T.words monoTextsBySentence = map T.words
......
...@@ -11,7 +11,6 @@ Here is a longer description of this module, containing some ...@@ -11,7 +11,6 @@ Here is a longer description of this module, containing some
commentary with @some markup@. commentary with @some markup@.
-} -}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE ViewPatterns #-}
module Gargantext.Core.Text.Terms.WithList ( module Gargantext.Core.Text.Terms.WithList (
...@@ -26,6 +25,10 @@ module Gargantext.Core.Text.Terms.WithList ( ...@@ -26,6 +25,10 @@ module Gargantext.Core.Text.Terms.WithList (
-- * Properties -- * Properties
, prop_patterns_internal_consistency , prop_patterns_internal_consistency
-- * For debugging
, ReplaceTerms(..)
, replaceTerms
) where ) where
import Prelude (show) import Prelude (show)
...@@ -39,7 +42,7 @@ import Data.Text qualified as T ...@@ -39,7 +42,7 @@ import Data.Text qualified as T
import GHC.Exts (sortWith) import GHC.Exts (sortWith)
import Gargantext.API.Ngrams.Types (NgramsTerm(..)) import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core (Lang(ZH)) import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context ( Corpus, TermList, Label, MultiTerm )
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Utils (groupWithCounts) import Gargantext.Core.Utils (groupWithCounts)
...@@ -58,7 +61,7 @@ data Pattern = Pattern ...@@ -58,7 +61,7 @@ data Pattern = Pattern
} }
instance Show Pattern where instance Show Pattern where
show Pattern{..} = "Pattern (length: " <> Prelude.show _pat_length <> ", terms: " <> Prelude.show _pat_terms <> ")" show Pattern{..} = "Pattern { _pat_length = " <> Prelude.show _pat_length <> ", _pat_terms = " <> Prelude.show _pat_terms <> "}"
type Patterns = [Pattern] type Patterns = [Pattern]
...@@ -66,25 +69,44 @@ type Patterns = [Pattern] ...@@ -66,25 +69,44 @@ type Patterns = [Pattern]
data ReplaceTerms = KeepAll | LongestOnly data ReplaceTerms = KeepAll | LongestOnly
-- | Given a 'ReplaceTerms' strategy, patterns and a split text,
-- return matching terms according to strategy. This function is
-- usually applied to words in the whole sentence (i.e. 'terms'
-- variable contains a tokenized sentence, coming from
-- 'monoTextsBySentence').
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]] replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0 replaceTerms rTerms pats terms =
where List.concat (
terms_len = length terms mapMaybe (\(ix, _t) ->
go ix | ix >= terms_len = []
| otherwise =
case IntMap.lookup ix m of case IntMap.lookup ix m of
Nothing -> go (ix + 1) Nothing -> Nothing
Just (len, term) -> -- lst :: [(Int, [Text])]
term : go (ix + len) -- snd <$> lst :: [[Text]]
Just lst -> Just (snd <$> lst)) $ zip [0..] terms
)
--replaceTerms rTerms pats terms = go 0
where
-- termsLen :: Int
-- termsLen = length terms
-- go :: Int -> [[Text]]
-- go ix | ix >= termsLen = []
-- | otherwise =
-- case IntMap.lookup ix m of
-- Nothing -> go (ix + 1)
-- Just (len, term) ->
-- term : go (ix + len)
m :: IntMap [(Int, [Text])]
m = toMap m = toMap
[ (ix, (len, term)) [ (ix, (_pat_length, _pat_terms))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ] | Pattern { .. } <- pats
, ix <- KMP.match _pat_table terms ]
toMap = case rplaceTerms of
KeepAll -> IntMap.fromList toMap :: [(IntMap.Key, (Int, [Text]))] -> IntMap [(Int, [Text])]
LongestOnly -> IntMap.fromListWith merge toMap kv = case rTerms of
KeepAll -> IntMap.fromListWith (<>) (second (:[]) <$> kv)
LongestOnly -> IntMap.map (:[]) $ IntMap.fromListWith merge kv
where where
merge (len1, lab1) (len2, lab2) = merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2) if len2 < len1 then (len1, lab1) else (len2, lab2)
...@@ -97,7 +119,7 @@ buildPatterns :: TermList -> Patterns ...@@ -97,7 +119,7 @@ buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern) buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
where where
buildPattern :: Label -> [MultiTerm] -> [Pattern] buildPattern :: Label -> [MultiTerm] -> [Pattern]
buildPattern label alts = mapMaybe (mkPattern label) $ map (\alt -> filter (/= "") alt) (label : alts) buildPattern label alts = mapMaybe (mkPattern label) $ map (filter (/= "")) (label : alts)
mkPattern :: Label -> [Text] -> Maybe Pattern mkPattern :: Label -> [Text] -> Maybe Pattern
mkPattern label alt mkPattern label alt
...@@ -109,7 +131,9 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern) ...@@ -109,7 +131,9 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap (uncurry buildPattern)
-- which is exactly what we will be given as part of 'termsInText', -- which is exactly what we will be given as part of 'termsInText',
-- that calls 'monoTextsBySentence'. If we don't lower here it -- that calls 'monoTextsBySentence'. If we don't lower here it
-- means we won't be getting matches, whereas in theory we could. -- means we won't be getting matches, whereas in theory we could.
Pattern (KMP.build $ map T.toLower alt) (length alt) (map T.toLower label) Pattern { _pat_table = KMP.build $ map T.toLower alt
, _pat_length = length alt
, _pat_terms = map T.toLower label }
--(Terms label $ Set.empty) -- TODO check stems --(Terms label $ Set.empty) -- TODO check stems
...@@ -119,9 +143,7 @@ type MatchedText = Text ...@@ -119,9 +143,7 @@ type MatchedText = Text
termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)] termsInText :: Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText lang pats (manipulateText lang -> txt) = termsInText lang pats (manipulateText lang -> txt) =
groupWithCounts $ List.concat groupWithCounts $ List.concatMap (map unwords) (extractTermsWithList pats txt)
$ map (map unwords)
$ extractTermsWithList pats txt
-- | Manipulates the input 'Text' before passing it to 'termsInText'. -- | Manipulates the input 'Text' before passing it to 'termsInText'.
-- In particular, if the language is Chinese (ZH), we add spaces. -- In particular, if the language is Chinese (ZH), we add spaces.
...@@ -135,12 +157,11 @@ extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentenc ...@@ -135,12 +157,11 @@ extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentenc
-- | Extract terms -- | Extract terms
extractTermsWithList' :: Patterns -> Text -> [Text] extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats) extractTermsWithList' pats txt = map concat $ List.concat $ extractTermsWithList pats txt
. monoTextsBySentence
-------------------------------------------------------------------------- --------------------------------------------------------------------------
addSpaces :: Text -> Text addSpaces :: Text -> Text
addSpaces = T.unwords . (T.chunksOf 1) addSpaces = T.intersperse ' '
-------------------------------------------------------------------------- --------------------------------------------------------------------------
......
...@@ -120,6 +120,14 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd ...@@ -120,6 +120,14 @@ evolvSeaLadder nbFdt lambda freq similarities graph = map snd
where where
-------- --------
-- 2) find the local maxima in the quality distribution -- 2) find the local maxima in the quality distribution
-- TODO (seeg, #471) head throws errors when list is too short.
-- I propose this implementation, but I'm not sure of the length of the list
-- maxima = if List.length qua' > 1 then
-- [snd (List.head qua') > snd (List.head $ List.tail qua')] ++
-- (findMaxima qua') ++
-- [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
-- else
-- [True, True]
maxima :: [Bool] maxima :: [Bool]
maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')] maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
-------- --------
......
...@@ -11,6 +11,7 @@ Portability : POSIX ...@@ -11,6 +11,7 @@ Portability : POSIX
module Gargantext.Database.Action.Flow.Utils module Gargantext.Database.Action.Flow.Utils
( docNgrams ( docNgrams
, docNgrams'
, documentIdWithNgrams , documentIdWithNgrams
, insertDocNgrams , insertDocNgrams
, insertDocs , insertDocs
...@@ -68,21 +69,29 @@ insertDocNgrams lId m = do ...@@ -68,21 +69,29 @@ insertDocNgrams lId m = do
-- Given language, ngrams type, a list of terms and a -- | Given language, ngrams type, a list of terms and a
-- HyperdataDocument, return ngrams that are in this text, with counts. -- HyperdataDocument, return ngrams that are in this text, with
-- This is a pure function (doesn't use corenlp nor PostgreSQL FTS). -- counts. This is a pure function (doesn't use corenlp nor
-- PostgreSQL FTS).
docNgrams :: Lang docNgrams :: Lang
-> [NT.NgramsTerm] -> [NT.NgramsTerm]
-> ContextOnlyId HyperdataDocument -> ContextOnlyId HyperdataDocument
-> [(MatchedText, TermsCount)] -> [(MatchedText, TermsCount)]
docNgrams lang ts doc = docNgrams lang ts doc =
( docNgrams' lang ts
termsInText lang (buildPatternsWith lang ts)
$ T.unlines $ catMaybes $ T.unlines $ catMaybes
[ doc ^. context_oid_hyperdata . hd_title [ doc ^. context_oid_hyperdata . hd_title
, doc ^. context_oid_hyperdata . hd_abstract , doc ^. context_oid_hyperdata . hd_abstract
] ]
)
-- | Given language, ngrams type, a list of terms and a text, return
-- ngrams that are in this text, with counts.
docNgrams' :: Lang
-> [NT.NgramsTerm]
-> Text
-> [(MatchedText, TermsCount)]
docNgrams' lang ts txt =
termsInText lang (buildPatternsWith lang ts) txt
documentIdWithNgrams :: HasNodeError err documentIdWithNgrams :: HasNodeError err
...@@ -94,7 +103,8 @@ documentIdWithNgrams f = traverse toDocumentIdWithNgrams ...@@ -94,7 +103,8 @@ documentIdWithNgrams f = traverse toDocumentIdWithNgrams
where where
toDocumentIdWithNgrams d = do toDocumentIdWithNgrams d = do
e <- f $ _unIndex d e <- f $ _unIndex d
pure $ DocumentIdWithNgrams d e pure $ DocumentIdWithNgrams { documentWithId = d
, documentNgrams = e }
-- | TODO check optimization -- | TODO check optimization
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment