[ngrams] fixes to highlight (Karp-Rabin was broken)

parent 1ce56e02
Pipeline #5336 canceled with stage
......@@ -141,6 +141,7 @@ let additions =
[ "arrays"
, "enums"
, "foldable-traversable"
, "int64"
, "integers"
, "lists"
, "maybe"
......@@ -152,7 +153,7 @@ let additions =
, "uint"
]
, repo = "https://git.sr.ht/~cgenie/purescript-string-search"
, version = "v0.1.3"
, version = "v0.1.4"
}
, dom-filereader =
{ dependencies = [ "aff", "arraybuffer-types", "web-file", "web-html" ]
......
......@@ -29,6 +29,7 @@ to generate this file without the comments in this block.
, "css"
, "d3"
, "datetime"
, "debug"
, "dom-filereader"
, "dom-simple"
, "effect"
......
......@@ -28,7 +28,7 @@ import Data.Tuple.Nested ((/\))
import Effect (Effect)
import Gargantext.Components.Annotation.Menu (annotationMenu, AnnotationMenu)
import Gargantext.Components.Annotation.Types (MenuType(..), ModeType(..), termClass)
import Gargantext.Core.NgramsTable.Functions (Cache, findNgramTermList, highlightNgrams, normNgram)
import Gargantext.Core.NgramsTable.Functions (Cache, computeCache, findNgramTermList, highlightNgrams, normNgram)
import Gargantext.Core.NgramsTable.Types (HighlightElement, NgramsRepoElement(..), NgramsTable(..), NgramsTerm(..), parentMap)
import Gargantext.Types (CTabNgramType(..), TermList)
import Gargantext.Utils.Reactix as R2
......@@ -137,7 +137,7 @@ annotatedFieldInnerCpt = here.component "annotatedFieldInner" cpt where
{ className: "annotated-field-runs"
, aria: { expanded: false }
}
((\p -> annotateRun p) <$> wrap <$> compile cache ngrams fieldText)
(annotateRun <$> wrap <$> compile cache ngrams fieldText)
AdditionMode ->
......
......@@ -32,6 +32,7 @@ import Data.Traversable (for, traverse_, traverse)
import Data.TraversableWithIndex (traverseWithIndex)
import Data.Tuple (Tuple(..))
import Data.Tuple.Nested ((/\))
-- import Debug (trace)
import Effect (Effect)
import Effect.Aff (Aff, launchAff_)
import Effect.Class (liftEffect)
......@@ -48,6 +49,7 @@ import Gargantext.Utils.Either (eitherMap)
--import Gargantext.Utils.KarpRabin (indicesOfAny)
import Gargantext.Utils.Reactix as R2
import Gargantext.Utils.String as GS
import Gargantext.Utils.Tuple as GUT
import Partial (crashWith)
import Partial.Unsafe (unsafePartial)
import Reactix as R
......@@ -168,11 +170,17 @@ computeCache ngrams contextNgrams = { contextNgrams, pm, pats }
highlightNgrams :: Record Cache -> CTabNgramType -> NgramsTable -> String -> Array HighlightElement
highlightNgrams cache@{ pm, pats } ntype table@(NgramsTable {ngrams_repo_elements: elts}) input0 =
-- trace {pats, input0, input, ixs} \_ ->
A.fromFoldable ((\(s /\ ls)-> undb s /\ ls) <$> unsafePartial (foldl goFold ((input /\ Nil) : Nil) ixs))
-- trace ("[highlightNgrams] " <> show { input0
-- , ixs
-- , normInput: normNgramInternal ntype input
-- , hashStructPats: hashStruct.pats }) \_ ->
A.fromFoldable (GUT.first undb <$> unsafePartial (foldl goFold ((input /\ Nil) : Nil) ixs))
where
spR x = " " <> R.replace wordBoundaryReg "$1$1" x <> " "
-- reR = R.replace wordBoundaryReg " "
db = S.replaceAll (S.Pattern " ") (S.Replacement " ")
-- | We need to add spaces around terms to avoid matching on
-- | substrings, i.e. term "cat" shouldn't match "category"
sp x = " " <> db x <> " "
undb = R.replace wordBoundaryReg2 "$1"
input = spR input0
......@@ -183,6 +191,12 @@ highlightNgrams cache@{ pm, pats } ntype table@(NgramsTable {ngrams_repo_element
-- n) Set.empty elts
-- foldlWithIndex (\term acc (NgramsRepoElement nre) -> Set.union acc $ Set.insert term nre.children) Set.empty elts
hashStruct = SSKR.hashStruct (sp <<< ngramsTermText <$> pats)
-- | 'ixs' structure is a list of patterns matched by
-- | Karp-Rabin. First element of tuple is the position in string,
-- | second element is a list of matching patterns. Note that
-- | Karp-Rabin matches by hash, so hash collision is possible, we
-- | need to compare strings later.
ixs :: Array (Tuple Int (Array Int))
ixs = SSKR.indicesOfAnyHashStruct hashStruct (normNgramInternal ntype input)
splitAcc :: Partial => Int -> HighlightAccumulator
......@@ -215,6 +229,7 @@ highlightNgrams cache@{ pm, pats } ntype table@(NgramsTable {ngrams_repo_element
goAcc :: Partial => Int -> HighlightAccumulator -> Tuple NgramsTerm Int -> HighlightAccumulator
goAcc i acc (pat /\ lpat) =
-- let _ = trace ("[goAcc] acc: " <> show acc <> ", pat: " <> show pat <> ", lpat: " <> show lpat) \_ -> 0 in
case lookupRootListWithChildren pat table cache of
Nothing ->
-- crashWith $ "highlightNgrams: pattern [" <> show pat <> "] missing from table: " <> show table
......@@ -223,17 +238,28 @@ highlightNgrams cache@{ pm, pats } ntype table@(NgramsTable {ngrams_repo_element
let
(acc0 /\ acc1_2) = splitAcc i acc
(acc1 /\ acc2) = splitAcc (lpat + 1) acc1_2
-- _ = trace ("Split acc0: " <> show acc0) \_ -> acc0
-- _ = trace ("Split acc1_2: " <> show acc1_2) \_ -> acc1_2
-- _ = trace ("Split acc1: " <> show acc1) \_ -> acc1
-- _ = trace ("Split acc2: " <> show acc2) \_ -> acc2
text = extractInputTextMatch i lpat input
ng = normNgram ntype text
in
acc0 <> (addNgramElt ng ne_list <$> acc1) <> acc2
-- | Basic iteration function to check a single pattern matched by
-- | Karp-Rabin. 'i' corresponds to index of the first character
-- | in string, 'pis' is the list of indices of potentially
-- | matching patterns.
goFold :: Partial => HighlightAccumulator -> Tuple Int (Array Int) -> HighlightAccumulator
goFold acc (Tuple i pis) = foldl (goAcc i) acc $
-- A.sortWith snd $
map (\pat -> pat /\ S.length (db (ngramsTermText pat))) $
fromMaybe' (\_ -> crashWith "highlightNgrams: out of bounds pattern") $
traverse (A.index pats) pis
goFold acc (Tuple i pis) =
foldl (goAcc i) acc $
-- A.sortWith snd $
map (\pat -> pat /\ S.length (db (ngramsTermText pat))) $
-- we can't have non-matching index
fromMaybe' (\_ -> crashWith "highlightNgrams: out of bounds pattern") $
-- convert pattern indices into 'Maybe NgramsTerm'
traverse (A.index pats) pis
--applyNgramsTablePatchToSingleTerm :: NgramsTerm -> NgramsTablePatch -> Set NgramsTerm -> Set NgramsTerm
--applyNgramsTablePatchToSingleTerm ngram patch s =
......
......@@ -56,13 +56,13 @@ instance JSON.ReadForeign Session where
readImpl f = do
r <- JSON.readImpl f
let objTuple = Object.toUnfoldable r.caches :: Array (Tuple String NT.CacheState)
let rUp = r { caches = Map.fromFoldable (GUT.mapFst (fromMaybe 0 <<< Int.fromString) <$> objTuple) }
let rUp = r { caches = Map.fromFoldable (GUT.first (fromMaybe 0 <<< Int.fromString) <$> objTuple) }
pure $ Session rUp
instance JSON.WriteForeign Session where
writeImpl (Session { backend, caches, token, treeId, username, userId}) =
JSON.writeImpl { backend, caches: caches', token, treeId, username, userId }
where
caches' = JSON.writeImpl $ Object.fromFoldable (GUT.mapFst show <$> Map.toUnfoldable caches :: Array (Tuple String NT.CacheState))
caches' = JSON.writeImpl $ Object.fromFoldable (GUT.first show <$> Map.toUnfoldable caches :: Array (Tuple String NT.CacheState))
instance Eq Session where eq = genericEq
......
......@@ -2,5 +2,8 @@ module Gargantext.Utils.Tuple where
import Data.Tuple (Tuple(..))
mapFst :: forall a b c. (a -> c) -> Tuple a b -> Tuple c b
mapFst f (Tuple k v) = Tuple (f k) v
-- | Similar to
-- https://hackage.haskell.org/package/base-4.19.0.0/docs/Data-Bifunctor.html#v:first
first :: forall a b c. (a -> c) -> Tuple a b -> Tuple c b
first f (Tuple k v) = Tuple (f k) v
module Test.Gargantext.Components.NgramsTable.Spec where
import Prelude
-- import Test.Spec.QuickCheck (quickCheck')
import Data.List as L
import Data.Maybe (Maybe(..))
import Data.Map as Map
import Data.Maybe (Maybe(..), isJust)
import Data.Set as Set
import Data.Tuple (Tuple(..), fst)
import Test.Spec (Spec, describe, it)
-- import Test.Spec.Assertions (shouldEqual)
-- import Test.Spec.QuickCheck (quickCheck')
import Test.Utils (shouldEqualArray)
import Gargantext.Core.NgramsTable.Functions (highlightNgrams, normNgram, computeCache)
import Gargantext.Core.NgramsTable.Functions (highlightNgrams, normNgram, computeCache, lookupRootListWithChildren)
import Gargantext.Core.NgramsTable.Types (HighlightElement, NgramsElement(..), NgramsRepoElement(..), NgramsTable(..), NgramsTerm)
import Gargantext.Types (CTabNgramType(..), TermList(..))
import Prelude
import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldContain, shouldNotContain, shouldSatisfy)
import Test.Utils (shouldEqualArray)
ne :: String -> TermList -> CTabNgramType -> NgramsElement
......@@ -153,3 +151,29 @@ spec = do
, highlightNil ", after "
]
highlightNgrams cache CTabTerms table input `shouldEqualArray` output
it "works for #598" do
let input = "Passant d’une conception quasi solipsiste ou monadique de la sensation, présente dans l’Essai sur les données immédiates de la conscience, à l’idée d’une sensibilité universelle mais impersonnelle dans Matière et mémoire, Bergson se heurte à plusieurs difficultés, dont celle d’une dissolution de la notion de Sujet. Dans L’évolution créatrice puis dans Les deux sources de la morale et de la religion, son approche évolutionniste lui permet d’envisager la perspective d’une sensibilité indéfiniment extensible, inséparable désormais de celle d’un Vivant par définition métastable qui va de l’amibe jusqu’à Dieu."
let startInput = " Passant d’une "
let middleInput = " quasi solipsiste ou monadique de la sensation, présente dans l’Essai sur les données immédiates de la conscience, à l’idée d’une sensibilité universelle mais impersonnelle dans Matière et mémoire, Bergson se heurte à plusieurs difficultés, dont celle d’une dissolution de la "
let endInput = " de Sujet. Dans L’évolution créatrice puis dans Les deux sources de la morale et de la religion, son approche évolutionniste lui permet d’envisager la perspective d’une sensibilité indéfiniment extensible, inséparable désormais de celle d’un Vivant par définition métastable qui va de l’amibe jusqu’à Dieu. "
let ngramType = CTabSources
-- let termsS = ["arbre","article","automate","automates","calcul","cas","classe","concept","conception","construction","difficultés","données","décidabilité","effet","ensemble","extension","familles","fois","graphes","généralisons","intérêt","langage","langages","logique","longueur inférieure","monadique","mots","notion","opérations","pensée","perception","perspective","point","premier ordre","problème","produit","produits","rendre compte","second ordre","structure","structures","thèse","théorie","théorème","travail","travaux","travers","univers","variétés","vue","étude","évaluation"]
-- let contextNgramsS = ["concept","conception","difficultés","données","extension","monadique","Nothing"
-- ,"notion","perspective"]
let termsS = ["conception", "notion"]
let contextNgramsS = ["conception", "notion"]
let tnres = (\t -> tnre t MapTerm ngramType) <$> termsS
let table = NgramsTable { ngrams_repo_elements: Map.fromFoldable tnres
, ngrams_scores: Map.fromFoldable [] }
let cache = computeCache table $ Set.fromFoldable $ normNgram ngramType <$> contextNgramsS
lookupRootListWithChildren (normNgram ngramType "conception") table cache `shouldSatisfy` isJust
lookupRootListWithChildren (normNgram ngramType "notion") table cache `shouldSatisfy` isJust
highlightNgrams cache CTabTerms table input `shouldEqualArray`
[ highlightNil startInput
, highlightSingleton " conception" ngramType MapTerm
, highlightNil middleInput
, highlightSingleton " notion" ngramType MapTerm
, highlightNil endInput ]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment