highlightNgrams now supports nested/crossing ngrams

TODO: UI & better testing

highlightNgrams now supports nested/crossing ngrams
TODO: UI & better testing
c3987256 · Nicolas Pouillard · dd85f69e · c3987256
Commit c3987256 authored Nov 27, 2020 by Nicolas Pouillard
Hide whitespace changes
Inline Side-by-side

Showing with 43 additions and 47 deletions

Core.purs src/Gargantext/Components/NgramsTable/Core.purs +43 -47

No files found.
--- a/src/Gargantext/Components/NgramsTable/Core.purs
+++ b/src/Gargantext/Components/NgramsTable/Core.purs
@@ -93,7 +93,7 @@ import Data.Lens.Index (class Index, ix)
 import Data.Lens.Iso.Newtype (_Newtype)
 import Data.Lens.Record (prop)
 import Data.List ((:), List(Nil))
-import Data.List as List
+import Data.List as L
 import Data.Map (Map)
 import Data.Map as Map
 import Data.Maybe (Maybe(..), fromMaybe, fromMaybe', isJust)
@@ -436,74 +436,70 @@ wordBoundaryReg2 = case R.regex ("(" <> wordBoundaryChars <> ")\\1") (R.global <
  Left e  -> unsafePartial $ crashWith e
  Right r -> r
-type HighlightAccumulator =
+type HighlightElement = Tuple String (List (Tuple NgramsTerm TermList))
-  { i0 :: Int    -- where are we in input
+type HighlightAccumulator = List HighlightElement
-  , s  :: String -- == drop i0 input
-  , l  :: List (Tuple String (List (Tuple NgramsTerm TermList)))
-  }
 -- TODO: while this function works well with word boundaries,
 --       it inserts too many spaces.
-highlightNgrams :: CTabNgramType -> NgramsTable -> String -> Array (Tuple String (List (Tuple NgramsTerm TermList)))
+highlightNgrams :: CTabNgramType -> NgramsTable -> String -> Array HighlightElement
 highlightNgrams ntype table@(NgramsTable {ngrams_repo_elements: elts}) input0 =
    -- trace {pats, input0, input, ixs} \_ ->
-    let sN = unsafePartial (foldl goFold {i0: 0, s: input, l: Nil} ixs) in
+    A.fromFoldable ((\(s /\ ls)-> undb s /\ ls) <$> unsafePartial (foldl goFold ((input /\ Nil) : Nil) ixs))
-    A.reverse (A.fromFoldable (consNonEmpty (undb (init sN.s)) sN.l))
  where
    spR x = " " <> R.replace wordBoundaryReg "$1$1" x <> " "
    reR = R.replace wordBoundaryReg " "
    db = S.replaceAll (S.Pattern " ") (S.Replacement "  ")
    sp x = " " <> db x <> " "
    undb = R.replace wordBoundaryReg2 "$1"
-    init x = S.take (S.length x - 1) x
    input = spR input0
    pats = A.fromFoldable (Map.keys elts)
    ixs = indicesOfAny (sp <<< ngramsTermText <$> pats) (normNgramInternal ntype input)
-    consOnJustTail s xs@(Tuple _ (_ : _) : _) = Tuple s Nil : xs
+    splitAcc :: Partial => Int -> HighlightAccumulator
-    consOnJustTail _ xs = xs
+             -> Tuple HighlightAccumulator HighlightAccumulator
+    splitAcc i = go 0 Nil
-    consNonEmpty x xs
+      where
-      | S.null x  = xs
+      go j pref acc =
-      | otherwise = Tuple x Nil : xs
+        case compare i j of
+          LT -> crashWith "highlightNgrams: splitAcc': i < j"
-    goAcc :: Partial => Tuple NgramsTerm Int -> Int -> HighlightAccumulator -> HighlightAccumulator
+          EQ -> L.reverse pref /\ acc
-    goAcc (pat /\ lpat) i { i0, s, l } =
+          GT ->
+            case acc of
+              Nil -> crashWith "highlightNgrams: splitAcc': acc=Nil" -- pref /\ Nil
+              elt@(s /\ ls) : elts ->
+                let slen = S.length s in
+                case compare i (j + slen) of
+                  LT -> let {before: s0, after: s1} = S.splitAt (i - j) s in
+                        L.reverse ((s0 /\ ls) : pref) /\ ((s1 /\ ls) : elts)
+                  EQ -> L.reverse (elt : pref) /\ elts
+                  GT -> go (j + slen) (elt : pref) elts
+    extractInputTextMatch :: Int -> Int -> String -> String
+    extractInputTextMatch i len input = undb $ S.take len $ S.drop (i + 1) input
+    addNgramElt ng ne_list (elt /\ elt_lists) = (elt /\ ((ng /\ ne_list) : elt_lists))
+    goAcc :: Partial => Int -> HighlightAccumulator -> Tuple NgramsTerm Int -> HighlightAccumulator
+    goAcc i acc (pat /\ lpat) =
      case lookupRootList pat table of
        Nothing ->
          crashWith "highlightNgrams: pattern missing from table"
        Just ne_list ->
          let
-            s1    = {-if i <= i0 then-} S.splitAt (i - i0) s {-else S.drop i input-}
+            (acc0 /\ acc1_2) = splitAcc i acc
-            s2    = S.splitAt lpat (S.drop 1 s1.after)
+            (acc1 /\ acc2) = splitAcc (lpat + 1) acc1_2
-            s3    = S.splitAt 1    s2.after
+            text = extractInputTextMatch i lpat input
-            unspB = if i0 == 0 then S.drop 1 else identity
+            ng = normNgram ntype text
-            s3b   = s3.before
-            text  = undb s2.before
          in
-          -- trace {s, i, i0, s1, s2, s3, pat, lpat, s3b} \_ ->
+            acc0 <> (addNgramElt ng ne_list <$> acc1) <> acc2
-          -- `undb s2.before` and pat might differ by casing only!
-          { i0: i + lpat + 2
-          , s:  s3.after
-          , l:  Tuple text ((normNgram ntype text /\ ne_list) : Nil) :
-                consOnJustTail s3b
-                (consNonEmpty (unspB (undb s1.before)) l)
-          }
-    -- NOTE that only the first matching pattern is used, the others are ignored!
    goFold :: Partial => HighlightAccumulator -> Tuple Int (Array Int) -> HighlightAccumulator
-    goFold acc@{ i0, s, l } (Tuple i pis)
+    goFold acc (Tuple i pis) = foldl (goAcc i) acc $
-      | i < i0    =
+                           --  A.sortWith snd $
-        -- Skip this pattern which is overlapping with a previous one.
+                               map (\pat -> pat /\ S.length (db (ngramsTermText pat))) $
-        { i0, s, l }
+                               fromMaybe' (\_ -> crashWith "highlightNgrams: out of bounds pattern") $
-      | otherwise =
+                               traverse (A.index pats) pis
-        let pats' = A.sortWith snd $
-                    map (\pat -> pat /\ S.length (db (ngramsTermText pat))) $
-                    fromMaybe' (\_ -> crashWith "highlightNgrams: out of bounds pattern") $
-                    traverse (A.index pats) pis in
-        case List.fromFoldable pats' of
-          Nil -> { i0, s, l }
-          pat : _ -> goAcc pat i acc
 -----------------------------------------------------------------------------------