[karp-rabin] migrate this algorithm to a separate repository

Also, performance is fixed now.
parent be456f9c
@@ -136,6 +136,24 @@ let additions =
 , repo = "https://github.com/purescript-spec/purescript-spec-quickcheck"
 , version = "v3.1.0"
 }
+, string-search =
+  { dependencies =
+    [ "arrays"
+    , "enums"
+    , "foldable-traversable"
+    , "integers"
+    , "lists"
+    , "maybe"
+    , "ordered-collections"
+    , "partial"
+    , "prelude"
+    , "strings"
+    , "tuples"
+    , "uint"
+    ]
+  , repo = "https://git.sr.ht/~cgenie/purescript-string-search"
+  , version = "v0.1.3"
+  }
 , dom-filereader =
 { dependencies = [ "aff", "arraybuffer-types", "web-file", "web-html" ]
 , repo = "https://github.com/nwolverson/purescript-dom-filereader"
...
@@ -97,10 +97,10 @@ let
 echo "Compiling"
 yarn
-spago build
+#spago build
 #build-purs
 echo "Testing"
-spago test
+spago -x test.dhall test --main Test.Main
 # pulp browserify --skip-compile -t dist/bundle.js --src-path output
 # pulp test --src-path output --test-path output
 #NODE_PATH=output node -e "require('Test.Main').main();"
...
@@ -33,7 +33,6 @@ to generate this file without the comments in this block.
 , "dom-simple"
 , "effect"
 , "either"
-, "enums"
 , "exceptions"
 , "ffi-simple"
 , "foldable-traversable"
@@ -75,6 +74,7 @@ to generate this file without the comments in this block.
 , "simple-json"
 , "simple-json-generics"
 , "simplecrypto"
+, "string-search"
 , "strings"
 , "strings-extra"
 , "stringutils"
@@ -85,7 +85,6 @@ to generate this file without the comments in this block.
 , "tuples-native"
 , "typelevel"
 , "typelevel-prelude"
-, "uint"
 , "unfoldable"
 , "unordered-collections"
 , "unsafe-coerce"
...
@@ -23,6 +23,7 @@ import Data.String as S
 import Data.String.Common as DSC
 import Data.String.Regex (Regex, regex, replace) as R
 import Data.String.Regex.Flags (global, multiline) as R
+import Data.String.Search.KarpRabin as SSKR
 import Data.String.Utils as SU
 import Data.These (These(..))
 import Data.Traversable (for, traverse_, traverse)
@@ -42,7 +43,7 @@ import Gargantext.Routes (SessionRoute(..))
 import Gargantext.Sessions (Session, get, post, put)
 import Gargantext.Types (AsyncTask, AsyncTaskType(..), AsyncTaskWithType(..), CTabNgramType(..), FrontendError, OrderBy(..), ScoreType(..), TabSubType(..), TabType(..), TermList(..), TermSize(..))
 import Gargantext.Utils.Either (eitherMap)
-import Gargantext.Utils.KarpRabin (indicesOfAny)
+--import Gargantext.Utils.KarpRabin (indicesOfAny)
 import Gargantext.Utils.Reactix as R2
 import Partial (crashWith)
 import Partial.Unsafe (unsafePartial)
@@ -138,7 +139,8 @@ highlightNgrams ntype table@(NgramsTable {ngrams_repo_elements: elts}) input0 =
 undb = R.replace wordBoundaryReg2 "$1"
 input = spR input0
 pats = A.fromFoldable (Map.keys elts)
-ixs = indicesOfAny (sp <<< ngramsTermText <$> pats) (normNgramInternal ntype input)
+hashStruct = SSKR.hashStruct (sp <<< ngramsTermText <$> pats)
+ixs = SSKR.indicesOfAnyHashStruct hashStruct (normNgramInternal ntype input)
 splitAcc :: Partial => Int -> HighlightAccumulator
 -> Tuple HighlightAccumulator HighlightAccumulator
...
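Note on the call-site change above: the in-tree indicesOfAny is replaced by Data.String.Search.KarpRabin from purescript-string-search, which splits the search into building a hash structure from the patterns once and then querying it against an input string. A minimal usage sketch follows; the module name is made up for illustration, and the result type is assumed to keep the Array (Tuple Int (Array Int)) shape the old indicesOfAny returned, as the surrounding code suggests (the real types live in the external package):

module HighlightSketch where

import Data.String.Search.KarpRabin as SSKR
import Data.Tuple (Tuple)

-- Hash the patterns once into a reusable structure...
patterns :: Array String
patterns = [ "graph", "term" ]

-- ...then query it against any number of inputs.
-- Result shape assumed to mirror the old indicesOfAny: pairs of a start
-- index and the indices of the patterns that match there.
matchesIn :: String -> Array (Tuple Int (Array Int))
matchesIn = SSKR.indicesOfAnyHashStruct (SSKR.hashStruct patterns)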
@@ -7,7 +7,7 @@ import Data.List as List
 import Data.Maybe
 import Data.Sequence as Seq
 import Data.Tuple (Tuple(..))
-import Prelude (bind, flip, identity, (<<<), ($), (<), (-), (<$>))
+import Prelude
 ----------------------------------------------------------------------
 -- | Split arrays tools
@@ -72,9 +72,4 @@ slidingWindow lst len =
 let diff = DA.length lst - len
 in
 if diff < 0 then []
-else (\idx -> DA.take len $ DA.drop idx lst) <$> 0 .. diff
+else (\idx -> DA.slice idx (idx + len) lst) <$> 0 .. diff
--- slidingWindow lst len = DA.fromFoldable $ go lst
--- where
--- go l =
--- if (DA.length l) < len then List.Nil
--- else (DA.take len l) List.: (go $ DA.drop 1 l)
...
Deleted file: Gargantext.Utils.KarpRabin (algorithm migrated to https://git.sr.ht/~cgenie/purescript-string-search)
-- |
-- The present module has been ported from Haskell to PureScript
-- by Nicolas Pouillard for the Gargantext project.
--
-- Original Haskell code:
-- Copyright : (c) 2010 Daniel Fischer
-- Licence : BSD3
-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com>
--
-- Simultaneous search for multiple patterns in a 'String'
-- using the Karp-Rabin algorithm.
--
-- A description of the algorithm for a single pattern can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>.
module Gargantext.Utils.KarpRabin
  ( -- * Overview
    -- $overview
    -- ** Caution
    -- $caution
    -- * Function
    indicesOfAny
  ) where

import Data.Array as A
import Data.Array ((..))
import Data.Enum (fromEnum)
import Data.Foldable (class Foldable, minimum, foldl)
import Data.FunctorWithIndex (mapWithIndex)
import Data.Int (quot)
import Data.List as L
import Data.Map as M
import Data.Maybe (Maybe(..), isJust)
import Data.String as S
import Data.String (CodePoint)
import Data.Tuple (Tuple(..), snd)
import Data.UInt (UInt, shl, fromInt)
import Partial.Unsafe (unsafePartial)
import Gargantext.Data.Array as GDA
import Prelude
fromCodePoint :: CodePoint -> UInt
fromCodePoint c = fromInt (fromEnum c)
-- $overview
--
-- The Karp-Rabin algorithm works by calculating a hash of the pattern and
-- comparing that hash with the hash of a slice of the target string with
-- the same length as the pattern. If the hashes are equal, the slice of the
-- target is compared to the pattern byte for byte (since the hash
-- function generally isn't injective).
--
-- For a single pattern, this tends to be more efficient than the naïve
-- algorithm, but it cannot compete with algorithms like
-- Knuth-Morris-Pratt or Boyer-Moore.
--
-- However, the algorithm can be generalised to search for multiple patterns
-- simultaneously. If the shortest pattern has length @k@, hash the prefix of
-- length @k@ of all patterns and compare the hash of the target's slices of
-- length @k@ to them. If there's a match, check whether the slice is part
-- of an occurrence of the corresponding pattern.
--
-- With a hash-function that
--
-- * allows computing the hash of one slice in constant time from the hash
-- of the previous slice, the new and the dropped character, and
--
-- * produces few spurious matches,
--
-- searching for occurrences of any of @n@ patterns has a best-case complexity
-- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is
-- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is
-- not much worse than the best case.
--
-- The functions in this module store the hashes of the patterns in an
-- 'Map', so the lookup is /O/(@log n@). Re-hashing is done in constant
-- time and spurious matches of the hashes /should be/ sufficiently rare.
-- The maximal length of the prefixes to be hashed is 32.
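--
-- For concreteness, the hash used below is
--
-- > hash [c0, c1, .., c(k-1)] = (c0 << d*(k-1)) + (c1 << d*(k-2)) + .. + c(k-1)
--
-- where @d@ is the per-character shift ('shDi') and @k@ the hashed prefix
-- length. Sliding the window one position to the right then needs only the
-- previous hash, the outgoing code point @o@ and the incoming code point @n@:
--
-- > newHash = (oldHash << d) - (o << d*k) + n
--
-- which is exactly what 'rehash'' computes, in constant time.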
-- $caution
--
-- Unfortunately, the constant factors are high, so these functions are slow.
-- Unless the number of patterns to search for is high (larger than 50 at
-- least), repeated search for single patterns using Boyer-Moore or DFA and
-- manual merging of the indices is faster. /Much/ faster for less than 40
-- or so patterns.
--
-- In summary, this module is more of an interesting curiosity than anything
-- else.
-- | @'indicesOfAny'@ finds all occurrences of any of several non-empty patterns
-- in a target string. If no non-empty patterns are given,
-- the result is an empty array. Otherwise the result array contains
-- the pairs of all indices where any of the (non-empty) patterns start
-- and the array of all patterns starting at that index, the patterns being
-- represented by their (zero-based) position in the pattern array.
-- Empty patterns are filtered out before processing begins.
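--
-- For example (illustrative):
--
-- > indicesOfAny ["ab", "ra"] "abracadabra"
-- >   == [Tuple 0 [0], Tuple 2 [1], Tuple 7 [0], Tuple 9 [1]]
--
-- "ab" (pattern 0) starts at indices 0 and 7, "ra" (pattern 1) at 2 and 9.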
indicesOfAny :: Array String                   -- ^ Array of non-empty patterns
             -> String                         -- ^ String to search
             -> Array (Tuple Int (Array Int))  -- ^ Array of matches
indicesOfAny pats = if A.null nepats then const []
                    else strictMatcher nepats
  where
    nepats = A.filter (not <<< S.null) pats
------------------------------------------------------------------------------
-- Workers --
------------------------------------------------------------------------------
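-- | Rolling-hash update: shift the previous hash left by the per-character
-- shift @shDi@, drop the contribution of the outgoing code point @o@ (which
-- after the shift sits at bit offset @out = shDi * hLen@) and add the
-- incoming code point @n@.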
rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
rehash' shDi out h o n =
  (h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n

minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
minimum1 a fa =
  case minimum fa of
    Nothing -> a
    Just b -> min a b
type GoAcc = Tuple UInt (Array (Maybe (Tuple Int (Array Int))))
strictMatcher :: Array String -> String -> Array (Tuple Int (Array Int))
strictMatcher pats = unsafePartial search
  where
    hLen = minimum1 32 (S.length <$> pats)
    hLen' = fromInt hLen
    shDi = case 32 `quot` hLen of
             q | q < 4 -> q
               | otherwise -> 4
    outS = fromInt (shDi * hLen)
    patNum = A.length pats
    rehash :: UInt -> CodePoint -> CodePoint -> UInt
    rehash = case shDi of
               1 -> rehash' (fromInt 1) hLen'
               2 -> rehash' (fromInt 2) outS
               3 -> rehash' (fromInt 3) outS
               _ -> rehash' (fromInt 4) outS
    hash :: String -> UInt
    hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
       <<< S.toCodePointArray
       <<< S.take hLen
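    -- | Hash of each pattern's length-@hLen@ prefix, mapped to the (0-based)
    -- indices of the patterns that share that prefix hash.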
    hashMap :: M.Map UInt (Array Int)
    hashMap =
      M.fromFoldableWith (flip (<>))
        (mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
    search :: Partial => String -> Array (Tuple Int (Array Int))
    search str = if strLen < hLen then []
                 else A.catMaybes $ snd <$> foldl go' (Tuple shash []) $ A.zip (0 .. maxIdx) (GDA.slidingWindow arr hLen)
                 -- else A.fromFoldable (go 0 shash)
      where
        strLen = S.length str
        maxIdx = strLen - hLen
        arr = S.toCodePointArray str
        strAt i = A.unsafeIndex arr i
        shash :: UInt
        shash = hash str
        -- NOTE: A new function, with a map over the range [0..maxIdx]
        go' :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
        go' (Tuple h acc) (Tuple idx arr') =
          let rehashed = rehash h (A.unsafeIndex arr' 0) (A.unsafeIndex arr' (-1))
              str' = S.fromCodePointArray arr'
              okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
              mPs = M.lookup h hashMap
              acc' = case mPs of
                       Nothing -> Nothing
                       Just ps ->
                         case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
                           [] -> Nothing
                           qs -> Just (Tuple idx qs)
          in
            Tuple rehashed (A.snoc acc acc')
-- go :: Int -> UInt -> L.List (Tuple Int (Array Int))
-- go sI h =
-- let rehashed = rehash h (strAt sI) (strAt (sI + hLen))
-- in
-- if sI == maxIdx then
-- L.Nil
-- else
-- case M.lookup h hashMap of
-- Nothing ->
-- go (sI + 1) rehashed
-- Just ps ->
-- let rst = S.drop sI str
-- okay bs =
-- isJust (S.stripPrefix (S.Pattern bs) rst)
-- in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
-- [] -> go (sI + 1) rehashed
-- qs -> Tuple sI qs L.: (go (sI + 1) rehashed)
-- case M.lookup h hashMap of
-- Nothing ->
-- if sI == maxIdx
-- then L.Nil
-- else go (sI + 1) (rehash h (strAt sI) (strAt (sI + hLen)))
-- Just ps ->
-- let rst = S.drop sI str
-- hd = strAt sI
-- more = if sI == maxIdx then L.Nil else
-- go (sI + 1) (rehash h hd (strAt (sI + hLen)))
-- okay bs =
-- isJust (S.stripPrefix (S.Pattern bs) rst)
-- in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
-- [] -> more
-- qs -> Tuple sI qs L.: more
@@ -21,7 +21,7 @@ ne :: String -> TermList -> CTabNgramType -> NgramsElement
 ne ngrams list ngramType = NgramsElement { ngrams: normed
 , size: 1 -- TODO
 , list
-, occurrences: 0
+, occurrences: Set.singleton 0
 , parent: Nothing
 , root: Nothing
 , children: Set.empty
...