Initial commit and a working version of KarpRabin

f272bf8e · Przemyslaw Kaminski · f272bf8e · f272bf8e · f272bf8e · f272bf8e
Verified Commit f272bf8e authored Mar 01, 2023 by Przemyslaw Kaminski
13 changed files
--- a/.envrc
+++ b/.envrc
+use nix
--- a/.gitignore
+++ b/.gitignore
+.spago/
+node_modules/
+output/
--- a/nix/pinned.nix
+++ b/nix/pinned.nix
+import (
+  builtins.fetchTarball {
+    url = "https://github.com/NixOS/nixpkgs/archive/22.11.tar.gz";
+  }
+)
--- a/packages.dhall
+++ b/packages.dhall
+let upstream =
+      https://github.com/garganscript/package-sets/releases/download/v0.1.7/release.dhall
+        sha256:52886309e1f0158a85427f80c1e3d47ce747c5f14fcec671a81fe5c2c711a6db
+in  upstream
--- a/shell.nix
+++ b/shell.nix
+{ pkgs ? import ./nix/pinned.nix {} }:
+let
+  default-shell = import (
+    pkgs.fetchFromGitHub {
+      owner = "garganscript";
+      repo = "package-sets";
+      rev = "master";
+      sha256 = "RYFsTna5cg7EjynLvkFg87zTLSMeZw7DNIgPu9iXokk=";
+    } + "/default-shell.nix");
+  build = pkgs.writeShellScriptBin "build" ''
+    #!/usr/bin/env bash
+    set -e
+    echo "Installing JS Dependencies"
+    yarn
+    echo "Compiling"
+    # 0.15
+    spago build
+    #spago bundle-module --main Main --to dist/bundle.js
+  '';
+  test-ps = pkgs.writeShellScriptBin "test-ps" ''
+    #!/usr/bin/env bash
+    set -e
+    echo "Compiling"
+    yarn
+    echo "Testing"
+    spago -x test.dhall test --main Test.Main
+  '';
+in
+pkgs.mkShell {
+  name = "purescript-string-search";
+  buildInputs = [
+    build
+    test-ps
+    default-shell.purs
+    default-shell.easy-ps.psc-package
+    default-shell.easy-ps.spago
+    default-shell.build
+    default-shell.pkgs.dhall-json
+    default-shell.pkgs.nodejs
+  ];
+}
--- a/spago.dhall
+++ b/spago.dhall
+{-
+Welcome to a Spago project!
+You can edit this file as you like.
+Need help? See the following resources:
+- Spago documentation: https://github.com/purescript/spago
+- Dhall language tour: https://docs.dhall-lang.org/tutorials/Language-Tour.html
+When creating a new Spago project, you can use
+`spago init --no-comments` or `spago init -C`
+to generate this file without the comments in this block.
+-}
+{ name = "string-search"
+, dependencies =
+  [ "arrays"
+  , "debug"
+  , "enums"
+  , "foldable-traversable"
+  , "integers"
+  , "lists"
+  , "maybe"
+  , "ordered-collections"
+  , "partial"
+  , "prelude"
+  , "strings"
+  , "tuples"
+  , "uint"
+  ]
+, packages = ./packages.dhall
+, sources = [ "src/**/*.purs" ]
+}
--- a/src/Data/String/Search.purs
+++ b/src/Data/String/Search.purs
+module Data.String.Search where
--- a/src/Data/String/Search/KarpRabin.purs
+++ b/src/Data/String/Search/KarpRabin.purs
+-- |
+-- The present module has been ported from Haskell to PureScript
+-- by Nicolas Pouillard for the Gargantext projet.
+--
+-- Original Haskell code:
+--   Copyright      : (c) 2010 Daniel Fischer
+--   Licence        : BSD3
+--   Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
+--
+-- Simultaneous search for multiple patterns in a 'String'
+-- using the Karp-Rabin algorithm.
+--
+-- A description of the algorithm for a single pattern can be found at
+-- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>.
+module Data.String.Search.KarpRabin (
+                                    -- $overview
+                                    -- ** Caution
+                                    -- $caution
+                                    -- * Function
+                                    indicesOfAny
+                                  , indicesOfAnyHashStruct
+                                  , indicesOfAnyLegacy
+                                  , HashStruct
+                                  , hashStruct
+                                  ) where
+import Data.Array as A
+import Data.Array ((..))
+import Data.Enum (fromEnum)
+import Data.Foldable (class Foldable, minimum, foldl)
+import Data.FunctorWithIndex (mapWithIndex)
+import Data.Int (quot)
+import Data.List as L
+import Data.Map as M
+import Data.Maybe (Maybe(..), isJust, fromJust)
+import Data.String as S
+import Data.String (CodePoint)
+import Data.String.Search.Utils (slidingWindow)
+import Data.Tuple (Tuple(..), snd)
+import Data.Tuple.Nested ((/\))
+import Data.UInt (UInt, shl, fromInt)
+import Debug (trace)
+import Partial.Unsafe (unsafePartial)
+import Prelude
+fromCodePoint :: CodePoint -> UInt
+fromCodePoint c = fromInt (fromEnum c)
+-- | This struct is for performance reasons.
+type HashStruct = {
+    hash    :: String -> UInt
+  , hashMap :: M.Map UInt (Array Int)
+  , hLen    :: Int
+  , pats    :: Array String
+  , rehash  :: UInt -> CodePoint -> CodePoint -> UInt
+  }
+-- | This is a temporary structure, computed from given patterns, to
+-- | make the function more performant (suppose we want to match the
+-- | same patterns many times for a given string: we need to compute
+-- | `hash` etc only once for these patterns).
+hashStruct :: Array String -> HashStruct
+hashStruct pats = { hash, hashMap, hLen, pats, rehash }
+  where
+    hLen = minimum1 32 (S.length <$> pats)
+    hLen' = fromInt hLen
+    shDi = case 32 `quot` hLen of
+              q | q < 4 -> q
+                | otherwise -> 4
+    outS = fromInt (shDi * hLen)
+    rehash :: UInt -> CodePoint -> CodePoint -> UInt
+    rehash = case shDi of
+                1 -> rehash' (fromInt 1) hLen'
+                2 -> rehash' (fromInt 2) outS
+                3 -> rehash' (fromInt 3) outS
+                _ -> rehash' (fromInt 4) outS
+    hash :: String -> UInt
+    hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
+        <<< S.toCodePointArray
+        <<< S.take hLen
+    hashMap :: M.Map UInt (Array Int)
+    hashMap =
+      M.fromFoldableWith (flip (<>))
+                         (mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
+-- $overview
+--
+-- The Karp-Rabin algorithm works by calculating a hash of the pattern and
+-- comparing that hash with the hash of a slice of the target string with
+-- the same length as the pattern. If the hashes are equal, the slice of the
+-- target is compared to the pattern byte for byte (since the hash
+-- function generally isn't injective).
+--
+-- For a single pattern, this tends to be more efficient than the na&#239;ve
+-- algorithm, but it cannot compete with algorithms like
+-- Knuth-Morris-Pratt or Boyer-Moore.
+--
+-- However, the algorithm can be generalised to search for multiple patterns
+-- simultaneously. If the shortest pattern has length @k@, hash the prefix of
+-- length @k@ of all patterns and compare the hash of the target's slices of
+-- length @k@ to them. If there's a match, check whether the slice is part
+-- of an occurrence of the corresponding pattern.
+--
+-- With a hash-function that
+--
+--   * allows to compute the hash of one slice in constant time from the hash
+--     of the previous slice, the new and the dropped character, and
+--
+--   * produces few spurious matches,
+--
+-- searching for occurrences of any of @n@ patterns has a best-case complexity
+-- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is
+-- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is
+-- not much worse than the best case.
+--
+-- The functions in this module store the hashes of the patterns in an
+-- 'Map', so the lookup is /O/(@log n@). Re-hashing is done in constant
+-- time and spurious matches of the hashes /should be/ sufficiently rare.
+-- The maximal length of the prefixes to be hashed is 32.
+-- $caution
+--
+-- Unfortunately, the constant factors are high, so these functions are slow.
+-- Unless the number of patterns to search for is high (larger than 50 at
+-- least), repeated search for single patterns using Boyer-Moore or DFA and
+-- manual merging of the indices is faster. /Much/ faster for less than 40
+-- or so patterns.
+--
+-- In summary, this module is more of an interesting curiosity than anything
+-- else.
+-- | @'indicesOfAny'@ finds all occurrences of any of several non-empty patterns
+--   in a strict target string. If no non-empty patterns are given,
+--   the result is an empty array. Otherwise the result array contains
+--   the pairs of all indices where any of the (non-empty) patterns start
+--   and the array of all patterns starting at that index, the patterns being
+--   represented by their (zero-based) position in the pattern array.
+--   Empty patterns are filtered out before processing begins.
+indicesOfAny :: Array String                  -- ^ Array of non-empty patterns
+             -> String                        -- ^ String to search
+             -> Array (Tuple Int (Array Int)) -- ^ Array of matches
+indicesOfAny pats = if A.null nepats then const []
+                    else strictMatcher $ hashStruct nepats
+  where
+    nepats = A.filter (not <<< S.null) pats
+indicesOfAnyHashStruct :: HashStruct
+                       -> String
+                       -> Array (Tuple Int (Array Int))
+indicesOfAnyHashStruct hs = if A.null nepats then const []
+                            else strictMatcher hs
+  where
+    nepats = A.filter (not <<< S.null) hs.pats
+------------------------------------------------------------------------------
+--                                 Workers                                 --
+------------------------------------------------------------------------------
+rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
+rehash' shDi out h o n =
+    (h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n
+minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
+minimum1 a fa =
+  case minimum fa of
+    Nothing -> a
+    Just b  -> min a b
+type GoAcc = Tuple UInt (Array (Tuple Int (Array Int)))
+strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int))
+strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
+  where
+    search :: Partial => String -> Array (Tuple Int (Array Int))
+    search str =
+      if strLen < hLen then []
+      else snd <$> foldl go (Tuple shash []) $ A.zip (0 .. maxIdx) window
+          where
+            -- | Sliding window of substrings. Note that we need to
+            -- | add the last one.
+            window :: Array (Array CodePoint)
+            window = slidingWindow arr (hLen + 1) <> [A.take hLen $ A.drop maxIdx arr]
+            strLen = S.length str
+            maxIdx = strLen - hLen
+            arr = S.toCodePointArray str
+            shash :: UInt
+            shash = hash str
+            go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
+            go (Tuple h acc) (Tuple idx arr') =
+              let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr')
+                  -- str' = S.fromCodePointArray arr'
+                  str' = S.drop idx str
+                  okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
+                  el = case M.lookup h hashMap of
+                    Nothing -> Nothing
+                    Just ps ->
+                      case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
+                        [] -> Nothing
+                        qs -> Just (Tuple idx qs)
+                  acc' = case el of
+                    Nothing -> acc
+                    Just el' -> A.snoc acc el'
+              in
+               Tuple rehashed acc'
+-- LEGACY STUFF
+indicesOfAnyLegacy :: Array String                  -- ^ Array of non-empty patterns
+                   -> String                        -- ^ String to search
+                   -> Array (Tuple Int (Array Int)) -- ^ Array of matches
+indicesOfAnyLegacy pats = if A.null nepats then const []
+                          else strictMatcherLegacy $ hashStruct nepats
+      where
+        nepats = A.filter (not <<< S.null) pats
+-- | This function works correctly, however it has recursion error
+-- | problems inside `search`.
+strictMatcherLegacy :: HashStruct -> String -> Array (Tuple Int (Array Int))
+strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search
+  where
+    search :: Partial => String -> Array (Tuple Int (Array Int))
+    search str = if strLen < hLen then []
+                 else A.fromFoldable (go 0 shash)
+      where
+        arr = S.toCodePointArray str
+        shash :: UInt
+        shash = hash str
+        strLen = S.length str
+        maxIdx = strLen - hLen
+        strAt i = A.unsafeIndex arr i
+        go :: Int -> UInt -> L.List (Tuple Int (Array Int))
+        go sI h =
+          let hd = strAt sI
+              isEnd = sI == maxIdx
+          in
+           case M.lookup h hashMap of
+             Nothing ->
+               if isEnd then L.Nil
+               else go (sI + 1) (rehash h hd (strAt (sI + hLen)))
+             Just ps ->
+               let rst = S.drop sI str
+                   more = if isEnd then L.Nil
+                          else go (sI + 1) (rehash h hd (strAt (sI + hLen)))
+                   okay bs =
+                     isJust (S.stripPrefix (S.Pattern bs) rst)
+               in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
+                 [] -> more
+                 qs -> Tuple sI qs L.: more
+--            go sI h =
+--              case M.lookup h hashMap of
+--                Nothing ->
+--                  if sI == maxIdx
+--                    then L.Nil
+--                    else go (sI + 1) (rehash h (strAt sI) (strAt (sI + hLen)))
+--                Just ps ->
+--                  let rst = S.drop sI str
+--                      hd  = strAt sI
+--                      more = if sI == maxIdx then L.Nil else
+--                                go (sI + 1) (rehash h hd (strAt (sI + hLen)))
+--                      okay bs =
+--                        isJust (S.stripPrefix (S.Pattern bs) rst)
+--                  in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
+--                           [] -> more
+--                           qs -> Tuple sI qs L.: more
--- a/src/Data/String/Search/Utils.purs
+++ b/src/Data/String/Search/Utils.purs
+module Data.String.Search.Utils where
+import Data.Array
+import Prelude
+slidingWindow :: forall a. Array a -> Int -> Array (Array a)
+slidingWindow lst len =
+  let diff = length lst - len
+  in
+   if diff < 0 then []
+   else (\idx -> slice idx (idx + len) lst) <$> 0 .. diff
--- a/test.dhall
+++ b/test.dhall
+let conf = ./spago.dhall
+in      conf
+    //  { sources = conf.sources # [ "test/**/*.purs" ]
+        , dependencies =
+              conf.dependencies
+            # [ "aff", "effect", "spec", "spec-discovery", "spec-quickcheck" ]
+        }
--- a/test/Data/String/Search/Spec.purs
+++ b/test/Data/String/Search/Spec.purs
+module Data.String.Search.KarpRabin.Spec where
+import Prelude
+import Data.Array (index)
+import Data.Foldable (all)
+import Data.Maybe (Maybe(..), isJust)
+import Data.String (drop, stripPrefix, Pattern(..))
+import Data.String.Search.KarpRabin (indicesOfAnyLegacy, indicesOfAny)
+import Data.Tuple (Tuple(..))
+import Test.Spec (Spec, describe, it)
+import Test.Spec.Assertions (shouldEqual)
+import Test.Spec.QuickCheck (quickCheck')
+validIndices :: Array String -> String -> Boolean
+validIndices pats input = all validIndex (indicesOfAny pats input)
+  where
+    validIndex (Tuple i ps) = all validPat ps
+      where
+        input' = drop i input
+        validPat p =
+          case index pats p of
+            Just pat -> isJust (stripPrefix (Pattern pat) input')
+                        -- <?> (show input' <> " should start with " <> show pat)
+            Nothing  -> false -- Failed "out of bounds pattern"
+-- indicesOfAny :: Array String -> String -> Array (Tuple Int (Array Int))
+-- indicesOfAny = indicesOfAnyLegacy
+spec :: Spec Unit
+spec =
+  describe "KarpRabin" do
+    it "works on a single pattern matching two times" do
+      let pats   = ["ab"]
+      let input  = "abcbab"
+      let output = [Tuple 0 [0], Tuple 4 [0]]
+      indicesOfAny pats input `shouldEqual` output
+      -- hashes: 1650, 1667, 1682, 1665, 1650
+    it "works on a many unmatching patterns" do
+      let pats   = ["abd","e","bac","abcbabe"]
+      let input  = "abcbab"
+      let output = []
+      indicesOfAny pats input `shouldEqual` output
+    it "works on a simple case" do
+      let pats   = ["ab","cb","bc","bca"]
+      let input  = "abcbab"
+      let output = [Tuple 0 [0]
+                   ,Tuple 1 [2]
+                   ,Tuple 2 [1]
+                   ,Tuple 4 [0]
+                   ]
+      indicesOfAny pats input `shouldEqual` output
+    it "works with overlaps" do
+      let pats   = ["aba"]
+      let input  = "ababa"
+      let output = [Tuple 0 [0]
+                   ,Tuple 2 [0]
+                   ]
+      indicesOfAny pats input `shouldEqual` output
+    it "returns valid indices" do
+      validIndices ["a","ab","ba","abc","aba","abab","abcde"]
+                   "ababarbabacbbababcaccacabbababa"
+          `shouldEqual` true
+    it "returns valid indices 2000 random samples" do
+      quickCheck' 2000 validIndices
--- a/test/Main.purs
+++ b/test/Main.purs
+module Test.Main where
+import Prelude
+import Effect (Effect)
+import Effect.Aff (launchAff_)
+import Test.Spec.Discovery (discover)
+import Test.Spec.Reporter.Console (consoleReporter)
+import Test.Spec.Runner (runSpec)
+main :: Effect Unit
+main = launchAff_ do
+  specs <- discover "Data\\.String\\.Search\\..*Spec"
+  runSpec [consoleReporter] specs
--- a/yarn.lock
+++ b/yarn.lock
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1