Initial commit and a working version of KarpRabin

parents
use nix
.spago/
node_modules/
output/
import (
builtins.fetchTarball {
url = "https://github.com/NixOS/nixpkgs/archive/22.11.tar.gz";
}
)
let upstream =
https://github.com/garganscript/package-sets/releases/download/v0.1.7/release.dhall
sha256:52886309e1f0158a85427f80c1e3d47ce747c5f14fcec671a81fe5c2c711a6db
in upstream
{ pkgs ? import ./nix/pinned.nix {} }:
let
default-shell = import (
pkgs.fetchFromGitHub {
owner = "garganscript";
repo = "package-sets";
rev = "master";
sha256 = "RYFsTna5cg7EjynLvkFg87zTLSMeZw7DNIgPu9iXokk=";
} + "/default-shell.nix");
build = pkgs.writeShellScriptBin "build" ''
#!/usr/bin/env bash
set -e
echo "Installing JS Dependencies"
yarn
echo "Compiling"
# 0.15
spago build
#spago bundle-module --main Main --to dist/bundle.js
'';
test-ps = pkgs.writeShellScriptBin "test-ps" ''
#!/usr/bin/env bash
set -e
echo "Compiling"
yarn
echo "Testing"
spago -x test.dhall test --main Test.Main
'';
in
pkgs.mkShell {
name = "purescript-string-search";
buildInputs = [
build
test-ps
default-shell.purs
default-shell.easy-ps.psc-package
default-shell.easy-ps.spago
default-shell.build
default-shell.pkgs.dhall-json
default-shell.pkgs.nodejs
];
}
{-
Welcome to a Spago project!
You can edit this file as you like.
Need help? See the following resources:
- Spago documentation: https://github.com/purescript/spago
- Dhall language tour: https://docs.dhall-lang.org/tutorials/Language-Tour.html
When creating a new Spago project, you can use
`spago init --no-comments` or `spago init -C`
to generate this file without the comments in this block.
-}
{ name = "string-search"
, dependencies =
[ "arrays"
, "debug"
, "enums"
, "foldable-traversable"
, "integers"
, "lists"
, "maybe"
, "ordered-collections"
, "partial"
, "prelude"
, "strings"
, "tuples"
, "uint"
]
, packages = ./packages.dhall
, sources = [ "src/**/*.purs" ]
}
module Data.String.Search where
-- |
-- The present module has been ported from Haskell to PureScript
-- by Nicolas Pouillard for the Gargantext projet.
--
-- Original Haskell code:
-- Copyright : (c) 2010 Daniel Fischer
-- Licence : BSD3
-- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com>
--
-- Simultaneous search for multiple patterns in a 'String'
-- using the Karp-Rabin algorithm.
--
-- A description of the algorithm for a single pattern can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>.
module Data.String.Search.KarpRabin (
-- $overview
-- ** Caution
-- $caution
-- * Function
indicesOfAny
, indicesOfAnyHashStruct
, indicesOfAnyLegacy
, HashStruct
, hashStruct
) where
import Data.Array as A
import Data.Array ((..))
import Data.Enum (fromEnum)
import Data.Foldable (class Foldable, minimum, foldl)
import Data.FunctorWithIndex (mapWithIndex)
import Data.Int (quot)
import Data.List as L
import Data.Map as M
import Data.Maybe (Maybe(..), isJust, fromJust)
import Data.String as S
import Data.String (CodePoint)
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..), snd)
import Data.Tuple.Nested ((/\))
import Data.UInt (UInt, shl, fromInt)
import Debug (trace)
import Partial.Unsafe (unsafePartial)
import Prelude
fromCodePoint :: CodePoint -> UInt
fromCodePoint c = fromInt (fromEnum c)
-- | This struct is for performance reasons.
type HashStruct = {
hash :: String -> UInt
, hashMap :: M.Map UInt (Array Int)
, hLen :: Int
, pats :: Array String
, rehash :: UInt -> CodePoint -> CodePoint -> UInt
}
-- | This is a temporary structure, computed from given patterns, to
-- | make the function more performant (suppose we want to match the
-- | same patterns many times for a given string: we need to compute
-- | `hash` etc only once for these patterns).
hashStruct :: Array String -> HashStruct
hashStruct pats = { hash, hashMap, hLen, pats, rehash }
where
hLen = minimum1 32 (S.length <$> pats)
hLen' = fromInt hLen
shDi = case 32 `quot` hLen of
q | q < 4 -> q
| otherwise -> 4
outS = fromInt (shDi * hLen)
rehash :: UInt -> CodePoint -> CodePoint -> UInt
rehash = case shDi of
1 -> rehash' (fromInt 1) hLen'
2 -> rehash' (fromInt 2) outS
3 -> rehash' (fromInt 3) outS
_ -> rehash' (fromInt 4) outS
hash :: String -> UInt
hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
<<< S.toCodePointArray
<<< S.take hLen
hashMap :: M.Map UInt (Array Int)
hashMap =
M.fromFoldableWith (flip (<>))
(mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
-- $overview
--
-- The Karp-Rabin algorithm works by calculating a hash of the pattern and
-- comparing that hash with the hash of a slice of the target string with
-- the same length as the pattern. If the hashes are equal, the slice of the
-- target is compared to the pattern byte for byte (since the hash
-- function generally isn't injective).
--
-- For a single pattern, this tends to be more efficient than the na&#239;ve
-- algorithm, but it cannot compete with algorithms like
-- Knuth-Morris-Pratt or Boyer-Moore.
--
-- However, the algorithm can be generalised to search for multiple patterns
-- simultaneously. If the shortest pattern has length @k@, hash the prefix of
-- length @k@ of all patterns and compare the hash of the target's slices of
-- length @k@ to them. If there's a match, check whether the slice is part
-- of an occurrence of the corresponding pattern.
--
-- With a hash-function that
--
-- * allows to compute the hash of one slice in constant time from the hash
-- of the previous slice, the new and the dropped character, and
--
-- * produces few spurious matches,
--
-- searching for occurrences of any of @n@ patterns has a best-case complexity
-- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is
-- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is
-- not much worse than the best case.
--
-- The functions in this module store the hashes of the patterns in an
-- 'Map', so the lookup is /O/(@log n@). Re-hashing is done in constant
-- time and spurious matches of the hashes /should be/ sufficiently rare.
-- The maximal length of the prefixes to be hashed is 32.
-- $caution
--
-- Unfortunately, the constant factors are high, so these functions are slow.
-- Unless the number of patterns to search for is high (larger than 50 at
-- least), repeated search for single patterns using Boyer-Moore or DFA and
-- manual merging of the indices is faster. /Much/ faster for less than 40
-- or so patterns.
--
-- In summary, this module is more of an interesting curiosity than anything
-- else.
-- | @'indicesOfAny'@ finds all occurrences of any of several non-empty patterns
-- in a strict target string. If no non-empty patterns are given,
-- the result is an empty array. Otherwise the result array contains
-- the pairs of all indices where any of the (non-empty) patterns start
-- and the array of all patterns starting at that index, the patterns being
-- represented by their (zero-based) position in the pattern array.
-- Empty patterns are filtered out before processing begins.
indicesOfAny :: Array String -- ^ Array of non-empty patterns
-> String -- ^ String to search
-> Array (Tuple Int (Array Int)) -- ^ Array of matches
indicesOfAny pats = if A.null nepats then const []
else strictMatcher $ hashStruct nepats
where
nepats = A.filter (not <<< S.null) pats
indicesOfAnyHashStruct :: HashStruct
-> String
-> Array (Tuple Int (Array Int))
indicesOfAnyHashStruct hs = if A.null nepats then const []
else strictMatcher hs
where
nepats = A.filter (not <<< S.null) hs.pats
------------------------------------------------------------------------------
-- Workers --
------------------------------------------------------------------------------
rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
rehash' shDi out h o n =
(h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n
minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
minimum1 a fa =
case minimum fa of
Nothing -> a
Just b -> min a b
type GoAcc = Tuple UInt (Array (Tuple Int (Array Int)))
strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int))
strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
where
search :: Partial => String -> Array (Tuple Int (Array Int))
search str =
if strLen < hLen then []
else snd <$> foldl go (Tuple shash []) $ A.zip (0 .. maxIdx) window
where
-- | Sliding window of substrings. Note that we need to
-- | add the last one.
window :: Array (Array CodePoint)
window = slidingWindow arr (hLen + 1) <> [A.take hLen $ A.drop maxIdx arr]
strLen = S.length str
maxIdx = strLen - hLen
arr = S.toCodePointArray str
shash :: UInt
shash = hash str
go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
go (Tuple h acc) (Tuple idx arr') =
let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr')
-- str' = S.fromCodePointArray arr'
str' = S.drop idx str
okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
el = case M.lookup h hashMap of
Nothing -> Nothing
Just ps ->
case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
[] -> Nothing
qs -> Just (Tuple idx qs)
acc' = case el of
Nothing -> acc
Just el' -> A.snoc acc el'
in
Tuple rehashed acc'
-- LEGACY STUFF
indicesOfAnyLegacy :: Array String -- ^ Array of non-empty patterns
-> String -- ^ String to search
-> Array (Tuple Int (Array Int)) -- ^ Array of matches
indicesOfAnyLegacy pats = if A.null nepats then const []
else strictMatcherLegacy $ hashStruct nepats
where
nepats = A.filter (not <<< S.null) pats
-- | This function works correctly, however it has recursion error
-- | problems inside `search`.
strictMatcherLegacy :: HashStruct -> String -> Array (Tuple Int (Array Int))
strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search
where
search :: Partial => String -> Array (Tuple Int (Array Int))
search str = if strLen < hLen then []
else A.fromFoldable (go 0 shash)
where
arr = S.toCodePointArray str
shash :: UInt
shash = hash str
strLen = S.length str
maxIdx = strLen - hLen
strAt i = A.unsafeIndex arr i
go :: Int -> UInt -> L.List (Tuple Int (Array Int))
go sI h =
let hd = strAt sI
isEnd = sI == maxIdx
in
case M.lookup h hashMap of
Nothing ->
if isEnd then L.Nil
else go (sI + 1) (rehash h hd (strAt (sI + hLen)))
Just ps ->
let rst = S.drop sI str
more = if isEnd then L.Nil
else go (sI + 1) (rehash h hd (strAt (sI + hLen)))
okay bs =
isJust (S.stripPrefix (S.Pattern bs) rst)
in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
[] -> more
qs -> Tuple sI qs L.: more
-- go sI h =
-- case M.lookup h hashMap of
-- Nothing ->
-- if sI == maxIdx
-- then L.Nil
-- else go (sI + 1) (rehash h (strAt sI) (strAt (sI + hLen)))
-- Just ps ->
-- let rst = S.drop sI str
-- hd = strAt sI
-- more = if sI == maxIdx then L.Nil else
-- go (sI + 1) (rehash h hd (strAt (sI + hLen)))
-- okay bs =
-- isJust (S.stripPrefix (S.Pattern bs) rst)
-- in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
-- [] -> more
-- qs -> Tuple sI qs L.: more
module Data.String.Search.Utils where
import Data.Array
import Prelude
slidingWindow :: forall a. Array a -> Int -> Array (Array a)
slidingWindow lst len =
let diff = length lst - len
in
if diff < 0 then []
else (\idx -> slice idx (idx + len) lst) <$> 0 .. diff
let conf = ./spago.dhall
in conf
// { sources = conf.sources # [ "test/**/*.purs" ]
, dependencies =
conf.dependencies
# [ "aff", "effect", "spec", "spec-discovery", "spec-quickcheck" ]
}
module Data.String.Search.KarpRabin.Spec where
import Prelude
import Data.Array (index)
import Data.Foldable (all)
import Data.Maybe (Maybe(..), isJust)
import Data.String (drop, stripPrefix, Pattern(..))
import Data.String.Search.KarpRabin (indicesOfAnyLegacy, indicesOfAny)
import Data.Tuple (Tuple(..))
import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldEqual)
import Test.Spec.QuickCheck (quickCheck')
validIndices :: Array String -> String -> Boolean
validIndices pats input = all validIndex (indicesOfAny pats input)
where
validIndex (Tuple i ps) = all validPat ps
where
input' = drop i input
validPat p =
case index pats p of
Just pat -> isJust (stripPrefix (Pattern pat) input')
-- <?> (show input' <> " should start with " <> show pat)
Nothing -> false -- Failed "out of bounds pattern"
-- indicesOfAny :: Array String -> String -> Array (Tuple Int (Array Int))
-- indicesOfAny = indicesOfAnyLegacy
spec :: Spec Unit
spec =
describe "KarpRabin" do
it "works on a single pattern matching two times" do
let pats = ["ab"]
let input = "abcbab"
let output = [Tuple 0 [0], Tuple 4 [0]]
indicesOfAny pats input `shouldEqual` output
-- hashes: 1650, 1667, 1682, 1665, 1650
it "works on a many unmatching patterns" do
let pats = ["abd","e","bac","abcbabe"]
let input = "abcbab"
let output = []
indicesOfAny pats input `shouldEqual` output
it "works on a simple case" do
let pats = ["ab","cb","bc","bca"]
let input = "abcbab"
let output = [Tuple 0 [0]
,Tuple 1 [2]
,Tuple 2 [1]
,Tuple 4 [0]
]
indicesOfAny pats input `shouldEqual` output
it "works with overlaps" do
let pats = ["aba"]
let input = "ababa"
let output = [Tuple 0 [0]
,Tuple 2 [0]
]
indicesOfAny pats input `shouldEqual` output
it "returns valid indices" do
validIndices ["a","ab","ba","abc","aba","abab","abcde"]
"ababarbabacbbababcaccacabbababa"
`shouldEqual` true
it "returns valid indices 2000 random samples" do
quickCheck' 2000 validIndices
module Test.Main where
import Prelude
import Effect (Effect)
import Effect.Aff (launchAff_)
import Test.Spec.Discovery (discover)
import Test.Spec.Reporter.Console (consoleReporter)
import Test.Spec.Runner (runSpec)
main :: Effect Unit
main = launchAff_ do
specs <- discover "Data\\.String\\.Search\\..*Spec"
runSpec [consoleReporter] specs
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
# yarn lockfile v1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment