-- | -- The present module has been ported from Haskell to PureScript -- by Nicolas Pouillard for the Gargantext projet. -- -- Original Haskell code: -- Copyright : (c) 2010 Daniel Fischer -- Licence : BSD3 -- Maintainer : Daniel Fischer <daniel.is.fischer@googlemail.com> -- -- Simultaneous search for multiple patterns in a 'String' -- using the Karp-Rabin algorithm. -- -- A description of the algorithm for a single pattern can be found at -- <http://www-igm.univ-mlv.fr/~lecroq/string/node5.html#SECTION0050>. module Gargantext.Utils.KarpRabin ( -- * Overview -- $overview -- ** Caution -- $caution -- * Function indicesOfAny ) where import Data.Array as A import Data.Enum (fromEnum) import Data.Foldable (class Foldable, minimum, foldl) import Data.Int (quot) import Data.List as L import Data.Map as M import Data.Maybe (Maybe(..), isJust) import Data.String as S import Data.String (CodePoint) import Data.Tuple (Tuple(..)) import Data.UInt (UInt, shl, fromInt) import Partial.Unsafe (unsafePartial) import Prelude fromCodePoint :: CodePoint -> UInt fromCodePoint c = fromInt (fromEnum c) -- $overview -- -- The Karp-Rabin algorithm works by calculating a hash of the pattern and -- comparing that hash with the hash of a slice of the target string with -- the same length as the pattern. If the hashes are equal, the slice of the -- target is compared to the pattern byte for byte (since the hash -- function generally isn't injective). -- -- For a single pattern, this tends to be more efficient than the naïve -- algorithm, but it cannot compete with algorithms like -- Knuth-Morris-Pratt or Boyer-Moore. -- -- However, the algorithm can be generalised to search for multiple patterns -- simultaneously. If the shortest pattern has length @k@, hash the prefix of -- length @k@ of all patterns and compare the hash of the target's slices of -- length @k@ to them. If there's a match, check whether the slice is part -- of an occurrence of the corresponding pattern. -- -- With a hash-function that -- -- * allows to compute the hash of one slice in constant time from the hash -- of the previous slice, the new and the dropped character, and -- -- * produces few spurious matches, -- -- searching for occurrences of any of @n@ patterns has a best-case complexity -- of /O/(@targetLength@ * @lookup n@). The worst-case complexity is -- /O/(@targetLength@ * @lookup n@ * @sum patternLengths@), the average is -- not much worse than the best case. -- -- The functions in this module store the hashes of the patterns in an -- 'Map', so the lookup is /O/(@log n@). Re-hashing is done in constant -- time and spurious matches of the hashes /should be/ sufficiently rare. -- The maximal length of the prefixes to be hashed is 32. -- $caution -- -- Unfortunately, the constant factors are high, so these functions are slow. -- Unless the number of patterns to search for is high (larger than 50 at -- least), repeated search for single patterns using Boyer-Moore or DFA and -- manual merging of the indices is faster. /Much/ faster for less than 40 -- or so patterns. -- -- In summary, this module is more of an interesting curiosity than anything -- else. -- | @'indicesOfAny'@ finds all occurrences of any of several non-empty patterns -- in a strict target string. If no non-empty patterns are given, -- the result is an empty array. Otherwise the result array contains -- the pairs of all indices where any of the (non-empty) patterns start -- and the array of all patterns starting at that index, the patterns being -- represented by their (zero-based) position in the pattern array. -- Empty patterns are filtered out before processing begins. indicesOfAny :: Array String -- ^ Array of non-empty patterns -> String -- ^ String to search -> Array (Tuple Int (Array Int)) -- ^ Array of matches indicesOfAny pats = if A.null nepats then const [] else strictMatcher nepats where nepats = A.filter (not <<< S.null) pats ------------------------------------------------------------------------------ -- Workers -- ------------------------------------------------------------------------------ rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt rehash' shDi out h o n = (h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a minimum1 a fa = case minimum fa of Nothing -> a Just b -> min a b strictMatcher :: Array String -> String -> Array (Tuple Int (Array Int)) strictMatcher pats = unsafePartial search where hLen = minimum1 32 (S.length <$> pats) hLen' = fromInt hLen shDi = case 32 `quot` hLen of q | q < 4 -> q | otherwise -> 4 outS = fromInt (shDi * hLen) patNum = A.length pats rehash :: UInt -> CodePoint -> CodePoint -> UInt rehash = case shDi of 1 -> rehash' (fromInt 1) hLen' 2 -> rehash' (fromInt 2) outS 3 -> rehash' (fromInt 3) outS _ -> rehash' (fromInt 4) outS hash :: String -> UInt hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0) <<< S.toCodePointArray <<< S.take hLen hashMap = M.fromFoldableWith (flip (<>)) (A.mapWithIndex (\i a -> Tuple (hash a) [i]) pats) search :: Partial => String -> Array (Tuple Int (Array Int)) search str = if strLen < hLen then [] else A.fromFoldable (go 0 shash) where strLen = S.length str maxIdx = strLen - hLen arr = S.toCodePointArray str strAt i = A.unsafeIndex arr i shash = hash str go sI h = case M.lookup h hashMap of Nothing -> if sI == maxIdx then L.Nil else go (sI + 1) (rehash h (strAt sI) (strAt (sI + hLen))) Just ps -> let rst = S.drop sI str hd = strAt sI more = if sI == maxIdx then L.Nil else go (sI + 1) (rehash h hd (strAt (sI + hLen))) okay bs = isJust (S.stripPrefix (S.Pattern bs) rst) in case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of [] -> more qs -> Tuple sI qs L.: more