[fix] improve tests, fixed issues with hashing algorithm

parent 04f86880
.psci_modules/
.spago/ .spago/
node_modules/ node_modules/
output/ output/
...@@ -44,5 +44,6 @@ pkgs.mkShell { ...@@ -44,5 +44,6 @@ pkgs.mkShell {
default-shell.build default-shell.build
default-shell.pkgs.dhall-json default-shell.pkgs.dhall-json
default-shell.pkgs.nodejs default-shell.pkgs.nodejs
default-shell.pkgs.yarn
]; ];
} }
...@@ -13,8 +13,10 @@ to generate this file without the comments in this block. ...@@ -13,8 +13,10 @@ to generate this file without the comments in this block.
{ name = "string-search" { name = "string-search"
, dependencies = , dependencies =
[ "arrays" [ "arrays"
, "debug"
, "enums" , "enums"
, "foldable-traversable" , "foldable-traversable"
, "int64"
, "integers" , "integers"
, "lists" , "lists"
, "maybe" , "maybe"
......
...@@ -22,8 +22,17 @@ module Data.String.Search.KarpRabin ( ...@@ -22,8 +22,17 @@ module Data.String.Search.KarpRabin (
indicesOfAny indicesOfAny
, indicesOfAnyHashStruct , indicesOfAnyHashStruct
, indicesOfAnyLegacy , indicesOfAnyLegacy
, Base
, universalBase
, Modulus
, universalModulus
, Hash
, HashStruct , HashStruct
, hashStruct , hashStruct
, RollingHash
, mkRollingHash
, hashRH
, rehashRH
) where ) where
...@@ -32,29 +41,80 @@ import Data.Array ((..)) ...@@ -32,29 +41,80 @@ import Data.Array ((..))
import Data.Enum (fromEnum) import Data.Enum (fromEnum)
import Data.Foldable (class Foldable, minimum, foldl) import Data.Foldable (class Foldable, minimum, foldl)
import Data.FunctorWithIndex (mapWithIndex) import Data.FunctorWithIndex (mapWithIndex)
import Data.Int (quot) import Data.Generic.Rep (class Generic)
import Data.List as L import Data.List as L
import Data.Map as M import Data.Map as M
import Data.Maybe (Maybe(..), isJust, fromJust) import Data.Maybe (Maybe(..), isJust, fromJust)
import Data.Show.Generic (genericShow)
import Data.String as S import Data.String as S
import Data.String (CodePoint) import Data.String (CodePoint)
import Data.String.Search.Utils (slidingWindow) import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..), snd) import Data.Tuple (Tuple(..), snd)
import Data.UInt (UInt, shl, fromInt) import Data.UInt64 (UInt64, unsafeFromInt) -- shl
-- import Debug (trace)
import Partial.Unsafe (unsafePartial) import Partial.Unsafe (unsafePartial)
import Prelude import Prelude
fromCodePoint :: CodePoint -> UInt
fromCodePoint c = fromInt (fromEnum c) type Base = UInt64
type Modulus = UInt64
type Hash = UInt64
-- | Base that we will use in Karp-Rabin
universalBase :: Base
universalBase = unsafeFromInt 256
-- | Modulus that we will use in Karp-Rabin
-- https://www.wolframalpha.com/input?i=prime+number+greater+than+50000000
universalModulus :: Modulus
universalModulus = unsafeFromInt 50000017
fromCodePoint :: CodePoint -> Base
fromCodePoint c = unsafeFromInt (fromEnum c)
-- Rolling hash implementation, see
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
newtype RollingHash = RollingHash {
base :: Base
, modulus :: Modulus
, len :: Int
, basePowLen :: Base -- pow base len % modulus (stored for performance reasons)
}
derive instance Generic RollingHash _
instance Show RollingHash where
show = genericShow
mkRollingHash :: Base -> Modulus -> Int -> RollingHash
mkRollingHash base modulus len = RollingHash { base
, modulus
, len
, basePowLen }
where
basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
-- | NOTE: xs must be of length RollingHash.len
hashRH :: RollingHash -> Array Base -> Hash
hashRH rh@(RollingHash {base, modulus }) xs =
-- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | This struct is for performance reasons. -- | This struct is for performance reasons.
type HashStruct = { type HashStruct = {
hash :: String -> UInt hash :: String -> Hash
, hashMap :: M.Map UInt (Array Int) , hashMap :: M.Map Hash (Array Int)
, hLen :: Int , hLen :: Int
, pats :: Array String , pats :: Array String
, rehash :: UInt -> CodePoint -> CodePoint -> UInt , rehash :: Hash -> CodePoint -> CodePoint -> Hash
, rehashChar :: Hash -> Char -> Char -> Hash
} }
-- | This is a temporary structure, computed from given patterns, to -- | This is a temporary structure, computed from given patterns, to
...@@ -62,29 +122,44 @@ type HashStruct = { ...@@ -62,29 +122,44 @@ type HashStruct = {
-- | same patterns many times for a given string: we need to compute -- | same patterns many times for a given string: we need to compute
-- | `hash` etc only once for these patterns). -- | `hash` etc only once for these patterns).
hashStruct :: Array String -> HashStruct hashStruct :: Array String -> HashStruct
hashStruct pats = { hash, hashMap, hLen, pats, rehash } hashStruct pats = { hash, hashMap, hLen, pats, rehash, rehashChar }
where where
hLen = minimum1 32 (S.length <$> pats) hLen = minimum1 32 (S.length <$> pats)
hLen' = fromInt hLen
shDi = case 32 `quot` hLen of rh = mkRollingHash universalBase universalModulus hLen
q | q < 4 -> q
| otherwise -> 4 hash :: String -> Hash
outS = fromInt (shDi * hLen) hash = hashRH rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
rehash :: UInt -> CodePoint -> CodePoint -> UInt
rehash = case shDi of rehash :: Hash -> CodePoint -> CodePoint -> Hash
1 -> rehash' (fromInt 1) hLen' rehash h o n = rehashRH rh h (fromCodePoint o) (fromCodePoint n)
2 -> rehash' (fromInt 2) outS
3 -> rehash' (fromInt 3) outS rehashChar :: Hash -> Char -> Char -> Hash
_ -> rehash' (fromInt 4) outS rehashChar h o n = rehash h (S.codePointFromChar o) (S.codePointFromChar n)
hash :: String -> UInt
hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0) hashMap :: M.Map Hash (Array Int)
<<< S.toCodePointArray
<<< S.take hLen
hashMap :: M.Map UInt (Array Int)
hashMap = hashMap =
M.fromFoldableWith (flip (<>)) M.fromFoldableWith (flip (<>))
(mapWithIndex (\i a -> Tuple (hash a) [i]) pats) (mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
-- hLen' = fromInt hLen
-- shDi = case 32 `quot` hLen of
-- q | q < 4 -> q
-- | otherwise -> 4
-- outS = fromInt (shDi * hLen)
-- rehash :: UInt -> CodePoint -> CodePoint -> UInt
-- rehash = case shDi of
-- 1 -> rehash' (fromInt 1) hLen'
-- 2 -> rehash' (fromInt 2) outS
-- 3 -> rehash' (fromInt 3) outS
-- _ -> rehash' (fromInt 4) outS
-- hash :: String -> UInt
-- hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
-- <<< S.toCodePointArray
-- <<< S.take hLen
-- $overview -- $overview
-- --
...@@ -160,9 +235,9 @@ indicesOfAnyHashStruct hs = if A.null nepats then const [] ...@@ -160,9 +235,9 @@ indicesOfAnyHashStruct hs = if A.null nepats then const []
-- Workers -- -- Workers --
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt -- rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
rehash' shDi out h o n = -- rehash' shDi out h o n =
(h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n -- (h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n
minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
minimum1 a fa = minimum1 a fa =
...@@ -170,7 +245,7 @@ minimum1 a fa = ...@@ -170,7 +245,7 @@ minimum1 a fa =
Nothing -> a Nothing -> a
Just b -> min a b Just b -> min a b
type GoAcc = Tuple UInt (Array (Tuple Int (Array Int))) type GoAcc = Tuple Hash (Array (Tuple Int (Array Int)))
strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int)) strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int))
strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
...@@ -187,16 +262,29 @@ strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search ...@@ -187,16 +262,29 @@ strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
strLen = S.length str strLen = S.length str
maxIdx = strLen - hLen maxIdx = strLen - hLen
arr = S.toCodePointArray str arr = S.toCodePointArray str
shash :: UInt shash :: Hash
shash = hash str shash = hash str
go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
go (Tuple h acc) (Tuple idx arr') = go (Tuple h acc) (Tuple idx arr') =
let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr') let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr')
-- _ = if A.elem " abd" pats then
-- trace ("[strictMatcher] " <> show { h
-- , hashMap
-- , acc
-- , idx
-- , arr': S.fromCodePointArray arr'
-- , arrHeadLast: S.fromCodePointArray [fromJust $ A.head arr', fromJust $ A.last arr']
-- , lookup: M.lookup h hashMap }) \_ -> unit
-- else
-- unit
el = case M.lookup h hashMap of el = case M.lookup h hashMap of
Nothing -> Nothing Nothing -> Nothing
Just ps -> Just ps ->
let str' = S.drop idx str let str' = S.drop idx str
okay bs = isJust (S.stripPrefix (S.Pattern bs) str') okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
-- _ = trace ("[strictMatcher] " <> show { pats
-- , ps
-- , oks: (\x -> okay (A.unsafeIndex pats x)) <$> ps}) \_ -> unit
in in
case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
[] -> Nothing [] -> Nothing
...@@ -228,14 +316,14 @@ strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search ...@@ -228,14 +316,14 @@ strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search
else A.fromFoldable (go 0 shash) else A.fromFoldable (go 0 shash)
where where
arr = S.toCodePointArray str arr = S.toCodePointArray str
shash :: UInt shash :: Hash
shash = hash str shash = hash str
strLen = S.length str strLen = S.length str
maxIdx = strLen - hLen maxIdx = strLen - hLen
strAt i = A.unsafeIndex arr i strAt i = A.unsafeIndex arr i
go :: Int -> UInt -> L.List (Tuple Int (Array Int)) go :: Int -> Hash -> L.List (Tuple Int (Array Int))
go sI h = go sI h =
let hd = strAt sI let hd = strAt sI
isEnd = sI == maxIdx isEnd = sI == maxIdx
......
...@@ -4,5 +4,11 @@ in conf ...@@ -4,5 +4,11 @@ in conf
// { sources = conf.sources # [ "test/**/*.purs" ] // { sources = conf.sources # [ "test/**/*.purs" ]
, dependencies = , dependencies =
conf.dependencies conf.dependencies
# [ "aff", "effect", "spec", "spec-discovery", "spec-quickcheck" ] # [ "aff"
, "effect"
, "quickcheck"
, "spec"
, "spec-discovery"
, "spec-quickcheck"
]
} }
...@@ -4,13 +4,30 @@ import Prelude ...@@ -4,13 +4,30 @@ import Prelude
import Data.Array (index) import Data.Array (index)
import Data.Foldable (all) import Data.Foldable (all)
import Data.Maybe (Maybe(..), isJust) import Data.Maybe (Maybe(..), isJust)
import Data.String (drop, stripPrefix, Pattern(..)) import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar)
import Data.String.Search.KarpRabin (indicesOfAnyLegacy, indicesOfAny) import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashRH, rehashRH, hashStruct) -- indicesOfAnyLegacy,
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..)) import Data.Tuple (Tuple(..))
import Data.UInt64 (unsafeFromInt)
import Test.QuickCheck ((<?>))
import Test.QuickCheck.Arbitrary
import Test.Spec (Spec, describe, it) import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldEqual) import Test.Spec.Assertions (shouldEqual)
import Test.Spec.QuickCheck (quickCheck') import Test.Spec.QuickCheck (quickCheck')
fromInt = unsafeFromInt
data SmallInt = SmallInt Int
runInt :: SmallInt -> Int
runInt (SmallInt i) = i
instance arbSmallInt :: Arbitrary SmallInt where
arbitrary = map (SmallInt <<< (\i -> i / 10000 + 100)) arbitrary
validIndices :: Array String -> String -> Boolean validIndices :: Array String -> String -> Boolean
validIndices pats input = all validIndex (indicesOfAny pats input) validIndices pats input = all validIndex (indicesOfAny pats input)
where where
...@@ -29,12 +46,58 @@ validIndices pats input = all validIndex (indicesOfAny pats input) ...@@ -29,12 +46,58 @@ validIndices pats input = all validIndex (indicesOfAny pats input)
spec :: Spec Unit spec :: Spec Unit
spec = spec =
describe "KarpRabin" do describe "KarpRabin" do
it "sliding window works" do
slidingWindow [1, 2, 3, 4] 2 `shouldEqual`
[[1, 2], [2, 3], [3, 4]]
it "rolling hash works 1" do
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm#Hash_function_used
let rh = mkRollingHash (fromInt 256) (fromInt 101) 3
let a = fromInt 97
let b = fromInt 98
let r = fromInt 114
hashRH rh [a, b, r] `shouldEqual` (fromInt 4)
hashRH rh [b, r, a] `shouldEqual` (fromInt 30)
hashRH rh [b, r, a] `shouldEqual`
rehashRH rh (hashRH rh [a, b, r]) a a
it "rolling hash works 2" do
let rh = mkRollingHash (fromInt 7) (fromInt 1009) 3
let a = fromInt 1
let b = fromInt 2
let c = fromInt 3
let d = fromInt 4
let h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
h3 = hashRH rh [c, d, a]
h2 `shouldEqual` rehashRH rh h1 a d
h3 `shouldEqual` rehashRH rh h2 b a
it "rolling hash works 3 (quickcheck)" $ do
let rh = mkRollingHash (fromInt 256) (fromInt 1009) 3
quickCheck' 2000 \(SmallInt a') (SmallInt b') (SmallInt c') (SmallInt d') ->
let a = fromInt a'
b = fromInt b'
c = fromInt c'
d = fromInt d'
h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
in
h2 == rehashRH rh h1 a d
<?> ( "Fail for: " <> show [a', b', c', d']
<> ", h1 = " <> show h1
<> ", h2 = " <> show h2
<> ", rehash = " <> show (rehashRH rh h1 a d))
it "works on a single pattern matching two times" do it "works on a single pattern matching two times" do
let pats = ["ab"] let pats = ["ab"]
let input = "abcbab" let input = "abcbab"
let output = [Tuple 0 [0], Tuple 4 [0]] let output = [Tuple 0 [0], Tuple 4 [0]]
indicesOfAny pats input `shouldEqual` output indicesOfAny pats input `shouldEqual` output
-- hashes: 1650, 1667, 1682, 1665, 1650
it "works on a many unmatching patterns" do it "works on a many unmatching patterns" do
let pats = ["abd","e","bac","abcbabe"] let pats = ["abd","e","bac","abcbabe"]
...@@ -42,7 +105,27 @@ spec = ...@@ -42,7 +105,27 @@ spec =
let output = [] let output = []
indicesOfAny pats input `shouldEqual` output indicesOfAny pats input `shouldEqual` output
it "works on a simple case" do it "works when pattern is in the middle" do
let pats = ["abd"]
let input = "eeabdee"
let output = [Tuple 2 [0]]
let hs = hashStruct pats
let h1 = hs.hash "abd"
let h2 = hs.hash "bde"
h2 `shouldEqual` (hs.rehashChar h1 'a' 'e')
indicesOfAny pats input `shouldEqual` output
it "works when pattern is in the middle, with space in it (ps #598)" do
let pats = [" abd"]
let input = "e abdee"
let output = [Tuple 1 [0]]
let hs = hashStruct pats
let h1 = hs.hash "e ab"
let h2 = hs.hash " abd"
h2 `shouldEqual` (hs.rehashChar h1 'e' 'd')
indicesOfAny pats input `shouldEqual` output
it "works on a bit more complex case" do
let pats = ["ab","cb","bc","bca"] let pats = ["ab","cb","bc","bca"]
let input = "abcbab" let input = "abcbab"
let output = [Tuple 0 [0] let output = [Tuple 0 [0]
...@@ -67,3 +150,15 @@ spec = ...@@ -67,3 +150,15 @@ spec =
it "returns valid indices 2000 random samples" do it "returns valid indices 2000 random samples" do
quickCheck' 2000 validIndices quickCheck' 2000 validIndices
it "works for purescript-gargantext issue #598, test 1" do
let pats = [" conception " ]
let input = " passant d'une conception quasi solipsiste ou monadique de la sensation "
let output = [Tuple 16 [0]]
indicesOfAny pats input `shouldEqual` output
it "works for purescript-gargantext issue #598, test 2" do
let pats = [" conception ", " notion "]
let input = " passant d'une conception quasi solipsiste ou monadique de la sensation "
let output = [Tuple 16 [0]]
indicesOfAny pats input `shouldEqual` output
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment