[fix] hashRH didn't work for all CodePoints

This is now fixed by relying on the implicit modulus of UInt64 arithmetic (wrap-around at 2^64).
parent 790d4b74
......@@ -13,11 +13,9 @@ to generate this file without the comments in this block.
{ name = "string-search"
, dependencies =
[ "arrays"
, "debug"
, "enums"
, "foldable-traversable"
, "int64"
, "integers"
, "lists"
, "maybe"
, "ordered-collections"
......@@ -25,7 +23,6 @@ to generate this file without the comments in this block.
, "prelude"
, "strings"
, "tuples"
, "uint"
]
, packages = ./packages.dhall
, sources = [ "src/**/*.purs" ]
......
......@@ -22,17 +22,20 @@ module Data.String.Search.KarpRabin (
indicesOfAny
, indicesOfAnyHashStruct
, indicesOfAnyLegacy
, fromCodePoint
, Base
, universalBase
, Modulus
, universalModulus
-- , universalModulus
, Hash
, HashStruct
, hashStruct
, RollingHash
, mkRollingHash
, hashRH
, rehashRH
-- , hashRH
-- , rehashRH
, hashU64
, rehashU64
) where
......@@ -67,8 +70,9 @@ universalBase = unsafeFromInt 256
-- | Modulus that we will use in Karp-Rabin
-- https://www.wolframalpha.com/input?i=prime+number+greater+than+50000000
universalModulus :: Modulus
universalModulus = unsafeFromInt 50000017
-- universalModulus :: Modulus
-- universalModulus = unsafeFromInt 1009
-- universalModulus = unsafeFromInt 50000017
fromCodePoint :: CodePoint -> Base
......@@ -79,7 +83,7 @@ fromCodePoint c = unsafeFromInt (fromEnum c)
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
newtype RollingHash = RollingHash {
base :: Base
, modulus :: Modulus
-- , modulus :: Modulus -- no explicit modulus: UInt64 arithmetic wraps around, so the effective modulus is 2^64 (i.e. (top :: UInt64) + 1)
, len :: Int -- window length; hashed arrays must have exactly this many elements
, basePowLen :: Base -- base^(len - 1), computed with wrapping UInt64 multiplication (stored for performance reasons)
}
......@@ -88,23 +92,37 @@ derive instance Generic RollingHash _
instance Show RollingHash where
show = genericShow
mkRollingHash :: Base -> Modulus -> Int -> RollingHash
mkRollingHash base modulus len = RollingHash { base
, modulus
, len
, basePowLen }
mkRollingHash :: Base -> Int -> RollingHash
mkRollingHash base len = RollingHash { base
-- , modulus
, len
, basePowLen }
where
basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
-- basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
-- base^(len - 1), i.e. the weight of the oldest element in the window.
-- In PureScript `a..b` counts DOWN when a > b (`1..0` is `[1, 0]`), so
-- windows of length <= 1 must be special-cased: otherwise the fold would
-- multiply by `base` two or more times instead of zero times, and
-- rehashing a one-element window would yield a wrong hash.
basePowLen =
  if len <= 1
    then unsafeFromInt 1
    else foldl (\acc _l -> acc * base) (unsafeFromInt 1) (1..(len - 1))
-- -- | NOTE: xs must be of length RollingHash.len
-- hashRH :: RollingHash -> Array Base -> Hash
-- hashRH rh@(RollingHash { base, modulus }) xs =
-- -- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
-- foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
-- rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
-- rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
-- ((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | NOTE: xs must be of length RollingHash.len
hashRH :: RollingHash -> Array Base -> Hash
hashRH rh@(RollingHash {base, modulus }) xs =
hashU64 :: RollingHash -> Array Base -> Hash
hashU64 rh@(RollingHash { base }) xs =
-- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
foldl (\acc x -> rehashU64 rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | Roll the hash one position forward: remove `old` (the element leaving
-- | the window) and append `new` (the element entering it).
-- |
-- | No explicit `mod` is applied: we rely on the fact that UInt64
-- | arithmetic wraps around automatically after 18446744073709551615, so
-- | the subtraction stays consistent even when `old*basePowLen` exceeds `h`.
rehashU64 :: RollingHash -> Hash -> Base -> Base -> Hash
rehashU64 (RollingHash { base, basePowLen }) h old new =
  (h - old*basePowLen)*base + new
-- | This struct is for performance reasons.
......@@ -126,13 +144,13 @@ hashStruct pats = { hash, hashMap, hLen, pats, rehash, rehashChar }
where
hLen = minimum1 32 (S.length <$> pats)
rh = mkRollingHash universalBase universalModulus hLen
rh = mkRollingHash universalBase hLen
hash :: String -> Hash
hash = hashRH rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
hash = hashU64 rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
rehash :: Hash -> CodePoint -> CodePoint -> Hash
rehash h o n = rehashRH rh h (fromCodePoint o) (fromCodePoint n)
rehash h o n = rehashU64 rh h (fromCodePoint o) (fromCodePoint n)
rehashChar :: Hash -> Char -> Char -> Hash
rehashChar h o n = rehash h (S.codePointFromChar o) (S.codePointFromChar n)
......
......@@ -2,15 +2,25 @@ module Data.String.Search.KarpRabin.Spec where
import Prelude
import Data.Array (index)
import Data.Bounded (class Bounded)
import Data.Bounded.Generic (genericTop, genericBottom)
import Data.Enum (class BoundedEnum, class Enum)
import Data.Enum.Generic (genericCardinality, genericToEnum, genericFromEnum, genericSucc, genericPred)
import Data.Eq (class Eq)
import Data.Eq.Generic (genericEq)
import Data.Foldable (all)
import Data.Generic.Rep (class Generic)
import Data.Maybe (Maybe(..), isJust)
import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar)
import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashRH, rehashRH, hashStruct) -- indicesOfAnyLegacy,
import Data.Ord (class Ord)
import Data.Ord.Generic (genericCompare)
import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar, CodePoint)
import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashU64, rehashU64, hashStruct, fromCodePoint) -- indicesOfAnyLegacy,
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..))
import Data.UInt64 (unsafeFromInt)
import Data.UInt64 (unsafeFromInt, UInt64)
import Test.QuickCheck ((<?>))
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Gen (enum)
import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldEqual)
import Test.Spec.QuickCheck (quickCheck')
......@@ -19,13 +29,25 @@ import Test.Spec.QuickCheck (quickCheck')
fromInt = unsafeFromInt
data SmallInt = SmallInt Int
newtype CodePointA = CodePointA CodePoint
runInt :: SmallInt -> Int
runInt (SmallInt i) = i
instance arbSmallInt :: Arbitrary SmallInt where
arbitrary = map (SmallInt <<< (\i -> i / 10000 + 100)) arbitrary
-- Instances for the CodePointA test wrapper. Each one delegates, via the
-- Generic representation, to the corresponding instance of the wrapped
-- CodePoint.
derive instance Generic CodePointA _
instance Eq CodePointA where
eq = genericEq
instance Ord CodePointA where
compare = genericCompare
instance Bounded CodePointA where
top = genericTop
bottom = genericBottom
instance Enum CodePointA where
succ = genericSucc
pred = genericPred
instance BoundedEnum CodePointA where
cardinality = genericCardinality
toEnum = genericToEnum
fromEnum = genericFromEnum
-- Test values are produced by QuickCheck's `enum` generator, which builds
-- on the BoundedEnum instance above.
instance Arbitrary CodePointA where
arbitrary = enum
validIndices :: Array String -> String -> Boolean
......@@ -51,47 +73,35 @@ spec =
[[1, 2], [2, 3], [3, 4]]
it "rolling hash works 1" do
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm#Hash_function_used
let rh = mkRollingHash (fromInt 256) (fromInt 101) 3
let a = fromInt 97
let b = fromInt 98
let r = fromInt 114
hashRH rh [a, b, r] `shouldEqual` (fromInt 4)
hashRH rh [b, r, a] `shouldEqual` (fromInt 30)
hashRH rh [b, r, a] `shouldEqual`
rehashRH rh (hashRH rh [a, b, r]) a a
it "rolling hash works 2" do
let rh = mkRollingHash (fromInt 7) (fromInt 1009) 3
let rh = mkRollingHash (fromInt 7) 3
let a = fromInt 1
let b = fromInt 2
let c = fromInt 3
let d = fromInt 4
let h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
h3 = hashRH rh [c, d, a]
let h1 = hashU64 rh [a, b, c]
h2 = hashU64 rh [b, c, d]
h3 = hashU64 rh [c, d, a]
h2 `shouldEqual` rehashRH rh h1 a d
h3 `shouldEqual` rehashRH rh h2 b a
h2 `shouldEqual` rehashU64 rh h1 a d
h3 `shouldEqual` rehashU64 rh h2 b a
it "rolling hash works 3 (quickcheck)" $ do
let rh = mkRollingHash (fromInt 256) (fromInt 1009) 3
it "rolling hash works 2 (quickcheck)" $ do
let rh = mkRollingHash (fromInt 256) 3
quickCheck' 2000 \(SmallInt a') (SmallInt b') (SmallInt c') (SmallInt d') ->
let a = fromInt a'
b = fromInt b'
c = fromInt c'
d = fromInt d'
h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
quickCheck' 2000 \(CodePointA a') (CodePointA b') (CodePointA c') (CodePointA d') ->
let a = fromCodePoint a'
b = fromCodePoint b'
c = fromCodePoint c'
d = fromCodePoint d'
h1 = hashU64 rh [a, b, c]
h2 = hashU64 rh [b, c, d]
in
h2 == rehashRH rh h1 a d
h2 == rehashU64 rh h1 a d
<?> ( "Fail for: " <> show [a', b', c', d']
<> ", h1 = " <> show h1
<> ", h2 = " <> show h2
<> ", rehash = " <> show (rehashRH rh h1 a d))
<> ", rehash = " <> show (rehashU64 rh h1 a d))
it "works on a single pattern matching two times" do
let pats = ["ab"]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment