diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..7e7b5e311ccef5f3601ce525cb50d311680aa9a1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +## [0.1.5] - 2023-11-09 + +### Changed + +- Code cleanup + +## [0.1.4] - 2023-11-09 + +### Fixed + +- Hash was still incorrect because of overflow (`UInt64` range vs + `modulus`). I removed `modulus` and now rely on the wrap-around + arithmetic of `UInt64`. + +## [0.1.3] - 2023-11-08 + +### Fixed + +- Hash was computed incorrectly. I changed to `UInt64` and added tests + with `SmallInt`. diff --git a/src/Data/String/Search/KarpRabin.purs b/src/Data/String/Search/KarpRabin.purs index c404d9415b4bf357e421f3c5944b520f2a0f2343..59c6f8fbc2d1d443cbfc216dd2f4912efa6f602d 100644 --- a/src/Data/String/Search/KarpRabin.purs +++ b/src/Data/String/Search/KarpRabin.purs @@ -25,15 +25,11 @@ module Data.String.Search.KarpRabin ( , fromCodePoint , Base , universalBase - , Modulus - -- , universalModulus , Hash , HashStruct , hashStruct , RollingHash , mkRollingHash - -- , hashRH - -- , rehashRH , hashU64 , rehashU64 ) where @@ -60,7 +56,6 @@ import Prelude type Base = UInt64 -type Modulus = UInt64 type Hash = UInt64 @@ -68,12 +63,6 @@ type Hash = UInt64 universalBase :: Base universalBase = unsafeFromInt 256 --- | Modulus that we will use in Karp-Rabin --- https://www.wolframalpha.com/input?i=prime+number+greater+than+50000000 --- universalModulus :: Modulus --- universalModulus = unsafeFromInt 1009 --- universalModulus = unsafeFromInt 50000017 - fromCodePoint :: CodePoint -> Base fromCodePoint c = unsafeFromInt (fromEnum c) @@ -83,7 +72,6 @@ fromCodePoint c = unsafeFromInt (fromEnum c) -- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm newtype RollingHash = RollingHash { base :: Base - -- , modulus :: Modulus -- in our case, the modulus is (top :: UInt64) , len :: Int , basePowLen :: Base -- pow base len % modulus (stored for performance reasons) } @@ 
-101,16 +89,6 @@ mkRollingHash base len = RollingHash { base -- basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1)) basePowLen = foldl (\acc _l -> (acc*base)) (unsafeFromInt 1) (1..(len - 1)) --- -- | NOTE: xs must be of length RollingHash.len --- hashRH :: RollingHash -> Array Base -> Hash --- hashRH rh@(RollingHash { base, modulus }) xs = --- -- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs --- foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs - --- rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash --- rehashRH (RollingHash { base, basePowLen, modulus }) h old new = --- ((h + (modulus - old)*basePowLen)*base + new) `mod` modulus - -- | NOTE: xs must be of length RollingHash.len hashU64 :: RollingHash -> Array Base -> Hash @@ -160,23 +138,6 @@ hashStruct pats = { hash, hashMap, hLen, pats, rehash, rehashChar } M.fromFoldableWith (flip (<>)) (mapWithIndex (\i a -> Tuple (hash a) [i]) pats) - - -- hLen' = fromInt hLen - -- shDi = case 32 `quot` hLen of - -- q | q < 4 -> q - -- | otherwise -> 4 - -- outS = fromInt (shDi * hLen) - - -- rehash :: UInt -> CodePoint -> CodePoint -> UInt - -- rehash = case shDi of - -- 1 -> rehash' (fromInt 1) hLen' - -- 2 -> rehash' (fromInt 2) outS - -- 3 -> rehash' (fromInt 3) outS - -- _ -> rehash' (fromInt 4) outS - -- hash :: String -> UInt - -- hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0) - -- <<< S.toCodePointArray - -- <<< S.take hLen -- $overview diff --git a/test/Main.purs b/test/Main.purs index ef8c6b19ee91cf0cd57e4ae8d8b9f9ac998ac7a7..3550eed98f035f7893ff3c557c3b3f5f742b26df 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -9,5 +9,5 @@ import Test.Spec.Runner (runSpec) main :: Effect Unit main = launchAff_ do - specs <- discover "Data\\.String\\.Search\\..*Spec" + specs <- discover "Test\\.Data\\.String\\.Search\\..*Spec" runSpec [consoleReporter] specs diff --git 
a/test/Data/String/Search/Spec.purs b/test/Test/Data/String/Search/Spec.purs similarity index 73% rename from test/Data/String/Search/Spec.purs rename to test/Test/Data/String/Search/Spec.purs index 60e2bf80c759759555b4d551710fea1b3818a895..83678f9432fcc6be6447610615908e32fa92f66c 100644 --- a/test/Data/String/Search/Spec.purs +++ b/test/Test/Data/String/Search/Spec.purs @@ -1,54 +1,21 @@ -module Data.String.Search.KarpRabin.Spec where +module Test.Data.String.Search.KarpRabin.Spec where -import Prelude import Data.Array (index) -import Data.Bounded (class Bounded) -import Data.Bounded.Generic (genericTop, genericBottom) -import Data.Enum (class BoundedEnum, class Enum) -import Data.Enum.Generic (genericCardinality, genericToEnum, genericFromEnum, genericSucc, genericPred) -import Data.Eq (class Eq) -import Data.Eq.Generic (genericEq) import Data.Foldable (all) -import Data.Generic.Rep (class Generic) import Data.Maybe (Maybe(..), isJust) -import Data.Ord (class Ord) -import Data.Ord.Generic (genericCompare) -import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar, CodePoint) -import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashU64, rehashU64, hashStruct, fromCodePoint) -- indicesOfAnyLegacy, +import Data.String (drop, stripPrefix, Pattern(..)) +import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashU64, rehashU64, hashStruct, fromCodePoint) import Data.String.Search.Utils (slidingWindow) import Data.Tuple (Tuple(..)) -import Data.UInt64 (unsafeFromInt, UInt64) +import Data.UInt64 (unsafeFromInt) +import Prelude +import Test.Data.String.Search.Utils (CodePointA(..)) import Test.QuickCheck ((<?>)) -import Test.QuickCheck.Arbitrary -import Test.QuickCheck.Gen (enum) import Test.Spec (Spec, describe, it) import Test.Spec.Assertions (shouldEqual) import Test.Spec.QuickCheck (quickCheck') -fromInt = unsafeFromInt - - -newtype CodePointA = CodePointA CodePoint - -derive instance Generic CodePointA _ -instance Eq CodePointA 
where - eq = genericEq -instance Ord CodePointA where - compare = genericCompare -instance Bounded CodePointA where - top = genericTop - bottom = genericBottom -instance Enum CodePointA where - succ = genericSucc - pred = genericPred -instance BoundedEnum CodePointA where - cardinality = genericCardinality - toEnum = genericToEnum - fromEnum = genericFromEnum -instance Arbitrary CodePointA where - arbitrary = enum - validIndices :: Array String -> String -> Boolean validIndices pats input = all validIndex (indicesOfAny pats input) @@ -62,9 +29,6 @@ validIndices pats input = all validIndex (indicesOfAny pats input) -- <?> (show input' <> " should start with " <> show pat) Nothing -> false -- Failed "out of bounds pattern" --- indicesOfAny :: Array String -> String -> Array (Tuple Int (Array Int)) --- indicesOfAny = indicesOfAnyLegacy - spec :: Spec Unit spec = describe "KarpRabin" do @@ -73,11 +37,11 @@ spec = [[1, 2], [2, 3], [3, 4]] it "rolling hash works 1" do - let rh = mkRollingHash (fromInt 7) 3 - let a = fromInt 1 - let b = fromInt 2 - let c = fromInt 3 - let d = fromInt 4 + let rh = mkRollingHash (unsafeFromInt 7) 3 + let a = unsafeFromInt 1 + let b = unsafeFromInt 2 + let c = unsafeFromInt 3 + let d = unsafeFromInt 4 let h1 = hashU64 rh [a, b, c] h2 = hashU64 rh [b, c, d] @@ -87,7 +51,7 @@ spec = h3 `shouldEqual` rehashU64 rh h2 b a it "rolling hash works 2 (quickcheck)" $ do - let rh = mkRollingHash (fromInt 256) 3 + let rh = mkRollingHash (unsafeFromInt 256) 3 quickCheck' 2000 \(CodePointA a') (CodePointA b') (CodePointA c') (CodePointA d') -> let a = fromCodePoint a' diff --git a/test/Test/Data/String/Utils.purs b/test/Test/Data/String/Utils.purs new file mode 100644 index 0000000000000000000000000000000000000000..fe776bbd11fc64571afcc280b55e40de80b81762 --- /dev/null +++ b/test/Test/Data/String/Utils.purs @@ -0,0 +1,34 @@ +module Test.Data.String.Search.Utils where + +import Data.Bounded.Generic (genericTop, genericBottom) +import Data.Enum (class 
BoundedEnum, class Enum) +import Data.Enum.Generic (genericCardinality, genericToEnum, genericFromEnum, genericSucc, genericPred) +import Data.Eq.Generic (genericEq) +import Data.Generic.Rep (class Generic) +import Data.String (CodePoint) +import Prelude (class Bounded, class Eq, class Ord) +import Data.Ord.Generic (genericCompare) +import Test.QuickCheck.Arbitrary (class Arbitrary) +import Test.QuickCheck.Gen (enum) + + +newtype CodePointA = CodePointA CodePoint + +derive instance Generic CodePointA _ +instance Eq CodePointA where + eq = genericEq +instance Ord CodePointA where + compare = genericCompare +instance Bounded CodePointA where + top = genericTop + bottom = genericBottom +instance Enum CodePointA where + succ = genericSucc + pred = genericPred +instance BoundedEnum CodePointA where + cardinality = genericCardinality + toEnum = genericToEnum + fromEnum = genericFromEnum +instance Arbitrary CodePointA where + arbitrary = enum +