Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
purescript-string-search
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
purescript-string-search
Commits
934ed593
Verified
Commit
934ed593
authored
Nov 09, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[fix] hashRH didn't work for all CodePoint
This is fixed now using modulus of UInt64.
parent
790d4b74
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
87 additions
and
62 deletions
+87
-62
spago.dhall
spago.dhall
+0
-3
KarpRabin.purs
src/Data/String/Search/KarpRabin.purs
+39
-21
Spec.purs
test/Data/String/Search/Spec.purs
+48
-38
No files found.
spago.dhall
View file @
934ed593
...
...
@@ -13,11 +13,9 @@ to generate this file without the comments in this block.
{ name = "string-search"
, dependencies =
[ "arrays"
, "debug"
, "enums"
, "foldable-traversable"
, "int64"
, "integers"
, "lists"
, "maybe"
, "ordered-collections"
...
...
@@ -25,7 +23,6 @@ to generate this file without the comments in this block.
, "prelude"
, "strings"
, "tuples"
, "uint"
]
, packages = ./packages.dhall
, sources = [ "src/**/*.purs" ]
...
...
src/Data/String/Search/KarpRabin.purs
View file @
934ed593
...
...
@@ -22,17 +22,20 @@ module Data.String.Search.KarpRabin (
indicesOfAny
, indicesOfAnyHashStruct
, indicesOfAnyLegacy
, fromCodePoint
, Base
, universalBase
, Modulus
, universalModulus
--
, universalModulus
, Hash
, HashStruct
, hashStruct
, RollingHash
, mkRollingHash
, hashRH
, rehashRH
-- , hashRH
-- , rehashRH
, hashU64
, rehashU64
) where
...
...
@@ -67,8 +70,9 @@ universalBase = unsafeFromInt 256
-- | Modulus that we will use in Karp-Rabin
-- https://www.wolframalpha.com/input?i=prime+number+greater+than+50000000
universalModulus :: Modulus
universalModulus = unsafeFromInt 50000017
-- universalModulus :: Modulus
-- universalModulus = unsafeFromInt 1009
-- universalModulus = unsafeFromInt 50000017
fromCodePoint :: CodePoint -> Base
...
...
@@ -79,7 +83,7 @@ fromCodePoint c = unsafeFromInt (fromEnum c)
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
newtype RollingHash = RollingHash {
base :: Base
, modulus :: Modulus
-- , modulus :: Modulus -- in our case, the modulus is (top :: UInt64)
, len :: Int
, basePowLen :: Base -- base^(len - 1) for len >= 2, wrapping on UInt64 overflow (stored for performance reasons)
}
...
...
@@ -88,23 +92,37 @@ derive instance Generic RollingHash _
instance Show RollingHash where
show = genericShow
mkRollingHash :: Base ->
Modulus ->
Int -> RollingHash
mkRollingHash base
modulus
len = RollingHash { base
, modulus
, len
, basePowLen }
mkRollingHash :: Base -> Int -> RollingHash
mkRollingHash base len = RollingHash { base
--
, modulus
, len
, basePowLen }
where
basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
-- basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
basePowLen = foldl (\acc _l -> (acc*base)) (unsafeFromInt 1) (1..(len - 1))
-- -- | NOTE: xs must be of length RollingHash.len
-- hashRH :: RollingHash -> Array Base -> Hash
-- hashRH rh@(RollingHash { base, modulus }) xs =
-- -- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
-- foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
-- rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
-- rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
-- ((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | NOTE: xs must be of length RollingHash.len
hash
RH
:: RollingHash -> Array Base -> Hash
hash
RH rh@(RollingHash {base, modulus
}) xs =
hash
U64
:: RollingHash -> Array Base -> Hash
hash
U64 rh@(RollingHash { base
}) xs =
-- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
foldl (\acc x -> rehash
RH
rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
foldl (\acc x -> rehash
U64
rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | Roll the hash one position forward: remove the contribution of `old`
-- | (the element leaving the window) and append `new`.
-- |
-- | In this function we use the fact that UInt64 rolls automatically
-- | after 18446744073709551615ul, so no explicit `mod` is applied.
rehashU64 :: RollingHash -> Hash -> Base -> Base -> Hash
rehashU64 (RollingHash { base, basePowLen }) h old new =
  (h - old*basePowLen)*base + new
-- | This struct is for performance reasons.
...
...
@@ -126,13 +144,13 @@ hashStruct pats = { hash, hashMap, hLen, pats, rehash, rehashChar }
where
hLen = minimum1 32 (S.length <$> pats)
rh = mkRollingHash universalBase
universalModulus
hLen
rh = mkRollingHash universalBase hLen
hash :: String -> Hash
hash = hash
RH
rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
hash = hash
U64
rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
rehash :: Hash -> CodePoint -> CodePoint -> Hash
rehash h o n = rehash
RH
rh h (fromCodePoint o) (fromCodePoint n)
rehash h o n = rehash
U64
rh h (fromCodePoint o) (fromCodePoint n)
rehashChar :: Hash -> Char -> Char -> Hash
rehashChar h o n = rehash h (S.codePointFromChar o) (S.codePointFromChar n)
...
...
test/Data/String/Search/Spec.purs
View file @
934ed593
...
...
@@ -2,15 +2,25 @@ module Data.String.Search.KarpRabin.Spec where
import Prelude
import Data.Array (index)
import Data.Bounded (class Bounded)
import Data.Bounded.Generic (genericTop, genericBottom)
import Data.Enum (class BoundedEnum, class Enum)
import Data.Enum.Generic (genericCardinality, genericToEnum, genericFromEnum, genericSucc, genericPred)
import Data.Eq (class Eq)
import Data.Eq.Generic (genericEq)
import Data.Foldable (all)
import Data.Generic.Rep (class Generic)
import Data.Maybe (Maybe(..), isJust)
import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar)
import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashRH, rehashRH, hashStruct) -- indicesOfAnyLegacy,
import Data.Ord (class Ord)
import Data.Ord.Generic (genericCompare)
import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar, CodePoint)
import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashU64, rehashU64, hashStruct, fromCodePoint) -- indicesOfAnyLegacy,
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..))
import Data.UInt64 (unsafeFromInt)
import Data.UInt64 (unsafeFromInt
, UInt64
)
import Test.QuickCheck ((<?>))
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Gen (enum)
import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldEqual)
import Test.Spec.QuickCheck (quickCheck')
...
...
@@ -19,13 +29,25 @@ import Test.Spec.QuickCheck (quickCheck')
fromInt = unsafeFromInt
data SmallInt = SmallInt I
nt
newtype CodePointA = CodePointA CodePoi
nt
runInt :: SmallInt -> Int
runInt (SmallInt i) = i
instance arbSmallInt :: Arbitrary SmallInt where
arbitrary = map (SmallInt <<< (\i -> i / 10000 + 100)) arbitrary
derive instance Generic CodePointA _
instance Eq CodePointA where
eq = genericEq
instance Ord CodePointA where
compare = genericCompare
instance Bounded CodePointA where
top = genericTop
bottom = genericBottom
instance Enum CodePointA where
succ = genericSucc
pred = genericPred
instance BoundedEnum CodePointA where
cardinality = genericCardinality
toEnum = genericToEnum
fromEnum = genericFromEnum
instance Arbitrary CodePointA where
arbitrary = enum
validIndices :: Array String -> String -> Boolean
...
...
@@ -51,47 +73,35 @@ spec =
[[1, 2], [2, 3], [3, 4]]
it "rolling hash works 1" do
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm#Hash_function_used
let rh = mkRollingHash (fromInt 256) (fromInt 101) 3
let a = fromInt 97
let b = fromInt 98
let r = fromInt 114
hashRH rh [a, b, r] `shouldEqual` (fromInt 4)
hashRH rh [b, r, a] `shouldEqual` (fromInt 30)
hashRH rh [b, r, a] `shouldEqual`
rehashRH rh (hashRH rh [a, b, r]) a a
it "rolling hash works 2" do
let rh = mkRollingHash (fromInt 7) (fromInt 1009) 3
let rh = mkRollingHash (fromInt 7) 3
let a = fromInt 1
let b = fromInt 2
let c = fromInt 3
let d = fromInt 4
let h1 = hash
RH
rh [a, b, c]
h2 = hash
RH
rh [b, c, d]
h3 = hash
RH
rh [c, d, a]
let h1 = hash
U64
rh [a, b, c]
h2 = hash
U64
rh [b, c, d]
h3 = hash
U64
rh [c, d, a]
h2 `shouldEqual` rehash
RH
rh h1 a d
h3 `shouldEqual` rehash
RH
rh h2 b a
h2 `shouldEqual` rehash
U64
rh h1 a d
h3 `shouldEqual` rehash
U64
rh h2 b a
it "rolling hash works
3
(quickcheck)" $ do
let rh = mkRollingHash (fromInt 256)
(fromInt 1009)
3
it "rolling hash works
2
(quickcheck)" $ do
let rh = mkRollingHash (fromInt 256) 3
quickCheck' 2000 \(
SmallInt a') (SmallInt b') (SmallInt c') (SmallInt
d') ->
let a = from
I
nt a'
b = from
I
nt b'
c = from
I
nt c'
d = from
I
nt d'
h1 = hash
RH
rh [a, b, c]
h2 = hash
RH
rh [b, c, d]
quickCheck' 2000 \(
CodePointA a') (CodePointA b') (CodePointA c') (CodePointA
d') ->
let a = from
CodePoi
nt a'
b = from
CodePoi
nt b'
c = from
CodePoi
nt c'
d = from
CodePoi
nt d'
h1 = hash
U64
rh [a, b, c]
h2 = hash
U64
rh [b, c, d]
in
h2 == rehash
RH
rh h1 a d
h2 == rehash
U64
rh h1 a d
<?> ( "Fail for: " <> show [a', b', c', d']
<> ", h1 = " <> show h1
<> ", h2 = " <> show h2
<> ", rehash = " <> show (rehash
RH
rh h1 a d))
<> ", rehash = " <> show (rehash
U64
rh h1 a d))
it "works on a single pattern matching two times" do
let pats = ["ab"]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment