Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
purescript-string-search
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
purescript-string-search
Commits
790d4b74
Verified
Commit
790d4b74
authored
Nov 08, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[fix] improve tests, fixed issues with hashing algorithm
parent
04f86880
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
231 additions
and
38 deletions
+231
-38
.gitignore
.gitignore
+1
-0
shell.nix
shell.nix
+1
-0
spago.dhall
spago.dhall
+2
-0
KarpRabin.purs
src/Data/String/Search/KarpRabin.purs
+121
-33
test.dhall
test.dhall
+7
-1
Spec.purs
test/Data/String/Search/Spec.purs
+99
-4
No files found.
.gitignore
View file @
790d4b74
.psci_modules/
.spago/
.spago/
node_modules/
node_modules/
output/
output/
shell.nix
View file @
790d4b74
...
@@ -44,5 +44,6 @@ pkgs.mkShell {
...
@@ -44,5 +44,6 @@ pkgs.mkShell {
default-shell
.
build
default-shell
.
build
default-shell
.
pkgs
.
dhall-json
default-shell
.
pkgs
.
dhall-json
default-shell
.
pkgs
.
nodejs
default-shell
.
pkgs
.
nodejs
default-shell
.
pkgs
.
yarn
];
];
}
}
spago.dhall
View file @
790d4b74
...
@@ -13,8 +13,10 @@ to generate this file without the comments in this block.
...
@@ -13,8 +13,10 @@ to generate this file without the comments in this block.
{ name = "string-search"
{ name = "string-search"
, dependencies =
, dependencies =
[ "arrays"
[ "arrays"
, "debug"
, "enums"
, "enums"
, "foldable-traversable"
, "foldable-traversable"
, "int64"
, "integers"
, "integers"
, "lists"
, "lists"
, "maybe"
, "maybe"
...
...
src/Data/String/Search/KarpRabin.purs
View file @
790d4b74
...
@@ -22,8 +22,17 @@ module Data.String.Search.KarpRabin (
...
@@ -22,8 +22,17 @@ module Data.String.Search.KarpRabin (
indicesOfAny
indicesOfAny
, indicesOfAnyHashStruct
, indicesOfAnyHashStruct
, indicesOfAnyLegacy
, indicesOfAnyLegacy
, Base
, universalBase
, Modulus
, universalModulus
, Hash
, HashStruct
, HashStruct
, hashStruct
, hashStruct
, RollingHash
, mkRollingHash
, hashRH
, rehashRH
) where
) where
...
@@ -32,29 +41,80 @@ import Data.Array ((..))
...
@@ -32,29 +41,80 @@ import Data.Array ((..))
import Data.Enum (fromEnum)
import Data.Enum (fromEnum)
import Data.Foldable (class Foldable, minimum, foldl)
import Data.Foldable (class Foldable, minimum, foldl)
import Data.FunctorWithIndex (mapWithIndex)
import Data.FunctorWithIndex (mapWithIndex)
import Data.
Int (quot
)
import Data.
Generic.Rep (class Generic
)
import Data.List as L
import Data.List as L
import Data.Map as M
import Data.Map as M
import Data.Maybe (Maybe(..), isJust, fromJust)
import Data.Maybe (Maybe(..), isJust, fromJust)
import Data.Show.Generic (genericShow)
import Data.String as S
import Data.String as S
import Data.String (CodePoint)
import Data.String (CodePoint)
import Data.String.Search.Utils (slidingWindow)
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..), snd)
import Data.Tuple (Tuple(..), snd)
import Data.UInt (UInt, shl, fromInt)
import Data.UInt64 (UInt64, unsafeFromInt) -- shl
-- import Debug (trace)
import Partial.Unsafe (unsafePartial)
import Partial.Unsafe (unsafePartial)
import Prelude
import Prelude
fromCodePoint :: CodePoint -> UInt
fromCodePoint c = fromInt (fromEnum c)
type Base = UInt64
type Modulus = UInt64
type Hash = UInt64
-- | Base that we will use in Karp-Rabin
universalBase :: Base
universalBase = unsafeFromInt 256
-- | Modulus that we will use in Karp-Rabin
-- https://www.wolframalpha.com/input?i=prime+number+greater+than+50000000
universalModulus :: Modulus
universalModulus = unsafeFromInt 50000017
fromCodePoint :: CodePoint -> Base
fromCodePoint c = unsafeFromInt (fromEnum c)
-- Rolling hash implementation, see
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
newtype RollingHash = RollingHash {
base :: Base
, modulus :: Modulus
, len :: Int
, basePowLen :: Base -- pow base len % modulus (stored for performance reasons)
}
derive instance Generic RollingHash _
instance Show RollingHash where
show = genericShow
mkRollingHash :: Base -> Modulus -> Int -> RollingHash
mkRollingHash base modulus len = RollingHash { base
, modulus
, len
, basePowLen }
where
basePowLen = foldl (\acc _l -> (acc*base) `mod` modulus) (unsafeFromInt 1) (1..(len - 1))
-- | NOTE: xs must be of length RollingHash.len
hashRH :: RollingHash -> Array Base -> Hash
hashRH rh@(RollingHash {base, modulus }) xs =
-- foldl (\acc x -> (((acc*base) `mod` modulus) + x) `mod` modulus) (unsafeFromInt 0) xs
foldl (\acc x -> rehashRH rh acc (unsafeFromInt 0) x) (unsafeFromInt 0) xs
rehashRH :: RollingHash -> Hash -> Base -> Base -> Hash
rehashRH (RollingHash { base, basePowLen, modulus }) h old new =
((h + (modulus - old)*basePowLen)*base + new) `mod` modulus
-- | This struct is for performance reasons.
-- | This struct is for performance reasons.
type HashStruct = {
type HashStruct = {
hash :: String -> UInt
hash :: String -> Hash
, hashMap :: M.Map UInt (Array Int)
, hashMap :: M.Map Hash (Array Int)
, hLen :: Int
, hLen :: Int
, pats :: Array String
, pats :: Array String
, rehash :: UInt -> CodePoint -> CodePoint -> UInt
, rehash :: Hash -> CodePoint -> CodePoint -> Hash
, rehashChar :: Hash -> Char -> Char -> Hash
}
}
-- | This is a temporary structure, computed from given patterns, to
-- | This is a temporary structure, computed from given patterns, to
...
@@ -62,29 +122,44 @@ type HashStruct = {
...
@@ -62,29 +122,44 @@ type HashStruct = {
-- | same patterns many times for a given string: we need to compute
-- | same patterns many times for a given string: we need to compute
-- | `hash` etc only once for these patterns).
-- | `hash` etc only once for these patterns).
hashStruct :: Array String -> HashStruct
hashStruct :: Array String -> HashStruct
hashStruct pats = { hash, hashMap, hLen, pats, rehash }
hashStruct pats = { hash, hashMap, hLen, pats, rehash
, rehashChar
}
where
where
hLen = minimum1 32 (S.length <$> pats)
hLen = minimum1 32 (S.length <$> pats)
hLen' = fromInt hLen
shDi = case 32 `quot` hLen of
rh = mkRollingHash universalBase universalModulus hLen
q | q < 4 -> q
| otherwise -> 4
hash :: String -> Hash
outS = fromInt (shDi * hLen)
hash = hashRH rh <<< map fromCodePoint <<< S.toCodePointArray <<< S.take hLen
rehash :: UInt -> CodePoint -> CodePoint -> UInt
rehash = case shDi of
rehash :: Hash -> CodePoint -> CodePoint -> Hash
1 -> rehash' (fromInt 1) hLen'
rehash h o n = rehashRH rh h (fromCodePoint o) (fromCodePoint n)
2 -> rehash' (fromInt 2) outS
3 -> rehash' (fromInt 3) outS
rehashChar :: Hash -> Char -> Char -> Hash
_ -> rehash' (fromInt 4) outS
rehashChar h o n = rehash h (S.codePointFromChar o) (S.codePointFromChar n)
hash :: String -> UInt
hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
hashMap :: M.Map Hash (Array Int)
<<< S.toCodePointArray
<<< S.take hLen
hashMap :: M.Map UInt (Array Int)
hashMap =
hashMap =
M.fromFoldableWith (flip (<>))
M.fromFoldableWith (flip (<>))
(mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
(mapWithIndex (\i a -> Tuple (hash a) [i]) pats)
-- hLen' = fromInt hLen
-- shDi = case 32 `quot` hLen of
-- q | q < 4 -> q
-- | otherwise -> 4
-- outS = fromInt (shDi * hLen)
-- rehash :: UInt -> CodePoint -> CodePoint -> UInt
-- rehash = case shDi of
-- 1 -> rehash' (fromInt 1) hLen'
-- 2 -> rehash' (fromInt 2) outS
-- 3 -> rehash' (fromInt 3) outS
-- _ -> rehash' (fromInt 4) outS
-- hash :: String -> UInt
-- hash = foldl (\h w -> (h `shl` fromInt shDi) + fromCodePoint w) (fromInt 0)
-- <<< S.toCodePointArray
-- <<< S.take hLen
-- $overview
-- $overview
--
--
...
@@ -160,9 +235,9 @@ indicesOfAnyHashStruct hs = if A.null nepats then const []
...
@@ -160,9 +235,9 @@ indicesOfAnyHashStruct hs = if A.null nepats then const []
-- Workers --
-- Workers --
------------------------------------------------------------------------------
------------------------------------------------------------------------------
rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
--
rehash' :: UInt -> UInt -> UInt -> CodePoint -> CodePoint -> UInt
rehash' shDi out h o n =
--
rehash' shDi out h o n =
(h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n
--
(h `shl` shDi - (fromCodePoint o `shl` out)) + fromCodePoint n
minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
minimum1 :: forall a f. Ord a => Foldable f => a -> f a -> a
minimum1 a fa =
minimum1 a fa =
...
@@ -170,7 +245,7 @@ minimum1 a fa =
...
@@ -170,7 +245,7 @@ minimum1 a fa =
Nothing -> a
Nothing -> a
Just b -> min a b
Just b -> min a b
type GoAcc = Tuple
UInt
(Array (Tuple Int (Array Int)))
type GoAcc = Tuple
Hash
(Array (Tuple Int (Array Int)))
strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int))
strictMatcher :: HashStruct -> String -> Array (Tuple Int (Array Int))
strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
...
@@ -187,16 +262,29 @@ strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
...
@@ -187,16 +262,29 @@ strictMatcher { hash, hashMap, hLen, pats, rehash } = unsafePartial search
strLen = S.length str
strLen = S.length str
maxIdx = strLen - hLen
maxIdx = strLen - hLen
arr = S.toCodePointArray str
arr = S.toCodePointArray str
shash ::
UInt
shash ::
Hash
shash = hash str
shash = hash str
go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
go :: GoAcc -> Tuple Int (Array CodePoint) -> GoAcc
go (Tuple h acc) (Tuple idx arr') =
go (Tuple h acc) (Tuple idx arr') =
let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr')
let rehashed = rehash h (fromJust $ A.head arr') (fromJust $ A.last arr')
-- _ = if A.elem " abd" pats then
-- trace ("[strictMatcher] " <> show { h
-- , hashMap
-- , acc
-- , idx
-- , arr': S.fromCodePointArray arr'
-- , arrHeadLast: S.fromCodePointArray [fromJust $ A.head arr', fromJust $ A.last arr']
-- , lookup: M.lookup h hashMap }) \_ -> unit
-- else
-- unit
el = case M.lookup h hashMap of
el = case M.lookup h hashMap of
Nothing -> Nothing
Nothing -> Nothing
Just ps ->
Just ps ->
let str' = S.drop idx str
let str' = S.drop idx str
okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
okay bs = isJust (S.stripPrefix (S.Pattern bs) str')
-- _ = trace ("[strictMatcher] " <> show { pats
-- , ps
-- , oks: (\x -> okay (A.unsafeIndex pats x)) <$> ps}) \_ -> unit
in
in
case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
case A.filter (\x -> okay (A.unsafeIndex pats x)) ps of
[] -> Nothing
[] -> Nothing
...
@@ -228,14 +316,14 @@ strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search
...
@@ -228,14 +316,14 @@ strictMatcherLegacy { hash, hashMap, hLen, pats, rehash } = unsafePartial search
else A.fromFoldable (go 0 shash)
else A.fromFoldable (go 0 shash)
where
where
arr = S.toCodePointArray str
arr = S.toCodePointArray str
shash ::
UInt
shash ::
Hash
shash = hash str
shash = hash str
strLen = S.length str
strLen = S.length str
maxIdx = strLen - hLen
maxIdx = strLen - hLen
strAt i = A.unsafeIndex arr i
strAt i = A.unsafeIndex arr i
go :: Int ->
UInt
-> L.List (Tuple Int (Array Int))
go :: Int ->
Hash
-> L.List (Tuple Int (Array Int))
go sI h =
go sI h =
let hd = strAt sI
let hd = strAt sI
isEnd = sI == maxIdx
isEnd = sI == maxIdx
...
...
test.dhall
View file @
790d4b74
...
@@ -4,5 +4,11 @@ in conf
...
@@ -4,5 +4,11 @@ in conf
// { sources = conf.sources # [ "test/**/*.purs" ]
// { sources = conf.sources # [ "test/**/*.purs" ]
, dependencies =
, dependencies =
conf.dependencies
conf.dependencies
# [ "aff", "effect", "spec", "spec-discovery", "spec-quickcheck" ]
# [ "aff"
, "effect"
, "quickcheck"
, "spec"
, "spec-discovery"
, "spec-quickcheck"
]
}
}
test/Data/String/Search/Spec.purs
View file @
790d4b74
...
@@ -4,13 +4,30 @@ import Prelude
...
@@ -4,13 +4,30 @@ import Prelude
import Data.Array (index)
import Data.Array (index)
import Data.Foldable (all)
import Data.Foldable (all)
import Data.Maybe (Maybe(..), isJust)
import Data.Maybe (Maybe(..), isJust)
import Data.String (drop, stripPrefix, Pattern(..))
import Data.String (drop, stripPrefix, Pattern(..), codePointFromChar)
import Data.String.Search.KarpRabin (indicesOfAnyLegacy, indicesOfAny)
import Data.String.Search.KarpRabin (indicesOfAny, mkRollingHash, hashRH, rehashRH, hashStruct) -- indicesOfAnyLegacy,
import Data.String.Search.Utils (slidingWindow)
import Data.Tuple (Tuple(..))
import Data.Tuple (Tuple(..))
import Data.UInt64 (unsafeFromInt)
import Test.QuickCheck ((<?>))
import Test.QuickCheck.Arbitrary
import Test.Spec (Spec, describe, it)
import Test.Spec (Spec, describe, it)
import Test.Spec.Assertions (shouldEqual)
import Test.Spec.Assertions (shouldEqual)
import Test.Spec.QuickCheck (quickCheck')
import Test.Spec.QuickCheck (quickCheck')
fromInt = unsafeFromInt
data SmallInt = SmallInt Int
runInt :: SmallInt -> Int
runInt (SmallInt i) = i
instance arbSmallInt :: Arbitrary SmallInt where
arbitrary = map (SmallInt <<< (\i -> i / 10000 + 100)) arbitrary
validIndices :: Array String -> String -> Boolean
validIndices :: Array String -> String -> Boolean
validIndices pats input = all validIndex (indicesOfAny pats input)
validIndices pats input = all validIndex (indicesOfAny pats input)
where
where
...
@@ -29,12 +46,58 @@ validIndices pats input = all validIndex (indicesOfAny pats input)
...
@@ -29,12 +46,58 @@ validIndices pats input = all validIndex (indicesOfAny pats input)
spec :: Spec Unit
spec :: Spec Unit
spec =
spec =
describe "KarpRabin" do
describe "KarpRabin" do
it "sliding window works" do
slidingWindow [1, 2, 3, 4] 2 `shouldEqual`
[[1, 2], [2, 3], [3, 4]]
it "rolling hash works 1" do
-- https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm#Hash_function_used
let rh = mkRollingHash (fromInt 256) (fromInt 101) 3
let a = fromInt 97
let b = fromInt 98
let r = fromInt 114
hashRH rh [a, b, r] `shouldEqual` (fromInt 4)
hashRH rh [b, r, a] `shouldEqual` (fromInt 30)
hashRH rh [b, r, a] `shouldEqual`
rehashRH rh (hashRH rh [a, b, r]) a a
it "rolling hash works 2" do
let rh = mkRollingHash (fromInt 7) (fromInt 1009) 3
let a = fromInt 1
let b = fromInt 2
let c = fromInt 3
let d = fromInt 4
let h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
h3 = hashRH rh [c, d, a]
h2 `shouldEqual` rehashRH rh h1 a d
h3 `shouldEqual` rehashRH rh h2 b a
it "rolling hash works 3 (quickcheck)" $ do
let rh = mkRollingHash (fromInt 256) (fromInt 1009) 3
quickCheck' 2000 \(SmallInt a') (SmallInt b') (SmallInt c') (SmallInt d') ->
let a = fromInt a'
b = fromInt b'
c = fromInt c'
d = fromInt d'
h1 = hashRH rh [a, b, c]
h2 = hashRH rh [b, c, d]
in
h2 == rehashRH rh h1 a d
<?> ( "Fail for: " <> show [a', b', c', d']
<> ", h1 = " <> show h1
<> ", h2 = " <> show h2
<> ", rehash = " <> show (rehashRH rh h1 a d))
it "works on a single pattern matching two times" do
it "works on a single pattern matching two times" do
let pats = ["ab"]
let pats = ["ab"]
let input = "abcbab"
let input = "abcbab"
let output = [Tuple 0 [0], Tuple 4 [0]]
let output = [Tuple 0 [0], Tuple 4 [0]]
indicesOfAny pats input `shouldEqual` output
indicesOfAny pats input `shouldEqual` output
-- hashes: 1650, 1667, 1682, 1665, 1650
it "works on a many unmatching patterns" do
it "works on a many unmatching patterns" do
let pats = ["abd","e","bac","abcbabe"]
let pats = ["abd","e","bac","abcbabe"]
...
@@ -42,7 +105,27 @@ spec =
...
@@ -42,7 +105,27 @@ spec =
let output = []
let output = []
indicesOfAny pats input `shouldEqual` output
indicesOfAny pats input `shouldEqual` output
it "works on a simple case" do
it "works when pattern is in the middle" do
let pats = ["abd"]
let input = "eeabdee"
let output = [Tuple 2 [0]]
let hs = hashStruct pats
let h1 = hs.hash "abd"
let h2 = hs.hash "bde"
h2 `shouldEqual` (hs.rehashChar h1 'a' 'e')
indicesOfAny pats input `shouldEqual` output
it "works when pattern is in the middle, with space in it (ps #598)" do
let pats = [" abd"]
let input = "e abdee"
let output = [Tuple 1 [0]]
let hs = hashStruct pats
let h1 = hs.hash "e ab"
let h2 = hs.hash " abd"
h2 `shouldEqual` (hs.rehashChar h1 'e' 'd')
indicesOfAny pats input `shouldEqual` output
it "works on a bit more complex case" do
let pats = ["ab","cb","bc","bca"]
let pats = ["ab","cb","bc","bca"]
let input = "abcbab"
let input = "abcbab"
let output = [Tuple 0 [0]
let output = [Tuple 0 [0]
...
@@ -67,3 +150,15 @@ spec =
...
@@ -67,3 +150,15 @@ spec =
it "returns valid indices 2000 random samples" do
it "returns valid indices 2000 random samples" do
quickCheck' 2000 validIndices
quickCheck' 2000 validIndices
it "works for purescript-gargantext issue #598, test 1" do
let pats = [" conception " ]
let input = " passant d'une conception quasi solipsiste ou monadique de la sensation "
let output = [Tuple 16 [0]]
indicesOfAny pats input `shouldEqual` output
it "works for purescript-gargantext issue #598, test 2" do
let pats = [" conception ", " notion "]
let input = " passant d'une conception quasi solipsiste ou monadique de la sensation "
let output = [Tuple 16 [0]]
indicesOfAny pats input `shouldEqual` output
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment