[NGRAMS] Teach the highlighter about word boundaries

parent 838e5ac8
...@@ -193,14 +193,21 @@ instance decodeJsonNgramsTable :: DecodeJson NgramsTable where ...@@ -193,14 +193,21 @@ instance decodeJsonNgramsTable :: DecodeJson NgramsTable where
f e@(NgramsElement e') = Tuple e'.ngrams e f e@(NgramsElement e') = Tuple e'.ngrams e
----------------------------------------------------------------------------------- -----------------------------------------------------------------------------------
-- This initial version does not pay attention to word boundaries. -- TODO: while this function works well with word boundaries,
-- it inserts too many spaces.
highlightNgrams :: NgramsTable -> String -> Array (Tuple String (Maybe TermList)) highlightNgrams :: NgramsTable -> String -> Array (Tuple String (Maybe TermList))
highlightNgrams (NgramsTable table) input = highlightNgrams (NgramsTable table) input0 =
let sN = unsafePartial (foldl goFold {i0: 0, s: input, l: Nil} ixs) in let sN = unsafePartial (foldl goFold {i0: 0, s: input, l: Nil} ixs) in
A.reverse (A.fromFoldable (consNonEmpty sN.s sN.l)) A.reverse (A.fromFoldable (consNonEmpty sN.s sN.l))
where where
sp x = " " <> S.replaceAll (S.Pattern " ") (S.Replacement " ") x <> " "
unsp x =
case S.stripSuffix (S.Pattern " ") x of
Nothing -> x
Just x1 -> S.replaceAll (S.Pattern " ") (S.Replacement " ") (S.drop 1 x1)
input = sp input0
pats = A.fromFoldable (Map.keys table) pats = A.fromFoldable (Map.keys table)
ixs = indicesOfAny pats input ixs = indicesOfAny (sp <$> pats) input
consNonEmpty x xs consNonEmpty x xs
| S.null x = xs | S.null x = xs
...@@ -210,6 +217,7 @@ highlightNgrams (NgramsTable table) input = ...@@ -210,6 +217,7 @@ highlightNgrams (NgramsTable table) input =
goFold :: Partial => _ -> Tuple Int (Array Int) -> _ goFold :: Partial => _ -> Tuple Int (Array Int) -> _
goFold { i0, s, l } (Tuple i pis) goFold { i0, s, l } (Tuple i pis)
| i < i0 = | i < i0 =
-- Skip this pattern which is overlapping with a previous one.
{ i0, s, l } { i0, s, l }
| otherwise = | otherwise =
case A.index pis 0 of case A.index pis 0 of
...@@ -220,7 +228,7 @@ highlightNgrams (NgramsTable table) input = ...@@ -220,7 +228,7 @@ highlightNgrams (NgramsTable table) input =
Nothing -> Nothing ->
crashWith "highlightNgrams: out of bounds pattern" crashWith "highlightNgrams: out of bounds pattern"
Just pat -> Just pat ->
let lpat = S.length pat in let lpat = S.length (sp pat) in
case Map.lookup pat table of case Map.lookup pat table of
Nothing -> Nothing ->
crashWith "highlightNgrams: pattern missing from table" crashWith "highlightNgrams: pattern missing from table"
...@@ -228,7 +236,10 @@ highlightNgrams (NgramsTable table) input = ...@@ -228,7 +236,10 @@ highlightNgrams (NgramsTable table) input =
let s1 = S.splitAt (i - i0) s in let s1 = S.splitAt (i - i0) s in
{ i0: i + lpat { i0: i + lpat
, s: S.drop lpat s1.after , s: S.drop lpat s1.after
, l: Tuple pat (Just ne.list) : consNonEmpty s1.before l , l: Tuple " " Nothing :
Tuple pat (Just ne.list) :
Tuple " " Nothing :
consNonEmpty (unsp s1.before) l
} }
----------------------------------------------------------------------------------- -----------------------------------------------------------------------------------
......
...@@ -24,19 +24,30 @@ spec = do ...@@ -24,19 +24,30 @@ spec = do
} }
tne ngrams list = Tuple ngrams (ne ngrams list) tne ngrams list = Tuple ngrams (ne ngrams list)
describe "NgramsTable.highlightNgrams" do describe "NgramsTable.highlightNgrams" do
it "partially works" do it "works on a simple example" do
let table = NgramsTable let table = NgramsTable
(Map.fromFoldable [tne "graph" GraphTerm (Map.fromFoldable [tne "graph" GraphTerm
,tne "stop" StopTerm ,tne "which" StopTerm
,tne "stops" StopTerm
,tne "candidate" CandidateTerm ,tne "candidate" CandidateTerm
]) ])
input = "this is a biography which stops at every candidate" input = "this is a graph about a biography which stops at every candidate"
output = [Tuple "this is a bio" Nothing output = [Tuple "this is a" Nothing
,Tuple " " Nothing
,Tuple "graph" (Just GraphTerm) ,Tuple "graph" (Just GraphTerm)
,Tuple "y which " Nothing ,Tuple " " Nothing
,Tuple "stop" (Just StopTerm) ,Tuple "about a biography" Nothing
,Tuple "s at every " Nothing ,Tuple " " Nothing
,Tuple "candidate" (Just CandidateTerm)] ,Tuple "which" (Just StopTerm)
,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "stops" (Just StopTerm)
,Tuple " " Nothing
,Tuple "at every" Nothing
,Tuple " " Nothing
,Tuple "candidate" (Just CandidateTerm)
,Tuple " " Nothing
]
highlightNgrams table input `shouldEqual` output highlightNgrams table input `shouldEqual` output
it "works when pattern overlaps" do it "works when pattern overlaps" do
...@@ -48,18 +59,43 @@ spec = do ...@@ -48,18 +59,43 @@ spec = do
,tne "the" GraphTerm ,tne "the" GraphTerm
,tne "state" GraphTerm ,tne "state" GraphTerm
]) ])
input = "SCIPION is a new state of the" input = "This is a new state of the"
output = [Tuple "SCIPION " Nothing output = [Tuple "This" Nothing
,Tuple " " Nothing
,Tuple "is" (Just StopTerm) ,Tuple "is" (Just StopTerm)
,Tuple " " Nothing ,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "a" (Just StopTerm) ,Tuple "a" (Just StopTerm)
,Tuple " " Nothing ,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "new" (Just GraphTerm) ,Tuple "new" (Just GraphTerm)
,Tuple " " Nothing ,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "state" (Just GraphTerm) ,Tuple "state" (Just GraphTerm)
,Tuple " " Nothing ,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "of" (Just StopTerm) ,Tuple "of" (Just StopTerm)
,Tuple " " Nothing ,Tuple " " Nothing
,Tuple " " Nothing
,Tuple "the" (Just GraphTerm) ,Tuple "the" (Just GraphTerm)
,Tuple " " Nothing
]
highlightNgrams table input `shouldEqual` output
it "works when pattern overlaps 2" do
let table = NgramsTable
(Map.fromFoldable [tne "from" GraphTerm
,tne "i" StopTerm
,tne "images" GraphTerm
])
input = "This is from space images"
output = [Tuple "This is" Nothing
,Tuple " " Nothing
,Tuple "from" (Just GraphTerm)
,Tuple " " Nothing
,Tuple "space" Nothing
,Tuple " " Nothing
,Tuple "images" (Just GraphTerm)
,Tuple " " Nothing
] ]
highlightNgrams table input `shouldEqual` output highlightNgrams table input `shouldEqual` output
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment