Przemyslaw Kaminski / haskell-gargantext / Commits

Commit 8f285b42, authored Jun 15, 2019 by Alexandre Delanoë

    Merge branch 'dev-eleve' into dev

Parents: c9ce9c6f, 842cbf68

Showing 1 changed file with 349 additions and 146 deletions.

src/Gargantext/Text/Eleve.hs  (+349 / -146)
@@ -20,7 +20,6 @@ References:
 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
 
 Notes for current implementation:
-- TODO fix normalization
 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
@@ -32,62 +31,107 @@ Notes for current implementation:
   $ Gargantext.map _hyperdataDocument_abstract docs
 -}
+{-# LANGUAGE ConstraintKinds   #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE RankNTypes        #-}
 {-# LANGUAGE TemplateHaskell   #-}
+{-# LANGUAGE TypeFamilies      #-}
 
 module Gargantext.Text.Eleve where
 
 -- import Debug.Trace (trace)
 -- import Debug.SimpleReflect
 
-import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
+import Control.Lens hiding (levels, children)
-import Control.Monad (foldM, mapM_, forM_)
+import Control.Monad (forM_)
 import Data.Ord (Ord)
 import qualified Data.List as L
 import Data.Monoid
 import Data.Text (Text)
 import qualified Data.Text as T
 import Data.Map (Map)
-import Data.Maybe (fromMaybe, catMaybes)
+import Data.Maybe (fromMaybe)
 import qualified Data.Map as Map
 import Gargantext.Prelude hiding (cs)
 import qualified Data.Tree as Tree
 import Data.Tree (Tree)
-import qualified Prelude as P (putStrLn, logBase)
+import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
+
+nan :: Floating e => e
+nan = 0 / 0
+
+noNaNs :: P.RealFloat e => [e] -> [e]
+noNaNs = filter (not . P.isNaN)
+
+updateIfDefined :: P.RealFloat e => e -> e -> e
+updateIfDefined e0 e | P.isNaN e = e0
+                     | otherwise = e
+
+sim :: Entropy e => e -> e -> Bool
+sim x y = x == y || (P.isNaN x && P.isNaN y)
+
+subst :: Entropy e => (e, e) -> e -> e
+subst (src, dst) x | sim src x = dst
+                   | otherwise = x
+------------------------------------------------------------------------
+type Entropy e =
+  ( Fractional e
+  , Floating e
+  , P.RealFloat e
+  , Show e
+  -- ^ TODO: only used for debugging
+  )
 ------------------------------------------------------------------------
 -- | Example and tests for development
 data I e = I
   { _info_entropy     :: e
-  , _info_norm_entropy  :: e
-  , _info_norm_entropy' :: e
+  , _info_entropy_var :: e
+  , _info_autonomy    :: e
   }
 
 instance Show e => Show (I e) where
-  show (I e n n') = show (e, n, n')
+  show (I e ev a) = show (e, ev, a)
 
 makeLenses ''I
 
 type ModEntropy i o e = (e -> e) -> i -> o
 
-setNormEntropy :: ModEntropy e (I e) e
-setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
+set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
+set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
+
+set_entropy_var :: Entropy e => Setter e (I e) e e
+set_entropy_var f e = (\ev -> I e ev nan) <$> f e
+
+data StartStop = Start | Stop
+  deriving (Ord, Eq, Show)
 
 data Token = NonTerminal Text
-           | Terminal
+           | Terminal StartStop
   deriving (Ord, Eq, Show)
 
-toToken :: Int -> [Text] -> [Token]
-toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
+isTerminal :: Token -> Bool
+isTerminal (Terminal    _) = True
+isTerminal (NonTerminal _) = False
 
-unToken :: [Token] -> [Text]
-unToken = map f
-  where
-    f (NonTerminal x) = x
-    f Terminal        = ""
+nonTerminals :: [Token] -> [Text]
+nonTerminals ts = [nt | NonTerminal nt <- ts]
+
+parseToken :: Text -> Token
+parseToken "<start>" = Terminal Start
+parseToken "<stop>"  = Terminal Stop
+parseToken t         = NonTerminal t
+
+toToken :: [Text] -> [Token]
+toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
+
+printToken :: Token -> Text
+printToken = f
+  where
+    f (NonTerminal x)  = x
+    f (Terminal Start) = "<start>"
+    f (Terminal Stop)  = "<stop>"
 ------------------------------------------------------------------------
 
 data Trie k e
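
Side note (not part of the commit): IEEE comparison on NaN is always False, which is why the new sim helper is used for NaN-aware equality in the checks added further down. A minimal GHCi sketch of how the helpers introduced above behave, assuming the module loads as-is:

    >>> nan == (nan :: Double)
    False
    >>> sim (nan :: Double) nan        -- sim treats two NaNs as equal
    True
    >>> noNaNs [1, nan, 2 :: Double]   -- drop undefined entropies
    [1.0,2.0]
    >>> subst (nan, 0) (nan :: Double) -- substitute a NaN entropy by 0
    0.0
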
@@ -100,9 +144,6 @@ data Trie k e
 makeLenses ''Trie
 
-insertTries :: Ord k => [[k]] -> Trie k ()
-insertTries = L.foldr insertTrie emptyTrie
-
 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
 insertTrie []     n           = n { _node_count = _node_count n + 1 }
 insertTrie (x:xs) (Leaf c)    = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
@@ -121,7 +162,6 @@ mkTrie c children
   | otherwise   = Node c mempty children
 
 -----------------------------
 -- | Trie to Tree since Tree as nice print function
 toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
 toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
@@ -129,55 +169,94 @@ toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.t
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+normalizeLevel :: Entropy e => e -> e -> e -> e
-entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
-entropyTrie _    (Leaf c)             = Leaf c
-entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
-  where
-    e = sum $ map f $ Map.toList children
-    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
-                             else - chc * P.logBase 2 chc
-      where
-        chc = fromIntegral (_node_count child) / fromIntegral c
-
-normalizeEntropy :: (Fractional e, Floating e, Show e)
-                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
-normalizeEntropy inE modE = go $ modE identity
-  where
-    go _ (Leaf c) = Leaf c
-    go f (Node c i children)
-      | Map.null children =
-          panic "normalizeEntropy: impossible"
-      | otherwise         =
-          -- trace (show $ L.length es) $
-          Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
-      where
-        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-        m  = mean      es
-        v  = deviation es
-------------------------------------------------------------------------
-normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> e -> e
 normalizeLevel m v e = (e - m) / v
 
-buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
-buildTrie = entropyTrie (== Terminal) . insertTries
-
-nodeEntropy :: Trie k e -> Maybe e
-nodeEntropy (Node _ e _) = Just e
-nodeEntropy (Leaf _)     = Nothing
+{- Unused
 
 nodeChildren :: Trie k e -> Map k (Trie k e)
 nodeChildren (Node _ _ cs) = cs
 nodeChildren (Leaf _) = Map.empty
 
-nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
-nodeChild k (Node _ _ cs) = Map.lookup k cs
-nodeChild _ (Leaf _)      = Nothing
-
-findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
-findTrie ks t = foldM (flip nodeChild) t ks
+-}
+
+chunkAlongEleve :: Int -> [a] -> [[a]]
+chunkAlongEleve n xs = L.take n <$> L.tails xs
+
+data Direction = Backward | Forward
+
+buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
+buildTrie d n sentences
+  = L.foldr insertTrie emptyTrie
+  . L.concat
+  $ (filter (/= [Terminal (term d)]) . chunkAlongEleve (n + 1) . order d) <$> sentences
+  where
+    order Forward  = identity
+    order Backward = reverse
+    term  Forward  = Stop
+    term  Backward = Start
+
+class IsTrie trie where
+  entropyTrie      :: Entropy e => (k -> Bool) -> trie k () -> trie k e
+  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
+  nodeChild        :: Ord k => k -> trie k e -> trie k e
+  findTrie         :: Ord k => [k] -> trie k e -> trie k e
+  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
+  evTrie           :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
+  normalizeEntropy :: Entropy e => Getting e i e -> ModEntropy i o e -> trie k i -> trie k o
+
+instance IsTrie Trie where
+  entropyTrie _    (Leaf c)             = Leaf c
+  entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
+    where
+      children' = Map.toList children
+      sum_count = sum $ _node_count . snd <$> children'
+      e | sum_count == 0 = nan
+        | otherwise      = sum $ f <$> children'
+      f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
+                               else - chc * P.logBase 2 chc
+        where
+          chc = fromIntegral (_node_count child) / fromIntegral c
+
+  nodeEntropy inE (Node _ e _) = e ^. inE
+  nodeEntropy _   (Leaf _)     = nan
+
+  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
+  nodeChild _ (Leaf _)      = emptyTrie
+
+  findTrie ks t = L.foldl (flip nodeChild) t ks
+
+  printTrie inE t = do
+    P.putStrLn . Tree.drawTree
+               . fmap show
+               $ toTree (NonTerminal "") t
+    P.putStrLn " Levels:"
+    forM_ (normalizationLevels inE t) $ \level ->
+      P.putStrLn $ " " <> show level
+
+  evTrie inE setEV = go nan
+    where
+      go _  (Leaf c)            = Leaf c
+      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
+        where e1 = i ^. inE
+
+      ev 0  0  = nan
+      ev i0 i1 = i1 - i0
+
+  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
+    where
+      go _ _                 (Leaf c)            = Leaf c
+      go _ []                _                   = panic "normalizeEntropy' empty levels"
+      go f ((m, v, _) : ess) (Node c i children) =
+        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
+------------------------------------------------------------------------
 
 levels :: Trie k e -> [[Trie k e]]
 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
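
Side note (not part of the commit): in the IsTrie instance above, evTrie stores at each node the difference between its own entropy and its parent's (ev i0 i1 = i1 - i0), and normalizeEntropy then rescales those values level by level through normalizeLevel m v e = (e - m) / v, a z-score against the mean m and deviation v of that depth level. A standalone sketch with made-up numbers:

    -- Hypothetical values, only to illustrate the (e - m) / v step.
    levelMean, levelDeviation, nodeEV :: Double
    levelMean      = 0.5    -- mean entropy variation on this level
    levelDeviation = 0.25   -- deviation of entropy variations on this level
    nodeEV         = 1.0    -- this node's entropy variation

    -- The node's normalized value (its autonomy): (1.0 - 0.5) / 0.25 = 2.0
    autonomySketch :: Double
    autonomySketch = (nodeEV - levelMean) / levelDeviation
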
@@ -186,96 +265,173 @@ levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
 subForest (Leaf _)            = []
 subForest (Node _ _ children) = Map.elems children
 
-entropyLevels :: Getting e i e -> Trie k i -> [[e]]
-entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
+entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
+entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
 
---fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
---fwd inE outE s = s & outE .~ (s ^. inE)
-
-normalizeEntropy' :: (Fractional e, Floating e, Show e)
-                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
-normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
-  where
-    go _ []         _                   = panic "normalizeEntropy' empty levels"
-    go _ _          (Leaf c)            = Leaf c
-    go _ ([]:_)     _                   = panic "normalizeEntropy': empty level"
-    go f (es : ess) (Node c i children) =
-        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
-      where
-        m  = mean      es
-        v  = deviation es
+normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
+normalizationLevels inE = fmap f . entropyLevels inE
+  where
+    f es = (mean es, deviation es, length es)
 
 ------------------------------------------------------------------------
+data Tries k e = Tries
+  { _fwd :: Trie k e
+  , _bwd :: Trie k e
+  }
+
+makeLenses ''Tries
+
+buildTries :: Int -> [[Token]] -> Tries Token ()
+buildTries n sentences = Tries
+  { _fwd = buildTrie Forward  n sentences
+  , _bwd = buildTrie Backward n sentences
+  }
+
+instance IsTrie Tries where
+  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
+
+  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
+
+  nodeChild = onTries . nodeChild
+
+  entropyTrie = onTries . entropyTrie
+
+  evTrie inE setEV = onTries $ evTrie inE setEV
+
+  normalizeEntropy inE = onTries . normalizeEntropy inE
+
+  printTrie inE (Tries f b) = do
+    P.putStrLn "Forward:"
+    printTrie inE f
+    P.putStrLn ""
+    P.putStrLn "Backward:"
+    printTrie inE b
+
+onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
+onTries h (Tries f b) = Tries (h f) (h b)
 
 ------------------------------------------------------------------------
-split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
-split inE t0 = go t0 []
-  where
-    consRev [] xss = xss
-    consRev xs xss = reverse xs : xss
-
-    go _ pref []            = [reverse pref]
-    go _ pref (Terminal:_)  = [reverse pref]
-    go t pref (x:xs) = case nodeChild x t of
-      Nothing -> consRev pref $ go t0 [x] xs
-      Just xt -> case nodeChild x t0 of
-        Nothing  -> panic $ "TODO"
-        Just xt0 ->
-          let et   = ne (panic "t") t
-          -- ^ entropy of the current prefix
-              ext0 = ne (panic "xt0") xt0
-          -- ^ entropy of [x]
-              ext  = ne 0 xt
-          -- ^ entropy of the current prefix plus x
-          in
-          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
-          if ext + ext0 > et
-            then go xt  (x:pref) xs
-            else consRev pref $ go xt0 [x] xs
-
-    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
+mayCons :: [a] -> [[a]] -> [[a]]
+mayCons [] xss = xss
+mayCons xs xss = xs : xss
+
+{-
+split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
+split _ _ [] = []
+split inE t (Terminal Start:xs) = split inE t xs
+split inE t (x0:xs0) = go [x0] xs0
+  where
+    go pref []                 = [pref]
+    go pref (Terminal Stop:_)  = [pref]
+    go _    (Terminal Start:_) = panic "split impossible"
+    go pref (x:xs) =
+      -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
+      if acc
+        then go prefx xs
+        else mayCons pref $ go [x] xs
+      where
+        prefx = pref <> [x]
+        pt   = findTrie pref  t
+        pxt  = findTrie prefx t
+        xt   = findTrie [x]   t
+        ept  = ne pt
+         -- ^ entropy of the current prefix
+        ext  = ne xt
+         -- ^ entropy of [x]
+        epxt = ne pxt
+         -- ^ entropy of the current prefix plus x
+        acc  = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
+
+        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
+        ne = nodeEntropy inE
+-}
+
+split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
+split _ _ _ []  = []
+split _ _ _ [t] = pure <$> nonTerminals [t]
+split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
+  where
+    pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
+                       (L.tail . L.inits . take n $ ts)
+
+{-
+split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
+split inE t0 ts =
+  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
+-}
 
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
+------------------------------------------------------------------------
 mainEleve :: Int -> [[Text]] -> [[[Text]]]
-mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
+mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
   where
-    inp = toToken (n - 1) <$> input
-    t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-          -- NP: here we use the entropy to split
-          -- instead we should use either:
-          --   info_norm_entropy or info_norm_entropy'
-          -- However they should first be fixed.
+    inp = toToken <$> input
+    t = normalizeEntropy info_entropy_var set_autonomy
+      . evTrie identity set_entropy_var
+      . entropyTrie isTerminal
+      $ buildTries n inp
 
-testEleve :: Bool -> Int -> [Text] -> IO Bool
-testEleve debug n output = do
+---------------------------------------------
+type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
+
+testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
+testEleve debug n output checks = do
   let
-    out      = T.words <$> output
-    expected = fmap (T.splitOn "-") <$> out
-    input    = (T.splitOn "-" =<<) <$> out
-    inp      = toToken (n - 1) <$> input
-    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-    nt       = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
-    nt'      = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
-    pss      = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
-               | ps <- L.nub $ [ c | m <- [1..n], cs <- chunkAlong m 1 <$> inp, c <- cs ]
-               ]
-    res = map unToken . split identity t <$> inp
+    res = split n info_autonomy nt <$> inp
   when debug $ do
-    P.putStrLn (show input)
-    mapM_ (P.putStrLn . show) pss
-    P.putStrLn $ Tree.drawTree
-               $ fmap show
-               $ toTree (NonTerminal "") nt'
+    P.putStrLn $ show input
+    P.putStrLn ""
+    printTrie info_entropy nt
+    P.putStrLn ""
+    P.putStrLn "Splitting:"
     P.putStrLn $ show res
+  forM_ checks checker
   pure $ expected == res
+  where
+    out      = T.words <$> output
+    expected = fmap (T.splitOn "-") <$> out
+    input    = (T.splitOn "-" =<<) <$> out
+    inp      = toToken <$> input
+
+    nt :: Tries Token (I Double)
+    nt = normalizeEntropy info_entropy_var set_autonomy
+       . evTrie identity set_entropy_var
+       . entropyTrie isTerminal
+       $ buildTries n inp
+
+    check f msg ref my =
+      if f ref my
+        then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
+        else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
+
+    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
+      let ns  = parseToken <$> T.words ngram
+          nt' = findTrie ns nt
+
+      P.putStrLn $ " " <> T.unpack ngram <> ":"
+      check (==) "count"        count        (_node_count (_fwd nt'))
+      check sim  "entropy"      entropy      (nodeEntropy info_entropy     nt')
+      check sim  "ev"           ev           (nodeEntropy info_entropy_var nt')
+      check sim  "autonomy"     autonomy     (nodeEntropy info_autonomy    nt')
+      check sim  "fwd_entropy"  fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
+      check sim  "fwd_ev"       fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
+      check sim  "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))
+      check sim  "bwd_entropy"  bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
+      check sim  "bwd_ev"       bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
+      check sim  "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))
 
 -- | TODO real data is a list of tokenized sentences
-example0, example1, example2, example3, example4, example5 :: [Text]
+example0, example1, example2, example3, example4, example5, example6 :: [Text]
 
 example0 = ["New-York is New-York and New-York"]
 example1 = ["to-be or not to-be"]
 example2 = ["to-be-or not to-be-or NOT to-be and"]
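
Side note (not part of the commit): with the new signatures, mainEleve n input consumes pre-tokenized sentences and returns, per sentence, the segments selected by the autonomy measure of the forward and backward tries. A hypothetical call (the sentence below is invented, only to show the shapes involved):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Text (Text)
    import Gargantext.Text.Eleve (mainEleve)

    -- One input sentence, already tokenized into words.
    demoSegments :: [[[Text]]]
    demoSegments = mainEleve 5 [["the", "cat", "sat", "on", "the", "mat"]]
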
@@ -283,18 +439,65 @@ example3 = example0 <> example0
 -- > TEST: Should not have York New in the trie
 example4 = ["a-b-c-d e a-b-c-d f"]
 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
+example6 = ["le-petit chat"
+           ,"le-petit chien"
+           ,"le-petit rat"
+           ,"le gros rat"
+           ]
+
+checks0, checks2 :: Checks Double
+
+checks0 =
+-- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
+  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
+  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
+  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
+  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
+  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
+  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
+  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
+  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
+  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
+  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
+  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
+  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
+  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
+  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
+  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
+  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
+  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
+  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
+  ]
+
+checks2 = []
+{-
+ [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
+ ,("be or", 2, 0.5, nan, nan, nan, 1.0)
+ ,("or not", 1, 0.0, nan, nan, nan, 0.0)
+ ,("not to", 1, 0.0, nan, nan, nan, 0.0)
+ ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
+ ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
+ ,("be and", 1, 0.0, nan, nan, nan, 0.0)
+ ]
+-}
 
 runTests :: IO ()
 runTests =
   forM_
-    [("example0", 2, example0)
-    ,("example1", 2, example1)
-    ,("example2", 3, example2)
-    ,("example3", 2, example3)
-    ,("example4", 4, example4)
-    ,("example5", 5, example5)
+    [("example0", 3, example0, checks0)
+    ,("example0", 2, example0, [])
+    ,("example1", 2, example1, [])
+    ,("example2", 3, example2, checks2)
+    ,("example3", 2, example3, [])
+    ,("example4", 4, example4, [])
+    ,("example5", 5, example5, [])
+    ,("example6", 2, example6, [])
     ]
-    (\(name, n, ex) -> do
-      b <- testEleve False n ex
-      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
+    (\(name, n, ex, checks) -> do
+      P.putStrLn $ name <> " " <> show n
+      b <- testEleve False n ex checks
+      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
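
Side note (not part of the commit): the Checks lists are consumed by runTests, which for each (name, n, example, checks) entry prints the example name and n, one PASS/FAIL line per checked field (count, entropy, ev, autonomy and their forward/backward variants), and a final "splitting: PASS/FAIL" line. A minimal driver, assuming the package's usual build environment:

    module Main where

    import Gargantext.Text.Eleve (runTests)

    main :: IO ()
    main = runTests
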