BreakWords.purs 2.62 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
-- | Break a string into words and spaces.
-- | It uses a simple algorithm that searches for word characters incrementally.
-- | Punctuation is treated as whitespace, so it is best used on a sentence or
-- | for highlighting purposes.
module Gargantext.Text.BreakWords (BrokenWord(..), breakWords) where

import Prelude
import Data.Traversable (traverse_)
import Effect (Effect)
import Data.Maybe (Maybe(..))
import Data.Unit (Unit, unit)
import Effect.Uncurried (EffectFn2, runEffectFn2)
import Data.Function.Uncurried (Fn1, runFn1)
import Data.String.CodeUnits (length, slice) -- TODO: double check i'm the right choice
import Data.Nullable (Nullable, toMaybe)
import Data.String.Regex (Regex)
import Gargantext.Utils.Regex
import Gargantext.Utils.Array (push)

-- | One fragment of a broken-up string.
-- |
-- | * `Word` carries text that was matched by the word regex.
-- | * `Space` carries the text found between (or after) matches.
data BrokenWord
  = Word String
  | Space String

-- | Split a string into `Word` and `Space` fragments.
-- |
-- | Builds a fresh `Breaking` state for the input and keeps stepping it with
-- | `breakNext` until that reports there is nothing left to do, then returns
-- | the fragments accumulated in the state.
breakWords :: String -> Effect (Array BrokenWord)
breakWords s = go (break s)
  where
    go :: Breaking -> Effect (Array BrokenWord)
    go state = do
      more <- breakNext state
      if more
        then go state
        else pure state.results

-- Implementation

-- | Perform one step of the breaking process.
-- |
-- | Reads the regex's current last index as the starting offset. If that
-- | offset is already at the end of the source there is nothing to do.
-- | Otherwise the regex is run once: a match is handed to `next`, a miss is
-- | handed to `finish`.
-- |
-- | Returns whether the caller should continue stepping.
breakNext :: Breaking -> Effect Boolean
breakNext b
  | start == length b.source = pure false
  | otherwise = do
      found <- search b
      case found of
        Nothing -> finish b start
        Just word -> next b start word
  where
    -- Offset at which this step begins, captured before the regex runs.
    start = lastIndex b

-- | Record a freshly matched word.
-- |
-- | Any text between `origin` and the start of `word` is pushed first as a
-- | `Space`, followed by the word itself. Always returns `true`, i.e. the
-- | caller should keep going.
next :: Breaking -> Int -> String -> Effect Boolean
next b origin word = do
  case preceding b origin word of
    Just gap -> pushSpace b gap
    Nothing -> pure unit
  pushWord b word
  pure true
46
    
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
-- | The text between `origin` and the start of the word that was just matched.
-- |
-- | The word's start is derived from the regex's last index minus the word's
-- | length. Yields `Nothing` when the word begins exactly at `origin`;
-- | otherwise yields whatever `slice` returns for that range.
preceding :: Breaking -> Int -> String -> Maybe String
preceding b origin word
  | wordStart == origin = Nothing
  | otherwise = slice origin wordStart b.source
  where
    wordStart = lastIndex b - length word

-- | Flush the remainder of the source once the regex finds no further word.
-- |
-- | Everything from `origin` to the end of the source is pushed as a single
-- | `Space`. Always returns `false`, i.e. the caller should stop.
finish :: Breaking -> Int -> Effect Boolean
finish b origin = do
  -- Slice up to the full length of the source. A negative end index counts
  -- back from the end of the string, so the previous `(-1)` silently dropped
  -- the final code unit of the trailing text (e.g. a closing full stop).
  let rest = slice origin (length b.source) b.source
  -- `slice` yields a `Maybe`; nothing is pushed if the range is rejected.
  traverse_ (pushSpace b) rest
  pure false
     
-- | Working state of one `breakWords` run: the string being broken, the regex
-- | used to find words in it, and the fragments collected so far.
type Breaking =
  { source :: String
  , wordRegex :: Regex
  , results :: Array BrokenWord
  }

-- | Build the initial `Breaking` state for a string: the string itself, a
-- | clone of `_wordRegex`, and an empty result array.
-- |
-- | Almost `pure`: NOTE(review) presumably the regex is cloned so that each
-- | run has its own match position - confirm against `cloneRegex`.
break :: String -> Breaking
break s =
  { source: s
  , wordRegex: cloneRegex _wordRegex
  , results: []
  }

-- | Run the state's word regex once against its source string, yielding the
-- | matched word if there is one.
search :: Breaking -> Effect (Maybe String)
search { wordRegex, source } = execRegex wordRegex source
     
-- | The current last index of the state's word regex.
lastIndex :: Breaking -> Int
lastIndex { wordRegex } = getRegexLastIndex wordRegex

-- | Push a fragment onto the state's result array.
pushResult :: Breaking -> BrokenWord -> Effect Unit
pushResult b fragment = push b.results fragment
  
-- | Push the given text onto the results as a `Space`.
pushSpace :: Breaking -> String -> Effect Unit
pushSpace b text = pushResult b (Space text)

-- | Push the given text onto the results as a `Word`.
pushWord :: Breaking -> String -> Effect Unit
pushWord b text = pushResult b (Word text)

-- | Regex used to find words, supplied by this module's foreign (FFI)
-- | companion. It is never executed directly here: `break` clones it for each
-- | run. NOTE(review): presumably a global/sticky pattern whose last index
-- | advances on each match - confirm in the foreign module.
foreign import _wordRegex :: Regex