[FEAT/Tokenizer] adding a great tokenizer thanks to Grzegorz Chrupała.

50d95f87 · Alexandre Delanoë · 02202afe · 50d95f87
Commit 50d95f87 authored 6 years ago by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 167 additions and 0 deletions

Text.hs src/Gargantext/Ngrams/Token/Text.hs +167 -0

No files found.
--- a/src/Gargantext/Ngrams/Token/Text.hs
+++ b/src/Gargantext/Ngrams/Token/Text.hs
+{-|
+Module      : Gargantext.Ngrams.Token.Text
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Inspired from https://bitbucket.org/gchrupala/lingo/overview
+-}
+
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Ngrams.Token.Text
+    ( EitherList(..)
+    , Tokenizer
+    , tokenize
+    , run
+    , defaultTokenizer
+    , whitespace
+    , uris
+    , punctuation
+    , finalPunctuation
+    , initialPunctuation
+    , allPunctuation
+    , contractions
+    , negatives
+    )
+where
+
+import qualified Data.Char as Char
+import Data.Maybe
+import Control.Monad.Instances ()
+import Control.Applicative
+import Control.Monad
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+-- | A Tokenizer is function which takes a list and returns a list of Eithers
+--  (wrapped in a newtype). Right Texts will be passed on for processing
+--  to tokenizers down
+--  the pipeline. Left Texts will be passed through the pipeline unchanged.
+--  Use a Left Texts in a tokenizer to protect certain tokens from further 
+--  processing (e.g. see the 'uris' tokenizer). 
+--  You can define your own custom tokenizer pipelines by chaining tokenizers together:
+---
+-- > myTokenizer :: Tokenizer 
+-- > myTokenizer = whitespace >=> allPunctuation
+---
+
+type Tokenizer =  Text -> EitherList Text Text
+
+-- | The EitherList is a newtype-wrapped list of Eithers.
+newtype EitherList a b =  E { unE :: [Either a b] }
+
+-- | Split string into words using the default tokenizer pipeline 
+tokenize :: Text -> [Text]
+tokenize = run defaultTokenizer
+
+-- | Run a tokenizer
+run :: Tokenizer -> (Text -> [Text])
+run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
+
+defaultTokenizer :: Tokenizer
+defaultTokenizer =     whitespace 
+                   >=> uris 
+                   >=> punctuation 
+                   >=> contractions 
+                   >=> negatives 
+
+-- | Detect common uris and freeze them
+uris :: Tokenizer
+uris x | isUri x = E [Left x]
+       | True    = E [Right x]
+    where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
+
+-- | Split off initial and final punctuation
+punctuation :: Tokenizer 
+punctuation = finalPunctuation >=> initialPunctuation
+
+hyphens :: Tokenizer
+hyphens xs = E [Right w | w <- T.split (=='-') xs ]
+
+-- | Split off word-final punctuation
+finalPunctuation :: Tokenizer
+finalPunctuation x = E $ filter (not . T.null . unwrap) res
+  where
+    res :: [Either Text Text]
+    res = case T.span Char.isPunctuation (T.reverse x) of
+      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
+              | otherwise -> [ Right $ T.reverse w
+                             , Right $ T.reverse ps]
+      -- ([],w) -> [Right . T.reverse $ w]
+      -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
+
+-- | Split off word-initial punctuation
+initialPunctuation :: Tokenizer
+initialPunctuation x = E $ filter (not . T.null . unwrap) $
+    case T.span Char.isPunctuation x of
+      (ps,w) | T.null ps -> [ Right w ]
+             | otherwise -> [ Right ps
+                            , Right w ]
+
+-- | Split tokens on transitions between punctuation and
+-- non-punctuation characters. This tokenizer is not included in
+-- defaultTokenizer pipeline because dealing with word-internal
+-- punctuation is quite application specific.
+allPunctuation :: Tokenizer
+allPunctuation = E . map Right 
+                 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b) 
+
+-- | Split words ending in n't, and freeze n't 
+negatives :: Tokenizer
+negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
+                                         , Left "n't" ]
+            | True                   = E [ Right x ]
+
+-- | Split common contractions off and freeze them.
+-- | Currently deals with: 'm, 's, 'd, 've, 'll
+contractions :: Tokenizer
+contractions x = case catMaybes . map (splitSuffix x) $ cts of
+                   [] -> return x
+                   ((w,s):_) -> E [ Right w,Left s]
+    where cts = ["'m","'s","'d","'ve","'ll"]
+          splitSuffix w sfx = 
+              let w' = T.reverse w
+                  len = T.length sfx
+              in if sfx `T.isSuffixOf` w 
+                 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
+                 else Nothing
+
+
+-- | Split string on whitespace. This is just a wrapper for Data.List.words
+whitespace :: Tokenizer
+whitespace xs = E [Right w | w <- T.words xs ]
+
+instance Monad (EitherList a) where
+    return x = E [Right x]
+    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
+
+instance Applicative (EitherList a) where
+    pure x = return x
+    f <*> x = f `ap` x
+
+instance Functor (EitherList a) where
+    fmap f (E xs) = E $ (fmap . fmap) f xs
+
+unwrap :: Either a a -> a
+unwrap (Left x) = x
+unwrap (Right x) = x
+
+examples :: [Text]
+examples = 
+    ["This shouldn't happen."
+    ,"Some 'quoted' stuff"
+    ,"This is a URL: http://example.org."
+    ,"How about an email@example.com"
+    ,"ReferenceError #1065 broke my debugger!"
+    ,"I would've gone."
+    ,"They've been there."
+    ,"Hyphen-words"
+    ,"Yes/No questions"
+    ]
+