{-|
Module      : Gargantext.Text.Terms.Mono.Token
Description : Tokens and tokenizing a text
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

In computer science, lexical analysis, lexing or tokenization is the
process of converting a sequence of characters (such as in a computer
program or web page) into a sequence of tokens (strings with an assigned
and thus identified meaning).
Source: https://en.wikipedia.org/wiki/Tokenize

-}

{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Terms.Mono.Token (Token, tokenize)
  where

import Data.Text (Text)
import qualified Gargantext.Text.Terms.Mono.Token.En as En

-- Contexts depend on the lang.
--import Gargantext.Core (Lang(..))

type Token = Text

-- | Tokenize a 'Text' into a list of 'Token's, delegating (for now) to
-- the English tokenizer whatever the language of the text.
--
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose","."]
tokenize :: Text -> [Token]
tokenize = En.tokenize

--data Context = Letter | Word | Sentence | Line | Paragraph
--
--tokenize' :: Lang -> Context -> Text -> [Token]
--tokenize' = undefined
--
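-- A minimal sketch of how 'tokenize'' could be filled in (an assumption,
-- not this project's implementation): dispatch on 'Context' using the
-- naive splitters from Data.Text, ignoring 'Lang' until language-aware
-- rules are plugged in. Assumes: import qualified Data.Text as T
--
--tokenize' :: Lang -> Context -> Text -> [Token]
--tokenize' _ Letter    = T.chunksOf 1
--tokenize' _ Word      = T.words
--tokenize' _ Sentence  = T.split (== '.')          -- naive sentence split
--tokenize' _ Line      = T.lines
--tokenize' _ Paragraph = T.splitOn (T.pack "\n\n")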