{-|
Module      : Gargantext.Text.Terms.Mono.Token
Description : Tokens and tokenizing a text
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In computer science, lexical analysis, lexing or tokenization is the
process of converting a sequence of characters (such as in a computer
program or web page) into a sequence of tokens (strings with an assigned
and thus identified meaning).
Source: https://en.wikipedia.org/wiki/Tokenize
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono.Token (tokenize)
where
import Data.Text (Text)
import qualified Gargantext.Text.Terms.Mono.Token.En as En
-- | Contexts depend on the language.
--import Gargantext.Core (Lang(..))
type Token = Text
-- | Split a 'Text' into a list of 'Token's.
--
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose","."]
tokenize :: Text -> [Token]
tokenize = En.tokenize
--data Context = Letter | Word | Sentence | Line | Paragraph
--
--tokenize' :: Lang -> Context -> Text -> [Token]
--tokenize' = undefined
--
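-- A hypothetical sketch of how 'tokenize'' might replace the 'undefined'
-- stub above, assuming the commented-out 'Lang' import is restored (with
-- an 'EN' constructor) and an extra @import qualified Data.Text as DT@;
-- splitting sentences on ". " is a simplification for illustration only,
-- not the project's settled design.
--
--tokenize' EN Word     = En.tokenize
--tokenize' _  Line     = DT.lines
--tokenize' _  Sentence = DT.splitOn (DT.pack ". ")
--tokenize' _  _        = undefined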