1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
{-|
Module : Gargantext.Text.Context
Description : How to manage contexts of texts ?
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Tool to manage the contexts of texts. Here is the logic of the main types:
- Term
- Multi-term
- Label
- Sentence
- Corpus
How to split contexts is described in this module.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Context
where
import Data.Text (Text, pack, unpack)
import Data.String (IsString)
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
import Gargantext.Text
import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
-- | A single term, represented as a 'Text'.
type Term = Text
-- | A multi-term: a list of 'Term's.
type MultiTerm = [Term]
-- | A label; it has the same representation as a 'MultiTerm'.
type Label = MultiTerm
-- | A list pairing each 'Label' with a list of 'MultiTerm's.
-- NOTE(review): presumably the multi-terms grouped under that label — confirm
-- against the callers.
type TermList = [(Label, [MultiTerm])]
-- | A sentence: a list of elements of type @a@.
type Sentence a = [a] -- or a nominal group
-- | A corpus: a list of 'Sentence's over the same element type.
type Corpus a = [Sentence a] -- a list of sentences
-- Alternative modelling with an intermediate context level, currently disabled:
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
-- | Kind of context that 'splitBy' uses to cut a text into pieces.
data SplitContext
  = Chars Int
    -- ^ Character contexts: 'splitBy' builds its chunks with a size of
    -- @n + 1@ characters.
  | Sentences Int
    -- ^ Sentence contexts: 'splitBy' builds its chunks with a size of
    -- @n + 1@ sentences.
  | Paragraphs Int
    -- ^ Paragraph contexts: 'splitBy' currently ignores the 'Int' and
    -- returns the text nodes of the parsed markup.
-- | Parse a 'Text' into its list of TagSoup tags; a thin wrapper around
-- 'parseTags' specialised to 'Text'.
tag :: Text -> [Tag Text]
tag markup = parseTags markup
-- | Split a text into contexts of characters, sentences or paragraphs.
--
-- * @Chars n@: the characters of the text are grouped with
--   @chunkAlong (n + 1) 1@ and each group is packed back into a 'Text'
--   (see the examples below for the resulting overlapping windows).
-- * @Sentences n@: the same @chunkAlong (n + 1) 1@ grouping is applied to the
--   result of 'sentences', and each group is joined with 'unsentences'.
-- * @Paragraphs _@: the 'Int' is ignored; the text is parsed with 'tag' and
--   the content of every text tag is returned.
--
-- To see some examples at a higher level (sentences and paragraph), see
-- 'Gargantext.Text.Examples.ex_terms'
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy ctx txt = case ctx of
  Chars n      -> [ pack window        | window <- chunkAlong (n + 1) 1 (unpack txt) ]
  Sentences n  -> [ unsentences window | window <- chunkAlong (n + 1) 1 (sentences txt) ]
  Paragraphs _ -> [ unTag node         | node <- tag txt, isTagText node ]
  where
    -- Content of a text tag. The fallback is never reached from 'splitBy'
    -- because only tags accepted by 'isTagText' are passed in.
    unTag :: IsString p => Tag p -> p
    unTag (TagText content) = content
    unTag _                 = ""