Context.hs 1.97 KB
Newer Older
Alexandre Delanoë's avatar
Alexandre Delanoë committed
1 2
{-|
Module      : Gargantext.Text.Context
3
Description : How to manage contexts of texts ?
Alexandre Delanoë's avatar
Alexandre Delanoë committed
4 5 6 7 8 9
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

10 11 12 13 14 15 16 17 18
Context of text management tool; here is the logic of the main types:

- Term
- Multi-term
- Label
- Sentence
- Corpus

How to split contexts is described in this module.
Alexandre Delanoë's avatar
Alexandre Delanoë committed
19 20 21 22 23 24

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

25 26
module Gargantext.Text.Context
  where
Alexandre Delanoë's avatar
Alexandre Delanoë committed
27

28
import Data.Text (Text, pack, unpack)
Alexandre Delanoë's avatar
Alexandre Delanoë committed
29 30
import Data.String (IsString)

31
import Text.HTML.TagSoup (parseTags, isTagText, Tag(..))
Alexandre Delanoë's avatar
Alexandre Delanoë committed
32 33 34
import Gargantext.Text
import Gargantext.Prelude hiding (length)

35 36
------------------------------------------------------------------------
-- | A single term, represented as 'Text'.
type Term = Text
Nicolas Pouillard's avatar
Nicolas Pouillard committed
37 38
-- | A multi-term: a list of 'Term's.
type MultiTerm = [Term]
-- | A label; it shares the representation of a 'MultiTerm'.
type Label = MultiTerm
39

Nicolas Pouillard's avatar
Nicolas Pouillard committed
40
-- | A list of entries, each pairing a 'Label' with a list of 'MultiTerm's.
-- NOTE(review): presumably the multi-terms are the variants grouped under
-- that label; confirm against callers.
type TermList = [(Label, [MultiTerm])]
41

42 43 44 45 46 47 48
-- | A sentence: a list of elements of type @a@.
type Sentence  a = [a] -- or a nominal group
-- | A corpus: a list of 'Sentence's over the same element type.
type Corpus    a = [Sentence a] -- a list of sentences

-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------

49
-- | Contexts definition to build/unbuild contexts.
50 51
--
-- The 'Int' payload is the context size @n@ read by 'splitBy': for 'Chars'
-- and 'Sentences' it passes @n + 1@ as the chunk size (so @Chars 1@ yields
-- overlapping pairs of characters), while for 'Paragraphs' the 'Int' is
-- currently ignored.
data SplitContext = Chars Int | Sentences Int | Paragraphs Int

52
-- | Turn a 'Text' into its list of TagSoup 'Tag's.
--
-- This is a direct, monomorphic alias of 'parseTags' fixed at 'Text'.
tag :: Text -> [Tag Text]
tag markup = parseTags markup
54

55
-- | Split a 'Text' into contexts, according to the given 'SplitContext'.
--
-- * @'Chars' n@: chunks of @n + 1@ characters, advancing one character at a
--   time (see the examples below).
-- * @'Sentences' n@: the same chunking applied to the output of 'sentences',
--   each chunk being joined back with 'unsentences'.
-- * @'Paragraphs' _@: the contents of the tags from 'tag' that satisfy
--   'isTagText'; the 'Int' is ignored.
--
-- To see some examples at a higher level (sentences and paragraphs), see
-- 'Gargantext.Text.Examples.ex_terms'.
--
-- >>> splitBy (Chars 0) (pack "abcde")
-- ["a","b","c","d","e"]
--
-- >>> splitBy (Chars 1) (pack "abcde")
-- ["ab","bc","cd","de"]
--
-- >>> splitBy (Chars 2) (pack "abcde")
-- ["abc","bcd","cde"]
splitBy :: SplitContext -> Text -> [Text]
splitBy context input = case context of
  Chars n      -> map pack (chunkAlong (n + 1) 1 (unpack input))
  Sentences n  -> map unsentences (chunkAlong (n + 1) 1 (sentences input))
  Paragraphs _ -> map textOf (filter isTagText (tag input))
  where
    -- Extract the payload of a text tag. The fallback is not expected to be
    -- reached here, since the list has already been filtered with
    -- 'isTagText'; it only keeps the helper total.
    textOf :: IsString s => Tag s -> s
    textOf node = case node of
      TagText body -> body
      _            -> ""
Alexandre Delanoë's avatar
Alexandre Delanoë committed
75 76