1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
{-|
Module : Gargantext.Core.Text
Description : Ngrams tools
Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Text gathers terms into units of context.
-}
module Gargantext.Core.Text
where
import Data.Text (split)
import Data.Text qualified as DT
import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment)
import Prelude qualified
-----------------------------------------------------------------
-- | Types from which a list of 'Text' values can be extracted.
class HasText h where
  -- | Return every piece of text carried by the given value.
  hasText :: h -> [Text]
-----------------------------------------------------------------
-- French words are used to tell the different levels of context apart.
-- | A whole text ("texte"); the 'Collage' instance in this module cuts it
-- into 'Paragraphe's at newlines.
newtype Texte = Texte Text
-- | A paragraph ("paragraphe"); the 'Collage' instance in this module cuts
-- it into 'Phrase's with 'sentences'.
newtype Paragraphe = Paragraphe Text
-- | A sentence ("phrase"); the 'Collage' instance in this module cuts it
-- into 'MultiTerme's on whitespace.
newtype Phrase = Phrase Text
-- | A multi-word term ("multi-terme"); the 'Collage' instance in this module
-- cuts it into 'Mot's on whitespace.
newtype MultiTerme = MultiTerme Text
-- | A single word ("mot").
newtype Mot = Mot Text
-- | A letter ("lettre"). NOTE(review): no 'Collage' instance for this type
-- is visible in this module; presumably it holds a single character, to be
-- confirmed against its users.
newtype Lettre = Lettre Text
-- | A title ("titre") is represented as a single sentence, i.e. it is a
-- plain synonym for 'Phrase'.
type Titre = Phrase
-----------------------------------------------------------------
-- | Shown exactly like the wrapped 'Text'; the 'Texte' constructor itself
-- is not displayed.
instance Prelude.Show Texte where
  show (Texte inner) = Prelude.show inner
-- | Shown exactly like the wrapped 'Text'; the 'Paragraphe' constructor
-- itself is not displayed.
instance Prelude.Show Paragraphe where
  show (Paragraphe inner) = Prelude.show inner
-- | Shown exactly like the wrapped 'Text'; the 'Phrase' constructor itself
-- is not displayed.
instance Prelude.Show Phrase where
  show (Phrase inner) = Prelude.show inner
-- | Shown exactly like the wrapped 'Text'; the 'MultiTerme' constructor
-- itself is not displayed.
instance Prelude.Show MultiTerme where
  show (MultiTerme inner) = Prelude.show inner
-- | Shown exactly like the wrapped 'Text'; the 'Mot' constructor itself is
-- not displayed.
instance Prelude.Show Mot where
  show (Mot inner) = Prelude.show inner
-- | Shown exactly like the wrapped 'Text'; the 'Lettre' constructor itself
-- is not displayed.
instance Prelude.Show Lettre where
  show (Lettre inner) = Prelude.show inner
-----------------------------------------------------------------
-- | Relates a larger unit of context (@sup@) to the smaller units (@inf@)
-- it is made of.
class Collage sup inf where
  -- | Cut a larger unit into its smaller units.
  dec :: sup -> [inf]

  -- | Assemble smaller units into one larger unit.
  inc :: [inf] -> sup
-- | A 'Texte' is cut into 'Paragraphe's at every newline and rebuilt by
-- joining the paragraphs with a newline.
instance Collage Texte Paragraphe where
  dec (Texte body) = map Paragraphe (DT.splitOn "\n" body)
  inc paragraphs = Texte (DT.intercalate "\n" [p | Paragraphe p <- paragraphs])
-- | A 'Paragraphe' is cut into 'Phrase's with 'sentences' and rebuilt by
-- joining the sentences with single spaces ('DT.unwords').
instance Collage Paragraphe Phrase where
  dec (Paragraphe body) = map Phrase (sentences body)
  inc phrases = Paragraphe (DT.unwords [p | Phrase p <- phrases])
-- | A 'Phrase' is cut into 'MultiTerme's on whitespace ('DT.words') and
-- rebuilt by joining the terms with single spaces ('DT.unwords').
instance Collage Phrase MultiTerme where
  dec (Phrase body) = map MultiTerme (DT.words body)
  inc terms = Phrase (DT.unwords [t | MultiTerme t <- terms])
-- | A 'MultiTerme' is cut into 'Mot's on whitespace ('DT.words') and rebuilt
-- by joining the words with a single space.
instance Collage MultiTerme Mot where
  dec (MultiTerme body) = map Mot (DT.words body)
  inc mots = MultiTerme (DT.intercalate " " [m | Mot m <- mots])
-------------------------------------------------------------------
-- Contexts of text
-- | Split a text into sentences using 'segment'.
--
-- The input is unpacked to a 'Prelude.String' for 'segment', and every
-- resulting piece is packed back into a 'Text'.
sentences :: Text -> [Text]
sentences = map DT.pack . segment . DT.unpack
-- | Alternative sentence splitter: cut the text at every character accepted
-- by 'isCharStop'.
--
-- The stop characters themselves are not part of the returned pieces.
sentences' :: Text -> [Text]
sentences' = DT.split isCharStop
-- | 'True' exactly for the characters that end a sentence here:
-- @'.'@, @'?'@ and @'!'@.
isCharStop :: Char -> Bool
isCharStop '.' = True
isCharStop '?' = True
isCharStop '!' = True
isCharStop _   = False
-- | Join sentences into a single text, separating them with one space.
--
-- An empty list yields the empty text.
unsentences :: [Text] -> Text
unsentences = DT.intercalate " "
-- | Ngrams size: one more than the number of space characters in the text.
--
-- Every single space counts as a separator, so consecutive, leading or
-- trailing spaces each add one; the empty text has size 1.
size :: Text -> Int
size = (1 +) . DT.count " "