Terms.hs 7.6 KB
Newer Older
1
{-|
2
Module      : Gargantext.Core.Text.Ngrams
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
Description : Ngrams definition and tools
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

An @n-gram@ is a contiguous sequence of n items from a given sample of
text. In Gargantext application the items are words, n is a non negative
integer.

Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
"unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
"four-gram", "five-gram", and so on.

Source: https://en.wikipedia.org/wiki/Ngrams

TODO
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups

compute cooccurrences
compute graph

-}

31
{-# LANGUAGE TemplateHaskell   #-}
32
{-# LANGUAGE ConstrainedClassMethods #-}
33

34
module Gargantext.Core.Text.Terms
35 36
  where

37
import Control.Lens
38
import Data.HashMap.Strict (HashMap)
39
import Data.Hashable (Hashable)
40
import Data.Map (Map)
41
import Data.Text (Text)
42
import Data.Traversable
43 44
import GHC.Base (String)
import GHC.Generics (Generic)
45 46 47
import qualified Data.List           as List
import qualified Data.Set            as Set
import qualified Data.Text           as Text
48
import qualified Data.HashMap.Strict as HashMap
49
import Gargantext.Core
50 51 52 53 54 55
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono  (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
56 57
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd)
58
import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
59
import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
60
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
61
import Gargantext.Prelude
62

63
data TermType lang
64 65 66
  = Mono      { _tt_lang :: !lang }
  | Multi     { _tt_lang :: !lang }
  | MonoMulti { _tt_lang :: !lang }
Alexandre Delanoë's avatar
Alexandre Delanoë committed
67
  | Unsupervised { _tt_lang       :: !lang
68 69 70
                 , _tt_windowSize :: !Int
                 , _tt_ngramsSize :: !Int
                 , _tt_model      :: !(Maybe (Tries Token ()))
71
                 }
72
  deriving (Generic)
73

74
makeLenses ''TermType
75 76
--group :: [Text] -> [Text]
--group = undefined
77

78 79
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $ 
80
------------------------------------------------------------------------
81
-- | Sugar to extract terms from text (hiddeng mapM from end user).
82 83
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
84

85
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
86
  where
87
    m' = case _tt_model of
88
      Just m''-> m''
89
      Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
90

91
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
92

93 94

------------------------------------------------------------------------
95
withLang :: (Foldable t, Functor t, HasText h)
96
         => TermType Lang
97
         -> t h
98
         -> TermType Lang
99
withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
100
  where
101
    m' = case _tt_model of
102
      Nothing -> -- trace ("buildTries here" :: String)
103 104 105 106 107 108
               Just $ buildTries _tt_ngramsSize
                    $ fmap toToken
                    $ uniText
                    $ Text.intercalate " . "
                    $ List.concat
                    $ map hasText ns
109 110 111
      just_m -> just_m
withLang l _ = l

112
------------------------------------------------------------------------
113 114
data ExtractedNgrams = SimpleNgrams   { unSimpleNgrams   :: Ngrams       }
                     | EnrichedNgrams { unEnrichedNgrams :: NgramsPostag }
115
  deriving (Eq, Ord, Generic, Show)
116 117 118

instance Hashable ExtractedNgrams

119 120 121 122 123
class ExtractNgramsT h
  where
    extractNgramsT :: HasText h
                   => TermType Lang
                   -> h
124 125
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
------------------------------------------------------------------------
126 127 128 129 130 131
enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
enrichedTerms l pa po (Terms ng1 ng2) =
  NgramsPostag l pa po form lem
    where
      form = text2ngrams $ Text.intercalate " " ng1
      lem  = text2ngrams $ Text.intercalate " " $ Set.toList ng2
132

133
------------------------------------------------------------------------
134 135 136 137 138
cleanNgrams :: Int -> Ngrams -> Ngrams
cleanNgrams s ng 
      | Text.length (ng ^. ngramsTerms) < s = ng
      | otherwise                           = text2ngrams (Text.take s (ng ^. ngramsTerms))

139
cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
140 141 142
cleanExtractedNgrams s (SimpleNgrams   ng) = SimpleNgrams $ (cleanNgrams s) ng
cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
                                                            $ over np_lem  (cleanNgrams s) ng
143

144
extracted2ngrams :: ExtractedNgrams -> Ngrams
145 146
extracted2ngrams (SimpleNgrams   ng) = ng
extracted2ngrams (EnrichedNgrams ng) = view np_form ng
147

148
---------------------------
149 150 151
insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
insertExtractedNgrams ngs = do
  let (s, e) = List.partition isSimpleNgrams ngs
152
  m1 <- insertNgrams       (map unSimpleNgrams   s)
153
  --printDebug "others" m1
154 155
  
  m2 <- insertNgramsPostag (map unEnrichedNgrams e)
156
  --printDebug "terms" m2
157
 
158
  let result = HashMap.union m1 m2
159
  pure result
160

161 162 163
isSimpleNgrams :: ExtractedNgrams -> Bool
isSimpleNgrams (SimpleNgrams _) = True
isSimpleNgrams _                = False
164

165
------------------------------------------------------------------------
166 167 168 169 170
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
171
terms :: TermType Lang -> Text -> IO [Terms]
172
terms (Mono      lang) txt = pure $ monoTerms lang txt
173 174
terms (Multi     lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
175
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
176
  where
177
    m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
178
-- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
Alexandre Delanoë's avatar
Alexandre Delanoë committed
179 180


181
------------------------------------------------------------------------
182 183 184
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
185
-- TODO: newtype BlockText
186 187 188 189 190 191

type WindowSize = Int
type MinNgramSize = Int

termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
192 193 194
               pure
             . map (text2term l)
             . List.nub
Alexandre Delanoë's avatar
Alexandre Delanoë committed
195
             . (List.filter (\l' -> List.length l' >= s))
Nicolas Pouillard's avatar
Nicolas Pouillard committed
196
             . List.concat
197
             . mainEleveWith (maybe (panic "no model") identity m) n
198
             . uniText
199
termsUnsupervised _ = undefined
200

201 202


203 204 205
newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)

Alexandre Delanoë's avatar
Alexandre Delanoë committed
206
-- | TODO removing long terms > 24
207
uniText :: Text -> [[Text]]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
208
uniText = map (List.filter (not . isPunctuation))
209
        . map tokenize
210
        . sentences       -- TODO get sentences according to lang
211
        . Text.toLower
212

213 214 215 216 217 218
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)

isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $  (Text.pack . pure)
219
                             <$> ("!?(),;.:" :: String)
220 221