Terms.hs 5.85 KB
Newer Older
1
{-|
2
Module      : Gargantext.Core.Text.Ngrams
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
Description : Ngrams definition and tools
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

An @n-gram@ is a contiguous sequence of n items from a given sample of
text. In Gargantext application the items are words, n is a non negative
integer.

Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
"unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
"four-gram", "five-gram", and so on.

Source: https://en.wikipedia.org/wiki/Ngrams

TODO
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups

compute cooccurrences
compute graph

-}

31
{-# LANGUAGE TemplateHaskell   #-}
32
{-# LANGUAGE ConstrainedClassMethods #-}
33

34
module Gargantext.Core.Text.Terms
35 36
  where

37
import Control.Lens
38 39
import Data.Map (Map)
import qualified Data.Map as Map
40
import Data.Text (Text)
41
import Data.Traversable
42 43 44
import qualified Data.List as List
import qualified Data.Set  as Set
import qualified Data.Text as Text
45
import GHC.Base (String)
46
import GHC.Generics (Generic)
47

48
import Gargantext.Core
49
import Gargantext.Core.Flow.Types
50 51 52 53 54 55
import Gargantext.Core.Text (sentences, HasText(..))
import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
import Gargantext.Core.Text.Terms.Mono  (monoTerms)
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
import Gargantext.Core.Text.Terms.Multi (multiterms)
56 57
import Gargantext.Core.Types
import Gargantext.Database.Prelude (Cmd)
58
import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams)
59
import Gargantext.Prelude
60

61

62
data TermType lang
63 64 65 66 67 68 69
  = Mono      { _tt_lang :: !lang }
  | Multi     { _tt_lang :: !lang }
  | MonoMulti { _tt_lang :: !lang }
  | Unsupervised { _tt_lang  :: !lang
                 , _tt_windowSize :: !Int
                 , _tt_ngramsSize :: !Int
                 , _tt_model      :: !(Maybe (Tries Token ()))
70 71
                 }
  deriving Generic
72

73
makeLenses ''TermType
74 75
--group :: [Text] -> [Text]
--group = undefined
76

77 78
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $ 
79
------------------------------------------------------------------------
80
-- | Sugar to extract terms from text (hiddeng mapM from end user).
81 82
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
83

84
extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
85
  where
86 87 88 89
    m' = case m of
      Just m''-> m''
      Nothing -> newTries n (Text.intercalate " " xs)

90
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
91

92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110

------------------------------------------------------------------------
withLang :: HasText a
         => TermType Lang
         -> [DocumentWithId a]
         -> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  where
    m' = case m of
      Nothing -> -- trace ("buildTries here" :: String)
               Just $ buildTries n ( fmap toToken
                                   $ uniText
                                   $ Text.intercalate " . "
                                   $ List.concat
                                   $ map hasText ns
                                   )
      just_m -> just_m
withLang l _ = l

111
------------------------------------------------------------------------
112 113 114 115 116 117 118 119 120
class ExtractNgramsT h
  where
    extractNgramsT :: HasText h
                   => TermType Lang
                   -> h
                   -> Cmd err (Map Ngrams (Map NgramsType Int))

filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
                     -> Map Ngrams (Map NgramsType Int)
121
filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
122
  where
123 124 125
    filter' (ng,y)
      | Text.length (ng ^. ngramsTerms) < s = (ng,y)
      | otherwise                           = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
126 127 128 129


-- =======================================================

130 131 132 133 134
-- | Terms from Text
-- Mono : mono terms
-- Multi : multi terms
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
135
terms :: TermType Lang -> Text -> IO [Terms]
136
terms (Mono      lang) txt = pure $ monoTerms lang txt
137 138
terms (Multi     lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
139
terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
140 141
  where
    m' = maybe (newTries n txt) identity m
142
-- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
143 144
------------------------------------------------------------------------

145 146 147 148
text2term :: Lang -> [Text] -> Terms
text2term _ [] = Terms [] Set.empty
text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)

149 150 151 152 153 154 155
isPunctuation :: Text -> Bool
isPunctuation x = List.elem x $  (Text.pack . pure)
                             <$> ("!?(),;." :: String)

-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: remove IO
156
-- TODO: newtype BlockText
157 158 159 160 161 162

type WindowSize = Int
type MinNgramSize = Int

termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
termsUnsupervised (Unsupervised l n s m) =
163 164 165
               pure
             . map (text2term l)
             . List.nub
Alexandre Delanoë's avatar
Alexandre Delanoë committed
166
             . (List.filter (\l' -> List.length l' >= s))
Nicolas Pouillard's avatar
Nicolas Pouillard committed
167
             . List.concat
168
             . mainEleveWith (maybe (panic "no model") identity m) n
169
             . uniText
170
termsUnsupervised _ = undefined
171 172 173 174

newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t)

Alexandre Delanoë's avatar
Alexandre Delanoë committed
175
-- | TODO removing long terms > 24
176
uniText :: Text -> [[Text]]
Alexandre Delanoë's avatar
Alexandre Delanoë committed
177
uniText = map (List.filter (not . isPunctuation))
178
        . map tokenize
179
        . sentences       -- TODO get sentences according to lang
180
        . Text.toLower
181