Multi.hs 1.78 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
{-|
Module      : Gargantext.Text.Terms.Multi
Description : Multi Terms module
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Multi-terms are ngrams where n > 1.

-}

{-# LANGUAGE NoImplicitPrelude #-}

16
module Gargantext.Text.Terms.Multi (multiterms, multiterms_rake)
17 18
  where

19 20
import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
21
import qualified Data.Set as S
22 23 24 25 26 27

import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types

import Gargantext.Text.Terms.Multi.PosTagging
28
import Gargantext.Text.Terms.Mono.Stem (stem)
29 30 31
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr

32 33
import Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)

34 35
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat
36
                   <$> map (map (tokenTag2terms lang))
37 38
                   <$> map (filter (\t -> _my_token_pos t == Just NP)) 
                   <$> tokenTags lang txt
39

40 41 42 43
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag w t _ _) =  Terms w t'
  where
    t' = S.fromList $ map (stem lang) $ S.toList t
44

45 46
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s
47

48 49 50

tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t =  map tokens2tokensTags
51 52 53 54 55 56 57 58 59
                     <$> map _sentenceTokens
                     <$> _sentences
                     <$> corenlp lang t

---- | This function analyses and groups (or not) ngrams according to
----   specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
60
-- group _  = panic $ pack "group :: Lang not implemeted yet"