-- Multi.hs

{-|
Module      : Gargantext.Text.Terms.Multi
Description : Multi Terms module
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Multi-terms are ngrams where n > 1.

-}

{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Terms.Multi (multiterms, multiterms_rake)
  where

import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
import qualified Data.Set as S

import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types

import Gargantext.Text.Terms.Multi.PosTagging
import Gargantext.Text.Terms.Mono.Stem (stem)
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr

import Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)

-- | Extract the multi-terms of a text.
--
-- The text is first tagged and grouped by 'tokenTags' (one list of
-- 'TokenTag' per sentence).  In every sentence only the tokens whose
-- '_my_token_pos' is @Just NP@ are kept; each of them is converted with
-- 'tokenTag2terms' and the per-sentence results are flattened into a
-- single list.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = allTerms <$> tokenTags lang txt
  where
    -- Flatten the terms found in every sentence.
    allTerms sentences = concat $ map sentenceTerms sentences

    -- Terms of a single sentence: keep the NP tokens, then convert them.
    sentenceTerms sentence =
      map (tokenTag2terms lang) $ filter isNP sentence

    isNP tag = _my_token_pos tag == Just NP

-- | Build a 'Terms' value from a 'TokenTag'.
--
-- The first field of the 'TokenTag' is reused unchanged; every element of
-- its second field (a set) is passed through 'stem' for the given language.
-- The two remaining fields are ignored.
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag ws stems _ _) = Terms ws stemmed
  where
    stemmed = S.map (stem lang) stems

-- | Tag a text with 'tokenTags'' and then apply the language-specific
-- 'group' function to the tokens of every sentence.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = groupAll <$> tokenTags' lang s
  where
    groupAll sentences = map (group lang) sentences


-- | Run 'corenlp' on the text and turn its result into one list of
-- 'TokenTag' per sentence (no grouping is applied here).
--
-- For each element of '_sentences', the '_sentenceTokens' are converted
-- with 'tokens2tokensTags'.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = sentencesTags <$> corenlp lang t
  where
    sentencesTags parsed = map sentenceTags $ _sentences parsed
    sentenceTags sentence = tokens2tokensTags $ _sentenceTokens sentence

-- | Analyse and group (or not) the ngrams of a sentence according to the
--   grammar of the given language, by dispatching to the matching
--   language module ('En.group' or 'Fr.group').
--
--   Only 'EN' and 'FR' are handled; the catch-all alternative below is
--   intentionally left disabled, as in the original code.
group :: Lang -> [TokenTag] -> [TokenTag]
group lang = case lang of
  EN -> En.group
  FR -> Fr.group
  -- _  -> panic $ pack "group :: Lang not implemented yet"