Commit e5871a7d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLEAN] improving groups with lemma (WIP)

parent 66ca6fd7
Pipeline #1169 failed with stage
......@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
import qualified Data.Set as S
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
......@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat
<$> map (map (tokenTag2terms lang))
<$> map (map tokenTag2terms)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
where
t' = S.fromList $ map (stem lang) $ S.toList t
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s
tokenTags lang s = map (groupTokens lang) <$> tokenTags' lang s
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
......@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
group _ = panic $ pack "group :: Lang not implemeted yet"
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"
......@@ -13,7 +13,7 @@ the tokens into extracted terms.
-}
module Gargantext.Core.Text.Terms.Multi.Lang.En (group)
module Gargantext.Core.Text.Terms.Multi.Lang.En (groupTokens)
where
import Gargantext.Prelude
......@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group
------------------------------------------------------------------------
-- | Rule grammar to group tokens
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
-- $ group2 NP IN
$ group2 IN DT
-- $ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
groupTokens :: [TokenTag] -> [TokenTag]
groupTokens [] = []
groupTokens ntags = group2 NP NP
$ group2 NP VB
-- $ group2 NP IN
$ group2 IN DT
-- $ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
......
......@@ -14,16 +14,16 @@ is ADJectiv in french.
-}
module Gargantext.Core.Text.Terms.Multi.Lang.Fr (group)
module Gargantext.Core.Text.Terms.Multi.Lang.Fr (groupTokens)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.Group (group2)
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
groupTokens :: [TokenTag] -> [TokenTag]
groupTokens [] = []
groupTokens ntags = group2 NP NP
$ group2 NP VB
-- group2 NP IN
-- group2 IN DT
......
......@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
tokenTag :: Token -> TokenTag
tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n
tokenTag (Token _ _ w l _ _ p n _ _) = TokenTag w' l' p n
where
w' = split w
s' = fromList (split s)
l' = fromList (split l)
split = splitOn (pack " ") . toLower
filter' :: [TokenTag] -> [TokenTag]
......@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int
data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token]
} deriving (Show, Generic)
......@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences)
corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString) =>
Lang -> p -> IO (Response a)
, ConvertibleStrings p ByteString
)
=> Lang -> p -> IO (Response a)
corenlp' lang txt = do
let properties = case lang of
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
......@@ -142,9 +143,9 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
<$> map _sentenceTokens
<$> _sentences
<$> _sentences
<$> corenlp lang s
......@@ -109,10 +109,10 @@ instance FromJSON NER where
instance ToJSON NER
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_lemma :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
instance Semigroup TokenTag where
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment