Commit e5871a7d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLEAN] improving groups with lemma (WIP)

parent 66ca6fd7
Pipeline #1169 failed with stage
...@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake) ...@@ -17,14 +17,12 @@ module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
import Data.Text hiding (map, group, filter, concat) import Data.Text hiding (map, group, filter, concat)
import Data.List (concat) import Data.List (concat)
import qualified Data.Set as S
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.PosTagging import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Mono.Stem (stem)
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
...@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake) ...@@ -32,17 +30,15 @@ import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
multiterms :: Lang -> Text -> IO [Terms] multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat multiterms lang txt = concat
<$> map (map (tokenTag2terms lang)) <$> map (map tokenTag2terms)
<$> map (filter (\t -> _my_token_pos t == Just NP)) <$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt <$> tokenTags lang txt
tokenTag2terms :: Lang -> TokenTag -> Terms tokenTag2terms :: TokenTag -> Terms
tokenTag2terms lang (TokenTag w t _ _) = Terms w t' tokenTag2terms (TokenTag ws t _ _) = Terms ws t
where
t' = S.fromList $ map (stem lang) $ S.toList t
tokenTags :: Lang -> Text -> IO [[TokenTag]] tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s tokenTags lang s = map (groupTokens lang) <$> tokenTags' lang s
tokenTags' :: Lang -> Text -> IO [[TokenTag]] tokenTags' :: Lang -> Text -> IO [[TokenTag]]
...@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags ...@@ -53,7 +49,7 @@ tokenTags' lang t = map tokens2tokensTags
---- | This function analyses and groups (or not) ngrams according to ---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language. ---- specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag] groupTokens :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group groupTokens EN = En.groupTokens
group FR = Fr.group groupTokens FR = Fr.groupTokens
group _ = panic $ pack "group :: Lang not implemeted yet" groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"
...@@ -13,7 +13,7 @@ the tokens into extracted terms. ...@@ -13,7 +13,7 @@ the tokens into extracted terms.
-} -}
module Gargantext.Core.Text.Terms.Multi.Lang.En (group) module Gargantext.Core.Text.Terms.Multi.Lang.En (groupTokens)
where where
import Gargantext.Prelude import Gargantext.Prelude
...@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group ...@@ -22,17 +22,17 @@ import Gargantext.Core.Text.Terms.Multi.Group
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Rule grammar to group tokens -- | Rule grammar to group tokens
group :: [TokenTag] -> [TokenTag] groupTokens :: [TokenTag] -> [TokenTag]
group [] = [] groupTokens [] = []
group ntags = group2 NP NP groupTokens ntags = group2 NP NP
$ group2 NP VB $ group2 NP VB
-- $ group2 NP IN -- $ group2 NP IN
$ group2 IN DT $ group2 IN DT
-- $ group2 VB NP -- $ group2 VB NP
$ group2 JJ NP $ group2 JJ NP
$ group2 JJ JJ $ group2 JJ JJ
$ group2 JJ CC $ group2 JJ CC
$ ntags $ ntags
------------------------------------------------------------------------ ------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs) --groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
......
...@@ -14,16 +14,16 @@ is ADJectiv in french. ...@@ -14,16 +14,16 @@ is ADJectiv in french.
-} -}
module Gargantext.Core.Text.Terms.Multi.Lang.Fr (group) module Gargantext.Core.Text.Terms.Multi.Lang.Fr (groupTokens)
where where
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.Group (group2) import Gargantext.Core.Text.Terms.Multi.Group (group2)
group :: [TokenTag] -> [TokenTag] groupTokens :: [TokenTag] -> [TokenTag]
group [] = [] groupTokens [] = []
group ntags = group2 NP NP groupTokens ntags = group2 NP NP
$ group2 NP VB $ group2 NP VB
-- group2 NP IN -- group2 NP IN
-- group2 IN DT -- group2 IN DT
......
...@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag] ...@@ -64,10 +64,10 @@ tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------ ------------------------------------------------------------------------
tokenTag :: Token -> TokenTag tokenTag :: Token -> TokenTag
tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n tokenTag (Token _ _ w l _ _ p n _ _) = TokenTag w' l' p n
where where
w' = split w w' = split w
s' = fromList (split s) l' = fromList (split l)
split = splitOn (pack " ") . toLower split = splitOn (pack " ") . toLower
filter' :: [TokenTag] -> [TokenTag] filter' :: [TokenTag] -> [TokenTag]
...@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs ...@@ -76,7 +76,7 @@ filter' xs = filter isNgrams xs
isNgrams (TokenTag _ _ p n) = isJust p || isJust n isNgrams (TokenTag _ _ p n) = isJust p || isJust n
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token] , _sentenceTokens :: [Token]
} deriving (Show, Generic) } deriving (Show, Generic)
...@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences) ...@@ -110,8 +110,9 @@ $(deriveJSON (unPrefix "_") ''PosSentences)
corenlp' :: ( FromJSON a corenlp' :: ( FromJSON a
, ConvertibleStrings p ByteString) => , ConvertibleStrings p ByteString
Lang -> p -> IO (Response a) )
=> Lang -> p -> IO (Response a)
corenlp' lang txt = do corenlp' lang txt = do
let properties = case lang of let properties = case lang of
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
...@@ -142,9 +143,9 @@ corenlp lang txt = do ...@@ -142,9 +143,9 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter." -- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]] -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]] tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
<$> map _sentenceTokens <$> map _sentenceTokens
<$> _sentences <$> _sentences
<$> corenlp lang s <$> corenlp lang s
...@@ -109,10 +109,10 @@ instance FromJSON NER where ...@@ -109,10 +109,10 @@ instance FromJSON NER where
instance ToJSON NER instance ToJSON NER
data TokenTag = TokenTag { _my_token_word :: [Text] data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text , _my_token_lemma :: Set Text
, _my_token_pos :: Maybe POS , _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER , _my_token_ner :: Maybe NER
} deriving (Show) } deriving (Show)
instance Semigroup TokenTag where instance Semigroup TokenTag where
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment