Commit 02e3e8d2 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] grouping ngrams better written (simplified) with semigroup. TODO: update the tests

parent c20d8f5f
......@@ -15,7 +15,6 @@ dependencies:
- text
library:
source-dirs: src
default-extensions:
ghc-options:
- -Wincomplete-uni-patterns
- -Wincomplete-record-updates
......@@ -33,6 +32,7 @@ library:
- Gargantext.Text.Metrics.FrequentItemSet
- Gargantext.Text.Ngrams.PosTagging.CoreNLP
- Gargantext.Text.Ngrams.PosTagging.Parser
- Gargantext.Text.Ngrams.Token.Text
- Gargantext.Text.Parsers.Date
- Gargantext.Database
- Gargantext.API
......@@ -54,10 +54,11 @@ library:
- data-time-segment
- directory
- duckling
- exceptions
- filepath
- fclabels
- fast-logger
# - haskell-gi-base
- full-text-search
- http-client
- http-client-tls
- http-conduit
......@@ -134,7 +135,6 @@ tests:
garg-test:
main: Main.hs
source-dirs: src-test
default-extensions:
ghc-options:
- -threaded
- -rtsopts
......
......@@ -46,9 +46,3 @@ ngramsExtractionTest = hspec $ do
t2 `shouldBe` [[("Donald Trump","NNP","PERSON"),("president of the United-States of America","NN","LOCATION")]]
......@@ -20,84 +20,37 @@ n non negative integer
module Gargantext.Text
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, filter, toLower, split, splitOn)
import qualified Data.Text as DT
--import Data.Text.IO (readFile)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
, lookupIndex
--, fromList, keys
)
import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl')
-----------------------------------------------------------------
import Gargantext.Text.Ngrams.Stem.En
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
import Gargantext.Prelude hiding (filter)
-----------------------------------------------------------------
data ListName = Stop | Candidate | Graph
deriving (Show, Eq)
data Ngroup = Ngroup { _ngroup_label :: Ngrams
, _ngroup_ngrams :: [Ngrams]
data Group = Group { _group_label :: Ngrams
, _group_ngrams :: [Ngrams]
} deriving (Show)
data Ngrams = Ngrams { _ngrams_label :: [Text]
, _ngrams_stem :: Set Text
} deriving (Show)
text2ngrams :: Text -> Ngrams
text2ngrams txt = Ngrams txt' (S.fromList $ map stem txt')
where
txt' = splitOn " " txt
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
type Occ = Int
--type Index = Int
-- Data Ngrams = Monograms | MultiGrams
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
monograms :: Text -> [Text]
monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (unionWith (+)) empty xs
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
......@@ -135,32 +88,35 @@ text2fis n xs = list2fis n (map ngrams xs)
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
---- | https://en.wikipedia.org/wiki/Text_mining
--testText :: Text
--testText = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
--
--
--
---- | Tests
----ngramsTest :: [Text]
--ngramsTest = ocs
-- where
-- --txt = concat <$> lines <$> clean <$> readFile filePath
-- txt = clean $ testText
-- -- | Number of sentences
-- ls = sentences $ txt
-- -- | Number of monograms used in the full text
-- ws = ngrams $ txt
-- -- | stem ngrams
-- -- TODO
-- -- group ngrams
-- ocs = occ $ ws
--
--
-- | https://en.wikipedia.org/wiki/Text_mining
testText_en :: Text
testText_en = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
-- | https://fr.wikipedia.org/wiki/Fouille_de_textes
testText_fr :: Text
testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances » dans les textes est une spécialisation de la fouille de données et fait partie du domaine de l'intelligence artificielle. Cette technique est souvent désignée sous l'anglicisme text mining. Elle désigne un ensemble de traitements informatiques consistant à extraire des connaissances selon un critère de nouveauté ou de similarité dans des textes produits par des humains pour des humains. Dans la pratique, cela revient à mettre en algorithme un modèle simplifié des théories linguistiques dans des systèmes informatiques d'apprentissage et de statistiques. Les disciplines impliquées sont donc la linguistique calculatoire, l'ingénierie des langues, l'apprentissage artificiel, les statistiques et l'informatique."
-- | Ngrams Test
-- >>> ngramsTest testText
-- 248
ngramsTest :: Text -> Int
ngramsTest x= length ws
where
--txt = concat <$> lines <$> clean <$> readFile filePath
txt = clean x
-- | Number of sentences
--ls = sentences $ txt
-- | Number of monograms used in the full text
ws = ngrams $ txt
-- | stem ngrams
-- TODO
-- group ngrams
--ocs = occ $ ws
......@@ -7,17 +7,34 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Token and occurrence
An occurrence is not necessarily a token. Considering the sentence:
"A rose is a rose is a rose". We may equally correctly state that there
are eight or three words in the sentence. There are, in fact, three word
types in the sentence: "rose", "is" and "a". There are eight word tokens
in a token copy of the line. The line itself is a type. There are not
eight word types in the line. It contains (as stated) only the three
word types, 'a', 'is' and 'rose', each of which is unique. So what do we
call what there are eight of? They are occurrences of words. There are
three occurrences of the word type 'a', two of 'is' and three of 'rose'.
Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.Occurrences where
module Gargantext.Text.Metrics.Occurrences
where
import Gargantext.Prelude
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
)
import Control.Monad ((>>),(>>=))
import Data.String (String())
import Data.Attoparsec.Text
......@@ -25,7 +42,18 @@ import Data.Text (Text)
import Data.Either.Extra(Either(..))
import qualified Data.Text as T
import Control.Applicative
import Control.Applicative hiding (empty)
-----------------------------------------------------------
type Occ = Int
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (unionWith (+)) empty xs
occurrenceParser :: Text -> Parser Bool
......
{-|
Module : Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Module : Gargantext.Text.Ngrams
Description : Ngrams definition and tools
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
An @n-gram@ is a contiguous sequence of n items from a given sample of
text. In Gargantext application the items are words, n is a non negative
integer.
Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
"unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
"four-gram", "five-gram", and so on.
Source: https://en.wikipedia.org/wiki/Ngrams
-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -15,3 +25,49 @@ Mainly reexport functions in @Data.Text.Metrics@
module Gargantext.Text.Ngrams
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack)
import Data.Set (Set)
import qualified Data.Set as S
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Text.Ngrams.Stem (stem)
data Ngrams = Ngrams { _ngrams_label :: [Text]
, _ngrams_stem :: Set Text
} deriving (Show)
data Terms = MonoGrams | MultiGrams
type MonoGrams = Text
type MultiGrams = [Text]
ngrams :: Text -> [Text]
ngrams = monograms
text2ngrams :: Lang -> Text -> Ngrams
text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
where
txt' = splitOn (pack " ") txt
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
......@@ -19,8 +19,13 @@ module Gargantext.Text.Ngrams.Lists
--import Data.Maybe
--import Data.List (filter)
--import Gargantext.Text
--import Gargantext.Prelude
import Gargantext.Prelude
--
data ListName = Stop | Candidate | Graph
deriving (Show, Eq)
--graph :: [Ngrams] -> [Ngrams]
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
--
......@@ -30,3 +35,4 @@ module Gargantext.Text.Ngrams.Lists
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
......@@ -9,12 +9,13 @@ Portability : POSIX
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
module Gargantext.Text.Ngrams.PosTagging.CoreNLP
where
......@@ -22,9 +23,15 @@ module Gargantext.Text.Ngrams.PosTagging.CoreNLP
import GHC.Generics
import GHC.Show (Show(..))
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Aeson.TH (deriveJSON)
import Data.Monoid ((<>))
import Data.Text (Text)
import Data.Aeson
import Data.Monoid
import Data.Maybe (isJust)
import Data.Set (Set, fromList, empty)
import Data.Text (Text, splitOn, pack, toLower, unpack)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
......@@ -32,24 +39,107 @@ import Gargantext.Prelude
import Network.HTTP.Simple
import Control.Monad.Catch (MonadThrow)
import Control.Monad.IO.Class (MonadIO)
import Data.String.Conversions (ConvertibleStrings)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
, _tokenOriginalText :: Text
, _tokenLemma :: Text
, _tokenCharacterOffsetBegin :: Int
, _tokenCharacterOffsetEnd :: Int
, _tokenPos :: Text
, _tokenNer :: Text
, _tokenPos :: Maybe POS
, _tokenNer :: Maybe NER
, _tokenBefore :: Maybe Text
, _tokenAfter :: Maybe Text
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
------------------------------------------------------------------------
data NgramsTag = NgramsTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
------------------------------------------------------------------------
tokens2ngramsTags :: [Token] -> [NgramsTag]
tokens2ngramsTags ts = select $ map ngramsTag ts
------------------------------------------------------------------------
ngramsTag :: Token -> NgramsTag
ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n
where
w' = split w
s' = fromList (split s)
split = splitOn (pack " ") . toLower
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
select :: [NgramsTag] -> [NgramsTag]
select xs = filter isNgrams xs
where
isNgrams (NgramsTag _ _ p n) = isJust p || isJust n
instance Monoid NgramsTag where
mempty = NgramsTag [] empty Nothing Nothing
mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
= NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token]
} deriving (Show, Generic)
......@@ -82,16 +172,28 @@ $(deriveJSON (unPrefix "_") ''Sentences)
--
corenlp :: Lang -> Text -> IO Sentences
corenlp lang txt = do
corenlp' :: ( MonadThrow m, MonadIO m, FromJSON a
, ConvertibleStrings p ByteString) =>
Lang -> p -> m (Response a)
corenlp' lang txt = do
let properties = case lang of
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
let request = setRequestBodyLBS (cs txt) url
response <- httpJSON request
pure (getResponseBody response :: Sentences)
httpJSON request
corenlpRaw :: Lang -> Text -> IO Value
corenlpRaw lang txt = do
response <- corenlp' lang txt
pure (getResponseBody response)
corenlp :: Lang -> Text -> IO Sentences
corenlp lang txt = do
response <- corenlp' lang txt
pure (getResponseBody response)
-- | parseWith
-- Part Of Speech example
......@@ -102,7 +204,9 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang s
......@@ -14,87 +14,99 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.En (selectNgrams, groupNgrams, textTest)
module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
where
--import Data.Text (Text)
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
import Data.Text (Text)
import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = filter isNgrams xs
where
isNgrams (_,"NN" ,_ ) = True
isNgrams (_,"NNS" ,_ ) = True
isNgrams (_,"NNP" ,_ ) = True
isNgrams (_,"NN+CC",_ ) = True
isNgrams (_,_ ,"PERSON" ) = True
isNgrams (_,_ ,"ORGANIZATION") = True
isNgrams (_,_ ,"LOCATION" ) = True
isNgrams (_,_ ,_ ) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
cc = (c1, "CC", c1')
jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2'
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-- extractNgrams "Test the antiinflammatory or analgesic activity?"
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
groupNgrams (x:xs) = (x:(groupNgrams xs))
textTest :: [Text]
textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
, "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
, " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
, "In both models, the standard drug used was aspirin 100 mg/kg. "
, "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
, "Analgesic activity was studied in rats using hot plate and tail-flick models. "
, "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
, "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
, "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
-- group xs = group3 NN IN DT xs
-- TO BE REMOVED old code
--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
-- where
-- jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-- cc = (c1, "CC", c1')
-- jn1 = (j1, "JJ", j1')
-- jn2 = jn j2 j3 j2'
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--
---- extractNgrams "Test the antiinflammatory or analgesic activity?"
---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
---- > should be (antiinflammatory activity) <> (analgesic activity)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [Text]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
......@@ -7,79 +7,118 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french. -
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
import Data.Text (Text)
import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = filter selectNgrams' xs
where
selectNgrams' (_,"N" ,_ ) = True
selectNgrams' (_,"NC" ,_ ) = True
selectNgrams' (_,"NN+CC",_ ) = True
selectNgrams' (_,_ ,"PERSON" ) = True
selectNgrams' (_,_ ,"ORGANIZATION") = True
selectNgrams' (_,_ ,"LOCATION" ) = True
selectNgrams' (_,_ ,_ ) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
groupNgrams [] = []
--groupNgrams ((_,"DET",_):xs) = groupNgrams xs
-- "Groupe : nom commun et adjectifs avec conjonction"
groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
where
n1 = (n <> " " <> j1, "NC", n')
n2 = (n <> " " <> j2, "NC", n')
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
where
n1 = (n <> " " <> j1, "N", n')
n2 = (n <> " " <> j2, "N", n')
-- Groupe : Adjectif + Conjonction de coordination + Adjectif
-- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
-- Groupe : Nom commun + préposition + Nom commun
groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
-- Groupe : Plusieurs adjectifs successifs
groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
-- Groupe : nom commun et adjectif
groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- Groupe : adjectif et nom commun
groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
-- /!\ sometimes N instead of NC (why?)
groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
-- Si aucune des règles précédentes n'est remplie
groupNgrams (x:xs) = (x:(groupNgrams xs))
textTest :: [Text]
textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
------------------------------------------------------------------------
--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
-- where
--
--import Gargantext.Prelude
--import Data.Text (Text)
--import Data.Monoid ((<>))
--
--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--selectNgrams xs = filter selectNgrams' xs
-- where
-- selectNgrams' (_,"N" ,_ ) = True
-- selectNgrams' (_,"NC" ,_ ) = True
-- selectNgrams' (_,"NN+CC",_ ) = True
-- selectNgrams' (_,_ ,"PERSON" ) = True
-- selectNgrams' (_,_ ,"ORGANIZATION") = True
-- selectNgrams' (_,_ ,"LOCATION" ) = True
-- selectNgrams' (_,_ ,_ ) = False
--
--
--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--groupNgrams [] = []
--
----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
--
---- "Groupe : nom commun et adjectifs avec conjonction"
--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "NC", n')
-- n2 = (n <> " " <> j2, "NC", n')
--
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "N", n')
-- n2 = (n <> " " <> j2, "N", n')
--
---- Groupe : Adjectif + Conjonction de coordination + Adjectif
---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
--
---- Groupe : Nom commun + préposition + Nom commun
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
--
---- Groupe : Plusieurs adjectifs successifs
--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
--
---- Groupe : nom commun et adjectif
--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
---- Groupe : adjectif et nom commun
--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--
--
---- Si aucune des règles précédentes n'est remplie
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--
--textTest :: [Text]
--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
--
......@@ -9,63 +9,43 @@ Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Ngrams selection algorithms
A form is a list of characters seperated by one or more spaces in a sentence.
A word is a form.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Text.Ngrams.PosTagging.Parser
where
import Gargantext.Prelude
import Data.Text hiding (map)
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
type SNgrams = (Text, Text, Text)
-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
-- A word is a form.
-- type Form = [Char]
-- For performance reasons, Type Text is used, then:
-- type Form = Text
-- Let be a form and its associated forms in contexts of a sentence.
-- Forms and subfoorms can be representend as Tree whose top is the minimal form
-- as a monogram whos occurrences are
-- ps : Common words function in Haskell do not take sentence into account
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Lang -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams lang s = map (group lang) <$> extractNgrams' lang s
extractNgrams' :: Lang -> Text -> IO [[SNgrams]]
extractNgrams' lang t = map (map token2text)
extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams' lang t = map tokens2ngramsTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
-- | This function selects ngrams according to grammars specific
-- of each language.
-- In english, JJ is ADJectiv in french.
selectNgrams :: Lang -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams
-- | This function analyze and groups (or not) ngrams according to
-- grammars specific of each language.
groupNgrams :: Lang -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams
---- | This function analyse and groups (or not) ngrams according to
---- grammars specific of each language.
group :: Lang -> [NgramsTag] -> [NgramsTag]
group EN = En.group
group FR = Fr.group
......@@ -7,8 +7,14 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
In linguistic morphology and information retrieval, stemming is the
process of reducing inflected (or sometimes derived) words to their word
stem, base or root form—generally a written word form. The @stem@ needs
not be identical to the morphological root of the word; it is usually
sufficient that related words map to the same stem, even if this stem is
not in itself a valid root.
Source : https://en.wikipedia.org/wiki/Stemming
-}
......@@ -26,6 +32,23 @@ import Gargantext.Core (Lang(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
-- | Stemmer
-- A stemmer for English, for example, should identify the string "cats"
-- (and possibly "catlike", "catty" etc.) as based on the root "cat".
-- and
-- "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming
-- algorithm reduces the words "fishing", "fished", and "fisher" to the
-- root word, "fish". On the other hand, "argue", "argued", "argues",
-- "arguing", and "argus" reduce to the stem "argu" (illustrating the
-- case where the stem is not itself a word or root) but "argument" and
-- "arguments" reduce to the stem "argument".
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
......@@ -33,3 +56,5 @@ stem lang = DT.pack . N.stem lang' . DT.unpack
EN -> N.English
FR -> N.French
{-|
Module : Gargantext.Text.Ngrams.Token
Description : Tokens and tokenizing a text
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In computer science, lexical analysis, lexing or tokenization is the
process of converting a sequence of characters (such as in a computer
program or web page) into a sequence of tokens (strings with an assigned
and thus identified meaning).
Source: https://en.wikipedia.org/wiki/Tokenize
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Token (tokenize)
where
import Data.Text (Text)
import qualified Gargantext.Text.Ngrams.Token.Text as En
type Token = Text
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose", "."]
--
tokenize :: Text -> [Token]
tokenize = En.tokenize
......@@ -16,6 +16,7 @@ extra-deps:
- aeson-lens-0.5.0.0
- duckling-0.1.3.0
- extra-1.5.3
- full-text-search-0.2.1.4
- haskell-src-exts-1.18.2
- http-types-0.12.1
- protolude-0.2
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment