Commit 28461a06 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern...

[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern matching with FR examples.
parent b38cd1e3
...@@ -33,6 +33,7 @@ library ...@@ -33,6 +33,7 @@ library
, lens , lens
, opaleye , opaleye
, path , path
, parsec
, postgresql-simple , postgresql-simple
, pretty , pretty
, product-profunctors , product-profunctors
...@@ -65,9 +66,6 @@ library ...@@ -65,9 +66,6 @@ library
exposed-modules: Data.Gargantext exposed-modules: Data.Gargantext
, Data.Gargantext.Analysis , Data.Gargantext.Analysis
, Data.Gargantext.DSL , Data.Gargantext.DSL
, Data.Gargantext.Ngrams.Occurrences
, Data.Gargantext.NLP
, Data.Gargantext.NLP.CoreNLP
, Data.Gargantext.Database , Data.Gargantext.Database
, Data.Gargantext.Database.Instances , Data.Gargantext.Database.Instances
, Data.Gargantext.Database.Ngram , Data.Gargantext.Database.Ngram
...@@ -78,13 +76,22 @@ library ...@@ -78,13 +76,22 @@ library
, Data.Gargantext.Database.NodeNodeNgram , Data.Gargantext.Database.NodeNodeNgram
, Data.Gargantext.Database.Private , Data.Gargantext.Database.Private
, Data.Gargantext.Database.User , Data.Gargantext.Database.User
, Data.Gargantext.NLP
, Data.Gargantext.NLP.CoreNLP
, Data.Gargantext.Ngrams
, Data.Gargantext.Ngrams.Count
, Data.Gargantext.Ngrams.Parser
, Data.Gargantext.Ngrams.Lang.En
, Data.Gargantext.Ngrams.Lang.Fr
, Data.Gargantext.Ngrams.TextMining
, Data.Gargantext.Ngrams.Occurrences
, Data.Gargantext.Parsers , Data.Gargantext.Parsers
, Data.Gargantext.Parsers.WOS , Data.Gargantext.Parsers.WOS
, Data.Gargantext.Prelude , Data.Gargantext.Prelude
, Data.Gargantext.Server , Data.Gargantext.Server
, Data.Gargantext.Types , Data.Gargantext.Types
, Data.Gargantext.Types.Node
, Data.Gargantext.Types.Main , Data.Gargantext.Types.Main
, Data.Gargantext.Types.Node
, Data.Gargantext.Utils.DateUtils , Data.Gargantext.Utils.DateUtils
, Data.Gargantext.Utils.Prefix , Data.Gargantext.Utils.Prefix
default-language: Haskell2010 default-language: Haskell2010
...@@ -100,10 +107,10 @@ library ...@@ -100,10 +107,10 @@ library
-- , hastext -- , hastext
-- default-language: Haskell2010 -- default-language: Haskell2010
test-suite garg-test-parsers test-suite garg-test-ngrams
type: exitcode-stdio-1.0 type: exitcode-stdio-1.0
hs-source-dirs: test hs-source-dirs: test
main-is: Parsers.hs main-is: Ngrams.hs
build-depends: base build-depends: base
, extra , extra
, text , text
......
...@@ -26,11 +26,15 @@ data Token = Token { _tokenIndex :: Int ...@@ -26,11 +26,15 @@ data Token = Token { _tokenIndex :: Int
, _tokenCharacterOffsetEnd :: Int , _tokenCharacterOffsetEnd :: Int
, _tokenPos :: Text , _tokenPos :: Text
, _tokenNer :: Text , _tokenNer :: Text
, _tokenBefore :: Text , _tokenBefore :: Maybe Text
, _tokenAfter :: Text , _tokenAfter :: Maybe Text
} deriving (Show, Generic) } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token) $(deriveJSON (unPrefix "_token") ''Token)
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
data Sentence = Sentence { _sentenceIndex :: Int data Sentence = Sentence { _sentenceIndex :: Int
, _sentenceTokens :: [Token] , _sentenceTokens :: [Token]
} deriving (Show, Generic) } deriving (Show, Generic)
...@@ -49,6 +53,19 @@ instance ToJSON Sentences ...@@ -49,6 +53,19 @@ instance ToJSON Sentences
instance FromJSON Sentences instance FromJSON Sentences
-- request =
-- "fr" : {
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
-- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
-- // dependency parser
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- },
--
corenlpPretty :: String -> IO () corenlpPretty :: String -> IO ()
corenlpPretty txt = do corenlpPretty txt = do
......
module Data.Gargantext.Ngrams ( module Data.Gargantext.Ngrams (
module Data.Gargantext.Ngrams.TextMining, module Data.Gargantext.Ngrams.Count,
module Data.Gargantext.Ngrams.Words, --module Data.Gargantext.Ngrams.Hetero,
module Data.Gargantext.Ngrams.Hetero, module Data.Gargantext.Ngrams.Parser,
module Data.Gargantext.Ngrams.Count module Data.Gargantext.Ngrams.Occurrences,
module Data.Gargantext.Ngrams.TextMining
--module Data.Gargantext.Ngrams.Words
) where ) where
import Data.Gargantext.Ngrams.TextMining
import Data.Gargantext.Ngrams.Words
import Data.Gargantext.Ngrams.Hetero
import Data.Gargantext.Ngrams.Count import Data.Gargantext.Ngrams.Count
--import Data.Gargantext.Ngrams.Hetero
import Data.Gargantext.Ngrams.Parser
import Data.Gargantext.Ngrams.Occurrences
import Data.Gargantext.Ngrams.TextMining
--import Data.Gargantext.Ngrams.Words
...@@ -4,7 +4,6 @@ module Data.Gargantext.Ngrams.Count where ...@@ -4,7 +4,6 @@ module Data.Gargantext.Ngrams.Count where
import System.Environment (getArgs) import System.Environment (getArgs)
import Data.List (foldl', take)
import Data.Foldable as F import Data.Foldable as F
import Data.Map (Map) import Data.Map (Map)
...@@ -37,9 +36,9 @@ occurrences xs = foldl' (\x y -> M.insertWith' (+) y 1 x) M.empty xs ...@@ -37,9 +36,9 @@ occurrences xs = foldl' (\x y -> M.insertWith' (+) y 1 x) M.empty xs
--occurrences' :: Ord a => [a] -> Map a Integer --occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs --occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
countMain :: IO ()
countMain = do countMain = do
(fichier:rest) <- getArgs (fichier:_) <- getArgs
c <- DTLIO.readFile fichier c <- DTLIO.readFile fichier
--print $ occurrences $ DTL.chunksOf 1 c --print $ occurrences $ DTL.chunksOf 1 c
print $ occurrences $ letters'' c print $ occurrences $ letters'' c
......
...@@ -10,7 +10,7 @@ import Opaleye.Internal.Column (Column) ...@@ -10,7 +10,7 @@ import Opaleye.Internal.Column (Column)
import Data.Gargantext.Database.Gargandb import Data.Gargantext.Database.Gargandb
import Data.Gargantext.Database.Private import Data.Gargantext.Database.Private
import Data.Gargantext.Utils.Chronos --import Data.Gargantext.Utils.Chronos
import Data.Gargantext.Ngrams.Words (cleanText) import Data.Gargantext.Ngrams.Words (cleanText)
import Data.Gargantext.Ngrams.Count (occurrences) import Data.Gargantext.Ngrams.Count (occurrences)
......
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Ngrams.Lang.En (selectNgrams, groupNgrams) where
import Data.Gargantext.Prelude
import Data.Text (Text)
import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = pf selectNgrams' xs
where
selectNgrams' (_,"NN",_) = True
selectNgrams' (_,"NNS",_) = True
selectNgrams' (_,"NNP",_) = True
selectNgrams' (_,"NN+CC",_) = True
selectNgrams' (_,_,"PERSON") = True
selectNgrams' (_,_,"ORGANIZATION") = True
selectNgrams' (_,_,"LOCATION") = True
selectNgrams' (_,_,_) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
cc = (c1,"CC",c1')
jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2'
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-- extractNgrams "Test the antiinflammatory or analgesic activity?"
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Ngrams.Lang.Fr (selectNgrams, groupNgrams)
where
import Data.Gargantext.Prelude
import Data.Text (Text)
import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = pf selectNgrams' xs
where
selectNgrams' (_,"NN",_) = True
selectNgrams' (_,"NNS",_) = True
selectNgrams' (_,"NNP",_) = True
selectNgrams' (_,"NN+CC",_) = True
selectNgrams' (_,_,"PERSON") = True
selectNgrams' (_,_,"ORGANIZATION") = True
selectNgrams' (_,_,"LOCATION") = True
selectNgrams' (_,_,_) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
cc = (c1,"CC",c1')
jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2'
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
where
jn j m mm p = (j <> " " <> m, p, mm)
jn1 = jn j1 n nn ("NN+CC" :: Text)
jn2 = jn j2 n nn ("NN+CC" :: Text)
groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
-- extractNgrams "Test the antiinflammatory or analgesic activity?"
-- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):("of","IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> "of" <> " " <> y, "NN", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [String]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
--
--
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Data.Gargantext.Ngrams.Parser where
import Data.Gargantext.Prelude
import Data.Gargantext.NLP.CoreNLP
import Data.Gargantext.Types.Main (Language(..), Ngrams)
import qualified Data.Gargantext.Ngrams.Lang.En as En
import qualified Data.Gargantext.Ngrams.Lang.Fr as Fr
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: String -> IO [[Ngrams]]
extractNgrams t = pm (groupNgrams EN) <$> extractNgrams' t
extractNgrams' :: String -> IO [[Ngrams]]
extractNgrams' t = pm (pm token2text)
<$> pm _sentenceTokens
<$> sentences
<$> corenlp t
-- | This function selects ngrams according to grammars specific
-- of each language.
-- In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams
-- | This function analyze and groups (or not) ngrams according to
-- grammars specific of each language.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams
module Data.Gargantext.Ngrams.TFICF where
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Read, Show, Generics)
tfidf :: Text -> TFICF
tfidf txt = TFICF txt Document Corpus score
where
score = Nothing
...@@ -4,11 +4,12 @@ import Data.Map (empty, Map, insertWith, toList) ...@@ -4,11 +4,12 @@ import Data.Map (empty, Map, insertWith, toList)
import Data.List (foldl, foldl') import Data.List (foldl, foldl')
import qualified Data.List as L import qualified Data.List as L
sortGT :: (Ord a, Ord b) => (a, b) -> (a, b) -> Ordering
sortGT (a1, b1) (a2, b2) sortGT (a1, b1) (a2, b2)
| a1 < a2 = GT | a1 < a2 = GT
| a1 > a2 = LT | a1 > a2 = LT
| a1 == a2 = compare b1 b2 | a1 == a2 = compare b1 b2
sortGT (_, _) (_, _) = error "What is this case ?"
--histogram :: Ord a => [a] -> [(a, Int)] --histogram :: Ord a => [a] -> [(a, Int)]
...@@ -21,7 +22,7 @@ countElem m e = Data.Map.insertWith (\n o -> n + o) e 1 m ...@@ -21,7 +22,7 @@ countElem m e = Data.Map.insertWith (\n o -> n + o) e 1 m
freqList :: (Ord k) => [k] -> Data.Map.Map k Int freqList :: (Ord k) => [k] -> Data.Map.Map k Int
freqList = foldl countElem Data.Map.empty freqList = foldl countElem Data.Map.empty
--getMaxFromMap :: Data.Map.Map -> Maybe -> [a] -> [a] getMaxFromMap :: Ord a => Map a1 a -> [a1]
getMaxFromMap m = go [] Nothing (toList m) getMaxFromMap m = go [] Nothing (toList m)
where where
go ks _ [] = ks go ks _ [] = ks
...@@ -53,4 +54,4 @@ countYear' (xs) = foldl' (\x y -> insertWith (+) y 1 x) empty xs ...@@ -53,4 +54,4 @@ countYear' (xs) = foldl' (\x y -> insertWith (+) y 1 x) empty xs
textMiningMain :: IO () textMiningMain :: IO ()
textMiningMain = do textMiningMain = do
print $ merge ["abc"] ["bcd"] print $ merge ["abc"::String] ["bcd" :: String]
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.WOS where module Data.Gargantext.Parsers.WOS where
import Prelude hiding (takeWhile, take, concat, readFile) import Prelude hiding (takeWhile, take, concat, readFile)
...@@ -6,6 +8,7 @@ import Data.Map as DM ...@@ -6,6 +8,7 @@ import Data.Map as DM
import Data.Attoparsec.ByteString import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine) import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Data.Either.Extra(Either(..)) import Data.Either.Extra(Either(..))
import Control.Applicative import Control.Applicative
...@@ -47,8 +50,9 @@ data ParserType = WOS | CSV ...@@ -47,8 +50,9 @@ data ParserType = WOS | CSV
wosParser :: Parser [Maybe [ByteString]] wosParser :: Parser [Maybe [ByteString]]
wosParser = do wosParser = do
-- TODO Warning if version /= 1.0 -- TODO Warning if version /= 1.0
_ <- manyTill anyChar (string "\nVR 1.0") -- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
ns <- many1 wosNotice <* "\nEF" _ <- manyTill anyChar (string $ pack "\nVR 1.0")
ns <- many1 wosNotice <* (string $ pack "\nEF")
return ns return ns
startNotice :: Parser ByteString startNotice :: Parser ByteString
...@@ -56,7 +60,7 @@ startNotice = "\nPT " *> takeTill isEndOfLine ...@@ -56,7 +60,7 @@ startNotice = "\nPT " *> takeTill isEndOfLine
wosNotice :: Parser (Maybe [ByteString]) wosNotice :: Parser (Maybe [ByteString])
wosNotice = do wosNotice = do
n <- startNotice *> wosFields <* manyTill anyChar (string "\nER\n") n <- startNotice *> wosFields <* manyTill anyChar (string $ pack "\nER\n")
return n return n
field' :: Parser (ByteString, [ByteString]) field' :: Parser (ByteString, [ByteString])
......
...@@ -16,6 +16,19 @@ import Data.Gargantext.Types.Node ( NodePoly ...@@ -16,6 +16,19 @@ import Data.Gargantext.Types.Node ( NodePoly
, HyperdataNotebook , HyperdataNotebook
) )
data Language = EN | FR -- | DE | IT | SP
-- > EN == english
-- > FR == french
-- > DE == deutch (not implemented yet)
-- > IT == italian (not implemented yet)
-- > SP == spanish (not implemented yet)
-- > ... add your language and help us to implement it (:
type Ngrams = (Text, Text, Text)
-- | TODO add Symbolic Node / Document -- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes -- TODO make instances of Nodes
......
...@@ -4,4 +4,5 @@ packages: ...@@ -4,4 +4,5 @@ packages:
- . - .
- /home/alexandre/local/logiciels/haskell/servant/servant-multipart - /home/alexandre/local/logiciels/haskell/servant/servant-multipart
extra-deps: extra-deps:
- utc-0.2.0.1
resolver: lts-9.2 resolver: lts-9.2
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
import Test.Hspec
import Test.QuickCheck
import Control.Exception (evaluate)
import Data.Text (Text)
import Data.Gargantext.Prelude
import Data.Gargantext.Types.Main (Language(..))
import Data.Gargantext.Ngrams
import Data.Gargantext.Ngrams.Occurrences (parseOccurrences)
import Data.Gargantext.Ngrams.Parser (extractNgrams, selectNgrams)
ngramsExtractionTest EN = hspec $ do
describe "Ngrams extraction in English Language" $ do
let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
it "\"Of\" seperates two ngrams" $ do
t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0)
t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2)
t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
ngramsExtractionTest FR = hspec $ do
describe "Ngrams extraction in English Language" $ do
let textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. ", "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. ", " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. ", "In both models, the standard drug used was aspirin 100 mg/kg. ", "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. ", "Analgesic activity was studied in rats using hot plate and tail-flick models. ", "Codeine 5 mg/kg and vehicle served as standard and control respectively. ", "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. ", "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "] :: [String]
it "\"Of\" seperates two ngrams" $ do
t1 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 0)
t1 `shouldBe` [[("Alcoholic extract","NN","O"),("Kaempferia galanga","NN","O"),("analgesic activities","NN+CC","O"),("antiinflammatory activities","NN+CC","O"),("animal models","NN","O")]]
it "Tests the conjunction of coordination in two ngrams with its adjectives" $ do
t2 <- pm (selectNgrams EN) <$> extractNgrams (textTest !! 2)
t2 `shouldBe` [[("Acute activities","NN+CC","O"),("sub acute inflammatory activities","NN+CC","O"),("rats","NNS","O"),("carrageenan","NN","O"),("paw edema","NN","O"),("cotton pellet","NN","O"),("granuloma models","NN","O")]]
parsersTest = hspec $ do
describe "Parser for occurrences" $ do
let txt = "internet"
it "returns the result of one parsing" $ do
parseOccurrences "internet" "internet" `shouldBe` Right 1
-- | Context of Text should be toLower
it "returns the result of one parsing not case sensitive" $ do
let txtCase = "Internet"
parseOccurrences txtCase "internet" `shouldBe` Right 1
it "returns the result of one parsing after space" $ do
parseOccurrences txt " internet"
`shouldBe` Right 1
it "returns the result of one parsing after chars" $ do
parseOccurrences txt "l'internet"
`shouldBe` Right 1
it "returns the result of multiple parsing" $ do
parseOccurrences txt "internet internet of things"
`shouldBe` Right 2
it "returns the result of multiple parsing separated by text" $ do
parseOccurrences txt "internet in the internet of things"
`shouldBe` Right 2
it "returns the result of multiple parsing separated by punctuation" $ do
parseOccurrences txt "internet. In the internet of things, internet like; internet?"
`shouldBe` Right 4
-- describe "Parser for nodes" $ do
-- it "returns the result of one parsing after space" $ do
-- occOfCorpus 249509 "sciences" `shouldReturn` 7
main :: IO ()
main = do
-- parsersTest
-- ngramsExtractionTest EN
ngramsExtractionTest FR
{-# LANGUAGE OverloadedStrings #-}
import Test.Hspec
import Test.QuickCheck
import Control.Exception (evaluate)
import Data.Text (Text)
import Data.Gargantext.Parsers.Occurrences (parseOccurrences)
-- import Data.Gargantext.Analysis (occOfCorpus)
parsersTest = hspec $ do
describe "Parser for occurrences" $ do
let txt = "internet"
it "returns the result of one parsing" $ do
parseOccurrences "internet" "internet" `shouldBe` Right 1
-- | Context of Text should be toLower
it "returns the result of one parsing not case sensitive" $ do
let txtCase = "Internet"
parseOccurrences txtCase "internet" `shouldBe` Right 1
it "returns the result of one parsing after space" $ do
parseOccurrences txt " internet"
`shouldBe` Right 1
it "returns the result of one parsing after chars" $ do
parseOccurrences txt "l'internet"
`shouldBe` Right 1
it "returns the result of multiple parsing" $ do
parseOccurrences txt "internet internet of things"
`shouldBe` Right 2
it "returns the result of multiple parsing separated by text" $ do
parseOccurrences txt "internet in the internet of things"
`shouldBe` Right 2
it "returns the result of multiple parsing separated by punctuation" $ do
parseOccurrences txt "internet. In the internet of things, internet like; internet?"
`shouldBe` Right 4
-- describe "Parser for nodes" $ do
-- it "returns the result of one parsing after space" $ do
-- occOfCorpus 249509 "sciences" `shouldReturn` 7
main :: IO ()
main = do
parsersTest
...@@ -8,7 +8,8 @@ import Control.Exception (evaluate) ...@@ -8,7 +8,8 @@ import Control.Exception (evaluate)
import Data.Text (Text) import Data.Text (Text)
import Data.Hastext.Parsers.Occurrences (parse) import Data.Hastext.Parsers.Occurrences (parse)
main = print "hspec $ do main :: IO ()
main = print "hspec $ do"
describe "Parser for occurrences" $ do describe "Parser for occurrences" $ do
let txt = "internet" let txt = "internet"
...@@ -43,5 +44,4 @@ main = print "hspec $ do ...@@ -43,5 +44,4 @@ main = print "hspec $ do
main :: IO ()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment