Commit 1049ea25 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIS][FIX] Frequent Item Set and fix ngrams extraction test.

parent 9f29cddb
...@@ -116,6 +116,7 @@ library: ...@@ -116,6 +116,7 @@ library:
- servant-swagger-ui - servant-swagger-ui
- servant-static-th - servant-static-th
- split - split
- stemmer
- swagger2 - swagger2
- tagsoup - tagsoup
- text-metrics - text-metrics
......
...@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters ...@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics , module Gargantext.Ngrams.Metrics
, ngrams, occ, sumOcc, text2fis , Ngrams(..), ngrams, occ, sumOcc, text2fis
--, module Gargantext.Ngrams.Words --, module Gargantext.Ngrams.Words
) where ) where
...@@ -40,35 +40,46 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS ...@@ -40,35 +40,46 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.Char (Char, isAlpha, isSpace) import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower) import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, keys import Data.Map.Strict (Map
, empty
, insertWith, unionWith , insertWith, unionWith
, fromList
, lookupIndex , lookupIndex
--, fromList, keys
) )
import qualified Data.Map.Strict as M (filter) import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl') import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter) import Gargantext.Prelude hiding (filter)
import qualified Data.List as L (filter)
-- Maybe useful later: -- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..)) --import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions) --import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..)) --import Language.Aspell.Options (ACOption(..))
data Ngrams = Ngrams { _ngramsNgrams :: Text
, _ngramsStem :: Text
} deriving (Show)
instance Eq Ngrams where
Ngrams n1 s1 == Ngrams n2 s2 = n1 == n2 || s1 == s2
type Occ = Int type Occ = Int
type Index = Int --type Index = Int
type FreqMin = Int
ngrams :: Text -> [Text] ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs ngrams xs = monograms $ toLower $ filter isChar xs
monograms :: Text -> [Text] monograms :: Text -> [Text]
monograms = words monograms = words
isGram :: Char -> Bool -- TODO
isGram '-' = True -- 12-b
isGram '/' = True isChar :: Char -> Bool
isGram c = isAlpha c || isSpace c isChar '-' = True
isChar '/' = True
isChar c = isAlpha c || isSpace c
-- | Compute the occurrences (occ) -- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ occ :: Ord a => [a] -> Map a Occ
...@@ -78,18 +89,18 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs ...@@ -78,18 +89,18 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
sumOcc :: Ord a => [Map a Occ] -> Map a Occ sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs
noApax :: Ord a => Map a Occ -> Map a Occ --noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m --noApax m = M.filter (>1) m
-- | /!\ indexes are not the same: -- | /!\ indexes are not the same:
-- | Index ngrams from Map -- | Index ngrams from Map
indexNgram :: Ord a => Map a Occ -> Map Index a --indexNgram :: Ord a => Map a Occ -> Map Index a
indexNgram m = fromList (zip [1..] (keys m)) --indexNgram m = fromList (zip [1..] (keys m))
-- | Index ngrams from Map -- | Index ngrams from Map
ngramIndex :: Ord a => Map a Occ -> Map a Index --ngramIndex :: Ord a => Map a Occ -> Map a Index
ngramIndex m = fromList (zip (keys m) [1..]) --ngramIndex m = fromList (zip (keys m) [1..])
indexWith :: Ord a => Map a Occ -> [a] -> [Int] indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
...@@ -101,14 +112,16 @@ indexIt xs = (m, is) ...@@ -101,14 +112,16 @@ indexIt xs = (m, is)
is = map (indexWith m) xs is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis]) list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m, fs) list2fis n xs = (m', fs)
where where
(m, is) = indexIt xs (m, is) = indexIt xs
m' = M.filter (>50000) m
fs = FIS.all n is fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs) text2fis n xs = list2fis n (map ngrams xs)
text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis]) --text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fisWith = undefined --text2fisWith = undefined
...@@ -18,7 +18,6 @@ Portability : POSIX ...@@ -18,7 +18,6 @@ Portability : POSIX
module Gargantext.Ngrams.CoreNLP where module Gargantext.Ngrams.CoreNLP where
import Data.Aeson
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import GHC.Generics import GHC.Generics
import Data.Monoid ((<>)) import Data.Monoid ((<>))
...@@ -65,8 +64,8 @@ $(deriveJSON (unPrefix "_properties") ''Properties) ...@@ -65,8 +64,8 @@ $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]} data Sentences = Sentences { _sentences :: [Sentence]}
deriving (Show, Generic) deriving (Show, Generic)
instance ToJSON Sentences
instance FromJSON Sentences $(deriveJSON (unPrefix "_") ''Sentences)
-- request = -- request =
......
...@@ -45,8 +45,8 @@ all f is = fisWith Nothing f is ...@@ -45,8 +45,8 @@ all f is = fisWith Nothing f is
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis] between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (x,y) f is = fisWithSize (Right (x,y)) f is between (x,y) f is = fisWithSize (Right (x,y)) f is
maximum :: Int -> Frequency -> [[Item]] -> [Fis] --maximum :: Int -> Frequency -> [[Item]] -> [Fis]
maximum m f is = between (0,m) f is --maximum m f is = between (0,m) f is
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -67,7 +67,6 @@ items2fis is = case head is of ...@@ -67,7 +67,6 @@ items2fis is = case head is of
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis] fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of fisWithSize n f is = case n of
...@@ -87,4 +86,3 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f ...@@ -87,4 +86,3 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------
...@@ -8,16 +8,16 @@ import Data.Text (Text) ...@@ -8,16 +8,16 @@ import Data.Text (Text)
import Data.Monoid ((<>)) import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = filter selectNgrams' xs selectNgrams xs = filter isNgrams xs
where where
selectNgrams' (_,"NN" ,_ ) = True isNgrams (_,"NN" ,_ ) = True
selectNgrams' (_,"NNS" ,_ ) = True isNgrams (_,"NNS" ,_ ) = True
selectNgrams' (_,"NNP" ,_ ) = True isNgrams (_,"NNP" ,_ ) = True
selectNgrams' (_,"NN+CC",_ ) = True isNgrams (_,"NN+CC",_ ) = True
selectNgrams' (_,_ ,"PERSON" ) = True isNgrams (_,_ ,"PERSON" ) = True
selectNgrams' (_,_ ,"ORGANIZATION") = True isNgrams (_,_ ,"ORGANIZATION") = True
selectNgrams' (_,_ ,"LOCATION" ) = True isNgrams (_,_ ,"LOCATION" ) = True
selectNgrams' (_,_ ,_ ) = False isNgrams (_,_ ,_ ) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)] groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
...@@ -26,7 +26,7 @@ groupNgrams [] = [] ...@@ -26,7 +26,7 @@ groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs) groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
where where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn') jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
cc = (c1,"CC",c1') cc = (c1, "CC", c1')
jn1 = (j1, "JJ", j1') jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2' jn2 = jn j2 j3 j2'
...@@ -60,14 +60,14 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", ...@@ -60,14 +60,14 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN",
-- > should be (antiinflammatory activity) <> (analgesic activity) -- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs) groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs) groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs) groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs) groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs) groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs) groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs) groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
groupNgrams (x:xs) = (x:(groupNgrams xs)) groupNgrams (x:xs) = (x:(groupNgrams xs))
......
...@@ -8,10 +8,11 @@ import Gargantext.Prelude ...@@ -8,10 +8,11 @@ import Gargantext.Prelude
import Gargantext.Ngrams.CoreNLP import Gargantext.Ngrams.CoreNLP
import Data.Text hiding (map) import Data.Text hiding (map)
import Gargantext.Types.Main (Language(..), Ngrams) import Gargantext.Types.Main (Language(..))
import qualified Gargantext.Ngrams.Lang.En as En import qualified Gargantext.Ngrams.Lang.En as En
import qualified Gargantext.Ngrams.Lang.Fr as Fr import qualified Gargantext.Ngrams.Lang.Fr as Fr
type SNgrams = (Text, Text, Text)
-- | Ngrams selection algorithms -- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence. -- A form is a list of characters seperated by one or more spaces in a sentence.
...@@ -31,11 +32,11 @@ import qualified Gargantext.Ngrams.Lang.Fr as Fr ...@@ -31,11 +32,11 @@ import qualified Gargantext.Ngrams.Lang.Fr as Fr
-- TODO for scientific papers: add maesures -- TODO for scientific papers: add maesures
-- TODO add the p score regex -- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[Ngrams]] extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: Language -> Text -> IO [[Ngrams]] extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t = map (map token2text) extractNgrams' lang t = map (map token2text)
<$> map _sentenceTokens <$> map _sentenceTokens
<$> _sentences <$> _sentences
...@@ -44,14 +45,13 @@ extractNgrams' lang t = map (map token2text) ...@@ -44,14 +45,13 @@ extractNgrams' lang t = map (map token2text)
-- | This function selects ngrams according to grammars specific -- | This function selects ngrams according to grammars specific
-- of each language. -- of each language.
-- In english, JJ is ADJectiv in french. -- In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [Ngrams] -> [Ngrams] selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams selectNgrams FR = Fr.selectNgrams
-- | This function analyze and groups (or not) ngrams according to -- | This function analyze and groups (or not) ngrams according to
-- grammars specific of each language. -- grammars specific of each language.
groupNgrams :: Language -> [Ngrams] -> [Ngrams] groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams groupNgrams FR = Fr.groupNgrams
...@@ -163,7 +163,6 @@ nodeTypeId tn = fromMaybe (panic $ pack $ "Typename " <> show tn <> " does not e ...@@ -163,7 +163,6 @@ nodeTypeId tn = fromMaybe (panic $ pack $ "Typename " <> show tn <> " does not e
-- Temporary types to be removed -- Temporary types to be removed
type Ngrams = (Text, Text, Text)
type ErrorMessage = Text type ErrorMessage = Text
-- Queries -- Queries
......
...@@ -2,6 +2,7 @@ flags: {} ...@@ -2,6 +2,7 @@ flags: {}
extra-package-dbs: [] extra-package-dbs: []
packages: packages:
- . - .
allow-newer: true allow-newer: true
extra-deps: extra-deps:
- git: https://github.com/delanoe/data-time-segment.git - git: https://github.com/delanoe/data-time-segment.git
...@@ -25,6 +26,7 @@ extra-deps: ...@@ -25,6 +26,7 @@ extra-deps:
- servant-multipart-0.11.1 - servant-multipart-0.11.1
- servant-server-0.12 - servant-server-0.12
- servant-swagger-ui-0.2.3.2.2.8 - servant-swagger-ui-0.2.3.2.2.8
- stemmer-0.5.2
- text-1.2.3.0 - text-1.2.3.0
- text-show-3.6.2 - text-show-3.6.2
resolver: lts-10.6 resolver: lts-10.6
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment