Commit 1049ea25 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIS][FIX] Frequent Item Set and fix ngrams extraction test.

parent 9f29cddb
......@@ -116,6 +116,7 @@ library:
- servant-swagger-ui
- servant-static-th
- split
- stemmer
- swagger2
- tagsoup
- text-metrics
......
......@@ -21,7 +21,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, ngrams, occ, sumOcc, text2fis
, Ngrams(..), ngrams, occ, sumOcc, text2fis
--, module Gargantext.Ngrams.Words
) where
......@@ -40,35 +40,46 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, keys
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
, fromList
, lookupIndex
--, fromList, keys
)
import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter)
import qualified Data.List as L (filter)
-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
data Ngrams = Ngrams { _ngramsNgrams :: Text
, _ngramsStem :: Text
} deriving (Show)
instance Eq Ngrams where
Ngrams n1 s1 == Ngrams n2 s2 = n1 == n2 || s1 == s2
type Occ = Int
type Index = Int
type FreqMin = Int
--type Index = Int
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
ngrams xs = monograms $ toLower $ filter isChar xs
monograms :: Text -> [Text]
monograms = words
isGram :: Char -> Bool
isGram '-' = True
isGram '/' = True
isGram c = isAlpha c || isSpace c
-- TODO
-- 12-b
isChar :: Char -> Bool
isChar '-' = True
isChar '/' = True
isChar c = isAlpha c || isSpace c
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
......@@ -78,18 +89,18 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs
noApax :: Ord a => Map a Occ -> Map a Occ
noApax m = M.filter (>1) m
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-- | /!\ indexes are not the same:
-- | Index ngrams from Map
indexNgram :: Ord a => Map a Occ -> Map Index a
indexNgram m = fromList (zip [1..] (keys m))
--indexNgram :: Ord a => Map a Occ -> Map Index a
--indexNgram m = fromList (zip [1..] (keys m))
-- | Index ngrams from Map
ngramIndex :: Ord a => Map a Occ -> Map a Index
ngramIndex m = fromList (zip (keys m) [1..])
--ngramIndex :: Ord a => Map a Occ -> Map a Index
--ngramIndex m = fromList (zip (keys m) [1..])
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
......@@ -101,14 +112,16 @@ indexIt xs = (m, is)
is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m, fs)
list2fis n xs = (m', fs)
where
(m, is) = indexIt xs
m' = M.filter (>50000) m
fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)
text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fisWith = undefined
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
......@@ -18,7 +18,6 @@ Portability : POSIX
module Gargantext.Ngrams.CoreNLP where
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
......@@ -65,8 +64,8 @@ $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence]}
deriving (Show, Generic)
instance ToJSON Sentences
instance FromJSON Sentences
$(deriveJSON (unPrefix "_") ''Sentences)
-- request =
......
......@@ -45,8 +45,8 @@ all f is = fisWith Nothing f is
between :: (Int, Int) -> Frequency -> [[Item]] -> [Fis]
between (x,y) f is = fisWithSize (Right (x,y)) f is
maximum :: Int -> Frequency -> [[Item]] -> [Fis]
maximum m f is = between (0,m) f is
--maximum :: Int -> Frequency -> [[Item]] -> [Fis]
--maximum m f is = between (0,m) f is
------------------------------------------------------------------------
......@@ -67,7 +67,6 @@ items2fis is = case head is of
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
fisWithSize :: Size -> Frequency -> [[Item]] -> [Fis]
fisWithSize n f is = case n of
......@@ -87,4 +86,3 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
......@@ -8,16 +8,16 @@ import Data.Text (Text)
import Data.Monoid ((<>))
selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
selectNgrams xs = filter selectNgrams' xs
selectNgrams xs = filter isNgrams xs
where
selectNgrams' (_,"NN" ,_ ) = True
selectNgrams' (_,"NNS" ,_ ) = True
selectNgrams' (_,"NNP" ,_ ) = True
selectNgrams' (_,"NN+CC",_ ) = True
selectNgrams' (_,_ ,"PERSON" ) = True
selectNgrams' (_,_ ,"ORGANIZATION") = True
selectNgrams' (_,_ ,"LOCATION" ) = True
selectNgrams' (_,_ ,_ ) = False
isNgrams (_,"NN" ,_ ) = True
isNgrams (_,"NNS" ,_ ) = True
isNgrams (_,"NNP" ,_ ) = True
isNgrams (_,"NN+CC",_ ) = True
isNgrams (_,_ ,"PERSON" ) = True
isNgrams (_,_ ,"ORGANIZATION") = True
isNgrams (_,_ ,"LOCATION" ) = True
isNgrams (_,_ ,_ ) = False
groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
......@@ -26,7 +26,7 @@ groupNgrams [] = []
groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
where
jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
cc = (c1,"CC",c1')
cc = (c1, "CC", c1')
jn1 = (j1, "JJ", j1')
jn2 = jn j2 j3 j2'
......@@ -60,14 +60,14 @@ groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN",
-- > should be (antiinflammatory activity) <> (analgesic activity)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
groupNgrams (x:xs) = (x:(groupNgrams xs))
......
......@@ -8,10 +8,11 @@ import Gargantext.Prelude
import Gargantext.Ngrams.CoreNLP
import Data.Text hiding (map)
import Gargantext.Types.Main (Language(..), Ngrams)
import Gargantext.Types.Main (Language(..))
import qualified Gargantext.Ngrams.Lang.En as En
import qualified Gargantext.Ngrams.Lang.Fr as Fr
type SNgrams = (Text, Text, Text)
-- | Ngrams selection algorithms
-- A form is a list of characters seperated by one or more spaces in a sentence.
......@@ -31,11 +32,11 @@ import qualified Gargantext.Ngrams.Lang.Fr as Fr
-- TODO for scientific papers: add maesures
-- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[Ngrams]]
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' lang t = map (map token2text)
<$> map _sentenceTokens
<$> _sentences
......@@ -44,14 +45,13 @@ extractNgrams' lang t = map (map token2text)
-- | This function selects ngrams according to grammars specific
-- of each language.
-- In english, JJ is ADJectiv in french.
selectNgrams :: Language -> [Ngrams] -> [Ngrams]
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams
-- | This function analyze and groups (or not) ngrams according to
-- grammars specific of each language.
groupNgrams :: Language -> [Ngrams] -> [Ngrams]
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams
......@@ -163,7 +163,6 @@ nodeTypeId tn = fromMaybe (panic $ pack $ "Typename " <> show tn <> " does not e
-- Temporary types to be removed
type Ngrams = (Text, Text, Text)
type ErrorMessage = Text
-- Queries
......
......@@ -2,6 +2,7 @@ flags: {}
extra-package-dbs: []
packages:
- .
allow-newer: true
extra-deps:
- git: https://github.com/delanoe/data-time-segment.git
......@@ -25,6 +26,7 @@ extra-deps:
- servant-multipart-0.11.1
- servant-server-0.12
- servant-swagger-ui-0.2.3.2.2.8
- stemmer-0.5.2
- text-1.2.3.0
- text-show-3.6.2
resolver: lts-10.6
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment