Commit c0914f9a authored by Alexandre Delanoë's avatar Alexandre Delanoë

[RENAME/ORG] ngrams -> terms.

parent d12b405e
......@@ -7,10 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Given a Gargantext CSV File and its Query
This script cleans and compress the contexts around the main terms of the query.
Given a Gargantext CSV File and its Query This script cleans and
compress the contexts around the main terms of the query.
-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -20,7 +18,7 @@ module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Text (pack)
import Data.Vector (Vector)
import qualified Data.Vector as V
......@@ -50,7 +48,7 @@ main = do
let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine q
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
......
......@@ -14,13 +14,13 @@ Script to start gargantext with different modes (Dev, Prod, Mock).
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
module Main where
import Prelude (putStrLn)
import Options.Generic
import Data.Text (unpack)
......
......@@ -23,18 +23,10 @@ library:
- Gargantext
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Utils
- Gargantext.Core.Types
- Gargantext.Core.Types.Node
- Gargantext.Text
- Gargantext.Text.Metrics
- Gargantext.Text.Metrics.Occurrences
- Gargantext.Text.Metrics.FrequentItemSet
- Gargantext.Text.Ngrams.PosTagging.CoreNLP
- Gargantext.Text.Ngrams.PosTagging.Parser
- Gargantext.Text.Ngrams.Token.Text
- Gargantext.Text.Search
- Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.Date
- Gargantext.API
dependencies:
- QuickCheck
......@@ -124,14 +116,18 @@ library:
executable:
main: Main.hs
source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
dependencies:
- base
- containers
- gargantext
- vector
- cassava
- ini
- optparse-generic
- unordered-containers
- full-text-search
tests:
garg-test:
......
......@@ -11,7 +11,7 @@ Portability : POSIX
-}
module Gargantext (
module Gargantext.Database,
module Gargantext.Database
-- module Gargantext.Ngrams,
-- module Gargantext.Utils,
) where
......
......@@ -6,18 +6,17 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
Loads all static file for the front-end.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell #-}
---------------------------------------------------------------------
module Gargantext.API.FrontEnd
where
module Gargantext.API.FrontEnd where
import Servant.Static.TH (createApiAndServerDecs)
......
......@@ -11,11 +11,104 @@ Here is a longer description of this module, containing some
commentary with @some markup@.
-}
------------------------------------------------------------------------
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Core.Types.Node
) where
, module Gargantext.Core.Types.Node
, Term, Terms(..)
, TokenTag(..), POS(..), NER(..)
) where
import GHC.Generics
import Data.Aeson
import Data.Monoid
import qualified Data.Set as S
import Data.Set (Set, empty)
import Data.Text (Text, unpack)
import Gargantext.Core.Types.Main
import Gargantext.Core.Types.Node
import Gargantext.Prelude
------------------------------------------------------------------------
type Term = Text
data Terms = Terms { _terms_label :: [Text]
, _terms_stem :: Set Text
} deriving (Show)
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
instance Monoid TokenTag where
mempty = TokenTag [] empty Nothing Nothing
mappend (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _)
= TokenTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
......@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams exctration.
Definitions of ngrams.
n non negative integer
Text gathers terms in unit of contexts.
-}
......@@ -24,25 +21,17 @@ import qualified Data.Text as DT
--import Data.Text.IO (readFile)
import Data.Map.Strict (Map
, lookupIndex
--, fromList, keys
)
import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
import Gargantext.Core.Types
import Gargantext.Prelude hiding (filter)
-----------------------------------------------------------------
data Group = Group { _group_label :: Ngrams
, _group_ngrams :: [Ngrams]
data Group = Group { _group_label :: Terms
, _group_terms :: Terms
} deriving (Show)
......@@ -52,41 +41,10 @@ clean txt = DT.map clean' txt
clean' '’' = '\''
clean' c = c
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-- | /!\ indexes are not the same:
-- | Index ngrams from Map
--indexNgram :: Ord a => Map a Occ -> Map Index a
--indexNgram m = fromList (zip [1..] (keys m))
-- | Index ngrams from Map
--ngramIndex :: Ord a => Map a Occ -> Map a Index
--ngramIndex m = fromList (zip (keys m) [1..])
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs = (m, is)
where
m = sumOcc (map occ xs)
is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m', fs)
where
(m, is) = indexIt xs
m' = M.filter (>50000) m
fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
......@@ -112,17 +70,18 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances
-- | Ngrams Test
-- >>> ngramsTest testText
-- 248
ngramsTest :: Text -> Int
ngramsTest x= length ws
where
--txt = concat <$> lines <$> clean <$> readFile filePath
txt = clean x
-- | Number of sentences
--ls = sentences $ txt
-- | Number of monograms used in the full text
ws = ngrams $ txt
-- | stem ngrams
--ngramsTest :: Text -> Int
--ngramsTest x = length ws
-- where
-- --txt = concat <$> lines <$> clean <$> readFile filePath
-- txt = clean x
-- -- | Number of sentences
-- --ls = sentences $ txt
-- -- | Number of monograms used in the full text
-- ws = ngrams $ txt
-- -- | stem ngrams
-- TODO
-- group ngrams
--ocs = occ $ ws
......@@ -89,3 +89,39 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------
------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
--
---- | Index ngrams from Map
----indexNgram :: Ord a => Map a Occ -> Map Index a
----indexNgram m = fromList (zip [1..] (keys m))
--
---- | Index ngrams from Map
----ngramIndex :: Ord a => Map a Occ -> Map a Index
----ngramIndex m = fromList (zip (keys m) [1..])
--
--indexWith :: Ord a => Map a Occ -> [a] -> [Int]
--indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
--
--indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
--indexIt xs = (m, is)
-- where
-- m = sumOcc (map occ xs)
-- is = map (indexWith m) xs
--
--list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
--list2fis n xs = (m', fs)
-- where
-- (m, is) = indexIt xs
-- m' = M.filter (>50000) m
-- fs = FIS.all n is
--
--text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fis n xs = list2fis n (map terms xs)
--
----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
----text2fisWith = undefined
--
{-|
Module : Gargantext.Text.Ngrams.PosTagging
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.PosTagging
where
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.En
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
where
--import Data.Text (Text)
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
-- group xs = group3 NN IN DT xs
-- TO BE REMOVED old code
--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
-- where
-- jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-- cc = (c1, "CC", c1')
-- jn1 = (j1, "JJ", j1')
-- jn2 = jn j2 j3 j2'
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--
---- extractNgrams "Test the antiinflammatory or analgesic activity?"
---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
---- > should be (antiinflammatory activity) <> (analgesic activity)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [Text]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.Fr
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french. -
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
------------------------------------------------------------------------
--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
-- where
--
--import Gargantext.Prelude
--import Data.Text (Text)
--import Data.Monoid ((<>))
--
--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--selectNgrams xs = filter selectNgrams' xs
-- where
-- selectNgrams' (_,"N" ,_ ) = True
-- selectNgrams' (_,"NC" ,_ ) = True
-- selectNgrams' (_,"NN+CC",_ ) = True
-- selectNgrams' (_,_ ,"PERSON" ) = True
-- selectNgrams' (_,_ ,"ORGANIZATION") = True
-- selectNgrams' (_,_ ,"LOCATION" ) = True
-- selectNgrams' (_,_ ,_ ) = False
--
--
--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--groupNgrams [] = []
--
----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
--
---- "Groupe : nom commun et adjectifs avec conjonction"
--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "NC", n')
-- n2 = (n <> " " <> j2, "NC", n')
--
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "N", n')
-- n2 = (n <> " " <> j2, "N", n')
--
---- Groupe : Adjectif + Conjonction de coordination + Adjectif
---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
--
---- Groupe : Nom commun + préposition + Nom commun
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
--
---- Groupe : Plusieurs adjectifs successifs
--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
--
---- Groupe : nom commun et adjectif
--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
---- Groupe : adjectif et nom commun
--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--
--
---- Si aucune des règles précédentes n'est remplie
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--
--textTest :: [Text]
--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
--
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Parser
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Ngrams selection algorithms
A form is a list of characters seperated by one or more spaces in a sentence.
A word is a form.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Text.Ngrams.PosTagging.Parser
where
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams lang s = map (group lang) <$> extractNgrams' lang s
extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams' lang t = map tokens2ngramsTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyse and groups (or not) ngrams according to
---- grammars specific of each language.
group :: Lang -> [NgramsTag] -> [NgramsTag]
group EN = En.group
group FR = Fr.group
......@@ -28,8 +28,8 @@ import Data.Ix
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Ngrams
import Gargantext.Text.Ngrams.Stem as ST
import Gargantext.Text.Terms.Mono (monoterms)
import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV
type DocId = Int
......@@ -52,14 +52,14 @@ docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTokens,
extractDocumentTerms = extractTerms,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTokens :: Doc -> DocField -> [Text]
extractTokens doc TitleField = monograms (d_title doc)
extractTokens doc AbstractField = monograms (d_abstract doc)
extractTerms :: Doc -> DocField -> [Text]
extractTerms doc TitleField = monoterms (d_title doc)
extractTerms doc AbstractField = monoterms (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
......
......@@ -18,56 +18,27 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
Source: https://en.wikipedia.org/wiki/Ngrams
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack, toLower)
import Data.Set (Set)
import qualified Data.Set as S
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Text.Ngrams.Stem (stem)
TODO
-- Prelude.concat <$> Prelude.map (filter (\n -> _my_token_pos n == Just NP)) <$> extractNgrams Gargantext.Core.EN testText_en
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups
data Ngrams = Ngrams { _ngrams_label :: [Text]
, _ngrams_stem :: Set Text
} deriving (Show)
data Terms = MonoGrams | MultiGrams
type MonoGrams = Text
type MultiGrams = [Text]
compute cooccurrences
compute graph
-}
ngrams :: Text -> [Text]
ngrams = monograms
{-# LANGUAGE NoImplicitPrelude #-}
text2ngrams :: Lang -> Text -> Ngrams
text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
module Gargantext.Text.Terms
where
txt' = splitOn (pack " ") txt
import Gargantext.Core.Types
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
------------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag w t _ _) = Terms w t
------------------------------------------------------------------------
......@@ -13,7 +13,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Lists
module Gargantext.Text.Terms.Lists
where
--import Data.Maybe
......
{-|
Module : Gargantext.Text.Terms.Mono
Description : Mono Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mono-terms are Nterms where n == 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono
where
import Data.Text (Text, toLower, split, splitOn, pack)
import qualified Data.Set as S
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
import Data.Char (isAlphaNum, isSpace)
monoterms :: Text -> [Term]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems
where
label = splitOn (pack " ") txt
stems = S.fromList $ map (stem lang) label
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
......@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-}
module Gargantext.Text.Ngrams.Stem (stem, Lang(..))
module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
where
import Data.Text (Text)
......
......@@ -17,7 +17,7 @@ Adapted from:
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Stem.En (stem)
module Gargantext.Text.Terms.Mono.Stem.En (stem)
where
import Control.Monad
......
......@@ -17,11 +17,11 @@ Source: https://en.wikipedia.org/wiki/Tokenize
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Token (tokenize)
module Gargantext.Text.Terms.Mono.Token (tokenize)
where
import Data.Text (Text)
import qualified Gargantext.Text.Ngrams.Token.Text as En
import qualified Gargantext.Text.Terms.Mono.Token.En as En
type Token = Text
......
......@@ -7,13 +7,12 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Inspired from https://bitbucket.org/gchrupala/lingo/overview
First inspired from https://bitbucket.org/gchrupala/lingo/overview
-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.Token.Text
module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..)
, Tokenizer
, tokenize
......
{-|
Module : Gargantext.Text.Terms.Multi
Description : Multi Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Multi-terms are ngrams where n > 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi (extractTokenTags)
where
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.PosTagging
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
extractTokenTags :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags lang s = map (group lang) <$> extractTokenTags' lang s
extractTokenTags' :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags' lang t = map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
{-|
Module : Gargantext.Text.Terms.Multi.Group
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to
group the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Group (group2)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Core.Types
import Gargantext.Prelude
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
{-|
Module : Gargantext.Text.Terms.Multi.Lang.En
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to group
the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.En (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group
------------------------------------------------------------------------
-- | Rule grammar to group tokens
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
{-|
Module : Gargantext.Text.Terms.Multi.Lang.Fr
Description : French Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.Fr (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group (group2)
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- TODO
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
{-|
Module : Gargantext.Text.Ngrams.PosTagging.CoreNLP
Description : CoreNLP module
Module : Gargantext.Text.Terms.Multi.PosTagging
Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech,[1] based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
module Gargantext.Text.Ngrams.PosTagging.CoreNLP
module Gargantext.Text.Terms.Multi.PosTagging
where
import GHC.Generics
......@@ -26,14 +35,14 @@ import GHC.Show (Show(..))
import Data.ByteString.Lazy.Internal (ByteString)
import Data.Aeson.TH (deriveJSON)
import Data.Aeson
import Data.Monoid
import Data.Maybe (isJust)
import Data.Set (Set, fromList, empty)
import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower, unpack)
import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
......@@ -44,53 +53,6 @@ import Control.Monad.IO.Class (MonadIO)
import Data.String.Conversions (ConvertibleStrings)
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text
......@@ -105,39 +67,21 @@ data Token = Token { _tokenIndex :: Int
} deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
------------------------------------------------------------------------
data NgramsTag = NgramsTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
------------------------------------------------------------------------
tokens2ngramsTags :: [Token] -> [NgramsTag]
tokens2ngramsTags ts = select $ map ngramsTag ts
tokens2tokensTags :: [Token] -> [TokenTag]
tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------
ngramsTag :: Token -> NgramsTag
ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n
tokenTag :: Token -> TokenTag
tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n
where
w' = split w
s' = fromList (split s)
split = splitOn (pack " ") . toLower
select :: [NgramsTag] -> [NgramsTag]
select xs = filter isNgrams xs
filter' :: [TokenTag] -> [TokenTag]
filter' xs = filter isNgrams xs
where
isNgrams (NgramsTag _ _ p n) = isJust p || isJust n
instance Monoid NgramsTag where
mempty = NgramsTag [] empty Nothing Nothing
mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
= NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
isNgrams (TokenTag _ _ p n) = isJust p || isJust n
------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment