Commit c0914f9a authored by Alexandre Delanoë's avatar Alexandre Delanoë

[RENAME/ORG] ngrams -> terms.

parent d12b405e
...@@ -7,10 +7,8 @@ Maintainer : team@gargantext.org ...@@ -7,10 +7,8 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Given a Gargantext CSV File and its Query Given a Gargantext CSV File and its Query This script cleans and
This script cleans and compress the contexts around the main terms of the query. compress the contexts around the main terms of the query.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
...@@ -20,7 +18,7 @@ module CleanCsvCorpus where ...@@ -20,7 +18,7 @@ module CleanCsvCorpus where
--import GHC.IO (FilePath) --import GHC.IO (FilePath)
import Data.SearchEngine as S import Data.SearchEngine as S
import qualified Data.Set as S import qualified Data.Set as S
import Data.Text (pack)
import Data.Vector (Vector) import Data.Vector (Vector)
import qualified Data.Vector as V import qualified Data.Vector as V
...@@ -50,7 +48,7 @@ main = do ...@@ -50,7 +48,7 @@ main = do
let docs = toDocs csvDocs let docs = toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine q let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs) let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs') putStrLn $ "Number of documents after:" <> show (V.length docs')
......
...@@ -14,13 +14,13 @@ Script to start gargantext with different modes (Dev, Prod, Mock). ...@@ -14,13 +14,13 @@ Script to start gargantext with different modes (Dev, Prod, Mock).
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
module Main where module Main where
import Prelude (putStrLn)
import Options.Generic import Options.Generic
import Data.Text (unpack) import Data.Text (unpack)
......
...@@ -23,18 +23,10 @@ library: ...@@ -23,18 +23,10 @@ library:
- Gargantext - Gargantext
- Gargantext.Prelude - Gargantext.Prelude
- Gargantext.Core - Gargantext.Core
- Gargantext.Core.Utils
- Gargantext.Core.Types - Gargantext.Core.Types
- Gargantext.Core.Types.Node
- Gargantext.Text - Gargantext.Text
- Gargantext.Text.Metrics - Gargantext.Text.Search
- Gargantext.Text.Metrics.Occurrences
- Gargantext.Text.Metrics.FrequentItemSet
- Gargantext.Text.Ngrams.PosTagging.CoreNLP
- Gargantext.Text.Ngrams.PosTagging.Parser
- Gargantext.Text.Ngrams.Token.Text
- Gargantext.Text.Parsers.CSV - Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.Date
- Gargantext.API - Gargantext.API
dependencies: dependencies:
- QuickCheck - QuickCheck
...@@ -124,14 +116,18 @@ library: ...@@ -124,14 +116,18 @@ library:
executable: executable:
main: Main.hs main: Main.hs
source-dirs: app source-dirs: app
ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2 ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2
dependencies: dependencies:
- base - base
- containers
- gargantext - gargantext
- vector
- cassava
- ini - ini
- optparse-generic - optparse-generic
- unordered-containers - unordered-containers
- full-text-search
tests: tests:
garg-test: garg-test:
......
...@@ -11,7 +11,7 @@ Portability : POSIX ...@@ -11,7 +11,7 @@ Portability : POSIX
-} -}
module Gargantext ( module Gargantext (
module Gargantext.Database, module Gargantext.Database
-- module Gargantext.Ngrams, -- module Gargantext.Ngrams,
-- module Gargantext.Utils, -- module Gargantext.Utils,
) where ) where
......
...@@ -6,18 +6,17 @@ License : AGPL + CECILL v3 ...@@ -6,18 +6,17 @@ License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-} Loads all static file for the front-end.
-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
--------------------------------------------------------------------- ---------------------------------------------------------------------
module Gargantext.API.FrontEnd module Gargantext.API.FrontEnd where
where
import Servant.Static.TH (createApiAndServerDecs) import Servant.Static.TH (createApiAndServerDecs)
......
...@@ -11,11 +11,104 @@ Here is a longer description of this module, containing some ...@@ -11,11 +11,104 @@ Here is a longer description of this module, containing some
commentary with @some markup@. commentary with @some markup@.
-} -}
------------------------------------------------------------------------
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Core.Types.Node , module Gargantext.Core.Types.Node
) where , Term, Terms(..)
, TokenTag(..), POS(..), NER(..)
) where
import GHC.Generics
import Data.Aeson
import Data.Monoid
import qualified Data.Set as S
import Data.Set (Set, empty)
import Data.Text (Text, unpack)
import Gargantext.Core.Types.Main import Gargantext.Core.Types.Main
import Gargantext.Core.Types.Node import Gargantext.Core.Types.Node
import Gargantext.Prelude
------------------------------------------------------------------------
type Term = Text
data Terms = Terms { _terms_label :: [Text]
, _terms_stem :: Set Text
} deriving (Show)
instance Eq Terms where
(==) (Terms _ s1) (Terms _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
data TokenTag = TokenTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
instance Monoid TokenTag where
mempty = TokenTag [] empty Nothing Nothing
mappend (TokenTag w1 s1 p1 n1) (TokenTag w2 s2 p2 _)
= TokenTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
...@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org ...@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Ngrams exctration. Text gathers terms in unit of contexts.
Definitions of ngrams.
n non negative integer
-} -}
...@@ -24,25 +21,17 @@ import qualified Data.Text as DT ...@@ -24,25 +21,17 @@ import qualified Data.Text as DT
--import Data.Text.IO (readFile) --import Data.Text.IO (readFile)
import Data.Map.Strict (Map
, lookupIndex
--, fromList, keys
)
import Data.Text (Text, split) import Data.Text (Text, split)
import qualified Data.Map.Strict as M (filter)
import NLP.FullStop (segment) import NLP.FullStop (segment)
----------------------------------------------------------------- -----------------------------------------------------------------
import Gargantext.Text.Ngrams
import Gargantext.Text.Metrics.Occurrences
import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS import Gargantext.Core.Types
import Gargantext.Prelude hiding (filter) import Gargantext.Prelude hiding (filter)
----------------------------------------------------------------- -----------------------------------------------------------------
data Group = Group { _group_label :: Ngrams data Group = Group { _group_label :: Terms
, _group_ngrams :: [Ngrams] , _group_terms :: Terms
} deriving (Show) } deriving (Show)
...@@ -52,41 +41,10 @@ clean txt = DT.map clean' txt ...@@ -52,41 +41,10 @@ clean txt = DT.map clean' txt
clean' '’' = '\'' clean' '’' = '\''
clean' c = c clean' c = c
--noApax :: Ord a => Map a Occ -> Map a Occ --noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m --noApax m = M.filter (>1) m
-- | /!\ indexes are not the same:
-- | Index ngrams from Map
--indexNgram :: Ord a => Map a Occ -> Map Index a
--indexNgram m = fromList (zip [1..] (keys m))
-- | Index ngrams from Map
--ngramIndex :: Ord a => Map a Occ -> Map a Index
--ngramIndex m = fromList (zip (keys m) [1..])
indexWith :: Ord a => Map a Occ -> [a] -> [Int]
indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
indexIt xs = (m, is)
where
m = sumOcc (map occ xs)
is = map (indexWith m) xs
list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
list2fis n xs = (m', fs)
where
(m, is) = indexIt xs
m' = M.filter (>50000) m
fs = FIS.all n is
text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
text2fis n xs = list2fis n (map ngrams xs)
--text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fisWith = undefined
------------------------------------------------------------------- -------------------------------------------------------------------
-- Contexts of text -- Contexts of text
sentences :: Text -> [Text] sentences :: Text -> [Text]
...@@ -112,17 +70,18 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances ...@@ -112,17 +70,18 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances
-- | Ngrams Test -- | Ngrams Test
-- >>> ngramsTest testText -- >>> ngramsTest testText
-- 248 -- 248
ngramsTest :: Text -> Int --ngramsTest :: Text -> Int
ngramsTest x= length ws --ngramsTest x = length ws
where -- where
--txt = concat <$> lines <$> clean <$> readFile filePath -- --txt = concat <$> lines <$> clean <$> readFile filePath
txt = clean x -- txt = clean x
-- | Number of sentences -- -- | Number of sentences
--ls = sentences $ txt -- --ls = sentences $ txt
-- | Number of monograms used in the full text -- -- | Number of monograms used in the full text
ws = ngrams $ txt -- ws = ngrams $ txt
-- | stem ngrams -- -- | stem ngrams
-- TODO -- TODO
-- group ngrams -- group ngrams
--ocs = occ $ ws --ocs = occ $ ws
...@@ -89,3 +89,39 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f ...@@ -89,3 +89,39 @@ fisWith s f is = unMaybe $ map items2fis $ filter' $ runLCMmatrix is f
------------------------------------------------------------------------ ------------------------------------------------------------------------
------------------------------------------------------------------------ ------------------------------------------------------------------------
--
---- | /!\ indexes are not the same:
--
---- | Index ngrams from Map
----indexNgram :: Ord a => Map a Occ -> Map Index a
----indexNgram m = fromList (zip [1..] (keys m))
--
---- | Index ngrams from Map
----ngramIndex :: Ord a => Map a Occ -> Map a Index
----ngramIndex m = fromList (zip (keys m) [1..])
--
--indexWith :: Ord a => Map a Occ -> [a] -> [Int]
--indexWith m xs = unMaybe $ map (\x -> lookupIndex x m) xs
--
--indexIt :: Ord a => [[a]] -> (Map a Int, [[Int]])
--indexIt xs = (m, is)
-- where
-- m = sumOcc (map occ xs)
-- is = map (indexWith m) xs
--
--list2fis :: Ord a => FIS.Frequency -> [[a]] -> (Map a Int, [FIS.Fis])
--list2fis n xs = (m', fs)
-- where
-- (m, is) = indexIt xs
-- m' = M.filter (>50000) m
-- fs = FIS.all n is
--
--text2fis :: FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
--text2fis n xs = list2fis n (map terms xs)
--
----text2fisWith :: FIS.Size -> FIS.Frequency -> [Text] -> (Map Text Int, [FIS.Fis])
----text2fisWith = undefined
--
{-|
Module : Gargantext.Text.Ngrams.PosTagging
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.PosTagging
where
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.En
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.En (group)
where
--import Data.Text (Text)
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
-- group3 :: POS -> POS -> POS -> [NgramsTag] -> [NgramsTag]
-- group xs = group3 NN IN DT xs
-- TO BE REMOVED old code
--groupNgrams ((j1,"JJ",j1'):(c1,"CC",c1'):(j2,"JJ",j2'):(j3,"JJ",_):xs) = groupNgrams (jn1:cc:jn2:xs)
-- where
-- jn j' j'' jn' = (j' <> " " <> j'', "JJ", jn')
-- cc = (c1, "CC", c1')
-- jn1 = (j1, "JJ", j1')
-- jn2 = jn j2 j3 j2'
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NN",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((j1,"JJ",_):(_,"CC",_):(j2,"JJ",_):(n,"NNS",nn):xs) = groupNgrams (jn1:jn2:xs)
-- where
-- jn j m mm p = (j <> " " <> m, p, mm)
-- jn1 = jn j1 n nn ("NN+CC" :: Text)
-- jn2 = jn j2 n nn ("NN+CC" :: Text)
--
--groupNgrams ((x,"JJ",_):(y,"JJ",yy):xs) = groupNgrams ((x <> " " <> y, "JJ", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"JJ",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NNP",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NNS",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NP",_):(y,"NP",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> y, "NN", yy):xs)
--
--
---- extractNgrams "Test the antiinflammatory or analgesic activity?"
---- [[("``","``","O"),("Test","VB","O"),("the","DT","O"),("antiinflammatory activity analgesic activity","NN","O"),("?",".","O"),("''","''","O")]]
---- > should be (antiinflammatory activity) <> (analgesic activity)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NN",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--groupNgrams ((x,"NN",_):(o,"IN",_):(det,"DT",_):(y,"NNP",yy):xs) = groupNgrams ((x <> " " <> o <> " " <> det <> " " <> y, "NN", yy):xs)
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--textTest :: [Text]
--textTest = [ "Alcoholic extract of Kaempferia galanga was tested for analgesic and antiinflammatory activities in animal models. "
-- , "Three doses, 300 mg/kg, 600 mg/kg and 1200 mg/kg of the plant extract prepared as a suspension in 2 ml of 2% gum acacia were used. "
-- , " Acute and sub acute inflammatory activities were studied in rats by carrageenan induced paw edema and cotton pellet induced granuloma models respectively. "
-- , "In both models, the standard drug used was aspirin 100 mg/kg. "
-- , "Two doses 600 mg/kg and 1200 mg/kg of plant extract exhibited significant (P<0.001) antiinflammatory activity in carrageenan model and cotton pellet granuloma model in comparison to control. "
-- , "Analgesic activity was studied in rats using hot plate and tail-flick models. "
-- , "Codeine 5 mg/kg and vehicle served as standard and control respectively. "
-- , "The two doses of plant extract exhibited significant analgesic activity in tail flick model (P<0.001) and hot plate model (P<0.001) in comparison to control. "
-- , "In conclusion K. galanga possesses antiinflammatory and analgesic activities. "]
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Lang.Fr
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french. -
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (group)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import Gargantext.Prelude
group :: [NgramsTag] -> [NgramsTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [NgramsTag] -> [NgramsTag]
group2 p1 p2 (x@(NgramsTag _ _ (Just p1') _):y@(NgramsTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(NgramsTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(NgramsTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(NgramsTag _ _ (Just _) _):y@(NgramsTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
------------------------------------------------------------------------
--module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
-- where
--
--import Gargantext.Prelude
--import Data.Text (Text)
--import Data.Monoid ((<>))
--
--selectNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--selectNgrams xs = filter selectNgrams' xs
-- where
-- selectNgrams' (_,"N" ,_ ) = True
-- selectNgrams' (_,"NC" ,_ ) = True
-- selectNgrams' (_,"NN+CC",_ ) = True
-- selectNgrams' (_,_ ,"PERSON" ) = True
-- selectNgrams' (_,_ ,"ORGANIZATION") = True
-- selectNgrams' (_,_ ,"LOCATION" ) = True
-- selectNgrams' (_,_ ,_ ) = False
--
--
--groupNgrams :: [(Text, Text, Text)] -> [(Text, Text, Text)]
--groupNgrams [] = []
--
----groupNgrams ((_,"DET",_):xs) = groupNgrams xs
--
---- "Groupe : nom commun et adjectifs avec conjonction"
--groupNgrams ((n,"NC",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "NC", n')
-- n2 = (n <> " " <> j2, "NC", n')
--
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((n,"N",n'):(j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",_):xs) = groupNgrams (n1:n2:xs)
-- where
-- n1 = (n <> " " <> j1, "N", n')
-- n2 = (n <> " " <> j2, "N", n')
--
---- Groupe : Adjectif + Conjonction de coordination + Adjectif
---- groupNgrams ((j1,"ADJ",_):(_,"CC",_):(j2,"ADJ",j2'):xs) = groupNgrams ((j1 <> " " <> j2, "ADJ", j2'):xs)
--
---- Groupe : Nom commun + préposition + Nom commun
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NC",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(p,"P",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> p <> " " <> n2, "NC", n2'):xs)
--groupNgrams ((n1,"NC",_):(prep,"P",_):(det,"DET",_):(n2,"NPP",n2'):xs) = groupNgrams ((n1 <> " " <> prep <> " " <> det <> " " <> n2, "NC", n2'):xs)
--
---- Groupe : Plusieurs adjectifs successifs
--groupNgrams ((x,"ADJ",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "ADJ", yy):xs)
--
---- Groupe : nom commun et adjectif
--groupNgrams ((x,"NC",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"N",_):(y,"ADJ",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
---- Groupe : adjectif et nom commun
--groupNgrams ((x,"ADJ",_):(y,"NC",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
---- /!\ sometimes N instead of NC (why?)
--groupNgrams ((x,"ADJ",_):(y,"N",yy):xs) = groupNgrams ((x <> " " <> y, "NC", yy):xs)
--
--
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--
--
---- Si aucune des règles précédentes n'est remplie
--groupNgrams (x:xs) = (x:(groupNgrams xs))
--
--
--textTest :: [Text]
--textTest = [ "L'heure d'arrivée des coureurs dépend de la météo du jour."]
--
{-|
Module : Gargantext.Text.Ngrams.PosTagging.Parser
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Ngrams selection algorithms
A form is a list of characters seperated by one or more spaces in a sentence.
A word is a form.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Text.Ngrams.PosTagging.Parser
where
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
extractNgrams :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams lang s = map (group lang) <$> extractNgrams' lang s
extractNgrams' :: Lang -> Text -> IO [[NgramsTag]]
extractNgrams' lang t = map tokens2ngramsTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyse and groups (or not) ngrams according to
---- grammars specific of each language.
group :: Lang -> [NgramsTag] -> [NgramsTag]
group EN = En.group
group FR = Fr.group
...@@ -28,8 +28,8 @@ import Data.Ix ...@@ -28,8 +28,8 @@ import Data.Ix
import Data.Text (Text) import Data.Text (Text)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Ngrams import Gargantext.Text.Terms.Mono (monoterms)
import Gargantext.Text.Ngrams.Stem as ST import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV import Gargantext.Text.Parsers.CSV
type DocId = Int type DocId = Int
...@@ -52,14 +52,14 @@ docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures ...@@ -52,14 +52,14 @@ docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig = docSearchConfig =
SearchConfig { SearchConfig {
documentKey = d_docId, documentKey = d_docId,
extractDocumentTerms = extractTokens, extractDocumentTerms = extractTerms,
transformQueryTerm = normaliseQueryToken, transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures documentFeatureValue = const noFeatures
} }
where where
extractTokens :: Doc -> DocField -> [Text] extractTerms :: Doc -> DocField -> [Text]
extractTokens doc TitleField = monograms (d_title doc) extractTerms doc TitleField = monoterms (d_title doc)
extractTokens doc AbstractField = monograms (d_abstract doc) extractTerms doc AbstractField = monoterms (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok = normaliseQueryToken tok =
......
...@@ -18,56 +18,27 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a ...@@ -18,56 +18,27 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
Source: https://en.wikipedia.org/wiki/Ngrams Source: https://en.wikipedia.org/wiki/Ngrams
-} TODO
-- Prelude.concat <$> Prelude.map (filter (\n -> _my_token_pos n == Just NP)) <$> extractNgrams Gargantext.Core.EN testText_en
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams
where
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, split, splitOn, pack, toLower)
import Data.Set (Set)
import qualified Data.Set as S
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Text.Ngrams.Stem (stem)
group Ngrams -> Tree
compute occ by node of Tree
group occs according groups
data Ngrams = Ngrams { _ngrams_label :: [Text] compute cooccurrences
, _ngrams_stem :: Set Text compute graph
} deriving (Show)
data Terms = MonoGrams | MultiGrams
type MonoGrams = Text
type MultiGrams = [Text]
-}
ngrams :: Text -> [Text] {-# LANGUAGE NoImplicitPrelude #-}
ngrams = monograms
text2ngrams :: Lang -> Text -> Ngrams module Gargantext.Text.Terms
text2ngrams lang txt = Ngrams txt' (S.fromList $ map (stem lang) txt')
where where
txt' = splitOn (pack " ") txt
import Gargantext.Core.Types
equivNgrams :: Ngrams -> Ngrams -> Bool ------------------------------------------------------------------------
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2 tokenTag2terms :: TokenTag -> Terms
|| s2 `S.isSubsetOf` s1 tokenTag2terms (TokenTag w t _ _) = Terms w t
------------------------------------------------------------------------
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
...@@ -13,7 +13,7 @@ commentary with @some markup@. ...@@ -13,7 +13,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Lists module Gargantext.Text.Terms.Lists
where where
--import Data.Maybe --import Data.Maybe
......
{-|
Module : Gargantext.Text.Terms.Mono
Description : Mono Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mono-terms are Nterms where n == 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono
where
import Data.Text (Text, toLower, split, splitOn, pack)
import qualified Data.Set as S
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
import Data.Char (isAlphaNum, isSpace)
monoterms :: Text -> [Term]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems
where
label = splitOn (pack " ") txt
stems = S.fromList $ map (stem lang) label
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
...@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming ...@@ -18,7 +18,7 @@ Source : https://en.wikipedia.org/wiki/Stemming
-} -}
module Gargantext.Text.Ngrams.Stem (stem, Lang(..)) module Gargantext.Text.Terms.Mono.Stem (stem, Lang(..))
where where
import Data.Text (Text) import Data.Text (Text)
......
...@@ -17,7 +17,7 @@ Adapted from: ...@@ -17,7 +17,7 @@ Adapted from:
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Stem.En (stem) module Gargantext.Text.Terms.Mono.Stem.En (stem)
where where
import Control.Monad import Control.Monad
......
...@@ -17,11 +17,11 @@ Source: https://en.wikipedia.org/wiki/Tokenize ...@@ -17,11 +17,11 @@ Source: https://en.wikipedia.org/wiki/Tokenize
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.Token (tokenize) module Gargantext.Text.Terms.Mono.Token (tokenize)
where where
import Data.Text (Text) import Data.Text (Text)
import qualified Gargantext.Text.Ngrams.Token.Text as En import qualified Gargantext.Text.Terms.Mono.Token.En as En
type Token = Text type Token = Text
......
...@@ -7,13 +7,12 @@ Maintainer : team@gargantext.org ...@@ -7,13 +7,12 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Inspired from https://bitbucket.org/gchrupala/lingo/overview First inspired from https://bitbucket.org/gchrupala/lingo/overview
-} -}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Ngrams.Token.Text module Gargantext.Text.Terms.Mono.Token.En
( EitherList(..) ( EitherList(..)
, Tokenizer , Tokenizer
, tokenize , tokenize
......
{-|
Module : Gargantext.Text.Terms.Multi
Description : Multi Terms module
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Multi-terms are ngrams where n > 1.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi (extractTokenTags)
where
import Data.Text hiding (map, group)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.PosTagging
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
extractTokenTags :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags lang s = map (group lang) <$> extractTokenTags' lang s
extractTokenTags' :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags' lang t = map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
group :: Lang -> [TokenTag] -> [TokenTag]
group EN = En.group
group FR = Fr.group
{-|
Module : Gargantext.Text.Terms.Multi.Group
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to
group the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Group (group2)
where
import Data.Maybe (Maybe(Just))
import Gargantext.Core.Types
import Gargantext.Prelude
-- | FIXME p1 and p2 not really taken into account
group2 :: POS -> POS -> [TokenTag] -> [TokenTag]
group2 p1 p2 (x@(TokenTag _ _ (Just p1') _):y@(TokenTag _ _ (Just p2') _):z) =
if (p1 == p1') && (p2 == p2')
then group2 p1 p2 (x<>y : z)
else (x : group2 p1 p2 (y:z))
group2 p1 p2 (x@(TokenTag _ _ Nothing _):y) = (x: group2 p1 p2 y)
group2 _ _ [x@(TokenTag _ _ (Just _) _)] = [x]
group2 p1 p2 (x@(TokenTag _ _ (Just _) _):y@(TokenTag _ _ Nothing _):z) = (x:y: group2 p1 p2 (y:z))
group2 _ _ [] = []
{-|
Module : Gargantext.Text.Terms.Multi.Lang.En
Description : English Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Rule-based grammars are computed in this english module in order to group
the tokens into extracted terms.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.En (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group
------------------------------------------------------------------------
-- | Rule grammar to group tokens
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
--groupNgrams ((x,_,"LOCATION"):(y,yy,"LOCATION"):xs) = groupNgrams ((x <> " " <> y,yy,"LOCATION"):xs)
--
--groupNgrams (x:xs) = (x:(groupNgrams xs))
{-|
Module : Gargantext.Text.Terms.Multi.Lang.Fr
Description : French Grammar rules to group postag tokens.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This @group@ function groups horizontally ngrams in their context of
sentence according to grammars specific of each language. In english, JJ
is ADJectiv in french.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Terms.Multi.Lang.Fr (group)
where
import Gargantext.Prelude
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi.Group (group2)
group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
$ group2 NP IN
$ group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
$ group2 JJ CC
$ ntags
------------------------------------------------------------------------
-- TODO
--groupNgrams ((x,_,"PERSON"):(y,yy,"PERSON"):xs) = groupNgrams ((x <> " " <> y,yy,"PERSON"):xs)
--groupNgrams ((x,_,"ORGANIZATION"):(y,yy,"ORGANIZATION"):xs) = groupNgrams ((x <> " " <> y,yy,"ORGANIZATION"):xs)
{-| {-|
Module : Gargantext.Text.Ngrams.PosTagging.CoreNLP Module : Gargantext.Text.Terms.Multi.PosTagging
Description : CoreNLP module Description : PosTagging module using Stanford java REST API
Copyright : (c) CNRS, 2017 Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
In corpus linguistics, part-of-speech tagging (POS tagging or PoS
tagging or POST), also called grammatical tagging or word-category
disambiguation, is the process of marking up a word in a text (corpus)
as corresponding to a particular part of speech,[1] based on both its
definition and its context—i.e., its relationship with adjacent and
related words in a phrase, sentence, or paragraph. A simplified form of
this is commonly taught to school-age children, in the identification of
words as nouns, verbs, adjectives, adverbs, etc.
Source: https://en.wikipedia.org/wiki/Part-of-speech_tagging
-} -}
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeOperators #-}
{-# LANGUAGE FlexibleContexts #-}
module Gargantext.Text.Terms.Multi.PosTagging
module Gargantext.Text.Ngrams.PosTagging.CoreNLP
where where
import GHC.Generics import GHC.Generics
...@@ -26,14 +35,14 @@ import GHC.Show (Show(..)) ...@@ -26,14 +35,14 @@ import GHC.Show (Show(..))
import Data.ByteString.Lazy.Internal (ByteString) import Data.ByteString.Lazy.Internal (ByteString)
import Data.Aeson.TH (deriveJSON) import Data.Aeson.TH (deriveJSON)
import Data.Aeson import Data.Aeson
import Data.Monoid
import Data.Maybe (isJust) import Data.Maybe (isJust)
import Data.Set (Set, fromList, empty) import Data.Set (fromList)
import Data.Text (Text, splitOn, pack, toLower, unpack) import Data.Text (Text, splitOn, pack, toLower)
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
import Gargantext.Core.Utils.Prefix (unPrefix) import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude import Gargantext.Prelude
...@@ -44,53 +53,6 @@ import Control.Monad.IO.Class (MonadIO) ...@@ -44,53 +53,6 @@ import Control.Monad.IO.Class (MonadIO)
import Data.String.Conversions (ConvertibleStrings) import Data.String.Conversions (ConvertibleStrings)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Tag = POS | NER
deriving (Show, Eq)
------------------------------------------------------------------------
------------------------------------------------------------------------
data POS = NP
| JJ | VB
| CC | IN | DT
| NoPos
deriving (Show, Generic, Eq)
------------------------------------------------------------------------
instance FromJSON POS where
parseJSON = withText "String" (\x -> pure (pos $ unpack x))
where
pos :: [Char] -> POS
pos "NP" = NP
pos "NN" = NP
pos "NC" = NP
pos "NNS" = NP
pos "NNP" = NP
pos "JJ" = JJ
pos "ADJ" = JJ
pos "VB" = VB
pos "VBN" = VB
pos "VBG" = VB
pos "CC" = CC
pos "IN" = IN
pos "DT" = DT
-- French specific
pos "P" = IN
pos _ = NoPos
instance ToJSON POS
------------------------------------------------------------------------
data NER = PERSON | ORGANIZATION | LOCATION | NoNER
deriving (Show, Generic)
------------------------------------------------------------------------
instance FromJSON NER where
parseJSON = withText "String" (\x -> pure (ner $ unpack x))
where
ner :: [Char] -> NER
ner "PERSON" = PERSON
ner "ORGANIZATION" = ORGANIZATION
ner "LOCATION" = LOCATION
ner _ = NoNER
instance ToJSON NER
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Token = Token { _tokenIndex :: Int data Token = Token { _tokenIndex :: Int
, _tokenWord :: Text , _tokenWord :: Text
...@@ -105,39 +67,21 @@ data Token = Token { _tokenIndex :: Int ...@@ -105,39 +67,21 @@ data Token = Token { _tokenIndex :: Int
} deriving (Show, Generic) } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token) $(deriveJSON (unPrefix "_token") ''Token)
------------------------------------------------------------------------ ------------------------------------------------------------------------
data NgramsTag = NgramsTag { _my_token_word :: [Text]
, _my_token_stem :: Set Text
, _my_token_pos :: Maybe POS
, _my_token_ner :: Maybe NER
} deriving (Show)
------------------------------------------------------------------------ ------------------------------------------------------------------------
tokens2ngramsTags :: [Token] -> [NgramsTag] tokens2tokensTags :: [Token] -> [TokenTag]
tokens2ngramsTags ts = select $ map ngramsTag ts tokens2tokensTags ts = filter' $ map tokenTag ts
------------------------------------------------------------------------ ------------------------------------------------------------------------
ngramsTag :: Token -> NgramsTag tokenTag :: Token -> TokenTag
ngramsTag (Token _ _ w s _ _ p n _ _) = NgramsTag w' s' p n tokenTag (Token _ _ w s _ _ p n _ _) = TokenTag w' s' p n
where where
w' = split w w' = split w
s' = fromList (split s) s' = fromList (split s)
split = splitOn (pack " ") . toLower split = splitOn (pack " ") . toLower
select :: [NgramsTag] -> [NgramsTag] filter' :: [TokenTag] -> [TokenTag]
select xs = filter isNgrams xs filter' xs = filter isNgrams xs
where where
isNgrams (NgramsTag _ _ p n) = isJust p || isJust n isNgrams (TokenTag _ _ p n) = isJust p || isJust n
instance Monoid NgramsTag where
mempty = NgramsTag [] empty Nothing Nothing
mappend (NgramsTag w1 s1 p1 n1) (NgramsTag w2 s2 p2 _)
= NgramsTag (w1 <> w2) (s1 <> s2) p3 n1
where
p3 = case (p1,p2) of
(Just JJ, Just NP) -> Just NP
(Just VB, Just NP) -> Just NP
_ -> p1
mconcat = foldl mappend mempty
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Sentence = Sentence { _sentenceIndex :: Int data Sentence = Sentence { _sentenceIndex :: Int
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment