WIP Gargantext/Text/Terms/WithList

parent cc31b225
...@@ -70,6 +70,7 @@ library: ...@@ -70,6 +70,7 @@ library:
- ini - ini
- jose-jwt - jose-jwt
- kmeans-vector - kmeans-vector
- KMP
- lens - lens
- logging-effect - logging-effect
- matrix - matrix
......
...@@ -13,6 +13,7 @@ commentary with @some markup@. ...@@ -13,6 +13,7 @@ commentary with @some markup@.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.List module Gargantext.Text.List
where where
......
...@@ -17,7 +17,6 @@ CSV parser for Gargantext corpus files. ...@@ -17,7 +17,6 @@ CSV parser for Gargantext corpus files.
module Gargantext.Text.List.CSV where module Gargantext.Text.List.CSV where
import GHC.Real (round)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import Control.Applicative import Control.Applicative
...@@ -26,14 +25,14 @@ import Control.Monad (mzero) ...@@ -26,14 +25,14 @@ import Control.Monad (mzero)
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate) import Data.Text (Text, pack)
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector) import Data.Vector (Vector)
import qualified Data.Vector as V import qualified Data.Vector as V
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
import Gargantext.Text.List.Types -- import Gargantext.Text.List.Types
------------------------------------------------------------------------ ------------------------------------------------------------------------
--csv2lists :: Vector CsvList -> Lists --csv2lists :: Vector CsvList -> Lists
......
...@@ -18,7 +18,7 @@ module Gargantext.Text.List.Types where ...@@ -18,7 +18,7 @@ module Gargantext.Text.List.Types where
import Prelude (Bounded, Enum, minBound, maxBound) import Prelude (Bounded, Enum, minBound, maxBound)
import Data.Text (Text) import Data.Text (Text)
import Data.Map (Map, empty, fromList, insert, lookup) import Data.Map (Map, empty, fromList)
import Gargantext.Prelude import Gargantext.Prelude
------------------------------------------------------------------- -------------------------------------------------------------------
......
...@@ -33,6 +33,7 @@ compute graph ...@@ -33,6 +33,7 @@ compute graph
module Gargantext.Text.Terms module Gargantext.Text.Terms
where where
import qualified Data.Set as Set
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable import Data.Traversable
...@@ -41,8 +42,9 @@ import Gargantext.Core ...@@ -41,8 +42,9 @@ import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms') import Gargantext.Text.Terms.Mono (monoterms')
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
-- remove Stop Words -- remove Stop Words
-- map (filter (\t -> not . elem t)) $ -- map (filter (\t -> not . elem t)) $
...@@ -60,5 +62,6 @@ terms :: TermType Lang -> Text -> IO [Terms] ...@@ -60,5 +62,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
{-|
Module : Gargantext.Text.Terms.WithList
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE BangPatterns #-}
module Gargantext.Text.Terms.WithList where
import qualified Data.Algorithms.KMP as KMP
import Data.Char (isSpace)
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Prelude
import Data.List (concatMap)
type Term = Text
type Label = Term
type Pattern = KMP.Table Term
type TermList = [(Label, [[Term]])]
type Patterns = [(Pattern, Int, Label)]
isMultiTermSep :: Char -> Bool
isMultiTermSep = (`elem` ",.:;?!(){}[]")
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
replaceTerms :: Patterns -> Sentence Term -> Sentence Label
replaceTerms pats terms = go 0 terms
where
go _ [] = []
go !ix (t:ts) =
case IntMap.lookup ix m of
Nothing -> t : go (ix + 1) ts
Just (len, label) ->
label : go (ix + len) (drop (len - 1) ts)
-- TODO is it what we want?
merge (len1, lab1) (len2, lab2) =
if len1 > len2 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
[ (ix, (len, label))
| (pat, len, label) <- pats, ix <- KMP.match pat terms ]
buildPatterns :: TermList -> Patterns
buildPatterns = concatMap buildPattern
where
buildPattern (label, alts) = map f alts
where
f alt = (KMP.build alt, length alt, label)
-- monoterms'' :: Lang -> Text -> [Terms]
-- monoterms'' l txt = map (text2terms l) $ monoterms txt
extractTermsWithList :: Patterns -> Text -> Corpus Label
extractTermsWithList pats =
map (replaceTerms pats) .
map (T.split isSpace) . -- text2terms
T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps
...@@ -41,4 +41,5 @@ extra-deps: ...@@ -41,4 +41,5 @@ extra-deps:
- servant-flatten-0.2 - servant-flatten-0.2
- serialise-0.2.0.0 # imt-api-client - serialise-0.2.0.0 # imt-api-client
- cborg-0.2.0.0 # imt-api-client - cborg-0.2.0.0 # imt-api-client
- KMP-0.1.0.2
resolver: lts-11.10 resolver: lts-11.10
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment