WIP Gargantext/Text/Terms/WithList

parent cc31b225
......@@ -70,6 +70,7 @@ library:
- ini
- jose-jwt
- kmeans-vector
- KMP
- lens
- logging-effect
- matrix
......
......@@ -13,6 +13,7 @@ commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.List
where
......
......@@ -17,7 +17,6 @@ CSV parser for Gargantext corpus files.
module Gargantext.Text.List.CSV where
import GHC.Real (round)
import GHC.IO (FilePath)
import Control.Applicative
......@@ -26,14 +25,14 @@ import Control.Monad (mzero)
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import Data.Text (Text, pack)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude hiding (length)
import Gargantext.Text.List.Types
-- import Gargantext.Text.List.Types
------------------------------------------------------------------------
--csv2lists :: Vector CsvList -> Lists
......
......@@ -18,7 +18,7 @@ module Gargantext.Text.List.Types where
import Prelude (Bounded, Enum, minBound, maxBound)
import Data.Text (Text)
import Data.Map (Map, empty, fromList, insert, lookup)
import Data.Map (Map, empty, fromList)
import Gargantext.Prelude
-------------------------------------------------------------------
......
......@@ -33,6 +33,7 @@ compute graph
module Gargantext.Text.Terms
where
import qualified Data.Set as Set
import Data.Text (Text)
import Data.Traversable
......@@ -41,8 +42,9 @@ import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang
data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
......@@ -60,5 +62,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt
------------------------------------------------------------------------
{-|
Module : Gargantext.Text.Terms.WithList
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE BangPatterns #-}
module Gargantext.Text.Terms.WithList where
import qualified Data.Algorithms.KMP as KMP
import Data.Char (isSpace)
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Prelude
import Data.List (concatMap)
type Term = Text
type Label = Term
type Pattern = KMP.Table Term
type TermList = [(Label, [[Term]])]
type Patterns = [(Pattern, Int, Label)]
isMultiTermSep :: Char -> Bool
isMultiTermSep = (`elem` ",.:;?!(){}[]")
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
replaceTerms :: Patterns -> Sentence Term -> Sentence Label
replaceTerms pats terms = go 0 terms
where
go _ [] = []
go !ix (t:ts) =
case IntMap.lookup ix m of
Nothing -> t : go (ix + 1) ts
Just (len, label) ->
label : go (ix + len) (drop (len - 1) ts)
-- TODO is it what we want?
merge (len1, lab1) (len2, lab2) =
if len1 > len2 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
[ (ix, (len, label))
| (pat, len, label) <- pats, ix <- KMP.match pat terms ]
buildPatterns :: TermList -> Patterns
buildPatterns = concatMap buildPattern
where
buildPattern (label, alts) = map f alts
where
f alt = (KMP.build alt, length alt, label)
-- monoterms'' :: Lang -> Text -> [Terms]
-- monoterms'' l txt = map (text2terms l) $ monoterms txt
extractTermsWithList :: Patterns -> Text -> Corpus Label
extractTermsWithList pats =
map (replaceTerms pats) .
map (T.split isSpace) . -- text2terms
T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps
......@@ -41,4 +41,5 @@ extra-deps:
- servant-flatten-0.2
- serialise-0.2.0.0 # imt-api-client
- cborg-0.2.0.0 # imt-api-client
- KMP-0.1.0.2
resolver: lts-11.10
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment