Fix WithList

parent 4b12a41d
...@@ -28,9 +28,10 @@ import Gargantext.Prelude hiding (length) ...@@ -28,9 +28,10 @@ import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------ ------------------------------------------------------------------------
type Term = Text type Term = Text
type Label = Term type MultiTerm = [Term]
type Label = MultiTerm
type TermList = [(Label, [[Term]])] type TermList = [(Label, [MultiTerm])]
type Sentence a = [a] -- or a nominal group type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences type Corpus a = [Sentence a] -- a list of sentences
......
...@@ -22,7 +22,7 @@ import GHC.IO (FilePath) ...@@ -22,7 +22,7 @@ import GHC.IO (FilePath)
import Control.Applicative import Control.Applicative
import Control.Monad (mzero) import Control.Monad (mzero)
import Data.Char (ord, isSpace) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack) import Data.Text (Text, pack)
...@@ -42,7 +42,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp ...@@ -42,7 +42,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp
csv2list :: CsvListType -> Vector CsvList -> TermList csv2list :: CsvListType -> Vector CsvList -> TermList
csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms) csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms)
-> (label, map (DT.split isSpace) $ DT.splitOn csvListFormsDelimiter forms)) -> (DT.words label, map DT.words $ DT.splitOn csvListFormsDelimiter forms))
$ V.filter (\l -> csvList_status l == lt ) vs $ V.filter (\l -> csvList_status l == lt ) vs
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -33,7 +33,7 @@ compute graph ...@@ -33,7 +33,7 @@ compute graph
module Gargantext.Text.Terms module Gargantext.Text.Terms
where where
import qualified Data.Set as Set import Data.List (concat)
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable import Data.Traversable
...@@ -64,6 +64,6 @@ terms :: TermType Lang -> Text -> IO [Terms] ...@@ -64,6 +64,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -20,44 +20,55 @@ import qualified Data.Algorithms.KMP as KMP ...@@ -20,44 +20,55 @@ import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text) import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
import Gargantext.Core.Types (Terms(Terms))
import Gargantext.Text.Context import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Prelude import Gargantext.Prelude
import Data.List (concatMap) import Data.List (concatMap)
import Data.Ord
import qualified Data.Set as Set
------------------------------------------------------------------------ ------------------------------------------------------------------------
type Pattern = KMP.Table Term data Pattern = Pattern
type Patterns = [(Pattern, Int, Label)] { _pat_table :: !(KMP.Table Term)
, _pat_length :: !Int
, _pat_terms :: !Terms
}
type Patterns = [Pattern]
------------------------------------------------------------------------ ------------------------------------------------------------------------
replaceTerms :: Patterns -> Sentence Term -> Sentence Label replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
replaceTerms pats terms = go 0 terms replaceTerms pats terms = go 0
where where
go _ [] = [] terms_len = length terms
go !ix (t:ts) =
go ix | ix >= terms_len = []
| otherwise =
case IntMap.lookup ix m of case IntMap.lookup ix m of
Nothing -> t : go (ix + 1) ts Nothing -> go (ix + 1)
Just (len, label) -> Just (len, terms) ->
label : go (ix + len) (drop (len - 1) ts) terms : go (ix + len)
merge (len1, lab1) (len2, lab2) = merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2) if len2 < len1 then (len1, lab1) else (len2, lab2)
m = m =
IntMap.fromListWith merge IntMap.fromListWith merge
[ (ix, (len, label)) [ (ix, (len, terms))
| (pat, len, label) <- pats, ix <- KMP.match pat terms ] | Pattern pat len terms <- pats, ix <- KMP.match pat terms ]
buildPatterns :: TermList -> Patterns buildPatterns :: TermList -> Patterns
buildPatterns = concatMap buildPattern buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where where
buildPattern (label, alts) = map f alts buildPattern (label, alts) = map f (label : alts)
where where
f alt = (KMP.build alt, length alt, label) f alt = Pattern (KMP.build alt) (length alt)
(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Label extractTermsWithList :: Patterns -> Text -> Corpus Terms
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment