Fix WithList

parent 4b12a41d
......@@ -28,9 +28,10 @@ import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
type Term = Text
type Label = Term
type MultiTerm = [Term]
type Label = MultiTerm
type TermList = [(Label, [[Term]])]
type TermList = [(Label, [MultiTerm])]
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
......
......@@ -22,7 +22,7 @@ import GHC.IO (FilePath)
import Control.Applicative
import Control.Monad (mzero)
import Data.Char (ord, isSpace)
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack)
......@@ -42,7 +42,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp
csv2list :: CsvListType -> Vector CsvList -> TermList
csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms)
-> (label, map (DT.split isSpace) $ DT.splitOn csvListFormsDelimiter forms))
-> (DT.words label, map DT.words $ DT.splitOn csvListFormsDelimiter forms))
$ V.filter (\l -> csvList_status l == lt ) vs
------------------------------------------------------------------------
......
......@@ -33,7 +33,7 @@ compute graph
module Gargantext.Text.Terms
where
import qualified Data.Set as Set
import Data.List (concat)
import Data.Text (Text)
import Data.Traversable
......@@ -64,6 +64,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt
terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
......@@ -20,44 +20,55 @@ import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Core.Types (Terms(Terms))
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Prelude
import Data.List (concatMap)
import Data.Ord
import qualified Data.Set as Set
------------------------------------------------------------------------
type Pattern = KMP.Table Term
type Patterns = [(Pattern, Int, Label)]
data Pattern = Pattern
{ _pat_table :: !(KMP.Table Term)
, _pat_length :: !Int
, _pat_terms :: !Terms
}
type Patterns = [Pattern]
------------------------------------------------------------------------
replaceTerms :: Patterns -> Sentence Term -> Sentence Label
replaceTerms pats terms = go 0 terms
replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
replaceTerms pats terms = go 0
where
go _ [] = []
go !ix (t:ts) =
terms_len = length terms
go ix | ix >= terms_len = []
| otherwise =
case IntMap.lookup ix m of
Nothing -> t : go (ix + 1) ts
Just (len, label) ->
label : go (ix + len) (drop (len - 1) ts)
Nothing -> go (ix + 1)
Just (len, terms) ->
terms : go (ix + len)
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
[ (ix, (len, label))
| (pat, len, label) <- pats, ix <- KMP.match pat terms ]
[ (ix, (len, terms))
| Pattern pat len terms <- pats, ix <- KMP.match pat terms ]
buildPatterns :: TermList -> Patterns
buildPatterns = concatMap buildPattern
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
buildPattern (label, alts) = map f alts
buildPattern (label, alts) = map f (label : alts)
where
f alt = (KMP.build alt, length alt, label)
f alt = Pattern (KMP.build alt) (length alt)
(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Label
extractTermsWithList :: Patterns -> Text -> Corpus Terms
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment