1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
{-|
Module : Gargantext.Text.Terms.WithList
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Gargantext.Text.Terms.WithList where
import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text, concat)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Prelude (error)
import Gargantext.Prelude
import Data.List (null, concatMap)
import Data.Ord
------------------------------------------------------------------------
data Pattern = Pattern
{ _pat_table :: !(KMP.Table Text)
, _pat_length :: !Int
, _pat_terms :: ![Text]
}
type Patterns = [Pattern]
------------------------------------------------------------------------
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
where
terms_len = length terms
go ix | ix >= terms_len = []
| otherwise =
case IntMap.lookup ix m of
Nothing -> go (ix + 1)
Just (len, term) ->
term : go (ix + len)
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
[ (ix, (len, term))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ]
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
buildPattern (label, alts) = map f (label : alts)
where
f alt | "" `elem` alt = error "buildPatterns: ERR1"
| null alt = error "buildPatterns: ERR2"
| otherwise =
Pattern (KMP.build alt) (length alt) label
--(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
-- | Extract terms
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats) . monoTextsBySentence