Commit 4f5491c3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WithList] adding labelPolicy.

parent 1237e415
...@@ -29,6 +29,7 @@ library: ...@@ -29,6 +29,7 @@ library:
- Gargantext.Core - Gargantext.Core
- Gargantext.Core.Types - Gargantext.Core.Types
- Gargantext.Text - Gargantext.Text
- Gargantext.Text.List.CSV
- Gargantext.Text.Search - Gargantext.Text.Search
- Gargantext.Text.Parsers.CSV - Gargantext.Text.Parsers.CSV
- Gargantext.API - Gargantext.API
......
...@@ -7,7 +7,7 @@ Maintainer : team@gargantext.org ...@@ -7,7 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Context of text management tool Context of text management tool, here are logic of main types.
-} -}
...@@ -25,6 +25,20 @@ import Gargantext.Text ...@@ -25,6 +25,20 @@ import Gargantext.Text
import Gargantext.Prelude hiding (length) import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
type Term = Text
type Label = Term
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
data SplitContext = Chars Int | Sentences Int | Paragraphs Int data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag :: Text -> [Tag Text] tag :: Text -> [Tag Text]
......
...@@ -28,7 +28,7 @@ import Data.Ix ...@@ -28,7 +28,7 @@ import Data.Ix
import Data.Text (Text) import Data.Text (Text)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoterms) import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Text.Terms.Mono.Stem as ST import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV import Gargantext.Text.Parsers.CSV
...@@ -58,8 +58,8 @@ docSearchConfig = ...@@ -58,8 +58,8 @@ docSearchConfig =
} }
where where
extractTerms :: Doc -> DocField -> [Text] extractTerms :: Doc -> DocField -> [Text]
extractTerms doc TitleField = monoterms (d_title doc) extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoterms (d_abstract doc) extractTerms doc AbstractField = monoTexts (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok = normaliseQueryToken tok =
......
...@@ -41,7 +41,7 @@ import Gargantext.Prelude ...@@ -41,7 +41,7 @@ import Gargantext.Prelude
import Gargantext.Core import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms') import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList) import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
...@@ -59,9 +59,11 @@ extractTerms termTypeLang = mapM (terms termTypeLang) ...@@ -59,9 +59,11 @@ extractTerms termTypeLang = mapM (terms termTypeLang)
-- MonoMulti : mono and multi -- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet) -- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms] terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList labelPolicy list txt
where
labelPolicy = undefined
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -13,10 +13,17 @@ Mono-terms are Nterms where n == 1. ...@@ -13,10 +13,17 @@ Mono-terms are Nterms where n == 1.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono (monoterms, monoterms') module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence)
where where
import Prelude (String)
import Data.Char (isSpace)
import Data.Text (Text, toLower, split, splitOn, pack) import Data.Text (Text, toLower, split, splitOn, pack)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.List as L
import qualified Data.Set as S import qualified Data.Set as S
import Gargantext.Core import Gargantext.Core
...@@ -26,23 +33,30 @@ import Gargantext.Text.Terms.Mono.Stem (stem) ...@@ -26,23 +33,30 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude import Gargantext.Prelude
--import Data.Char (isAlphaNum, isSpace) --import Data.Char (isAlphaNum, isSpace)
monoterms' :: Lang -> Text -> [Terms] -- | TODO remove Num ?
monoterms' l txt = map (text2terms l) $ monoterms txt --isGram c = isAlphaNum c
monoterms :: Text -> [Text]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms -- | Sentence split separators
text2terms lang txt = Terms label stems isSep :: Char -> Bool
where isSep = (`elem` (",.:;?!(){}[]\"" :: String))
label = splitOn (pack " ") txt
stems = S.fromList $ map (stem lang) label
monoTerms :: Lang -> Text -> [Terms]
monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map (T.split isSpace)
. T.split isSep
. T.toLower
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
--isGram :: Char -> Bool
--isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
...@@ -16,32 +16,28 @@ commentary with @some markup@. ...@@ -16,32 +16,28 @@ commentary with @some markup@.
module Gargantext.Text.Terms.WithList where module Gargantext.Text.Terms.WithList where
import Prelude (String)
import qualified Data.Algorithms.KMP as KMP import qualified Data.Algorithms.KMP as KMP
import Data.Char (isSpace) import Data.Char (isSpace)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text (Text) import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Prelude import Gargantext.Prelude
import Data.List (concatMap) import Data.List (concatMap)
type Term = Text
type Label = Term
type Pattern = KMP.Table Term type Pattern = KMP.Table Term
type TermList = [(Label, [[Term]])] type TermList = [(Label, [[Term]])]
type Patterns = [(Pattern, Int, Label)] type Patterns = [(Pattern, Int, Label)]
isMultiTermSep :: Char -> Bool
isMultiTermSep = (`elem` ",.:;?!(){}[]")
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
replaceTerms :: Patterns -> Sentence Term -> Sentence Label replaceTerms :: (Term -> Label) -> Patterns -> Sentence Term -> Sentence Label
replaceTerms pats terms = go 0 terms replaceTerms labelPolicy pats terms = go 0 terms
where where
go _ [] = [] go _ [] = []
go !ix (t:ts) = go !ix (t:ts) =
...@@ -50,9 +46,9 @@ replaceTerms pats terms = go 0 terms ...@@ -50,9 +46,9 @@ replaceTerms pats terms = go 0 terms
Just (len, label) -> Just (len, label) ->
label : go (ix + len) (drop (len - 1) ts) label : go (ix + len) (drop (len - 1) ts)
-- TODO is it what we want? -- | merge with labelPolicy (can be a Map Term label)
merge (len1, lab1) (len2, lab2) = merge (len1, lab1) (len2, lab2) =
if len1 > len2 then (len1, lab1) else (len2, lab2) if (labelPolicy lab1) == lab2 then (len2, lab2) else (len1, lab1)
m = m =
IntMap.fromListWith merge IntMap.fromListWith merge
...@@ -66,11 +62,5 @@ buildPatterns = concatMap buildPattern ...@@ -66,11 +62,5 @@ buildPatterns = concatMap buildPattern
where where
f alt = (KMP.build alt, length alt, label) f alt = (KMP.build alt, length alt, label)
-- monoterms'' :: Lang -> Text -> [Terms] extractTermsWithList :: (Term -> Label) -> Patterns -> Text -> Corpus Label
-- monoterms'' l txt = map (text2terms l) $ monoterms txt extractTermsWithList labelPolicy pats = map (replaceTerms labelPolicy pats) . monoTextsBySentence
extractTermsWithList :: Patterns -> Text -> Corpus Label
extractTermsWithList pats =
map (replaceTerms pats) .
map (T.split isSpace) . -- text2terms
T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment