Commit 4f5491c3 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WithList] adding labelPolicy.

parent 1237e415
......@@ -29,6 +29,7 @@ library:
- Gargantext.Core
- Gargantext.Core.Types
- Gargantext.Text
- Gargantext.Text.List.CSV
- Gargantext.Text.Search
- Gargantext.Text.Parsers.CSV
- Gargantext.API
......
......@@ -7,7 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Context of text management tool
Context of text management tool, here are logic of main types.
-}
......@@ -25,6 +25,20 @@ import Gargantext.Text
import Gargantext.Prelude hiding (length)
------------------------------------------------------------------------
type Term = Text
type Label = Term
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
-- type ConText a = [Sentence a]
-- type Corpus a = [ConText a]
------------------------------------------------------------------------
data SplitContext = Chars Int | Sentences Int | Paragraphs Int
tag :: Text -> [Tag Text]
......
......@@ -28,7 +28,7 @@ import Data.Ix
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoterms)
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV
......@@ -58,8 +58,8 @@ docSearchConfig =
}
where
extractTerms :: Doc -> DocField -> [Text]
extractTerms doc TitleField = monoterms (d_title doc)
extractTerms doc AbstractField = monoterms (d_abstract doc)
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
......
......@@ -41,7 +41,7 @@ import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
......@@ -59,9 +59,11 @@ extractTerms termTypeLang = mapM (terms termTypeLang)
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoterms' lang txt
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt
terms (WithList list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList labelPolicy list txt
where
labelPolicy = undefined
------------------------------------------------------------------------
......@@ -13,10 +13,17 @@ Mono-terms are Nterms where n == 1.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono (monoterms, monoterms')
module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence)
where
import Prelude (String)
import Data.Char (isSpace)
import Data.Text (Text, toLower, split, splitOn, pack)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.List as L
import qualified Data.Set as S
import Gargantext.Core
......@@ -26,23 +33,30 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
--import Data.Char (isAlphaNum, isSpace)
monoterms' :: Lang -> Text -> [Terms]
monoterms' l txt = map (text2terms l) $ monoterms txt
-- | TODO remove Num ?
--isGram c = isAlphaNum c
monoterms :: Text -> [Text]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems
where
label = splitOn (pack " ") txt
stems = S.fromList $ map (stem lang) label
-- | Sentence split separators
isSep :: Char -> Bool
isSep = (`elem` (",.:;?!(){}[]\"" :: String))
monoTerms :: Lang -> Text -> [Terms]
monoTerms l txt = map (monoText2term l) $ monoTexts txt
monoTexts :: Text -> [Text]
monoTexts = L.concat . monoTextsBySentence
monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map (T.split isSpace)
. T.split isSep
. T.toLower
--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs
--isGram :: Char -> Bool
--isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
......@@ -16,32 +16,28 @@ commentary with @some markup@.
module Gargantext.Text.Terms.WithList where
import Prelude (String)
import qualified Data.Algorithms.KMP as KMP
import Data.Char (isSpace)
import qualified Data.Text as T
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Prelude
import Data.List (concatMap)
type Term = Text
type Label = Term
type Pattern = KMP.Table Term
type TermList = [(Label, [[Term]])]
type Patterns = [(Pattern, Int, Label)]
isMultiTermSep :: Char -> Bool
isMultiTermSep = (`elem` ",.:;?!(){}[]")
type Sentence a = [a] -- or a nominal group
type Corpus a = [Sentence a] -- a list of sentences
replaceTerms :: Patterns -> Sentence Term -> Sentence Label
replaceTerms pats terms = go 0 terms
replaceTerms :: (Term -> Label) -> Patterns -> Sentence Term -> Sentence Label
replaceTerms labelPolicy pats terms = go 0 terms
where
go _ [] = []
go !ix (t:ts) =
......@@ -50,9 +46,9 @@ replaceTerms pats terms = go 0 terms
Just (len, label) ->
label : go (ix + len) (drop (len - 1) ts)
-- TODO is it what we want?
-- | merge with labelPolicy (can be a Map Term label)
merge (len1, lab1) (len2, lab2) =
if len1 > len2 then (len1, lab1) else (len2, lab2)
if (labelPolicy lab1) == lab2 then (len2, lab2) else (len1, lab1)
m =
IntMap.fromListWith merge
......@@ -66,11 +62,5 @@ buildPatterns = concatMap buildPattern
where
f alt = (KMP.build alt, length alt, label)
-- monoterms'' :: Lang -> Text -> [Terms]
-- monoterms'' l txt = map (text2terms l) $ monoterms txt
extractTermsWithList :: Patterns -> Text -> Corpus Label
extractTermsWithList pats =
map (replaceTerms pats) .
map (T.split isSpace) . -- text2terms
T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps
extractTermsWithList :: (Term -> Label) -> Patterns -> Text -> Corpus Label
extractTermsWithList labelPolicy pats = map (replaceTerms labelPolicy pats) . monoTextsBySentence
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment