[WithList] adding labelPolicy.

4f5491c3 · Alexandre Delanoë · 1237e415 · 4f5491c3 · 4f5491c3 · 4f5491c3
Commit 4f5491c3 authored Jul 03, 2018 by Alexandre Delanoë
6 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -29,6 +29,7 @@ library:
  - Gargantext.Core
  - Gargantext.Core.Types
  - Gargantext.Text
+  - Gargantext.Text.List.CSV
  - Gargantext.Text.Search
  - Gargantext.Text.Parsers.CSV
  - Gargantext.API

--- a/src/Gargantext/Text/Context.hs
+++ b/src/Gargantext/Text/Context.hs
@@ -7,7 +7,7 @@ Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX

-Context of text management tool
+Context of text management tool, here are logic of main types.

 -}

@@ -25,6 +25,20 @@ import Gargantext.Text
 import Gargantext.Prelude hiding (length)


+------------------------------------------------------------------------
+
+type Term = Text
+type Label = Term
+
+type Sentence  a = [a] -- or a nominal group
+type Corpus    a = [Sentence a] -- a list of sentences
+
+-- type ConText a = [Sentence a]
+-- type Corpus a = [ConText a]
+
+
+------------------------------------------------------------------------
+
 data SplitContext = Chars Int | Sentences Int | Paragraphs Int

 tag :: Text -> [Tag Text]

--- a/src/Gargantext/Text/Search.hs
+++ b/src/Gargantext/Text/Search.hs
@@ -28,7 +28,7 @@ import Data.Ix
 import Data.Text (Text)

 import Gargantext.Prelude
-import Gargantext.Text.Terms.Mono (monoterms)
+import Gargantext.Text.Terms.Mono (monoTexts)
 import Gargantext.Text.Terms.Mono.Stem as ST
 import Gargantext.Text.Parsers.CSV

@@ -58,8 +58,8 @@ docSearchConfig =
  }
  where
    extractTerms :: Doc -> DocField -> [Text]
-    extractTerms doc TitleField       = monoterms (d_title doc)
-    extractTerms doc AbstractField    = monoterms (d_abstract doc)
+    extractTerms doc TitleField       = monoTexts (d_title doc)
+    extractTerms doc AbstractField    = monoTexts (d_abstract doc)

    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok =

--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -41,7 +41,7 @@ import Gargantext.Prelude
 import Gargantext.Core
 import Gargantext.Core.Types
 import Gargantext.Text.Terms.Multi (multiterms)
-import Gargantext.Text.Terms.Mono  (monoterms')
+import Gargantext.Text.Terms.Mono  (monoTerms)
 import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)

 data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
@@ -59,9 +59,11 @@ extractTerms termTypeLang = mapM (terms termTypeLang)
 -- MonoMulti : mono and multi
 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 terms :: TermType Lang -> Text -> IO [Terms]
-terms (Mono      lang) txt = pure $ monoterms' lang txt
+terms (Mono      lang) txt = pure $ monoTerms lang txt
 terms (Multi     lang) txt = multiterms lang txt
 terms (MonoMulti lang) txt = terms (Multi lang) txt
-terms (WithList  list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList list txt
+terms (WithList  list) txt = pure . map (\x -> Terms x Set.empty {-TODO-}) $ extractTermsWithList labelPolicy list txt
+  where
+    labelPolicy = undefined
 ------------------------------------------------------------------------

--- a/src/Gargantext/Text/Terms/Mono.hs
+++ b/src/Gargantext/Text/Terms/Mono.hs
@@ -13,10 +13,17 @@ Mono-terms are Nterms where n == 1.

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Terms.Mono (monoterms, monoterms')
+module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence)
  where

+import Prelude (String)
+import Data.Char (isSpace)
 import Data.Text (Text, toLower, split, splitOn, pack)
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import qualified Data.List as L
 import qualified Data.Set as S

 import Gargantext.Core
@@ -26,23 +33,30 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
 import Gargantext.Prelude
 --import Data.Char (isAlphaNum, isSpace)

-monoterms' :: Lang -> Text -> [Terms]
-monoterms' l txt = map (text2terms l) $ monoterms txt
+-- | TODO remove Num ?
+--isGram  c  = isAlphaNum c

-monoterms :: Text -> [Text]
-monoterms txt = map toLower $ split isWord txt
-  where
-    isWord c = c `elem` [' ', '\'', ',', ';']

-text2terms :: Lang -> Text -> Terms
-text2terms lang txt = Terms label stems
-  where
-    label = splitOn (pack " ") txt
-    stems = S.fromList $ map (stem lang) label
+-- | Sentence split separators
+isSep :: Char -> Bool
+isSep = (`elem` (",.:;?!(){}[]\"" :: String))
+
+
+monoTerms :: Lang -> Text -> [Terms]
+monoTerms l txt = map (monoText2term l) $ monoTexts txt
+
+monoTexts :: Text -> [Text]
+monoTexts = L.concat . monoTextsBySentence
+
+monoText2term :: Lang -> Text -> Terms
+monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
+
+
+monoTextsBySentence :: Text -> [[Text]]
+monoTextsBySentence = map (T.split isSpace)
+                    . T.split isSep
+                    . T.toLower
+

--monograms :: Text -> [Text]
--monograms xs = monograms $ toLower $ filter isGram xs

--isGram :: Char -> Bool
--isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']

--- a/src/Gargantext/Text/Terms/WithList.hs
+++ b/src/Gargantext/Text/Terms/WithList.hs
@@ -16,32 +16,28 @@ commentary with @some markup@.

 module Gargantext.Text.Terms.WithList where

+import Prelude (String)
 import qualified Data.Algorithms.KMP as KMP
 import Data.Char (isSpace)
 import qualified Data.Text as T
 import Data.Text (Text)
 import qualified Data.IntMap.Strict as IntMap
+
+import Gargantext.Text.Context
+import Gargantext.Text.Terms.Mono (monoTextsBySentence)
+
 import Gargantext.Prelude
 import Data.List (concatMap)

-type Term = Text

-type Label = Term

 type Pattern = KMP.Table Term
-
 type TermList = [(Label, [[Term]])]
-
 type Patterns = [(Pattern, Int, Label)]

-isMultiTermSep :: Char -> Bool
-isMultiTermSep = (`elem` ",.:;?!(){}[]")
-
-type Sentence  a = [a] -- or a nominal group
-type Corpus    a = [Sentence a] -- a list of sentences

-replaceTerms :: Patterns -> Sentence Term -> Sentence Label
-replaceTerms pats terms = go 0 terms
+replaceTerms :: (Term -> Label) -> Patterns -> Sentence Term -> Sentence Label
+replaceTerms labelPolicy pats terms = go 0 terms
  where
    go _ [] = []
    go !ix (t:ts) =
@@ -50,9 +46,9 @@ replaceTerms pats terms = go 0 terms
        Just (len, label) ->
          label : go (ix + len) (drop (len - 1) ts)

-    -- TODO is it what we want?
+    -- | merge with labelPolicy (can be a Map Term label)
    merge (len1, lab1) (len2, lab2) =
-      if len1 > len2 then (len1, lab1) else (len2, lab2)
+      if (labelPolicy lab1) == lab2 then (len2, lab2) else (len1, lab1)

    m =
      IntMap.fromListWith merge
@@ -66,11 +62,5 @@ buildPatterns = concatMap buildPattern
      where
        f alt = (KMP.build alt, length alt, label)

-- monoterms'' :: Lang -> Text -> [Terms]
-- monoterms'' l txt = map (text2terms l) $ monoterms txt
-
-extractTermsWithList :: Patterns -> Text -> Corpus Label
-extractTermsWithList pats =
-  map (replaceTerms pats) .
-  map (T.split isSpace) . -- text2terms
-  T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps
+extractTermsWithList :: (Term -> Label) -> Patterns -> Text -> Corpus Label
+extractTermsWithList labelPolicy pats = map (replaceTerms labelPolicy pats) . monoTextsBySentence