[TERMS] main function.

7665bcb6 · Alexandre Delanoë · c0914f9a · 7665bcb6 · 7665bcb6 · 7665bcb6
Commit 7665bcb6 authored May 21, 2018 by Alexandre Delanoë
9 changed files
--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -47,6 +47,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , elem, die, mod, div, const, either
                 , curry, uncurry
                 , otherwise, when
+                 , undefined
                 )

 -- TODO import functions optimized in Utils.Count

--- a/src/Gargantext/Text.hs
+++ b/src/Gargantext/Text.hs
@@ -35,16 +35,6 @@ data Group = Group { _group_label  :: Terms
                   } deriving (Show)


-clean :: Text -> Text
-clean txt = DT.map clean' txt
-  where
-    clean' '’' = '\''
-    clean' c  = c
-
-
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-
 -------------------------------------------------------------------
 -- Contexts of text
 sentences :: Text -> [Text]
@@ -84,4 +74,3 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances
    -- group ngrams
    --ocs  = occ       $ ws

-
--- a/src/Gargantext/Text/Metrics.hs
+++ b/src/Gargantext/Text/Metrics.hs
@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@

 {-# LANGUAGE NoImplicitPrelude #-}

+module Gargantext.Text.Metrics where

-module Gargantext.Text.Metrics (levenshtein
-                                      , levenshteinNorm
-                                      , damerauLevenshtein
-                                      , damerauLevenshteinNorm
-                                      , overlap
-                                      , jaccard
-                                      , hamming
-                                      ) where
-
-
-import Data.Text (Text)
-import GHC.Real (Ratio)
-import qualified Data.Text.Metrics as DTM
-
-import Gargantext.Prelude
-{- * Example de titre
-}
-
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science, 
-- the Levenshtein distance is a string metric for measuring 
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
+--import Data.Text (Text)
+--import GHC.Real (Ratio)
+--import qualified Data.Text.Metrics as DTM
 --
-levenshtein :: Text -> Text -> Int
-levenshtein = DTM.levenshtein
-
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
+--import Gargantext.Prelude
 --
-levenshteinNorm :: Text -> Text -> Ratio Int
-levenshteinNorm = DTM.levenshteinNorm
-
-- | Return Damerau-Levenshtein distance between two 'Text' values. The 
-- function works like 'levenshtein', but the collection of allowed     
-- operations also includes transposition of two /adjacent/ characters. 
-- See also:                                                            
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance> 
--
-damerauLevenshtein :: Text -> Text -> Int
-damerauLevenshtein = DTM.damerauLevenshtein
-
-- damerau-Levenshtein distance normalized
--
-damerauLevenshteinNorm :: Text -> Text -> Ratio Int
-damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-
-- Treating inputs like sets
-
-- | Return overlap coefficient for two 'Text' values. Returned value   
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1  
-- if both 'Text' values are empty.                                     
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
-overlap :: Text -> Text -> Ratio Int
-overlap = DTM.overlap
-
-
-- | Jaccard distance
-- measures dissimilarity between sample sets
-jaccard :: Text -> Text -> Ratio Int
-jaccard = DTM.jaccard
-
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See:  https://en.wikipedia.org/wiki/Hamming_distance
-
-hamming :: Text -> Text -> Maybe Int
-hamming = DTM.hamming
-
+--noApax :: Ord a => Map a Occ -> Map a Occ
+--noApax m = M.filter (>1) m

--- a/src/Gargantext/Text/Metrics/CharByChar.hs
+++ b/src/Gargantext/Text/Metrics/CharByChar.hs
+{-|
+Module      : Gargantext.Text.Metrics.CharByChar
+Description : Character-level distance and similarity metrics on Text.
+Copyright   : (c) CNRS, 2017 - present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Mainly re-exports functions from @Data.Text.Metrics@
+-}
+
+{-# LANGUAGE NoImplicitPrelude #-}
+
+
+module Gargantext.Text.Metrics.CharByChar (levenshtein
+                                      , levenshteinNorm
+                                      , damerauLevenshtein
+                                      , damerauLevenshteinNorm
+                                      , overlap
+                                      , jaccard
+                                      , hamming
+                                      ) where
+
+
+import Data.Text (Text)
+import GHC.Real (Ratio)
+import qualified Data.Text.Metrics as DTM
+
+import Gargantext.Prelude
+
+--noApax :: Ord a => Map a Occ -> Map a Occ
+--noApax m = M.filter (>1) m
+
+
+{- * Example title
+-}
+
+-- | This module provides metrics to compare 'Text' values.
+-- It starts as an API re-exporting the main functions of the great
+-- text-metrics library by Mark Karpov.
+
+-- | Levenshtein Distance
+-- In information theory, linguistics and computer science,
+-- the Levenshtein distance is a string metric for measuring 
+-- the difference between two sequences.
+-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
+--
+levenshtein :: Text -> Text -> Int
+levenshtein = DTM.levenshtein
+
+-- | Return normalized Levenshtein distance between two 'Text' values.
+-- Result is a non-negative rational number (represented as @'Ratio'
+-- 'Int'@, matching the signature below), where 0 signifies no similarity
+-- between the strings, while 1 means exact match.
+--
+levenshteinNorm :: Text -> Text -> Ratio Int
+levenshteinNorm = DTM.levenshteinNorm
+
+-- | Return Damerau-Levenshtein distance between two 'Text' values. The 
+-- function works like 'levenshtein', but the collection of allowed     
+-- operations also includes transposition of two /adjacent/ characters. 
+-- See also:                                                            
+-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance> 
+--
+damerauLevenshtein :: Text -> Text -> Int
+damerauLevenshtein = DTM.damerauLevenshtein
+
+-- | Normalized Damerau-Levenshtein distance between two 'Text' values.
+--
+damerauLevenshteinNorm :: Text -> Text -> Ratio Int
+damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
+
+-- Treating inputs like sets
+
+-- | Return overlap coefficient for two 'Text' values. Returned value   
+-- is in the range from 0 (no similarity) to 1 (exact match). Return 1  
+-- if both 'Text' values are empty.                                     
+--
+-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
+overlap :: Text -> Text -> Ratio Int
+overlap = DTM.overlap
+
+
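+-- | Jaccard coefficient of two 'Text' values, treating the inputs like sets.
+-- NOTE(review): text-metrics documents this as a similarity (1 = exact match), not a distance — confirm.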
+jaccard :: Text -> Text -> Ratio Int
+jaccard = DTM.jaccard
+
+-- | Hamming Distance
+-- In information theory, the Hamming distance between two strings of
+-- equal length is the number of positions at which the corresponding
+-- symbols are different. In other words, it measures the minimum number of
+-- substitutions required to change one string into the other
+-- See:  https://en.wikipedia.org/wiki/Hamming_distance
+
+hamming :: Text -> Text -> Maybe Int
+hamming = DTM.hamming
+
+
--- a/src/Gargantext/Text/Parsers.hs
+++ b/src/Gargantext/Text/Parsers.hs
@@ -37,6 +37,7 @@ import Data.Either.Extra(Either())

 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf8)
+import qualified Data.Text as DT
 ----
 --import Control.Monad (join)
 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
@@ -108,4 +109,10 @@ openZip fp = do
    bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
    pure bs

+clean :: Text -> Text
+clean txt = DT.map clean' txt
+  where
+    clean' '’' = '\''
+    clean' c  = c
+

--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -19,8 +19,6 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
 Source: https://en.wikipedia.org/wiki/Ngrams

 TODO
-- Prelude.concat <$> Prelude.map (filter (\n -> _my_token_pos n == Just NP)) <$> extractNgrams  Gargantext.Core.EN   testText_en
-
 group Ngrams -> Tree
 compute occ by node of Tree
 group occs according groups
@@ -35,10 +33,27 @@ compute graph
 module Gargantext.Text.Terms
  where

+import Data.Text (Text)
+
+import Gargantext.Prelude
+import Gargantext.Core
 import Gargantext.Core.Types
+import Gargantext.Text.Terms.Multi (multiterms)
+import Gargantext.Text.Terms.Mono  (monoterms')
+
+data TermType = Mono | Multi

 ------------------------------------------------------------------------
-tokenTag2terms :: TokenTag -> Terms
-tokenTag2terms (TokenTag w t _ _) =  Terms w t
+terms :: TermType -> Maybe Lang -> Text -> IO [Terms]
+terms Mono  (Just lang)  txt = pure $ monoterms' lang txt
+terms Multi (Just lang ) txt = multiterms lang txt
+terms _      Nothing _ = panic "Lang needed"
 ------------------------------------------------------------------------

+termTests :: Text
+termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
+
+
+
+
+
--- a/src/Gargantext/Text/Terms/Mono.hs
+++ b/src/Gargantext/Text/Terms/Mono.hs
@@ -13,7 +13,7 @@ Mono-terms are Nterms where n == 1.

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Terms.Mono
+module Gargantext.Text.Terms.Mono (monoterms, monoterms')
  where

 import Data.Text (Text, toLower, split, splitOn, pack)
@@ -26,12 +26,14 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
 import Gargantext.Prelude
 import Data.Char (isAlphaNum, isSpace)

-monoterms :: Text -> [Term]
+monoterms' :: Lang -> Text -> [Terms]
+monoterms' l txt = map (text2terms l) $ monoterms txt
+
+monoterms :: Text -> [Text]
 monoterms txt = map toLower $ split isWord txt
  where
    isWord c = c `elem` [' ', '\'', ',', ';']

-
 text2terms :: Lang -> Text -> Terms
 text2terms lang txt = Terms label stems
  where

--- a/src/Gargantext/Text/Terms/Mono/Token.hs
+++ b/src/Gargantext/Text/Terms/Mono/Token.hs
@@ -23,11 +23,20 @@ module Gargantext.Text.Terms.Mono.Token (tokenize)
 import Data.Text (Text)
 import qualified Gargantext.Text.Terms.Mono.Token.En as En

+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude
+
 type Token = Text

 -- >>> tokenize "A rose is a rose is a rose."
 -- ["A","rose","is","a","rose","is","a","rose", "."]
 -- 
+
+data Context = Letter | Word | Sentence | Line | Paragraph
+
 tokenize :: Text -> [Token]
 tokenize = En.tokenize

+tokenize' :: Lang -> Context -> [Token]
+tokenize' = undefined
+
--- a/src/Gargantext/Text/Terms/Multi.hs
+++ b/src/Gargantext/Text/Terms/Multi.hs
@@ -13,10 +13,11 @@ Multi-terms are ngrams where n > 1.

 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Terms.Multi (extractTokenTags)
+module Gargantext.Text.Terms.Multi (multiterms)
  where

-import Data.Text hiding (map, group)
+import Data.Text hiding (map, group, filter, concat)
+import Data.List (concat)

 import Gargantext.Prelude
 import Gargantext.Core (Lang(..))
@@ -26,13 +27,21 @@ import Gargantext.Text.Terms.Multi.PosTagging
 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr

+multiterms :: Lang -> Text -> IO [Terms]
+multiterms lang txt = concat
+                   <$> map (map tokenTag2terms)
+                   <$> map (filter (\t -> _my_token_pos t == Just NP)) 
+                   <$> tokenTags lang txt

-extractTokenTags :: Lang -> Text -> IO [[TokenTag]]
-extractTokenTags lang s = map (group lang) <$> extractTokenTags' lang s
+tokenTag2terms :: TokenTag -> Terms
+tokenTag2terms (TokenTag w t _ _) =  Terms w t

+tokenTags :: Lang -> Text -> IO [[TokenTag]]
+tokenTags lang s = map (group lang) <$> tokenTags' lang s

-extractTokenTags' :: Lang -> Text -> IO [[TokenTag]]
-extractTokenTags' lang t =  map tokens2tokensTags
+
+tokenTags' :: Lang -> Text -> IO [[TokenTag]]
+tokenTags' lang t =  map tokens2tokensTags
                     <$> map _sentenceTokens
                     <$> _sentences
                     <$> corenlp lang t