[TEXT-MINING] adding first functions/datatypes.

b6df8e42 · Alexandre Delanoë · f152533b · b6df8e42 · b6df8e42 · b6df8e42
Commit b6df8e42 authored Apr 05, 2018 by Alexandre Delanoë
14 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -23,7 +23,6 @@ library:
  - -Werror
  exposed-modules:
  - Gargantext
-  - Gargantext.Analysis
  - Gargantext.DSL
  - Gargantext.Database
  - Gargantext.Database.Instances
@@ -37,7 +36,9 @@ library:
  - Gargantext.Database.Utils
  - Gargantext.Database.User
  - Gargantext.Ngrams
-  - Gargantext.Ngrams.Count
+  - Gargantext.Ngrams.Analysis
+  - Gargantext.Ngrams.TFICF
+  - Gargantext.Ngrams.Letters
  - Gargantext.Ngrams.CoreNLP
  - Gargantext.Ngrams.Parser
  - Gargantext.Ngrams.Lang.En

--- a/src/Gargantext/API/Node.hs
+++ b/src/Gargantext/API/Node.hs
@@ -62,7 +62,8 @@ type NodeAPI   = Get '[JSON] (Node HyperdataDocument)
                             :> QueryParam "offset" Int
                             :> QueryParam "limit"  Int
                             :> Get '[JSON] [Node HyperdataDocument]
-             :<|> "facet" :> "documents" :> FacetDocAPI
+             :<|> "facet" :> Summary " Facet documents"
+                          :> "documents" :> FacetDocAPI
 --             :<|> "facet" :<|> "sources"   :<|> FacetSourcesAPI
 --             :<|> "facet" :<|> "authors"   :<|> FacetAuthorsAPI
 --             :<|> "facet" :<|> "terms"     :<|> FacetTermsAPI
@@ -73,11 +74,13 @@ type NodeAPI   = Get '[JSON] (Node HyperdataDocument)
 type FacetDocAPI = "table"
+                   :> Summary " Table data"
                   :> QueryParam "offset" Int
                   :> QueryParam "limit"  Int
                   :> Get '[JSON] [FacetDoc]
                :<|> "chart"
+                   :> Summary " Chart data"
                   :> QueryParam "from" UTCTime
                   :> QueryParam "to"   UTCTime
                   :> Get '[JSON] [FacetChart]

--- a/src/Gargantext/Database/Ngram.hs
+++ b/src/Gargantext/Database/Ngram.hs
@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t)
 --userWithId t xs = userWith userUserId t xs
 -- | not optimized (get all ngrams without filters)
-ngrams :: PGS.Connection -> IO [Ngram]
+getNgrams :: PGS.Connection -> IO [Ngram]
-ngrams conn = runQuery conn queryNgramTable
+getNgrams conn = runQuery conn queryNgramTable
--- a/src/Gargantext/Ngrams.hs
+++ b/src/Gargantext/Ngrams.hs
-module Gargantext.Ngrams ( module Gargantext.Ngrams.Count
+{-|
+Module      : Gargantext.Ngrams
+Description : Ngrams tools
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Ngrams extraction.
+Definitions of ngrams.
+n is a non-negative integer.
+-}
+module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                              --, module Gargantext.Ngrams.Hetero
                         , module Gargantext.Ngrams.CoreNLP
                         , module Gargantext.Ngrams.Parser
                         , module Gargantext.Ngrams.Occurrences
                         , module Gargantext.Ngrams.TextMining
                         , module Gargantext.Ngrams.Metrics
+                         , ngrams, occurrences
                             --, module Gargantext.Ngrams.Words
                         ) where
-import Gargantext.Ngrams.Count
+import Gargantext.Ngrams.Letters
 --import Gargantext.Ngrams.Hetero
 import Gargantext.Ngrams.CoreNLP
 import Gargantext.Ngrams.Parser
@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining
 --import Gargantext.Ngrams.Words
 import Gargantext.Ngrams.Metrics
+-----------------------------------------------------------------
+import Data.Char (Char, isAlpha, isSpace)
+import Data.Text (Text, words, filter, toLower)
+import Data.Map.Strict  (Map, empty, insertWith)
+import Data.Foldable (foldl')
+import Gargantext.Prelude hiding (filter)
+-- Maybe useful later:
+--import NLP.Stemmer (stem, Stemmer(..))
+--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
+--import Language.Aspell.Options (ACOption(..))
+ngrams :: Text -> [Text]
+ngrams xs = monograms $ toLower $ filter isGram xs
+monograms :: Text -> [Text]
+monograms = words
+isGram :: Char -> Bool
+isGram '-' = True
+isGram  c  = isAlpha c || isSpace c
+-- | Count the occurrences of each element of the list.
+occurrences :: Ord a => [a] -> Map a Int
+occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
--- a/src/Gargantext/Analysis.hs
+++ b/src/Gargantext/Analysis.hs
@@ -11,7 +11,7 @@ Portability : POSIX
 {-# LANGUAGE NoImplicitPrelude #-}
-module Gargantext.Analysis 
+module Gargantext.Ngrams.Analysis 
  where
 import Gargantext.Prelude (undefined, IO(), Int())

--- a/src/Gargantext/Ngrams/CoreNLP.hs
+++ b/src/Gargantext/Ngrams/CoreNLP.hs
+{-|
+Module      : Gargantext.Ngrams.CoreNLP
+Description : CoreNLP module
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+-}
 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE DeriveGeneric     #-}
-{-# LANGUAGE TypeOperators #-}
-{-# LANGUAGE TemplateHaskell #-}
 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TemplateHaskell   #-}
+{-# LANGUAGE TypeOperators     #-}
 module Gargantext.Ngrams.CoreNLP where
@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators  :: Text
 $(deriveJSON (unPrefix "_properties") ''Properties)
-data Sentences = Sentences { sentences :: [Sentence]}
+data Sentences = Sentences { _sentences :: [Sentence]}
  deriving (Show, Generic)
 instance ToJSON Sentences
 instance FromJSON Sentences
@@ -102,7 +114,7 @@ corenlp lang txt = do
 -- parseWith  _tokenNer     "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
-tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s
+tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
--- a/src/Gargantext/Ngrams/Count.hs
+++ b/src/Gargantext/Ngrams/Count.hs
-{-# LANGUAGE OverloadedStrings #-}
+{-|
+Module      : Gargantext.Ngrams.Letters
-module Gargantext.Ngrams.Count where
+Description : Ngrams.Letters module
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
-import Gargantext.Prelude
+Sugar to work on letters with Text.
+-}
-import Data.Foldable as F
+{-# LANGUAGE OverloadedStrings #-}
-import Data.Map.Strict (insertWith)
+module Gargantext.Ngrams.Letters where
-import Data.Map (Map)
-import qualified Data.Map as M
--import qualified Data.Text.Lazy.IO as DTLIO
 import qualified Data.Text.Lazy as DTL
+-- import qualified Data.Text.Lazy.IO as DTLIO
+import Gargantext.Prelude
 -- | /O(n)/ Breaks a 'Text' up into each Text list of chars.
 -- from slower to faster:
@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text]
 letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) []
-- words
-- lines
-- words between punctuation
-- number of punctuation
-occurrences :: Ord a => [a] -> Map a Int
-occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) M.empty xs
-- for optimization :
--occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
--countMain :: IO ()
--countMain = do
--  (fichier:_) <- getArgs
--  c <- DTLIO.readFile fichier
--  --print $ occurrences $ DTL.chunksOf 1 c
--  pure $ occurrences $ letters'' c
--  --print $ occurrences $ DTL.words $ DTL.toLower c
--
--- a/src/Gargantext/Ngrams/Metrics.hs
+++ b/src/Gargantext/Ngrams/Metrics.hs
@@ -8,8 +8,7 @@ Maintainer  : sample@email.com
 Stability   : experimental
 Portability : POSIX
-Here is a longer description of this module, containing some
+Mainly re-exports functions from @Data.Text.Metrics@.
-commentary with @some markup@.
 -}
 module Gargantext.Ngrams.Metrics (levenshtein

--- a/src/Gargantext/Ngrams/Parser.hs
+++ b/src/Gargantext/Ngrams/Parser.hs
@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
 extractNgrams' :: Language -> Text -> IO [[Ngrams]]
 extractNgrams' lang t =  map (map token2text)
                     <$> map _sentenceTokens
-                     <$> sentences
+                     <$> _sentences
                     <$> corenlp lang t
 -- | This function selects ngrams according to grammars specific

--- a/src/Gargantext/Ngrams/TFICF.hs
+++ b/src/Gargantext/Ngrams/TFICF.hs
+{-|
+Module      : Gargantext.Ngrams.TFICF
+Description : TFICF Ngrams tools
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+Definition of TFICF
+-}
+{-# LANGUAGE DeriveGeneric        #-}
+module Gargantext.Ngrams.TFICF where
+import GHC.Generics (Generic)
+import Data.Maybe (Maybe)
+import Data.Text (Text)
+import Text.Show (Show())
+-- import Gargantext.Types
+import Gargantext.Prelude
+data Context = Corpus | Document
+  deriving (Show, Generic)
+data TFICF = TFICF { _tficfTerms    :: Text
+                   , _tficfContext1 :: Context
+                   , _tficfContext2 :: Context
+                   , _tficfScore    :: Maybe Double
+                   } deriving (Show, Generic)
+--tfidf :: Text -> TFICF
+--tfidf txt = TFICF txt Document Corpus score
+--    where
+--        score = Nothing
--- a/src/Gargantext/Ngrams/TFICF_hs
+++ b/src/Gargantext/Ngrams/TFICF_hs
-module Data.Gargantext.Ngrams.TFICF where
-data TFICF = TFICF { _tficfTerms    :: Text
-                   , _tficfContext1 :: Context
-                   , _tficfContext2 :: Context
-                   , _tficfScore    :: Maybe Double
-                   } deriving (Read, Show, Generics)
-tfidf :: Text -> TFICF
-tfidf txt = TFICF txt Document Corpus score
-    where
-        score = Nothing
--- a/src/Gargantext/Ngrams/Words_hs
+++ b/src/Gargantext/Ngrams/Words_hs
-module Data.Gargantext.Ngrams.Words where
-import Data.List (partition)
-import Data.Set (fromList, notMember, member)
-import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
-import NLP.Stemmer (stem, Stemmer(..))
-import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
-import Language.Aspell.Options (ACOption(..))
--import Data.Either.Utils (fromRight)
-import Data.ByteString.Internal (packChars)
-get_lang x = do
-    let lang = Lang (packChars x)
-    spell_lang <- spellCheckerWithOptions [lang]
-    return spell_lang
-check' lang x = check lang (packChars x)
-suggest' lang x = suggest lang (packChars x)
--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x
-- stem French "naturelles"
-- paragraphes
-- lines
-- sentences
-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
-repl :: Char -> Char
-repl x
-    | x == '\'' = ' '
-    | x == '/' = ' '
-    -- | x == '\t' = ' '
-    -- | x == '\n' = ' '
-    | otherwise = x
-cleanText text = do
-    -- pb avec \'
-    --words $ filter (not . isPunctuation) $ Prelude.map toLower text
-    words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text
-isMiamWord word = do
-    let miamWord_set = fromList ["salut", "phrase"]
-    member word miamWord_set
-isStopWord word = do
-    let stopWord_set = fromList ["de", "la", "une", "avec"]
-    member word stopWord_set
-wordsMain = do
-    let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
-    print $ partition (not . isStopWord) $ cleanText text
-    print $ filter (not . isStopWord) $ cleanText text
-    --print $ filter isStopWord $ words $ filter (not . isPunctuation) text
--- a/src/Gargantext/Parsers.hs
+++ b/src/Gargantext/Parsers.hs
@@ -25,11 +25,16 @@ import Gargantext.Prelude
 import System.FilePath (takeExtension, FilePath())
 import Data.Attoparsec.ByteString (parseOnly, Parser)
-import Data.ByteString as DB
+import qualified Data.ByteString as DB
-import Data.Map        as DM
+import qualified Data.Map        as DM
+import Data.Either.Extra (partitionEithers)
 import Data.Ord()
+import Data.Foldable (concat)
 import Data.String()
 import Data.Either.Extra(Either())
+import Data.Text (Text)
+import Data.Text.Encoding (decodeUtf8)
 ----
 --import Control.Monad (join)
 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
@@ -57,13 +62,20 @@ data FileFormat = WOS        -- Implemented (ISI Format)
 --                | XML        -- Not Implemented / see :
 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-parse :: FileFormat -> FilePath 
+-- TODO: to ease debugging, maybe add the file path to the error message
-      -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
+type ParseError = String
+parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 parse format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> DB.readFile path
-    mapConcurrently (runParser format) files
+    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
+    pure (as, map toText $ concat bs)
+      where
+        -- TODO: detect the encoding (e.g. by Bayesian inference) instead of assuming UTF-8
+        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
 -- | withParser:

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>)
                 , Eq, (==), (<>)
                 , (&&), (||), not
-                 , toS
+                 , fst, snd, toS
                 )
 -- TODO import functions optimized in Utils.Count
 -- import Protolude hiding (head, last, all, any, sum, product, length)
 -- import Gargantext.Utils.Count
 import qualified Data.List     as L hiding (head, sum)
 import qualified Control.Monad as M
 import qualified Data.Map as Map