[TEXT-MINING] adding first functions/datatypes.

b6df8e42 · Alexandre Delanoë · f152533b · b6df8e42 · b6df8e42 · b6df8e42
Commit b6df8e42 authored Apr 05, 2018 by Alexandre Delanoë
14 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -23,7 +23,6 @@ library:
  - -Werror
  exposed-modules:
  - Gargantext
-  - Gargantext.Analysis
  - Gargantext.DSL
  - Gargantext.Database
  - Gargantext.Database.Instances
@@ -37,7 +36,9 @@ library:
  - Gargantext.Database.Utils
  - Gargantext.Database.User
  - Gargantext.Ngrams
-  - Gargantext.Ngrams.Count
+  - Gargantext.Ngrams.Analysis
+  - Gargantext.Ngrams.TFICF
+  - Gargantext.Ngrams.Letters
  - Gargantext.Ngrams.CoreNLP
  - Gargantext.Ngrams.Parser
  - Gargantext.Ngrams.Lang.En

--- a/src/Gargantext/API/Node.hs
+++ b/src/Gargantext/API/Node.hs
@@ -62,7 +62,8 @@ type NodeAPI   = Get '[JSON] (Node HyperdataDocument)
                             :> QueryParam "offset" Int
                             :> QueryParam "limit"  Int
                             :> Get '[JSON] [Node HyperdataDocument]
-             :<|> "facet" :> "documents" :> FacetDocAPI
+             :<|> "facet" :> Summary " Facet documents"
+                          :> "documents" :> FacetDocAPI
 --             :<|> "facet" :<|> "sources"   :<|> FacetSourcesAPI
 --             :<|> "facet" :<|> "authors"   :<|> FacetAuthorsAPI
 --             :<|> "facet" :<|> "terms"     :<|> FacetTermsAPI
@@ -73,11 +74,13 @@ type NodeAPI   = Get '[JSON] (Node HyperdataDocument)


 type FacetDocAPI = "table"
+                   :> Summary " Table data"
                   :> QueryParam "offset" Int
                   :> QueryParam "limit"  Int
                   :> Get '[JSON] [FacetDoc]

                :<|> "chart"
+                   :> Summary " Chart data"
                   :> QueryParam "from" UTCTime
                   :> QueryParam "to"   UTCTime
                   :> Get '[JSON] [FacetChart]

--- a/src/Gargantext/Database/Ngram.hs
+++ b/src/Gargantext/Database/Ngram.hs
@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t)
 --userWithId t xs = userWith userUserId t xs

 -- | not optimized (get all ngrams without filters)
-ngrams :: PGS.Connection -> IO [Ngram]
-ngrams conn = runQuery conn queryNgramTable
+getNgrams :: PGS.Connection -> IO [Ngram]
+getNgrams conn = runQuery conn queryNgramTable
--- a/src/Gargantext/Ngrams.hs
+++ b/src/Gargantext/Ngrams.hs
-module Gargantext.Ngrams ( module Gargantext.Ngrams.Count
+{-|
+Module      : Gargantext.Ngrams
+Description : Ngrams tools
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Ngrams exctration.
+
+Definitions of ngrams.
+n non negative integer
+
+-}
+
+module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                              --, module Gargantext.Ngrams.Hetero
-                              , module Gargantext.Ngrams.CoreNLP
-                              , module Gargantext.Ngrams.Parser
-                              , module Gargantext.Ngrams.Occurrences
-                              , module Gargantext.Ngrams.TextMining
-                              , module Gargantext.Ngrams.Metrics
+                         , module Gargantext.Ngrams.CoreNLP
+                         , module Gargantext.Ngrams.Parser
+                         , module Gargantext.Ngrams.Occurrences
+                         , module Gargantext.Ngrams.TextMining
+                         , module Gargantext.Ngrams.Metrics
+                         , ngrams, occurrences
                             --, module Gargantext.Ngrams.Words
-    ) where
+                         ) where

-import Gargantext.Ngrams.Count
+import Gargantext.Ngrams.Letters
 --import Gargantext.Ngrams.Hetero
 import Gargantext.Ngrams.CoreNLP
 import Gargantext.Ngrams.Parser
@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining
 --import Gargantext.Ngrams.Words

 import Gargantext.Ngrams.Metrics
+
+-----------------------------------------------------------------
+
+import Data.Char (Char, isAlpha, isSpace)
+import Data.Text (Text, words, filter, toLower)
+import Data.Map.Strict  (Map, empty, insertWith)
+import Data.Foldable (foldl')
+import Gargantext.Prelude hiding (filter)
+
+-- Maybe useful later:
+--import NLP.Stemmer (stem, Stemmer(..))
+--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
+--import Language.Aspell.Options (ACOption(..))
+
+
+ngrams :: Text -> [Text]
+ngrams xs = monograms $ toLower $ filter isGram xs
+
+monograms :: Text -> [Text]
+monograms = words
+
+isGram :: Char -> Bool
+isGram '-' = True
+isGram  c  = isAlpha c || isSpace c
+
+-- | Compute the occurrences
+occurrences :: Ord a => [a] -> Map a Int
+occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
+
+
+
+
--- a/src/Gargantext/Analysis.hs
+++ b/src/Gargantext/Analysis.hs
@@ -11,7 +11,7 @@ Portability : POSIX
 {-# LANGUAGE NoImplicitPrelude #-}


-module Gargantext.Analysis 
+module Gargantext.Ngrams.Analysis 
  where

 import Gargantext.Prelude (undefined, IO(), Int())

--- a/src/Gargantext/Ngrams/CoreNLP.hs
+++ b/src/Gargantext/Ngrams/CoreNLP.hs
-{-# LANGUAGE DataKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE TypeOperators #-}
-{-# LANGUAGE TemplateHaskell #-}
-{-# LANGUAGE     NoImplicitPrelude       #-}
+{-|
+Module      : Gargantext.Ngrams.CoreNLP
+Description : CoreNLP module
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+-}
+
+
+{-# LANGUAGE DataKinds         #-}
+{-# LANGUAGE DeriveGeneric     #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TemplateHaskell   #-}
+{-# LANGUAGE TypeOperators     #-}

 module Gargantext.Ngrams.CoreNLP where

@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators  :: Text

 $(deriveJSON (unPrefix "_properties") ''Properties)

-data Sentences = Sentences { sentences :: [Sentence]}
+data Sentences = Sentences { _sentences :: [Sentence]}
  deriving (Show, Generic)
 instance ToJSON Sentences
 instance FromJSON Sentences
@@ -102,7 +114,7 @@ corenlp lang txt = do
 -- parseWith  _tokenNer     "Hello world of Peter."
 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
-tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s
+tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s



--- a/src/Gargantext/Ngrams/Count.hs
+++ b/src/Gargantext/Ngrams/Count.hs
-{-# LANGUAGE OverloadedStrings #-}
-
-module Gargantext.Ngrams.Count where
+{-|
+Module      : Gargantext.Ngrams.Letters
+Description : Ngrams.Letters module
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX

-import Gargantext.Prelude
+Sugar to work on letters with Text.

+-}

-import Data.Foldable as F
+{-# LANGUAGE OverloadedStrings #-}

-import Data.Map.Strict (insertWith)
-import Data.Map (Map)
-import qualified Data.Map as M
+module Gargantext.Ngrams.Letters where

--import qualified Data.Text.Lazy.IO as DTLIO
 import qualified Data.Text.Lazy as DTL
+-- import qualified Data.Text.Lazy.IO as DTLIO
+import Gargantext.Prelude
+

 -- | /O(n)/ Breaks a 'Text' up into each Text list of chars.
 -- from slower to faster:
@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text]
 letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) []


-- words
-- lines
-- words between punctuation
-- number of punctuation
-
-occurrences :: Ord a => [a] -> Map a Int
-occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) M.empty xs
-
-- for optimization :
--occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
-
--countMain :: IO ()
--countMain = do
--  (fichier:_) <- getArgs
--  c <- DTLIO.readFile fichier
--  --print $ occurrences $ DTL.chunksOf 1 c
--  pure $ occurrences $ letters'' c
--  --print $ occurrences $ DTL.words $ DTL.toLower c
--
--- a/src/Gargantext/Ngrams/Metrics.hs
+++ b/src/Gargantext/Ngrams/Metrics.hs
@@ -8,8 +8,7 @@ Maintainer  : sample@email.com
 Stability   : experimental
 Portability : POSIX

-Here is a longer description of this module, containing some
-commentary with @some markup@.
+Mainly reexport functions in @Data.Text.Metrics@
 -}

 module Gargantext.Ngrams.Metrics (levenshtein

--- a/src/Gargantext/Ngrams/Parser.hs
+++ b/src/Gargantext/Ngrams/Parser.hs
@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
 extractNgrams' :: Language -> Text -> IO [[Ngrams]]
 extractNgrams' lang t =  map (map token2text)
                     <$> map _sentenceTokens
-                     <$> sentences
+                     <$> _sentences
                     <$> corenlp lang t

 -- | This function selects ngrams according to grammars specific

--- a/src/Gargantext/Ngrams/TFICF.hs
+++ b/src/Gargantext/Ngrams/TFICF.hs
+{-|
+Module      : Gargantext.Ngrams.TFICF
+Description : TFICF Ngrams tools
+Copyright   : (c) CNRS, 2017
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Definition of TFICF
+
+-}
+
+{-# LANGUAGE DeriveGeneric        #-}
+
+module Gargantext.Ngrams.TFICF where
+
+import GHC.Generics (Generic)
+
+import Data.Maybe (Maybe)
+import Data.Text (Text)
+import Text.Show (Show())
+
+-- import Gargantext.Types
+import Gargantext.Prelude
+
+
+data Context = Corpus | Document
+  deriving (Show, Generic)
+
+data TFICF = TFICF { _tficfTerms    :: Text
+                   , _tficfContext1 :: Context
+                   , _tficfContext2 :: Context
+                   , _tficfScore    :: Maybe Double
+                   } deriving (Show, Generic)
+
+
+--tfidf :: Text -> TFICF
+--tfidf txt = TFICF txt Document Corpus score
+--    where
+--        score = Nothing
+
--- a/src/Gargantext/Ngrams/TFICF_hs
+++ b/src/Gargantext/Ngrams/TFICF_hs
-module Data.Gargantext.Ngrams.TFICF where
-
-
-
-data TFICF = TFICF { _tficfTerms    :: Text
-                   , _tficfContext1 :: Context
-                   , _tficfContext2 :: Context
-                   , _tficfScore    :: Maybe Double
-                   } deriving (Read, Show, Generics)
-
-
-tfidf :: Text -> TFICF
-tfidf txt = TFICF txt Document Corpus score
-    where
-        score = Nothing
-
--- a/src/Gargantext/Ngrams/Words_hs
+++ b/src/Gargantext/Ngrams/Words_hs
-module Data.Gargantext.Ngrams.Words where
-import Data.List (partition)
-import Data.Set (fromList, notMember, member)
-import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
-
-import NLP.Stemmer (stem, Stemmer(..))
-import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
-import Language.Aspell.Options (ACOption(..))
-
--import Data.Either.Utils (fromRight)
-import Data.ByteString.Internal (packChars)
-
-
-get_lang x = do
-    let lang = Lang (packChars x)
-    spell_lang <- spellCheckerWithOptions [lang]
-    return spell_lang
-
-check' lang x = check lang (packChars x)
-suggest' lang x = suggest lang (packChars x)
-
--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x
-
-- stem French "naturelles"
-
-
-- paragraphes
-- lines
-- sentences
-
-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
-repl :: Char -> Char
-repl x
-    | x == '\'' = ' '
-    | x == '/' = ' '
-    -- | x == '\t' = ' '
-    -- | x == '\n' = ' '
-    | otherwise = x
-
-cleanText text = do
-    -- pb avec \'
-    --words $ filter (not . isPunctuation) $ Prelude.map toLower text
-    words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text
-
-isMiamWord word = do
-    let miamWord_set = fromList ["salut", "phrase"]
-    member word miamWord_set
-
-isStopWord word = do
-    let stopWord_set = fromList ["de", "la", "une", "avec"]
-    member word stopWord_set
-
-wordsMain = do
-    let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
-    print $ partition (not . isStopWord) $ cleanText text
-    print $ filter (not . isStopWord) $ cleanText text
-    --print $ filter isStopWord $ words $ filter (not . isPunctuation) text
--- a/src/Gargantext/Parsers.hs
+++ b/src/Gargantext/Parsers.hs
@@ -25,11 +25,16 @@ import Gargantext.Prelude

 import System.FilePath (takeExtension, FilePath())
 import Data.Attoparsec.ByteString (parseOnly, Parser)
-import Data.ByteString as DB
-import Data.Map        as DM
+import qualified Data.ByteString as DB
+import qualified Data.Map        as DM
+import Data.Either.Extra (partitionEithers)
 import Data.Ord()
+import Data.Foldable (concat)
 import Data.String()
 import Data.Either.Extra(Either())
+
+import Data.Text (Text)
+import Data.Text.Encoding (decodeUtf8)
 ----
 --import Control.Monad (join)
 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
@@ -57,13 +62,20 @@ data FileFormat = WOS        -- Implemented (ISI Format)
 --                | XML        -- Not Implemented / see :
 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml

-parse :: FileFormat -> FilePath 
-      -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
+-- TODO: to debug maybe add the filepath in error message
+type ParseError = String
+
+
+parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
 parse format path = do
    files <- case takeExtension path of
              ".zip" -> openZip              path
              _      -> pure <$> DB.readFile path
-    mapConcurrently (runParser format) files
+    (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
+    pure (as, map toText $ concat bs)
+      where
+        -- TODO : decode with bayesian inference on encodings
+        toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))


 -- | withParser:

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>)
                 , Eq, (==), (<>)
                 , (&&), (||), not
-                 , toS
+                 , fst, snd, toS
                 )

 -- TODO import functions optimized in Utils.Count
 -- import Protolude hiding (head, last, all, any, sum, product, length)
 -- import Gargantext.Utils.Count
-
 import qualified Data.List     as L hiding (head, sum)
 import qualified Control.Monad as M
 import qualified Data.Map as Map