Commit b6df8e42 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TEXT-MINING] adding first functions/datatypes.

parent f152533b
......@@ -23,7 +23,6 @@ library:
- -Werror
exposed-modules:
- Gargantext
- Gargantext.Analysis
- Gargantext.DSL
- Gargantext.Database
- Gargantext.Database.Instances
......@@ -37,7 +36,9 @@ library:
- Gargantext.Database.Utils
- Gargantext.Database.User
- Gargantext.Ngrams
- Gargantext.Ngrams.Count
- Gargantext.Ngrams.Analysis
- Gargantext.Ngrams.TFICF
- Gargantext.Ngrams.Letters
- Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En
......
......@@ -62,7 +62,8 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
:> QueryParam "offset" Int
:> QueryParam "limit" Int
:> Get '[JSON] [Node HyperdataDocument]
:<|> "facet" :> "documents" :> FacetDocAPI
:<|> "facet" :> Summary " Facet documents"
:> "documents" :> FacetDocAPI
-- :<|> "facet" :<|> "sources" :<|> FacetSourcesAPI
-- :<|> "facet" :<|> "authors" :<|> FacetAuthorsAPI
-- :<|> "facet" :<|> "terms" :<|> FacetTermsAPI
......@@ -73,11 +74,13 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
type FacetDocAPI = "table"
:> Summary " Table data"
:> QueryParam "offset" Int
:> QueryParam "limit" Int
:> Get '[JSON] [FacetDoc]
:<|> "chart"
:> Summary " Chart data"
:> QueryParam "from" UTCTime
:> QueryParam "to" UTCTime
:> Get '[JSON] [FacetChart]
......
......@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t)
--userWithId t xs = userWith userUserId t xs
-- | not optimized (get all ngrams without filters)
ngrams :: PGS.Connection -> IO [Ngram]
ngrams conn = runQuery conn queryNgramTable
getNgrams :: PGS.Connection -> IO [Ngram]
getNgrams conn = runQuery conn queryNgramTable
module Gargantext.Ngrams ( module Gargantext.Ngrams.Count
{-|
Module : Gargantext.Ngrams
Description : Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams exctration.
Definitions of ngrams.
n non negative integer
-}
module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
--, module Gargantext.Ngrams.Hetero
, module Gargantext.Ngrams.CoreNLP
, module Gargantext.Ngrams.Parser
, module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, module Gargantext.Ngrams.CoreNLP
, module Gargantext.Ngrams.Parser
, module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, ngrams, occurrences
--, module Gargantext.Ngrams.Words
) where
) where
import Gargantext.Ngrams.Count
import Gargantext.Ngrams.Letters
--import Gargantext.Ngrams.Hetero
import Gargantext.Ngrams.CoreNLP
import Gargantext.Ngrams.Parser
......@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining
--import Gargantext.Ngrams.Words
import Gargantext.Ngrams.Metrics
-----------------------------------------------------------------
import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, insertWith)
import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter)
-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms = words
isGram :: Char -> Bool
isGram '-' = True
isGram c = isAlpha c || isSpace c
-- | Compute the occurrences
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
......@@ -11,7 +11,7 @@ Portability : POSIX
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Analysis
module Gargantext.Ngrams.Analysis
where
import Gargantext.Prelude (undefined, IO(), Int())
......
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-|
Module : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Ngrams.CoreNLP where
......@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators :: Text
$(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { sentences :: [Sentence]}
data Sentences = Sentences { _sentences :: [Sentence]}
deriving (Show, Generic)
instance ToJSON Sentences
instance FromJSON Sentences
......@@ -102,7 +114,7 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Ngrams.Count where
{-|
Module : Gargantext.Ngrams.Letters
Description : Ngrams.Letters module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
import Gargantext.Prelude
Sugar to work on letters with Text.
-}
import Data.Foldable as F
{-# LANGUAGE OverloadedStrings #-}
import Data.Map.Strict (insertWith)
import Data.Map (Map)
import qualified Data.Map as M
module Gargantext.Ngrams.Letters where
--import qualified Data.Text.Lazy.IO as DTLIO
import qualified Data.Text.Lazy as DTL
-- import qualified Data.Text.Lazy.IO as DTLIO
import Gargantext.Prelude
-- | /O(n)/ Breaks a 'Text' up into each Text list of chars.
-- from slower to faster:
......@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text]
letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) []
-- words
-- lines
-- words between punctuation
-- number of punctuation
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) M.empty xs
-- for optimization :
--occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
--countMain :: IO ()
--countMain = do
-- (fichier:_) <- getArgs
-- c <- DTLIO.readFile fichier
-- --print $ occurrences $ DTL.chunksOf 1 c
-- pure $ occurrences $ letters'' c
-- --print $ occurrences $ DTL.words $ DTL.toLower c
--
......@@ -8,8 +8,7 @@ Maintainer : sample@email.com
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Mainly reexport functions in @Data.Text.Metrics@
-}
module Gargantext.Ngrams.Metrics (levenshtein
......
......@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' lang t = map (map token2text)
<$> map _sentenceTokens
<$> sentences
<$> _sentences
<$> corenlp lang t
-- | This function selects ngrams according to grammars specific
......
{-|
Module : Gargantext.Ngrams.TFICF
Description : TFICF Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Definition of TFICF
-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Ngrams.TFICF where
import GHC.Generics (Generic)
import Data.Maybe (Maybe)
import Data.Text (Text)
import Text.Show (Show())
-- import Gargantext.Types
import Gargantext.Prelude
data Context = Corpus | Document
deriving (Show, Generic)
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Show, Generic)
--tfidf :: Text -> TFICF
--tfidf txt = TFICF txt Document Corpus score
-- where
-- score = Nothing
module Data.Gargantext.Ngrams.TFICF where
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Read, Show, Generics)
tfidf :: Text -> TFICF
tfidf txt = TFICF txt Document Corpus score
where
score = Nothing
module Data.Gargantext.Ngrams.Words where
import Data.List (partition)
import Data.Set (fromList, notMember, member)
import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
import NLP.Stemmer (stem, Stemmer(..))
import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
import Language.Aspell.Options (ACOption(..))
--import Data.Either.Utils (fromRight)
import Data.ByteString.Internal (packChars)
get_lang x = do
let lang = Lang (packChars x)
spell_lang <- spellCheckerWithOptions [lang]
return spell_lang
check' lang x = check lang (packChars x)
suggest' lang x = suggest lang (packChars x)
--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x
-- stem French "naturelles"
-- paragraphes
-- lines
-- sentences
-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
repl :: Char -> Char
repl x
| x == '\'' = ' '
| x == '/' = ' '
-- | x == '\t' = ' '
-- | x == '\n' = ' '
| otherwise = x
cleanText text = do
-- pb avec \'
--words $ filter (not . isPunctuation) $ Prelude.map toLower text
words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text
isMiamWord word = do
let miamWord_set = fromList ["salut", "phrase"]
member word miamWord_set
isStopWord word = do
let stopWord_set = fromList ["de", "la", "une", "avec"]
member word stopWord_set
wordsMain = do
let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
print $ partition (not . isStopWord) $ cleanText text
print $ filter (not . isStopWord) $ cleanText text
--print $ filter isStopWord $ words $ filter (not . isPunctuation) text
......@@ -25,11 +25,16 @@ import Gargantext.Prelude
import System.FilePath (takeExtension, FilePath())
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.ByteString as DB
import Data.Map as DM
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import Data.Either.Extra (partitionEithers)
import Data.Ord()
import Data.Foldable (concat)
import Data.String()
import Data.Either.Extra(Either())
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
----
--import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
......@@ -57,13 +62,20 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
parse :: FileFormat -> FilePath
-> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
-- TODO: to debug maybe add the filepath in error message
type ParseError = String
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
mapConcurrently (runParser format) files
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-- | withParser:
......
......@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>)
, Eq, (==), (<>)
, (&&), (||), not
, toS
, fst, snd, toS
)
-- TODO import functions optimized in Utils.Count
-- import Protolude hiding (head, last, all, any, sum, product, length)
-- import Gargantext.Utils.Count
import qualified Data.List as L hiding (head, sum)
import qualified Control.Monad as M
import qualified Data.Map as Map
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment