Commit b6df8e42 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TEXT-MINING] adding first functions/datatypes.

parent f152533b
...@@ -23,7 +23,6 @@ library: ...@@ -23,7 +23,6 @@ library:
- -Werror - -Werror
exposed-modules: exposed-modules:
- Gargantext - Gargantext
- Gargantext.Analysis
- Gargantext.DSL - Gargantext.DSL
- Gargantext.Database - Gargantext.Database
- Gargantext.Database.Instances - Gargantext.Database.Instances
...@@ -37,7 +36,9 @@ library: ...@@ -37,7 +36,9 @@ library:
- Gargantext.Database.Utils - Gargantext.Database.Utils
- Gargantext.Database.User - Gargantext.Database.User
- Gargantext.Ngrams - Gargantext.Ngrams
- Gargantext.Ngrams.Count - Gargantext.Ngrams.Analysis
- Gargantext.Ngrams.TFICF
- Gargantext.Ngrams.Letters
- Gargantext.Ngrams.CoreNLP - Gargantext.Ngrams.CoreNLP
- Gargantext.Ngrams.Parser - Gargantext.Ngrams.Parser
- Gargantext.Ngrams.Lang.En - Gargantext.Ngrams.Lang.En
......
...@@ -62,7 +62,8 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument) ...@@ -62,7 +62,8 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
:> QueryParam "offset" Int :> QueryParam "offset" Int
:> QueryParam "limit" Int :> QueryParam "limit" Int
:> Get '[JSON] [Node HyperdataDocument] :> Get '[JSON] [Node HyperdataDocument]
:<|> "facet" :> "documents" :> FacetDocAPI :<|> "facet" :> Summary " Facet documents"
:> "documents" :> FacetDocAPI
-- :<|> "facet" :<|> "sources" :<|> FacetSourcesAPI -- :<|> "facet" :<|> "sources" :<|> FacetSourcesAPI
-- :<|> "facet" :<|> "authors" :<|> FacetAuthorsAPI -- :<|> "facet" :<|> "authors" :<|> FacetAuthorsAPI
-- :<|> "facet" :<|> "terms" :<|> FacetTermsAPI -- :<|> "facet" :<|> "terms" :<|> FacetTermsAPI
...@@ -73,11 +74,13 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument) ...@@ -73,11 +74,13 @@ type NodeAPI = Get '[JSON] (Node HyperdataDocument)
type FacetDocAPI = "table" type FacetDocAPI = "table"
:> Summary " Table data"
:> QueryParam "offset" Int :> QueryParam "offset" Int
:> QueryParam "limit" Int :> QueryParam "limit" Int
:> Get '[JSON] [FacetDoc] :> Get '[JSON] [FacetDoc]
:<|> "chart" :<|> "chart"
:> Summary " Chart data"
:> QueryParam "from" UTCTime :> QueryParam "from" UTCTime
:> QueryParam "to" UTCTime :> QueryParam "to" UTCTime
:> Get '[JSON] [FacetChart] :> Get '[JSON] [FacetChart]
......
...@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t) ...@@ -67,5 +67,5 @@ findWith f t = find (\x -> f x == t)
--userWithId t xs = userWith userUserId t xs --userWithId t xs = userWith userUserId t xs
-- | not optimized (get all ngrams without filters) -- | not optimized (get all ngrams without filters)
ngrams :: PGS.Connection -> IO [Ngram] getNgrams :: PGS.Connection -> IO [Ngram]
ngrams conn = runQuery conn queryNgramTable getNgrams conn = runQuery conn queryNgramTable
module Gargantext.Ngrams ( module Gargantext.Ngrams.Count {-|
Module : Gargantext.Ngrams
Description : Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Ngrams exctration.
Definitions of ngrams.
n non negative integer
-}
module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
--, module Gargantext.Ngrams.Hetero --, module Gargantext.Ngrams.Hetero
, module Gargantext.Ngrams.CoreNLP , module Gargantext.Ngrams.CoreNLP
, module Gargantext.Ngrams.Parser , module Gargantext.Ngrams.Parser
, module Gargantext.Ngrams.Occurrences , module Gargantext.Ngrams.Occurrences
, module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics , module Gargantext.Ngrams.Metrics
, ngrams, occurrences
--, module Gargantext.Ngrams.Words --, module Gargantext.Ngrams.Words
) where ) where
import Gargantext.Ngrams.Count import Gargantext.Ngrams.Letters
--import Gargantext.Ngrams.Hetero --import Gargantext.Ngrams.Hetero
import Gargantext.Ngrams.CoreNLP import Gargantext.Ngrams.CoreNLP
import Gargantext.Ngrams.Parser import Gargantext.Ngrams.Parser
...@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining ...@@ -19,3 +36,35 @@ import Gargantext.Ngrams.TextMining
--import Gargantext.Ngrams.Words --import Gargantext.Ngrams.Words
import Gargantext.Ngrams.Metrics import Gargantext.Ngrams.Metrics
-----------------------------------------------------------------
import Data.Char (Char, isAlpha, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map, empty, insertWith)
import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter)
-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms = words
isGram :: Char -> Bool
isGram '-' = True
isGram c = isAlpha c || isSpace c
-- | Compute the occurrences
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
...@@ -11,7 +11,7 @@ Portability : POSIX ...@@ -11,7 +11,7 @@ Portability : POSIX
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Analysis module Gargantext.Ngrams.Analysis
where where
import Gargantext.Prelude (undefined, IO(), Int()) import Gargantext.Prelude (undefined, IO(), Int())
......
{-|
Module : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Ngrams.CoreNLP where module Gargantext.Ngrams.CoreNLP where
...@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators :: Text ...@@ -51,7 +63,7 @@ data Properties = Properties { _propertiesAnnotators :: Text
$(deriveJSON (unPrefix "_properties") ''Properties) $(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { sentences :: [Sentence]} data Sentences = Sentences { _sentences :: [Sentence]}
deriving (Show, Generic) deriving (Show, Generic)
instance ToJSON Sentences instance ToJSON Sentences
instance FromJSON Sentences instance FromJSON Sentences
...@@ -102,7 +114,7 @@ corenlp lang txt = do ...@@ -102,7 +114,7 @@ corenlp lang txt = do
-- parseWith _tokenNer "Hello world of Peter." -- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]] -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]] tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
{-# LANGUAGE OverloadedStrings #-} {-|
Module : Gargantext.Ngrams.Letters
module Gargantext.Ngrams.Count where Description : Ngrams.Letters module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
import Gargantext.Prelude Sugar to work on letters with Text.
-}
import Data.Foldable as F {-# LANGUAGE OverloadedStrings #-}
import Data.Map.Strict (insertWith) module Gargantext.Ngrams.Letters where
import Data.Map (Map)
import qualified Data.Map as M
--import qualified Data.Text.Lazy.IO as DTLIO
import qualified Data.Text.Lazy as DTL import qualified Data.Text.Lazy as DTL
-- import qualified Data.Text.Lazy.IO as DTLIO
import Gargantext.Prelude
-- | /O(n)/ Breaks a 'Text' up into each Text list of chars. -- | /O(n)/ Breaks a 'Text' up into each Text list of chars.
-- from slower to faster: -- from slower to faster:
...@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text] ...@@ -26,23 +32,3 @@ letters'' :: DTL.Text -> [DTL.Text]
letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) [] letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) []
-- words
-- lines
-- words between punctuation
-- number of punctuation
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' (\x y -> insertWith (+) y 1 x) M.empty xs
-- for optimization :
--occurrences' :: Ord a => [a] -> Map a Integer
--occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
--countMain :: IO ()
--countMain = do
-- (fichier:_) <- getArgs
-- c <- DTLIO.readFile fichier
-- --print $ occurrences $ DTL.chunksOf 1 c
-- pure $ occurrences $ letters'' c
-- --print $ occurrences $ DTL.words $ DTL.toLower c
--
...@@ -8,8 +8,7 @@ Maintainer : sample@email.com ...@@ -8,8 +8,7 @@ Maintainer : sample@email.com
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Here is a longer description of this module, containing some Mainly reexport functions in @Data.Text.Metrics@
commentary with @some markup@.
-} -}
module Gargantext.Ngrams.Metrics (levenshtein module Gargantext.Ngrams.Metrics (levenshtein
......
...@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s ...@@ -38,7 +38,7 @@ extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: Language -> Text -> IO [[Ngrams]] extractNgrams' :: Language -> Text -> IO [[Ngrams]]
extractNgrams' lang t = map (map token2text) extractNgrams' lang t = map (map token2text)
<$> map _sentenceTokens <$> map _sentenceTokens
<$> sentences <$> _sentences
<$> corenlp lang t <$> corenlp lang t
-- | This function selects ngrams according to grammars specific -- | This function selects ngrams according to grammars specific
......
{-|
Module : Gargantext.Ngrams.TFICF
Description : TFICF Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Definition of TFICF
-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Ngrams.TFICF where
import GHC.Generics (Generic)
import Data.Maybe (Maybe)
import Data.Text (Text)
import Text.Show (Show())
-- import Gargantext.Types
import Gargantext.Prelude
data Context = Corpus | Document
deriving (Show, Generic)
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Show, Generic)
--tfidf :: Text -> TFICF
--tfidf txt = TFICF txt Document Corpus score
-- where
-- score = Nothing
module Data.Gargantext.Ngrams.TFICF where
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Read, Show, Generics)
tfidf :: Text -> TFICF
tfidf txt = TFICF txt Document Corpus score
where
score = Nothing
module Data.Gargantext.Ngrams.Words where
import Data.List (partition)
import Data.Set (fromList, notMember, member)
import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
import NLP.Stemmer (stem, Stemmer(..))
import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
import Language.Aspell.Options (ACOption(..))
--import Data.Either.Utils (fromRight)
import Data.ByteString.Internal (packChars)
get_lang x = do
let lang = Lang (packChars x)
spell_lang <- spellCheckerWithOptions [lang]
return spell_lang
check' lang x = check lang (packChars x)
suggest' lang x = suggest lang (packChars x)
--spell_lang <- spellChecker
--lang = fromRight s
--suggest' lang x
-- stem French "naturelles"
-- paragraphes
-- lines
-- sentences
-- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
repl :: Char -> Char
repl x
| x == '\'' = ' '
| x == '/' = ' '
-- | x == '\t' = ' '
-- | x == '\n' = ' '
| otherwise = x
cleanText text = do
-- pb avec \'
--words $ filter (not . isPunctuation) $ Prelude.map toLower text
words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text
isMiamWord word = do
let miamWord_set = fromList ["salut", "phrase"]
member word miamWord_set
isStopWord word = do
let stopWord_set = fromList ["de", "la", "une", "avec"]
member word stopWord_set
wordsMain = do
let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
print $ partition (not . isStopWord) $ cleanText text
print $ filter (not . isStopWord) $ cleanText text
--print $ filter isStopWord $ words $ filter (not . isPunctuation) text
...@@ -25,11 +25,16 @@ import Gargantext.Prelude ...@@ -25,11 +25,16 @@ import Gargantext.Prelude
import System.FilePath (takeExtension, FilePath()) import System.FilePath (takeExtension, FilePath())
import Data.Attoparsec.ByteString (parseOnly, Parser) import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.ByteString as DB import qualified Data.ByteString as DB
import Data.Map as DM import qualified Data.Map as DM
import Data.Either.Extra (partitionEithers)
import Data.Ord() import Data.Ord()
import Data.Foldable (concat)
import Data.String() import Data.String()
import Data.Either.Extra(Either()) import Data.Either.Extra(Either())
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
---- ----
--import Control.Monad (join) --import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries) import Codec.Archive.Zip (withArchive, getEntry, getEntries)
...@@ -57,13 +62,20 @@ data FileFormat = WOS -- Implemented (ISI Format) ...@@ -57,13 +62,20 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- | XML -- Not Implemented / see : -- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
parse :: FileFormat -> FilePath -- TODO: to debug maybe add the filepath in error message
-> IO [Either String [[(DB.ByteString, DB.ByteString)]]] type ParseError = String
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do parse format path = do
files <- case takeExtension path of files <- case takeExtension path of
".zip" -> openZip path ".zip" -> openZip path
_ -> pure <$> DB.readFile path _ -> pure <$> DB.readFile path
mapConcurrently (runParser format) files (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
-- | withParser: -- | withParser:
......
...@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer ...@@ -27,13 +27,12 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>) , (+), (*), (/), (-), (.), (>=), ($), (**), (^), (<), (>)
, Eq, (==), (<>) , Eq, (==), (<>)
, (&&), (||), not , (&&), (||), not
, toS , fst, snd, toS
) )
-- TODO import functions optimized in Utils.Count -- TODO import functions optimized in Utils.Count
-- import Protolude hiding (head, last, all, any, sum, product, length) -- import Protolude hiding (head, last, all, any, sum, product, length)
-- import Gargantext.Utils.Count -- import Gargantext.Utils.Count
import qualified Data.List as L hiding (head, sum) import qualified Data.List as L hiding (head, sum)
import qualified Control.Monad as M import qualified Control.Monad as M
import qualified Data.Map as Map import qualified Data.Map as Map
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment