Commit 697d44ea authored by Alexandre Delanoë

[STRUCTURE] Simplify the module hierarchy and clean up old code.

parent f4e687d5
......@@ -22,41 +22,20 @@ library:
- -Werror
exposed-modules:
- Gargantext
- Gargantext.Database
- Gargantext.Database.Instances
- Gargantext.Database.Ngram
- Gargantext.Database.Node
- Gargantext.Database.Facet
- Gargantext.Database.NodeNgram
- Gargantext.Database.NodeNgramNgram
- Gargantext.Database.NodeNode
- Gargantext.Database.NodeNodeNgram
- Gargantext.Database.Utils
- Gargantext.Database.User
- Gargantext.Prelude
- Gargantext.Core
- Gargantext.Core.Utils
- Gargantext.Core.Types
- Gargantext.Core.Types.Node
- Gargantext.Text
- Gargantext.Text.Analysis
- Gargantext.Text.TFICF
- Gargantext.Text.Letters
- Gargantext.Text.CoreNLP
- Gargantext.Text.Parser
- Gargantext.Text.Token.Text
- Gargantext.Text.Lang.En
- Gargantext.Text.Stem.En
- Gargantext.Text.Lang.Fr
- Gargantext.Text.Metrics
- Gargantext.Text.TextMining
- Gargantext.Text.Occurrences
- Gargantext.Text.Parsers
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Metrics.Occurrences
- Gargantext.Text.Metrics.FrequentItemSet
- Gargantext.Text.Ngrams.PosTagging.CoreNLP
- Gargantext.Text.Ngrams.PosTagging.Parser
- Gargantext.Text.Parsers.Date
- Gargantext.Prelude
- Gargantext.Database
- Gargantext.API
- Gargantext.API.Auth
- Gargantext.Types
- Gargantext.Types.Main
- Gargantext.Types.Node
- Gargantext.Utils.DateUtils
- Gargantext.Utils.Prefix
dependencies:
- QuickCheck
- aeson
......
......@@ -11,7 +11,7 @@ Portability : POSIX
{-# LANGUAGE NoImplicitPrelude #-}
import Gargantext.Prelude
import Gargantext.Types.Main (Language(..))
import Gargantext.Core (Lang(..))
--import qualified Ngrams.Lang.Fr as Fr
import qualified Ngrams.Lang as Lang
import qualified Ngrams.Lang.Occurrences as Occ
......
......@@ -18,11 +18,11 @@ module Ngrams.Lang where
import Gargantext.Prelude (IO())
import Gargantext.Types.Main (Language(..))
import Gargantext.Core (Lang(..))
import qualified Ngrams.Lang.Fr as Fr
import qualified Ngrams.Lang.En as En
ngramsExtractionTest :: Language -> IO ()
ngramsExtractionTest :: Lang -> IO ()
ngramsExtractionTest FR = Fr.ngramsExtractionTest
ngramsExtractionTest EN = En.ngramsExtractionTest
......@@ -23,8 +23,8 @@ import Data.Text (Text)
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Types.Main (Language(..))
import Gargantext.Text.Parser (extractNgrams, selectNgrams)
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.Parser (extractNgrams, selectNgrams)
ngramsExtractionTest :: IO ()
......
......@@ -20,8 +20,8 @@ module Ngrams.Lang.Fr where
import Test.Hspec
import Gargantext.Prelude
import Gargantext.Types.Main (Language(..))
import Gargantext.Text.Parser (extractNgrams, selectNgrams)
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.Parser (extractNgrams, selectNgrams)
ngramsExtractionTest :: IO ()
ngramsExtractionTest = hspec $ do
......
......@@ -22,7 +22,7 @@ import Test.Hspec
import Data.Either (Either(Right))
import Gargantext.Prelude
import Gargantext.Text.Occurrences (parseOccurrences)
import Gargantext.Text.Metrics.Occurrences (parseOccurrences)
parsersTest :: IO ()
parsersTest = hspec $ do
......
......@@ -15,11 +15,9 @@ commentary with @some markup@.
module Parsers.Date where
import Gargantext.Prelude
import Test.Hspec
import Test.QuickCheck
import Parsers.Types
import Control.Applicative ((<*>))
import Data.Either (Either(..))
......@@ -28,7 +26,12 @@ import Data.Text (pack, Text)
import Text.Parsec.Error (ParseError)
import Duckling.Time.Types (toRFC3339)
import Gargantext.Parsers.Date (fromRFC3339)
-----------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Text.Parsers.Date (fromRFC3339)
import Parsers.Types
-----------------------------------------------------------
fromRFC3339Inv :: Either ParseError ZonedTime -> Text
fromRFC3339Inv (Right z) = toRFC3339 z
......
......@@ -42,7 +42,7 @@ import Test.QuickCheck (elements)
-- import Control.Applicative ((<*>))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Gargantext.Core.Utils.Prefix (unPrefix)
-----------------------------------------------------------------------
type CountAPI = Post '[JSON] Counts
......
......@@ -40,7 +40,7 @@ import Servant
-- import Servant.Multipart
import Gargantext.Prelude
import Gargantext.Types.Node
import Gargantext.Core.Types.Node
import Gargantext.Database.Node ( getNodesWithParentId
, getNode, getNodesWith
, deleteNode, deleteNodes)
......
{-|
Module : Gargantext.Core
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
module Gargantext.Core
where
------------------------------------------------------------------------
-- | Language of a Text
-- For simplicity, we assume a text has a homogeneous language
data Lang = EN | FR
-- | DE | IT | SP
-- > EN == english
-- > FR == french
-- > DE == german (not implemented yet)
-- > IT == italian (not implemented yet)
-- > SP == spanish (not implemented yet)
-- > ... add your language and help us to implement it (:
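A minimal usage sketch (not part of this commit): code elsewhere is expected to dispatch on Lang by pattern matching. The helper name below is hypothetical and assumes Data.Text's Text is in scope.
-- Hypothetical helper, for illustration only.
langName :: Lang -> Text
langName EN = "english"
langName FR = "french"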
......@@ -13,9 +13,9 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Types ( module Gargantext.Types.Main
, module Gargantext.Types.Node
module Gargantext.Core.Types ( module Gargantext.Core.Types.Main
, module Gargantext.Core.Types.Node
) where
import Gargantext.Types.Main
import Gargantext.Types.Node
import Gargantext.Core.Types.Main
import Gargantext.Core.Types.Node
{-|
Module : Gargantext.Types.Main
Module : Gargantext.Core.Types.Main
Description : Short description
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -19,7 +19,7 @@ commentary with @some markup@.
{-# LANGUAGE OverloadedStrings #-}
------------------------------------------------------------------------
module Gargantext.Types.Main where
module Gargantext.Core.Types.Main where
------------------------------------------------------------------------
import Data.Maybe (fromMaybe)
......@@ -28,22 +28,10 @@ import Data.Monoid ((<>))
import Data.Text (Text, pack)
import Data.List (lookup)
import Gargantext.Types.Node
import Gargantext.Core.Types.Node
import Gargantext.Prelude
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Language of a Text
-- For simplicity, we assume a text has a homogeneous language
data Language = EN | FR
-- | DE | IT | SP
-- > EN == english
-- > FR == french
-- > DE == german (not implemented yet)
-- > IT == italian (not implemented yet)
-- > SP == spanish (not implemented yet)
-- > ... add your language and help us to implement it (:
-- The whole database is structured as a hierarchical Tree
data Tree a = NodeT a [Tree a]
deriving (Show, Read, Eq)
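For illustration (not part of the diff), a tiny value of this Tree type, assuming Int is in scope via the project prelude:
-- Hypothetical example value.
exampleTree :: Tree Int
exampleTree = NodeT 1 [NodeT 2 [], NodeT 3 []]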
......
{-|
Module : Gargantext.Types.Nodes
Module : Gargantext.Core.Types.Node
Description : Main Types of Nodes
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -18,7 +18,7 @@ Portability : POSIX
{-# LANGUAGE TemplateHaskell #-}
-- {-# LANGUAGE DuplicateRecordFields #-}
module Gargantext.Types.Node where
module Gargantext.Core.Types.Node where
import Prelude (Enum, Bounded, minBound, maxBound)
......@@ -48,7 +48,7 @@ import Test.QuickCheck.Arbitrary
import Test.QuickCheck (elements)
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Gargantext.Core.Utils.Prefix (unPrefix)
------------------------------------------------------------------------
......
......@@ -22,7 +22,7 @@ Phylomemy was first described in [REF].
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Types.Phylo where
module Gargantext.Core.Types.Phylo where
import Data.Aeson.TH (deriveJSON)
import Data.Maybe (Maybe)
......@@ -32,7 +32,7 @@ import Data.Time.Clock.POSIX (POSIXTime)
import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Gargantext.Core.Utils.Prefix (unPrefix)
------------------------------------------------------------------------
-- | Phylo datatype descriptor of a phylomemy
......
......@@ -13,9 +13,9 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Utils (
module Gargantext.Core.Utils (
-- module Gargantext.Utils.Chronos
module Gargantext.Utils.Prefix
module Gargantext.Core.Utils.Prefix
) where
-- import Gargantext.Utils.Chronos
import Gargantext.Utils.Prefix
import Gargantext.Core.Utils.Prefix
{-|
Module : Gargantext.Utils.Counts
Module : Gargantext.Core.Utils.Count
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -9,6 +9,9 @@ Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
Inspired by Gabriel Gonzalez's "beautiful folds" talk.
-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -16,15 +19,17 @@ commentary with @some markup@.
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RankNTypes #-}
module Data.Gargantext.Utils.Count (head, last, all, any, sum, product, length)
module Gargantext.Core.Utils.Count (head, last, all, any, sum, product, length)
where
import Data.Monoid
import Protolude hiding ((<>), head, last, all, any, sum, product, length)
import Data.Functor
import Control.Applicative
import qualified Data.Foldable
import Data.Monoid
import Control.Lens (Getting, foldMapOf)
import Gargantext.Prelude hiding (head, sum, length)
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
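As a point of reference (not in this commit), the classic sum fold in this encoding, using the Sum monoid from Data.Monoid, which is already imported above:
-- Hypothetical example, for illustration only.
sumF :: Num a => Fold a a
sumF = Fold Sum getSum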
instance Functor (Fold i) where
......
{-|
Module : Gargantext.Utils.DateUtils
Module : Gargantext.Core.Utils.DateUtils
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -10,7 +10,7 @@ Portability : POSIX
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Utils.DateUtils where
module Gargantext.Core.Utils.DateUtils where
import Gargantext.Prelude
import Data.Time (UTCTime, toGregorian, utctDay)
......
{-|
Module : Gargantext.Utils.Prefix
Module : Gargantext.Core.Utils.Prefix
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -13,7 +13,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Utils.Prefix where
module Gargantext.Core.Utils.Prefix where
import Prelude
......
......@@ -49,13 +49,13 @@ import qualified Opaleye.Internal.Unpackspec()
import Test.QuickCheck.Arbitrary
import Test.QuickCheck (elements)
import Gargantext.Types
import Gargantext.Types.Node (NodeType)
import Gargantext.Core.Types
import Gargantext.Core.Types.Node (NodeType)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.NodeNode
import Gargantext.Database.NodeNodeNgram
import Gargantext.Database.Node
import Gargantext.Database.Queries
import Gargantext.Utils.Prefix (unPrefix)
-- import Gargantext.Database.NodeNgram
------------------------------------------------------------------------
......
......@@ -29,8 +29,8 @@ import Database.PostgreSQL.Simple.FromField ( Conversion
)
import Prelude hiding (null, id, map, sum)
import Gargantext.Types
import Gargantext.Types.Node (NodeType)
import Gargantext.Core.Types
import Gargantext.Core.Types.Node (NodeType)
import Gargantext.Database.Queries
import Gargantext.Prelude hiding (sum)
......
......@@ -22,7 +22,7 @@ Portability : POSIX
module Gargantext.Database.Queries where
import Gargantext.Prelude
import Gargantext.Types (Limit, Offset, NodePoly)
import Gargantext.Core.Types (Limit, Offset, NodePoly)
import Data.Maybe (Maybe, maybe)
import Control.Arrow ((>>>))
import Control.Applicative ((<*>))
......
......@@ -24,7 +24,6 @@ commentary with @some markup@.
module Gargantext.Database.User where
import Gargantext.Prelude
import GHC.Show(Show(..))
import Data.Eq(Eq(..))
import Data.Time (UTCTime)
......@@ -40,6 +39,8 @@ import Opaleye
-- Functions only
import Data.List (find)
import Gargantext.Prelude
data UserLight = UserLight { userLight_id :: Int
, userLight_username :: Text
......
module Data.Gargantext.Error where
import Control.Exception (Exception)
import Data.Typeable (Typeable)
import GHC.Generics (Generic)
import Network.HTTP.Client (HttpException)
import System.Exit (ExitCode (..), exitWith)
import System.IO (stderr)
--import Text.Parsec.Error
--import Text.Parsec.Pos hiding (Line)
data GargError = GargIOError String IOError
| GargHttpError String HttpException
| GargParseError String
| GargNgramsError String
| GargDatabaseError String
deriving (Show, Typeable, Generic)
instance Exception GargError
-- | Handle GargError by exiting with an error message.
handleError :: Either GargError a -> IO a
handleError (Right r) = pure r
handleError (Left e) =
case e of
GargIOError _ err' -> ioError err'
GargHttpError u err' -> err 61 $
"Could not fetch " ++ u ++ "\n" ++ show err'
GargParseError s -> err 64 s
GargNgramsError s -> err 0 s
GargDatabaseError s -> err 0 s
err :: Int -> String -> IO a
err exitCode msg = do
UTF8.hPutStrLn stderr msg
exitWith $ ExitFailure exitCode
return undefined
......@@ -15,38 +15,18 @@ n non negative integer
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text ( module Gargantext.Text.Letters
--, module Gargantext.Text.Hetero
, module Gargantext.Text.CoreNLP
, module Gargantext.Text.Parser
, module Gargantext.Text.Occurrences
, module Gargantext.Text.TextMining
, module Gargantext.Text.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis, clean
, ListName(..), equivNgrams, isGram, sentences
, ngramsTest
) where
import Gargantext.Text.Letters
--import Gargantext.Text.Hetero
import Gargantext.Text.CoreNLP
import Gargantext.Text.Parser
import Gargantext.Text.Occurrences
import Gargantext.Text.TextMining
--import Gargantext.Text.Words
import Gargantext.Text.Metrics
import qualified Gargantext.Text.FrequentItemSet as FIS
-----------------------------------------------------------------
module Gargantext.Text
where
import Data.List (sort)
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, filter, toLower, split, lines, concat)
import Data.Text (Text, filter, toLower, split, splitOn)
import qualified Data.Text as DT
import Data.Text.IO (readFile)
--import Data.Text.IO (readFile)
import Data.Set (Set)
import qualified Data.Set as S
import Data.Map.Strict (Map
, empty
, insertWith, unionWith
......@@ -55,25 +35,38 @@ import Data.Map.Strict (Map
)
import qualified Data.Map.Strict as M (filter)
import Data.Foldable (foldl')
import Gargantext.Prelude hiding (filter)
-- Maybe useful later:
--import NLP.Stemmer (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
-----------------------------------------------------------------
import Gargantext.Text.Ngrams.Stem.En
import qualified Gargantext.Text.Metrics.FrequentItemSet as FIS
import Gargantext.Prelude hiding (filter)
-----------------------------------------------------------------
data ListName = Stop | Candidate | Graph
deriving (Show, Eq)
data Ngrams = Ngrams { _ngramsNgrams :: [Text]
, _ngramsStem :: [Text]
, _ngramsListName :: Maybe ListName
data Ngroup = Ngroup { _ngroup_label :: Ngrams
, _ngroup_ngrams :: [Ngrams]
} deriving (Show)
data Ngrams = Ngrams { _ngrams_label :: [Text]
, _ngrams_stem :: Set Text
} deriving (Show)
text2ngrams :: Text -> Ngrams
text2ngrams txt = Ngrams txt' (S.fromList $ map stem txt')
where
txt' = splitOn " " txt
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
= (sort n1) == (sort n2) || (sort s1) == (sort s2)
equivNgrams (Ngrams _ s1) (Ngrams _ s2) = s1 `S.isSubsetOf` s2
|| s2 `S.isSubsetOf` s1
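A quick usage sketch of the new stem-based equivalence (not part of the diff); it assumes the English stemmer maps "dogs" to "dog" and leaves "black" unchanged, so both phrases share the same stem set:
-- Hypothetical check, for illustration only (expected: True).
checkEquiv :: Bool
checkEquiv = equivNgrams (text2ngrams "black dogs") (text2ngrams "black dog")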
type Occ = Int
--type Index = Int
......@@ -149,20 +142,25 @@ sentences txt = split isStop txt
isStop :: Char -> Bool
isStop c = c `elem` ['.','?','!']
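For intuition (not part of the diff), a worked example of sentences, relying on Data.Text.split keeping the segments between delimiters:
-- Hypothetical example, for illustration only.
sentencesExample :: [Text]
sentencesExample = sentences "Hello. World!"   -- expected: ["Hello", " World", ""]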
-- | Tests
ngramsTest fp = ws
where
txt = concat <$> lines <$> clean <$> readFile fp
-- | Number of sentences
ls = sentences <$> txt
-- | Number of monograms used in the full text
ws = ngrams <$> txt
-- | stem ngrams
-- TODO
-- group ngrams
ocs = occ <$> ws
--
---- | https://en.wikipedia.org/wiki/Text_mining
--testText :: Text
--testText = DT.pack "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities). Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods. A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
--
--
--
---- | Tests
----ngramsTest :: [Text]
--ngramsTest = ocs
-- where
-- --txt = concat <$> lines <$> clean <$> readFile filePath
-- txt = clean $ testText
-- -- | Number of sentences
-- ls = sentences $ txt
-- -- | Number of monograms used in the full text
-- ws = ngrams $ txt
-- -- | stem ngrams
-- -- TODO
-- -- group ngrams
-- ocs = occ $ ws
--
--
{-|
Module : Gargantext.Analysis
Description : Gargantext Analysis
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Analysis
where
import Gargantext.Prelude (undefined, IO(), Int())
-- import qualified Data.Text.Lazy as DTL
import Data.Text
import Opaleye (Column, PGInt4)
--import Data.Map as DM
--import Data.Vector as DV
-- | Simple function to count Occurrences in a context of text.
occOfDocument :: Column PGInt4 -> Text -> IO Int
occOfDocument = undefined
--occOfDocument c_id txt = do
-- docs <- pm (hyperdataDocument_Abstract . node_hyperdata) <$> getCorpusDocument c_id
-- let occs = pm (\x -> maybe "" identity x) docs
-- let result = case sequence $ pm (parseOccurrences txt) occs of
-- -- TODO find a way to get nice error messages (file, function, line)
-- Left str -> error $ "[ERROR] at file/function/line" ++ str
-- Right xs -> xs
-- pure (sum result)
data Occurrences a b = Map a b
{-|
Module : Gargantext.Text.Letters
Description : Ngrams.Letters module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Sugar to work on letters with Text.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Letters where
import qualified Data.Text.Lazy as DTL
-- import qualified Data.Text.Lazy.IO as DTLIO
import Gargantext.Prelude
-- | /O(n)/ Breaks a 'Text' up into a list of single-character 'Text' values.
-- Implementations ordered from slower to faster:
letters :: DTL.Text -> [DTL.Text]
letters text = DTL.chunksOf 1 text
letters' :: DTL.Text -> [DTL.Text]
letters' text = DTL.splitOn "#" $ DTL.intersperse '#' text
letters'' :: DTL.Text -> [DTL.Text]
letters'' = DTL.foldr (\ch xs -> DTL.singleton ch : xs) []
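For reference (not part of this commit), all three variants should agree on a small input:
-- Hypothetical example, for illustration only.
lettersExample :: [DTL.Text]
lettersExample = letters (DTL.pack "abc")   -- expected: ["a","b","c"]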
{-|
Module : Gargantext.Text.Metrics
Description : Short description
Copyright : (c) Some Guy, 2013
Someone Else, 2014
License : GPL-3
Maintainer : sample@email.com
Module : Gargantext.Text.Metrics
Description : Metrics on texts (mainly re-exports of @Data.Text.Metrics@).
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
......@@ -13,6 +12,7 @@ Mainly reexport functions in @Data.Text.Metrics@
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics (levenshtein
, levenshteinNorm
, damerauLevenshtein
......@@ -22,12 +22,12 @@ module Gargantext.Text.Metrics (levenshtein
, hamming
) where
import Gargantext.Prelude
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
{- * Example de titre
-}
......
{-|
Module : Gargantext.Text.FrequentItemSet
Module : Gargantext.Text.Metrics.FrequentItemSet
Description : Ngrams tools
Copyright : (c) CNRS, 2018
License : AGPL + CECILL v3
......@@ -13,7 +13,7 @@ Domain Specific Language to manage Frequent Item Set (FIS)
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.FrequentItemSet
module Gargantext.Text.Metrics.FrequentItemSet
( Fis, Size
, occ, cooc
, all, between
......
{-|
Module : Gargantext.Text.Hetero
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
module Gargantext.Text.Hetero where
import GHC.Real as R
......
{-|
Module : Gargantext.Text.Occurrences
Module : Gargantext.Text.Metrics.Occurrences
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -14,7 +14,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Occurrences where
module Gargantext.Text.Metrics.Occurrences where
import Gargantext.Prelude
......
{-|
Module : Gargantext.Text.TFICF
Module : Gargantext.Text.Metrics.TFICF
Description : TFICF Ngrams tools
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
......@@ -15,7 +15,7 @@ Definition of TFICF
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.TFICF where
module Gargantext.Text.Metrics.TFICF where
import GHC.Generics (Generic)
......
{-|
Module : Gargantext.Text.Ngrams
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams
where
{-|
Module : Gargantext.Text.List
Module : Gargantext.Text.Ngrams.Lists
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -13,19 +13,20 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.List where
import Data.Maybe
import Data.List (filter)
import Gargantext.Text
import Gargantext.Prelude
graph :: [Ngrams] -> [Ngrams]
graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
candidates :: [Ngrams] -> [Ngrams]
candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs
stop :: [Ngrams] -> [Ngrams]
stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
module Gargantext.Text.Ngrams.Lists
where
--import Data.Maybe
--import Data.List (filter)
--import Gargantext.Text
--import Gargantext.Prelude
--
--graph :: [Ngrams] -> [Ngrams]
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
--
--candidates :: [Ngrams] -> [Ngrams]
--candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs
--
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
{-|
Module : Gargantext.Text.Ngrams.PosTagging
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Ngrams.PosTagging
where
{-|
Module : Gargantext.Text.CoreNLP
Module : Gargantext.Text.Ngrams.PosTagging.CoreNLP
Description : CoreNLP module
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
......@@ -16,18 +16,20 @@ Portability : POSIX
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.Text.CoreNLP where
module Gargantext.Text.Ngrams.PosTagging.CoreNLP
where
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import GHC.Show (Show(..))
import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Aeson.TH (deriveJSON)
import Data.Monoid ((<>))
import Data.Text (Text)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import Network.HTTP.Simple
......@@ -80,7 +82,7 @@ $(deriveJSON (unPrefix "_") ''Sentences)
--
corenlp :: Language -> Text -> IO Sentences
corenlp :: Lang -> Text -> IO Sentences
corenlp lang txt = do
let properties = case lang of
EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
......@@ -99,7 +101,7 @@ corenlp lang txt = do
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith :: (Token -> t) -> Lang -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
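A hedged convenience wrapper (not part of this commit), reusing the _tokenNer accessor shown in the example above and assuming it returns the NER tag as Text:
-- Hypothetical helper, for illustration only.
namedEntities :: Lang -> Text -> IO [[(Text, Text)]]
namedEntities = tokenWith _tokenNer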
......
{-|
Module : Gargantext.Text.Lang.En
Module : Gargantext.Text.Ngrams.PosTagging.Lang.En
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -14,7 +14,8 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Lang.En (selectNgrams, groupNgrams, textTest) where
module Gargantext.Text.Ngrams.PosTagging.Lang.En (selectNgrams, groupNgrams, textTest)
where
import Gargantext.Prelude
import Data.Text (Text)
......
{-|
Module : Gargantext.Text.Lang.Fr
Module : Gargantext.Text.Ngrams.PosTagging.Lang.Fr
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -14,7 +14,7 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Lang.Fr (selectNgrams, groupNgrams, textTest)
module Gargantext.Text.Ngrams.PosTagging.Lang.Fr (selectNgrams, groupNgrams, textTest)
where
import Gargantext.Prelude
......
{-|
Module : Gargantext.Text.Parser
Module : Gargantext.Text.Ngrams.PosTagging.Parser
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -15,15 +15,16 @@ commentary with @some markup@.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Text.Parser where
module Gargantext.Text.Ngrams.PosTagging.Parser
where
import Gargantext.Prelude
import Gargantext.Text.CoreNLP
import Data.Text hiding (map)
import Gargantext.Types.Main (Language(..))
import qualified Gargantext.Text.Lang.En as En
import qualified Gargantext.Text.Lang.Fr as Fr
import Gargantext.Core (Lang(..))
import Gargantext.Text.Ngrams.PosTagging.CoreNLP
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.En as En
import qualified Gargantext.Text.Ngrams.PosTagging.Lang.Fr as Fr
type SNgrams = (Text, Text, Text)
......@@ -45,11 +46,11 @@ type SNgrams = (Text, Text, Text)
-- TODO for scientific papers: add measures
-- TODO add the p score regex
extractNgrams :: Language -> Text -> IO [[SNgrams]]
extractNgrams :: Lang -> Text -> IO [[SNgrams]]
extractNgrams lang s = map (groupNgrams lang) <$> extractNgrams' lang s
extractNgrams' :: Language -> Text -> IO [[SNgrams]]
extractNgrams' :: Lang -> Text -> IO [[SNgrams]]
extractNgrams' lang t = map (map token2text)
<$> map _sentenceTokens
<$> _sentences
......@@ -58,13 +59,13 @@ extractNgrams' lang t = map (map token2text)
-- | This function selects ngrams according to grammars specific
-- to each language.
-- For instance, the English POS tag JJ denotes an adjective (ADJ in French).
selectNgrams :: Language -> [SNgrams] -> [SNgrams]
selectNgrams :: Lang -> [SNgrams] -> [SNgrams]
selectNgrams EN = En.selectNgrams
selectNgrams FR = Fr.selectNgrams
-- | This function analyses and groups (or not) ngrams according to
-- grammars specific to each language.
groupNgrams :: Language -> [SNgrams] -> [SNgrams]
groupNgrams :: Lang -> [SNgrams] -> [SNgrams]
groupNgrams EN = En.groupNgrams
groupNgrams FR = Fr.groupNgrams
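Putting the pieces together, a hedged end-to-end sketch (not part of this commit) of extraction (which already groups) followed by selection; it assumes a CoreNLP server is reachable, as corenlp requires:
-- Hypothetical pipeline, for illustration only.
extractAndSelect :: Lang -> Text -> IO [[SNgrams]]
extractAndSelect lang txt = map (selectNgrams lang) <$> extractNgrams lang txt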
{-|
Module : Gargantext.Text.Ngrams.Stem
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
module Gargantext.Text.Ngrams.Stem
where
import Data.Text (Text)
import qualified Data.Text as DT
import qualified NLP.Stemmer as N
import Gargantext.Core (Lang(..))
-- (stem, Stemmer(..))
--import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
--import Language.Aspell.Options (ACOption(..))
stem :: Lang -> Text -> Text
stem lang = DT.pack . N.stem lang' . DT.unpack
where
lang' = case lang of
EN -> N.English
FR -> N.French
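For orientation (not part of the diff), a small hedged example; the expected output assumes the Snowball English stemmer reduces a regular plural to its root:
-- Hypothetical example, for illustration only.
stemExample :: Text
stemExample = stem EN (DT.pack "dogs")   -- expected: "dog"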
{-|
Module : Gargantext.
Module : Gargantext.Text.Ngrams.Stem.En
Description : Porter Algorithm Implementation purely Haskell
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
......@@ -17,7 +17,7 @@ Adapted from:
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Stem.En
module Gargantext.Text.Ngrams.Stem.En
where
import Control.Monad
......
{-|
Module : Gargantext.Text.Token.Text
Module : Gargantext.Text.Ngrams.Token.Text
Description :
Copyright : (c) CNRS, 2017-Present
Copyright : (c) Grzegorz Chrupała (original author), then CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
......@@ -13,7 +13,7 @@ Inspired from https://bitbucket.org/gchrupala/lingo/overview
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Token.Text
module Gargantext.Text.Ngrams.Token.Text
( EitherList(..)
, Tokenizer
, tokenize
......@@ -28,7 +28,7 @@ module Gargantext.Text.Token.Text
, contractions
, negatives
)
where
where
import qualified Data.Char as Char
import Data.Maybe
......
{-|
Module : Gargantext.Parsers
Module : Gargantext.Text.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
......
{-|
Module : Gargantext.Text.TextMining
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.TextMining where
import Gargantext.Prelude
import Data.Ord(Ordering(LT,GT), compare)
import Data.Text (pack)
import Data.Bool (otherwise)
import Data.Map (empty, Map, insertWith, toList)
import Data.List (foldl, foldl')
import qualified Data.List as L
sortGT :: (Ord a, Ord b) => (a, b) -> (a, b) -> Ordering
sortGT (a1, b1) (a2, b2)
| a1 < a2 = GT
| a1 > a2 = LT
| a1 == a2 = compare b1 b2
sortGT (_, _) (_, _) = panic (pack "What is this case ?")
--histogram :: Ord a => [a] -> [(a, Int)]
--histogram = map (head &&& length) Prelude.. group Prelude.. sort Prelude.. words
--histogram = sortGT Prelude.. $ map (head &&& length) Prelude.. group Prelude.. sort Prelude.. words
countElem :: (Ord k) => Data.Map.Map k Int -> k -> Data.Map.Map k Int
countElem m e = Data.Map.insertWith (\n o -> n + o) e 1 m
freqList :: (Ord k) => [k] -> Data.Map.Map k Int
freqList = foldl countElem Data.Map.empty
getMaxFromMap :: Ord a => Map a1 a -> [a1]
getMaxFromMap m = go [] Nothing (toList m)
where
go ks _ [] = ks
go ks Nothing ((k,v):rest) = go (k:ks) (Just v) rest
go ks (Just u) ((k,v):rest)
| v < u = go ks (Just u) rest
| v > u = go [k] (Just v) rest
| otherwise = go (k:ks) (Just v) rest
merge :: [a] -> [a] -> [a]
merge [] ys = ys
merge (x:xs) ys = x:merge ys xs
average :: [Double] -> Double
average x = L.sum x / L.genericLength x
average' :: [Int] -> Double
average' x = (L.sum y) / (L.genericLength y) where
y = L.map fromIntegral x
countYear :: [Integer] -> Map Integer Integer
countYear [] = empty
countYear (x:xs) = insertWith (+) x 1 (countYear xs)
countYear' :: [Integer] -> Map Integer Integer
countYear' (xs) = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-- Word2Vec
-- Word Vector in a Field
module Data.Gargantext.Utils.Chronos where
import Data.Maybe (fromJust, isJust, isNothing)
import qualified Data.Time as DT
import qualified Data.UTC as DU
import Data.Time
import Data.Time.Clock.POSIX
import Text.Regex
parseDate :: String -> Maybe [String]
parseDate d = matchRegex (mkRegex "(.*)/(.*)/(.*)") d
getDate' :: Maybe [String] -> (Integer, Int, Int)
getDate' d
| isJust d == True = toGregorian $ fromGregorian (read year) (read month) (read day)
| otherwise = toGregorian $ fromGregorian 2015 1 1
where
Just [day, month, year] = d
getDate :: String -> (Integer, Int, Int)
getDate = getDate' . parseDate
--getDateDay :: Maybe [String] -> Day
--getDateDay d = fromGregorian (read year) (read month) (read day)
-- where Just [day, month, year] = matchRegex (mkRegex "(.*)/(.*)/(.*)") d
getDateDay' :: Maybe [String] -> Day
getDateDay' d
| isJust d == True = fromGregorian (read year) (read month) (read day)
| otherwise = fromGregorian 2015 1 1
where Just [day, month, year] = d
getDateDay :: String -> Day
getDateDay = getDateDay' . parseDate
getDateUTC :: String -> String
getDateUTC d = show $ DT.UTCTime (getDateDay d) (DT.timeOfDayToTime $ DT.TimeOfDay 0 0 0)
getYear :: String -> String
getYear date = s where
(y, m, d) = getDate date
s = show y
getMonth :: String -> String
getMonth date = s where
(y, m, d) = getDate date
s = show m
getDay :: String -> String
getDay date = s where
(y, m, d) = getDate date
s = show d
--for Dates exported via xls2csv tool
type MT = Maybe (DU.Local DU.DateTime)
type MS = Maybe String
--getDate'' :: String -> String
--getDate'' gd = d where
-- start = "1900-01-01T00:00:00Z"
-- da = (DU.parseRfc3339 start :: MT) >>= DU.addDays ( (read gd :: Integer) -2) >>= DU.renderRfc3339 :: MS
-- d = fromJust da
--
--getDate''' :: String -> String
--getDate''' gd = d where
-- start = "1900-01-01T00:00:00Z"
-- da = (DU.parseRfc3339 start :: MT) >>= DU.addDays ( (read gd :: Integer) -2) >>= DU.renderIso8601CalendarDate :: MS
-- d = fromJust da
--
--date2greg :: String ->
date2greg date = (y, m, d) where
(y, m, d) = DT.toGregorian $ DT.addDays ((read date :: Integer) -2) $ DT.utctDay (read "1900-01-01 00:00:00" :: DT.UTCTime)
getYear' :: String -> String
getYear' date = s where
(y, m, d) = date2greg date
s = show y
getMonth' :: String -> String
getMonth' date = s where
(y, m, d) = date2greg date
s = show m
getDay' :: String -> String
getDay' date = s where
(y, m, d) = date2greg date
s = show d
-- DEFINITIONS as SPECS
-- (Engineering axioms for Gargantext)
------------------------------------------------------------------------
-- From file to corpus
------------------------------------------------------------------------
-- > A Corpus is a list of Documents
type Corpus = [Document]
-- > A Document should have a date, some text and a maybe a language.
-- > Remarks :
-- > If no date then force one ?
-- > Analyze either text or numbers
-- > only one language per document
data Document = Document { date :: UTCTime
, uce :: Map Text (Either (Maybe Text) (Maybe Double))
, lang :: Maybe Language
}
parseFiles :: Maybe ParserType -> [File] -> Corpus
parseFiles = undefined
-- This function exists already (in Python)
parseFile' :: ParserType -> File -> Maybe [Document]
parseFile' = undefined
-- This function does not exist yet
parseFile :: Maybe ParserType -> File -> Maybe [Document]
parseFile parserType file = documents
where
documents = case parserType of
Nothing -> case guessParserType file of
Nothing -> askUser "Answer to the question with link to $doc"
Just parserType' -> parseFile (Just parserType') file
Just parserType'' -> case parserType'' of
UnsupportedYet -> askUser "Not supported yet, which priority ?"
otherwise -> parseFile' parserType'' file
data ParserType = RIS | ISI | XML | CSV | Europresse | Book | UnsupportedYet
guessParserType :: File -> Maybe ParserType
guessParserType = undefined
------------------------------------------------------------------------
-- What kind of interactions with our users ?
------------------------------------------------------------------------
-- Question is Text only
type Question = Text
-- Possible Answers:
data Answer = ClosedAnswer | NumAnswer | OpenAnswer
-- Definitions of the Answers
type ClosedAnswer = Bool
type OpenAnswer = Text
type NumAnswer = Int
-- A form is a mapping from a question to, possibly, an answer.
-- An empty form has Nothing in its (Maybe Answer) field.
-- An answered question holds the value (Just Answer).
type Formular = Map Question (Maybe Answer)
askUser :: Question -> ClosedAnswer
askUser = undefined
data Advice = BugReport | WishList
askUser' :: Question -> Advice
askUser' question = case askUser question of
True -> BugReport
False -> WishList
------------------------------------------------------------------------
-- Specs for Lang Detection
------------------------------------------------------------------------
data Language = English | French
tagDoc :: Document -> Ngrams
tagDoc doc = ngrams
where
ngrams = case lang doc of
Nothing -> case guessLang doc of
Nothing -> tag
------------------------------------------------------------------------
-- Specs for ngrams Worflow
------------------------------------------------------------------------
module Data.Gargantext.Utils.SaveGetHash where
import System.FilePath (addExtension, joinPath)
import Data.ByteString.Lazy.Internal (ByteString)
import Data.List (elem, intersperse, insert)
import Data.List.Extra (chunksOf)
import Data.Digest.Pure.MD5 (md5)
import System.Directory (getDirectoryContents, createDirectory, findFile, createDirectoryIfMissing)
import Control.Monad (foldM)
import Data.List (splitAt)
import Data.ByteString.Lazy.Internal (packChars)
import qualified Data.ByteString.Lazy as BL
import Codec.Compression.Zlib (compress, decompress)
data Config = Config {
root :: String
, chunkSize :: Int
, compression :: Bool
} deriving Show
conf = Config {
root="/tmp/robot"
, chunkSize=2
, compression = True
}
chunkUrl :: Int -> ByteString -> [[Char]]
chunkUrl a url = chunksOf a $ show $ md5 url
-- replace it with createDirectoryIfMissing
existOrCreate :: [[Char]] -> FilePath -> IO [[Char]]
existOrCreate path_ dir = do
let path = joinPath path_
let returnPath = return $ path_ ++ [dir]
is <- elem dir <$> getDirectoryContents path -- ?
case is of
True -> do
returnPath
False -> do
createDirectory $ path ++ "/" ++ dir
returnPath
doPath :: [[Char]] -> [FilePath] -> IO [[Char]]
doPath root path = foldM (\x y -> existOrCreate x y) root path
splitAt' :: Int -> Int -> [Char] -> ([Char], [Char], [Char])
splitAt' i1 i2 x = (a, b, c) where
(a, a') = splitAt i1 x
(b, c) = splitAt i2 a'
-- do not overwrite the file if it already exists
-- split the url cleanly
saveFile :: ByteString -> String -> IO String
saveFile url'' file = do
let url' = chunkUrl (chunkSize conf) url''
let url = init url'
-- add extension according to the filetype
let filename = Prelude.foldl addExtension (last url') ["html", "zlib"]
doPath [(root conf)] url
let path = (root conf) ++ "/" ++ joinPath url ++ "/" ++ filename
--case (findFile ["/tmp/sdfs"] "file.html.zlib"
-- Nothing -> create
-- _ -> change name
case (compression conf) of
True -> BL.writeFile path (compress $ packChars file)
False -> writeFile path file
return path
getFile :: FilePath -> IO ByteString
getFile path = do
case (compression conf) of
True -> decompress <$> BL.readFile path
False -> packChars <$> Prelude.readFile path
-- resources
-- add Resource
-- levenshtein distance...