diff --git a/package.yaml b/package.yaml index e04182302a4a46bbdc08dc27a0fc2a79bef2c42f..a8088a98382f921f1cd4aa7a54299d9e9d006a8e 100644 --- a/package.yaml +++ b/package.yaml @@ -115,6 +115,7 @@ library: - servant-static-th - split - stemmer + - string-conversions - swagger2 - tagsoup - text-metrics diff --git a/src/Gargantext/Ngrams.hs b/src/Gargantext/Ngrams.hs index 3104ba3bf3cfb582b0e5d5c65b7dc009337e3a18..c25843192d2279c7a887873d263934365917f1f2 100644 --- a/src/Gargantext/Ngrams.hs +++ b/src/Gargantext/Ngrams.hs @@ -22,7 +22,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters , module Gargantext.Ngrams.TextMining , module Gargantext.Ngrams.Metrics , Ngrams(..), ngrams, occ, sumOcc, text2fis - , NgramsList(..) + , ListName(..), equivNgrams, isGram --, module Gargantext.Ngrams.Words ) where @@ -40,7 +40,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS ----------------------------------------------------------------- import Data.List (sort) -import Data.Char (Char, isAlpha, isSpace) +import Data.Char (Char, isAlphaNum, isSpace) import Data.Text (Text, words, filter, toLower) import Data.Map.Strict (Map , empty @@ -58,32 +58,31 @@ import Gargantext.Prelude hiding (filter) --import Language.Aspell.Options (ACOption(..)) -data NgramsList = Stop | Candidate | Graph +data ListName = Stop | Candidate | Graph deriving (Show, Eq) -data Ngrams = Ngrams { _ngramsNgrams :: [Text] - , _ngramsStem :: [Text] - , _ngramsList :: Maybe NgramsList +data Ngrams = Ngrams { _ngramsNgrams :: [Text] + , _ngramsStem :: [Text] + , _ngramsListName :: Maybe ListName } deriving (Show) -instance Eq Ngrams where - Ngrams n1 s1 _ == Ngrams n2 s2 _ = (sort n1) == (sort n2) || (sort s1) == (sort s2) +equivNgrams :: Ngrams -> Ngrams -> Bool +equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _) + = (sort n1) == (sort n2) || (sort s1) == (sort s2) type Occ = Int --type Index = Int +-- Data Ngrams = Monograms | MultiGrams + ngrams :: Text -> [Text] -ngrams xs = monograms $ toLower $ filter isChar xs +ngrams xs = monograms $ toLower $ filter isGram xs monograms :: Text -> [Text] monograms = words --- TODO --- 12-b -isChar :: Char -> Bool -isChar '-' = True -isChar '/' = True -isChar c = isAlpha c || isSpace c +isGram :: Char -> Bool +isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/'] -- | Compute the occurrences (occ) occ :: Ord a => [a] -> Map a Occ @@ -91,7 +90,7 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs -- TODO add groups and filter stops sumOcc :: Ord a => [Map a Occ] -> Map a Occ -sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs +sumOcc xs = foldl' (unionWith (+)) empty xs --noApax :: Ord a => Map a Occ -> Map a Occ --noApax m = M.filter (>1) m diff --git a/src/Gargantext/Ngrams/CoreNLP.hs b/src/Gargantext/Ngrams/CoreNLP.hs index 051a739f59a823e1cac2b5d2b960eeeb8f6b8c77..4ce14fc4de16a71cf223b6a078d15545858b37b5 100644 --- a/src/Gargantext/Ngrams/CoreNLP.hs +++ b/src/Gargantext/Ngrams/CoreNLP.hs @@ -28,8 +28,6 @@ import Gargantext.Prelude import Gargantext.Utils.Prefix (unPrefix) import Data.Text (Text) -import qualified Data.ByteString.Char8 as S8 -import qualified Data.Yaml as Yaml import Network.HTTP.Simple @@ -82,17 +80,6 @@ $(deriveJSON (unPrefix "_") ''Sentences) -- -corenlpPretty :: Text -> IO () -corenlpPretty txt = do - url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" - let request = setRequestBodyJSON txt url - response <- httpJSON request - --- putStrLn $ "The status code was: " ++ --- show (getResponseStatusCode response) --- print $ getResponseHeader "Content-Type" response - S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences) - corenlp :: Language -> Text -> IO Sentences corenlp lang txt = do let properties = case lang of @@ -100,7 +87,7 @@ corenlp lang txt = do -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}" url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties - let request = setRequestBodyJSON txt url + let request = setRequestBodyLBS (cs txt) url response <- httpJSON request pure (getResponseBody response :: Sentences) diff --git a/src/Gargantext/Ngrams/FrequentItemSet.hs b/src/Gargantext/Ngrams/FrequentItemSet.hs index ab220d504f3e49a6b545f70acbf273ac4093178a..6e1918749fce0d8a1b2ddec3276607e4377f2ade 100644 --- a/src/Gargantext/Ngrams/FrequentItemSet.hs +++ b/src/Gargantext/Ngrams/FrequentItemSet.hs @@ -28,6 +28,7 @@ import Gargantext.Prelude type Size = Either Int (Int, Int) +--data Size = Point | Segment ------------------------------------------------------------------------ -- | Occurrence is Frequent Item Set of size 1 diff --git a/src/Gargantext/Ngrams/List.hs b/src/Gargantext/Ngrams/List.hs new file mode 100644 index 0000000000000000000000000000000000000000..7ea2cfba689eddbcaefb9b091c1a6960ba11a13b --- /dev/null +++ b/src/Gargantext/Ngrams/List.hs @@ -0,0 +1,16 @@ +module Gargantext.Ngrams.List where + +import Data.Maybe +import Data.List (filter) +import Gargantext.Ngrams +import Gargantext.Prelude + +graph :: [Ngrams] -> [Ngrams] +graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs + +candidates :: [Ngrams] -> [Ngrams] +candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs + +stop :: [Ngrams] -> [Ngrams] +stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs + diff --git a/src/Gargantext/Parsers.hs b/src/Gargantext/Parsers.hs index 0904bf18ac6a4abae3345c748a67bad8a036b751..29b50199ae8d4503c561e46ca8a790053c3ea503 100644 --- a/src/Gargantext/Parsers.hs +++ b/src/Gargantext/Parsers.hs @@ -52,6 +52,14 @@ import Gargantext.Parsers.WOS (wosParser) --import Gargantext.Prelude (pm) --import Gargantext.Types.Main (ErrorMessage(), Corpus) +-- FIXME +--type Field = Text +type ParseError = String +-- +--data Corpus = Corpus { _corpusErrors :: [ParseError] +-- , _corpusMap :: Map FilePath (Map Field Text) +-- } + -- | According to the format of Input file, -- different parser are available. @@ -63,7 +71,6 @@ data FileFormat = WOS -- Implemented (ISI Format) -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml -- TODO: to debug maybe add the filepath in error message -type ParseError = String parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]]) diff --git a/src/Gargantext/Prelude.hs b/src/Gargantext/Prelude.hs index 3123758220159fc1649387c7c32dd23ba2057fcf..ab7d02954fa25a0359d7bec6e2df2c2f43f601c9 100644 --- a/src/Gargantext/Prelude.hs +++ b/src/Gargantext/Prelude.hs @@ -12,6 +12,7 @@ module Gargantext.Prelude , headMay , module Text.Show , module Text.Read + , cs ) where @@ -30,6 +31,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer , Eq, (==), (>=), (<=), (<>) , (&&), (||), not , fst, snd, toS + , elem ) -- TODO import functions optimized in Utils.Count @@ -43,6 +45,8 @@ import qualified Data.Vector as V import Safe (headMay) import Text.Show (Show(), show) import Text.Read (Read()) +import Data.String.Conversions (cs) + --pf :: (a -> Bool) -> [a] -> [a] --pf = filter diff --git a/src/Gargantext/Utils/Prefix.hs b/src/Gargantext/Utils/Prefix.hs index c27eb7fad3f1110570ef7c8fe1b856f11cb7dd48..cfa48b0875173a47780e8b15ef768384d8a11602 100644 --- a/src/Gargantext/Utils/Prefix.hs +++ b/src/Gargantext/Utils/Prefix.hs @@ -21,8 +21,8 @@ unPrefix prefix = defaultOptions -- | Lower case leading character unCapitalize :: String -> String unCapitalize [] = [] ---unCapitalize (c:cs) = toLower c : cs -unCapitalize cs = map toLower cs +unCapitalize (c:cs) = toLower c : cs +--unCapitalize cs = map toLower cs -- | Remove given prefix dropPrefix :: String -> String -> String