[CODE/READ] with NP.

3f8c2b19 · Alexandre Delanoë · a1f70708 · 3f8c2b19 · 3f8c2b19 · 3f8c2b19
Commit 3f8c2b19 authored Apr 18, 2018 by Alexandre Delanoë
8 changed files
--- a/package.yaml
+++ b/package.yaml
@@ -115,6 +115,7 @@ library:
  - servant-static-th
  - split
  - stemmer
+  - string-conversions
  - swagger2
  - tagsoup
  - text-metrics

--- a/src/Gargantext/Ngrams.hs
+++ b/src/Gargantext/Ngrams.hs
@@ -22,7 +22,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
                         , module Gargantext.Ngrams.TextMining
                         , module Gargantext.Ngrams.Metrics
                         , Ngrams(..), ngrams, occ, sumOcc, text2fis
-                         , NgramsList(..)
+                         , ListName(..), equivNgrams, isGram
                             --, module Gargantext.Ngrams.Words
                         ) where

@@ -40,7 +40,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
 -----------------------------------------------------------------

 import Data.List (sort)
-import Data.Char (Char, isAlpha, isSpace)
+import Data.Char (Char, isAlphaNum, isSpace)
 import Data.Text (Text, words, filter, toLower)
 import Data.Map.Strict  (Map
                        , empty
@@ -58,32 +58,31 @@ import Gargantext.Prelude hiding (filter)
 --import Language.Aspell.Options (ACOption(..))


-data NgramsList = Stop | Candidate | Graph
+data ListName = Stop | Candidate | Graph
  deriving (Show, Eq)

-data Ngrams = Ngrams { _ngramsNgrams :: [Text]
-                     , _ngramsStem   :: [Text]
-                     , _ngramsList   :: Maybe NgramsList
+data Ngrams = Ngrams { _ngramsNgrams   :: [Text]
+                     , _ngramsStem     :: [Text]
+                     , _ngramsListName :: Maybe ListName
                     } deriving (Show)

-instance Eq Ngrams where
-  Ngrams n1 s1 _ == Ngrams n2 s2 _ = (sort n1) == (sort n2) || (sort s1) == (sort s2)
+equivNgrams :: Ngrams -> Ngrams -> Bool
+equivNgrams  (Ngrams n1 s1 _) (Ngrams n2 s2 _)
+  = (sort n1) == (sort n2) || (sort s1) == (sort s2)

 type Occ     = Int
 --type Index   = Int

+-- Data Ngrams = Monograms | MultiGrams
+
 ngrams :: Text -> [Text]
-ngrams xs = monograms $ toLower $ filter isChar xs
+ngrams xs = monograms $ toLower $ filter isGram xs

 monograms :: Text -> [Text]
 monograms = words

-- TODO
-- 12-b
-isChar :: Char -> Bool
-isChar '-' = True
-isChar '/' = True
-isChar  c  = isAlpha c || isSpace c
+isGram :: Char -> Bool
+isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/']

 -- | Compute the occurrences (occ)
 occ :: Ord a => [a] -> Map a Occ
@@ -91,7 +90,7 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs

 -- TODO add groups and filter stops
 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
-sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs
+sumOcc xs = foldl' (unionWith (+)) empty xs

 --noApax :: Ord a => Map a Occ -> Map a Occ
 --noApax m = M.filter (>1) m

--- a/src/Gargantext/Ngrams/CoreNLP.hs
+++ b/src/Gargantext/Ngrams/CoreNLP.hs
@@ -28,8 +28,6 @@ import Gargantext.Prelude
 import Gargantext.Utils.Prefix (unPrefix)
 import Data.Text (Text)

-import qualified Data.ByteString.Char8 as S8
-import qualified Data.Yaml             as Yaml
 import           Network.HTTP.Simple


@@ -82,17 +80,6 @@ $(deriveJSON (unPrefix "_") ''Sentences)
 -- 


-corenlpPretty :: Text -> IO ()
-corenlpPretty txt = do
-    url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" 
-    let request = setRequestBodyJSON txt url
-    response <- httpJSON request
-
--    putStrLn $ "The status code was: " ++
--               show (getResponseStatusCode response)
--    print $ getResponseHeader "Content-Type" response
-    S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
-
 corenlp :: Language -> Text -> IO Sentences
 corenlp lang txt = do
    let properties = case lang of
@@ -100,7 +87,7 @@ corenlp lang txt = do
            -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
            FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
    url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
-    let request = setRequestBodyJSON txt url
+    let request = setRequestBodyLBS (cs txt) url
    response <- httpJSON request
    pure (getResponseBody response :: Sentences)


--- a/src/Gargantext/Ngrams/FrequentItemSet.hs
+++ b/src/Gargantext/Ngrams/FrequentItemSet.hs
@@ -28,6 +28,7 @@ import Gargantext.Prelude

 type Size = Either Int (Int, Int)

+--data Size = Point | Segment

 ------------------------------------------------------------------------
 -- | Occurrence is Frequent Item Set of size 1

--- a/src/Gargantext/Ngrams/List.hs
+++ b/src/Gargantext/Ngrams/List.hs
+module Gargantext.Ngrams.List where
+
+import Data.Maybe
+import Data.List (filter)
+import Gargantext.Ngrams
+import Gargantext.Prelude
+
+graph :: [Ngrams] -> [Ngrams]
+graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
+
+candidates :: [Ngrams] -> [Ngrams]
+candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs
+
+stop :: [Ngrams] -> [Ngrams]
+stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
+
--- a/src/Gargantext/Parsers.hs
+++ b/src/Gargantext/Parsers.hs
@@ -52,6 +52,14 @@ import Gargantext.Parsers.WOS (wosParser)
 --import Gargantext.Prelude (pm)
 --import Gargantext.Types.Main (ErrorMessage(), Corpus)

+-- FIXME
+--type Field = Text
+type ParseError = String
+--
+--data Corpus = Corpus { _corpusErrors :: [ParseError]
+--                     , _corpusMap    :: Map FilePath (Map Field Text)
+--                    }
+

 -- | According to the format of Input file,
 -- different parser are available.
@@ -63,7 +71,6 @@ data FileFormat = WOS        -- Implemented (ISI Format)
 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml

 -- TODO: to debug maybe add the filepath in error message
-type ParseError = String


 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])

--- a/src/Gargantext/Prelude.hs
+++ b/src/Gargantext/Prelude.hs
@@ -12,6 +12,7 @@ module Gargantext.Prelude
  , headMay
  , module Text.Show
  , module Text.Read
+  , cs
  )
  where

@@ -30,6 +31,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
                 , Eq, (==), (>=), (<=), (<>)
                 , (&&), (||), not
                 , fst, snd, toS
+                 , elem
                 )

 -- TODO import functions optimized in Utils.Count
@@ -43,6 +45,8 @@ import qualified Data.Vector as V
 import Safe (headMay)
 import Text.Show (Show(), show)
 import Text.Read (Read())
+import Data.String.Conversions (cs)
+
 --pf :: (a -> Bool) -> [a] -> [a]
 --pf = filter


--- a/src/Gargantext/Utils/Prefix.hs
+++ b/src/Gargantext/Utils/Prefix.hs
@@ -21,8 +21,8 @@ unPrefix prefix = defaultOptions
 -- | Lower case leading character
 unCapitalize :: String -> String
 unCapitalize [] = []
--unCapitalize (c:cs) = toLower c : cs
-unCapitalize cs = map toLower cs
+unCapitalize (c:cs) = toLower c : cs
+--unCapitalize cs = map toLower cs

 -- | Remove given prefix
 dropPrefix :: String -> String -> String