Commit 3f8c2b19 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CODE/READ] with NP.

parent a1f70708
......@@ -115,6 +115,7 @@ library:
- servant-static-th
- split
- stemmer
- string-conversions
- swagger2
- tagsoup
- text-metrics
......
......@@ -22,7 +22,7 @@ module Gargantext.Ngrams ( module Gargantext.Ngrams.Letters
, module Gargantext.Ngrams.TextMining
, module Gargantext.Ngrams.Metrics
, Ngrams(..), ngrams, occ, sumOcc, text2fis
, NgramsList(..)
, ListName(..), equivNgrams, isGram
--, module Gargantext.Ngrams.Words
) where
......@@ -40,7 +40,7 @@ import qualified Gargantext.Ngrams.FrequentItemSet as FIS
-----------------------------------------------------------------
import Data.List (sort)
import Data.Char (Char, isAlpha, isSpace)
import Data.Char (Char, isAlphaNum, isSpace)
import Data.Text (Text, words, filter, toLower)
import Data.Map.Strict (Map
, empty
......@@ -58,32 +58,31 @@ import Gargantext.Prelude hiding (filter)
--import Language.Aspell.Options (ACOption(..))
data NgramsList = Stop | Candidate | Graph
data ListName = Stop | Candidate | Graph
deriving (Show, Eq)
data Ngrams = Ngrams { _ngramsNgrams :: [Text]
, _ngramsStem :: [Text]
, _ngramsList :: Maybe NgramsList
data Ngrams = Ngrams { _ngramsNgrams :: [Text]
, _ngramsStem :: [Text]
, _ngramsListName :: Maybe ListName
} deriving (Show)
instance Eq Ngrams where
Ngrams n1 s1 _ == Ngrams n2 s2 _ = (sort n1) == (sort n2) || (sort s1) == (sort s2)
equivNgrams :: Ngrams -> Ngrams -> Bool
equivNgrams (Ngrams n1 s1 _) (Ngrams n2 s2 _)
= (sort n1) == (sort n2) || (sort s1) == (sort s2)
type Occ = Int
--type Index = Int
-- Data Ngrams = Monograms | MultiGrams
ngrams :: Text -> [Text]
ngrams xs = monograms $ toLower $ filter isChar xs
ngrams xs = monograms $ toLower $ filter isGram xs
monograms :: Text -> [Text]
monograms = words
-- TODO
-- 12-b
isChar :: Char -> Bool
isChar '-' = True
isChar '/' = True
isChar c = isAlpha c || isSpace c
isGram :: Char -> Bool
isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/']
-- | Compute the occurrences (occ)
occ :: Ord a => [a] -> Map a Occ
......@@ -91,7 +90,7 @@ occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
-- TODO add groups and filter stops
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc xs = foldl' (\x y -> unionWith (+) x y) empty xs
sumOcc xs = foldl' (unionWith (+)) empty xs
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
......
......@@ -28,8 +28,6 @@ import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)
import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple
......@@ -82,17 +80,6 @@ $(deriveJSON (unPrefix "_") ''Sentences)
--
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
let request = setRequestBodyJSON txt url
response <- httpJSON request
-- putStrLn $ "The status code was: " ++
-- show (getResponseStatusCode response)
-- print $ getResponseHeader "Content-Type" response
S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
let properties = case lang of
......@@ -100,7 +87,7 @@ corenlp lang txt = do
-- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
let request = setRequestBodyJSON txt url
let request = setRequestBodyLBS (cs txt) url
response <- httpJSON request
pure (getResponseBody response :: Sentences)
......
......@@ -28,6 +28,7 @@ import Gargantext.Prelude
type Size = Either Int (Int, Int)
--data Size = Point | Segment
------------------------------------------------------------------------
-- | Occurrence is Frequent Item Set of size 1
......
module Gargantext.Ngrams.List where
import Data.Maybe
import Data.List (filter)
import Gargantext.Ngrams
import Gargantext.Prelude
graph :: [Ngrams] -> [Ngrams]
graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
candidates :: [Ngrams] -> [Ngrams]
candidates ngs = filter (\ng -> _ngramsListName ng == Just Candidate) ngs
stop :: [Ngrams] -> [Ngrams]
stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
......@@ -52,6 +52,14 @@ import Gargantext.Parsers.WOS (wosParser)
--import Gargantext.Prelude (pm)
--import Gargantext.Types.Main (ErrorMessage(), Corpus)
-- FIXME
--type Field = Text
type ParseError = String
--
--data Corpus = Corpus { _corpusErrors :: [ParseError]
-- , _corpusMap :: Map FilePath (Map Field Text)
-- }
-- | According to the format of Input file,
-- different parser are available.
......@@ -63,7 +71,6 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- TODO: to debug maybe add the filepath in error message
type ParseError = String
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
......
......@@ -12,6 +12,7 @@ module Gargantext.Prelude
, headMay
, module Text.Show
, module Text.Read
, cs
)
where
......@@ -30,6 +31,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, Eq, (==), (>=), (<=), (<>)
, (&&), (||), not
, fst, snd, toS
, elem
)
-- TODO import functions optimized in Utils.Count
......@@ -43,6 +45,8 @@ import qualified Data.Vector as V
import Safe (headMay)
import Text.Show (Show(), show)
import Text.Read (Read())
import Data.String.Conversions (cs)
--pf :: (a -> Bool) -> [a] -> [a]
--pf = filter
......
......@@ -21,8 +21,8 @@ unPrefix prefix = defaultOptions
-- | Lower case leading character
unCapitalize :: String -> String
unCapitalize [] = []
--unCapitalize (c:cs) = toLower c : cs
unCapitalize cs = map toLower cs
unCapitalize (c:cs) = toLower c : cs
--unCapitalize cs = map toLower cs
-- | Remove given prefix
dropPrefix :: String -> String -> String
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment