Commit 787cd46e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Adding WOS parser, NLP functions, Rights management guidelines (draft).

parent a72e061f
......@@ -18,17 +18,21 @@ library
build-depends: base >= 4.7 && < 5
, aeson
, attoparsec
, async
, base16-bytestring
, bytestring
, case-insensitive
, containers
, contravariant
, conduit
, conduit-extra
, directory
, extra
, filepath
, http-conduit
, lens
, opaleye
, path
, postgresql-simple
, pretty
, product-profunctors
......@@ -53,11 +57,15 @@ library
, warp
, yaml
, zlib
, zip
, path-io
--, LibZip
-- , stemmer
--, utc
exposed-modules: Data.Gargantext
, Data.Gargantext.Analysis
, Data.Gargantext.DSL
, Data.Gargantext.Ngrams.Occurrences
, Data.Gargantext.NLP
, Data.Gargantext.NLP.CoreNLP
, Data.Gargantext.Database
......@@ -71,7 +79,7 @@ library
, Data.Gargantext.Database.Private
, Data.Gargantext.Database.User
, Data.Gargantext.Parsers
, Data.Gargantext.Parsers.Occurrences
, Data.Gargantext.Parsers.WOS
, Data.Gargantext.Prelude
, Data.Gargantext.Server
, Data.Gargantext.Types
......
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE RankNTypes #-}
import Control.Lens (Getting, foldMapOf)
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
instance Functor (Fold i) where
fmap k (Fold tally summarize) = Fold tally (k . summarize)
instance Applicative (Fold i) where
pure o = Fold (\_ -> ()) (\_ -> o)
Fold tallyF summarizeF <*> Fold tallyX summarizeX = Fold tally summarize
where
tally i = (tallyF i, tallyX i)
summarize (nF, nX) = summarizeF nF (summarizeX nX)
focus :: (forall m . Monoid m => Getting m b a) -> Fold a o -> Fold b o
focus lens (Fold tally summarize) = Fold (foldMapOf lens tally) summarize
{-# LANGUAGE ExistentialQuantification #-}
-- | Thanks to Gabriel Gonzales and his beautiful folds
import Data.Monoid
import Prelude hiding (head, last, all, any, sum, product, length)
import qualified Data.Foldable
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
fold :: Fold i o -> [i] -> o
fold (Fold tally summarize) is = summarize (reduce (map tally is))
where
reduce = Data.Foldable.foldl' (<>) mempty
--
head :: Fold a (Maybe a)
head = Fold (First . Just) getFirst
last :: Fold a (Maybe a)
last = Fold (Last . Just) getLast
--
all :: (a -> Bool) -> Fold a Bool
all predicate = Fold (All . predicate) getAll
any :: (a -> Bool) -> Fold a Bool
any predicate = Fold (Any . predicate) getAny
--
sum :: Num n => Fold n n
sum = Fold Sum getSum
product :: Num n => Fold n n
product = Fold Product getProduct
length :: Num n => Fold i n
length = Fold (\_ -> Sum 1) getSum
--
{-# LANGUAGE BangPatterns #-}
data Average a = Average { numerator :: !a, denominator :: !Int }
instance Num a => Monoid (Average a) where
mempty = Average 0 0
mappend (Average xL nL) (Average xR nR) = Average (xL + xR) (nL + nR)
average :: Fractional a => Fold a a
average = Fold tally summarize
where
tally x = Average x 1
summarize (Average numerator denominator) =
numerator / fromIntegral denominator
......@@ -3,6 +3,8 @@ module Data.Gargantext.Analysis where
-- import qualified Data.Text.Lazy as DTL
import Data.Text
import Opaleye (Column, PGInt4)
--import Data.Map as DM
--import Data.Vector as DV
-- | Simple function to count Occurrences in a context of text.
occOfDocument :: Column PGInt4 -> Text -> IO Int
......@@ -15,3 +17,16 @@ occOfDocument = undefined
-- Left str -> error $ "[ERRROR] at file/function/line" ++ str
-- Right xs -> xs
-- pure (sum result)
data Occurrences a b = Map a b
garg
garg CorpusWith
makeAListOf Corpus ListType
......@@ -52,7 +52,7 @@ instance FromJSON Sentences
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
let url = "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
let request = setRequestBodyJSON txt url
response <- httpJSON request
......@@ -63,7 +63,7 @@ corenlpPretty txt = do
corenlp :: String -> IO Sentences
corenlp txt = do
let url = "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
let request = setRequestBodyJSON txt url
response <- httpJSON request
pure (getResponseBody response :: Sentences)
......
module Data.Gargantext.Network where
import Data.Gargantext.Prelude
import Data.Map as DM
import Data.Vector as DV
type Measure a b c = DM.Map a (DM.Map b c)
-- UTCTime Paire Granularity [Candle]
-- GargVector Paire Granularity [Candle]
type GargVector a b c = DM.Map a ( DM.Map b c)
-- GargMatrix Granularity (Paire Paire) [Candle]
type GargMatrix a b c d = DM.Map a (FolioVector b c d)
-- GargMatrix Granularity (Paire Paire) [Candle]
type GargTensor a b c d e = DM.Map a (FolioMatrix b c d e)
--data PortGarg = PortGarg { _portFolioParameters :: Parameters
-- , _portGargData :: Garg
--}
toMeasure :: Granularity -> Paire -> [Candle]
-> Measure Granularity Paire Candle
toMeasure g c1 c2 cs = DM.fromList [(g,
{-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.Occurrences where
module Data.Gargantext.Ngrams.Occurrences where
import Data.Attoparsec.Text
import Data.Text (Text)
......
module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.Occurrences)
module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.WOS)
where
import Data.Gargantext.Parsers.Occurrences
import Data.Gargantext.Parsers.WOS
module Data.Gargantext.Parsers.Utils where
-- use Duckling here
parseDate = undefined
module Data.Gargantext.Parsers.WOS where
import Prelude hiding (takeWhile, take, concat, readFile)
import qualified Data.List as DL
import Data.Map as DM
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar, char8, endOfLine, isDigit_w8, isAlpha_ascii, isEndOfLine)
import Data.ByteString (ByteString, unpack, pack, concat, readFile)
import Data.Either.Extra(Either(..))
import Control.Applicative
import Control.Monad (join)
-- To be removed just for Tests
--
-- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
--import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
import Control.Concurrent.Async as CCA (mapConcurrently)
import System.Environment
import Codec.Archive.Zip
import Path (parseAbsFile)
import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) )
zipFiles :: FilePath -> IO [ByteString]
zipFiles fp = do
path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
parseFile :: ParserType -> ByteString -> IO Int
parseFile p x = case runParser p x of
Left e -> pure 1
Right r -> pure $ length r
testWos :: FilePath -> IO [Int]
testWos fp = join $ mapConcurrently (parseFile WOS) <$> zipFiles fp
-- type Parser a = a -> Text -> [Document]
data ParserType = WOS | CSV
wosParser :: Parser [Maybe [ByteString]]
wosParser = do
-- TODO Warning if version /= 1.0
_ <- manyTill anyChar (string "\nVR 1.0")
ns <- many1 wosNotice <* "\nEF"
return ns
startNotice :: Parser ByteString
startNotice = "\nPT " *> takeTill isEndOfLine
wosNotice :: Parser (Maybe [ByteString])
wosNotice = do
n <- startNotice *> wosFields <* manyTill anyChar (string "\nER\n")
return n
field' :: Parser (ByteString, [ByteString])
field' = do
f <- "\n" *> take 2 <* " "
a <- takeTill isEndOfLine
as <- try wosLines
let as' = case DL.length as > 0 of
True -> as
False -> []
return (f, [a] ++ as')
wosFields' :: Parser [(ByteString, [ByteString])]
wosFields' = many field'
wosFields :: Parser (Maybe [ByteString])
wosFields = do
-- a <- field "AU"
-- t <- field "TI"
-- s <- field "SO"
-- d <- field "DI" -- DOI
-- p <- field "PD"
-- b <- field "AB"
-- u <- field "UT"
ws <- many field'
return $ DL.lookup "UT" ws
-- return $ HyperdataDocument
-- Just "WOS"
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
--
wosLines :: Parser [ByteString]
wosLines = many line
where
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
runParser :: ParserType -> ByteString -> Either String [Maybe [ByteString]]
runParser p x = parseOnly parser x
where
parser = case p of
WOS -> wosParser
_ -> error "Not implemented yet"
-- isTokenChar :: Word8 -> Bool
-- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
-- http://chrisdone.com/posts/fast-haskell-c-parsing-xml
......@@ -2,6 +2,6 @@ flags: {}
extra-package-dbs: []
packages:
- .
- /home/alexandre/local/logiciels/haskell/servant/servant-multipart
extra-deps:
- servant-multipart-0.10.0.1
resolver: lts-9.2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment