Commit 787cd46e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Adding WOS parser, NLP functions, Rights management guidelines (draft).

parent a72e061f
...@@ -18,17 +18,21 @@ library ...@@ -18,17 +18,21 @@ library
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, aeson , aeson
, attoparsec , attoparsec
, async
, base16-bytestring , base16-bytestring
, bytestring , bytestring
, case-insensitive , case-insensitive
, containers , containers
, contravariant , contravariant
, conduit
, conduit-extra
, directory , directory
, extra , extra
, filepath , filepath
, http-conduit , http-conduit
, lens , lens
, opaleye , opaleye
, path
, postgresql-simple , postgresql-simple
, pretty , pretty
, product-profunctors , product-profunctors
...@@ -53,11 +57,15 @@ library ...@@ -53,11 +57,15 @@ library
, warp , warp
, yaml , yaml
, zlib , zlib
, zip
, path-io
--, LibZip
-- , stemmer -- , stemmer
--, utc --, utc
exposed-modules: Data.Gargantext exposed-modules: Data.Gargantext
, Data.Gargantext.Analysis , Data.Gargantext.Analysis
, Data.Gargantext.DSL , Data.Gargantext.DSL
, Data.Gargantext.Ngrams.Occurrences
, Data.Gargantext.NLP , Data.Gargantext.NLP
, Data.Gargantext.NLP.CoreNLP , Data.Gargantext.NLP.CoreNLP
, Data.Gargantext.Database , Data.Gargantext.Database
...@@ -71,7 +79,7 @@ library ...@@ -71,7 +79,7 @@ library
, Data.Gargantext.Database.Private , Data.Gargantext.Database.Private
, Data.Gargantext.Database.User , Data.Gargantext.Database.User
, Data.Gargantext.Parsers , Data.Gargantext.Parsers
, Data.Gargantext.Parsers.Occurrences , Data.Gargantext.Parsers.WOS
, Data.Gargantext.Prelude , Data.Gargantext.Prelude
, Data.Gargantext.Server , Data.Gargantext.Server
, Data.Gargantext.Types , Data.Gargantext.Types
......
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE RankNTypes #-}
import Control.Lens (Getting, foldMapOf)
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
instance Functor (Fold i) where
fmap k (Fold tally summarize) = Fold tally (k . summarize)
instance Applicative (Fold i) where
pure o = Fold (\_ -> ()) (\_ -> o)
Fold tallyF summarizeF <*> Fold tallyX summarizeX = Fold tally summarize
where
tally i = (tallyF i, tallyX i)
summarize (nF, nX) = summarizeF nF (summarizeX nX)
focus :: (forall m . Monoid m => Getting m b a) -> Fold a o -> Fold b o
focus lens (Fold tally summarize) = Fold (foldMapOf lens tally) summarize
{-# LANGUAGE ExistentialQuantification #-}
-- | Thanks to Gabriel Gonzales and his beautiful folds
import Data.Monoid
import Prelude hiding (head, last, all, any, sum, product, length)
import qualified Data.Foldable
data Fold i o = forall m . Monoid m => Fold (i -> m) (m -> o)
fold :: Fold i o -> [i] -> o
fold (Fold tally summarize) is = summarize (reduce (map tally is))
where
reduce = Data.Foldable.foldl' (<>) mempty
--
head :: Fold a (Maybe a)
head = Fold (First . Just) getFirst
last :: Fold a (Maybe a)
last = Fold (Last . Just) getLast
--
all :: (a -> Bool) -> Fold a Bool
all predicate = Fold (All . predicate) getAll
any :: (a -> Bool) -> Fold a Bool
any predicate = Fold (Any . predicate) getAny
--
sum :: Num n => Fold n n
sum = Fold Sum getSum
product :: Num n => Fold n n
product = Fold Product getProduct
length :: Num n => Fold i n
length = Fold (\_ -> Sum 1) getSum
--
{-# LANGUAGE BangPatterns #-}
data Average a = Average { numerator :: !a, denominator :: !Int }
instance Num a => Monoid (Average a) where
mempty = Average 0 0
mappend (Average xL nL) (Average xR nR) = Average (xL + xR) (nL + nR)
average :: Fractional a => Fold a a
average = Fold tally summarize
where
tally x = Average x 1
summarize (Average numerator denominator) =
numerator / fromIntegral denominator
...@@ -3,6 +3,8 @@ module Data.Gargantext.Analysis where ...@@ -3,6 +3,8 @@ module Data.Gargantext.Analysis where
-- import qualified Data.Text.Lazy as DTL -- import qualified Data.Text.Lazy as DTL
import Data.Text import Data.Text
import Opaleye (Column, PGInt4) import Opaleye (Column, PGInt4)
--import Data.Map as DM
--import Data.Vector as DV
-- | Simple function to count Occurrences in a context of text. -- | Simple function to count Occurrences in a context of text.
occOfDocument :: Column PGInt4 -> Text -> IO Int occOfDocument :: Column PGInt4 -> Text -> IO Int
...@@ -15,3 +17,16 @@ occOfDocument = undefined ...@@ -15,3 +17,16 @@ occOfDocument = undefined
-- Left str -> error $ "[ERRROR] at file/function/line" ++ str -- Left str -> error $ "[ERRROR] at file/function/line" ++ str
-- Right xs -> xs -- Right xs -> xs
-- pure (sum result) -- pure (sum result)
data Occurrences a b = Map a b
garg
garg CorpusWith
makeAListOf Corpus ListType
...@@ -52,7 +52,7 @@ instance FromJSON Sentences ...@@ -52,7 +52,7 @@ instance FromJSON Sentences
corenlpPretty :: String -> IO () corenlpPretty :: String -> IO ()
corenlpPretty txt = do corenlpPretty txt = do
let url = "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
let request = setRequestBodyJSON txt url let request = setRequestBodyJSON txt url
response <- httpJSON request response <- httpJSON request
...@@ -63,7 +63,7 @@ corenlpPretty txt = do ...@@ -63,7 +63,7 @@ corenlpPretty txt = do
corenlp :: String -> IO Sentences corenlp :: String -> IO Sentences
corenlp txt = do corenlp txt = do
let url = "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}" url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
let request = setRequestBodyJSON txt url let request = setRequestBodyJSON txt url
response <- httpJSON request response <- httpJSON request
pure (getResponseBody response :: Sentences) pure (getResponseBody response :: Sentences)
......
module Data.Gargantext.Network where
import Data.Gargantext.Prelude
import Data.Map as DM
import Data.Vector as DV
type Measure a b c = DM.Map a (DM.Map b c)
-- UTCTime Paire Granularity [Candle]
-- GargVector Paire Granularity [Candle]
type GargVector a b c = DM.Map a ( DM.Map b c)
-- GargMatrix Granularity (Paire Paire) [Candle]
type GargMatrix a b c d = DM.Map a (FolioVector b c d)
-- GargMatrix Granularity (Paire Paire) [Candle]
type GargTensor a b c d e = DM.Map a (FolioMatrix b c d e)
--data PortGarg = PortGarg { _portFolioParameters :: Parameters
-- , _portGargData :: Garg
--}
toMeasure :: Granularity -> Paire -> [Candle]
-> Measure Granularity Paire Candle
toMeasure g c1 c2 cs = DM.fromList [(g,
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Data.Gargantext.Parsers.Occurrences where module Data.Gargantext.Ngrams.Occurrences where
import Data.Attoparsec.Text import Data.Attoparsec.Text
import Data.Text (Text) import Data.Text (Text)
......
module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.Occurrences) module Data.Gargantext.Parsers (module Data.Gargantext.Parsers.WOS)
where where
import Data.Gargantext.Parsers.Occurrences import Data.Gargantext.Parsers.WOS
module Data.Gargantext.Parsers.Utils where
-- use Duckling here
parseDate = undefined
module Data.Gargantext.Parsers.WOS where
import Prelude hiding (takeWhile, take, concat, readFile)
import qualified Data.List as DL
import Data.Map as DM
import Data.Attoparsec.ByteString
import Data.Attoparsec.ByteString.Char8 (anyChar, char8, endOfLine, isDigit_w8, isAlpha_ascii, isEndOfLine)
import Data.ByteString (ByteString, unpack, pack, concat, readFile)
import Data.Either.Extra(Either(..))
import Control.Applicative
import Control.Monad (join)
-- To be removed just for Tests
--
-- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
--import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
import Control.Concurrent.Async as CCA (mapConcurrently)
import System.Environment
import Codec.Archive.Zip
import Path (parseAbsFile)
import Path.IO (resolveFile')
-- import qualified Data.ByteString.Lazy as B
import Control.Applicative ( (<$>) )
zipFiles :: FilePath -> IO [ByteString]
zipFiles fp = do
path <- resolveFile' fp
entries <- withArchive path (DM.keys <$> getEntries)
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
parseFile :: ParserType -> ByteString -> IO Int
parseFile p x = case runParser p x of
Left e -> pure 1
Right r -> pure $ length r
testWos :: FilePath -> IO [Int]
testWos fp = join $ mapConcurrently (parseFile WOS) <$> zipFiles fp
-- type Parser a = a -> Text -> [Document]
data ParserType = WOS | CSV
wosParser :: Parser [Maybe [ByteString]]
wosParser = do
-- TODO Warning if version /= 1.0
_ <- manyTill anyChar (string "\nVR 1.0")
ns <- many1 wosNotice <* "\nEF"
return ns
startNotice :: Parser ByteString
startNotice = "\nPT " *> takeTill isEndOfLine
wosNotice :: Parser (Maybe [ByteString])
wosNotice = do
n <- startNotice *> wosFields <* manyTill anyChar (string "\nER\n")
return n
field' :: Parser (ByteString, [ByteString])
field' = do
f <- "\n" *> take 2 <* " "
a <- takeTill isEndOfLine
as <- try wosLines
let as' = case DL.length as > 0 of
True -> as
False -> []
return (f, [a] ++ as')
wosFields' :: Parser [(ByteString, [ByteString])]
wosFields' = many field'
wosFields :: Parser (Maybe [ByteString])
wosFields = do
-- a <- field "AU"
-- t <- field "TI"
-- s <- field "SO"
-- d <- field "DI" -- DOI
-- p <- field "PD"
-- b <- field "AB"
-- u <- field "UT"
ws <- many field'
return $ DL.lookup "UT" ws
-- return $ HyperdataDocument
-- Just "WOS"
-- DL.lookup "DI" ws
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
--
wosLines :: Parser [ByteString]
wosLines = many line
where
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
runParser :: ParserType -> ByteString -> Either String [Maybe [ByteString]]
runParser p x = parseOnly parser x
where
parser = case p of
WOS -> wosParser
_ -> error "Not implemented yet"
-- isTokenChar :: Word8 -> Bool
-- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
-- http://chrisdone.com/posts/fast-haskell-c-parsing-xml
...@@ -2,6 +2,6 @@ flags: {} ...@@ -2,6 +2,6 @@ flags: {}
extra-package-dbs: [] extra-package-dbs: []
packages: packages:
- . - .
- /home/alexandre/local/logiciels/haskell/servant/servant-multipart
extra-deps: extra-deps:
- servant-multipart-0.10.0.1
resolver: lts-9.2 resolver: lts-9.2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment