Commit 2ddd6408 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Alceste/Iramuteq Parser

parent e49efe51
......@@ -192,6 +192,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS
......
......@@ -44,6 +44,7 @@ import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
......@@ -52,11 +53,12 @@ import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
------------------------------------------------------------------------
type ParseError = String
......@@ -70,7 +72,12 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
data FileType = WOS
| RIS
| RisPresse
| CsvGargV3
| CsvHal
| Iramuteq
deriving (Show)
-- Implemented (ISI Format)
......@@ -177,6 +184,14 @@ parseFile WOS Plain p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs
parseFile Iramuteq Plain p = do
docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
<$> snd
<$> enrichWith Iramuteq
<$> readFileWith Iramuteq p
pure $ Right docs
parseFile ff _ p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs
......@@ -217,6 +232,7 @@ enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
enrichWith _ = enrichWith' identity
......@@ -243,6 +259,7 @@ readFileWith format path = do
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
......
......@@ -12,7 +12,7 @@ commentary with @some markup@.
-}
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, notices) where
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
......@@ -21,14 +21,14 @@ import Data.ByteString (ByteString)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.ByteString as DB
parseIramuteqFile :: String -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = do
txts <- DB.readFile fp
pure $ parseOnly notices txts
pure $ parseOnly parser txts
-------------------------------------------------------------
notices :: Parser [[(ByteString, ByteString)]]
notices = do
parser :: Parser [[(ByteString, ByteString)]]
parser = do
ns <- (many notice)
pure ns
......@@ -70,4 +70,15 @@ parseOf ptxt pa = bothParse <|> empty
where
bothParse = ptxt >>= constP pa
-----------------------------------------------------------------
-- These keys may not be constant for Iramuteq files formats
keys :: ByteString -> ByteString
keys f
| f == "id" = "doi"
| f == "qui" = "authors"
| f == "quand" = "PY"
| f == "type" = "source"
| f == "titre" = "title"
| f == "ou" = "institutes"
| f == "text" = "abstract"
| otherwise = f
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment