Commit 2ddd6408 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Alceste/Iramuteq Parser

parent e49efe51
...@@ -192,6 +192,7 @@ library ...@@ -192,6 +192,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
Gargantext.Core.Text.Corpus.Parsers.FrameWrite Gargantext.Core.Text.Corpus.Parsers.FrameWrite
Gargantext.Core.Text.Corpus.Parsers.GrandDebat Gargantext.Core.Text.Corpus.Parsers.GrandDebat
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.Json2Csv Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS Gargantext.Core.Text.Corpus.Parsers.RIS
......
...@@ -44,6 +44,7 @@ import Gargantext.Core (Lang(..)) ...@@ -44,6 +44,7 @@ import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC) import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension) import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile) import System.IO.Temp (emptySystemTempFile)
...@@ -52,11 +53,12 @@ import qualified Data.ByteString.Char8 as DBC ...@@ -52,11 +53,12 @@ import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude import qualified Prelude
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
------------------------------------------------------------------------ ------------------------------------------------------------------------
type ParseError = String type ParseError = String
...@@ -70,7 +72,12 @@ type ParseError = String ...@@ -70,7 +72,12 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parser are available. -- different parser are available.
data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal data FileType = WOS
| RIS
| RisPresse
| CsvGargV3
| CsvHal
| Iramuteq
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -177,6 +184,14 @@ parseFile WOS Plain p = do ...@@ -177,6 +184,14 @@ parseFile WOS Plain p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs pure $ Right docs
parseFile Iramuteq Plain p = do
docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
<$> snd
<$> enrichWith Iramuteq
<$> readFileWith Iramuteq p
pure $ Right docs
parseFile ff _ p = do parseFile ff _ p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs pure $ Right docs
...@@ -217,6 +232,7 @@ enrichWith :: FileType ...@@ -217,6 +232,7 @@ enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]]) -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys)) enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
enrichWith _ = enrichWith' identity enrichWith _ = enrichWith' identity
...@@ -241,8 +257,9 @@ readFileWith format path = do ...@@ -241,8 +257,9 @@ readFileWith format path = do
-- According to the format of the text, choose the right parser. -- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document] -- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]] withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser withParser WOS = WOS.parser
withParser RIS = RIS.parser withParser RIS = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser --withParser ODT = odtParser
--withParser XML = xmlParser --withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet" withParser _ = panic "[ERROR] Parser not implemented yet"
......
...@@ -12,7 +12,7 @@ commentary with @some markup@. ...@@ -12,7 +12,7 @@ commentary with @some markup@.
-} -}
module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, notices) where module Gargantext.Core.Text.Corpus.Parsers.Iramuteq (parseIramuteqFile, parser, keys) where
import Control.Applicative import Control.Applicative
import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly) import Data.Attoparsec.ByteString (Parser, takeTill, parseOnly)
...@@ -21,14 +21,14 @@ import Data.ByteString (ByteString) ...@@ -21,14 +21,14 @@ import Data.ByteString (ByteString)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat) import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
parseIramuteqFile :: String -> IO (Either String [[(ByteString, ByteString)]]) parseIramuteqFile :: FilePath -> IO (Either String [[(ByteString, ByteString)]])
parseIramuteqFile fp = do parseIramuteqFile fp = do
txts <- DB.readFile fp txts <- DB.readFile fp
pure $ parseOnly notices txts pure $ parseOnly parser txts
------------------------------------------------------------- -------------------------------------------------------------
notices :: Parser [[(ByteString, ByteString)]] parser :: Parser [[(ByteString, ByteString)]]
notices = do parser = do
ns <- (many notice) ns <- (many notice)
pure ns pure ns
...@@ -70,4 +70,15 @@ parseOf ptxt pa = bothParse <|> empty ...@@ -70,4 +70,15 @@ parseOf ptxt pa = bothParse <|> empty
where where
bothParse = ptxt >>= constP pa bothParse = ptxt >>= constP pa
-----------------------------------------------------------------
-- These keys may not be constant for Iramuteq files formats
keys :: ByteString -> ByteString
keys f
| f == "id" = "doi"
| f == "qui" = "authors"
| f == "quand" = "PY"
| f == "type" = "source"
| f == "titre" = "title"
| f == "ou" = "institutes"
| f == "text" = "abstract"
| otherwise = f
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment