1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
{-|
Module : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
The parsers suppose we know the format of the Text (TextFormat data
type) according to which the right parser is chosen among the list of
available parsers.
This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}
{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
where
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
------------------------------------------------------------------------
-- | Human-readable message produced when a parser fails on a payload.
type ParseError = String
--type Field = Text
--type Document = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
-- , _fileParsed_result :: [Document]
-- } deriving (Show)
-- | According to the format of Input file,
-- different parser are available.
data FileType = WOS        -- ^ Web Of Science export (ISI format)
              | RIS        -- ^ RIS bibliographic format
              | RisPresse  -- ^ RIS variant for press articles (see 'presseEnrich')
              | CsvGargV3  -- ^ Gargantext's own CSV format, version 3
              | CsvHal     -- ^ CSV export from the HAL archive (see 'parseHal')
              | Iramuteq   -- ^ Iramuteq corpus export
              | JSON       -- ^ JSON corpus
              deriving (Show, Eq)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- | Parse an in-memory payload into a stream of 'HyperdataDocument's,
-- according to the declared 'FileType' and 'FileFormat'.
--
-- On success, returns the number of documents when it is known up
-- front ('Nothing' otherwise) together with a conduit yielding them.
-- On failure, returns a parse error message.
--
-- For 'ZIP' payloads, every entry of the archive is parsed (as
-- 'Plain', with the same 'FileType') and the streams are concatenated.
parseFormatC :: MonadBaseControl IO m
             => FileType
             -> FileFormat
             -> DB.ByteString
             -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err              -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC CsvHal Plain bs = do
  -- NOTE(review): identical to the CsvGargV3 case (both call
  -- 'parseCsvC'); confirm CsvHal should not use a HAL-specific parser.
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err              -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC RisPresse Plain bs = do
  -- Parse the whole payload eagerly, then stream the enriched documents.
  let eDocs = runParser' RisPresse bs
  pure $ (\docs ->
           ( Just $ fromIntegral $ length docs
           , yieldMany docs
             .| mapC presseEnrich
             .| mapC (map $ both decodeUtf8)
             .| mapMC (toDoc RIS)) ) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs ->
           ( Just $ fromIntegral $ length docs
           , yieldMany docs
             .| mapC (map $ first WOS.keys)
             .| mapC (map $ both decodeUtf8)
             .| mapMC (toDoc WOS)) ) <$> eDocs
parseFormatC Iramuteq Plain bs = do
  let eDocs = runParser' Iramuteq bs
  pure $ (\docs ->
           ( Just $ fromIntegral $ length docs
           , yieldMany docs
             .| mapC (map $ first Iramuteq.keys)
             .| mapC (map $ both decodeUtf8)
             -- Iramuteq uses '_' where spaces are meant; restore them.
             .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
           )
         ) <$> eDocs
parseFormatC JSON Plain bs = do
  let eParsedC = parseJSONC $ DBL.fromStrict bs
  case eParsedC of
    Left err              -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC ft ZIP bs = do
  -- The zip library reads from disk, so spill the payload to a temp
  -- file first.  NOTE(review): the temp file is never removed.
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  -- Parse each archive entry as a Plain payload of the same FileType.
  eContents <- mapM (parseFormatC ft Plain) fileContents
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> do
          let lengths   = fst <$> contents
          let contents' = snd <$> contents
          -- Trick: sum (Just n) = n, sum Nothing = 0, so entries with
          -- an unknown count contribute 0 to the total.
          let totalLength = sum $ sum <$> lengths
          pure $ Right ( Just totalLength
                       , sequenceConduits contents' >> pure () )
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
-- Fail with an explicit message (instead of crashing via 'undefined')
-- on combinations that have no parser.
parseFormatC ft _ _ =
  pure $ Left $ "parseFormatC: no parser implemented for file type " <> Prelude.show ft
-- | Spread ("étale") documents: each document becomes one document per
-- paragraph of its abstract (as cut by 'text2titleParagraphs' with
-- parameter 7 — see that function for its exact semantics), each copy
-- carrying a single paragraph as its abstract.  A missing abstract is
-- treated as the empty text.
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concatMap etale'
  where
    etale' :: HyperdataDocument -> [HyperdataDocument]
    etale' h = map (\t -> h { _hd_abstract = Just t })
             $ map snd
             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
-- docs <- mapM (toDoc RIS)
-- <$> snd
-- <$> enrichWith RisPresse
-- $ partitionEithers
-- $ [runParser' RisPresse bs]
-- pure $ Right docs
-- parseFormat WOS bs = do
-- docs <- mapM (toDoc WOS)
-- <$> snd
-- <$> enrichWith WOS
-- $ partitionEithers
-- $ [runParser' WOS bs]
-- pure $ Right docs
-- parseFormat ZIP bs = do
-- path <- emptySystemTempFile "parsed-zip"
-- DB.writeFile path bs
-- parsedZip <- withArchive path $ do
-- DM.keys <$> getEntries
-- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
-- | Parse a file on disk into documents, according to the declared
-- 'FileType' and 'FileFormat'.
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do
  raw  <- readFileWith RIS p
  docs <- mapM (toDoc RIS) (snd $ enrichWith RisPresse raw)
  pure $ Right docs
parseFile WOS Plain p = do
  raw  <- readFileWith WOS p
  docs <- mapM (toDoc WOS) (snd $ enrichWith WOS raw)
  pure $ Right docs
parseFile Iramuteq Plain p = do
  raw  <- readFileWith Iramuteq p
  -- Iramuteq uses '_' where spaces are meant; restore them before
  -- building the documents.
  docs <- mapM (toDoc Iramuteq . map (second (Text.replace "_" " ")))
               (snd $ enrichWith Iramuteq raw)
  pure $ Right docs
parseFile ff _ p = do
  raw  <- readFileWith ff p
  docs <- mapM (toDoc ff) (snd $ enrichWith ff raw)
  pure $ Right docs
-- | Build a 'HyperdataDocument' from the key/value pairs produced by a
-- parser.  The 'FileType' is only recorded as the document's source
-- database ("bdd") field.  Fields absent from the input map become
-- 'Nothing'.
toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  -- Language detection is disabled for now: every document is assumed
  -- to be English.
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
  -- The date comes from the "PY" field (publication year), with spaces
  -- stripped before parsing.
  let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
  -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
  let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                             , _hd_doi = lookup "doi" d
                             , _hd_url = lookup "URL" d
                             , _hd_uniqId = Nothing
                             , _hd_uniqIdBdd = Nothing
                             , _hd_page = Nothing
                             , _hd_title = lookup "title" d
                             , _hd_authors = lookup "authors" d
                             , _hd_institutes = lookup "institutes" d
                             , _hd_source = lookup "source" d
                             , _hd_abstract = lookup "abstract" d
                             , _hd_publication_date = fmap (DT.pack . show) utcTime
                             , _hd_publication_year = pub_year
                             , _hd_publication_month = pub_month
                             , _hd_publication_day = pub_day
                             , _hd_publication_hour = Nothing
                             , _hd_publication_minute = Nothing
                             , _hd_publication_second = Nothing
                             , _hd_language_iso2 = Just $ (DT.pack . show) lang }
  -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
  pure hd
-- | Post-process raw parser output according to the file type:
-- press-specific enrichment for 'RisPresse', field-key normalisation
-- for 'WOS' and 'Iramuteq', plain UTF-8 decoding otherwise.
enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq  = enrichWith' (map (first Iramuteq.keys))
enrichWith _         = enrichWith' identity
-- | Flatten a batch of raw parser output, apply the given enrichment
-- to each document's key/value pairs, then decode both keys and values
-- from UTF-8.
--
-- NOTE(review): 'decodeUtf8' throws on invalid UTF-8 input; confirm
-- upstream parsers only produce valid UTF-8 bytes.
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map (decodeDoc . f) . concat)
  where
    -- Decode one document's fields from UTF-8.
    decodeDoc = map (both decodeUtf8)
-- | Read a corpus file from disk and run the parser for the given
-- format over its contents.  A ".zip" file is unpacked and every entry
-- is parsed concurrently; any other extension is treated as one raw
-- payload.  Returns the parse failures alongside the parsed documents.
readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  payloads <- if takeExtension path == ".zip"
                then openZip path
                else (: []) . clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) payloads
-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
-- | Select the attoparsec parser matching the file type.
--
-- NOTE(review): deliberately calls 'panic' for file types that have no
-- attoparsec parser (CSV and JSON go through dedicated code paths in
-- 'parseFile' / 'parseFormatC' instead).
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
-- | Run the parser for the given format over a payload, in IO (so it
-- can be used with 'mapConcurrently'); simply wraps 'runParser''.
runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format = pure . runParser' format
-- | Pure parsing: run the attoparsec parser for the given format over
-- a payload, yielding one key/value list per document.
runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format = parseOnly (withParser format)
-- | List the entries of a zip archive, then read each entry's bytes
-- (entries are fetched concurrently, re-opening the archive per entry).
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entryNames <- withArchive fp (DM.keys <$> getEntries)
  mapConcurrently (\entryName -> withArchive fp (getEntry entryName)) entryNames
-- | 'clean', lifted to 'Text' via a bytestring round-trip (see 'cs').
cleanText :: Text -> Text
cleanText txt = cs $ clean $ cs txt
-- | Normalise raw input bytes before parsing:
--
--   * the UTF-8 encoded right single quotation mark (U+2019) becomes
--     a plain ASCII apostrophe;
--   * carriage returns and tabs become spaces;
--   * semicolons become full stops.
--
-- NOTE(review): the previous version compared single bytes against
-- '’', which can never match ('DBC.map' only yields chars <= '\xff',
-- one per byte, while '’' is '\x2019'), so the apostrophe substitution
-- was dead code.  It is now performed on the UTF-8 byte sequence
-- before the byte-wise pass.
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' (fixApostrophes txt)
  where
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c
    -- UTF-8 encoding of U+2019 (right single quotation mark).
    curlyQuote = DBC.pack "\xe2\x80\x99"
    -- Replace every occurrence of the 3-byte sequence with "'".
    fixApostrophes s =
      case DB.breakSubstring curlyQuote s of
        (prefix, rest)
          | DB.null rest -> prefix
          | otherwise ->
              prefix <> DBC.cons '\'' (fixApostrophes (DB.drop (DB.length curlyQuote) rest))
--
-- | Split a raw field into individual terms.  Authors coming from a
-- Web Of Science ("WOS") export are separated by "; "; every other
-- field uses ", ".
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn ngramsType bdd = DT.splitOn separator
  where
    separator = case (ngramsType, bdd) of
      (Authors, Just "WOS") -> "; "
      _                     -> ", "