[corpus] import JSON support

parent b210ee82
Pipeline #4001 failed with stage
in 28 minutes and 36 seconds
...@@ -5,7 +5,7 @@ cabal-version: 1.12 ...@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
name: gargantext name: gargantext
version: 0.0.6.9.9.4.3 version: 0.0.6.9.9.4.3
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
...@@ -56,6 +56,7 @@ library ...@@ -56,6 +56,7 @@ library
Gargantext.Core.Text.Corpus.Parsers Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.List.Formats.CSV Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar Gargantext.Core.Text.Metrics.CharByChar
......
...@@ -82,6 +82,7 @@ library: ...@@ -82,6 +82,7 @@ library:
- Gargantext.Core.Text.Corpus.Parsers - Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV - Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec - Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
- Gargantext.Core.Text.Corpus.Parsers.JSON
- Gargantext.Core.Text.List.Formats.CSV - Gargantext.Core.Text.List.Formats.CSV
- Gargantext.Core.Text.Metrics - Gargantext.Core.Text.Metrics
- Gargantext.Core.Text.Metrics.CharByChar - Gargantext.Core.Text.Metrics.CharByChar
......
...@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q ...@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
markProgress 1 jobHandle markProgress 1 jobHandle
void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
-- printDebug "corpus id" cids -- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user sendMail user
-- TODO ... -- TODO ...
...@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do ...@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
WOS -> Parser.parseFormatC Parser.WOS WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse PresseRIS -> Parser.parseFormatC Parser.RisPresse
Iramuteq -> Parser.parseFormatC Parser.Iramuteq Iramuteq -> Parser.parseFormatC Parser.Iramuteq
JSON -> Parser.parseFormatC Parser.JSON
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
let data' = case ff of let data' = case ff of
......
...@@ -15,6 +15,7 @@ data FileType = CSV ...@@ -15,6 +15,7 @@ data FileType = CSV
| PresseRIS | PresseRIS
| WOS | WOS
| Iramuteq | Iramuteq
| JSON
deriving (Eq, Show, Generic) deriving (Eq, Show, Generic)
instance ToSchema FileType instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS] instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
...@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where ...@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where
parseUrlPiece "PresseRis" = pure PresseRIS parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS parseUrlPiece "WOS" = pure WOS
parseUrlPiece "Iramuteq" = pure Iramuteq parseUrlPiece "Iramuteq" = pure Iramuteq
parseUrlPiece _ = panic "[G.A.A.Node.Corpus.New] File Type not implemented (yet)" parseUrlPiece "JSON" = pure JSON
parseUrlPiece s = panic $ "[G.A.A.Node.Corpus.New] File Type not implemented (yet): " <> s
instance ToHttpApiData FileType where instance ToHttpApiData FileType where
toUrlPiece = pack . show toUrlPiece = pack . show
......
...@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second) ...@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..)) import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC) import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..)) import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
...@@ -79,7 +80,8 @@ data FileType = WOS ...@@ -79,7 +80,8 @@ data FileType = WOS
| CsvGargV3 | CsvGargV3
| CsvHal | CsvHal
| Iramuteq | Iramuteq
deriving (Show) | JSON
deriving (Show, Eq)
-- Implemented (ISI Format) -- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc -- | DOC -- Not Implemented / import Pandoc
...@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do ...@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do
) )
<$> eDocs <$> eDocs
parseFormatC JSON Plain bs = do
let eParsedC = parseJSONC $ DBL.fromStrict bs
case eParsedC of
Left err -> pure $ Left err
Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC ft ZIP bs = do parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip" path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs liftBase $ DB.writeFile path bs
...@@ -154,7 +162,7 @@ parseFormatC ft ZIP bs = do ...@@ -154,7 +162,7 @@ parseFormatC ft ZIP bs = do
pure $ Right ( Just totalLength pure $ Right ( Just totalLength
, sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc") , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
_ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
parseFormatC _ _ _ = undefined parseFormatC _ _ _ = undefined
...@@ -211,7 +219,7 @@ parseFile WOS Plain p = do ...@@ -211,7 +219,7 @@ parseFile WOS Plain p = do
parseFile Iramuteq Plain p = do parseFile Iramuteq Plain p = do
docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " ")))) docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
<$> snd <$> snd
<$> enrichWith Iramuteq <$> enrichWith Iramuteq
<$> readFileWith Iramuteq p <$> readFileWith Iramuteq p
pure $ Right docs pure $ Right docs
...@@ -226,7 +234,7 @@ toDoc ff d = do ...@@ -226,7 +234,7 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d -- let abstract = lookup "abstract" d
let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract)) let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
...@@ -314,9 +322,8 @@ clean txt = DBC.map clean' txt ...@@ -314,9 +322,8 @@ clean txt = DBC.map clean' txt
clean' ';' = '.' clean' ';' = '.'
clean' c = c clean' c = c
-- --
splitOn :: NgramsType -> Maybe Text -> Text -> [Text] splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ") splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ") splitOn _ _ = (DT.splitOn ", ")
{-| {-|
Module : Gargantext.Core.Text.Corpus.Parsers.CSV Module : Gargantext.Core.Text.Corpus.Parsers.CSV
Description : Description :
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
...@@ -378,7 +378,7 @@ instance ToNamedRecord CsvHal where ...@@ -378,7 +378,7 @@ instance ToNamedRecord CsvHal where
, "instStructId_i" .= csvHal_instStructId_i , "instStructId_i" .= csvHal_instStructId_i
, "deptStructId_i" .= csvHal_deptStructId_i , "deptStructId_i" .= csvHal_deptStructId_i
, "labStructId_i" .= csvHal_labStructId_i , "labStructId_i" .= csvHal_labStructId_i
, "rteamStructId_i" .= csvHal_rteamStructId_i , "rteamStructId_i" .= csvHal_rteamStructId_i
, "docType_s" .= csvHal_docType_s , "docType_s" .= csvHal_docType_s
] ]
...@@ -472,7 +472,7 @@ parseCsvC bs = do ...@@ -472,7 +472,7 @@ parseCsvC bs = do
Right res -> Right res Right res -> Right res
case result of case result of
Left err -> Left err Left err -> Left err
Right r -> Right $ (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc) Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.JSON
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser for Gargantext corpus files.
-}
{-# LANGUAGE DuplicateRecordFields #-}
module Gargantext.Core.Text.Corpus.Parsers.JSON where
import Conduit
import Data.Aeson
import qualified Data.ByteString.Lazy as BL
import Data.Either (Either(..))
import Data.Text
import GHC.Generics
import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude hiding (length)
data JSONStruct =
JSONStruct { documents :: [ JSONStructDocument ]
, garg_version :: Text }
deriving (Generic)
instance FromJSON JSONStruct
data JSONStructDocument =
JSONStructDocument { document :: JSONDocument
, ngrams :: JSONNgrams
, hash :: Text }
deriving (Generic)
instance FromJSON JSONStructDocument
data JSONDocument =
JSONDocument { id :: Int
, hash_id :: Maybe Text
, typename :: Int
, user_id :: Int
, parent_id :: Maybe Int
, name :: Text
, date :: Text
, hyperdata :: HyperdataDocument }
deriving (Generic)
instance FromJSON JSONDocument
data JSONNgrams =
JSONNgrams { ngrams :: [Text]
, hash :: Text }
deriving (Generic)
instance FromJSON JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct
readJSONLazyBS bs = eitherDecode bs
parseJSONC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = do
case readJSONLazyBS bs of
Left err -> Left err
Right (JSONStruct { documents }) ->
Right ( Just $ Prelude.fromIntegral $ Prelude.length documents
, yieldMany documents .| mapC doc2hyperdoc )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment