[corpus] import JSON support

parent b210ee82
Pipeline #4001 failed with stage
in 28 minutes and 36 seconds
......@@ -5,7 +5,7 @@ cabal-version: 1.12
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.9.9.4.3
version: 0.0.6.9.9.4.3
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -56,6 +56,7 @@ library
Gargantext.Core.Text.Corpus.Parsers
Gargantext.Core.Text.Corpus.Parsers.CSV
Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.List.Formats.CSV
Gargantext.Core.Text.Metrics
Gargantext.Core.Text.Metrics.CharByChar
......
......@@ -82,6 +82,7 @@ library:
- Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
- Gargantext.Core.Text.Corpus.Parsers.JSON
- Gargantext.Core.Text.List.Formats.CSV
- Gargantext.Core.Text.Metrics
- Gargantext.Core.Text.Metrics.CharByChar
......
......@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
markProgress 1 jobHandle
void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
-- printDebug "corpus id" cids
-- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
......@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse
Iramuteq -> Parser.parseFormatC Parser.Iramuteq
JSON -> Parser.parseFormatC Parser.JSON
-- TODO granularity of the logStatus
let data' = case ff of
......
......@@ -15,6 +15,7 @@ data FileType = CSV
| PresseRIS
| WOS
| Iramuteq
| JSON
deriving (Eq, Show, Generic)
instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
......@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where
parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS
parseUrlPiece "Iramuteq" = pure Iramuteq
parseUrlPiece _ = panic "[G.A.A.Node.Corpus.New] File Type not implemented (yet)"
parseUrlPiece "JSON" = pure JSON
parseUrlPiece s = panic $ "[G.A.A.Node.Corpus.New] File Type not implemented (yet): " <> s
instance ToHttpApiData FileType where
toUrlPiece = pack . show
......
......@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
......@@ -79,7 +80,8 @@ data FileType = WOS
| CsvGargV3
| CsvHal
| Iramuteq
deriving (Show)
| JSON
deriving (Show, Eq)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
......@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do
)
<$> eDocs
parseFormatC JSON Plain bs = do
let eParsedC = parseJSONC $ DBL.fromStrict bs
case eParsedC of
Left err -> pure $ Left err
Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs
......@@ -154,7 +162,7 @@ parseFormatC ft ZIP bs = do
pure $ Right ( Just totalLength
, sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
_ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
parseFormatC _ _ _ = undefined
......@@ -211,7 +219,7 @@ parseFile WOS Plain p = do
parseFile Iramuteq Plain p = do
docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
<$> snd
<$> enrichWith Iramuteq
<$> enrichWith Iramuteq
<$> readFileWith Iramuteq p
pure $ Right docs
......@@ -226,7 +234,7 @@ toDoc ff d = do
-- let abstract = lookup "abstract" d
let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
-- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
......@@ -314,9 +322,8 @@ clean txt = DBC.map clean' txt
clean' ';' = '.'
clean' c = c
--
--
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ")
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.CSV
Description :
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
......@@ -378,7 +378,7 @@ instance ToNamedRecord CsvHal where
, "instStructId_i" .= csvHal_instStructId_i
, "deptStructId_i" .= csvHal_deptStructId_i
, "labStructId_i" .= csvHal_labStructId_i
, "rteamStructId_i" .= csvHal_rteamStructId_i
, "docType_s" .= csvHal_docType_s
]
......@@ -472,7 +472,7 @@ parseCsvC bs = do
Right res -> Right res
case result of
Left err -> Left err
Right r -> Right $ (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.JSON
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser for Gargantext corpus files.
-}
{-# LANGUAGE DuplicateRecordFields #-}
module Gargantext.Core.Text.Corpus.Parsers.JSON where
import Conduit
import Data.Aeson
import qualified Data.ByteString.Lazy as BL
import Data.Either (Either(..))
import Data.Text
import GHC.Generics
import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude hiding (length)
data JSONStruct =
JSONStruct { documents :: [ JSONStructDocument ]
, garg_version :: Text }
deriving (Generic)
instance FromJSON JSONStruct
data JSONStructDocument =
JSONStructDocument { document :: JSONDocument
, ngrams :: JSONNgrams
, hash :: Text }
deriving (Generic)
instance FromJSON JSONStructDocument
data JSONDocument =
JSONDocument { id :: Int
, hash_id :: Maybe Text
, typename :: Int
, user_id :: Int
, parent_id :: Maybe Int
, name :: Text
, date :: Text
, hyperdata :: HyperdataDocument }
deriving (Generic)
instance FromJSON JSONDocument
data JSONNgrams =
JSONNgrams { ngrams :: [Text]
, hash :: Text }
deriving (Generic)
instance FromJSON JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct
readJSONLazyBS bs = eitherDecode bs
parseJSONC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = do
case readJSONLazyBS bs of
Left err -> Left err
Right (JSONStruct { documents }) ->
Right ( Just $ Prelude.fromIntegral $ Prelude.length documents
, yieldMany documents .| mapC doc2hyperdoc )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment