Commit aa7e1142 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/203-dev-corpus-json-import' into dev

parents 19c6f5ad 65fa2834
...@@ -82,6 +82,7 @@ library: ...@@ -82,6 +82,7 @@ library:
- Gargantext.Core.Text.Corpus.Parsers - Gargantext.Core.Text.Corpus.Parsers
- Gargantext.Core.Text.Corpus.Parsers.CSV - Gargantext.Core.Text.Corpus.Parsers.CSV
- Gargantext.Core.Text.Corpus.Parsers.Date.Parsec - Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
- Gargantext.Core.Text.Corpus.Parsers.JSON
- Gargantext.Core.Text.List.Formats.CSV - Gargantext.Core.Text.List.Formats.CSV
- Gargantext.Core.Text.Metrics - Gargantext.Core.Text.Metrics
- Gargantext.Core.Text.Metrics.CharByChar - Gargantext.Core.Text.Metrics.CharByChar
......
...@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q ...@@ -233,7 +233,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
markProgress 1 jobHandle markProgress 1 jobHandle
void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle void $ flowDataText user txt (Multi l) cid (Just flw) jobHandle
-- printDebug "corpus id" cids -- printDebug "corpus id" cids
-- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user sendMail user
-- TODO ... -- TODO ...
...@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do ...@@ -270,6 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
WOS -> Parser.parseFormatC Parser.WOS WOS -> Parser.parseFormatC Parser.WOS
PresseRIS -> Parser.parseFormatC Parser.RisPresse PresseRIS -> Parser.parseFormatC Parser.RisPresse
Iramuteq -> Parser.parseFormatC Parser.Iramuteq Iramuteq -> Parser.parseFormatC Parser.Iramuteq
JSON -> Parser.parseFormatC Parser.JSON
-- TODO granularity of the logStatus -- TODO granularity of the logStatus
let data' = case ff of let data' = case ff of
......
...@@ -15,6 +15,7 @@ data FileType = CSV ...@@ -15,6 +15,7 @@ data FileType = CSV
| PresseRIS | PresseRIS
| WOS | WOS
| Iramuteq | Iramuteq
| JSON
deriving (Eq, Show, Generic) deriving (Eq, Show, Generic)
instance ToSchema FileType instance ToSchema FileType
instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS] instance Arbitrary FileType where arbitrary = elements [CSV, PresseRIS]
...@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where ...@@ -28,7 +29,8 @@ instance FromHttpApiData FileType where
parseUrlPiece "PresseRis" = pure PresseRIS parseUrlPiece "PresseRis" = pure PresseRIS
parseUrlPiece "WOS" = pure WOS parseUrlPiece "WOS" = pure WOS
parseUrlPiece "Iramuteq" = pure Iramuteq parseUrlPiece "Iramuteq" = pure Iramuteq
parseUrlPiece _ = panic "[G.A.A.Node.Corpus.New] File Type not implemented (yet)" parseUrlPiece "JSON" = pure JSON
parseUrlPiece s = panic $ "[G.A.A.Node.Corpus.New] File Type not implemented (yet): " <> s
instance ToHttpApiData FileType where instance ToHttpApiData FileType where
toUrlPiece = pack . show toUrlPiece = pack . show
......
...@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second) ...@@ -42,6 +42,7 @@ import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..)) import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC) import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich) import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..)) import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
...@@ -79,7 +80,8 @@ data FileType = WOS ...@@ -79,7 +80,8 @@ data FileType = WOS
| CsvGargV3 | CsvGargV3
| CsvHal | CsvHal
| Iramuteq | Iramuteq
deriving (Show) | JSON
deriving (Show, Eq)
-- Implemented (ISI Format) -- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc -- | DOC -- Not Implemented / import Pandoc
...@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do ...@@ -132,6 +134,12 @@ parseFormatC Iramuteq Plain bs = do
) )
<$> eDocs <$> eDocs
parseFormatC JSON Plain bs = do
let eParsedC = parseJSONC $ DBL.fromStrict bs
case eParsedC of
Left err -> pure $ Left err
Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC ft ZIP bs = do parseFormatC ft ZIP bs = do
path <- liftBase $ emptySystemTempFile "parsed-zip" path <- liftBase $ emptySystemTempFile "parsed-zip"
liftBase $ DB.writeFile path bs liftBase $ DB.writeFile path bs
...@@ -319,4 +327,3 @@ clean txt = DBC.map clean' txt ...@@ -319,4 +327,3 @@ clean txt = DBC.map clean' txt
splitOn :: NgramsType -> Maybe Text -> Text -> [Text] splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ") splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ") splitOn _ _ = (DT.splitOn ", ")
...@@ -472,7 +472,7 @@ parseCsvC bs = do ...@@ -472,7 +472,7 @@ parseCsvC bs = do
Right res -> Right res Right res -> Right res
case result of case result of
Left err -> Left err Left err -> Left err
Right r -> Right $ (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc) Right r -> Right (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- Csv v3 weighted for phylo -- Csv v3 weighted for phylo
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.JSON
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser for Gargantext corpus files.
-}
{-# LANGUAGE DuplicateRecordFields #-}
module Gargantext.Core.Text.Corpus.Parsers.JSON where
import Conduit
import Data.Aeson
import qualified Data.ByteString.Lazy as BL
import Data.Either (Either(..))
import Data.Text
import GHC.Generics
import qualified Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Prelude hiding (length)
data JSONStruct =
JSONStruct { documents :: [ JSONStructDocument ]
, garg_version :: Text }
deriving (Generic)
instance FromJSON JSONStruct
data JSONStructDocument =
JSONStructDocument { document :: JSONDocument
, ngrams :: JSONNgrams
, hash :: Text }
deriving (Generic)
instance FromJSON JSONStructDocument
data JSONDocument =
JSONDocument { id :: Int
, hash_id :: Maybe Text
, typename :: Int
, user_id :: Int
, parent_id :: Maybe Int
, name :: Text
, date :: Text
, hyperdata :: HyperdataDocument }
deriving (Generic)
instance FromJSON JSONDocument
data JSONNgrams =
JSONNgrams { ngrams :: [Text]
, hash :: Text }
deriving (Generic)
instance FromJSON JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS :: BL.ByteString -> Either Prelude.String JSONStruct
readJSONLazyBS bs = eitherDecode bs
parseJSONC :: BL.ByteString
-> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseJSONC bs = do
case readJSONLazyBS bs of
Left err -> Left err
Right (JSONStruct { documents }) ->
Right ( Just $ Prelude.fromIntegral $ Prelude.length documents
, yieldMany documents .| mapC doc2hyperdoc )
doc2hyperdoc :: JSONStructDocument -> HyperdataDocument
doc2hyperdoc (JSONStructDocument { document = JSONDocument { hyperdata } }) = hyperdata
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment