Commit 7b9de040 authored by Przemyslaw Kaminski's avatar Przemyslaw Kaminski

Merge branch 'dev' into 86-dev-graphql

parents e30519a3 f5bb8c77
use_nix
......@@ -11,6 +11,7 @@ profiling
# Stack
.stack-work
dist-newstyle
# Emacs
TAGS
......@@ -33,5 +34,7 @@ _darcs
# Runtime
# Repo
repos
repo.json*
tmp*repo*json
data
#!/bin/bash
stack install --nix --profile --test --fast # --haddock
stack install --nix --profile --test --fast --no-install-ghc --skip-ghc-check
......@@ -33,6 +33,7 @@ FRAME_VISIO_URL = URL_TO_CHANGE
FRAME_SEARX_URL = URL_TO_CHANGE
FRAME_ISTEX_URL = URL_TO_CHANGE
MAX_DOCS_PARSERS = 1000000
MAX_DOCS_SCRAPERS = 10000
[server]
......
......@@ -26,6 +26,8 @@ rec {
blas
gfortran7
# gfortran7.cc.lib
expat
icu
];
libPaths = pkgs.lib.makeLibraryPath nonhsBuildInputs;
shellHook = ''
......
name: gargantext
version: '0.0.4.3'
version: '0.0.4.4'
synopsis: Search, map, share
description: Please see README.md
category: Data
......
......@@ -3,6 +3,7 @@ module Gargantext.API.Job where
import Control.Lens (over, _Just)
import Data.IORef
import Data.Maybe
import qualified Data.Text as T
import Gargantext.Prelude
......@@ -16,6 +17,14 @@ jobLogInit rem =
, _scst_failed = Just 0
, _scst_events = Just [] }
addEvent :: T.Text -> T.Text -> JobLog -> JobLog
addEvent level message (JobLog { _scst_events = mEvts, .. }) = JobLog { _scst_events = Just (evts <> [ newEvt ]), .. }
where
evts = fromMaybe [] mEvts
newEvt = ScraperEvent { _scev_message = Just message
, _scev_level = Just level
, _scev_date = Nothing }
jobLogSuccess :: JobLog -> JobLog
jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $
over (scst_remaining . _Just) (\x -> x - 1) jl
......@@ -38,6 +47,9 @@ jobLogFailTotal (JobLog { _scst_succeeded = mSucc
Nothing -> (Nothing, mFail)
Just rem -> (Just 0, (+ rem) <$> mFail)
jobLogFailTotalWithMessage :: T.Text -> JobLog -> JobLog
jobLogFailTotalWithMessage message jl = addEvent "ERROR" message $ jobLogFailTotal jl
jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl
......
......@@ -21,25 +21,25 @@ module Gargantext.API.Node.Corpus.New
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
......@@ -57,11 +57,12 @@ import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
......@@ -240,7 +241,7 @@ type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
addToCorpusWithForm :: FlowCmdM env err m
addToCorpusWithForm :: (FlowCmdM env err m)
=> User
-> CorpusId
-> NewWithForm
......@@ -258,12 +259,33 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
WOS -> Parser.parseFormat Parser.WOS
PresseRIS -> Parser.parseFormat Parser.RisPresse
ZIP -> Parser.parseFormat Parser.ZIP
-- TODO granularity of the logStatus
eDocs <- liftBase $ parse $ cs d
let data' = case ft of
ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
Right decoded -> decoded
_ -> cs d
eDocs <- liftBase $ parse data'
case eDocs of
Right docs' -> do
let docs = splitEvery 500 $ take 1000000 docs'
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
limit' <- view $ hasConfig . gc_max_docs_parsers
let limit = fromIntegral limit'
if length docs' > limit then do
printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
, show $ length docs'
, ") exceeds the MAX_DOCS_PARSERS limit ("
, show limit
, ")" ]
let panicMsg = T.concat $ T.pack <$> panicMsg'
logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
panic panicMsg
else
pure ()
let docs = splitEvery 500 $ take limit docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
......@@ -283,20 +305,19 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
logStatus jobLog3
pure $ jobLog3
Left e -> do
printDebug "Error" e
printDebug "[addToCorpusWithForm] parse error" e
logStatus jobLogE
let evt = ScraperEvent { _scev_message = Just $ T.pack e
, _scev_level = Just "ERROR"
, _scev_date = Nothing }
logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
pure jobLogE
where
jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
contents <- readFile fp
Parser.parseFormat Parser.CsvGargV3 $ cs contents
{-
addToCorpusWithFile :: FlowCmdM env err m
=> CorpusId
......
......@@ -27,6 +27,7 @@ import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Prelude (HasConfig)
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
......@@ -53,7 +54,7 @@ frameCalcUploadAPI uId nId =
)
frameCalcUploadAsync :: FlowCmdM env err m
frameCalcUploadAsync :: (HasConfig env, FlowCmdM env err m)
=> UserId
-> NodeId
-> FrameCalcUpload
......
......@@ -24,7 +24,7 @@ import Gargantext.API.Node.Corpus.New.File (FileType)
-------------------------------------------------------
data NewWithForm = NewWithForm
{ _wf_filetype :: !FileType
, _wf_data :: !Text
, _wf_data :: !Text -- NOTE for binary files, this represents base-64 data
, _wf_lang :: !(Maybe Lang)
, _wf_name :: !Text
} deriving (Eq, Show, Generic)
......
......@@ -25,7 +25,8 @@ module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cl
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Control.Monad (join, sequence)
import Control.Monad.IO.Class (liftIO)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
......@@ -43,6 +44,7 @@ import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude as Prelude
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
......@@ -95,9 +97,14 @@ parseFormat WOS bs = do
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
parseFormat ZIP _bs = do
printDebug "[parseFormat]" ZIP
pure $ Left "Not implemented for ZIP"
parseFormat ZIP bs = do
path <- emptySystemTempFile "parsed.zip"
DB.writeFile path bs
withArchive path $ do
files <- DM.keys <$> getEntries
filesContents <- mapM getEntry files
ddocs <- liftIO $ mapM (parseFormat CsvGargV3) filesContents
pure $ concat <$> sequence ddocs
parseFormat _ _ = undefined
-- | Parse file into documents
......
......@@ -360,7 +360,7 @@ viewDocuments cId t ntId mQuery = proc () -> do
restrict -< if query == ""
then pgBool True
--else (n^.ns_search) @@ (pgTSQuery (T.unpack query))
else (n^.ns_search) @@ (toTSQuery $ T.unpack query)
else (n^.ns_search) @@ (plaintoTSQuery $ T.unpack query)
returnA -< FacetDoc (_ns_id n)
(_ns_date n)
......
resolver:
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/12.yaml
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/13.yaml
flags: {}
extra-package-dbs: []
packages:
......@@ -11,6 +11,7 @@ packages:
docker:
enable: false
#enable: true
repo: 'cgenie/stack-build:lts-18.12-garg'
run-args:
- '--publish=8008:8008'
......@@ -26,8 +27,9 @@ allow-newer: true
# "$everything": -haddock
extra-deps:
- git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
commit: 3e32ec3aca71eb326805355d3a99b9288dc342ee
- #git: https://gitlab.iscpif.fr/gargantext/haskell-gargantext-prelude.git
git: https://gitlab.iscpif.fr/cgenie/haskell-gargantext-prelude.git
commit: 35b09629a658fc16cc9ff63e7591e58511cd98a7
# Data Mining Libs
- git: https://github.com/delanoe/data-time-segment.git
commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef
......@@ -44,7 +46,7 @@ extra-deps:
# Databases libs
- git: https://github.com/delanoe/haskell-opaleye.git
commit: 806da7f9fb6fe1032f51c1822fc224b281cdd84f
commit: d3ab7acd5ede737478763630035aa880f7e34444
- git: https://github.com/delanoe/hsparql.git
commit: 308c74b71a1abb0a91546fa57d353131248e3a7f
- git: https://github.com/robstewart57/rdf4h.git
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment