Commit 50497ca2 authored by Alexandre Delanoë's avatar Alexandre Delanoë

merge

parents f81fa897 09e9fa50
......@@ -16,17 +16,30 @@ Adaptative Phylo binaries
module Main where
import Control.Concurrent.Async (mapConcurrently)
import Crypto.Hash.SHA256 (hash)
import Data.Aeson
import Data.Either (Either(..))
import Data.List (concat, nub, isSuffixOf)
import Data.String (String)
import GHC.IO (FilePath)
import qualified Prelude as Prelude
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Data.Text (Text, unwords, unpack, replace, pack)
import Crypto.Hash.SHA256 (hash)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Data.Text as T
import Gargantext.Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
......@@ -36,20 +49,6 @@ import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setCon
import Gargantext.Core.Viz.Phylo.PhyloExport (toPhyloExport, dotToFile)
-- import Gargantext.API.Ngrams.Prelude (toTermList)
import GHC.IO (FilePath)
import Prelude (Either(Left, Right),toInteger)
import System.Environment
import System.Directory (listDirectory,doesFileExist)
import Control.Concurrent.Async (mapConcurrently)
import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, diffDays, showGregorian)
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
-- import Debug.Trace (trace)
data PhyloStage = PhyloWithCliques | PhyloWithLinks deriving (Show)
......@@ -84,13 +83,13 @@ toDays y m d = fromIntegral
toPhyloDate :: Int -> Int -> Int -> TimeUnit -> Date
toPhyloDate y m d tu = case tu of
Year _ _ _ -> y
Month _ _ _ -> toMonths (toInteger y) m d
Week _ _ _ -> div (toDays (toInteger y) m d) 7
Day _ _ _ -> toDays (toInteger y) m d
Month _ _ _ -> toMonths (Prelude.toInteger y) m d
Week _ _ _ -> div (toDays (Prelude.toInteger y) m d) 7
Day _ _ _ -> toDays (Prelude.toInteger y) m d
toPhyloDate' :: Int -> Int -> Int -> Text
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (toInteger y) m d
toPhyloDate' y m d = pack $ showGregorian $ fromGregorian (Prelude.toInteger y) m d
--------------
......@@ -113,43 +112,53 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
wosToDocs :: Int -> Patterns -> TimeUnit -> FilePath -> IO [Document]
wosToDocs limit patterns time path = do
files <- getFilesFromPath path
take limit
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d))
(termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> parseFile WOS (path <> file) ) files
files <- getFilesFromPath path
let parseFile' file = do
eParsed <- parseFile WOS (path <> file)
case eParsed of
Right ps -> pure ps
Left e -> panic $ "Error: " <> (pack e)
take limit
<$> map (\d -> let title = fromJust $ _hd_title d
abstr = if (isJust $ _hd_abstract d)
then fromJust $ _hd_abstract d
else ""
in Document (toPhyloDate
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d) time)
(toPhyloDate'
(fromIntegral $ fromJust $ _hd_publication_year d)
(fromJust $ _hd_publication_month d)
(fromJust $ _hd_publication_day d))
(termsInText patterns $ title <> " " <> abstr) Nothing [])
<$> concat
<$> mapConcurrently (\file ->
filter (\d -> (isJust $ _hd_publication_year d)
&& (isJust $ _hd_title d))
<$> parseFile' file) files
-- To transform a Csv file into a list of Document
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO ([Document])
csvToDocs :: CorpusParser -> Patterns -> TimeUnit -> FilePath -> IO [Document]
csvToDocs parser patterns time path =
case parser of
Wos _ -> undefined
Csv limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) <$> snd <$> Csv.readFile path
Csv limit -> do
eR <- Csv.readFile path
case eR of
Right r ->
pure $ Vector.toList
$ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
) $ snd r
Left e -> panic $ "Error: " <> (pack e)
Csv' limit -> Vector.toList
<$> Vector.take limit
<$> Vector.map (\row -> Document (toPhyloDate (csv'_publication_year row) (csv'_publication_month row) (csv'_publication_day row) time)
......
......@@ -15,6 +15,7 @@ compress the contexts around the main terms of the query.
module CleanCsvCorpus where
--import GHC.IO (FilePath)
import Data.Either (Either(..))
import Data.SearchEngine as S
import qualified Data.Set as S
import Data.Text (pack)
......@@ -39,17 +40,19 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- CSV.readFile rPath
eDocs <- CSV.readFile rPath
case eDocs of
Right (h, csvDocs) -> do
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
let docs = CSV.toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
let docs = CSV.toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
CSV.writeFile wPath (h, docs')
CSV.writeFile wPath (h, docs')
Left e -> panic $ "Error: " <> (pack e)
......@@ -17,30 +17,24 @@ Main specifications to index a corpus with a term list
module Main where
import Data.ByteString.Lazy (writeFile)
import Data.Text (pack)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import Data.Map (Map)
import qualified Data.Map as DM
import GHC.Generics
import Data.Aeson
import Data.Text (Text)
import Data.ByteString.Lazy (writeFile)
import Data.Either (Either(..))
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
import Data.Map (Map)
import qualified Data.Map as DM
import Data.Text (pack, Text)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import GHC.Generics
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Concurrent (getNumCapabilities, myThreadId, threadCapability)
import Gargantext.Prelude
import Gargantext.Core
......@@ -48,7 +42,7 @@ import Gargantext.Core.Types
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
......@@ -92,22 +86,25 @@ main = do
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
corpus <- DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd
<$> readFile corpusFile
eCorpusFile <- readFile corpusFile
case eCorpusFile of
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (unIntOrDec $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
termList <- csvMapTermList termListFile
-- termListMap :: [Text]
termList <- csvMapTermList termListFile
putStrLn $ show $ length termList
putStrLn $ show $ length termList
let patterns = buildPatterns termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r)
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r)
Left e -> panic $ "Error: " <> (pack e)
......
......@@ -24,6 +24,16 @@ import Data.Maybe
import Data.Text (Text, unwords)
import GHC.Generics
import GHC.IO (FilePath)
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Prelude
import Gargantext.Core.Text.Context (TermList)
......@@ -36,15 +46,6 @@ import Gargantext.Core.Viz.Phylo.LevelMaker
import Gargantext.Core.Viz.Phylo.Tools
import Gargantext.Core.Viz.Phylo.View.Export
import Gargantext.Core.Viz.Phylo.View.ViewMaker
import System.Directory (doesFileExist)
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.List as DL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Vector as DV
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
import qualified Prelude as P
--------------
......
......@@ -21,7 +21,7 @@ services:
ports:
- 8081:80
environment:
PGADMIN_DEFAULT_EMAIL: admin@localhost
PGADMIN_DEFAULT_EMAIL: admin@localhost.lan
PGADMIN_DEFAULT_PASSWORD: admin
depends_on:
......
This diff is collapsed.
......@@ -12,6 +12,8 @@ rec {
git
gmp
gsl
haskell-language-server
hlint
igraph
liblapack
lzma
......
......@@ -188,6 +188,7 @@ library:
- random
- rdf4h
- regex-compat
- regex-tdfa
- resource-pool
- resourcet
- safe
......@@ -223,6 +224,7 @@ library:
- transformers
- transformers-base
- unordered-containers
- utf8-string
- uuid
- validity
- vector
......
......@@ -101,6 +101,8 @@ data JobLog = JobLog
}
deriving (Show, Generic)
makeLenses ''JobLog
instance Arbitrary JobLog where
arbitrary = JobLog
<$> arbitrary
......
......@@ -166,7 +166,7 @@ newEnv port file = do
when (port /= settings' ^. appPort) $
panic "TODO: conflicting settings of port"
config' <- readConfig file
config' <- readConfig file
self_url <- parseBaseUrl $ "http://0.0.0.0:" <> show port
dbParam <- databaseParameters file
pool <- newPool dbParam
......
module Gargantext.API.Job where
import Control.Lens (over, _Just)
import Data.IORef
import Data.Maybe
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.API.Admin.Orchestrator.Types
jobLogInit :: Int -> JobLog
......@@ -16,25 +17,29 @@ jobLogInit rem =
, _scst_events = Just [] }
jobLogSuccess :: JobLog -> JobLog
jobLogSuccess (JobLog { _scst_succeeded = mSucc
, _scst_remaining = mRem
, _scst_failed = mFail
, _scst_events = evt }) =
JobLog { _scst_succeeded = (+ 1) <$> mSucc
, _scst_remaining = (\x -> x - 1) <$> mRem
, _scst_failed = mFail
, _scst_events = evt }
jobLogSuccess jl = over (scst_succeeded . _Just) (+ 1) $
over (scst_remaining . _Just) (\x -> x - 1) jl
jobLogFail :: JobLog -> JobLog
jobLogFail (JobLog { _scst_succeeded = mSucc
, _scst_remaining = mRem
, _scst_failed = mFail
, _scst_events = evt }) =
jobLogFail jl = over (scst_failed . _Just) (+ 1) $
over (scst_remaining . _Just) (\x -> x - 1) jl
jobLogFailTotal :: JobLog -> JobLog
jobLogFailTotal (JobLog { _scst_succeeded = mSucc
, _scst_remaining = mRem
, _scst_failed = mFail
, _scst_events = evt }) =
JobLog { _scst_succeeded = mSucc
, _scst_remaining = (\x -> x - 1) <$> mRem
, _scst_failed = (+ 1) <$> mFail
, _scst_remaining = newRem
, _scst_failed = newFail
, _scst_events = evt }
where
(newRem, newFail) = case mRem of
Nothing -> (Nothing, mFail)
Just rem -> (Just 0, (+ rem) <$> mFail)
jobLogEvt :: JobLog -> ScraperEvent -> JobLog
jobLogEvt jl evt = over (scst_events . _Just) (\evts -> (evt:evts)) jl
runJobLog :: MonadBase IO m => Int -> (JobLog -> m ()) -> m (m (), m (), m JobLog)
runJobLog num logStatus = do
......
......@@ -46,6 +46,7 @@ import Gargantext.API.Metrics
import Gargantext.API.Ngrams (TableNgramsApi, apiNgramsTableCorpus)
import Gargantext.API.Ngrams.Types (TabType(..))
import Gargantext.API.Node.File
import Gargantext.API.Node.FrameCalcUpload (FrameCalcUploadAPI, frameCalcUploadAPI)
import Gargantext.API.Node.New
import Gargantext.API.Prelude
import Gargantext.API.Table
......@@ -123,6 +124,7 @@ type NodeAPI a = Get '[JSON] (Node a)
:<|> "rename" :> RenameApi
:<|> PostNodeApi -- TODO move to children POST
:<|> PostNodeAsync
:<|> FrameCalcUploadAPI
:<|> ReqBody '[JSON] a :> Put '[JSON] Int
:<|> "update" :> Update.API
:<|> Delete '[JSON] Int
......@@ -205,6 +207,7 @@ nodeAPI p uId id' = withAccess (Proxy :: Proxy (NodeAPI a)) Proxy uId (PathNode
:<|> rename id'
:<|> postNode uId id'
:<|> postNodeAsyncAPI uId id'
:<|> frameCalcUploadAPI uId id'
:<|> putNode id'
:<|> Update.api uId id'
:<|> Action.deleteNode (RootId $ NodeId uId) id'
......
......@@ -19,6 +19,11 @@ module Gargantext.API.Node.Corpus.Export
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import Data.Set (Set)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.HashMap.Strict as HashMap
import Gargantext.API.Node.Corpus.Export.Types
import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Tools (filterListWithRoot, mapTermListRoot, getRepo)
......@@ -36,10 +41,6 @@ import Gargantext.Database.Query.Table.NodeNode (selectDocNodes)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Schema.Node (_node_id, _node_hyperdata)
import Gargantext.Prelude
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.HashMap.Strict as HashMap
--------------------------------------------------
-- | Hashes are ordered by Set
......
......@@ -27,6 +27,8 @@ import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import qualified Prelude as Prelude
import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
......@@ -36,25 +38,27 @@ import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Database.Action.Mail (sendMail)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
------------------------------------------------------------------------
{-
......@@ -125,28 +129,11 @@ info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
------------------------------------------------------------------------
data Database = Empty
| PubMed
| HAL
| IsTex
| Isidore
deriving (Eq, Show, Generic)
deriveJSON (unPrefix "") ''Database
instance ToSchema Database
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
------------------------------------------------------------------------
data WithQuery = WithQuery
{ _wq_query :: !Text
, _wq_databases :: !Database
, _wq_datafield :: !Datafield
, _wq_lang :: !Lang
, _wq_node_id :: !Int
}
......@@ -190,36 +177,51 @@ addToCorpusWithQuery :: FlowCmdM env err m
-> Maybe Integer
-> (JobLog -> m ())
-> m JobLog
addToCorpusWithQuery user cid (WithQuery q dbs l _nid) maybeLimit logStatus = do
addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
-- TODO ...
logStatus JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
, _scst_remaining = Just 5
, _scst_events = Just []
}
printDebug "addToCorpusWithQuery" (cid, dbs)
-- TODO add cid
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
logStatus JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 1
, _scst_events = Just []
}
cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
printDebug "corpus id" cids
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_remaining = Just 3
, _scst_events = Just []
}
printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
printDebug "[addToCorpusWithQuery] datafield" datafield
case datafield of
Web -> do
printDebug "[addToCorpusWithQuery] processing web request" datafield
_ <- triggerSearxSearch cid q l
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
_ -> do
-- TODO add cid
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
logStatus JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 1
, _scst_events = Just []
}
cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
printDebug "corpus id" cids
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
-- TODO ...
pure JobLog { _scst_succeeded = Just 3
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
......@@ -235,16 +237,12 @@ addToCorpusWithForm :: FlowCmdM env err m
-> CorpusId
-> NewWithForm
-> (JobLog -> m ())
-> JobLog
-> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] Parsing corpus: " cid
printDebug "[addToCorpusWithForm] fileType" ft
logStatus JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
, _scst_remaining = Just 2
, _scst_events = Just []
}
logStatus jobLog
let
parse = case ft of
CSV_HAL -> Parser.parseFormat Parser.CsvHal
......@@ -253,34 +251,41 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
PresseRIS -> Parser.parseFormat Parser.RisPresse
-- TODO granularity of the logStatus
docs <- liftBase $ splitEvery 500
<$> take 1000000
<$> parse (cs d)
printDebug "Parsing corpus finished : " cid
logStatus JobLog { _scst_succeeded = Just 1
, _scst_failed = Just 0
, _scst_remaining = Just 1
, _scst_events = Just []
}
printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
_cid' <- flowCorpus user
(Right [cid])
(Multi $ fromMaybe EN l)
(map (map toHyperdataDocument) docs)
printDebug "Extraction finished : " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
pure JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
eDocs <- liftBase $ parse $ cs d
case eDocs of
Right docs' -> do
let docs = splitEvery 500 $ take 1000000 docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
printDebug "Starting extraction : " cid
-- TODO granularity of the logStatus
_cid' <- flowCorpus user
(Right [cid])
(Multi $ fromMaybe EN l)
(map (map toHyperdataDocument) docs)
printDebug "Extraction finished : " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
logStatus jobLog3
pure $ jobLog3
Left e -> do
printDebug "Error" e
logStatus jobLogE
pure jobLogE
where
jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog
parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
parseCsvGargV3Path fp = do
contents <- readFile fp
Parser.parseFormat Parser.CsvGargV3 $ cs contents
{-
addToCorpusWithFile :: FlowCmdM env err m
......
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Node.Corpus.Searx where
import Control.Lens (view)
import qualified Data.Aeson as Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import qualified Prelude as Prelude
import Protolude (encodeUtf8, Text, Either)
import Gargantext.Prelude
import Gargantext.Prelude.Config
import Gargantext.Core (Lang(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (FlowCmdM)
import Gargantext.Database.Admin.Types.Node (CorpusId)
import Gargantext.Database.Prelude (hasConfig)
data SearxResult = SearxResult
{ _sr_url :: Text
, _sr_title :: Text
, _sr_content :: Maybe Text
, _sr_engine :: Text
, _sr_score :: Double
, _sr_category :: Text
, _sr_pretty_url :: Text }
deriving (Show, Eq, Generic)
-- , _sr_parsed_url
-- , _sr_engines
-- , _sr_positions
$(deriveJSON (unPrefix "_sr_") ''SearxResult)
data SearxResponse = SearxResponse
{ _srs_query :: Text
, _srs_number_of_results :: Int
, _srs_results :: [SearxResult] }
deriving (Show, Eq, Generic)
-- , _srs_answers
-- , _srs_corrections
-- , _srs_infoboxes
-- , _srs_suggestions :: [Text]
-- , _srs_unresponsive_engines :: [Text] }
$(deriveJSON (unPrefix "_srs_") ''SearxResponse)
data FetchSearxParams = FetchSearxParams
{ _fsp_manager :: Manager
, _fsp_pageno :: Int
, _fsp_query :: Text
, _fsp_url :: Text
}
fetchSearxPage :: FetchSearxParams -> IO (Either Prelude.String SearxResponse)
fetchSearxPage (FetchSearxParams { _fsp_manager
, _fsp_pageno
, _fsp_query
, _fsp_url }) = do
-- searx search API:
-- https://searx.github.io/searx/dev/search_api.html?highlight=json
req <- parseRequest $ T.unpack _fsp_url
let request = urlEncodedBody
[ --("category_general", "1")
("q", encodeUtf8 _fsp_query)
, ("pageno", encodeUtf8 $ T.pack $ show _fsp_pageno)
--, ("time_range", "None")
--, ("language", "en-US") -- TODO
, ("format", "json")
] req
res <- httpLbs request _fsp_manager
let dec = Aeson.eitherDecode $ responseBody res :: (Either Prelude.String SearxResponse)
pure dec
triggerSearxSearch :: (MonadBase IO m, FlowCmdM env err m)
=> CorpusId
-> API.Query
-> Lang
-> m ()
triggerSearxSearch cid q l = do
printDebug "[triggerSearxSearch] cid" cid
printDebug "[triggerSearxSearch] q" q
printDebug "[triggerSearxSearch] l" l
cfg <- view hasConfig
let surl = _gc_frame_searx_url cfg
printDebug "[triggerSearxSearch] surl" surl
manager <- liftBase $ newManager tlsManagerSettings
res <- liftBase $ fetchSearxPage $ FetchSearxParams { _fsp_manager = manager
, _fsp_pageno = 1
, _fsp_query = q
, _fsp_url = surl }
printDebug "[triggerSearxSearch] res" res
pure ()
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Node.Corpus.Types where
import Control.Lens hiding (elements, Empty)
import Control.Monad.Fail (fail)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import Data.Monoid (mempty)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Text.Regex.TDFA ((=~))
import Protolude ((++))
import Gargantext.Prelude
import qualified Gargantext.API.Admin.Orchestrator.Types as T
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Action.Flow (DataOrigin(..))
data Database = Empty
| PubMed
| HAL
| IsTex
| Isidore
deriving (Eq, Show, Generic)
deriveJSON (unPrefix "") ''Database
instance ToSchema Database
database2origin :: Database -> DataOrigin
database2origin Empty = InternalOrigin T.IsTex
database2origin PubMed = ExternalOrigin T.PubMed
database2origin HAL = ExternalOrigin T.HAL
database2origin IsTex = ExternalOrigin T.IsTex
database2origin Isidore = ExternalOrigin T.Isidore
------------------------------------------------------------------------
data Datafield = Gargantext
| External (Maybe Database)
| Web
| Files
deriving (Eq, Show, Generic)
instance FromJSON Datafield where
parseJSON = withText "Datafield" $ \text ->
case text of
"Gargantext" -> pure Gargantext
"Web" -> pure Web
"Files" -> pure Files
v ->
let (preExternal, _, postExternal) = v =~ ("External " :: Text) :: (Text, Text, Text)
in
if preExternal == "" then do
db <- parseJSON $ String postExternal
pure $ External db
else fail $ "Cannot match patterh 'External <db>' for string " ++ (T.unpack v)
instance ToJSON Datafield where
toJSON (External db) = toJSON $ "External " ++ (show db)
toJSON s = toJSON $ show s
instance ToSchema Datafield where
declareNamedSchema _ = do
return $ NamedSchema (Just "Datafield") $ mempty
& type_ ?~ SwaggerObject
......@@ -9,6 +9,14 @@ import Control.Lens ((^.))
import Data.Swagger
import Data.Text
import GHC.Generics (Generic)
import Servant
import Servant.Job.Async (JobFunction(..), serveJobsAPI)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BSL
import qualified Data.MIME.Types as DMT
import qualified Gargantext.Database.GargDB as GargDB
import qualified Network.HTTP.Media as M
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Node.Types
......@@ -22,13 +30,6 @@ import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Servant
import Servant.Job.Async (JobFunction(..), serveJobsAPI)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BSL
import qualified Data.MIME.Types as DMT
import qualified Gargantext.Database.GargDB as GargDB
import qualified Network.HTTP.Media as M
data RESPONSE deriving Typeable
......
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE MonoLocalBinds #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Node.FrameCalcUpload where
import Control.Lens ((^.))
import Data.Aeson
import qualified Data.ByteString.Lazy as BSL
import qualified Data.ByteString.UTF8 as BSU8
import Data.Swagger
import qualified Data.Text as T
import GHC.Generics (Generic)
import Network.HTTP.Client (newManager, httpLbs, parseRequest, responseBody)
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Servant
import Servant.Job.Async
import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Job (jobLogInit, jobLogSuccess, jobLogFail)
import Gargantext.API.Node.Corpus.New (addToCorpusWithForm)
import Gargantext.API.Node.Corpus.New.File (FileType(..))
import Gargantext.API.Node.Types (NewWithForm(..))
import Gargantext.API.Prelude
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getClosestParentIdByType, getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
data FrameCalcUpload = FrameCalcUpload ()
deriving (Generic)
instance FromForm FrameCalcUpload
instance FromJSON FrameCalcUpload
instance ToJSON FrameCalcUpload
instance ToSchema FrameCalcUpload
type FrameCalcUploadAPI = Summary " FrameCalc upload"
:> "add"
:> "framecalc"
:> "async"
:> AsyncJobs JobLog '[JSON] FrameCalcUpload JobLog
frameCalcUploadAPI :: UserId -> NodeId -> GargServer FrameCalcUploadAPI
frameCalcUploadAPI uId nId =
serveJobsAPI $
JobFunction (\p logs ->
frameCalcUploadAsync uId nId p (liftBase . logs) (jobLogInit 5)
)
frameCalcUploadAsync :: FlowCmdM env err m
=> UserId
-> NodeId
-> FrameCalcUpload
-> (JobLog -> m ())
-> JobLog
-> m JobLog
frameCalcUploadAsync uId nId _f logStatus jobLog = do
logStatus jobLog
-- printDebug "[frameCalcUploadAsync] uId" uId
-- printDebug "[frameCalcUploadAsync] nId" nId
node <- getNodeWith nId (Proxy :: Proxy HyperdataFrame)
let (HyperdataFrame { _hf_base = base
, _hf_frame_id = frame_id }) = node ^. node_hyperdata
let csvUrl = base <> "/" <> frame_id <> ".csv"
-- printDebug "[frameCalcUploadAsync] csvUrl" csvUrl
res <- liftBase $ do
manager <- newManager tlsManagerSettings
req <- parseRequest $ T.unpack csvUrl
httpLbs req manager
let body = T.pack $ BSU8.toString $ BSL.toStrict $ responseBody res
mCId <- getClosestParentIdByType nId NodeCorpus
-- printDebug "[frameCalcUploadAsync] mCId" mCId
jobLog2 <- case mCId of
Nothing -> pure $ jobLogFail jobLog
Just cId ->
addToCorpusWithForm (RootId (NodeId uId)) cId (NewWithForm CSV body Nothing "calc-upload.csv") logStatus jobLog
pure $ jobLogSuccess jobLog2
......@@ -24,6 +24,12 @@ import Data.Aeson
import Data.Swagger
import Data.Text (Text)
import GHC.Generics (Generic)
import Servant
import Servant.Job.Async
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Prelude
import Gargantext.Database.Action.Flow.Types
......@@ -34,11 +40,6 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.Node.User
import Gargantext.Database.Schema.Node
import Gargantext.Prelude
import Servant
import Servant.Job.Async
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Web.FormUrlEncoded (FromForm)
------------------------------------------------------------------------
data PostNode = PostNode { pn_name :: Text
......
......@@ -27,10 +27,24 @@ import Control.Concurrent (threadDelay)
import Control.Lens (view)
import Data.Text (Text)
import Data.Validity
import Servant
import Servant.Auth as SA
import Servant.Auth.Swagger ()
import Servant.Job.Async
import Servant.Swagger.UI
import qualified Gargantext.API.Ngrams.List as List
import qualified Gargantext.API.Node.Contact as Contact
import qualified Gargantext.API.Node.Corpus.Annuaire as Annuaire
import qualified Gargantext.API.Node.Corpus.Export as Export
import qualified Gargantext.API.Node.Corpus.Export.Types as Export
import qualified Gargantext.API.Node.Corpus.New as New
import qualified Gargantext.API.Public as Public
import Gargantext.API.Admin.Auth.Types (AuthRequest, AuthResponse, AuthenticatedUser(..), PathId(..))
import Gargantext.API.Admin.Auth (withAccess)
import Gargantext.API.Admin.FrontEnd (FrontEndAPI)
import Gargantext.API.Count (CountAPI, count, Query)
import Gargantext.API.Job (jobLogInit)
import Gargantext.API.Ngrams (TableNgramsApi, apiNgramsTableDoc)
import Gargantext.API.Node
import Gargantext.API.Prelude
......@@ -41,18 +55,7 @@ import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_scrapers)
import Servant
import Servant.Auth as SA
import Servant.Auth.Swagger ()
import Servant.Job.Async
import Servant.Swagger.UI
import qualified Gargantext.API.Ngrams.List as List
import qualified Gargantext.API.Node.Contact as Contact
import qualified Gargantext.API.Node.Corpus.Annuaire as Annuaire
import qualified Gargantext.API.Node.Corpus.Export as Export
import qualified Gargantext.API.Node.Corpus.Export.Types as Export
import qualified Gargantext.API.Node.Corpus.New as New
import qualified Gargantext.API.Public as Public
type GargAPI = "api" :> Summary "API " :> GargAPIVersion
-- | TODO :<|> Summary "Latest API" :> GargAPI'
......@@ -282,9 +285,9 @@ addCorpusWithForm user cid =
JobFunction (\i log' ->
let
log'' x = do
printDebug "addToCorpusWithForm" x
printDebug "[addToCorpusWithForm] " x
liftBase $ log' x
in New.addToCorpusWithForm user cid i log'')
in New.addToCorpusWithForm user cid i log'' (jobLogInit 3))
addCorpusWithFile :: User -> GargServer New.AddWithFile
addCorpusWithFile user cid =
......@@ -292,7 +295,7 @@ addCorpusWithFile user cid =
JobFunction (\i log' ->
let
log'' x = do
printDebug "addToCorpusWithFile" x
printDebug "[addToCorpusWithFile]" x
liftBase $ log' x
in New.addToCorpusWithFile user cid i log'')
......
......@@ -11,14 +11,17 @@ Portability : POSIX
module Gargantext.Core.Ext.IMT where
import Gargantext.Prelude
import Data.Text (Text, splitOn)
import Data.Either (Either(..))
import Data.Map (Map)
import Data.Text (Text, splitOn)
import qualified Data.Set as S
import qualified Data.List as DL
import qualified Data.Vector as DV
import qualified Data.Map as M
import qualified Prelude as Prelude
import Gargantext.Prelude
import Gargantext.Core.Text.Metrics.Utils as Utils
import Gargantext.Core.Text.Corpus.Parsers.CSV as CSV
......@@ -98,8 +101,10 @@ schools = [ School
mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
hal_data :: IO (DV.Vector CsvHal)
hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
hal_data :: IO (Either Prelude.String (DV.Vector CsvHal))
hal_data = do
r <- CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
pure $ snd <$> r
names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
......
......@@ -16,15 +16,21 @@ Format Converter.
module Gargantext.Core.Text.Convert (risPress2csvWrite)
where
import Data.Either (Either(..))
import qualified Data.Text as T
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (writeDocs2Csv)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..))
risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = parseFile RisPresse (f <> ".ris")
>>= \hs -> writeDocs2Csv (f <> ".csv") hs
risPress2csvWrite f = do
eContents <- parseFile RisPresse (f <> ".ris")
case eContents of
Right contents -> writeDocs2Csv (f <> ".csv") contents
Left e -> panic $ "Error: " <> (T.pack e)
......@@ -36,18 +36,20 @@ import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude as Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
......@@ -75,30 +77,40 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = mapM (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
parseFormat WOS bs = mapM (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
parseFormat RisPresse bs = do
docs <- mapM (toDoc RIS)
<$> snd
<$> enrichWith RisPresse
$ partitionEithers
$ [runParser' RisPresse bs]
pure $ Right docs
parseFormat WOS bs = do
docs <- mapM (toDoc WOS)
<$> snd
<$> enrichWith WOS
$ partitionEithers
$ [runParser' WOS bs]
pure $ Right docs
parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
parseFile RisPresse p = do
docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
pure $ Right docs
parseFile WOS p = do
docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
pure $ Right docs
parseFile ff p = do
docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
pure $ Right docs
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
......
......@@ -19,7 +19,7 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Either (Either(..))
import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour)
import qualified Data.Vector as V
......@@ -27,6 +27,8 @@ import Data.Vector (Vector)
import GHC.IO (FilePath)
import GHC.Word (Word8)
import qualified Prelude as Prelude
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Core.Text
......@@ -83,7 +85,7 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
-- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
$ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
-> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
......@@ -94,7 +96,7 @@ toDocs v = V.toList
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
......@@ -137,10 +139,21 @@ docsSize csvDoc = mean ls
---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
deriving (Show, Eq, Read)
unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i
instance FromField IntOrDec where
parseField s = case runParser (parseField s :: Parser Int) of
Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
Right n -> pure $ IntOrDec n
instance ToField IntOrDec where
toField (IntOrDec i) = toField i
data CsvDoc = CsvDoc
{ csv_title :: !Text
, csv_source :: !Text
, csv_publication_year :: !Int
, csv_publication_year :: !IntOrDec
, csv_publication_month :: !Int
, csv_publication_day :: !Int
, csv_abstract :: !Text
......@@ -149,13 +162,13 @@ data CsvDoc = CsvDoc
deriving (Show)
instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
<*> (r .: "source" <|> r .: "Source")
<*> (r .: "publication_year" <|> r .: "Publication Year")
<*> (r .: "publication_month" <|> r .: "Publication Month")
<*> (r .: "publication_day" <|> r .: "Publication Day")
<*> (r .: "abstract" <|> r .: "Abstract")
<*> (r .: "authors" <|> r .: "Authors")
instance ToNamedRecord CsvDoc where
toNamedRecord (CsvDoc t s py pm pd abst aut) =
......@@ -171,7 +184,7 @@ instance ToNamedRecord CsvDoc where
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
(m $ _hd_source h)
(mI $ _hd_publication_year h)
(IntOrDec $ mI $ _hd_publication_year h)
(mI $ _hd_publication_month h)
(mI $ _hd_publication_day h)
(m $ _hd_abstract h)
......@@ -192,52 +205,47 @@ delimiter :: Word8
delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn' fields fp = V.toList
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readFile fp
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = do
r <- readFile fp
pure $ ( V.toList
. V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
. snd ) <$> r
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal = fmap readCsvHalLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
------------------------------------------------------------------------
......@@ -360,7 +368,7 @@ csvHal2doc (CsvHal title source
csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc title source
pub_year pub_month pub_day
(IntOrDec pub_year) pub_month pub_day
abstract authors ) = HyperdataDocument (Just "CsvHal")
Nothing
Nothing
......@@ -382,18 +390,22 @@ csv2doc (CsvDoc title source
Nothing
------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseHal fp = do
r <- readCsvHal fp
pure $ (V.toList . V.map csvHal2doc . snd) <$> r
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
------------------------------------------------------------------------
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv fp = do
r <- readFile fp
pure $ (V.toList . V.map csv2doc . snd) <$> r
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
......@@ -425,4 +437,4 @@ readWeightedCsv fp =
case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right corpus -> corpus
) $ BL.readFile fp
\ No newline at end of file
) $ BL.readFile fp
......@@ -69,7 +69,7 @@ data TermType lang
, _tt_ngramsSize :: !Int
, _tt_model :: !(Maybe (Tries Token ()))
}
deriving Generic
deriving (Generic)
makeLenses ''TermType
--group :: [Text] -> [Text]
......
......@@ -23,9 +23,6 @@ partitionWithKey :: (Ord a, Hashable k) => (k -> a -> Bool) -> HashMap k a -> (H
partitionWithKey p m = (HashMap.filterWithKey p m, HashMap.filterWithKey (\k -> not . p k) m)
mapKeys :: (Ord k2, Hashable k2) => (k1->k2) -> HashMap k1 a -> HashMap k2 a
mapKeys f = HashMap.fromList . HashMap.foldrWithKey (\k x xs -> (f k, x) : xs) []
------------------------------------------------------------------------
-- getKeyWithMaxValue :: Hashable k => HashMap k a -> Maybe k
getKeysOrderedByValueMaxFirst :: (Ord k, Hashable k, Ord a) => HashMap k a -> [k]
......
......@@ -56,7 +56,7 @@ import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import Data.Text (splitOn)
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
......@@ -178,11 +178,12 @@ flowCorpusFile :: (FlowCmdM env err m)
-> TermType Lang -> FileFormat -> FilePath
-> m CorpusId
flowCorpusFile u n l la ff fp = do
docs <- liftBase ( splitEvery 500
<$> take l
<$> parseFile ff fp
)
flowCorpus u n la (map (map toHyperdataDocument) docs)
eParsed <- liftBase $ parseFile ff fp
case eParsed of
Right parsed -> do
let docs = splitEvery 500 $ take l parsed
flowCorpus u n la (map (map toHyperdataDocument) docs)
Left e -> panic $ "Error: " <> (T.pack e)
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
......@@ -425,11 +426,11 @@ instance ExtractNgramsT HyperdataDocument
$ _hd_source doc
institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
$ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
$ _hd_institutes doc
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn ", ")
$ maybe ["Nothing"] (T.splitOn ", ")
$ _hd_authors doc
terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
......
......@@ -31,7 +31,6 @@ import Database.PostgreSQL.Simple (Connection, connect)
import Database.PostgreSQL.Simple.FromField ( Conversion, ResultError(ConversionFailed), fromField, returnError)
import Database.PostgreSQL.Simple.Internal (Field)
import Gargantext.Prelude
import Gargantext.Prelude.Config (GargConfig())
import Opaleye (Query, Unpackspec, showSqlForPostgres, FromFields, Select, runQuery, PGJsonb, QueryRunnerColumnDefault)
import Opaleye.Aggregate (countRows)
import System.IO (FilePath)
......@@ -41,6 +40,8 @@ import qualified Data.ByteString as DB
import qualified Data.List as DL
import qualified Database.PostgreSQL.Simple as PGS
import Gargantext.Prelude.Config (GargConfig())
-------------------------------------------------------
class HasConnectionPool env where
connPool :: Getter env (Pool Connection)
......
......@@ -121,7 +121,7 @@ getClosestParentIdByType :: HasDBid NodeType
getClosestParentIdByType nId nType = do
result <- runPGSQuery query (nId, 0 :: Int)
case result of
[DPS.Only parentId, DPS.Only pTypename] -> do
[(NodeId parentId, pTypename)] -> do
if toDBid nType == pTypename then
pure $ Just $ NodeId parentId
else
......@@ -131,7 +131,7 @@ getClosestParentIdByType nId nType = do
query :: DPS.Query
query = [sql|
SELECT n2.id, n2.typename
FROM nodes n1
FROM nodes n1
JOIN nodes n2 ON n1.parent_id = n2.id
WHERE n1.id = ? AND 0 = ?;
|]
......
resolver:
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/17/10.yaml
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/4.yaml
flags: {}
extra-package-dbs: []
packages:
......@@ -97,6 +97,7 @@ extra-deps:
- json-stream-0.4.2.4@sha256:8b7f17d54a6e1e6311756270f8bcf51e91bab4300945400de66118470dcf51b9,4716
- located-base-0.1.1.1@sha256:7c6395f2b6fbf2d5f76c3514f774423838c0ea94e1c6a5530dd3c94b30c9d1c8,1904
- logging-effect-1.3.12@sha256:72d168dd09887649ba9501627219b6027cbec2d5541931555b7885b133785ce3,1679
- MissingH-1.4.3.0@sha256:32f9892ec98cd21df4f4d3ed8d95a3831ae74287ea0641d6f09b2dc6ef061d39,4859
- monoid-extras-0.5.1@sha256:438dbfd7b4dce47d8f0ca577f56caf94bd1e21391afa545cad09fe7cf2e5793d,2333
- rake-0.0.1@sha256:3380f6567fb17505d1095b7f32222c0b631fa04126ad39726c84262da99c08b3,2025
- servant-cassava-0.10.1@sha256:07e7b6ca67cf57dcb4a0041a399a25d058844505837c6479e01d62be59d01fdf,1665
......@@ -108,4 +109,4 @@ extra-deps:
- xmlbf-xeno-0.2@sha256:39f70fced6052524c290cf595f114661c721452e65fc3e0953a44e7682a6a6b0,950
# need Vector.uncons
- vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953
\ No newline at end of file
- vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment