Commit d85b1128 authored by Grégoire Locqueville

Remove unimported modules

The removed modules are literally not imported anywhere, so removing them has no effect.
I may have missed some.
parent 0c74d722
@@ -185,7 +185,6 @@ library
     Gargantext.Core.Text.Corpus.Parsers
     Gargantext.Core.Text.Corpus.Parsers.TSV
     Gargantext.Core.Text.Corpus.Parsers.Date
-    Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
     Gargantext.Core.Text.Corpus.Query
     Gargantext.Core.Text.List
     Gargantext.Core.Text.List.Group.WithStem
@@ -195,8 +194,6 @@ library
     Gargantext.Core.Text.Metrics.Count
     Gargantext.Core.Text.Metrics.TFICF
     Gargantext.Core.Text.Ngrams
-    Gargantext.Core.Text.Prepare
-    Gargantext.Core.Text.Search
     Gargantext.Core.Text.Terms
     Gargantext.Core.Text.Terms.Eleve
     Gargantext.Core.Text.Terms.Mono
@@ -216,7 +213,6 @@ library
     Gargantext.Core.Types.Query
     Gargantext.Core.Utils
     Gargantext.Core.Utils.Prefix
-    Gargantext.Core.Viz.Graph
     Gargantext.Core.Viz.Graph.Index
     Gargantext.Core.Viz.Graph.Tools
     Gargantext.Core.Viz.Graph.Tools.IGraph
@@ -273,13 +269,9 @@ library
   other-modules:
     Gargantext.API.Admin.Auth
     Gargantext.API.Admin.FrontEnd
-    Gargantext.API.Admin.Orchestrator
-    Gargantext.API.Admin.Orchestrator.Scrapy.Schedule
-    Gargantext.API.Admin.Utils
     Gargantext.API.Context
     Gargantext.API.Count
     Gargantext.API.EKG
-    Gargantext.API.Flow
     Gargantext.API.GraphQL
     Gargantext.API.GraphQL.Annuaire
     Gargantext.API.GraphQL.AsyncTask
@@ -303,7 +295,6 @@ library
     Gargantext.API.Node.Corpus.Annuaire
     Gargantext.API.Node.Corpus.Export
     Gargantext.API.Node.Corpus.Export.Types
-    Gargantext.API.Node.Corpus.New.File
     Gargantext.API.Node.Corpus.New.Types
     Gargantext.API.Node.Corpus.Searx
     Gargantext.API.Node.Document.Export
@@ -337,41 +328,29 @@ library
     Gargantext.Core.Flow.Ngrams
     Gargantext.Core.Flow.Types
     Gargantext.Core.Mail
-    Gargantext.Core.Methods.Graph.BAC.Proxemy
     Gargantext.Core.Methods.Graph.MaxClique
     Gargantext.Core.Methods.Matrix.Accelerate.Utils
     Gargantext.Core.Methods.Similarities.Accelerate.Conditional
     Gargantext.Core.Methods.Similarities.Accelerate.Distributional
     Gargantext.Core.Methods.Similarities.Accelerate.SpeGen
-    Gargantext.Core.Methods.Similarities.Distributional
     Gargantext.Core.Statistics
-    Gargantext.Core.Text.Convert
     Gargantext.Core.Text.Corpus.API.Hal
     Gargantext.Core.Text.Corpus.API.Isidore
     Gargantext.Core.Text.Corpus.API.Istex
-    Gargantext.Core.Text.Corpus.Parsers.Book
     Gargantext.Core.Text.Corpus.Parsers.Date.Attoparsec
     Gargantext.Core.Text.Corpus.Parsers.FrameWrite
-    Gargantext.Core.Text.Corpus.Parsers.Gitlab
     Gargantext.Core.Text.Corpus.Parsers.GrandDebat
     Gargantext.Core.Text.Corpus.Parsers.Iramuteq
     Gargantext.Core.Text.Corpus.Parsers.Isidore
     Gargantext.Core.Text.Corpus.Parsers.JSON
     Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
-    Gargantext.Core.Text.Corpus.Parsers.Json2Csv
     Gargantext.Core.Text.Corpus.Parsers.RIS
     Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
-    Gargantext.Core.Text.Corpus.Parsers.Telegram
     Gargantext.Core.Text.Corpus.Parsers.WOS
-    Gargantext.Core.Text.Corpus.Parsers.Wikidata
-    Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
-    Gargantext.Core.Text.Corpus.Parsers.Wikimedia
     Gargantext.Core.Text.Learn
     Gargantext.Core.Text.List.Group
     Gargantext.Core.Text.List.Group.Prelude
     Gargantext.Core.Text.List.Group.WithScores
-    Gargantext.Core.Text.List.Learn
-    Gargantext.Core.Text.List.Merge
     Gargantext.Core.Text.List.Social
     Gargantext.Core.Text.List.Social.Find
     Gargantext.Core.Text.List.Social.Patch
@@ -385,12 +364,10 @@ library
     Gargantext.Core.Text.Samples.FR
     Gargantext.Core.Text.Samples.PL
     Gargantext.Core.Text.Samples.ZH
-    Gargantext.Core.Text.Terms.Mono.Token
     Gargantext.Core.Text.Terms.Mono.Token.En
     Gargantext.Core.Text.Terms.Multi.Group
     Gargantext.Core.Text.Terms.Multi.PosTagging
     Gargantext.Core.Text.Terms.Multi.PosTagging.Types
-    Gargantext.Core.Text.Upload
     Gargantext.Core.Types.Search
     Gargantext.Core.Utils.DateUtils
     Gargantext.Core.Viz
@@ -421,9 +398,7 @@ library
     Gargantext.Database.Action.Metrics.NgramsByContext
     Gargantext.Database.Action.Node
     Gargantext.Database.Action.Share
-    Gargantext.Database.Action.TSQuery
     Gargantext.Database.Admin.Access
-    Gargantext.Database.Admin.Bashql
     Gargantext.Database.Admin.Trigger.ContextNodeNgrams
     Gargantext.Database.Admin.Trigger.Contexts
     Gargantext.Database.Admin.Trigger.NodesContexts
@@ -462,23 +437,18 @@ library
     Gargantext.Database.Query.Table.NodeContext_NodeContext
     Gargantext.Database.Query.Table.NodeNgrams
     Gargantext.Database.Query.Table.NodeNode
-    Gargantext.Database.Query.Table.NodeNodeNgrams
     Gargantext.Database.Query.Tree
     Gargantext.Database.Query.Tree.Error
     Gargantext.Database.Schema.Context
     Gargantext.Database.Schema.ContextNodeNgrams
     Gargantext.Database.Schema.ContextNodeNgrams2
-    Gargantext.Database.Schema.NgramsPostag
     Gargantext.Database.Schema.NodeContext
     Gargantext.Database.Schema.NodeContext_NodeContext
     Gargantext.Database.Schema.NodeNgrams
     Gargantext.Database.Schema.NodeNode
-    Gargantext.Database.Schema.NodeNodeNgrams
-    Gargantext.Database.Schema.NodeNodeNgrams2
     Gargantext.Database.Schema.Prelude
     Gargantext.Database.Types
     Gargantext.Utils.Aeson
-    Gargantext.Utils.JohnSnowNLP
     Gargantext.Utils.Servant
     Gargantext.Utils.UTCTime
     Paths_gargantext
...
{-|
Module : Gargantext.API.Admin.Orchestrator
Description : Jobs Orchestrator
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.API.Admin.Orchestrator where
import Control.Lens hiding (elements)
import Data.Aeson
import Data.ByteString.Lazy.Char8 qualified as LBS
import Gargantext.API.Admin.Orchestrator.Scrapy.Schedule
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.Prelude hiding (to)
import Servant
import Servant.Job.Async
import Servant.Job.Client
callJobScrapy :: (ToJSON e, FromJSON e, FromJSON o, MonadClientJob m)
=> JobServerURL e Schedule o
-> (URL -> Schedule)
-> m o
callJobScrapy jurl schedule = do
progress $ NewTask jurl
out <- view job_output <$>
retryOnTransientFailure (clientCallbackJob' jurl
(fmap (const ()) . scrapySchedule . schedule))
progress $ Finished jurl Nothing
pure out
logConsole :: ToJSON a => a -> IO ()
logConsole = LBS.putStrLn . encode
callScraper :: MonadClientJob m => URL -> ScraperInput -> m JobLog
callScraper url input =
callJobScrapy jurl $ \cb ->
Schedule
{ s_project = "gargantext"
, s_spider = input ^. scin_spider
, s_setting = []
, s_jobid = Nothing
, s_version = Nothing
, s_extra =
[("query", input ^.. scin_query . _Just)
,("user", [input ^. scin_user])
,("corpus", [input ^. scin_corpus . to toUrlPiece])
,("report_every", input ^.. scin_report_every . _Just . to toUrlPiece)
,("limit", input ^.. scin_limit . _Just . to toUrlPiece)
,("url", input ^.. scin_local_file . _Just)
,("count_only", input ^.. scin_count_only . _Just . to toUrlPiece)
,("callback", [toUrlPiece cb])]
}
where
jurl :: JobServerURL JobLog Schedule JobLog
jurl = JobServerURL url Callback
pipeline :: FromJSON e => URL -> ClientEnv -> ScraperInput
-> (e -> IO ()) -> IO JobLog
pipeline scrapyurl client_env input log_status = do
e <- runJobMLog client_env log_status $ callScraper scrapyurl input
either (panicTrace . show) pure e -- TODO throwError
-- TODO integrate to ServerT
-- use:
-- * serveJobsAPI instead of simpleServeJobsAPI
-- * JobFunction instead of simpleJobFunction
-- TODO:
-- * HasSelfUrl or move self_url to settings
-- * HasScrapers or move scrapers to settings
-- * EnvC env
{- NOT USED YET
import Data.Text
import Servant.Job.Server
import Servant.Job.Utils (extendBaseUrl)
import Gargantext.API.Admin.Types
scrapyOrchestrator :: Env -> IO (Server (WithCallbacks ScraperAPI))
scrapyOrchestrator env = do
apiWithCallbacksServer (Proxy :: Proxy ScraperAPI)
defaultSettings (extendBaseUrl ("scraper" :: Text) $ env ^. env_self_url)
(env ^. env_manager) (LogEvent logConsole) $
simpleServeJobsAPI (env ^. env_scrapers) .
simpleJobFunction . pipeline (URL $ env ^. settings . scrapydUrl)
-}
{-|
Module : Gargantext.API.Admin.Orchestrator.Scrapy.Schedule
Description : Server API Auth Module
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Admin.Orchestrator.Scrapy.Schedule
where
import Control.Lens
import Data.Aeson
import GHC.Generics
import Protolude
import Servant
import Servant.Client
import Servant.Job.Utils (jsonOptions)
import Web.FormUrlEncoded hiding (parseMaybe)
import qualified Data.HashMap.Strict as H
------------------------------------------------------------------------
data Schedule = Schedule
{ s_project :: !Text
, s_spider :: !Text
, s_setting :: ![Text]
, s_jobid :: !(Maybe Text)
, s_version :: !(Maybe Text)
, s_extra :: ![(Text,[Text])]
}
deriving (Generic)
data ScheduleResponse = ScheduleResponse
{ r_status :: !Text
, r_jobid :: !Text
}
deriving (Generic)
instance FromJSON ScheduleResponse where
parseJSON = genericParseJSON (jsonOptions "r_")
instance ToForm Schedule where
toForm s =
Form . H.fromList $
[("project", [s_project s])
,("spider", [s_spider s])
,("setting", s_setting s)
,("jobid", s_jobid s ^.. _Just)
,("_version", s_version s ^.. _Just)
] ++ s_extra s
type Scrapy =
"schedule.json" :> ReqBody '[FormUrlEncoded] Schedule
:> Post '[JSON] ScheduleResponse
scrapyAPI :: Proxy Scrapy
scrapyAPI = Proxy
scrapySchedule :: Schedule -> ClientM ScheduleResponse
scrapySchedule = client scrapyAPI
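-- A minimal usage sketch of the generated client (hypothetical scrapyd
-- host and spider name; `newManager` comes from Network.HTTP.Client,
-- which this module does not import itself):
--
-- > mgr <- newManager defaultManagerSettings
-- > let env = mkClientEnv mgr (BaseUrl Http "scrapyd.example.org" 6800 "")
-- > res <- runClientM (scrapySchedule (Schedule "gargantext" "spider" [] Nothing Nothing [])) env
-- > print (r_status <$> res)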
{-|
Module : Gargantext.API.Admin.Utils
Description : Server API main Types
Copyright : (c) CNRS, 2017-Present
License : BSD3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly copied from Servant.Job.Utils (Thanks)
-}
module Gargantext.API.Admin.Utils
where
import Gargantext.Prelude
import Prelude (String)
import qualified Data.Text as T
infixr 4 ?|
-- Reverse infix form of "fromMaybe"
(?|) :: Maybe a -> a -> a
(?|) = flip fromMaybe
infixr 4 ?!
-- Reverse infix form of "fromJust" with a custom error message
(?!) :: Maybe a -> String -> a
(?!) ma' msg = ma' ?| panicTrace (T.pack msg)
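-- A minimal usage sketch of the two operators (doctest-style):
--
-- >>> Just 3 ?| 0
-- 3
-- >>> Nothing ?| 0
-- 0
-- >>> Nothing ?! "missing value"   -- panics with "missing value"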
{-|
Module : Gargantext.API.Flow
Description : Main Flow API DataTypes
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
module Gargantext.API.Flow
where
-- import Gargantext.API.Prelude
import Gargantext.Prelude
data InputFlow = TextsInput
| NgramsInput
| ListInput
data Flow = EndFlow
| Texts InputFlow [Flow]
| Ngrams InputFlow [Flow]
| Lists InputFlow [Flow]
data OutputFlow
flow :: Flow -> OutputFlow
flow = undefined
{-|
Module : Gargantext.API.Node.Corpus.New.File
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Node.Corpus.New.File
where
import Control.Lens ((?~))
import Data.Swagger
import Gargantext.API.Node.Corpus.New.Types (FileFormat, FileType)
import Gargantext.Core.Types (TODO)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Prelude hiding (hash)
import Gargantext.Prelude.Crypto.Hash (hash)
import Servant ( JSON, type (:>), Post, QueryParam, Summary )
import Servant.Multipart ( Input(iName), Mem, MultipartData(inputs), MultipartForm )
import Servant.Swagger.Internal ( addParam, HasSwagger(..) )
-------------------------------------------------------------
type Hash = Text
instance ToParamSchema (MultipartData Mem) where toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
instance (ToParamSchema a, HasSwagger sub) =>
HasSwagger (MultipartForm tag a :> sub) where
-- TODO
toSwagger _ = toSwagger (Proxy :: Proxy sub)
& addParam param
where
param = mempty
& required ?~ True
& schema .~ ParamOther sch
sch = mempty
& in_ .~ ParamFormData
& paramSchema .~ toParamSchema (Proxy :: Proxy a)
type WithUpload' = Summary "Upload file(s) to a corpus"
:> QueryParam "fileType" FileType
:> QueryParam "fileFormat" FileFormat
:> MultipartForm Mem (MultipartData Mem)
:> Post '[JSON] [Hash]
--postUpload :: NodeId -> Maybe FileType -> GargServer UploadAPI
--postUpload :: NodeId -> GargServer UploadAPI
postUpload :: NodeId
-> Maybe FileType
-> Maybe FileFormat
-> MultipartData Mem
-> Cmd err [Hash]
postUpload _ Nothing _ _ = panicTrace "fileType is a required parameter"
postUpload _ _ Nothing _ = panicTrace "fileFormat is a required parameter"
postUpload _ (Just _fileType) (Just _fileFormat) multipartData = do
-- printDebug "File Type: " fileType
-- printDebug "File format: " fileFormat
is <- liftBase $ do
-- printDebug "Inputs:" ()
forM (inputs multipartData) $ \input -> do
-- printDebug "iName " (iName input)
-- printDebug "iValue " (iValue input)
pure $ iName input
{-
_ <- forM (files multipartData) $ \file -> do
-- let content = fdPayload file
-- printDebug "XXX " (fdFileName file)
-- printDebug "YYY " content
pure () -- $ cs content
-- is <- inputs multipartData
-}
pure $ map hash is
-------------------------------------------------------------------
{-|
Module : Gargantext.Core.Methods.Similarities.Distributional
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Motivation and definition of the @Distributional@ distance.
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE Strict #-}
module Gargantext.Core.Methods.Similarities.Distributional
where
import Data.Matrix hiding (identity)
import qualified Data.Map as M
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Core.Viz.Graph.Utils
distributional' :: (Floating a, Ord a) => Matrix a -> [((Int, Int), a)]
distributional' m = filter (\((x,y), d) -> foldl' (&&) True (conditions x y d) ) distriList
where
conditions x y d = [ (x /= y)
, (d > miniMax')
, ((M.lookup (x,y) distriMap) > (M.lookup (y,x) distriMap))
]
distriList = toListsWithIndex distriMatrix
distriMatrix = ri (mi m)
distriMap = M.fromList $ distriList
miniMax' = miniMax distriMatrix
ri :: (Ord a, Fractional a) => Matrix a -> Matrix a
ri m = matrix c r doRi
where
doRi (x,y) = doRi' x y m
doRi' x y mi'' = sumMin x y mi'' / (V.sum $ ax Col x y mi'')
sumMin x y mi' = V.sum $ V.map (\(a,b) -> min a b )
$ V.zip (ax Col x y mi') (ax Row x y mi')
(c,r) = (nOf Col m, nOf Row m)
mi :: (Ord a, Floating a) => Matrix a -> Matrix a
mi m = matrix c r createMat
where
(c,r) = (nOf Col m, nOf Row m)
createMat (x,y) = doMi x y m
doMi x y m' = if x == y then 0 else (max (log (doMi' x y m')) 0 )
doMi' x y m' = (getElem x y m) / ( cross x y m / total m' )
cross x y m' = (V.sum $ ax Col x y m) * (V.sum $ ax Row x y m')
ax :: Axis -> Int -> Int -> Matrix a -> Vector a
ax a i j m = dropAt j' $ axis a i' m
where
i' = div i c + 1
j' = mod r j + 1
(c,r) = (nOf Col m, nOf Row m)
miniMax :: (Ord a) => Matrix a -> a
miniMax m = V.minimum $ V.map (\c -> V.maximum $ getCol c m) (V.enumFromTo 1 (nOf Col m))
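-- A small worked example (doctest-style, using Data.Matrix.fromLists):
-- the column maxima of [[1,2],[3,4]] are 3 and 4, whose minimum is 3.
--
-- >>> miniMax (fromLists [[1,2],[3,4]])
-- 3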
{-|
Module : Gargantext.Core.Text.Convert
Description : Format converter.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Format Converter.
-}
{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Convert (risPress2tsvWrite)
where
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat(..), FileType(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV (writeDocs2Tsv)
import Gargantext.Prelude
risPress2tsvWrite :: FilePath -> IO ()
risPress2tsvWrite f = do
eContents <- parseFile RisPresse Plain (f <> ".ris")
case eContents of
Right contents -> writeDocs2Tsv (f <> ".csv") contents
Left e -> panicTrace $ "Error: " <> e
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Book
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Get Book into GarganText
-}
module Gargantext.Core.Text.Corpus.Parsers.Book
where
import Data.ByteString.Lazy qualified as DBL
import Data.List qualified as List
import Data.Text qualified as DT
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.TSV (hyperdataDocument2tsv)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import System.Directory -- (getDirectoryContents)
------------------------------------------------------------------------
-- Main Export Function
type FileOut = FilePath
book2tsv :: Int -> FileDir -> FileOut -> IO ()
book2tsv n f_in f_out = do
files <- filesOf f_in
texts <- readPublis f_in files
let publis = concatMap (file2publi n) texts
let docs = zipWith publiToHyperdata [1..] publis
DBL.writeFile f_out (hyperdataDocument2tsv docs)
filesOf :: FileDir -> IO [FilePath]
filesOf fd = List.sort -- sort by filename
<$> List.filter (\f -> DT.length (cs f) > 2)
<$> getDirectoryContents fd
readPublis :: FileDir -> [FilePath] -> IO [(FilePath, Text)]
readPublis fd = mapM (\fp -> DBL.readFile (fd <> fp) >>= \txt -> pure (fp, cs txt))
------------------------------------------------------------------------
-- Main Types
data Publi = Publi { publi_authors :: [Text]
, publi_source :: Text
, publi_title :: Text
, publi_text :: Text
}
deriving (Show)
data FileInfo = FileInfo { fi_authors :: [Text]
, fi_source :: Text
}
deriving (Show)
type FileDir = FilePath
---------------------------------------------------------------------
file2publi :: Int -> (FilePath, Text) -> [Publi]
file2publi n (fp,theText) = map (uncurry (Publi authors source)) theTexts
where
theTexts = text2titleParagraphs n theText
FileInfo authors source = fileNameInfo fp
fileNameInfo :: FilePath -> FileInfo
fileNameInfo fp = toFileInfo xs
where
xs = DT.splitOn "_" $ DT.pack fp
toFileInfo (a:b:_) = FileInfo (DT.splitOn "-and-" a) (cs b)
toFileInfo _ = panicTrace "error"
---------------------------------------------------------------------
publiToHyperdata :: Int -> Publi -> HyperdataDocument
publiToHyperdata y (Publi a s t txt) =
HyperdataDocument { _hd_bdd = Just "Book File"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just t
, _hd_authors = Just (DT.concat a)
, _hd_institutes = Nothing
, _hd_source = Just s
, _hd_abstract = Just txt
, _hd_publication_date = Nothing
, _hd_publication_year = Just y
, _hd_publication_month = Just 1
, _hd_publication_day = Just 1
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ DT.pack $ show FR
}
-------------------------------------------------------------
-- MISC tool to remove urls for instance
clean :: Text -> Text
clean = DT.unwords . List.filter (\w -> DT.length w < 20) . DT.words
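-- A minimal example (doctest-style): words of 20 characters or more are
-- dropped, shorter words are kept.
--
-- >>> clean "hello aVeryLongWordThatExceedsTwentyChars world"
-- "hello world"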
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.Date.Parsec
where
import Data.Fixed (Fixed (MkFixed))
import Data.String (String)
import Data.Text qualified as T
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Gargantext.Prelude
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.ParserCombinators.Parsec qualified (parse)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- | Transform a String into an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = pure . decimalStringToInt
-- | Generic parser that takes at least one character not in the given list
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
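-- getMultiplicator yields 10 raised to the number of digits of its
-- (positive) argument, e.g. (doctest-style):
--
-- >>> getMultiplicator 123
-- 1000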
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
pure $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
pure $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in pure $ TimeZone timeInMinute False "CET"
---- | Parser which uses parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime = do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
pure $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = T.unpack t
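-- A minimal usage sketch (hypothetical timestamp; note that the timezone
-- name is always reported as "CET" by parseTimeZone above, whatever the
-- offset):
--
-- > fromRFC3339 "2017-06-10T09:15:30.500+02:00"
--
-- parses to a ZonedTime at 2017-06-10 09:15:30.5 with a +02:00 offset.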
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Gitlab
Description :
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.Gitlab (
Issue(..), gitlabIssue2hyperdataDocument, readFile_Issues, readFile_IssuesAsDocs
) where
import Data.Aeson ( decode, (.:), (.:?), withObject )
import Data.ByteString.Lazy qualified as DBL
import Data.Text qualified as DT
import Data.Time
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata.Document ( HyperdataDocument(..) )
import Gargantext.Prelude
data Issue = Issue { _issue_id :: !Int
, _issue_title :: !DT.Text
, _issue_content :: !DT.Text
, _issue_created :: !LocalTime
, _issue_closed :: !(Maybe UTCTime)
}
deriving (Show)
instance FromJSON Issue where
parseJSON = withObject "Issue" $ \v -> Issue
<$> v .: "c0" -- id
<*> v .: "c1" -- title
<*> v .: "c2" -- content
<*> v .: "c3" -- creation time
<*> v .:? "c4" -- close time
gitlabIssue2hyperdataDocument :: Issue -> HyperdataDocument
gitlabIssue2hyperdataDocument issue = HyperdataDocument
{ _hd_bdd = Nothing
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_page = Nothing
, _hd_title = Just (_issue_title issue)
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Nothing
, _hd_abstract = Just (_issue_content issue)
, _hd_publication_date = Just $ DT.pack $ show date
, _hd_publication_year = Just $ fromIntegral year
, _hd_publication_month = Just month
, _hd_publication_day = Just day
, _hd_publication_hour = Just (todHour tod)
, _hd_publication_minute = Just (todMin tod)
, _hd_publication_second = Just (round $ todSec tod)
, _hd_language_iso2 = Just $ (DT.pack . show) lang
}
where lang = EN
date = _issue_created issue
(year, month, day) = toGregorian $ localDay date
tod = localTimeOfDay date
readFile_Issues :: FilePath -> IO [Issue]
readFile_Issues fp = do
raw <- DBL.readFile fp
let mayIssues = decode raw
case mayIssues of
Just is -> pure is
Nothing -> pure []
readFile_IssuesAsDocs :: FilePath -> IO [HyperdataDocument]
readFile_IssuesAsDocs = fmap (fmap gitlabIssue2hyperdataDocument) . readFile_Issues
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
JSON parser to export toward the CSV GargV3 format.
(Export from the Patent Database.)
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Core.Text.Corpus.Parsers.Json2Csv (json2tsv, readPatents)
where
import Data.Aeson ( decode )
import Data.ByteString.Lazy (readFile)
import Data.Text (unpack)
import Data.Vector (fromList)
import Gargantext.Core.Text.Corpus.Parsers.TSV (TsvDoc(..), writeFile, headerTsvGargV3)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Defaults qualified as Defaults
import Gargantext.Prelude hiding (readFile, writeFile)
import Prelude (read)
data Patent = Patent { _patent_title :: Text
, _patent_abstract :: Text
, _patent_year :: Text
, _patent_id :: Text
} deriving (Show)
$(deriveJSON (unPrefix "_patent_") ''Patent)
readPatents :: FilePath -> IO (Maybe [Patent])
readPatents fp = decode <$> readFile fp
type FilePathIn = FilePath
type FilePathOut = FilePath
json2tsv :: FilePathIn -> FilePathOut -> IO ()
json2tsv fin fout = do
patents <- maybe (panicTrace "json2tsv error") identity <$> readPatents fin
writeFile fout (headerTsvGargV3, fromList $ map patent2tsvDoc patents)
patent2tsvDoc :: Patent -> TsvDoc
patent2tsvDoc (Patent { .. }) =
TsvDoc { tsv_title = _patent_title
, tsv_source = "Source"
, tsv_publication_year = Just $ read (unpack _patent_year)
, tsv_publication_month = Just $ Defaults.month
, tsv_publication_day = Just $ Defaults.day
, tsv_abstract = _patent_abstract
, tsv_authors = "Authors" }
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Telegram
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Text.Corpus.Parsers.Telegram
where
import Data.Aeson
import Data.ByteString.Lazy qualified as DBL
import Gargantext.Prelude
readFile_Telegram :: FilePath -> IO [TelegramMsg]
readFile_Telegram fp = do
raw <- DBL.readFile fp
let mayIssues = decode raw
case mayIssues of
Just is -> pure is
Nothing -> pure []
data TelegramMsg = TelegramMsg { _action_entities :: !Text
, _broadcastg :: !Text
, _buttonsg :: !Text
, _buttons_countg :: !Text
, _buttons_flatg :: !Text
, _chatg :: !Text
, _chat_peerg :: !Text
, _fileg :: !Text
, _forwardg :: !Text
, _input_chatg :: !Text
, _input_senderg :: !Text
, _linked_chatg :: !Text
, _reply_messageg :: !Text
, _senderg :: !Text
, _sender_idg :: !Text
, _textg :: !Text
, _via_botg :: !Text
, _via_input_botg :: !Text
, actiong :: !Text
, dateg :: !Text
, edit_dateg :: !Text
, edit_hideg :: !Text
, entitiesg :: !Text
, forwardsg :: !Text
, from_idg :: !Text
, from_scheduledg :: !Text
, fwd_fromg :: !Text
, grouped_idg :: !Text
, idg :: !Text
, legacyg :: !Text
, mediag :: !Text
, media_unreadg :: !Text
, mentionedg :: !Text
, messageg :: !Text
, noforwardsg :: !Text
, outg :: !Text
, peer_idg :: !Text
, pinnedg :: !Text
, postg :: !Text
, post_authorg :: !Text
, reactionsg :: !Text
, repliesg :: !Text
, reply_markupg :: !Text
, reply_tog :: !Text
, restriction_reasong :: !Text
, silentg :: !Text
, ttl_periodg :: !Text
, via_bot_idg :: !Text
, views :: !Text
}
deriving (Show, Generic)
instance FromJSON TelegramMsg
{-
gitlabIssue2hyperdataDocument :: Issue -> HyperdataDocument
gitlabIssue2hyperdataDocument issue = HyperdataDocument
{ _hd_bdd = Nothing
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just (_issue_title issue)
, _hd_authors = Nothing
, _hd_institutes = Nothing
, _hd_source = Nothing
, _hd_abstract = Just (_issue_content issue)
, _hd_publication_date = Just $ DT.pack $ show date
, _hd_publication_year = Just $ fromIntegral year
, _hd_publication_month = Just month
, _hd_publication_day = Just day
, _hd_publication_hour = Just (todHour tod)
, _hd_publication_minute = Just (todMin tod)
, _hd_publication_second = Just (round $ todSec tod)
, _hd_language_iso2 = Just $ (DT.pack . show) lang
}
where lang = EN
date = _issue_created issue
(year, month, day) = toGregorian $ localDay date
tod = localTimeOfDay date
-}
{-
readFile_IssuesAsDocs :: FilePath -> IO [HyperdataDocument]
readFile_IssuesAsDocs = fmap (fmap gitlabIssue2hyperdataDocument) . readFile_Issues
-}
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
Description : To query Wikidata
Copyright : (c) CNRS, 2019-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
import Data.List qualified as List
import Data.Text (concat)
import Database.HSparql.Connection ( BindingValue, EndPoint, selectQueryRaw )
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Date (mDateSplit)
import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler ( crawlPage )
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude hiding (concat)
import Prelude qualified
data WikiResult = WikiResult { _wr_cid :: Maybe Text
, _wr_title :: Maybe Text
, _wr_url :: Maybe Text
, _wr_yearStart :: Maybe Text
, _wr_yearEnd :: Maybe Text
, _wr_yearFlorish :: Maybe Text
} deriving (Show, Eq)
$(makeLenses ''WikiResult)
type NumberOfSections = Int
wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
wikidataGet n m = do
results <- wikidataSelect n
mapM (wikiPageToDocument m) results
wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
wikiPageToDocument m wr = do
sections <- case wr ^. wr_url of
Nothing -> pure []
Just u -> crawlPage u
let bdd = Just "wikidata"
doi = Nothing
url = wr ^. wr_url
page = Nothing
title = wr ^. wr_title
authors = Nothing
institutes = Nothing
source = Nothing
abstract = Just $ concat $ take m sections
let mDateS = head $ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
, wr ^. wr_yearFlorish
, head sections
]
let (date, (year, month, day)) = mDateSplit mDateS
let hour = Nothing
minute = Nothing
sec = Nothing
iso2 = Just $ show EN
pure $ HyperdataDocument { _hd_bdd = bdd
, _hd_doi = doi
, _hd_url = url
, _hd_page = page
, _hd_title = title
, _hd_authors = authors
, _hd_institutes = institutes
, _hd_source = source
, _hd_abstract = abstract
, _hd_publication_date = show <$> date
, _hd_publication_year = year
, _hd_publication_month = month
, _hd_publication_day = day
, _hd_publication_hour = hour
, _hd_publication_minute = minute
, _hd_publication_second = sec
, _hd_language_iso2 = iso2 }
wikidataSelect :: Int -> IO [WikiResult]
wikidataSelect n = do
result <- selectQueryRaw wikidataRoute (wikidataQuery n)
case result of
Nothing -> pure []
Just result' -> pure $ map toWikiResult $ unbound' EN result'
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' l = map (map (unbound l))
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult (c:t:u:ys:ye:yf:_) = WikiResult c t u ys ye yf
toWikiResult _ = panicTrace "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
wikidataRoute :: EndPoint
wikidataRoute = "https://query.wikidata.org/sparql"
wikidataQuery :: Int -> Prelude.String
wikidataQuery n = List.unlines
[" PREFIX wd: <http://www.wikidata.org/entity/>"
," PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
," PREFIX schema: <http://schema.org/>"
," PREFIX wikibase: <http://wikiba.se/ontology#>"
," SELECT DISTINCT "
," ?cid"
," ?title"
," ?url"
," (year(xsd:dateTime(?dateStart)) as ?yearStart)"
," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
," WHERE {"
," ?cid wdt:P31 wd:Q968159 ."
," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
," "
," ?url schema:about ?cid ."
," ?url schema:inLanguage \"en\" ."
," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
," OPTIONAL {?cid (wdt:P580) ?dateStart .}"
," OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
," }"
," LIMIT " <> show n
]
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Description : Crawler for Wikipedia pages
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Thx to Alp Well Typed for the first version.
-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
where
import Control.Lens hiding (element, elements, children)
import Data.ByteString.Lazy (ByteString)
import Data.Text (unpack)
import Data.Text.Lazy.Encoding (decodeUtf8With)
import Gargantext.Prelude hiding (ByteString, get, to, decodeUtf8With)
import Network.HTTP.Client (Response)
import Network.Wreq (responseBody, get)
import Text.Taggy.Lens
type WikipediaUrlPage = Text
crawlPage :: WikipediaUrlPage -> IO [Text]
crawlPage url = do
datas <- get (unpack url)
pure $ sectionsOf datas
sectionsOf :: Response ByteString -> [Text]
sectionsOf resp =
resp ^.. responseBody
. to (decodeUtf8With lenientDecode)
. html
. allAttributed (ix "class" . only "mw-parser-output")
. allNamed (only "p")
. to paragraphText
paragraphText :: Element -> Text
paragraphText p = collectTextN (p ^. children)
where collectTextN (NodeContent t : ns) = t <> collectTextN ns
collectTextN (NodeElement elt : ns) = collectTextE elt <> collectTextN ns
collectTextN [] = ""
collectTextE (Element _ _ ns) = collectTextN ns
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikimedia
Description : Parser for Wikimedia dump
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Core.Text.Corpus.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps.
It includes an XML parser for Wikipedia's XML
and a wikimedia-to-plaintext converter for the Wikipedia text field.
-}
module Gargantext.Core.Text.Corpus.Parsers.Wikimedia
where
import Control.Monad.Catch
import Data.Conduit
import Data.Either
import Data.Text as T
import Data.XML.Types (Event, Name)
import Gargantext.Prelude hiding (force)
import Text.Pandoc
import Text.XML.Stream.Parse
-- | Use case
-- :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile
-- .| force "mediawiki required" parseMediawiki
-- .| CL.mapM mediawikiPageToPlain
-- .| CL.mapM_ print
-- :}
-- | A simple "Page" type.
-- For the moment it takes only text and title
-- (since there is no abstract) will see if other data are relevant.
data Page =
Page { _markupFormat :: MarkupFormat
, _title :: Maybe T.Text
, _text :: Maybe T.Text
}
deriving (Show)
data MarkupFormat = Mediawiki | Plaintext
deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ ignoreAnyTreeContent
pure text
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that consumes everything but the given tag,
-- useful because we have to consume all the data.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ n = many_ (ignoreTree (tagUntil n) ignoreAttrs)
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
-- | Utility function that parses nothing but the given tag,
-- useful because we have to consume all the data.
ignoreExcept :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- TODO: remove ignoreExcept to:
-- many ignoreAnyTreeContentUntil "Article"
manyTagsUntil :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <-
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <-
parseRevision
many_ $ ignoreAnyTreeContent
pure $ Page { _markupFormat = Mediawiki
, _title = title
, _text = revision }
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
$ manyYield' parsePage
-- | Convert a Mediawiki Page to a Plaintext Page.
-- Need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
pure $ Page { _markupFormat = Plaintext, _title = title, _text = revision }
where mediaToPlain media =
case media of
(Nothing) -> pure Nothing
(Just med) -> do
res <- runIO $ do
doc <- readMediaWiki def med
writePlain def doc
case res of
(Left _) -> pure Nothing
(Right r) -> pure $ Just r
{-|
Module : Gargantext.Core.Text.List.Learn
Description : Learn to make lists
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
SVM-based learning of term lists for Gargantext corpora.
-}
module Gargantext.Core.Text.List.Learn
where
import Data.IntMap qualified as IntMap
import Data.List qualified as List
import Data.Map.Strict qualified as Map
import Data.SVM qualified as SVM
import Data.Vector qualified as Vec
import Gargantext.Core
import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.GargDB
import Gargantext.Prelude
------------------------------------------------------------------------
train :: Double -> Double -> SVM.Problem -> IO SVM.Model
train x y = (SVM.train (SVM.CSvc x) (SVM.RBF y))
predict :: SVM.Model -> [Vec.Vector Double] -> IO [Double]
predict m vs = mapM (predict' m) vs
where
predict' m' vs' = SVM.predict m' (IntMap.fromList $ (zip [1..]) $ Vec.toList vs')
------------------------------------------------------------------------
trainList :: Double -> Double -> Map ListType [Vec.Vector Double] -> IO SVM.Model
trainList x y = (train x y) . trainList'
where
trainList' :: Map ListType [Vec.Vector Double] -> SVM.Problem
trainList' = mapVec2problem . (Map.mapKeys (fromIntegral . toDBid))
mapVec2problem :: Map Double [Vec.Vector Double] -> SVM.Problem
mapVec2problem = List.concat . (map (\(a,as) -> zip (repeat a) as)) . Map.toList . (Map.map vecs2maps)
vecs2maps :: [Vec.Vector Double] -> [IntMap.IntMap Double]
vecs2maps = map (IntMap.fromList . (zip [1..]) . Vec.toList)
predictList :: HasCallStack => Model -> [Vec.Vector Double] -> IO [Maybe ListType]
predictList (ModelSVM m _ _) vs = map (Just . fromDBid . round) <$> predict m vs
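-- A minimal usage sketch (hypothetical feature vectors): train on two
-- labelled classes, then predict the class of a fresh vector.
--
-- > model <- trainList 1.0 0.5 (Map.fromList
-- >            [ (MapTerm,  [Vec.fromList [1,0]])
-- >            , (StopTerm, [Vec.fromList [0,1]]) ])
-- > predictList (ModelSVM model (Just 1.0) (Just 0.5)) [Vec.fromList [1,0]]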
------------------------------------------------------------------------
data Model = ModelSVM { modelSVM :: SVM.Model
, param1 :: Maybe Double
, param2 :: Maybe Double
}
--{-
instance SaveFile Model
where
saveFile' fp (ModelSVM m _ _) = SVM.saveModel m fp
instance ReadFile Model
where
readFile' fp = do
m <- SVM.loadModel fp
pure $ ModelSVM m Nothing Nothing
--}
------------------------------------------------------------------------
-- | TODO
-- shuffle list
-- split list : train / test
-- grid parameters on best result on test
type Train = Map ListType [Vec.Vector Double]
type Tests = Map ListType [Vec.Vector Double]
type Score = Double
type Param = Double
grid :: (MonadBase IO m)
=> Param -> Param -> Train -> [Tests] -> m (Maybe Model)
grid _ _ _ [] = panicTrace "Gargantext.Core.Text.List.Learn.grid : empty test data"
grid s e tr te = do
let
grid' :: (MonadBase IO m)
=> Double -> Double
-> Train
-> [Tests]
-> m (Score, Model)
grid' x y tr' te' = do
model'' <- liftBase $ trainList x y tr'
let
model' = ModelSVM model'' (Just x) (Just y)
score' :: [(ListType, Maybe ListType)] -> Map (Maybe Bool) Int
score' = occurrencesWith (\(a,b) -> (==) <$> Just a <*> b)
score'' :: Map (Maybe Bool) Int -> Double
score'' m'' = maybe 0 (\t -> (fromIntegral t)/total) (Map.lookup (Just True) m'')
where
total = fromIntegral $ foldl (+) 0 $ Map.elems m''
getScore m t = do
let (res, toGuess) = List.unzip
$ List.concat
$ map (\(k,vs) -> zip (repeat k) vs)
$ Map.toList t
res' <- liftBase $ predictList m toGuess
pure $ score'' $ score' $ List.zip res res'
score <- mapM (getScore model') te'
pure (mean score, model')
r <- head . List.reverse
. (List.sortOn fst)
<$> mapM (\(x,y) -> grid' x y tr te)
[(x,y) | x <- [s..e], y <- [s..e]]
-- printDebug "GRID SEARCH" (map fst r)
-- printDebug "file" fp
--fp <- saveFile (ModelSVM model')
--save best result
pure $ snd <$> r
{-|
Module : Gargantext.Core.Text.List.Merge
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE InstanceSigs #-}
module Gargantext.Core.Text.List.Merge
where
import Control.Lens (view)
import Data.Map.Strict.Patch hiding (PatchMap)
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.Types
import Gargantext.Prelude hiding (diff)
type List = Map NgramsTerm NgramsRepoElement
type Patch = PatchMap NgramsTerm (Replace (Maybe NgramsRepoElement))
-- Question: which version of the patching increment does the FrontEnd use?
diffList :: Versioned List -> Versioned List -> Versioned Patch
diffList l1 l2 = Versioned (1 + view v_version l1)
(diff (view v_data l1) (view v_data l2))
-- | TODO
{-
commit :: ListId -> NgramsType -> Versioned Patch -> List -> List
commit = undefined
-}
{-|
Module : Gargantext.Core.Text.Prepare
Description : Tools to clean text
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Clean some texts before importing them.
For a given language, choose a big masterpiece of literature to analyze
with GarganText. Here is an example with a famous French writer who
could be the incarnation of the mythic Gargantua.
-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Prepare
where
import Data.List qualified as List
import Data.Text qualified as Text
import Gargantext.Core.Text (sentences)
import Gargantext.Prelude
---------------------------------------------------------------------
prepareText :: Paragraph -> Text -> [Text]
prepareText p txt = groupText p
$ List.filter (/= "")
$ toParagraphs
$ Text.lines
$ Text.replace "_" " " -- some texts seem to be underlined
$ Text.replace "--" "" -- removing bullets like of dialogs
$ Text.replace "\xd" "" txt
---------------------------------------------------------------------
groupText :: Paragraph -> [Text] -> [Text]
groupText (Uniform blockSize) = groupUniform blockSize
groupText AuthorLike = groupLines
---------------------------------------------------------------------
data Paragraph = Uniform Grain | AuthorLike
-- Uniform does not preserve the author's paragraphs, but paragraph length is uniform
-- AuthorLike preserves the author's paragraphs, but paragraph length is not uniform
-- Grain: number of sentences per block of Text
-- Step : overlap of sentences between adjacent blocks of Text
groupUniform :: Grain -> [Text] -> [Text]
groupUniform g ts = map Text.unwords
$ chunkAlong g g
$ sentences
$ Text.concat ts
groupLines :: [Text] -> [Text]
groupLines xxx@(a:b:xs) =
if Text.length a > moyenne
then [a] <> (groupLines (b:xs))
else let ab = a <> " " <> b in
if Text.length ab > moyenne
then [ab] <> (groupLines xs)
else groupLines ([ab] <> xs)
where
moyenne = round
$ mean
$ (map (fromIntegral . Text.length) xxx :: [Double])
groupLines [a] = [a]
groupLines [] = []
groupLines_test :: [Text]
groupLines_test = groupLines theData
where
theData = ["abxxxx", "bc", "cxxx", "d"]
---------------------------------------------------------------------
toParagraphs :: [Text] -> [Text]
toParagraphs (a:x:xs) =
if a == ""
then [a] <> toParagraphs (x:xs)
else if x == ""
then [a] <> toParagraphs (x:xs)
else toParagraphs $ [a <> " " <> x ] <> xs
toParagraphs [a] = [a]
toParagraphs [] = []
-- Tests
-- TODO for internships: Property tests
toParagraphs_test :: Bool
toParagraphs_test =
toParagraphs ["a","b","","c","d","d","","e","f","","g","h",""]
== [ "a b", "", "c d d", "", "e f", "", "g h", ""]
{-|
Module : Gargantext.Core.Text.Search
Description : Search engine over corpus documents.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search engine is first made to filter a TSV file according to a query.
Starting from this model, a specific Gargantext engine will be made
(using more metric scores/features).
-}
module Gargantext.Core.Text.Search where
import Data.Ix
import Data.SearchEngine
import Gargantext.Core.Text.Corpus.Parsers.TSV
import Gargantext.Core.Text.Terms.Mono (monoTexts)
import Gargantext.Core.Text.Terms.Mono.Stem as ST
import Gargantext.Prelude
-- Useful for handling stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
type DocId = Int
type DocSearchEngine = SearchEngine
TsvGargV3
DocId
DocField
NoFeatures
data DocField = TitleField
| AbstractField
deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig TsvGargV3 DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTerms,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTerms :: TsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN ST.PorterAlgorithm
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
SearchRankParameters {
paramK1,
paramB,
paramFieldWeights,
paramFeatureWeights = noFeatures,
paramFeatureFunctions = noFeatures,
paramResultsetSoftLimit = 2000,
paramResultsetHardLimit = 4000,
paramAutosuggestPrefilterLimit = 500,
paramAutosuggestPostfilterLimit = 500
}
where
paramK1 :: Float
paramK1 = 1.5
paramB :: DocField -> Float
paramB TitleField = 0.9
paramB AbstractField = 0.5
paramFieldWeights :: DocField -> Float
paramFieldWeights TitleField = 20
paramFieldWeights AbstractField = 5
{-|
Module : Gargantext.Core.Text.Terms.Mono.Token
Description : Tokens and tokenizing a text
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
In computer science, lexical analysis, lexing or tokenization is the
process of converting a sequence of characters (such as in a computer
program or web page) into a sequence of tokens (strings with an assigned
and thus identified meaning).
Source: https://en.wikipedia.org/wiki/Tokenize
-}
module Gargantext.Core.Text.Terms.Mono.Token (tokenize)
where
import Data.Text (Text)
import qualified Gargantext.Core.Text.Terms.Mono.Token.En as En
-- | Contexts depend on the lang
--import Gargantext.Core (Lang(..))
type Token = Text
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose", "."]
tokenize :: Text -> [Token]
tokenize = En.tokenize
--data Context = Letter | Word | Sentence | Line | Paragraph
--
--tokenize' :: Lang -> Context -> [Token]
--tokenize' = undefined
--
{-# LANGUAGE TypeOperators #-}
module Gargantext.Core.Text.Upload
( Host(..)
, DocId(..)
, Data(..)
, ContentType (..)
, ethercalc
, codimd
)
where
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import Gargantext.Utils.Servant (TSV, Markdown)
import Network.HTTP.Client (newManager, Request(..))
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Protolude
import Servant.API
import Servant.Client
newtype Host = Host { fromHost :: Text }
newtype DocId = DocId { fromDocId :: Text }
newtype Data = Data { fromData :: Text }
data ContentType a =
CTPlain a
| CTTSV a
-- TODO SocialCalc, Excel XML ?
instance MimeRender TSV Data where
mimeRender p (Data d) = mimeRender p d
instance MimeRender PlainText Data where
mimeRender p (Data d) = mimeRender p d
instance ToHttpApiData DocId where
toUrlPiece (DocId docId) = docId
-- https://github.com/audreyt/ethercalc/blob/master/API.md
type EthercalcAPI =
"_" :> (
-- plain text
ReqBody '[PlainText] Data
:> Post '[PlainText] Text
:<|>
Capture "docId" DocId
:> ReqBody '[PlainText] Data
:> Put '[PlainText] Text
-- tsv
:<|>
ReqBody '[TSV] Data
:> Post '[PlainText, TSV] Text
:<|>
Capture "docId" DocId
:> ReqBody '[TSV] Data
:> Put '[PlainText, TSV] Text
)
ethercalcAPI :: Proxy EthercalcAPI
ethercalcAPI = Proxy
ethercalcNewPlain :: Data -> ClientM Text
ethercalcUpdatePlain :: DocId -> Data -> ClientM Text
ethercalcNewTSV :: Data -> ClientM Text
ethercalcUpdateTSV :: DocId -> Data -> ClientM Text
ethercalcNewPlain :<|> ethercalcUpdatePlain
:<|> ethercalcNewTSV :<|> ethercalcUpdateTSV = client ethercalcAPI
------------------------------
-- | Create new or update existing Ethercalc document (depending on
-- `Maybe DocId` constructor). `Data` can be in various formats (TSV,
-- etc).
ethercalc :: Host -> Maybe DocId -> ContentType Data -> IO (Either ClientError Text)
ethercalc (Host host) mDocId ctD = do
manager' <- newManager tlsManagerSettings
let env = mkClientEnv manager' (BaseUrl Https (T.unpack host) 443 "")
case (mDocId, ctD) of
(Nothing, CTPlain d) -> runClientM (ethercalcNewPlain d) env
(Nothing, CTTSV d) -> runClientM (ethercalcNewTSV d) env
(Just docId, CTPlain d) -> runClientM (ethercalcUpdatePlain docId d) env
(Just docId, CTTSV d) -> runClientM (ethercalcUpdateTSV docId d) env
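-- A minimal usage sketch (hypothetical host): create a new spreadsheet
-- from TSV data, then update it under the returned identifier.
--
-- > eRes <- ethercalc (Host "ethercalc.example.org") Nothing
-- >                   (CTTSV (Data "a\tb\n1\t2\n"))
-- > case eRes of
-- >   Right docId -> void $ ethercalc (Host "ethercalc.example.org")
-- >                                   (Just (DocId docId))
-- >                                   (CTTSV (Data "a\tb\n3\t4\n"))
-- >   Left err    -> print err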
-----------------------------------
type CodiMDAPI =
"new" :> (
ReqBody '[Markdown] Data
:> Post '[Markdown] Text
)
instance MimeRender Markdown Data where
mimeRender p (Data d) = mimeRender p d
codimdAPI :: Proxy CodiMDAPI
codimdAPI = Proxy
codimdAPINew :: Data -> ClientM Text
codimdAPINew = client codimdAPI
-- | Create a new CodiMD document (with Markdown contents). Please
-- note that AFAIK CodiMD update is not supported, see
-- https://github.com/hackmdio/codimd/issues/1013
codimd :: Host -> Data -> IO (Either Text Text)
codimd (Host host) d = do
manager' <- newManager tlsManagerSettings
let env' = mkClientEnv manager' (BaseUrl Https (T.unpack host) 443 "")
let env = env' { makeClientRequest = \burl req -> (defaultMakeClientRequest burl req) { redirectCount = 0 } }
eRes <- runClientM (codimdAPINew d) env
pure $ case eRes of
-- NOTE We actually expect a redirect here (a 302 with the new
-- page's URL). Hence we expect a `Left FailureResponse` because
-- we have set `redirectCount = 0` above.
Left (FailureResponse _req (Response { responseHeaders })) ->
case Map.lookup "location" (Map.fromList $ toList responseHeaders) of
Nothing -> Left "Cannot find 'Location' header in response"
Just loc -> Right $ TE.decodeUtf8 loc
err -> Left $ "Error creating codimd document: " <> show err
{-|
Module : Gargantext.Core.Viz.Graph
Description : Graph utils
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Core.Viz.Graph
where
import Data.Aeson qualified as DA
import Data.ByteString.Lazy as DBL (readFile, writeFile)
import Data.HashMap.Strict (HashMap, lookup)
import Data.Text qualified as Text
import Gargantext.API.Ngrams.Types (NgramsTerm(..), NgramsRepoElement(..), mSetToList)
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Core.Viz.Graph.Types
import Gargantext.Database.Admin.Types.Hyperdata.Prelude
import Gargantext.Prelude
import Text.Read qualified as Text
-----------------------------------------------------------
graphV3ToGraph :: GraphV3 -> Graph
graphV3ToGraph (GraphV3 links nodes) = Graph { _graph_nodes = map nodeV32node nodes
, _graph_edges = zipWith linkV32edge [1..] links
, _graph_metadata = Nothing }
where
nodeV32node :: NodeV3 -> Node
nodeV32node (NodeV3 no_id' (AttributesV3 cl') no_s' no_lb')
= Node { node_size = no_s'
, node_type = NgramsTerms
, node_id = show no_id'
, node_label = no_lb'
, node_x_coord = 0
, node_y_coord = 0
, node_attributes = Attributes cl'
, node_children = []
}
linkV32edge :: Int -> EdgeV3 -> Edge
linkV32edge n (EdgeV3 eo_s' eo_t' eo_w') =
Edge { edge_source = show eo_s'
, edge_hidden = Just False
, edge_target = show eo_t'
, edge_weight = (Text.read $ Text.unpack eo_w') :: Double
, edge_confluence = 0.5
, edge_id = show n }
graphV3ToGraphWithFiles :: FilePath -> FilePath -> IO ()
graphV3ToGraphWithFiles g1 g2 = do
-- read the legacy GraphV3 from the input file
graph <- DBL.readFile g1
let newGraph = case DA.decode graph :: Maybe GraphV3 of
Nothing -> panicTrace "no graph"
Just new -> new
DBL.writeFile g2 (DA.encode $ graphV3ToGraph newGraph)
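-- Usage sketch (file names are hypothetical): convert a legacy V3 graph on
-- disk into the current 'Graph' format:
--
-- > graphV3ToGraphWithFiles "graph-v3.json" "graph.json"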
readGraphFromJson :: MonadBase IO m => FilePath -> m (Maybe Graph)
readGraphFromJson fp = do
graph <- liftBase $ DBL.readFile fp
pure $ DA.decode graph
-----------------------------------------------------------
mergeGraphNgrams :: Graph -> Maybe (HashMap NgramsTerm NgramsRepoElement) -> Graph
mergeGraphNgrams g Nothing = g
mergeGraphNgrams graph@(Graph { _graph_nodes }) (Just listNgrams) = set graph_nodes newNodes graph
where
newNodes = insertChildren <$> _graph_nodes
insertChildren (Node { node_label, .. }) = Node { node_children = children', .. }
where
-- lookup (NgramsTerm node_label) in listNgrams, then fetch (NgramsRepoElement _nre_children)
children' = case (lookup (NgramsTerm node_label) listNgrams) of
Nothing -> []
Just (NgramsRepoElement { _nre_children }) -> unNgramsTerm <$> mSetToList _nre_children
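-- Sketch of the behaviour (terms are illustrative): if @listNgrams@ maps the
-- term "cat" to a repo element whose children are {"cats", "kitten"}, the
-- node labelled "cat" ends up with @node_children = ["cats", "kitten"]@;
-- nodes without an entry keep an empty children list.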
{-|
Module : Gargantext.Database.Action.TSQuery
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Database.Action.TSQuery where
import Data.Aeson
import Data.Maybe
import Data.String (IsString(..))
import Database.PostgreSQL.Simple (Query)
import Database.PostgreSQL.Simple.ToField
import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono.Stem (stem, StemmingAlgorithm(..))
import Gargantext.Core.Types
import Gargantext.Core.Types.Query (Limit, Offset)
import Gargantext.Database.Prelude (DBCmd, runPGSQuery)
import Gargantext.Prelude
newtype TSQuery = UnsafeTSQuery [Text]
-- | TODO [""] -> panic "error"
toTSQuery :: [Text] -> TSQuery
toTSQuery txt = UnsafeTSQuery $ map (stem EN GargPorterAlgorithm) txt
instance IsString TSQuery
where
fromString = UnsafeTSQuery . words . cs
instance ToField TSQuery
where
toField (UnsafeTSQuery xs)
= Many $ intersperse (Plain " && ")
$ map (\q -> Many [ Plain "plainto_tsquery("
, Escape (cs q)
, Plain ")"
]
) xs
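-- For example (illustrative), @toField (toTSQuery ["cat", "dog"])@ renders
-- as the SQL fragment:
--
-- > plainto_tsquery('cat') && plainto_tsquery('dog')
--
-- (modulo stemming: 'toTSQuery' first applies the Porter stemmer to each
-- word).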
data Order = Asc | Desc
instance ToField Order
where
toField Asc = Plain "ASC"
toField Desc = Plain "DESC"
-- TODO
-- FIX fav
-- ADD ngrams count
-- TESTS
textSearchQuery :: Query
textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year' \
\ , n.hyperdata->'title' \
\ , n.hyperdata->'source' \
\ , n.hyperdata->'authors' \
\ , COALESCE(nn.score,null) \
\ FROM nodes n \
\ LEFT JOIN nodes_nodes nn ON nn.node2_id = n.id \
\ WHERE \
\ n.search @@ (?::tsquery) \
\ AND (n.parent_id = ? OR nn.node1_id = ?) \
\ AND n.typename = ? \
\ ORDER BY n.hyperdata -> 'publication_date' ? \
\ offset ? limit ?;"
-- | Text Search Function for Master Corpus
-- TODO : text search for user corpus
-- Example:
-- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
-- textSearchTest pId q = textSearch q pId 5 0 Asc
textSearch :: HasDBid NodeType
=> TSQuery -> ParentId
-> Limit -> Offset -> Order
-> DBCmd err [(Int,Value,Value,Value, Value, Maybe Int)]
textSearch q p l o ord' = runPGSQuery textSearchQuery (q,p,p,typeId,ord',o,l)
where
typeId = toDBid NodeDocument
{-# OPTIONS_GHC -fno-warn-unused-top-binds #-}
{-|
Module : Gargantext.Database.Bashql
Description : BASHQL, a DSL to deal with the Gargantext database.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
* BASHQL is a Domain Specific Language to deal with the database
* BASHQL = functional (Bash * SQL)
* Which language should one choose when working with a database? To keep it
simple, instead of the common Object Relational Mapping (ORM) [1] strategy
used nowadays, which is inspired more by object logic than by functional
logic, the semantics of BASHQL put the function first.
* BASHQL focuses on functions, i.e. it reuses Bash function names and
implements them with SQL behind the scenes. BASHQL is thus inspired more
by the Bash language [2] than by SQL, and follows Bash's main commands as
its specification and documentation.
* Main arguments:
1. Theoretical: databases and file systems can each be thought of as a
single category, an assumption based on theoretical work on databases by
David Spivak [0].
2. Practical: basic Bash commands are daily practice in the developer
community.
* How to help?
1. Choose a command you like in Bash
2. Implement it in Haskell-SQL according to the Gargantext schema (a
tree-like filesystem)
3. Translate it in BASHQL (follow previous implementations)
4. Make a pull request (enjoy the community)
* Implementation strategy: functional adaptations are made to the
Gargantext language options, and SQL optimizations are carried out
continuously during the project. For the Haskellish part, you may take
inspiration from the Turtle library written by Gabriel Gonzalez [3],
which shows how to write Bash-like operations in Haskell.
* Semantics
- FileSystem is now a NodeSystem where each File is a Node in a Directed Graph (DG).
* References
[0] MIT Press has published "Category theory for the sciences". The book
can also be purchased on Amazon. Here are reviews by the MAA, by the
AMS, and by SIAM.
[1] https://en.wikipedia.org/wiki/Object-relational_mapping
[2] https://en.wikipedia.org/wiki/Bash_(Unix_shell)
[3] https://github.com/Gabriel439/Haskell-Turtle-Library
TODO-ACCESS: should the checks be done here or earlier?
-}
module Gargantext.Database.Admin.Bashql () {-( get
, ls
, home
, post
, del
, mv
, put
, rename
, tree
-- , mkCorpus, mkAnnuaire
)-}
where
import Control.Monad.Reader -- (Reader, ask)
import Data.List (last)
import Gargantext.Core.Types
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataAny)
import Gargantext.Database.Prelude (runOpaQuery, Cmd)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Update qualified as U (Update(..), update)
import Gargantext.Database.Schema.Node
import Gargantext.Prelude hiding (get)
-- List of NodeId
-- type PWD a = PWD UserId [a]
type PWD = [NodeId]
--data PWD' a = a | PWD' [a]
rename :: NodeId -> Text -> Cmd err [Int]
rename n t = U.update $ U.Rename n t
mv :: NodeId -> ParentId -> Cmd err [Int]
mv n p = U.update $ U.Move n p
-- | TODO get Children or Node
get :: PWD -> Cmd err [Node HyperdataAny]
get [] = pure []
get pwd = runOpaQuery $ selectNodesWithParentID (last pwd)
-- | Home, need to filter with UserId
{-
home :: Cmd err PWD
home = map _node_id <$> getNodesWithParentId 0 Nothing
-}
-- | ls == get Children
ls :: PWD -> Cmd err [Node HyperdataAny]
ls = get
tree :: PWD -> Cmd err [Node HyperdataAny]
tree p = do
ns <- get p
children <- mapM (\n -> get [_node_id n]) ns
pure $ ns <> concat children
-- | TODO
post :: PWD -> [NodeWrite] -> Cmd err Int64
post [] _ = pure 0
post _ [] = pure 0
post pth ns = insertNodesWithParent (Just $ last pth) ns
--postR :: PWD -> [NodeWrite'] -> Cmd err [Int]
--postR [] _ _ = pure [0]
--postR _ [] _ = pure [0]
--postR pth ns c = mkNodeR (last pth) ns c
-- | WIP
-- rm : mv to trash
-- del : empty trash
--rm :: PWD -> [NodeId] -> IO Int
--rm = del
del :: [NodeId] -> Cmd err Int
del [] = pure 0
del ns = deleteNodes ns
-- | TODO
put :: U.Update -> Cmd err [Int]
put = U.update
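-- | A minimal BASHQL session sketch (the node ids @rootId@, @nId@ and
-- @parentId@ are hypothetical):
--
-- > session :: Cmd err ()
-- > session = do
-- >   _docs <- ls [rootId]             -- list children of a node
-- >   _     <- rename nId "new name"   -- rename a node
-- >   _     <- mv nId parentId         -- move it under another parent
-- >   pure ()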
-- | TODO
-- cd (Home UserId) | (Node NodeId)
-- cd Path
-- jump NodeId
-- touch Dir
-- type Name = Text
{-|
Module : Gargantext.Database.Query.Table.NodeNodeNgrams
Description : TODO: remove this module and table in database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module Gargantext.Database.Query.Table.NodeNodeNgrams
( module Gargantext.Database.Schema.NodeNodeNgrams
, queryNodeNodeNgramsTable
, insertNodeNodeNgrams
)
where
import Gargantext.Database.Admin.Types.Node (pgNodeId)
import Gargantext.Database.Prelude (DBCmd, mkCmd)
import Gargantext.Database.Schema.Ngrams (pgNgramsTypeId)
import Gargantext.Database.Schema.NodeNodeNgrams
import Gargantext.Database.Schema.Prelude
import Prelude
queryNodeNodeNgramsTable :: Query NodeNodeNgramsRead
queryNodeNodeNgramsTable = selectTable nodeNodeNgramsTable
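-- Sketch: given a PostgreSQL 'Connection', the table can be read back with
-- Opaleye's 'runSelect' (assuming the usual 'FromFields' instances are in
-- scope):
--
-- > rows :: Connection -> IO [NodeNodeNgrams]
-- > rows c = runSelect c queryNodeNodeNgramsTable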
-- | Insert utils
insertNodeNodeNgrams :: [NodeNodeNgrams] -> DBCmd err Int
insertNodeNodeNgrams = insertNodeNodeNgramsW
. map (\(NodeNodeNgrams n1 n2 ng nt w) ->
NodeNodeNgrams (pgNodeId n1)
(pgNodeId n2)
(sqlInt4 ng)
(pgNgramsTypeId nt)
(sqlDouble w)
)
insertNodeNodeNgramsW :: [NodeNodeNgramsWrite] -> DBCmd err Int
insertNodeNodeNgramsW nnnw =
mkCmd $ \c -> fromIntegral <$> runInsert_ c insertNothing
where
insertNothing = (Insert { iTable = nodeNodeNgramsTable
, iRows = nnnw
, iReturning = rCount
, iOnConflict = (Just DoNothing)
})
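-- Usage sketch (the ids and weight are illustrative):
--
-- > _n <- insertNodeNodeNgrams
-- >         [ NodeNodeNgrams corpusId docId ngramsId ngramsTypeId 1.0 ]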
{-|
Module : Gargantext.Database.Schema.NgramsPostag
Description : Ngrams connection to the Database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Each ngram has a POS-tagged version, to ease building the default groups of
ngrams in NgramsTerm lists.
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Schema.NgramsPostag
where
import Database.PostgreSQL.Simple qualified as PGS
import Gargantext.Database.Schema.Prelude ( Column, SqlInt4, SqlText, ToField(toField), toRow )
import Gargantext.Prelude
data NgramsPostagPoly id
lang_id
algo_id
postag
ngrams_id
lemm_id
score
= NgramsPostagPoly { _ngramsPostag_id :: !id
, _ngramsPostag_lang_id :: !lang_id
, _ngramsPostag_algo_id :: !algo_id
, _ngramsPostag_postag :: !postag
, _ngramsPostag_ngrams_id :: !ngrams_id
, _ngramsPostag_lemm_id :: !lemm_id
, _ngramsPostag_score :: !score
} deriving (Show)
------------------------------------------------------------------------
data PosTag = PosTag { unPosTag :: !Text }
| NER { unNER :: !Text } -- TODO
------------------------------------------------------------------------
-- type NgramsPostag = NgramsPostagPoly (Maybe Int) Lang PostTagAlgo (Maybe PosTag) NgramsTerm NgramsTerm (Maybe Int)
type NgramsPostagDB = NgramsPostagPoly (Maybe Int) Int Int (Maybe Text) Int Int Int
------------------------------------------------------------------------
type NgramsPosTagWrite = NgramsPostagPoly (Maybe (Column SqlInt4))
(Column SqlInt4)
(Column SqlInt4)
(Maybe (Column SqlText))
(Column SqlInt4)
(Column SqlInt4)
(Maybe (Column SqlInt4))
type NgramsPosTagRead = NgramsPostagPoly (Column SqlInt4)
(Column SqlInt4)
(Column SqlInt4)
(Column SqlText)
(Column SqlInt4)
(Column SqlInt4)
(Column SqlInt4)
makeLenses ''NgramsPostagPoly
instance PGS.ToRow NgramsPostagDB where
toRow (NgramsPostagPoly f0 f1 f2 f3 f4 f5 f6) = [ toField f0
, toField f1
, toField f2
, toField f3
, toField f4
, toField f5
, toField f6
]
{-|
Module : Gargantext.Database.Schema.NodeNodeNgrams
Description : TODO: remove this module and table in database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Schema.NodeNodeNgrams
where
import Prelude
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsTypeId, NgramsId)
import Gargantext.Database.Admin.Types.Node
data NodeNodeNgramsPoly n1 n2 ngrams_id ngt w
= NodeNodeNgrams { _nnng_node1_id :: !n1
, _nnng_node2_id :: !n2
, _nnng_ngrams_id :: !ngrams_id
, _nnng_ngramsType :: !ngt
, _nnng_weight :: !w
} deriving (Show)
type NodeNodeNgramsWrite =
NodeNodeNgramsPoly (Column SqlInt4 )
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
type NodeNodeNgramsRead =
NodeNodeNgramsPoly (Column SqlInt4 )
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
type NodeNodeNgrams =
NodeNodeNgramsPoly CorpusId DocId NgramsId NgramsTypeId Double
$(makeAdaptorAndInstance "pNodeNodeNgrams" ''NodeNodeNgramsPoly)
makeLenses ''NodeNodeNgramsPoly
nodeNodeNgramsTable :: Table NodeNodeNgramsWrite NodeNodeNgramsRead
nodeNodeNgramsTable = Table "node_node_ngrams"
( pNodeNodeNgrams NodeNodeNgrams
{ _nnng_node1_id = requiredTableField "node1_id"
, _nnng_node2_id = requiredTableField "node2_id"
, _nnng_ngrams_id = requiredTableField "ngrams_id"
, _nnng_ngramsType = requiredTableField "ngrams_type"
, _nnng_weight = requiredTableField "weight"
}
)
{-|
Module : Gargantext.Database.Schema.NodeNodeNgrams2
Description : TODO: remove this module and table in database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Schema.NodeNodeNgrams2
where
import Gargantext.Database.Schema.Prelude
import Gargantext.Database.Schema.NodeNgrams (NodeNgramsId)
import Gargantext.Database.Admin.Types.Node
import Prelude
data NodeNodeNgrams2Poly node_id nodengrams_id w
= NodeNodeNgrams2 { _nnng2_node_id :: !node_id
, _nnng2_nodengrams_id :: !nodengrams_id
, _nnng2_weight :: !w
} deriving (Show)
type NodeNodeNgrams2Write =
NodeNodeNgrams2Poly (Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
type NodeNodeNgrams2Read =
NodeNodeNgrams2Poly (Column SqlInt4 )
(Column SqlInt4 )
(Column SqlFloat8)
type NodeNodeNgrams2 =
NodeNodeNgrams2Poly DocId NodeNgramsId Double
$(makeAdaptorAndInstance "pNodeNodeNgrams2" ''NodeNodeNgrams2Poly)
makeLenses ''NodeNodeNgrams2Poly
nodeNodeNgrams2Table :: Table NodeNodeNgrams2Write NodeNodeNgrams2Read
nodeNodeNgrams2Table = Table "node_node_ngrams2"
( pNodeNodeNgrams2 NodeNodeNgrams2
{ _nnng2_node_id = requiredTableField "node_id"
, _nnng2_nodengrams_id = requiredTableField "nodengrams_id"
, _nnng2_weight = requiredTableField "weight"
}
)
{-|
Module : Gargantext.Utils.JohnSnowNLP
Description : John Snow NLP API connection
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Utils.JohnSnowNLP where
import Control.Lens ( FunctorWithIndex(imap) )
import Data.Aeson (encode, Value(..), (.:), (.:?))
import Data.Aeson.Types (prependFailure, typeMismatch)
import Data.List.Safe qualified as LS
import Data.Map.Strict qualified as Map
import Data.Text (unpack)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Types (POS(..))
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude hiding (All)
import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
import Prelude (userError)
data JSSpell = JSPOS Lang | JSLemma Lang
deriving (Show)
instance ToJSON JSSpell where
toJSON (JSPOS DE) = "de.pos"
toJSON (JSPOS EL) = "el.pos"
toJSON (JSPOS EN) = "en.pos"
toJSON (JSPOS ES) = "es.pos"
toJSON (JSPOS FR) = "fr.pos"
toJSON (JSPOS IT) = "it.pos"
toJSON (JSPOS PL) = "pl.pos"
toJSON (JSPOS PT) = "pt.pos"
toJSON (JSPOS RU) = "ru.pos"
toJSON (JSPOS UK) = "uk.pos"
toJSON (JSPOS ZH) = "zh.pos"
toJSON (JSLemma DE) = "de.lemma"
toJSON (JSLemma EL) = "el.lemma"
toJSON (JSLemma EN) = "en.lemma"
toJSON (JSLemma ES) = "es.lemma"
toJSON (JSLemma FR) = "fr.lemma"
toJSON (JSLemma IT) = "it.lemma"
toJSON (JSLemma PL) = "pl.lemma"
toJSON (JSLemma PT) = "pt.lemma"
toJSON (JSLemma RU) = "ru.lemma"
toJSON (JSLemma UK) = "uk.lemma"
toJSON (JSLemma ZH) = "zh.lemma"
instance FromJSON JSSpell where
parseJSON (String "de.pos") = pure $ JSPOS DE
parseJSON (String "en.pos") = pure $ JSPOS EN
parseJSON (String "el.pos") = pure $ JSPOS EL
parseJSON (String "es.pos") = pure $ JSPOS ES
parseJSON (String "fr.pos") = pure $ JSPOS FR
parseJSON (String "it.pos") = pure $ JSPOS IT
parseJSON (String "pl.pos") = pure $ JSPOS PL
parseJSON (String "pt.pos") = pure $ JSPOS PT
parseJSON (String "ru.pos") = pure $ JSPOS RU
parseJSON (String "uk.pos") = pure $ JSPOS UK
parseJSON (String "zh.pos") = pure $ JSPOS ZH
parseJSON (String "de.lemma") = pure $ JSLemma DE
parseJSON (String "en.lemma") = pure $ JSLemma EN
parseJSON (String "el.lemma") = pure $ JSLemma EL
parseJSON (String "es.lemma") = pure $ JSLemma ES
parseJSON (String "fr.lemma") = pure $ JSLemma FR
parseJSON (String "it.lemma") = pure $ JSLemma IT
parseJSON (String "pl.lemma") = pure $ JSLemma PL
parseJSON (String "pt.lemma") = pure $ JSLemma PT
parseJSON (String "ru.lemma") = pure $ JSLemma RU
parseJSON (String "uk.lemma") = pure $ JSLemma UK
parseJSON (String "zh.lemma") = pure $ JSLemma ZH
parseJSON s =
prependFailure "parsing spell failed, "
(typeMismatch "Spell" s)
data JSRequest =
JSRequest { _jsRequest_data :: !Text
, _jsRequest_format :: !Text
, _jsRequest_grouping :: !(Maybe Text)
, _jsRequest_spell :: !JSSpell }
deriving (Show)
-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook
deriveJSON (unPrefix "_jsRequest_") ''JSRequest
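-- For example, a POS request for English text encodes roughly as (the
-- payload is illustrative):
--
-- > {"data": "Hello world", "format": "text", "grouping": null, "spell": "en.pos"}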
-- | John Snow NLP works via asynchronous tasks: send a query and get a
-- task in response. One must poll for the task status and then fetch its
-- result.
data JSAsyncTask =
JSAsyncTask { _jsAsyncTask_uuid :: !Text }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
-- | Task status.
data JSAsyncTaskStatus =
JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
, _jsAsyncTaskStatus_message :: !(Maybe Text) }
deriving (Show)
taskReady :: JSAsyncTaskStatus -> Bool
taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
instance FromJSON JSAsyncTaskStatus where
parseJSON (Object v) = do
status <- v .: "status"
code <- status .: "code"
message <- status .:? "message"
pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
, _jsAsyncTaskStatus_message = message }
parseJSON s =
prependFailure "parsing status failed"
(typeMismatch "status" s)
-- | Response for our query. The `Maybe` types are here because we
-- combine 2 types of responses into one: `pos` and `lemma`.
data JSAsyncTaskResponse =
JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
, _jsAsyncTaskResponse_document :: Map Text Text
, _jsAsyncTaskResponse_sentence :: Map Text [Text]
, _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
, _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
, _jsAsyncTaskResponse_token :: Map Text [Text] }
deriving (Show)
deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
makeLenses ''JSAsyncTaskResponse
-- | We need to combine 2 responses: `pos` and `lemma` spells.
jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
jsAsyncTaskResponseToSentences jsPos jsLemma =
PosSentences { _sentences }
where
_sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
, _sentenceTokens = sTokens }
where
sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
lemmas = fromMaybe [] $
if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
else
Nothing
sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
(Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
, _tokenWord = token
, _tokenOriginalText = ""
, _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
, _tokenCharacterOffsetBegin = -1
, _tokenCharacterOffsetEnd = -1
, _tokenPos = Just pos
, _tokenNer = Nothing
, _tokenBefore = Nothing
, _tokenAfter = Nothing }
-----------------------------------------------------
jsRequest :: Text -> JSSpell -> IO JSAsyncTask
jsRequest t s = do
url <- parseRequest $ "POST http://localhost:5000/api/results"
let jsReq = JSRequest { _jsRequest_data = t
, _jsRequest_format = "text"
, _jsRequest_grouping = Nothing
, _jsRequest_spell = s }
let request = setRequestBodyLBS (encode jsReq) url
task <- httpJSON request :: IO (Response JSAsyncTask)
pure $ getResponseBody task
jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
jsTaskStatus (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
status <- httpJSON url
pure $ getResponseBody status
jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
jsTaskResponse (JSAsyncTask uuid) = do
url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
result <- httpJSON url
pure $ getResponseBody result
waitForJsTask :: HasCallStack => JSAsyncTask -> IO JSAsyncTaskResponse
waitForJsTask jsTask = wait' 0
where
wait' :: Int -> IO JSAsyncTaskResponse
wait' counter = do
status <- jsTaskStatus jsTask
if taskReady status then
jsTaskResponse jsTask
else
if counter > 60 then
throwIO $ withStacktrace $ userError "waited for 1 minute and still no answer from JohnSnow NLP"
else do
-- printDebug "[waitForJsTask] task not ready, waiting" counter
_ <- threadDelay 1000000 -- wait one second between polls
wait' $ counter + 1
getPosTagAndLems :: Lang -> Text -> IO PosSentences
getPosTagAndLems l t = do
jsPosTask <- jsRequest t (JSPOS l)
jsLemmaTask <- jsRequest t (JSLemma l)
-- wait for both tasks
jsPos <- waitForJsTask jsPosTask
jsLemma <- waitForJsTask jsLemmaTask
pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
nlp :: Lang -> Text -> IO PosSentences
nlp = getPosTagAndLems
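-- A minimal end-to-end sketch (assumes a John Snow NLP server listening on
-- localhost:5000, as hard-coded above):
--
-- > example :: IO ()
-- > example = do
-- >   sentences <- getPosTagAndLems EN "The cats are sleeping."
-- >   print sentences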