Commit da3a6888 authored by qlobbe

Merge branch 'dev' into dev-phylo

parents 3e2a42ab db0f6f08
......@@ -13,7 +13,11 @@ Institute of Paris Île-de-France (ISC-PIF) and its partners.
## Installation
curl -sSL https://gitlab.iscpif.fr/gargantext/haskell-gargantext/raw/master/install | sh
### Docker
curl -sSL https://gitlab.iscpif.fr/gargantext/haskell-gargantext/raw/master/devops/docker-install | sh
### Debian
curl -sSL https://gitlab.iscpif.fr/gargantext/haskell-gargantext/raw/master/devops/debian-install | sh
## Use Cases
......
......@@ -24,14 +24,13 @@ import qualified Data.Vector as V
import Gargantext.Prelude
import Gargantext.Text.Search
import Gargantext.Text.Parsers.CSV
import qualified Gargantext.Text.Parsers.CSV as CSV
------------------------------------------------------------------------
type Query = [S.Term]
filterDocs :: [DocId] -> Vector Doc -> Vector Doc
filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds )
filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
main :: IO ()
......@@ -41,17 +40,17 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let q = ["gratuit", "gratuité", "culture", "culturel"]
(h,csvDocs) <- readCsv rPath
(h,csvDocs) <- CSV.readFile rPath
putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
putStrLn $ "Mean size of docs:" <> show ( docsSize csvDocs)
putStrLn $ "Mean size of docs:" <> show ( CSV.docsSize csvDocs)
let docs = toDocs csvDocs
let docs = CSV.toDocs csvDocs
let engine = insertDocs docs initialDocSearchEngine
let docIds = S.query engine (map pack q)
let docs' = fromDocs $ filterDocs docIds (V.fromList docs)
let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
putStrLn $ "Number of documents after:" <> show (V.length docs')
putStrLn $ "Mean size of docs:" <> show (docsSize docs')
putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')
writeCsv wPath (h, docs')
CSV.writeFile wPath (h, docs')
......@@ -57,7 +57,7 @@ import Gargantext.Core.Types
import Gargantext.Text.Terms
import Gargantext.Text.Context
import Gargantext.Text.Terms.WithList
import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
......@@ -105,7 +105,7 @@ main = do
. DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd
<$> readCsv corpusFile
<$> readFile corpusFile
-- termListMap :: [Text]
termList <- csvGraphTermList termListFile
......
......@@ -27,7 +27,7 @@ import Options.Generic
import Data.Text (unpack)
import Gargantext.Prelude
import Gargantext.API (startGargantext, startGargantextMock)
import Gargantext.API (startGargantext) -- , startGargantextMock)
--------------------------------------------------------
-- Graph Tests
......@@ -73,7 +73,9 @@ main = do
myIniFile' = case myIniFile of
Nothing -> panic "[ERROR] gargantext.ini needed"
Just i -> i
_ -> startGargantextMock myPort'
Dev -> panic "[ERROR] Dev mode unsupported"
Mock -> panic "[ERROR] Mock mode unsupported"
-- _ -> startGargantextMock myPort'
putStrLn $ "Starting with " <> show myMode <> " mode."
start
......
#!/bin/bash
stack haddock --no-haddock-deps
#!/bin/bash
if git --version;
then
echo "git installed, ok"
else
sudo apt update && sudo apt install git
fi
sudo apt update
sudo apt install liblzma-dev libpcre3-dev libblas-dev liblapack-dev pkg-config libgsl-dev libbz2-dev postgresql postgresql-server-dev-9.6 nginx
#echo "Which user?"
#read USER
#sudo adduser --disabled-password --gecos "" $USER
#sudo su $USER
curl -sSL https://get.haskellstack.org/ | sh
stack update
stack upgrade
git clone https://gitlab.iscpif.fr/gargantext/haskell-gargantext.git
cd haskell-gargantext
git clone https://gitlab.iscpif.fr/gargantext/purescript-gargantext
mkdir deps
cd deps
git clone https://gitlab.iscpif.fr/gargantext/clustering-louvain.git
git clone https://github.com/np/servant-job.git
git clone https://github.com/np/patches-map
git clone https://gitlab.com/npouillard/patches-class.git
git clone https://github.com/delanoe/haskell-opaleye
git clone -b next --single-branch https://github.com/delanoe/hsparql
cd ..
stack setup
stack build
stack install
# Specific to our servers
### Configure timezone and locale ###################################
echo "########### LOCALES & TZ #################"
echo "Europe/Paris" > /etc/timezone
dpkg-reconfigure --frontend=noninteractive tzdata
#ENV TZ "Europe/Paris"
sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
locale-gen && \
update-locale LANG=fr_FR.UTF-8 && \
update-locale LANGUAGE=fr_FR.UTF-8 && \
update-locale LC_ALL=fr_FR.UTF-8
################################################################
# Database configuration
# CREATE USER gargantua WITH PASSWORD $(grep DB_PASS gargantext.ini)
# GRANT ALL PRIVILEGES ON DATABASE gargandbV4 to gargantua
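# A minimal sketch of the two statements above (assumptions: the database is
# named gargandbV4 and gargantext.ini contains a line like "DB_PASS = secret"):
#   PW=$(grep DB_PASS gargantext.ini | cut -d'=' -f2 | tr -d ' ')
#   sudo -u postgres psql -c "CREATE USER gargantua WITH PASSWORD '$PW';"
#   sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE gargandbV4 TO gargantua;"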
#######################################################################
## POSTGRESQL DATA (as ROOT)
#######################################################################
sed -iP "s%^data_directory.*%data_directory = \'\/srv\/gargandata\'%" /etc/postgresql/9.6/main/postgresql.conf
echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.6/main/pg_hba.conf
echo "listen_addresses='*'" >> /etc/postgresql/9.6/main/postgresql.conf
#!/bin/bash
if git --version;
then
echo "git installed, ok"
else
sudo apt update && sudo apt install git
fi
sudo apt update
sudo apt install liblzma-dev libpcre3-dev libblas-dev liblapack-dev pkg-config libgsl-dev libbz2-dev postgresql postgresql-server-dev-9.6 nginx libpq-dev
jq < repo.json '.state |= map_values(map_values(length)) | .history |= length'
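# The jq filter above summarizes repo.json: each ngrams table under .state is
# replaced by its size, and the .history patch list by its length.
# Hypothetical output shape:
#   { "state": { "Terms": { "123": 4567 } }, "history": 42 }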
name: gargantext
version: '4.0.0.4'
version: '4.0.0.5'
synopsis: Search, map, share
description: Please see README.md
category: Data
......@@ -40,7 +40,6 @@ library:
- Gargantext.Database
- Gargantext.Database.Flow
- Gargantext.Database.Schema.Node
- Gargantext.Database.Cooc
- Gargantext.Database.Tree
- Gargantext.Database.Types.Node
- Gargantext.Database.Utils
......@@ -102,6 +101,7 @@ library:
- conduit-extra
- containers
- contravariant
- crawlerPubMed
- data-time-segment
- directory
- duckling
......@@ -169,6 +169,7 @@ library:
- servant-swagger
- servant-swagger-ui
- servant-static-th
- servant-cassava
- serialise
- split
- stemmer
......
......@@ -56,7 +56,7 @@ import Network.Wai.Handler.Warp hiding (defaultSettings)
import Servant
import Servant.HTML.Blaze (HTML)
import Servant.Mock (mock)
--import Servant.Mock (mock)
--import Servant.Job.Server (WithCallbacks)
import Servant.Static.TH.Internal.Server (fileTreeToServer)
import Servant.Static.TH.Internal.FileTree (fileTypeToFileTree, FileType(FileTypeFile))
......@@ -66,23 +66,24 @@ import Servant.Swagger.UI
import Text.Blaze.Html (Html)
--import Gargantext.API.Swagger
import Gargantext.Prelude
import Gargantext.Core.Types (HasInvalidError(..))
import Gargantext.API.FrontEnd (FrontEndAPI, frontEndServer)
--import Gargantext.Database.Node.Contact (HyperdataContact)
import Gargantext.API.Auth (AuthRequest, AuthResponse, auth)
import Gargantext.API.Ngrams (HasRepo(..), HasRepoSaver(..), saveRepo)
import Gargantext.API.Types
import Gargantext.API.Count ( CountAPI, count, Query)
import Gargantext.API.FrontEnd (FrontEndAPI, frontEndServer)
import Gargantext.API.Ngrams (HasRepo(..), HasRepoSaver(..), saveRepo, TableNgramsApi, apiNgramsTableDoc)
import Gargantext.API.Node
import Gargantext.API.Search ( SearchAPI, search, SearchQuery)
import Gargantext.API.Types
import Gargantext.API.Upload
import Gargantext.Core.Types (HasInvalidError(..))
import Gargantext.Database.Facet
import Gargantext.Database.Schema.Node (HasNodeError(..), NodeError)
--import Gargantext.Database.Node.Contact (HyperdataContact)
import Gargantext.Database.Types.Node
import Gargantext.Database.Utils (HasConnection)
import Gargantext.Database.Tree (HasTreeError(..), TreeError)
import Gargantext.Database.Types.Node
import Gargantext.Database.Types.Node (NodeId, CorpusId, AnnuaireId)
import Gargantext.API.Count ( CountAPI, count, Query)
import Gargantext.API.Search ( SearchAPI, search, SearchQuery)
import Gargantext.Database.Facet
import Gargantext.Database.Utils (HasConnection)
import Gargantext.Prelude
import Gargantext.Viz.Graph.API
--import Gargantext.API.Orchestrator
......@@ -144,7 +145,7 @@ fireWall req fw = do
then pure True
else pure False
{-
-- makeMockApp :: Env -> IO (Warp.Settings, Application)
makeMockApp :: MockEnv -> IO Application
makeMockApp env = do
......@@ -177,7 +178,7 @@ makeMockApp env = do
--pure (warpS, logWare $ checkOriginAndHost $ corsMiddleware $ serverApp)
pure $ logStdoutDev $ checkOriginAndHost $ corsMiddleware $ serverApp
-}
makeDevMiddleware :: IO Middleware
......@@ -248,6 +249,10 @@ type GargAPI' =
:<|> "annuaire":> Summary "Annuaire endpoint"
:> Capture "id" AnnuaireId :> NodeAPI HyperdataAnnuaire
-- Document endpoint
:<|> "document":> Summary "Document endpoint"
:> Capture "id" DocId :> "ngrams" :> TableNgramsApi
-- Nodes endpoint
:<|> "nodes" :> Summary "Nodes endpoint"
:> ReqBody '[JSON] [NodeId] :> NodesAPI
......@@ -274,6 +279,8 @@ type GargAPI' =
:<|> "tree" :> Summary "Tree endpoint"
:> Capture "id" NodeId :> TreeAPI
:<|> "upload" :> ApiUpload
-- :<|> "scraper" :> WithCallbacks ScraperAPI
......@@ -310,11 +317,13 @@ serverGargAPI -- orchestrator
:<|> nodeAPI (Proxy :: Proxy HyperdataAny) fakeUserId
:<|> nodeAPI (Proxy :: Proxy HyperdataCorpus) fakeUserId
:<|> nodeAPI (Proxy :: Proxy HyperdataAnnuaire) fakeUserId
:<|> apiNgramsTableDoc
:<|> nodesAPI
:<|> count -- TODO: undefined
:<|> search
:<|> graphAPI -- TODO: mock
:<|> treeAPI
:<|> upload
-- :<|> orchestrator
where
fakeUserId = 1 -- TODO
......@@ -331,16 +340,16 @@ swaggerFront :: Server SwaggerFrontAPI
swaggerFront = schemaUiServer swaggerDoc
:<|> frontEndServer
gargMock :: Server GargAPI
gargMock = mock apiGarg Proxy
--gargMock :: Server GargAPI
--gargMock = mock apiGarg Proxy
---------------------------------------------------------------------
makeApp :: (HasConnection env, HasRepo env, HasSettings env)
=> env -> IO Application
makeApp = fmap (serve api) . server
appMock :: Application
appMock = serve api (swaggerFront :<|> gargMock :<|> serverStatic)
--appMock :: Application
--appMock = serve api (swaggerFront :<|> gargMock :<|> serverStatic)
---------------------------------------------------------------------
api :: Proxy API
......@@ -405,9 +414,10 @@ startGargantext port file = do
mid <- makeDevMiddleware
run port (mid app) `finally` stopGargantext env
{-
startGargantextMock :: PortNumber -> IO ()
startGargantextMock port = do
portRouteInfo port
application <- makeMockApp . MockEnv $ FireWall False
run port application
-}
......@@ -35,6 +35,9 @@ import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Utils
import Gargantext.Core.Types (CorpusId)
import Gargantext.Prelude
import Gargantext.API.Ngrams
import Gargantext.API.Ngrams.NTree
import Gargantext.Database.Flow
import Gargantext.Viz.Chart
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
......@@ -66,7 +69,6 @@ instance Arbitrary Metric
deriveJSON (unPrefix "metrics_") ''Metrics
deriveJSON (unPrefix "m_") ''Metric
-------------------------------------------------------------
data ChartMetrics a = ChartMetrics { chartMetrics_data :: a }
......@@ -89,24 +91,30 @@ instance Arbitrary Histo
deriveJSON (unPrefix "histo_") ''Histo
instance ToSchema MyTree
instance Arbitrary MyTree
where
arbitrary = MyTree <$> arbitrary <*> arbitrary <*> arbitrary
-- TODO add start / end
getChart :: CorpusId -> Maybe UTCTime -> Maybe UTCTime -> Cmd err (ChartMetrics Histo)
getChart cId _start _end = do
h <- histoData cId
pure (ChartMetrics h)
getPie :: FlowCmdM env err m => CorpusId -> Maybe UTCTime -> Maybe UTCTime -> TabType -> m (ChartMetrics Histo)
getPie cId _start _end tt = do
p <- pieData cId (ngramsTypeFromTabType tt) GraphTerm
pure (ChartMetrics p)
getTree :: FlowCmdM env err m => CorpusId -> Maybe UTCTime -> Maybe UTCTime -> TabType -> ListType -> m (ChartMetrics [MyTree])
getTree cId _start _end tt lt = do
p <- treeData cId (ngramsTypeFromTabType tt) lt
pure (ChartMetrics p)
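-- Note: all three handlers currently ignore their start/end bounds (hence
-- the underscore-prefixed '_start'/'_end' arguments); the TODO above tracks
-- adding date filtering.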
{-
data FacetChart = FacetChart { facetChart_time :: UTCTime'
, facetChart_count :: Double
}
deriving (Show, Generic)
$(deriveJSON (unPrefix "facetChart_") ''FacetChart)
instance ToSchema FacetChart
instance Arbitrary FacetChart where
arbitrary = FacetChart <$> arbitrary <*> arbitrary
-}
{-|
Module : Gargantext.API.Ngrams.NTree
Description : Tree of Ngrams
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Ngrams.NTree
where
import Data.Text (Text)
import Gargantext.Prelude
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Core.Types (ListType(..), NodeId)
import Gargantext.API.Ngrams
import Data.Tree
import Data.Maybe (catMaybes)
import Data.Map (Map)
import Data.Set (Set)
import qualified Data.Set as Set
import qualified Data.Map as Map
import qualified Data.List as List
type Children = Text
type Root = Text
data MyTree = MyTree { mt_label :: Text
, mt_value :: Double
, mt_children :: [MyTree]
} deriving (Generic, Show)
toMyTree :: Tree (Text,Double) -> MyTree
toMyTree (Node (l,v) xs) = MyTree l v (map toMyTree xs)
deriveJSON (unPrefix "mt_") ''MyTree
toTree :: ListType -> Map Text (Set NodeId) -> Map Text NgramsRepoElement -> [MyTree]
toTree lt vs m = map toMyTree $ unfoldForest buildNode roots
  where
    -- A node pairs the term with its score (the number of documents it
    -- occurs in, per 'vs') and unfolds into the children recorded in 'm'.
    buildNode r = maybe ((r, value r),[]) (\x -> ((r, value r), mSetToList $ _nre_children x)) (Map.lookup r m)
    value l = maybe 0 (fromIntegral . Set.size) $ Map.lookup l vs
    -- Candidate roots: each term's declared root, or the term itself when
    -- it is not grouped under another term.
    rootsCandidates = catMaybes
                    $ List.nub
                    $ map (\(c,c') -> case _nre_root c' of
                        Nothing -> Just c
                        _       -> _nre_root c' ) (Map.toList m)
    -- Keep only candidates whose list type matches the requested one.
    roots = map fst
          $ filter (\(_,l) -> l == lt)
          $ catMaybes
          $ map (\c -> (,) <$> Just c <*> (_nre_list <$> Map.lookup c m)) rootsCandidates
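-- Example (sketch): if 'm' groups "ai" under root "artificial intelligence",
-- both listed as GraphTerm, then toTree GraphTerm vs m yields one tree rooted
-- at "artificial intelligence" with "ai" as a child, each node scored by its
-- document count from 'vs'.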
......@@ -58,7 +58,6 @@ mapTermListRoot nodeIds ngramsType = do
| (t, nre) <- Map.toList ngrams
]
filterListWithRoot :: ListType -> Map Text (ListType, Maybe Text)
-> Map Text (Maybe RootTerm)
filterListWithRoot lt m = Map.fromList
......@@ -71,7 +70,6 @@ filterListWithRoot lt m = Map.fromList
Nothing -> panic $ "Garg.API.Ngrams.Tools: filterWithRoot, unknown key: " <> r
Just (l',_) -> l' == lt
groupNodesByNgrams :: Map Text (Maybe RootTerm)
-> Map Text (Set NodeId)
-> Map Text (Set NodeId)
......@@ -97,4 +95,3 @@ getCoocByNgrams (Diagonal diag) m =
False -> listToCombi identity (Map.keys m)
]
......@@ -45,11 +45,11 @@ import Data.Text (Text())
import Data.Time (UTCTime)
import GHC.Generics (Generic)
import Gargantext.API.Metrics
import Gargantext.API.Ngrams (TabType(..), TableNgramsApi, TableNgramsApiGet, tableNgramsPatch, getTableNgrams, QueryParamR)
import Gargantext.API.Ngrams (TabType(..), TableNgramsApi, apiNgramsTableCorpus, QueryParamR)
import Gargantext.API.Search ( SearchAPI, searchIn, SearchInQuery)
import Gargantext.API.Types
import Gargantext.Core.Types (Offset, Limit)
import Gargantext.Core.Types.Main (Tree, NodeTree)
import Gargantext.Core.Types.Main (Tree, NodeTree, ListType)
import Gargantext.Database.Facet (FacetDoc , runViewDocuments, OrderBy(..),runViewAuthorsDoc)
import Gargantext.Database.Node.Children (getChildren)
import Gargantext.Database.Schema.Node ( getNodesWithParentId, getNode, deleteNode, deleteNodes, mkNodeWithParent, JSONB, NodeError(..), HasNodeError(..))
......@@ -61,6 +61,7 @@ import Gargantext.Prelude
import Gargantext.Text.Metrics (Scored(..))
import Gargantext.Viz.Phylo.API (PhyloAPI, phyloAPI)
import Gargantext.Viz.Chart
import Gargantext.API.Ngrams.NTree (MyTree)
import Servant
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
......@@ -121,11 +122,9 @@ type NodeAPI a = Get '[JSON] (Node a)
-- TODO gather it
:<|> "table" :> TableApi
:<|> "list" :> TableNgramsApi
:<|> "listGet" :> TableNgramsApiGet
:<|> "ngrams" :> TableNgramsApi
:<|> "pairing" :> PairingApi
:<|> "favorites" :> FavApi
:<|> "documents" :> DocsApi
:<|> "search":> Summary "Node Search"
......@@ -138,6 +137,8 @@ type NodeAPI a = Get '[JSON] (Node a)
-- VIZ
:<|> "metrics" :> MetricsAPI
:<|> "chart" :> ChartApi
:<|> "pie" :> PieApi
:<|> "tree" :> TreeApi
:<|> "phylo" :> PhyloAPI
-- TODO-ACCESS: check userId CanRenameNode nodeId
......@@ -168,9 +169,9 @@ nodeAPI p uId id
-- TODO gather it
:<|> getTable id
:<|> tableNgramsPatch id
:<|> getTableNgrams id
:<|> apiNgramsTableCorpus id
:<|> getPairing id
-- :<|> getTableNgramsDoc id
:<|> favApi id
:<|> delDocs id
......@@ -178,9 +179,11 @@ nodeAPI p uId id
:<|> getMetrics id
:<|> getChart id
:<|> getPie id
:<|> getTree id
:<|> phyloAPI id
-- Annuaire
-- :<|> upload
-- :<|> query
------------------------------------------------------------------------
......@@ -264,6 +267,21 @@ type ChartApi = Summary " Chart API"
:> QueryParam "to" UTCTime
:> Get '[JSON] (ChartMetrics Histo)
type PieApi = Summary " Pie API"
:> QueryParam "from" UTCTime
:> QueryParam "to" UTCTime
:> QueryParamR "ngramsType" TabType
:> Get '[JSON] (ChartMetrics Histo)
type TreeApi = Summary " Tree API"
:> QueryParam "from" UTCTime
:> QueryParam "to" UTCTime
:> QueryParamR "ngramsType" TabType
:> QueryParamR "listType" ListType
:> Get '[JSON] (ChartMetrics [MyTree])
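-- Hypothetical requests for the endpoints above, on a corpus node with id
-- 1234 (assuming TabType/ListType parse from their constructor names):
--   GET /corpus/1234/pie?ngramsType=Terms
--   GET /corpus/1234/tree?ngramsType=Terms&listType=GraphTerm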
-- Depending on the Type of the Node, we could post
-- New documents for a corpus
-- New map list terms
......@@ -276,7 +294,7 @@ type ChartApi = Summary " Chart API"
instance HasNodeError ServantErr where
_NodeError = prism' mk (const Nothing) -- $ panic "HasNodeError ServantErr: not a prism")
_NodeError = prism' mk (const Nothing) -- panic "HasNodeError ServantErr: not a prism")
where
e = "Gargantext NodeError: "
mk NoListFound = err404 { errBody = e <> "No list found" }
......@@ -294,7 +312,7 @@ instance HasNodeError ServantErr where
-- TODO(orphan): There should be a proper APIError data type with a case TreeError.
instance HasTreeError ServantErr where
_TreeError = prism' mk (const Nothing) -- $ panic "HasTreeError ServantErr: not a prism")
_TreeError = prism' mk (const Nothing) -- panic "HasTreeError ServantErr: not a prism")
where
e = "TreeError: "
mk NoRoot = err404 { errBody = e <> "Root node not found" }
......@@ -340,25 +358,7 @@ query :: Monad m => Text -> m Text
query s = pure s
-- | Upload files
-- TODO Is it possible to adapt the function according to the iValue input?
--upload :: MultipartData -> Handler Text
--upload multipartData = do
-- liftIO $ do
-- putStrLn "Inputs:"
-- forM_ (inputs multipartData) $ \input ->
-- putStrLn $ " " <> show (iName input)
-- <> " -> " <> show (iValue input)
--
-- forM_ (files multipartData) $ \file -> do
-- content <- readFile (fdFilePath file)
-- putStrLn $ "Content of " <> show (fdFileName file)
-- <> " at " <> fdFilePath file
-- putStrLn content
-- pure (pack "Data loaded")
-------------------------------------------------------------------------------
-------------------------------------------------------------
type MetricsAPI = Summary "SepGen IncExc metrics"
:> QueryParam "list" ListId
:> QueryParamR "ngramsType" TabType
......@@ -370,12 +370,11 @@ getMetrics cId maybeListId tabType maybeLimit = do
(ngs', scores) <- Metrics.getMetrics' cId maybeListId tabType maybeLimit
let
metrics = map (\(Scored t s1 s2) -> Metric t s1 s2 (listType t ngs')) scores
metrics = map (\(Scored t s1 s2) -> Metric t (log' 5 s1) (log' 2 s2) (listType t ngs')) scores
log' n x = 1 + (if x <= 0 then 0 else (log $ (10^(n::Int)) * x))
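-- Worked example: log' 5 0.001 = 1 + log (10^5 * 0.001) = 1 + log 100 ≈ 5.6,
-- while any score <= 0 maps to the floor value 1; this compresses the
-- dynamic range of both scores before they are returned as Metrics.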
listType t m = maybe (panic errorMsg) fst $ Map.lookup t m
errorMsg = "API.Node.metrics: key absent"
pure $ Metrics metrics
......@@ -71,13 +71,13 @@ data SendEmailType = SendEmailViaAws
data Settings = Settings
{ _allowedOrigin :: ByteString -- ^ allowed origin for CORS
, _allowedHost :: ByteString -- ^ allowed host for CORS
{ _allowedOrigin :: ByteString -- allowed origin for CORS
, _allowedHost :: ByteString -- allowed host for CORS
, _appPort :: PortNumber
, _logLevelLimit :: LogLevel -- ^ log level from the monad-logger package
, _logLevelLimit :: LogLevel -- log level from the monad-logger package
-- , _dbServer :: Text
-- ^ this is not used yet
, _jwtSecret :: Jose.Jwk -- ^ key from the jose-jwt package
, _jwtSecret :: Jose.Jwk -- key from the jose-jwt package
, _sendLoginEmails :: SendEmailType
, _scrapydUrl :: BaseUrl
, _fileFolder :: FilePath
......@@ -195,22 +195,22 @@ mkRepoSaver repo_var = mkDebounce settings
settings = defaultDebounceSettings
{ debounceFreq = 1000000 -- 1 second
, debounceAction = withMVar repo_var repoSaverAction
-- ^ Here this not only `readMVar` but `takeMVar`.
-- Here this is not only `readMVar` but `takeMVar`.
-- Namely, while repoSaverAction is saving, no other change
-- can be made to the MVar.
-- This might not be efficient and may thus be reconsidered later.
-- However, it enables a *final* save to be performed safely.
-- See `cleanEnv`.
-- Future work:
-- * Add a new MVar just for saving.
-- Add a new MVar just for saving.
}
readRepoEnv :: IO RepoEnv
readRepoEnv = do
-- | Does file exist ? :: Bool
-- Does the file exist? :: Bool
repoFile <- doesFileExist repoSnapshot
-- | Is file not empty ? :: Bool
-- Is the file non-empty? :: Bool
repoExists <- if repoFile
then (>0) <$> getFileSize repoSnapshot
else pure False
......
{-|
Module : Gargantext.API.Upload
Description : Server API
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
module Gargantext.API.Upload
where
import qualified Data.Text as Text
import GHC.Generics (Generic)
import Gargantext.Prelude
import Data.Text (Text)
import Data.Aeson
import Servant
import Servant.Multipart
--import Servant.Mock (HasMock(mock))
import Servant.Swagger (HasSwagger(toSwagger))
-- import qualified Data.ByteString.Lazy as LBS
import Control.Monad
import Control.Monad.IO.Class
import Gargantext.API.Types
--import Servant.CSV.Cassava (CSV'(..))
--import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
--import Data.Swagger
--import Gargantext.API.Ngrams (TODO)
-- | Upload files
-- TODO Is it possible to adapt the function according to the iValue input?
--type API = MultipartForm Mem (MultipartData Mem) :> Post '[JSON] Integer
-- instance Generic Mem
--instance ToSchema Mem
--instance Arbitrary Mem
--instance ToSchema (MultipartData Mem)
--instance Arbitrary ( MultipartData Mem)
instance HasSwagger (MultipartForm tag a :> sub) where
-- TODO
toSwagger _ = undefined -- toSwagger (Proxy :: Proxy (TODO :> Post '[JSON] ()))
--declareNamedSchema _ = declareNamedSchema (Proxy :: Proxy TODO)
--instance Arbitrary (MultipartForm Mem (MultipartData Mem))
{-
instance (FromMultipart tag a, MultipartBackend tag, Servant.Multipart.LookupContext context (MultipartOptions tag))
=> HasMock (MultipartForm tag a :> sub) context where
mock _ _ = undefined
instance HasMock (MultipartForm Mem (MultipartData Mem) :> sub) context where
mock _ _ = undefined
-}
data Upload = Upload { up :: [Text] }
deriving (Generic)
instance ToJSON Upload
type ApiUpload = MultipartForm Mem (MultipartData Mem) :> Post '[JSON] Text
-- MultipartData consists of textual inputs,
-- accessible through its "inputs" field, as well
-- as files, accessible through its "files" field.
upload :: GargServer ApiUpload
upload multipartData = do
--{-
is <- liftIO $ do
putStrLn ("Inputs:" :: Text)
forM (inputs multipartData) $ \input -> do
putStrLn $ ("iName " :: Text) <> (iName input)
<> ("iValue " :: Text) <> (iValue input)
pure $ iName input
--{-
_ <- forM (files multipartData) $ \file -> do
let content = fdPayload file
putStrLn $ ("XXX " :: Text) <> (fdFileName file)
putStrLn $ ("YYY " :: Text) <> cs content
--pure $ cs content
-- is <- inputs multipartData
--}
pure $ Text.concat $ map cs is
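-- A hypothetical client call (host, port and field name are illustrative;
-- the route is mounted at "upload" in GargAPI'):
--   curl -X POST -F "file=@docs.csv" http://localhost:8008/upload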
-------------------------------------------------------------------------------
......@@ -148,38 +148,3 @@ put = U.update
-- type Name = Text
--mkCorpus :: ToJSON a => Name -> (a -> Text) -> [a] -> Cmd err NewNode
--mkCorpus name title ns = do
-- pid <- home
--
-- let pid' = case lastMay pid of
-- Nothing -> printDebug "No home for" name
-- Just p -> p
--
-- let uid = 1
-- postNode uid (Just pid') ( Node' NodeCorpus name emptyObject
-- (map (\n -> Node' Document (title n) (toJSON n) []) ns)
-- )
--
---- |
---- import IMTClient as C
---- postAnnuaire "Annuaire IMT" (\n -> (maybe "" identity (C.prenom n)) <> " " <> (maybe "" identity (C.nom n))) (take 30 annuaire)
--mkAnnuaire :: ToJSON a => Name -> (a -> Text) -> [a] -> Cmd err NewNode
--mkAnnuaire name title ns = do
-- pid <- lastMay <$> home
-- let pid' = case lastMay pid of
-- Nothing -> printDebug "No home for" name
-- Just p -> p
-- let uid = 1
-- postNode uid (Just pid') ( Node' Annuaire name emptyObject
-- (map (\n -> Node' UserPage (title n) (toJSON n) []) ns)
-- )
--------------------------------------------------------------
-- |
-- myCorpus <- Prelude.map doc2hyperdataDocument <$> toDocs <$> snd <$> readCsv "doc/corpus_imt/Gargantext_Corpus_small.csv"
-- There is an error in the CSV parsing...
-- let myCorpus' = Prelude.filter (\n -> T.length (maybe "" identity (hyperdataDocument_title n)) > 30) myCorpus
-- corporaOf :: Username -> IO [Corpus]
{-|
Module : Gargantext.Database.Cooc
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
--{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RankNTypes #-}
module Gargantext.Database.Cooc where
import Database.PostgreSQL.Simple.SqlQQ
import Gargantext.Prelude
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.API.Settings (runCmdDevNoErr, DevEnv)
type CorpusId = Int
type MainListId = Int
type GroupListId = Int
coocTest :: DevEnv -> IO [(Int, Int, Int)]
coocTest env = runCmdDevNoErr env $ dBcooc 421968 446602 446599
dBcooc :: CorpusId -> MainListId -> GroupListId -> Cmd err [(Int, Int, Int)]
dBcooc corpus mainList groupList = runPGSQuery [sql|
set work_mem='1GB';
--EXPLAIN ANALYZE
WITH COOC as (
SELECT
COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
COUNT(*) AS score
FROM
nodes AS n
-- / -- X Y
-- SQL graph for getting the cooccurrences
-- STEP 1: X axis of the matrix
INNER JOIN nodes_ngrams
AS ngA ON ngA.node_id = n.id
-- \--> get the occurrences node/ngram of the corpus
INNER JOIN nodes_ngrams
AS wlA ON ngA.ngram_id = wlA.ngram_id
AND wlA.node_id = ?
-- \--> filter with white/main list (typename 7)
LEFT JOIN nodes_ngrams_ngrams
AS grA ON wlA.ngram_id = grA.ngram1_id
AND grA.node_id = ?
-- \--> adding (joining) ngrams that are grouped (typename 6)
LEFT JOIN nodes_ngrams
AS wlAA ON grA.ngram2_id = wlAA.ngram_id
AND wlAA.node_id = wlA.node_id
-- \--> adding (joining) ngrams that are not grouped
--LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
-- STEP 2: Y axis of the matrix
INNER JOIN nodes_ngrams
AS ngB ON ngB.node_id = n.id
-- \--> get the occurrences node/ngram of the corpus
INNER JOIN nodes_ngrams
AS wlB ON ngB.ngram_id = wlB.ngram_id
AND wlB.node_id = ?
-- \--> filter with white/main list
LEFT JOIN nodes_ngrams_ngrams
AS grB ON wlB.ngram_id = grB.ngram1_id
AND grB.node_id = ?
-- \--> adding (joining) ngrams that are grouped
LEFT JOIN nodes_ngrams
AS wlBB ON grB.ngram2_id = wlBB.ngram_id
AND wlBB.node_id = wlB.node_id
-- \--> adding (joining) ngrams that are not grouped
-- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
-- \--> for joining all synonyms even if they are not in the main list (white list)
WHERE
n.typename = 4
AND n.parent_id = ?
GROUP BY 1,2
-- ==
-- GROUP BY ngA, ngB
)
SELECT ngA, ngB, score
FROM COOC --> from the query above
WHERE score >= 3
AND
ngA <= ngB
|] (mainList, groupList, mainList, groupList, corpus)
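-- The five '?' placeholders bind, in order: wlA.node_id (mainList),
-- grA.node_id (groupList), wlB.node_id (mainList), grB.node_id (groupList)
-- and n.parent_id (corpus), matching the parameter tuple above.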
......@@ -23,6 +23,7 @@ Portability : POSIX
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
------------------------------------------------------------------------
module Gargantext.Database.Facet
where
......@@ -171,18 +172,20 @@ type Trash = Bool
data OrderBy = DateAsc | DateDesc
| TitleAsc | TitleDesc
| ScoreDesc | ScoreAsc
| SourceAsc | SourceDesc
deriving (Generic, Enum, Bounded, Read, Show)
-- | NgramCoun
instance FromHttpApiData OrderBy
where
parseUrlPiece "DateAsc" = pure DateAsc
parseUrlPiece "DateDesc" = pure DateDesc
parseUrlPiece "TitleAsc" = pure TitleAsc
parseUrlPiece "TitleDesc" = pure TitleDesc
parseUrlPiece "DateAsc" = pure DateAsc
parseUrlPiece "DateDesc" = pure DateDesc
parseUrlPiece "TitleAsc" = pure TitleAsc
parseUrlPiece "TitleDesc" = pure TitleDesc
parseUrlPiece "ScoreAsc" = pure ScoreAsc
parseUrlPiece "ScoreDesc" = pure ScoreDesc
parseUrlPiece _ = Left "Unexpected value of OrderBy"
parseUrlPiece "SourceAsc" = pure SourceAsc
parseUrlPiece "SourceDesc" = pure SourceDesc
parseUrlPiece _ = Left "Unexpected value of OrderBy"
instance ToParamSchema OrderBy
instance FromJSON OrderBy
......@@ -251,7 +254,7 @@ viewDocuments cId t ntId = proc () -> do
------------------------------------------------------------------------
filterWith :: (PGOrd date, PGOrd title, PGOrd score) =>
filterWith :: (PGOrd date, PGOrd title, PGOrd score, hyperdata ~ Column SqlJsonb) =>
Maybe Gargantext.Core.Types.Offset
-> Maybe Gargantext.Core.Types.Limit
-> Maybe OrderBy
......@@ -260,14 +263,24 @@ filterWith :: (PGOrd date, PGOrd title, PGOrd score) =>
filterWith o l order q = limit' l $ offset' o $ orderBy (orderWith order) q
orderWith :: (PGOrd b1, PGOrd b2, PGOrd b3) => Maybe OrderBy -> Order (Facet id (Column b1) (Column b2) hyperdata (Column b3) score)
orderWith order = case order of
(Just DateAsc) -> asc facetDoc_created
(Just TitleAsc) -> asc facetDoc_title
(Just TitleDesc) -> desc facetDoc_title
(Just ScoreAsc) -> asc facetDoc_favorite
(Just ScoreDesc) -> desc facetDoc_favorite
_ -> desc facetDoc_created
orderWith :: (PGOrd b1, PGOrd b2, PGOrd b3)
=> Maybe OrderBy
-> Order (Facet id (Column b1) (Column b2) (Column SqlJsonb) (Column b3) score)
orderWith (Just DateAsc) = asc facetDoc_created
orderWith (Just DateDesc) = desc facetDoc_created
orderWith (Just TitleAsc) = asc facetDoc_title
orderWith (Just TitleDesc) = desc facetDoc_title
orderWith (Just ScoreAsc) = asc facetDoc_favorite
orderWith (Just ScoreDesc) = desc facetDoc_favorite
orderWith (Just SourceAsc) = asc facetDoc_source
orderWith (Just SourceDesc) = desc facetDoc_source
orderWith _ = asc facetDoc_created
facetDoc_source :: PGIsJson a
=> Facet id created title (Column a) favorite ngramCount
-> Column (Nullable PGText)
facetDoc_source x = toNullable (facetDoc_hyperdata x) .->> pgString "source"
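-- (.->>) is the JSON text-extraction operator: it reads the "source" key of
-- the hyperdata JSONB column as (nullable) text, which is what the
-- SourceAsc/SourceDesc cases of orderWith sort on.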
......@@ -12,6 +12,10 @@ Portability : POSIX
-- check userId CanFillUserCorpus userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId
-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}
{-# LANGUAGE ConstraintKinds #-}
......@@ -24,14 +28,6 @@ Portability : POSIX
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where
--import Gargantext.Database.Metrics.Count (getNgramsElementsWithParentNodeId)
--import Gargantext.Database.Metrics.TFICF (getTficf)
--import Gargantext.Database.Node.Contact (HyperdataContact(..))
--import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
--import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Debug.Trace (trace)
import Control.Lens ((^.), view, Lens', _Just)
import Control.Monad (mapM_)
......@@ -43,35 +39,37 @@ import Data.Monoid
import Data.Text (Text, splitOn, intercalate)
import GHC.Show (Show)
import Gargantext.API.Ngrams (HasRepoVar)
import Gargantext.API.Ngrams (NgramsElement, putListNgrams, RepoCmdM)
import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (NodePoly(..), Terms(..))
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main
import Gargantext.Database.TextSearch (searchInDatabase)
import Gargantext.Database.Config (userMaster, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
import Gargantext.Database.Schema.User (getUser, UserLight(..))
import Gargantext.Database.TextSearch (searchInDatabase)
import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Parsers (parseDocs, FileFormat)
import Gargantext.Text.Parsers (parseFile, FileFormat)
import Gargantext.Text.Terms (TermType(..), tt_lang)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
import Servant (ServantErr)
import System.FilePath (FilePath)
import qualified Data.Map as DM
--import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
type FlowCmdM env err m =
( CmdM env err m
......@@ -110,13 +108,13 @@ flowCorpusDebat u n l fp = do
flowCorpusFile :: FlowCmdM env ServantErr m
=> Username -> CorpusName
-> Limit -- ^ Limit the number of docs (for dev purpose)
-> Limit -- Limit the number of docs (for dev purpose)
-> TermType Lang -> FileFormat -> FilePath
-> m CorpusId
flowCorpusFile u n l la ff fp = do
docs <- liftIO ( splitEvery 500
<$> take l
<$> parseDocs ff fp
<$> parseFile ff fp
)
flowCorpus u n la (map (map toHyperdataDocument) docs)
......@@ -130,10 +128,6 @@ flowCorpusSearchInDatabase u la q = do
------------------------------------------------------------------------
-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
=> Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flow c u cn la docs = do
......@@ -182,15 +176,14 @@ insertMasterDocs c lang hs = do
-- TODO Type NodeDocumentUnicised
let hs' = map addUniqId hs
ids <- insertDb masterUserId masterCorpusId hs'
let documentsWithId = mergeData (toInserted ids) (DM.fromList $ map viewUniqId' hs')
let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams
terms2id <- insertNgrams $ DM.keys maps
let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
_ <- insertToNodeNgrams indexedNgrams
maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
terms2id <- insertNgrams $ Map.keys maps
let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
lId <- getOrMkList masterCorpusId masterUserId
_ <- insertDocNgrams lId indexedNgrams
pure $ map reId ids
......@@ -255,7 +248,7 @@ viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
. filter (\r -> reInserted r == True)
data DocumentWithId a = DocumentWithId
......@@ -266,7 +259,7 @@ data DocumentWithId a = DocumentWithId
mergeData :: Map HashId ReturnId
-> Map HashId a
-> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . DM.toList
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
where
toDocumentWithId (hash,hpd) =
DocumentWithId <$> fmap reId (lookup hash rs)
......@@ -296,7 +289,7 @@ instance ExtractNgramsT HyperdataContact
$ maybe ["Nothing"] (\a -> [a])
$ view (hc_who . _Just . cw_lastName) hc'
pure $ DM.fromList $ [(a', DM.singleton Authors 1) | a' <- authors ]
pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
......@@ -333,15 +326,15 @@ extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
<$> concat
<$> liftIO (extractTerms lang' leText)
pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
<> [(i', DM.singleton Institutes 1) | i' <- institutes ]
<> [(a', DM.singleton Authors 1) | a' <- authors ]
<> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ]
<> [(a', Map.singleton Authors 1) | a' <- authors ]
<> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
......@@ -365,11 +358,11 @@ documentIdWithNgrams f = mapM toDocumentIdWithNgrams
-- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams a]
-> Map Ngrams (Map NgramsType (Map NodeId Int))
mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
where
f :: DocumentIdWithNgrams a
-> Map Ngrams (Map NgramsType (Map NodeId Int))
f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
where
nId = documentId $ documentWithId d
......@@ -388,5 +381,6 @@ flowList uId cId ngs = do
lId <- getOrMkList cId uId
printDebug "listId flowList" lId
listInsert lId ngs
--trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
pure lId
......@@ -23,6 +23,8 @@ import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Types.Node (NodeId, Node, NodePoly(..), Hyperdata)
import Gargantext.Database.Utils (Cmd)
import Gargantext.Database.Schema.NodeNgram
import Gargantext.Database.Schema.NodeNodeNgrams
import Gargantext.Database.Types.Node
import Gargantext.Core.Types.Main (ListType(..), listTypeId)
toMaps :: Hyperdata a => (a -> Map (NgramsT Ngrams) Int) -> [Node a] -> Map (NgramsT Ngrams) (Map NodeId Int)
......@@ -56,6 +58,7 @@ data DocumentIdWithNgrams a =
-- | TODO for now, the ListType is CandidateTerm because Graph Terms
-- have to be detected in the next step of the flow
-- TODO remove this
insertToNodeNgrams :: Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err Int
insertToNodeNgrams m = insertNodeNgrams [ NodeNgram n (_ngramsId ng) Nothing (ngramsTypeId t) (listTypeId CandidateTerm) (fromIntegral i)
| (ng, t2n2i) <- DM.toList m
......@@ -63,3 +66,23 @@ insertToNodeNgrams m = insertNodeNgrams [ NodeNgram n (_ngramsId ng) Nothing (ng
, (n, i) <- DM.toList n2i
]
docNgrams2nodeNodeNgrams :: CorpusId -> DocNgrams -> NodeNodeNgrams
docNgrams2nodeNodeNgrams cId (DocNgrams d n nt w) = NodeNodeNgrams Nothing cId d n nt w
data DocNgrams = DocNgrams { dn_doc_id :: DocId
, dn_ngrams_id :: Int
, dn_ngrams_type :: NgramsTypeId
, dn_weight :: Double
}
insertDocNgramsOn :: CorpusId -> [DocNgrams] -> Cmd err Int
insertDocNgramsOn cId dn = insertNodeNodeNgrams $ (map (docNgrams2nodeNodeNgrams cId) dn)
insertDocNgrams :: CorpusId -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err Int
insertDocNgrams cId m = insertDocNgramsOn cId [ DocNgrams n (_ngramsId ng) (ngramsTypeId t) (fromIntegral i)
| (ng, t2n2i) <- DM.toList m
, (t, n2i) <- DM.toList t2n2i
, (n, i) <- DM.toList n2i
]
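-- insertDocNgrams thus flattens the nested Map (ngrams -> ngrams type ->
-- document -> count) into one row per (document, ngram, type), with the
-- count as weight, and inserts them as NodeNodeNgrams under the corpus id.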
......@@ -22,9 +22,10 @@ import Data.Map (Map)
import Data.Text (Text)
import Gargantext.API.Ngrams (TabType(..), ngramsTypeFromTabType)
import Gargantext.API.Ngrams.Tools (filterListWithRoot, groupNodesByNgrams, Diagonal(..), getCoocByNgrams, mapTermListRoot, RootTerm)
import Gargantext.Core.Types (ListType(..), Limit)
import Gargantext.Core.Types (ListType(..), Limit, NodeType(..))
import Gargantext.Database.Flow (FlowCmdM)
import Gargantext.Database.Metrics.NgramsByNode (getNodesByNgramsOnlyUser, getTficfWith)
import Gargantext.Database.Node.Select
import Gargantext.Database.Schema.Node (defaultList)
import Gargantext.Database.Types.Node (ListId, CorpusId, HyperdataCorpus)
import Gargantext.Database.Flow (getOrMkRootWithCorpus)
......@@ -51,7 +52,10 @@ getMetrics cId maybeListId tabType maybeLimit = do
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
metrics' <- getTficfWith cId masterCorpusId (ngramsTypeFromTabType tabType) ngs'
lId <- defaultList cId
lIds <- selectNodesWithUsername NodeList userMaster
metrics' <- getTficfWith cId masterCorpusId (lIds <> [lId]) (ngramsTypeFromTabType tabType) ngs'
pure (ngs , toScored [metrics, Map.fromList $ map (\(a,b) -> (a, Vec.fromList [fst b])) $ Map.toList metrics'])
......@@ -79,10 +83,13 @@ getNgramsCooc cId maybeListId tabType maybeLimit = do
let
take' Nothing xs = xs
take' (Just n) xs = take n xs
lId <- defaultList cId
lIds <- selectNodesWithUsername NodeList userMaster
myCooc <- Map.filter (>1) <$> getCoocByNgrams (Diagonal True)
<$> groupNodesByNgrams ngs
<$> getNodesByNgramsOnlyUser cId (ngramsTypeFromTabType tabType)
<$> getNodesByNgramsOnlyUser cId (lIds <> [lId]) (ngramsTypeFromTabType tabType)
(take' maybeLimit $ Map.keys ngs)
pure $ (ngs', ngs, myCooc)
......
{-|
Module : Gargantext.Database.Metrics.Count
Description : Ngram connection to the Database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Count Ngrams by Context
-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RankNTypes #-}
module Gargantext.Database.Metrics.Count where
import Control.Arrow (returnA)
import Control.Lens (view)
import Data.Map.Strict (Map, fromListWith, elems)
import Data.Monoid (mempty)
import Data.Text (Text)
import Database.PostgreSQL.Simple.SqlQQ (sql)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement)
import Gargantext.Core.Types.Main (listTypeId, ListType(..))
import Gargantext.Database.Access
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Queries.Join (leftJoin4, leftJoin5, leftJoin3)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsType(..), ngramsTypeId, Ngrams(..), NgramsIndexed(..), ngrams, ngramsTerms, fromNgramsTypeId)
import Gargantext.Database.Schema.Node
import Gargantext.Database.Schema.Node (HasNodeError(..))
import Gargantext.Database.Schema.NodeNgram
import Gargantext.Database.Schema.NodeNode
import Gargantext.Database.Schema.NodeNodeNgrams
import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
import Gargantext.Database.Utils
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Prelude hiding (sum)
import Gargantext.Text.Metrics.Count (Coocs, coocOn)
import Opaleye
import Safe (headMay)
import qualified Database.PostgreSQL.Simple as PGS
getCoocByDocDev :: HasNodeError err => CorpusId -> ListId -> Cmd err (Map ([Text], [Text]) Int)
getCoocByDocDev cId lId = coocOn (\n-> [ view ( ngrams . ngramsTerms) n]) <$> getNgramsByDoc cId lId
getCoocByDoc :: CorpusId -> ListId -> Cmd err (Map (NgramsIndexed, NgramsIndexed) Coocs)
getCoocByDoc cId lId = coocOn identity <$> getNgramsByDoc cId lId
getNgramsByDoc :: CorpusId -> ListId -> Cmd err [[NgramsIndexed]]
getNgramsByDoc cId lId =
elems
<$> fromListWith (<>)
<$> map (\(nId, ngId, nt, n) -> (nId, [NgramsIndexed (Ngrams nt n) ngId]))
<$> getNgramsByDocDb cId lId
getNgramsByDocDb :: CorpusId -> ListId -> Cmd err [(NodeId, NgramsId, Text, Int)]
getNgramsByDocDb cId lId = runPGSQuery query params
where
params = (cId, lId, listTypeId GraphTerm, ngramsTypeId NgramsTerms)
query = [sql|
-- TODO add CTE
SELECT n.id, ng.id, ng.terms, ng.n -- , list.parent_id
FROM nodes n
JOIN nodes_nodes nn ON nn.node2_id = n.id
JOIN nodes_ngrams nng ON nng.node_id = nn.node2_id
JOIN nodes_ngrams list ON list.ngrams_id = nng.ngrams_id
JOIN ngrams ng ON ng.id = nng.ngrams_id
WHERE nn.node1_id = ? -- CorpusId
AND list.node_id = ? -- ListId
AND list.list_type = ? -- GraphListId
AND list.ngrams_type = ? -- NgramsTypeId
|]
getNgramsByNode :: NodeId -> NgramsType -> Cmd err [[Text]]
getNgramsByNode nId nt = elems
<$> fromListWith (<>)
<$> map (\(i,t) -> (i,[t]))
<$> getNgramsByNodeNodeIndexed nId nt
-- | TODO add join with nodeNodeNgram (if it exists)
getNgramsByNodeNodeIndexed :: NodeId -> NgramsType -> Cmd err [(NodeId, Text)]
getNgramsByNodeNodeIndexed nId nt = runOpaQuery (select' nId)
where
select' nId' = proc () -> do
(ng,(nng,(nn,n))) <- getNgramsByNodeNodeIndexedJoin -< ()
restrict -< _node_id n .== toNullable (pgNodeId nId')
restrict -< nng_ngramsType nng .== toNullable (pgNgramsTypeId $ ngramsTypeId nt)
restrict -< nn_delete nn ./= (toNullable . pgBool) True
returnA -< (nng_node_id nng, ngrams_terms ng)
{-
getNgramsByNodeIndexed' :: NodeId -> NgramsType -> Cmd err [(NodeId, Maybe Text)]
getNgramsByNodeIndexed' nId nt = runOpaQuery (select' nId)
where
select' nId' = proc () -> do
(nnng,(ng,(nng,(_,n)))) <- getNgramsByNodeIndexedJoin5 -< ()
restrict -< _node_id n .== toNullable (pgNodeId nId')
restrict -< nng_ngramsType nng .== toNullable (pgNgramsTypeId $ ngramsTypeId nt)
let node_id' = ifThenElse (isNull $ toNullable $ nnng_node1_id nnng)
(nng_node_id nng)
(nnng_node2_id nng)
let t1 = ifThenElse (isNull $ toNullable $ nnng_node1_id nnng)
(ngrams_terms ng)
(nnng_terms nng)
returnA -< (n1, t1)
--}
getNgramsByNodeNodeIndexedJoin :: Query ( NgramsRead
, (NodeNgramReadNull
, (NodeNodeReadNull
, NodeReadNull
)
)
)
getNgramsByNodeNodeIndexedJoin = leftJoin4 queryNodeTable
queryNodeNodeTable
queryNodeNgramTable
queryNgramsTable
c1 c2 c3
where
c1 :: (NodeNodeRead, NodeRead) -> Column PGBool
c1 (nn,n) = nn_node1_id nn .== _node_id n
c2 :: ( NodeNgramRead
, (NodeNodeRead
, NodeReadNull
)
) -> Column PGBool
c2 (nng,(nn',_)) = (nng_node_id nng) .== nn_node2_id nn'
c3 :: ( NgramsRead
, ( NodeNgramRead
, ( NodeNodeReadNull
, NodeReadNull
)
)
) -> Column PGBool
c3 (ng,(nng',(_,_))) = (ngrams_id ng) .== nng_ngrams_id nng'
getNgramsByNodeNodeIndexedJoin5 :: Query ( NodeNodeNgramsRead
, (NgramsReadNull
, (NodeNgramReadNull
, (NodeNodeReadNull
, NodeReadNull
)
)
)
)
getNgramsByNodeNodeIndexedJoin5 = leftJoin5 queryNodeTable
queryNodeNodeTable
queryNodeNgramTable
queryNgramsTable
queryNodeNodeNgramsTable
c1 c2 c3 c4
where
c1 :: (NodeNodeRead, NodeRead) -> Column PGBool
c1 (nn,n) = nn_node1_id nn .== _node_id n
c2 :: ( NodeNgramRead
, (NodeNodeRead
, NodeReadNull
)
) -> Column PGBool
c2 (nng,(nn',_)) = (nng_node_id nng) .== nn_node2_id nn'
c3 :: ( NgramsRead
, ( NodeNgramRead
, ( NodeNodeReadNull
, NodeReadNull
)
)
) -> Column PGBool
c3 (ng,(nng',(_,_))) = (ngrams_id ng) .== nng_ngrams_id nng'
c4 :: ( NodeNodeNgramsRead
, (NgramsRead
, ( NodeNgramReadNull
, ( NodeNodeReadNull
, NodeReadNull
)
)
)
) -> Column PGBool
c4 (nnng,(_,(_,(nn,_)))) = (toNullable $ nnng_node1_id nnng) .== (nn_node1_id nn)
.&& (toNullable $ nnng_node2_id nnng) .== (nn_node2_id nn)
--}
--{-
getNgramsElementsWithParentNodeId :: NodeId -> Cmd err (Map NgramsType [NgramsElement])
getNgramsElementsWithParentNodeId nId = do
ns <- getNgramsWithParentNodeId nId
pure $ fromListWith (<>)
[ (maybe (panic "error") identity $ fromNgramsTypeId nt,
[mkNgramsElement ng CandidateTerm Nothing mempty])
| (_,(nt,ng)) <- ns
]
-------------------------------------------------------------------------
getNgramsWithParentNodeId :: NodeId -> Cmd err [(NodeId, (NgramsTypeId, Text))]
getNgramsWithParentNodeId nId = runOpaQuery (select nId)
where
select nId' = proc () -> do
(ng,(nng,n)) <- getNgramsWithParentNodeIdJoin -< ()
restrict -< _node_parentId n .== (toNullable $ pgNodeId nId')
restrict -< _node_typename n .== (toNullable $ pgInt4 $ nodeTypeId NodeDocument)
returnA -< (nng_node_id nng, (nng_ngramsType nng, ngrams_terms ng))
--}
getNgramsWithParentNodeIdJoin :: Query ( NgramsRead
, ( NodeNgramReadNull
, NodeReadNull
)
)
getNgramsWithParentNodeIdJoin = leftJoin3 queryNodeTable queryNodeNgramTable queryNgramsTable on1 on2
where
on1 :: (NodeNgramRead, NodeRead) -> Column PGBool
on1 (nng,n) = nng_node_id nng .== _node_id n
on2 :: (NgramsRead, (NodeNgramRead, NodeReadNull))-> Column PGBool
on2 (ng, (nng,_)) = ngrams_id ng .== nng_ngrams_id nng
countCorpusDocuments :: Roles -> Int -> Cmd err Int
countCorpusDocuments r cId = maybe 0 identity
<$> headMay
<$> map (\(PGS.Only n) -> n)
<$> runQuery' r cId
where
runQuery' RoleUser cId' = runPGSQuery
"SELECT count(*) from nodes_nodes nn WHERE nn.node1_id = ? AND nn.delete = False"
(PGS.Only cId')
runQuery' RoleMaster cId' = runPGSQuery
"SELECT count(*) from nodes n WHERE n.parent_id = ? AND n.typename = ?"
(cId', nodeTypeId NodeDocument)
......@@ -69,11 +69,11 @@ getTficf' u m nt f = do
(countNodesByNgramsWith f m')
--{-
getTficfWith :: UserCorpusId -> MasterCorpusId
getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
-> NgramsType -> Map Text (Maybe Text)
-> Cmd err (Map Text (Double, Set Text))
getTficfWith u m nt mtxt = do
u' <- getNodesByNgramsOnlyUser u nt (Map.keys mtxt)
getTficfWith u m ls nt mtxt = do
u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
m' <- getNodesByNgramsMaster u m
let f x = case Map.lookup x mtxt of
......@@ -126,34 +126,35 @@ getNodesByNgramsUser :: CorpusId -> NgramsType
getNodesByNgramsUser cId nt =
fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
<$> selectNgramsByNodeUser cId nt
where
selectNgramsByNodeUser :: CorpusId -> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId nt =
runPGSQuery queryNgramsByNodeUser
( cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
, 1000 :: Int -- limit
, 0 :: Int -- offset
)
queryNgramsByNodeUser :: DPS.Query
queryNgramsByNodeUser = [sql|
SELECT nng.node_id, ng.terms FROM nodes_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND n.typename = ? -- NodeTypeId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.delete = False
GROUP BY nng.node_id, ng.terms
ORDER BY (nng.node_id, ng.terms) DESC
LIMIT ?
OFFSET ?
|]
selectNgramsByNodeUser :: CorpusId -> NgramsType
-> Cmd err [(NodeId, Text)]
selectNgramsByNodeUser cId' nt' =
runPGSQuery queryNgramsByNodeUser
( cId'
, nodeTypeId NodeDocument
, ngramsTypeId nt'
-- , 100 :: Int -- limit
-- , 0 :: Int -- offset
)
queryNgramsByNodeUser :: DPS.Query
queryNgramsByNodeUser = [sql|
SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND n.typename = ? -- NodeTypeId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.delete = False
GROUP BY nng.node2_id, ng.terms
ORDER BY (nng.node2_id, ng.terms) DESC
-- LIMIT ?
-- OFFSET ?
|]
------------------------------------------------------------------------
-- TODO add groups
getOccByNgramsOnlyFast :: CorpusId -> NgramsType -> [Text]
......@@ -162,17 +163,21 @@ getOccByNgramsOnlyFast cId nt ngs =
fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
-- just slower than getOccByNgramsOnlyFast
getOccByNgramsOnlySlow :: CorpusId -> NgramsType -> [Text]
getOccByNgramsOnlySlow :: NodeType -> CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text Int)
getOccByNgramsOnlySlow cId nt ngs =
Map.map Set.size <$> getNodesByNgramsOnlyUser cId nt ngs
getOccByNgramsOnlySlow t cId ls nt ngs =
Map.map Set.size <$> getScore' t cId ls nt ngs
where
getScore' NodeCorpus = getNodesByNgramsOnlyUser
getScore' NodeDocument = getNgramsByDocOnlyUser
getScore' _ = getNodesByNgramsOnlyUser
getOccByNgramsOnlySafe :: CorpusId -> NgramsType -> [Text]
getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text Int)
getOccByNgramsOnlySafe cId nt ngs = do
getOccByNgramsOnlySafe cId ls nt ngs = do
printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
fast <- getOccByNgramsOnlyFast cId nt ngs
slow <- getOccByNgramsOnlySlow cId nt ngs
slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
when (fast /= slow) $
printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
pure slow
......@@ -197,29 +202,30 @@ queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
queryNgramsOccurrencesOnlyByNodeUser = [sql|
WITH input_rows(terms) AS (?)
SELECT ng.terms, COUNT(nng.node_id) FROM nodes_ngrams nng
SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND n.typename = ? -- NodeTypeId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.delete = False
GROUP BY nng.node_id, ng.terms
GROUP BY nng.node2_id, ng.terms
|]
getNodesByNgramsOnlyUser :: CorpusId -> NgramsType -> [Text]
getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId))
getNodesByNgramsOnlyUser cId nt ngs = Map.unionsWith (<>)
getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>)
. map (fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByNodeUser cId nt) (splitEvery 1000 ngs)
<$> mapM (selectNgramsOnlyByNodeUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByNodeUser cId nt tms =
selectNgramsOnlyByNodeUser cId ls nt tms =
runPGSQuery queryNgramsOnlyByNodeUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, cId
, nodeTypeId NodeDocument
, ngramsTypeId nt
......@@ -230,19 +236,58 @@ selectNgramsOnlyByNodeUser cId nt tms =
queryNgramsOnlyByNodeUser :: DPS.Query
queryNgramsOnlyByNodeUser = [sql|
WITH input_rows(terms) AS (?)
SELECT ng.terms, nng.node_id FROM nodes_ngrams nng
WITH input_rows(terms) AS (?),
input_list(id) AS (?)
SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN nodes_nodes nn ON nn.node2_id = nng.node_id
JOIN input_list il ON il.id = nng.node1_id
JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
JOIN nodes n ON nn.node2_id = n.id
WHERE nn.node1_id = ? -- CorpusId
AND n.typename = ? -- NodeTypeId
AND nng.ngrams_type = ? -- NgramsTypeId
AND nn.delete = False
GROUP BY nng.node_id, ng.terms
GROUP BY ng.terms, nng.node2_id
|]
getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
-> Cmd err (Map Text (Set NodeId))
getNgramsByDocOnlyUser cId ls nt ngs = Map.unionsWith (<>)
. map (fromListWith (<>) . map (second Set.singleton))
<$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
-> Cmd err [(Text, NodeId)]
selectNgramsOnlyByDocUser dId ls nt tms =
runPGSQuery queryNgramsOnlyByDocUser
( Values fields (DPS.Only <$> tms)
, Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
, dId
, ngramsTypeId nt
)
where
fields = [QualifiedIdentifier Nothing "text"]
queryNgramsOnlyByDocUser :: DPS.Query
queryNgramsOnlyByDocUser = [sql|
WITH input_rows(terms) AS (?),
input_list(id) AS (?)
SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
JOIN ngrams ng ON nng.ngrams_id = ng.id
JOIN input_rows ir ON ir.terms = ng.terms
JOIN input_list il ON il.id = nng.node1_id
WHERE nng.node2_id = ? -- DocId
AND nng.ngrams_type = ? -- NgramsTypeId
GROUP BY ng.terms, nng.node2_id
|]
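-- A hypothetical call (ids and terms are illustrative):
--
-- >>> rows <- selectNgramsOnlyByDocUser (NodeId 99) [NodeId 7] NgramsTerms ["water"]
-- rows :: [(Text, NodeId)], pairing each matched term with the document id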
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
......@@ -272,6 +317,7 @@ selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
, ngramsTypeId NgramsTerms
)
-- | TODO fix node_node_ngrams relation
queryNgramsByNodeMaster' :: DPS.Query
queryNgramsByNodeMaster' = [sql|
......@@ -279,7 +325,7 @@ WITH nodesByNgramsUser AS (
SELECT n.id, ng.terms FROM nodes n
JOIN nodes_nodes nn ON n.id = nn.node2_id
JOIN nodes_ngrams nng ON nn.node2_id = n.id
JOIN node_node_ngrams nng ON nng.node2_id = n.id
JOIN ngrams ng ON nng.ngrams_id = ng.id
WHERE nn.node1_id = ? -- UserCorpusId
-- AND n.typename = ? -- NodeTypeId
......@@ -294,7 +340,7 @@ SELECT n.id, ng.terms FROM nodes n
nodesByNgramsMaster AS (
SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
JOIN nodes_ngrams nng ON n.id = nng.node_id
JOIN node_node_ngrams nng ON n.id = nng.node2_id
JOIN ngrams ng ON ng.id = nng.ngrams_id
WHERE n.parent_id = ? -- Master Corpus NodeTypeId
......
{-|
Module : Gargantext.Database.Ngrams
Description : Deal with in Gargantext Database.
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}
module Gargantext.Database.Ngrams
where
import Data.Text (Text)
import Gargantext.Core.Types
import Gargantext.Database.Utils (runOpaQuery, Cmd)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.NodeNodeNgrams
import Gargantext.Database.Schema.Node
import Gargantext.Prelude
import Opaleye
import Control.Arrow (returnA)
selectNgramsByDoc :: [CorpusId] -> DocumentId -> NgramsType -> Cmd err [Text]
selectNgramsByDoc cIds dId nt = runOpaQuery (query cIds dId nt)
where
join :: Query (NgramsRead, NodeNodeNgramsReadNull)
join = leftJoin queryNgramsTable queryNodeNodeNgramsTable on1
where
on1 (ng,nnng) = ngrams_id ng .== nnng_ngrams_id nnng
query cIds' dId' nt' = proc () -> do
(ng,nnng) <- join -< ()
restrict -< foldl (\b cId -> ((toNullable $ pgNodeId cId) .== nnng_node1_id nnng) .|| b) (pgBool True) cIds'
restrict -< (toNullable $ pgNodeId dId') .== nnng_node2_id nnng
restrict -< (toNullable $ pgNgramsType nt') .== nnng_ngramsType nnng
returnA -< ngrams_terms ng
postNgrams :: CorpusId -> DocumentId -> [Text] -> Cmd err Int
postNgrams = undefined
......@@ -63,6 +63,7 @@ import Control.Lens.Prism
import Control.Lens.Cons
import Data.Aeson (toJSON)
import Data.Maybe (maybe)
import Data.Time.Segment (jour)
import Data.Text (Text)
import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
......@@ -120,6 +121,7 @@ instance InsertDb HyperdataDocument
, toField u
, toField p
, toField $ maybe "No Title" (DT.take 255) (_hyperdataDocument_title h)
, toField $ _hyperdataDocument_publication_date h -- TODO USE UTCTime
, (toField . toJSON) h
]
......@@ -129,6 +131,7 @@ instance InsertDb HyperdataContact
, toField u
, toField p
, toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
, toField $ jour 2010 1 1 -- TODO put default date
, (toField . toJSON) h
]
......@@ -147,14 +150,14 @@ insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fie
-- | Input Tables: types of the tables
inputSqlTypes :: [Text]
inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
inputSqlTypes = map DT.pack ["int4","int4","int4","text","date","jsonb"]
-- | SQL query to insert documents inside the database
queryInsert :: Query
queryInsert = [sql|
WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
WITH input_rows(typename,user_id,parent_id,name,date,hyperdata) AS (?)
, ins AS (
INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
INSERT INTO nodes (typename,user_id,parent_id,name,date,hyperdata)
SELECT * FROM input_rows
ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
-- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
......@@ -180,10 +183,10 @@ queryInsert = [sql|
-- | When documents are inserted
-- ReturnType after insertion
data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
, reId :: NodeId -- ^ always return the id of the document (even new or not new)
data ReturnId = ReturnId { reInserted :: Bool -- if the document is inserted (True: is new, False: is not new)
, reId :: NodeId -- always return the id of the document (even new or not new)
-- this is the uniq id in the database
, reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
, reUniqId :: Text -- Hash Id with concatenation of hash parameters
} deriving (Show, Generic)
instance FromRow ReturnId where
......
{-|
Module : Gargantext.Database.Node.Select
Description : Main requests of Node to the database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}
module Gargantext.Database.Node.Select where
import Opaleye
import Gargantext.Core.Types
import Gargantext.Database.Schema.Node
import Gargantext.Database.Utils
import Gargantext.Database.Config
import Gargantext.Database.Schema.User
import Gargantext.Core.Types.Individu (Username)
import Control.Arrow (returnA)
selectNodesWithUsername :: NodeType -> Username -> Cmd err [NodeId]
selectNodesWithUsername nt u = runOpaQuery (q u)
where
q u' = proc () -> do
(n,usrs) <- join -< ()
restrict -< user_username usrs .== (toNullable $ pgStrictText u')
restrict -< _node_typename n .== (pgInt4 $ nodeTypeId nt)
returnA -< _node_id n
join :: Query (NodeRead, UserReadNull)
join = leftJoin queryNodeTable queryUserTable on1
where
on1 (n,us) = _node_userId n .== user_id us

......@@ -276,6 +276,8 @@ selectNode id = proc () -> do
restrict -< _node_id row .== id
returnA -< row
runGetNodes :: Query NodeRead -> Cmd err [NodeAny]
runGetNodes = runOpaQuery
......@@ -306,7 +308,6 @@ selectNodesWith' parentId maybeNodeType = proc () -> do
returnA -< row ) -< ()
returnA -< node
deleteNode :: NodeId -> Cmd err Int
deleteNode n = mkCmd $ \conn ->
fromIntegral <$> runDelete conn nodeTable
......@@ -593,7 +594,6 @@ defaultList cId =
mkList :: HasNodeError err => ParentId -> UserId -> Cmd err [NodeId]
mkList p u = insertNodesR [nodeListW Nothing Nothing p u]
mkGraph :: ParentId -> UserId -> Cmd err [GraphId]
mkGraph p u = insertNodesR [nodeGraphW Nothing Nothing p u]
......
......@@ -116,6 +116,7 @@ nodeNgramTable = Table "nodes_ngrams"
queryNodeNgramTable :: Query NodeNgramRead
queryNodeNgramTable = queryTable nodeNgramTable
--{-
insertNodeNgrams :: [NodeNgram] -> Cmd err Int
insertNodeNgrams = insertNodeNgramW
. map (\(NodeNgram n g p ngt lt w) ->
......@@ -126,7 +127,6 @@ insertNodeNgrams = insertNodeNgramW
(pgInt4 lt)
(pgDouble w)
)
insertNodeNgramW :: [NodeNgramWrite] -> Cmd err Int
insertNodeNgramW nns =
mkCmd $ \c -> fromIntegral <$> runInsert_ c insertNothing
......@@ -136,7 +136,7 @@ insertNodeNgramW nns =
, iReturning = rCount
, iOnConflict = (Just DoNothing)
})
--}
type NgramsText = Text
updateNodeNgrams' :: ListId -> [(NgramsTypeId, NgramsText, ListTypeId)] -> Cmd err ()
......@@ -166,138 +166,3 @@ ON CONFLICT (node_id, ngrams_id, ngrams_type) DO
UPDATE SET list_type = excluded.list_type
;
|]
data Action = Del | Add
type NgramsParent = Text
type NgramsChild = Text
{-
ngramsGroup :: Action -> ListId -> [(NgramsTypeId, NgramsParent, NgramsChild)] -> Cmd err ()
ngramsGroup _ _ [] = pure ()
ngramsGroup a lid input = void $ trace (show input) $ execPGSQuery (ngramsGroupQuery a) (DPS.Only $ Values fields input')
where
fields = map (\t-> QualifiedIdentifier Nothing t) ["int4","int4","text","int4","text","text"]
input' = map (\(ntpid,p,c) -> (lid, nodeTypeId NodeList, userMaster, ntpid, p,c)) input
-}
ngramsGroupQuery :: Action -> DPS.Query
ngramsGroupQuery a = case a of
Add -> [sql|
WITH input(lid,listTypeId,masterUsername,ntype,parent_terms,child_terms)
AS (?),
-- (VALUES (15::"int4",5::"int4",'gargantua'::"text",4::"int4",'input'::"text",'designer'::"text"))),
list_master AS (
SELECT n.id from nodes n
JOIN auth_user u ON n.user_id = u.id
JOIN input ON n.typename = input.listTypeId
WHERE u.username = input.masterUsername
LIMIT 1
),
list_user AS(
-- FIRST import parent from master to user list
INSERT INTO nodes_ngrams (node_id,ngrams_id,parent_id,ngrams_type,list_type,weight)
SELECT input.lid,nn.ngrams_id,nn.parent_id,nn.ngrams_type,nn.list_type, nn.weight
FROM INPUT
JOIN ngrams ng ON ng.terms = input.parent_terms
JOIN nodes_ngrams nn ON nn.ngrams_id = ng.id
JOIN list_master ON nn.node_id = list_master.id
WHERE
nn.ngrams_id = ng.id
ON CONFLICT (node_id,ngrams_id,ngrams_type) DO NOTHING
)
INSERT INTO nodes_ngrams (node_id,ngrams_id,parent_id,ngrams_type,list_type,weight)
SELECT input.lid, nc.id, nnpu.id, input.ntype, nnmaster.list_type, nnmaster.weight
FROM input
JOIN ngrams np ON np.terms = input.parent_terms
JOIN ngrams nc ON nc.terms = input.child_terms
JOIN nodes_ngrams nnpu ON nnpu.ngrams_id = np.id
JOIN nodes_ngrams nnmaster ON nnmaster.ngrams_id = nc.id
JOIN list_master ON nnmaster.node_id = list_master.id
WHERE
nnpu.node_id = input.lid
AND nnpu.ngrams_type = input.ntype
AND nnmaster.ngrams_id = nc.id
AND nnmaster.ngrams_type = ntype
ON CONFLICT (node_id,ngrams_id,ngrams_type) DO
UPDATE SET parent_id = excluded.parent_id
|]
Del -> [sql|
WITH input(lid,listTypeId,masterUsername,ntype,parent_terms,child_terms)
AS (?),
-- (VALUES (15::"int4",5::"int4",'gargantua'::"text",4::"int4",'input'::"text",'designer'::"text"))),
list_master AS (
SELECT n.id from nodes n
JOIN auth_user u ON n.user_id = u.id
JOIN input ON n.typename = input.listTypeId
WHERE u.username = input.masterUsername
LIMIT 1
),
list_user AS(
-- FIRST import parent from master to user list
INSERT INTO nodes_ngrams (node_id,ngrams_id,parent_id,ngrams_type,list_type,weight)
SELECT input.lid,nn.ngrams_id,nn.parent_id,nn.ngrams_type,nn.list_type, nn.weight
FROM INPUT
JOIN ngrams ng ON ng.terms = input.parent_terms
JOIN nodes_ngrams nn ON nn.ngrams_id = ng.id
JOIN list_master ON nn.node_id = list_master.id
WHERE
nn.ngrams_id = ng.id
ON CONFLICT (node_id,ngrams_id,ngrams_type) DO NOTHING
)
INSERT INTO nodes_ngrams (node_id,ngrams_id,parent_id,ngrams_type,list_type,weight)
SELECT input.lid, nc.id, NULL, input.ntype, nnmaster.list_type, nnmaster.weight
FROM input
JOIN ngrams np ON np.terms = input.parent_terms
JOIN ngrams nc ON nc.terms = input.child_terms
JOIN nodes_ngrams nnmaster ON nnmaster.ngrams_id = nc.id
JOIN list_master ON nnmaster.node_id = list_master.id
WHERE
nnmaster.ngrams_id = nc.id
AND nnmaster.ngrams_type = ntype
ON CONFLICT (node_id,ngrams_id,ngrams_type) DO
UPDATE SET parent_id = NULL
|]
data NodeNgramsUpdate = NodeNgramsUpdate
{ _nnu_user_list_id :: ListId
, _nnu_lists_update :: [(NgramsTypeId, NgramsText, ListTypeId)]
, _nnu_add_children :: [(NgramsTypeId, NgramsParent, NgramsChild)]
, _nnu_rem_children :: [(NgramsTypeId, NgramsParent, NgramsChild)]
}
-- TODO wrap these updates in a transaction.
-- TODO-ACCESS:
-- * check userId CanUpdateNgrams userListId
{-
updateNodeNgrams :: NodeNgramsUpdate -> Cmd err ()
updateNodeNgrams nnu = do
updateNodeNgrams' userListId $ _nnu_lists_update nnu
ngramsGroup Del userListId $ _nnu_rem_children nnu
ngramsGroup Add userListId $ _nnu_add_children nnu
-- TODO remove duplicate line (fix SQL query)
ngramsGroup Add userListId $ _nnu_add_children nnu
where
userListId = _nnu_user_list_id nnu
-}
......@@ -9,14 +9,15 @@ Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE Arrows #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Schema.NodeNodeNgrams
where
......@@ -24,61 +25,92 @@ module Gargantext.Database.Schema.NodeNodeNgrams
import Prelude
import Data.Maybe (Maybe)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Control.Lens.TH (makeLensesWith, abbreviatedFields)
import Gargantext.Database.Utils (Cmd, runOpaQuery)
--import Control.Lens.TH (makeLensesWith, abbreviatedFields)
import Gargantext.Database.Utils (Cmd, mkCmd)
import Gargantext.Database.Schema.Ngrams (NgramsTypeId, pgNgramsTypeId, NgramsId)
import Gargantext.Database.Schema.Node (pgNodeId)
import Gargantext.Database.Types.Node
import Opaleye
data NodeNodeNgramsPoly node1_id node2_id ngram_id score
= NodeNodeNgrams { nnng_node1_id :: node1_id
, nnng_node2_id :: node2_id
, nnng_ngrams_id :: ngram_id
, nnng_score :: score
} deriving (Show)
type NodeNodeNgramsWrite = NodeNodeNgramsPoly (Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Maybe (Column PGFloat8))
type NodeNodeNgramsRead = NodeNodeNgramsPoly (Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGFloat8)
type NodeNodeNgramsReadNull = NodeNodeNgramsPoly (Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGFloat8))
type NodeNodeNgrams = NodeNodeNgramsPoly Int
Int
Int
(Maybe Double)
data NodeNodeNgramsPoly id' n1 n2 ngrams_id ngt w
= NodeNodeNgrams { nnng_id :: id'
, nnng_node1_id :: n1
, nnng_node2_id :: n2
, nnng_ngrams_id :: ngrams_id
, nnng_ngramsType :: ngt
, nnng_weight :: w
} deriving (Show)
type NodeNodeNgramsWrite =
NodeNodeNgramsPoly (Maybe (Column PGInt4 ))
(Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGFloat8)
type NodeNodeNgramsRead =
NodeNodeNgramsPoly (Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGInt4 )
(Column PGFloat8)
type NodeNodeNgramsReadNull =
NodeNodeNgramsPoly (Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGInt4 ))
(Column (Nullable PGFloat8))
type NodeNodeNgrams =
NodeNodeNgramsPoly (Maybe Int) CorpusId DocId NgramsId NgramsTypeId Double
--{-
$(makeAdaptorAndInstance "pNodeNodeNgrams" ''NodeNodeNgramsPoly)
$(makeLensesWith abbreviatedFields ''NodeNodeNgramsPoly)
-- $(makeLensesWith abbreviatedFields ''NodeNodeNgramsPoly)
nodeNodeNgramsTable :: Table NodeNodeNgramsWrite NodeNodeNgramsRead
nodeNodeNgramsTable = Table "nodes_nodes_ngrams"
nodeNodeNgramsTable = Table "node_node_ngrams"
( pNodeNodeNgrams NodeNodeNgrams
{ nnng_node1_id = required "node1_id"
, nnng_node2_id = required "node2_id"
, nnng_ngrams_id = required "ngram_id"
, nnng_score = optional "score"
{ nnng_id = optional "id"
, nnng_node1_id = required "node1_id"
, nnng_node2_id = required "node2_id"
, nnng_ngrams_id = required "ngrams_id"
, nnng_ngramsType = required "ngrams_type"
, nnng_weight = required "weight"
}
)
queryNodeNodeNgramsTable :: Query NodeNodeNgramsRead
queryNodeNodeNgramsTable = queryTable nodeNodeNgramsTable
-- | not optimized (get all ngrams without filters)
nodeNodeNgrams :: Cmd err [NodeNodeNgrams]
nodeNodeNgrams = runOpaQuery queryNodeNodeNgramsTable
instance QueryRunnerColumnDefault PGFloat8 (Maybe Double) where
queryRunnerColumnDefault = fieldQueryRunnerColumn
-- | Insert utils
insertNodeNodeNgrams :: [NodeNodeNgrams] -> Cmd err Int
insertNodeNodeNgrams = insertNodeNodeNgramsW
. map (\(NodeNodeNgrams id'' n1 n2 ng nt w) ->
NodeNodeNgrams (pgInt4 <$> id'')
(pgNodeId n1)
(pgNodeId n2)
(pgInt4 ng)
(pgNgramsTypeId nt)
(pgDouble w)
)
insertNodeNodeNgramsW :: [NodeNodeNgramsWrite] -> Cmd err Int
insertNodeNodeNgramsW nnnw =
mkCmd $ \c -> fromIntegral <$> runInsert_ c insertNothing
where
insertNothing = (Insert { iTable = nodeNodeNgramsTable
, iRows = nnnw
, iReturning = rCount
, iOnConflict = (Just DoNothing)
})
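-- A minimal usage sketch (the ids are illustrative; in practice they
-- come from the nodes and ngrams tables):
--
-- >>> _n <- insertNodeNodeNgrams [NodeNodeNgrams Nothing corpusId docId ngramsId ngramsTypeId 1.0]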
......@@ -43,7 +43,6 @@ import Opaleye
------------------------------------------------------------------------
type UserId = Int
data UserLight = UserLight { userLight_id :: Int
, userLight_username :: Text
, userLight_email :: Text
......@@ -83,6 +82,16 @@ type UserRead = UserPoly (Column PGInt4) (Column PGText)
(Column PGBool) (Column PGBool)
(Column PGTimestamptz)
type UserReadNull = UserPoly (Column (Nullable PGInt4)) (Column (Nullable PGText))
(Column (Nullable PGTimestamptz)) (Column (Nullable PGBool))
(Column (Nullable PGText)) (Column (Nullable PGText))
(Column (Nullable PGText)) (Column (Nullable PGText))
(Column (Nullable PGBool)) (Column (Nullable PGBool))
(Column (Nullable PGTimestamptz))
type User = UserPoly Int Text (Maybe UTCTime) Bool Text Text Text Text Bool Bool UTCTime
$(makeAdaptorAndInstance "pUser" ''UserPoly)
......
......@@ -49,36 +49,26 @@ CREATE TABLE public.ngrams (
);
ALTER TABLE public.ngrams OWNER TO gargantua;
--------------------------------------------------------------
--------------------------------------------------------------
-- TODO: delete this table
CREATE TABLE public.nodes_ngrams (
id SERIAL,
node_id integer NOT NULL,
ngrams_id integer NOT NULL,
parent_id integer REFERENCES public.nodes_ngrams(id) ON DELETE SET NULL,
ngrams_type integer,
list_type integer,
weight double precision,
FOREIGN KEY (node_id) REFERENCES public.nodes(id) ON DELETE CASCADE,
FOREIGN KEY (ngrams_id) REFERENCES public.ngrams(id) ON DELETE CASCADE,
PRIMARY KEY (id)
-- PRIMARY KEY (node_id,ngrams_id)
);
ALTER TABLE public.nodes_ngrams OWNER TO gargantua;
--CREATE TABLE public.nodes_ngrams (
-- id SERIAL,
-- node_id integer NOT NULL,
-- ngrams_id integer NOT NULL,
-- parent_id integer REFERENCES public.nodes_ngrams(id) ON DELETE SET NULL,
-- ngrams_type integer,
-- list_type integer,
-- weight double precision,
-- FOREIGN KEY (node_id) REFERENCES public.nodes(id) ON DELETE CASCADE,
-- FOREIGN KEY (ngrams_id) REFERENCES public.ngrams(id) ON DELETE CASCADE,
-- PRIMARY KEY (id)
--);
--ALTER TABLE public.nodes_ngrams OWNER TO gargantua;
--------------------------------------------------------------
CREATE TABLE public.nodes_ngrams_repo (
version integer NOT NULL,
patches jsonb DEFAULT '{}'::jsonb NOT NULL,
PRIMARY KEY (version)
);
ALTER TABLE public.nodes_ngrams_repo OWNER TO gargantua;
--------------------------------------------------------------
--
--
-- TODO: delete this table
--CREATE TABLE public.nodes_ngrams_ngrams (
-- node_id integer NOT NULL REFERENCES public.nodes(id) ON DELETE CASCADE,
-- ngram1_id integer NOT NULL REFERENCES public.ngrams(id) ON DELETE CASCADE,
......@@ -89,16 +79,38 @@ ALTER TABLE public.nodes_ngrams_repo OWNER TO gargantua;
--
--ALTER TABLE public.nodes_ngrams_ngrams OWNER TO gargantua;
---------------------------------------------------------
---------------------------------------------------------------
CREATE TABLE public.nodes_nodes (
node1_id integer NOT NULL REFERENCES public.nodes(id) ON DELETE CASCADE,
node2_id integer NOT NULL REFERENCES public.nodes(id) ON DELETE CASCADE,
score real,
favorite boolean,
delete boolean,
PRIMARY KEY (node1_id, node2_id)
PRIMARY KEY (node1_id,node2_id)
);
ALTER TABLE public.nodes_nodes OWNER TO gargantua;
---------------------------------------------------------------
-- TODO should reference "id" of nodes_nodes (instead of node1_id, node2_id)
CREATE TABLE public.node_node_ngrams (
id SERIAL,
node1_id INTEGER NOT NULL REFERENCES public.nodes (id) ON DELETE CASCADE,
node2_id INTEGER NOT NULL REFERENCES public.nodes (id) ON DELETE CASCADE,
ngrams_id INTEGER NOT NULL REFERENCES public.ngrams (id) ON DELETE CASCADE,
ngrams_type INTEGER,
weight double precision,
PRIMARY KEY (id)
);
ALTER TABLE public.node_node_ngrams OWNER TO gargantua;
--------------------------------------------------------------
--CREATE TABLE public.nodes_ngrams_repo (
-- version integer NOT NULL,
-- patches jsonb DEFAULT '{}'::jsonb NOT NULL,
-- PRIMARY KEY (version)
--);
--ALTER TABLE public.nodes_ngrams_repo OWNER TO gargantua;
---------------------------------------------------------
-- If needed for rights management at row level
......@@ -113,7 +125,6 @@ CREATE TABLE public.rights (
ALTER TABLE public.rights OWNER TO gargantua;
------------------------------------------------------------
-- INDEXES
......@@ -130,14 +141,10 @@ CREATE UNIQUE INDEX ON public.nodes USING btree (typename, parent_id, ((hyperdat
CREATE UNIQUE INDEX ON public.ngrams (terms); -- TEST GIN
CREATE INDEX ON public.nodes_ngrams USING btree (ngrams_id);
CREATE UNIQUE INDEX ON public.nodes_ngrams USING btree (node_id,ngrams_id);
CREATE UNIQUE INDEX ON public.nodes_ngrams USING btree (node_id,ngrams_id,ngrams_type);
CREATE INDEX ON public.nodes_nodes USING btree (node1_id, node2_id, delete);
CREATE UNIQUE INDEX ON public.nodes_nodes USING btree (node1_id, node2_id);
--CREATE INDEX ON public.nodes_nodes_ngrams USING btree (node1_id,nod2_id);
CREATE UNIQUE INDEX ON public.node_node_ngrams USING btree (node1_id, node2_id, ngrams_id, ngrams_type);
-- TRIGGERS
-- TODO user haskell-postgresql-simple to create this function
......
......@@ -129,23 +129,6 @@ joinInCorpusWithContacts = leftJoin6 queryNodeTable queryNodeNgramTable queryNgr
cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nn_node2_id nn
{-
queryGraphCorpusAuthors' :: O.Query (NodeSearchRead, (NodeNodeReadNull, (NodeNgramReadNull, NgramsReadNull)))
queryGraphCorpusAuthors' = leftJoin4 queryNgramsTable queryNodeNgramTable queryNodeNodeTable queryNodeSearchTable cond12 cond23 cond34
where
cond23 :: (NgramsRead, (NodeNgramRead, NodeReadNull)) -> Column PGBool
cond23 (ng2, (nng2, _)) = nodeNgram_ngrams_id nng2 .== ngrams_id ng2
cond34 :: (NodeNgramRead, (NgramsRead, (NodeNgramReadNull, NodeReadNull))) -> Column PGBool
cond34 (nng, (ng, (_,_))) = ngrams_id ng .== nodeNgram_ngrams_id nng
cond45 :: (NodeNodeRead, (NodeNgramRead, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))) -> Column PGBool
cond45 (nn, (nng, (_,(_,_)))) = nodeNgram_node_id nng .== nodeNode_node2_id nn
cond56 :: (NodeSearchRead, (NodeNodeRead, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull))))) -> Column PGBool
cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nodeNode_node2_id nn
-}
newtype TSQuery = UnsafeTSQuery [Text]
......
......@@ -65,6 +65,7 @@ newtype NodeId = NodeId Int
instance ToField NodeId where
toField (NodeId n) = toField n
instance FromField NodeId where
fromField field mdata = do
n <- fromField field mdata
......@@ -424,17 +425,20 @@ type NodeGraph = Node HyperdataGraph
type NodePhylo = Node HyperdataPhylo
type NodeNotebook = Node HyperdataNotebook
------------------------------------------------------------------------
data NodeType = NodeUser
data NodeType = NodeUser
| NodeFolder
| NodeCorpus | NodeCorpusV3 | NodeDocument
| NodeAnnuaire | NodeContact
-- | NodeOccurrences
| NodeGraph
| NodeDashboard | NodeChart
-- | Classification
| NodeList | NodeListModel
| NodeList | NodeListModel deriving (Show, Read, Eq, Generic, Bounded, Enum)
{-
-- | Metrics
deriving (Show, Read, Eq, Generic, Bounded, Enum)
-- | NodeOccurrences
-- | Classification
-}
allNodeTypes :: [NodeType]
allNodeTypes = [minBound ..]
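-- Since NodeType derives Bounded and Enum, this enumerates every
-- constructor in declaration order:
--
-- >>> take 2 allNodeTypes
-- [NodeUser,NodeFolder]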
......
......@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i,n)) schools
hal_data :: IO (DV.Vector CsvHal)
hal_data = snd <$> CSV.readHal "doc/corpus_imt/Gargantext_Corpus.csv"
hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"
names :: S.Set Text
names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
......
......@@ -13,24 +13,86 @@ Text gathers terms in unit of contexts.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Gargantext.Text
where
import Data.Text (Text, split)
import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment)
import qualified Data.Text as DT
import NLP.FullStop (segment)
-----------------------------------------------------------------
import Gargantext.Core
import Gargantext.Prelude hiding (filter)
-- | Why not use data ?
data Niveau = NiveauTexte Texte
| NiveauParagraphe Paragraphe
| NiveauPhrase Phrase
| NiveauMultiTerme MultiTerme
| NiveauMot Mot
| NiveauLettre Lettre
deriving (Show)
-- | Why use newtype ?
newtype Texte = Texte Text
newtype Paragraphe = Paragraphe Text
newtype Phrase = Phrase Text
newtype MultiTerme = MultiTerme Text
newtype Mot = Mot Text
newtype Lettre = Lettre Text
-- | Type synonym seems obvious
type Titre = Phrase
-----------------------------------------------------------------
type Config = Lang -> Context
type Context = Text -> [Text]
data Viz = Graph | Phylo | Chart
instance Show Texte where
show (Texte t) = show t
instance Show Paragraphe where
show (Paragraphe p) = show p
instance Show Phrase where
show (Phrase p) = show p
instance Show MultiTerme where
show (MultiTerme mt) = show mt
instance Show Mot where
show (Mot t) = show t
instance Show Lettre where
show (Lettre l) = show l
-----------------------------------------------------------------
class Collage sup inf where
dec :: sup -> [inf]
inc :: [inf] -> sup
instance Collage Texte Paragraphe where
dec (Texte t) = map Paragraphe $ DT.splitOn "\n" t
inc = Texte . DT.intercalate "\n" . map (\(Paragraphe t) -> t)
instance Collage Paragraphe Phrase where
dec (Paragraphe t) = map Phrase $ sentences t
inc = Paragraphe . DT.unwords . map (\(Phrase p) -> p)
instance Collage Phrase MultiTerme where
dec (Phrase t) = map MultiTerme $ DT.words t
inc = Phrase . DT.unwords . map (\(MultiTerme p) -> p)
instance Collage MultiTerme Mot where
dec (MultiTerme mt) = map Mot $ DT.words mt
inc = MultiTerme . DT.intercalate " " . map (\(Mot m) -> m)
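-- A quick sketch of how dec and inc compose across levels (values are
-- illustrative):
--
-- >>> inc [Mot "big", Mot "data"] :: MultiTerme
-- "big data"
-- >>> dec (MultiTerme "big data") :: [Mot]
-- ["big","data"]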
-- | We could use Type Classes but we lose the Sum Type classification
toMultiTerme :: Niveau -> [MultiTerme]
toMultiTerme (NiveauTexte (Texte _t)) = undefined
toMultiTerme (NiveauPhrase p) = dec p
toMultiTerme (NiveauMultiTerme mt) = [mt]
toMultiTerme (NiveauMot _m) = undefined
toMultiTerme _ = undefined
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
......
{-|
Module : Gargantext.Text.Convert
Description : Format converter.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Format Converter.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Convert (risPress2csvWrite)
where
import System.FilePath (FilePath()) -- , takeExtension)
import Gargantext.Prelude
import Gargantext.Text.Parsers.CSV (writeDocs2Csv)
import Gargantext.Text.Parsers (parseFile, FileFormat(..))
risPress2csvWrite :: FilePath -> IO ()
risPress2csvWrite f = parseFile RisPresse (f <> ".ris")
>>= \hs -> writeDocs2Csv (f <> ".csv") hs
......@@ -55,28 +55,26 @@ buildNgramsLists l n m s uCid mCid = do
pure $ Map.unions $ othersTerms <> [ngTerms]
buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsOthersList uCid groupIt nt = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
pure $ Map.fromList [(nt, [ mkNgramsElement t CandidateTerm Nothing (mSetFromList [])
| (t,_ns) <- Map.toList ngs
let
all' = Map.toList ngs
pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
where
toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
| (t,_ns) <- x
]
)
]
-- TODO remove hard coded parameters
buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
--printDebug "candidate" (length candidates)
let termList = toTermList ((isStopTerm s) . fst) candidates
--let termList = toTermList ((\_ -> False) . fst) candidates
--printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)]
......@@ -114,7 +112,7 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
zs = drop b $ drop a ns
a = 3
b = 5000
b = 500
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
......
......@@ -22,40 +22,36 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
where
import System.FilePath (FilePath(), takeExtension)
--import Data.ByteString (ByteString)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import qualified Data.Time as DT
import qualified Data.ByteString.Char8 as DBC
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.Time (UTCTime(..))
import Data.List (concat)
import qualified Data.Map as DM
import qualified Data.ByteString as DB
import Data.List (lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Text (Text)
import qualified Data.Text as DT
-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Text.Encoding (decodeUtf8)
import Data.String (String())
import Data.List (lookup)
------------------------------------------------------------------------
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import qualified Data.Text as DT
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.Date (parseDate)
import qualified Gargantext.Text.Parsers.WOS as WOS
import qualified Gargantext.Text.Parsers.RIS as RIS
import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
import qualified Gargantext.Text.Parsers.Date as Date
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -71,7 +67,8 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data FileFormat = WOS | CsvHalFormat-- | CsvGargV3
data FileFormat = WOS | RIS | RisPresse
| CsvGargV3 | CsvHalFormat
deriving (Show)
-- Implemented (ISI Format)
......@@ -79,42 +76,33 @@ data FileFormat = WOS | CsvHalFormat-- | CsvGargV3
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- TODO: for debugging, maybe add the filepath to the error message
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs WOS path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
parseDocs CsvHalFormat p = parseHal p
type Year = Int
type Month = Int
type Day = Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
parseDate' l (Just txt) = do
utcTime <- parseDate l txt
let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
-- TODO: for debugging, maybe add the filepath to the error message
parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseFile CsvHalFormat p = parseHal p
parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
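-- Hypothetical usage (the path is illustrative):
--
-- >>> docs <- parseFile RisPresse "corpus/presse.ris"
-- docs :: [HyperdataDocument]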
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc WOS d = do
-- TODO use language for RIS
toDoc ff d = do
let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
(utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
(lookup "doi" d)
(lookup "URL" d)
Nothing
......@@ -133,26 +121,35 @@ toDoc WOS d = do
Nothing
Nothing
(Just $ (DT.pack . show) lang)
toDoc _ _ = undefined
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
enrichWith :: FileFormat
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith _ = enrichWith' identity
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
where
both' = map (both decodeUtf8)
readFileWith :: FileFormat -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
files <- case takeExtension path of
".zip" -> openZip path
_ -> pure <$> DB.readFile path
(as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
pure (as, map toText $ concat bs)
where
-- TODO : decode with bayesian inference on encodings
toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
_ -> pure <$> clean <$> DB.readFile path
partitionEithers <$> mapConcurrently (runParser format) files
-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
......@@ -167,9 +164,9 @@ openZip fp = do
bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
pure bs
clean :: Text -> Text
clean txt = DT.map clean' txt
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
where
clean' '’' = '\''
clean' '\r' = ' '
clean' c = c
......@@ -17,25 +17,23 @@ CSV parser for Gargantext corpus files.
module Gargantext.Text.Parsers.CSV where
import GHC.Real (round)
import GHC.IO (FilePath)
import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import GHC.IO (FilePath)
import GHC.Real (round)
import GHC.Word (Word8)
import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString as BS
import qualified Data.Vector as V
---------------------------------------------------------------
headerCsvGargV3 :: Header
......@@ -48,7 +46,7 @@ headerCsvGargV3 = header [ "title"
, "authors"
]
---------------------------------------------------------------
data Doc = Doc
data CsvGargV3 = CsvGargV3
{ d_docId :: !Int
, d_title :: !Text
, d_source :: !Text
......@@ -61,9 +59,8 @@ data Doc = Doc
deriving (Show)
---------------------------------------------------------------
-- | Doc 2 HyperdataDocument
doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
toDoc :: CsvGargV3 -> HyperdataDocument
toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
HyperdataDocument (Just "CSV")
(Just . pack . show $ did)
Nothing
......@@ -83,22 +80,23 @@ doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
Nothing
Nothing
Nothing
---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector CsvDoc -> [Doc]
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
-> Doc nId t s py pm pd abst auth )
-> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
......@@ -174,44 +172,88 @@ instance ToNamedRecord CsvDoc where
, "authors" .= aut
]
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
(m $ _hyperdataDocument_source h)
(mI $ _hyperdataDocument_publication_year h)
(mI $ _hyperdataDocument_publication_month h)
(mI $ _hyperdataDocument_publication_day h)
(m $ _hyperdataDocument_abstract h)
(m $ _hyperdataDocument_authors h)
where
m = maybe "" identity
mI = maybe 0 identity
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = (defaultDecodeOptions
{decDelimiter = fromIntegral $ ord '\t'}
)
csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = ( defaultEncodeOptions
{encDelimiter = fromIntegral $ ord '\t'}
)
csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
delimiter :: Word8
delimiter = fromIntegral $ ord '\t'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readCsv fp
readCsvOn fields fp = V.toList
<$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
<$> snd
<$> readFile fp
------------------------------------------------------------------------
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> csvDocs
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
------------------------------------------------------------------------
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = fmap readCsvLazyBS . BL.readFile
-- | TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
Right csvDocs -> csvDocs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = fmap readCsvHalLazyBS . BL.readFile
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
csvData <- BL.readFile fp
case decodeByNameWith csvDecodeOptions csvData of
-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
Right csvDocs -> csvDocs
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
------------------------------------------------------------------------
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
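-- A round-trip sketch with the renamed helpers (paths are illustrative;
-- both functions shadow the Prelude names, hence the CSV qualifier used
-- at call sites elsewhere in this repo):
--
-- >>> (h, docs) <- CSV.readFile "docs.csv"
-- >>> CSV.writeFile "docs-copy.csv" (h, docs)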
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
------------------------------------------------------------------------
-- Hal Format
......@@ -321,7 +363,6 @@ csvHal2doc (CsvHal title source
------------------------------------------------------------------------
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
------------------------------------------------------------------------
......@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
module Gargantext.Text.Parsers.Date (parse, parseRaw, split) where
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
import Data.Time (parseTimeOrError, defaultTimeLocale)
import Data.Time.Clock (UTCTime, getCurrentTime)
import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.LocalTime (utc)
import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
import Duckling.Api (analyze)
......@@ -37,40 +37,45 @@ import qualified Data.Aeson as Json
import qualified Data.HashSet as HashSet
import qualified Duckling.Core as DC
-- | Unused import (to parse Date Format, keeping it for maybe next steps)
-- import Control.Monad ((=<<))
-- import Data.Either (Either)
-- import Data.Fixed (Fixed (MkFixed))
-- import Data.Foldable (length)
-- import Data.String (String)
-- import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
-- import Data.Time.Calendar (Day, fromGregorian)
-- import Duckling.Debug as DB
-- import Duckling.Engine (parseAndResolve)
-- import Duckling.Rules (rulesFor)
-- import Prelude (toInteger, div, otherwise, (++))
-- import Text.Parsec.Error (ParseError)
-- import Text.Parsec.Prim (Stream, ParsecT)
-- import Text.Parsec.String (Parser)
-- import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
-- import Text.XML.HXT.DOM.Util (decimalStringToInt)
-- import qualified Text.ParserCombinators.Parsec (parse)
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
split _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
split l (Just txt) = do
utcTime <- parse l txt
let (y, m, d) = split' utcTime
pure (Just utcTime, (Just y, Just m,Just d))
split' :: UTCTime -> (Year, Month, Day)
split' utcTime = (fromIntegral y, m, d)
where
(UTCTime day _) = utcTime
(y,m,d) = toGregorian day
type Year = Int
type Month = Int
type Day = Int
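-- The pure part in isolation (a doctest-style sketch):
--
-- >>> import Data.Time.Calendar (fromGregorian)
-- >>> split' (UTCTime (fromGregorian 2010 4 12) 0)
-- (2010,4,12)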
------------------------------------------------------------------------
-- | Date Parser
-- Parses date mentions in full text given the language.
-- >>> parse FR (pack "10 avril 1979 à 19H")
-- 1979-04-10 19:00:00 UTC
-- >>> parse EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
parseDate :: Lang -> Text -> IO UTCTime
parseDate lang s = do
dateStr' <- parseDateRaw lang s
let format = "%Y-%m-%dT%T"
let dateStr = unpack $ maybe "0-0-0T0:0:0" identity
$ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale format dateStr
parse :: Lang -> Text -> IO UTCTime
parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type DateFormat = Text
type DateDefault = Text
parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseRaw lang s
let dateStr = unpack $ maybe def identity
$ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
-- TODO add Paris at Duckling.Locale Region datatype
......@@ -85,19 +90,19 @@ parserLang EN = DC.EN
-- IO can be avoided here:
-- currentContext :: Lang -> IO Context
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseDateRaw :: Lang -> Text -> IO (Text)
parseDateRaw lang text = do
parseRaw :: Lang -> Text -> IO (Text)
parseRaw lang text = do
maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
case headMay maybeJson of
Just (Json.Object object) -> case HM.lookup "value" object of
Just (Json.String date) -> pure date
Just _ -> panic "ParseDateRaw ERROR: should be a json String"
Nothing -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
Just _ -> panic "ParseRaw ERROR: should be a json String"
Nothing -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
_ -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
_ -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text
-- | Current Time in DucklingTime format
......@@ -116,64 +121,3 @@ parseDateWithDuckling lang input = do
--pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
pure $ analyze input contxt $ HashSet.fromList [(This Time)]
-- | Permit to transform a String to an Int in a monadic context
--wrapDST :: Monad m => String -> m Int
--wrapDST = (return . decimalStringToInt)
-- | Generic parser which take at least one element not given in argument
--many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
--many1NoneOf = (many1 . noneOf)
--getMultiplicator :: Int -> Int
--getMultiplicator a
-- | 0 >= a = 1
-- | otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
--parseGregorian :: Parser Day
--parseGregorian = do
-- y <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-'
-- m <- wrapDST =<< many1NoneOf ['-']
-- _ <- char '-'
-- d <- wrapDST =<< many1NoneOf ['T']
-- _ <- char 'T'
-- return $ fromGregorian (toInteger y) m d
--
---- | Parser for time format h:m:s
--parseTimeOfDay :: Parser TimeOfDay
--parseTimeOfDay = do
-- h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- m <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- r <- many1NoneOf ['.']
-- _ <- char '.'
-- dec <- many1NoneOf ['+', '-']
-- let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
-- seconds = nb * 10^(12-l)
-- return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
--
--
-- | Parser for timezone format +hh:mm
--parseTimeZone :: Parser TimeZone
--parseTimeZone = do
-- sign <- oneOf ['+', '-']
-- h <- wrapDST =<< many1NoneOf [':']
-- _ <- char ':'
-- m <- wrapDST =<< (many1 $ anyChar)
-- let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
-- in return $ TimeZone timeInMinute False "CET"
--
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
--parseZonedTime :: Parser ZonedTime
--parseZonedTime= do
-- d <- parseGregorian
-- tod <- parseTimeOfDay
-- tz <- parseTimeZone
-- return $ ZonedTime (LocalTime d (tod)) tz
--
---- | Opposite of toRFC3339
--fromRFC3339 :: Text -> Either ParseError ZonedTime
--fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
-- where input = unpack t
{-|
Module : Gargantext.Text.Parsers.Date.Attoparsec
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Attoparsec
where
import Control.Applicative ((<*))
import Data.Attoparsec.ByteString (Parser, take)
import Data.ByteString (ByteString)
import Data.Monoid ((<>))
import Data.Tuple.Extra (first)
import Gargantext.Prelude hiding (takeWhile, take)
-------------------------------------------------------------
parserWith :: Parser ByteString -> Parser [(ByteString, ByteString)]
parserWith sep = do
day <- take 2 <* sep
mon <- take 2 <* sep
yea <- take 4
pure $ map (first (\x -> "publication_" <> x))
[ ("day",day)
, ("month", mon)
, ("year", yea)
, ("date", yea <> "-" <> mon <> "-" <> day <> "T0:0:0")
]
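-- A doctest-style sketch; the "/" separator is an assumption for
-- illustration (parseOnly and string come from Data.Attoparsec.ByteString):
--
-- >>> parseOnly (parserWith (string "/")) "12/04/2010"
-- Right [("publication_day","12"),("publication_month","04"),("publication_year","2010"),("publication_date","2010-04-12T0:0:0")]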
{-|
Module : Gargantext.Text.Parsers.Date
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.Date.Parsec
where
import Control.Monad ((=<<))
import Data.Either (Either)
import Data.Fixed (Fixed (MkFixed))
import Data.Foldable (length)
import Data.String (String)
import Data.Text (Text, unpack)
import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
import Data.Time.Calendar (Day, fromGregorian)
import Gargantext.Prelude
import Prelude (toInteger, div, otherwise, (++))
import Text.Parsec.Error (ParseError)
import Text.Parsec.Prim (Stream, ParsecT)
import Text.Parsec.String (Parser)
import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
import Text.XML.HXT.DOM.Util (decimalStringToInt)
import qualified Text.ParserCombinators.Parsec (parse)
-- | Transform a String into an Int in a monadic context
wrapDST :: Monad m => String -> m Int
wrapDST = return . decimalStringToInt
-- | Generic parser which takes at least one character not in the given list
many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
many1NoneOf = (many1 . noneOf)
getMultiplicator :: Int -> Int
getMultiplicator a
| 0 >= a = 1
| otherwise = 10 * (getMultiplicator $ div a 10)
-- | Parser for date format y-m-d
parseGregorian :: Parser Day
parseGregorian = do
y <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
m <- wrapDST =<< many1NoneOf ['-']
_ <- char '-'
d <- wrapDST =<< many1NoneOf ['T']
_ <- char 'T'
return $ fromGregorian (toInteger y) m d
---- | Parser for time format h:m:s
parseTimeOfDay :: Parser TimeOfDay
parseTimeOfDay = do
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
r <- many1NoneOf ['.']
_ <- char '.'
dec <- many1NoneOf ['+', '-']
let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
seconds = nb * 10^(12-l)
return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
-- | Parser for timezone format +hh:mm
parseTimeZone :: Parser TimeZone
parseTimeZone = do
sign <- oneOf ['+', '-']
h <- wrapDST =<< many1NoneOf [':']
_ <- char ':'
m <- wrapDST =<< (many1 $ anyChar)
let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
in return $ TimeZone timeInMinute False "CET"
---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
parseZonedTime :: Parser ZonedTime
parseZonedTime= do
d <- parseGregorian
tod <- parseTimeOfDay
tz <- parseTimeZone
return $ ZonedTime (LocalTime d (tod)) tz
---- | Opposite of toRFC3339
fromRFC3339 :: Text -> Either ParseError ZonedTime
fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
where input = unpack t
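-- An illustrative input (RFC3339 with milliseconds and a numeric
-- offset; note that parseTimeZone always labels the zone "CET"):
--
-- >>> fromRFC3339 "2010-04-12T19:00:00.000+02:00"
-- Right 2010-04-12 19:00:00 CET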
......@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Prelude
import System.IO (FilePath)
import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeCsv, headerCsvGargV3)
import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
import Data.Vector (fromList)
data Patent = Patent { _patent_title :: Text
......@@ -48,7 +48,7 @@ type FilePathOut = FilePath
json2csv :: FilePathIn -> FilePathOut -> IO ()
json2csv fin fout = do
patents <- maybe (panic "json2csv error") identity <$> readPatents fin
writeCsv fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent title abstract year _) =
......
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed XML
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This version of the parsers fixes the publication date in the Gargantext
(V3) PubMed parser. We can rely neither on the Journal Publication Date
nor on the Article Publication Date: both are structurally incomplete
and ambiguous to interpret. To simplify and make the data uniform, the
date of insertion into the database is used instead.
TODO:
- Add authors
- factorize
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.PubMed where
import Data.Conduit.List as CL hiding (catMaybes, head)
import Control.Monad (join)
import GHC.IO (FilePath)
import Prelude (read)
import Gargantext.Prelude
import Control.Monad.Catch (MonadThrow)
import Data.Maybe (Maybe)
import Data.Monoid (mconcat)
import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia
data PubMed =
PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData
} deriving Show
data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe Text
, pubmed_journal :: Maybe Text
, pubmed_abstract :: Maybe [Text]
}
deriving (Show)
data PubMedData =
PubMedData { pubmedData_date :: UTCTime
, pubmedData_year :: Integer
, pubmedData_month :: Int
, pubmedData_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do
input <- DBL.readFile fp
pubMedParser input
pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
.| parseArticleSet
.| CL.consume
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet = force "PubmedArticleSet" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle = tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m PubMed
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
d' <- force "Day" $ tagIgnoreAttrs "Day" content
_ <- many $ ignoreAnyTreeContent
return (read $ unpack y', read $ unpack m', read $ unpack d')
_ <- many $ ignoreAnyTreeContent
return dates'
_ <- many $ ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed (article) (PubMedData (jour y m d) y m d)
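The line above implements the header's date policy. A minimal sketch of that policy follows; latestDate is an illustrative helper of mine, not part of this module, and it assumes head here is Gargantext.Prelude's Maybe-returning head, as used just above:

-- Keep the most recent History entry; (1,1,1), i.e. year 1, is the
-- fallback used when no PubMedPubDate could be parsed.
latestDate :: [(Integer, Int, Int)] -> UTCTime
latestDate ds = let (y, m, d) = maybe (1,1,1) identity $ head (reverse ds)
                in jour y m d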
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
a <- force "article" $ manyTagsUntil "Article" parseArticle
_ <- many $ ignoreAnyTreeContent
return a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
journal <- force "journal" $ manyTagsUntil "Journal" $ do
j <- manyTagsUntil "Title" content
_ <- many $ ignoreAnyTreeContent
return j
title <- manyTagsUntil "ArticleTitle" content
abstracts <- manyTagsUntil "Abstract" $ many $ do
txt <- tagIgnoreAttrs "AbstractText" $ do
c <- content
_ <- many $ ignoreAnyTreeContent
return c
_ <- many $ ignoreAnyTreeContent
return txt
-- TODO add authors
_ <- many $ ignoreAnyTreeContent
return $ PubMedArticle title journal abstracts
pubMedData :: DBL.ByteString
pubMedData = mconcat
[ "<?xml version=\"1.0\"?>\n"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
, "<PubmedArticleSet>\n"
, "<PubmedArticle>\n"
, " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
, " <PMID Version=\"1\">30357468</PMID>\n"
, " <DateRevised>\n"
, " <Year>2018</Year>\n"
, " </DateRevised>\n"
, " <Article PubModel=\"Print-Electronic\">\n"
, " <Journal>\n"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
, " <Title>European journal of pediatrics</Title>\n"
, " </Journal>\n"
, " <ArticleTitle>Title of the Article</ArticleTitle>\n"
, " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
, " <Abstract>\n"
, " <AbstractText>Abstract Text.</AbstractText>\n"
, " </Abstract>\n"
, " <AuthorList>\n"
, " </AuthorList>\n"
, " </Article>\n"
, " </MedlineCitation>\n"
, " <PubmedData>\n"
, " <History>\n"
, " </History>\n"
, " </PubmedData>\n"
, "</PubmedArticle>\n"
, "</PubmedArticleSet>\n"
]
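A quick way to exercise the parser on the embedded sample above; testPubMed is a sketch of mine, not part of the original module:

-- Parse the sample document; it should yield one PubMed record whose
-- title is "Title of the Article" and whose date falls back to year 1,
-- since the History element in the sample is empty.
testPubMed :: IO [PubMed]
testPubMed = pubMedParser pubMedData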
{-|
Module : Gargantext.Text.Parsers.RIS
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
RIS is a standardized tag format developed by Research Information
Systems, Incorporated (the format name refers to the company) to enable
citation programs to exchange data. A hypothetical sample record is
sketched after onField at the end of this module.
[More](https://en.wikipedia.org/wiki/RIS_(file_format))
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS (parser, onField, fieldWith, lines) where
import Data.List (lookup)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, try, takeTill, take, many1)
import Data.Attoparsec.ByteString.Char8 (isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.Monoid ((<>))
import Gargantext.Prelude hiding (takeWhile, take)
import qualified Data.List as DL
-------------------------------------------------------------
parser :: Parser [[(ByteString, ByteString)]]
parser = do
n <- notice "TY -"
ns <- many1 (notice "\nTY -")
pure $ [n] <> ns
notice :: Parser ByteString -> Parser [(ByteString, ByteString)]
notice s = start *> many (fieldWith field) <* end
where
field :: Parser ByteString
field = "\n" *> take 2 <* " - "
start :: Parser ByteString
start = s *> takeTill isEndOfLine
end :: Parser ByteString
end = "\nER -" *> takeTill isEndOfLine
fieldWith :: Parser ByteString -> Parser (ByteString, ByteString)
fieldWith n = do
name <- n
txt <- takeTill isEndOfLine
txts <- try lines
pure (name, concat ([txt] <> txts))
lines :: Parser [ByteString]
lines = many line
where
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
-------------------------------------------------------------
-- | onField k f m: look key k up in the association list m and, when
-- present, append the fields produced by applying f to its value.
onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
onField k f m = m <> ( maybe [] f (lookup k m) )
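To make the expected layout concrete, a hypothetical input for parser; the sample data is mine, and note that, as written, parser uses many1 and therefore seems to need at least two notices:

-- Two minimal notices: "TY -" opens a notice, each field is a
-- two-character tag followed by " - ", and "ER -" closes the notice.
risSample :: ByteString
risSample = "TY - JOUR\nTI - A first title\nAU - Some Author\nER -\nTY - JOUR\nTI - A second title\nER -\n"

-- parseOnly (from Data.Attoparsec.ByteString) parser risSample should
-- yield two association lists, e.g.
-- [("TI","A first title"),("AU","Some Author")] for the first notice.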
{-|
Module : Gargantext.Text.Parsers.RIS.Presse
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Presse RIS format parser for Europresse Database.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.RIS.Presse (presseEnrich) where
import Data.List (lookup)
import Data.Either (either)
import Data.Tuple.Extra (first, both, uncurry)
import Data.Attoparsec.ByteString (parseOnly)
import Data.ByteString (ByteString, length)
import Gargantext.Prelude hiding (takeWhile, take, length)
import Gargantext.Text.Parsers.RIS (onField)
import Gargantext.Core (Lang(..))
import qualified Gargantext.Text.Parsers.Date.Attoparsec as Date
presseEnrich :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
presseEnrich = (onField "DA" parseDate)
. (onField "LA" parseLang)
. fixFields
parseDate :: ByteString -> [(ByteString, ByteString)]
parseDate str = either (const []) identity $ parseOnly (Date.parserWith "/") str
parseLang :: ByteString -> [(ByteString, ByteString)]
parseLang "Français" = [(langField, cs $ show FR)]
parseLang "English" = [(langField, cs $ show EN)]
parseLang x = [(langField, x)]
langField :: ByteString
langField = "language"
fixFields :: [(ByteString, ByteString)] -> [(ByteString, ByteString)]
fixFields ns = map (first fixFields'') ns
where
-- | Europresse sometimes swaps the two fields: when the title (TI) is
-- longer than the abstract (N2), remap TI to "abstract" and N2 to "title".
fixFields'' = case uncurry (>) <$> look'' of
Just True -> fixFields' "abstract" "title"
_ -> fixFields' "title" "abstract"
look'' :: Maybe (Int, Int)
look'' = both length <$> look
look :: Maybe (ByteString,ByteString)
look = (,) <$> lookup "TI" ns <*> lookup "N2" ns
fixFields' :: ByteString -> ByteString
-> ByteString -> ByteString
fixFields' title abstract champs
| champs == "AU" = "authors"
| champs == "TI" = title
| champs == "JF" = "source"
| champs == "DI" = "doi"
| champs == "UR" = "url"
| champs == "N2" = abstract
| otherwise = champs
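A hedged sketch of presseEnrich on a toy notice. sampleNotice and enriched are mine; the exact pairs produced for "DA" depend on Date.parserWith, which lives outside this diff, so they are left unspecified:

-- presseEnrich first fixes the keys (TI/N2 orientation by length, plus
-- AU, JF, DI, UR), then appends ("language","FR") for "LA" and the
-- date fields parsed from "DA" with Date.parserWith "/".
sampleNotice :: [(ByteString, ByteString)]
sampleNotice = [ ("TI", "Un titre")
               , ("N2", "Un resume nettement plus long que le titre")
               , ("LA", "Français")
               , ("DA", "01/06/2018")
               ]

enriched :: [(ByteString, ByteString)]
enriched = presseEnrich sampleNotice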
......@@ -14,27 +14,21 @@ commentary with @some markup@.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Parsers.WOS (wosParser) where
module Gargantext.Text.Parsers.WOS (parser, keys) where
-- TODO: should import Gargantext.Prelude here
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
import qualified Data.List as DL
import Data.Monoid ((<>))
import Data.Attoparsec.ByteString (Parser, try, string
, takeTill, take
, manyTill, many1)
import Control.Applicative
import Data.Attoparsec.ByteString (Parser, string, takeTill, take, manyTill, many1)
import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
import Data.ByteString (ByteString, concat)
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Control.Applicative
import Gargantext.Text.Parsers.RIS (fieldWith)
import Prelude hiding (takeWhile, take, concat, readFile, lines, concat)
-------------------------------------------------------------
-- | Parse the ISI format exported by the Web of Science database.
wosParser :: Parser [[(ByteString, ByteString)]]
wosParser = do
parser :: Parser [[(ByteString, ByteString)]]
parser = do
-- TODO Warning if version /= 1.0
-- FIXME manyTill anyChar (string ...) skips anything before "\nVR 1.0";
-- that is not the same as requiring the exact string at the start
_ <- manyTill anyChar (string $ pack "\nVR 1.0")
......@@ -42,8 +36,11 @@ wosParser = do
pure ns
notice :: Parser [(ByteString, ByteString)]
notice = start *> fields <* end
notice = start *> many (fieldWith field) <* end
where
field :: Parser ByteString
field = "\n" *> take 2 <* " "
start :: Parser ByteString
start = "\nPT " *> takeTill isEndOfLine
......@@ -51,28 +48,8 @@ notice = start *> fields <* end
end = manyTill anyChar (string $ pack "\nER\n")
fields :: Parser [(ByteString, ByteString)]
fields = many field
where
field :: Parser (ByteString, ByteString)
field = do
name <- "\n" *> take 2 <* " "
txt <- takeTill isEndOfLine
txts <- try lines
let txts' = case DL.length txts > 0 of
True -> txts
False -> []
pure (translate name, concat ([txt] <> txts'))
lines :: Parser [ByteString]
lines = many line
where
line :: Parser ByteString
line = "\n " *> takeTill isEndOfLine
translate :: ByteString -> ByteString
translate champs
keys :: ByteString -> ByteString
keys champs
| champs == "AF" = "authors"
| champs == "TI" = "title"
| champs == "SO" = "source"
......@@ -80,5 +57,3 @@ translate champs
| champs == "PD" = "publication_date"
| champs == "AB" = "abstract"
| otherwise = champs
-------------------------------------------------------------
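A hedged sketch combining the renamed exports. normalise is mine, and parseOnly would need to be added to the Data.Attoparsec.ByteString import above:

-- Parse a WOS export, then translate every two-letter tag to its
-- canonical Gargantext field name with keys.
normalise :: ByteString -> Either String [[(ByteString, ByteString)]]
normalise bs = (map . map) (\(k,v) -> (keys k, v)) <$> parseOnly parser bs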
......@@ -35,7 +35,7 @@ import Gargantext.Text.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
Doc
CsvGargV3
DocId
DocField
NoFeatures
......@@ -48,7 +48,7 @@ initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
......@@ -57,7 +57,7 @@ docSearchConfig =
documentFeatureValue = const noFeatures
}
where
extractTerms :: Doc -> DocField -> [Text]
extractTerms :: CsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
......
......@@ -27,13 +27,13 @@ group :: [TokenTag] -> [TokenTag]
group [] = []
group ntags = group2 NP NP
$ group2 NP VB
-- $ group2 NP IN
-- - $ group2 IN DT
-- group2 NP IN
-- group2 IN DT
$ group2 VB NP
$ group2 JJ NP
$ group2 NP JJ
$ group2 JJ JJ
-- - $ group2 JJ CC
-- group2 JJ CC
$ ntags
------------------------------------------------------------------------
......
......@@ -13,6 +13,7 @@ Portability : POSIX
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-}
module Gargantext.Viz.Chart
where
......@@ -23,10 +24,27 @@ import Data.List (unzip, sortOn)
import Data.Map (toList)
import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Database.Config
import Gargantext.Database.Schema.NodeNode (selectDocsDates)
import Gargantext.Database.Utils
import Gargantext.Database.Types.Node (CorpusId)
import Gargantext.Database.Node.Select
import Gargantext.Text.Metrics.Count (occurrencesWith)
import Gargantext.Core.Types.Main
-- Pie Chart
import Data.Maybe (catMaybes)
import qualified Data.Map as Map
import qualified Data.List as List
import Gargantext.API.Ngrams.Tools
import Gargantext.API.Ngrams.NTree
import Gargantext.Database.Metrics.NgramsByNode
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Schema.Node
import Gargantext.Core.Types
import Gargantext.Database.Flow
import Servant
data Chart = ChartHisto | ChartScatter | ChartPie
......@@ -47,3 +65,62 @@ histoData cId = do
$ occurrencesWith identity dates
pure (Histo ls css)
pieData :: FlowCmdM env err m
=> CorpusId -> NgramsType -> ListType
-> m Histo
pieData cId nt lt = do
ls' <- selectNodesWithUsername NodeList userMaster
ls <- map (_node_id) <$> getListsWithParentId cId
ts <- mapTermListRoot ls nt
let
dico = filterListWithRoot lt ts
terms = catMaybes $ List.concat $ map (\(a,b) -> [Just a, b]) $ Map.toList dico
group dico' x = case Map.lookup x dico' of
Nothing -> x
Just x' -> maybe x identity x'
(_total,mapTerms) <- countNodesByNgramsWith (group dico)
<$> getNodesByNgramsOnlyUser cId (ls' <> ls) nt terms
let (dates, count) = unzip $ map (\(t,(d,_)) -> (t, d)) $ Map.toList mapTerms
pure (Histo dates (map round count))
treeData :: FlowCmdM env err m
=> CorpusId -> NgramsType -> ListType
-> m [MyTree]
treeData cId nt lt = do
ls' <- selectNodesWithUsername NodeList userMaster
ls <- map (_node_id) <$> getListsWithParentId cId
ts <- mapTermListRoot ls nt
let
dico = filterListWithRoot lt ts
terms = catMaybes $ List.concat $ map (\(a,b) -> [Just a, b]) $ Map.toList dico
cs' <- getNodesByNgramsOnlyUser cId (ls' <> ls) nt terms
m <- getListNgrams ls nt
pure $ toTree lt cs' m
treeData' :: FlowCmdM env ServantErr m
=> CorpusId -> NgramsType -> ListType
-> m [MyTree]
treeData' cId nt lt = do
ls' <- selectNodesWithUsername NodeList userMaster
ls <- map (_node_id) <$> getListsWithParentId cId
ts <- mapTermListRoot ls nt
let
dico = filterListWithRoot lt ts
terms = catMaybes $ List.concat $ map (\(a,b) -> [Just a, b]) $ Map.toList dico
cs' <- getNodesByNgramsOnlyUser cId (ls' <> ls) nt terms
m <- getListNgrams ls nt
pure $ toTree lt cs' m
......@@ -28,8 +28,10 @@ import Control.Monad.IO.Class (liftIO)
import Gargantext.API.Ngrams.Tools
import Gargantext.API.Types
import Gargantext.Core.Types.Main
import Gargantext.Database.Config
import Gargantext.Database.Metrics.NgramsByNode (getNodesByNgramsOnlyUser)
import Gargantext.Database.Schema.Ngrams
import Gargantext.Database.Node.Select
import Gargantext.Database.Schema.Node (getNode)
import Gargantext.Database.Schema.Node (defaultList)
import Gargantext.Database.Types.Node hiding (node_id) -- (GraphId, ListId, CorpusId, NodeId)
......@@ -65,13 +67,14 @@ getGraph nId = do
]
-- (map (\n -> LegendField n "#FFFFFF" (pack $ show n)) [1..10])
let cId = maybe (panic "no parentId") identity $ _node_parentId nodeGraph
lIds <- selectNodesWithUsername NodeList userMaster
lId <- defaultList cId
ngs <- filterListWithRoot GraphTerm <$> mapTermListRoot [lId] NgramsTerms
myCooc <- Map.filter (>1) <$> getCoocByNgrams (Diagonal False)
<$> groupNodesByNgrams ngs
<$> getNodesByNgramsOnlyUser cId NgramsTerms (Map.keys ngs)
<$> getNodesByNgramsOnlyUser cId (lIds <> [lId]) NgramsTerms (Map.keys ngs)
graph <- liftIO $ cooc2graph myCooc
pure $ set graph_metadata (Just metadata)
......
......@@ -82,6 +82,7 @@ getPhylo _phyloId _lId l f b l' ms x y z ts s o e d b' = do
q = initPhyloQueryView l f b l' ms fs' ts so e d b'
-- | TODO remove phylo for real data here
pure (toPhyloView q phylo)
-- TODO remove phylo for real data here
------------------------------------------------------------------------
{-
......
resolver: lts-12.10
resolver: lts-12.26
flags: {}
extra-package-dbs: []
packages:
- .
- 'deps/servant-job'
- 'deps/clustering-louvain'
- 'deps/patches-map'
- 'deps/patches-class'
- 'deps/haskell-opaleye'
- 'deps/hsparql'
allow-newer: true
extra-deps:
- time-units-1.0.0
- git: https://github.com/delanoe/data-time-segment.git
commit: 4e3d57d80e9dfe6624c8eeaa8595fc8fe64d8723
commit: 10a416b9f6c443866b36479c3441ebb3bcdeb7ef
- git: https://gitlab.iscpif.fr/gargantext/hlcm.git
commit: 6f0595d2421005837d59151a8b26eee83ebb67b5
- git: https://github.com/delanoe/servant-static-th.git
......@@ -23,7 +16,22 @@ extra-deps:
commit: 90eef7604bb230644c2246eccd094d7bfefcb135
- git: https://github.com/paulrzcz/HSvm.git
commit: 3fe28b683aba5ddf05e3b5f8eced0bd05c5a29f9
#- opaleye-0.6.7002.0
- git: https://github.com/robstewart57/rdf4h.git
commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
- git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed
commit: dcaa0f5dd53f20648f4f5a615d29163582a4219c
- git: https://gitlab.iscpif.fr/gargantext/patches-class
commit: 746b4ce0af8f9e600d555ad7e5b2973a940cdad9
- git: https://github.com/np/servant-job.git
commit: ac4227441bbca30c44235582b5ec31340c569021
- git: https://gitlab.iscpif.fr/gargantext/clustering-louvain.git
commit: 1c636112b151110408e7c5a28cec39e46657358e
- git: https://github.com/np/patches-map
commit: 8c6f38c4844ead53e664cf9c82ba461715dbe445
- git: https://github.com/delanoe/haskell-opaleye.git #- opaleye-0.6.7002.0
commit: 53385de076be09f728a1b58c035a18e9ff9bcfd6
- git: https://github.com/delanoe/hsparql.git
commit: 308c74b71a1abb0a91546fa57d353131248e3a7f
- KMP-0.1.0.2
- accelerate-1.2.0.0
- aeson-lens-0.5.0.0
......@@ -35,10 +43,10 @@ extra-deps:
- multiset-0.3.4.1 # stack test
- probable-0.1.3
- rake-0.0.1
- rdf4h-3.1.1
- json-stream-0.4.2.4 # Text.Parsers (JSON)
- serialise-0.2.0.0
- servant-flatten-0.2
- servant-multipart-0.11.2
- stemmer-0.5.2
- time-units-1.0.0
- validity-0.9.0.0 # patches-{map,class}