Commit f6a473f2 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DBFLOW] Ngrams / NodeId / count

parent 871b48ee
...@@ -25,12 +25,17 @@ module Gargantext.Database.Flow ...@@ -25,12 +25,17 @@ module Gargantext.Database.Flow
where where
import System.FilePath (FilePath) import System.FilePath (FilePath)
import Data.Maybe (Maybe(..)) import Data.Maybe (Maybe(..))
import Data.Text (Text, unpack)
import Data.Map (Map)
import qualified Data.Map as DM
import Gargantext.Core.Types (NodePoly(..)) import Gargantext.Core.Types (NodePoly(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Database.Bashql (runCmd', del) import Gargantext.Database.Bashql (runCmd', del)
import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus) import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus)
import Gargantext.Database.User (getUser, UserLight(..), Username) import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(reId)) import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.Node.Document.Add (add) import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS)) import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
...@@ -41,23 +46,24 @@ type CorpusId = Int ...@@ -41,23 +46,24 @@ type CorpusId = Int
subFlow :: Username -> IO (UserId, RootId, CorpusId) subFlow :: Username -> IO (UserId, RootId, CorpusId)
subFlow username = do subFlow username = do
maybeUserId <- runCmd' (getUser username) maybeUserId <- runCmd' (getUser username)
let userId = case maybeUserId of let userId = case maybeUserId of
Nothing -> panic "Error: User does not exist (yet)" -- mk NodeUser gargantua_id "Node Gargantua" Nothing -> panic "Error: User does not exist (yet)"
-- mk NodeUser gargantua_id "Node Gargantua"
Just user -> userLight_id user Just user -> userLight_id user
rootId' <- map _node_id <$> runCmd' (getRoot userId) rootId' <- map _node_id <$> runCmd' (getRoot userId)
rootId'' <- case rootId' of rootId'' <- case rootId' of
[] -> runCmd' (mkRoot userId) [] -> runCmd' (mkRoot userId)
un -> case length un >= 2 of un -> case length un >= 2 of
True -> panic "Error: more than 1 userNode / user" True -> panic "Error: more than 1 userNode / user"
False -> pure rootId' False -> pure rootId'
let rootId = maybe (panic "error rootId") identity (head rootId'') let rootId = maybe (panic "error rootId") identity (head rootId'')
corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
let corpusId = maybe (panic "error corpusId") identity (head corpusId') let corpusId = maybe (panic "error corpusId") identity (head corpusId')
printDebug "(username, userId, rootId, corpusId" printDebug "(username, userId, rootId, corpusId"
(username, userId, rootId, corpusId) (username, userId, rootId, corpusId)
pure (userId, rootId, corpusId) pure (userId, rootId, corpusId)
...@@ -68,17 +74,81 @@ flow fp = do ...@@ -68,17 +74,81 @@ flow fp = do
(masterUserId, _, corpusId) <- subFlow "gargantua" (masterUserId, _, corpusId) <- subFlow "gargantua"
docs <- parseDocs WOS fp docs <- map addUniqIds <$> parseDocs WOS fp
ids <- runCmd' $ insertDocuments masterUserId corpusId docs ids <- runCmd' $ insertDocuments masterUserId corpusId docs
printDebug "Docs IDs : " ids printDebug "Docs IDs : " ids
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
printDebug "Docs IDs : " idsRepeat printDebug "Docs IDs : " idsRepeat
(_, _, corpusId2) <- subFlow "alexandre" (_, _, corpusId2) <- subFlow "alexandre"
inserted <- runCmd' $ add corpusId2 (map reId ids) inserted <- runCmd' $ add corpusId2 (map reId ids)
printDebug "Inserted : " inserted printDebug "Inserted : " inserted
runCmd' (del [corpusId2, corpusId]) runCmd' $ del [corpusId2, corpusId]
----------------------------------------------------------------
type HashId = Text
type ToInsert = Map HashId HyperdataDocument
type Inserted = Map HashId ReturnId
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
where
hash = maybe "Error" identity
toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
$ filter (\r -> reInserted r == True) rs
data DocumentWithId = DocumentWithId { documentId :: NodeId
, documentData :: HyperdataDocument
}
type NodeId = Int
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument
-> [DocumentWithId]
mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
where
lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
, document_ngrams :: Map Ngram Int
}
data NgramsType = Sources | Authors | Terms
-- | Typed Ngrams
data Ngrams = Ngrams { ngramsType :: NgramsType
, ngramsText :: Text
, ngramsSize :: Int
}
type Ngram = Text
type NgramId = Int
documentIdWithNgrams :: (HyperdataDocument -> Map Ngram Int) -> [DocumentWithId] -> [DocumentIdWithNgrams]
documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
-- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngram [(NodeId,Int)]
mapNodeIdNgrams ds = DM.fromListWith (<>) xs
where
xs = [(ngId, [(nId, i)]) | (nId, n2i') <- n2i ds, (ngId, i) <- DM.toList n2i']
n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
--indexNgram :: [Ngram] ->
---- insert to NodeNgram
---- using insertNgrams from
--indexNgram :: Map Ngram (Map NodeId Int) -> Map NgramId (Map NodeId Int)
--indexNgram = undefined
-- grouping here
{-| {-|
Module : Gargantext.Databse.Ngram Module : Gargantext.Database.Ngram
Description : Description : Ngram connection to the Database
Copyright : (c) CNRS, 2017-Present Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
Here is a longer description of this module, containing some Ngrams connection to the Database.
commentary with @some markup@.
-} -}
{-# LANGUAGE Arrows #-} {-# LANGUAGE Arrows #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Ngram where module Gargantext.Database.Ngram where
import Prelude import Control.Lens.TH (makeLensesWith, abbreviatedFields)
import Data.Text (Text) import Data.List (find)
import Data.Maybe (Maybe) import Data.Maybe (Maybe)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance) import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Control.Lens.TH (makeLensesWith, abbreviatedFields) import GHC.Generics (Generic)
import qualified Database.PostgreSQL.Simple as PGS import Data.ByteString.Internal (ByteString)
import Data.Text (Text)
import Opaleye import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
import Gargantext.Database.Node (mkCmd, Cmd(..))
-- import Opaleye
import Prelude
-- Functions only import qualified Database.PostgreSQL.Simple as DPS
import Data.List (find) import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
data NgramPoly id terms n = Ngram { ngram_id :: id --data NgramPoly id terms n = NgramDb { ngram_id :: id
, ngram_terms :: terms -- , ngram_terms :: terms
, ngram_n :: n -- , ngram_n :: n
} deriving (Show) -- } deriving (Show)
--
--type NgramWrite = NgramPoly (Maybe (Column PGInt4))
-- (Column PGText)
-- (Column PGInt4)
--
--type NgramRead = NgramPoly (Column PGInt4)
-- (Column PGText)
-- (Column PGInt4)
--
----type Ngram = NgramPoly Int Text Int
--
-- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
-- $(makeLensesWith abbreviatedFields ''NgramPoly)
--
--ngramTable :: Table NgramWrite NgramRead
--ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
-- , ngram_terms = required "terms"
-- , ngram_n = required "n"
-- }
-- )
--
--queryNgramTable :: Query NgramRead
--queryNgramTable = queryTable ngramTable
--
--dbGetNgrams :: PGS.Connection -> IO [NgramDb]
--dbGetNgrams conn = runQuery conn queryNgramTable
type NgramWrite = NgramPoly (Maybe (Column PGInt4)) type Ngram = Text
(Column PGText) type NgramId = Int
(Column PGInt4) type SizeN = Int
type NgramRead = NgramPoly (Column PGInt4) data NgramIds = NgramIds { ngramId :: Int
(Column PGText) , ngramTerms :: Text
(Column PGInt4) } deriving (Show, Generic)
type Ngram = NgramPoly Int Text Int instance DPS.FromRow NgramIds where
fromRow = NgramIds <$> field <*> field
$(makeAdaptorAndInstance "pNgram" ''NgramPoly) ----------------------
$(makeLensesWith abbreviatedFields ''NgramPoly) insertNgrams :: [(Ngram, SizeN)] -> Cmd [DPS.Only Int]
insertNgrams ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
ngramTable :: Table NgramWrite NgramRead
ngramTable = Table "ngrams" (pNgram Ngram { ngram_id = optional "id"
, ngram_terms = required "terms"
, ngram_n = required "n"
}
)
queryNgramTable :: Query NgramRead insertNgrams_Debug :: [(Ngram, SizeN)] -> Cmd ByteString
queryNgramTable = queryTable ngramTable insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
where
fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
--selectUsers :: Query UserRead
--selectUsers = proc () -> do
-- --user@(i, p, ll, is, un, fn, ln, m, iff, ive, dj) <- queryUserTable -< ()
-- row@(User i p ll is un fn ln m iff ive dj) <- queryUserTable -< ()
-- O.restrict -< i .== 1
-- --returnA -< User i p ll is un fn ln m iff ive dj
-- returnA -< row
--
findWith :: (Eq a1, Foldable t) => (a -> a1) -> a1 -> t a -> Maybe a ----------------------
findWith f t = find (\x -> f x == t) queryInsertNgrams :: DPS.Query
queryInsertNgrams = [sql|
WITH input_rows(terms,n) AS (?)
, ins AS (
INSERT INTO ngrams (terms,n)
SELECT * FROM input_rows
ON CONFLICT (terms) DO NOTHING -- unique index created here
RETURNING id,terms
)
--userWithUsername :: Text -> [User] -> Maybe User SELECT id, terms
--userWithUsername t xs = userWith userUsername t xs FROM ins
-- UNION ALL
--userWithId :: Integer -> [User] -> Maybe User SELECT c.id, terms
--userWithId t xs = userWith userUserId t xs FROM input_rows
JOIN ngrams c USING (terms); -- columns of unique index
|]
-- | not optimized (get all ngrams without filters)
dbGetNgrams :: PGS.Connection -> IO [Ngram]
dbGetNgrams conn = runQuery conn queryNgramTable
...@@ -150,8 +150,9 @@ queryInsert = [sql| ...@@ -150,8 +150,9 @@ queryInsert = [sql|
|] |]
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData] prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document" identity $ _hyperdataDocument_title h) prepare uId pId = map (\h -> InputData tId uId pId
(toJSON $ addUniqId h) (maybe "No Title of Document" identity $ _hyperdataDocument_title h)
(toJSON h)
) )
where where
tId = nodeTypeId NodeDocument tId = nodeTypeId NodeDocument
...@@ -166,7 +167,7 @@ prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document" ...@@ -166,7 +167,7 @@ prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document"
data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new) data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
, reId :: Int -- ^ always return the id of the document (even new or not new) , reId :: Int -- ^ always return the id of the document (even new or not new)
-- this is the uniq id in the database -- this is the uniq id in the database
, reUniqId :: Maybe Text -- ^ Hash Id with concatenation of hash parameters , reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
} deriving (Show, Generic) } deriving (Show, Generic)
instance FromRow ReturnId where instance FromRow ReturnId where
...@@ -195,8 +196,8 @@ instance ToRow InputData where ...@@ -195,8 +196,8 @@ instance ToRow InputData where
--------------------------------------------------------------------------- ---------------------------------------------------------------------------
-- * Uniqueness of document definition -- * Uniqueness of document definition
addUniqId :: HyperdataDocument -> HyperdataDocument addUniqIds :: HyperdataDocument -> HyperdataDocument
addUniqId doc = set hyperdataDocument_uniqIdBdd (Just hashBdd) addUniqIds doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
$ set hyperdataDocument_uniqId (Just hash) doc $ set hyperdataDocument_uniqId (Just hash) doc
where where
hash = uniqId $ DT.concat $ map ($ doc) hashParameters hash = uniqId $ DT.concat $ map ($ doc) hashParameters
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment