Commit f6a473f2 authored by Alexandre Delanoë

[DBFLOW] Ngrams / NodeId / count

parent 871b48ee
......@@ -25,12 +25,17 @@ module Gargantext.Database.Flow
where
import System.FilePath (FilePath)
import Data.Maybe (Maybe(..))
import Data.Text (Text, unpack)
import Data.Map (Map)
import qualified Data.Map as DM
import Gargantext.Core.Types (NodePoly(..))
import Gargantext.Prelude
import Gargantext.Database.Bashql (runCmd', del)
import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus)
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(reId))
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
......@@ -43,7 +48,8 @@ subFlow username = do
maybeUserId <- runCmd' (getUser username)
let userId = case maybeUserId of
Nothing -> panic "Error: User does not exist (yet)" -- mk NodeUser gargantua_id "Node Gargantua"
Nothing -> panic "Error: User does not exist (yet)"
-- mk NodeUser gargantua_id "Node Gargantua"
Just user -> userLight_id user
rootId' <- map _node_id <$> runCmd' (getRoot userId)
......@@ -68,7 +74,7 @@ flow fp = do
(masterUserId, _, corpusId) <- subFlow "gargantua"
docs <- parseDocs WOS fp
docs <- map addUniqIds <$> parseDocs WOS fp
ids <- runCmd' $ insertDocuments masterUserId corpusId docs
printDebug "Docs IDs : " ids
......@@ -80,5 +86,69 @@ flow fp = do
inserted <- runCmd' $ add corpusId2 (map reId ids)
printDebug "Inserted : " inserted
runCmd' (del [corpusId2, corpusId])
runCmd' $ del [corpusId2, corpusId]
----------------------------------------------------------------
-- | Hash identifying a document (derived from its uniq-id fields).
type HashId = Text
-- | Documents to insert, keyed by their hash id.
type ToInsert = Map HashId HyperdataDocument
-- | Insertion results, keyed by their hash id.
type Inserted = Map HashId ReturnId
-- | Index documents by their BDD uniq id so they can later be matched
-- against the insertion results.
-- NOTE(review): documents missing 'uniqIdBdd' all fall under the key
-- "Error" and overwrite each other in the map — confirm this is intended.
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert docs = DM.fromList
    [ (maybe "Error" identity (_hyperdataDocument_uniqIdBdd d), d) | d <- docs ]
-- | Index the insertion results by their uniq hash id, keeping only the
-- rows that were actually inserted (i.e. new documents).
-- Fix: @reInserted r == True@ is a redundant boolean comparison; use the
-- field directly.
toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted rs = DM.fromList [ (reUniqId r, r) | r <- rs, reInserted r ]
-- | A document's hyperdata paired with the node id it received in the DB.
data DocumentWithId = DocumentWithId { documentId :: NodeId
                                     , documentData :: HyperdataDocument
                                     }
-- | Node primary key in the database.
type NodeId = Int
-- | Join the insertion results with the original documents on their hash
-- id, producing one 'DocumentWithId' per inserted row.
-- Panics when an inserted hash has no matching document.
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument
          -> [DocumentWithId]
mergeData rets docs =
    [ DocumentWithId (reId ret) (docFor h) | (h, ret) <- DM.toList rets ]
  where
    -- Total lookup: missing keys are a programming error, hence the panic.
    docFor h = maybe (panic $ "Error with " <> h) identity (DM.lookup h docs)
-- | An inserted document together with its ngrams and their counts.
data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
                                                 , document_ngrams :: Map Ngram Int
                                                 }
-- | Kind of ngrams extracted from a document.
data NgramsType = Sources | Authors | Terms
-- | Typed Ngrams
data Ngrams = Ngrams { ngramsType :: NgramsType
                     , ngramsText :: Text
                     , ngramsSize :: Int
                     }
-- | Raw ngram text.
type Ngram = Text
-- | Ngram primary key in the database.
type NgramId = Int
-- | Attach the ngrams produced by the supplied extraction function to each
-- already-inserted document.
documentIdWithNgrams :: (HyperdataDocument -> Map Ngram Int) -> [DocumentWithId] -> [DocumentIdWithNgrams]
documentIdWithNgrams extract docs =
    [ DocumentIdWithNgrams d (extract (documentData d)) | d <- docs ]
-- | Invert the per-document ngram maps into a per-ngram list of
-- (node id, count) pairs. TODO check optimization.
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngram [(NodeId,Int)]
mapNodeIdNgrams ds = DM.fromListWith (<>) pairs
  where
    pairs = [ (ng, [(nodeId, cnt)])
            | d <- ds
            , let nodeId = (documentId . documentWithId) d
            , (ng, cnt) <- DM.toList (document_ngrams d)
            ]
--indexNgram :: [Ngram] ->
---- insert to NodeNgram
---- using insertNgrams from
--indexNgram :: Map Ngram (Map NodeId Int) -> Map NgramId (Map NodeId Int)
--indexNgram = undefined
-- grouping here
{-|
Module      : Gargantext.Database.Ngram
Description : Ngram connection to the Database
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Ngrams connection to the Database.
-}
{-# LANGUAGE Arrows #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Ngram where
import Prelude
import Data.Text (Text)
import Control.Lens.TH (makeLensesWith, abbreviatedFields)
import Data.List (find)
import Data.Maybe (Maybe)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Control.Lens.TH (makeLensesWith, abbreviatedFields)
import qualified Database.PostgreSQL.Simple as PGS
import GHC.Generics (Generic)
import Data.ByteString.Internal (ByteString)
import Data.Text (Text)
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
import Gargantext.Database.Node (mkCmd, Cmd(..))
-- import Opaleye
import Prelude
import Opaleye
import qualified Database.PostgreSQL.Simple as DPS
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
-- Functions only
import Data.List (find)
--data NgramPoly id terms n = NgramDb { ngram_id :: id
-- , ngram_terms :: terms
-- , ngram_n :: n
-- } deriving (Show)
--
--type NgramWrite = NgramPoly (Maybe (Column PGInt4))
-- (Column PGText)
-- (Column PGInt4)
--
--type NgramRead = NgramPoly (Column PGInt4)
-- (Column PGText)
-- (Column PGInt4)
--
----type Ngram = NgramPoly Int Text Int
--
-- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
-- $(makeLensesWith abbreviatedFields ''NgramPoly)
--
--ngramTable :: Table NgramWrite NgramRead
--ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
-- , ngram_terms = required "terms"
-- , ngram_n = required "n"
-- }
-- )
--
--queryNgramTable :: Query NgramRead
--queryNgramTable = queryTable ngramTable
--
--dbGetNgrams :: PGS.Connection -> IO [NgramDb]
--dbGetNgrams conn = runQuery conn queryNgramTable
-- | Polymorphic ngram row: id, terms, and n (size of the ngram).
-- NOTE(review): the record constructor is named 'Ngram', which collides
-- with the type synonyms of the same name nearby — confirm the intended
-- naming.
data NgramPoly id terms n = Ngram { ngram_id :: id
                                  , ngram_terms :: terms
                                  , ngram_n :: n
                                  } deriving (Show)
-- | Write-side column types for the ngrams table (id optional: assigned
-- by the database).
type NgramWrite = NgramPoly (Maybe (Column PGInt4))
                            (Column PGText)
                            (Column PGInt4)
-- NOTE(review): 'Ngram' is also declared as @NgramPoly Int Text Int@
-- further down; two type synonyms with the same name cannot coexist —
-- this looks like diff residue, confirm which definition is current.
type Ngram = Text
type NgramId = Int
-- | Size (word count) of an ngram.
type SizeN = Int
-- | Read-side column types for the ngrams table.
type NgramRead = NgramPoly (Column PGInt4)
                           (Column PGText)
                           (Column PGInt4)
-- | Result row of 'queryInsertNgrams': id and terms of each ngram.
data NgramIds = NgramIds { ngramId :: Int
                         , ngramTerms :: Text
                         } deriving (Show, Generic)
-- NOTE(review): duplicate synonym — 'Ngram' is already declared as 'Text'
-- above; diff residue, confirm which definition is current.
type Ngram = NgramPoly Int Text Int
instance DPS.FromRow NgramIds where
  fromRow = NgramIds <$> field <*> field
$(makeAdaptorAndInstance "pNgram" ''NgramPoly)
$(makeLensesWith abbreviatedFields ''NgramPoly)
----------------------
-- | Insert the given (terms, size) pairs into the @ngrams@ table and
-- return one id per row; already-present terms are resolved by the
-- ON CONFLICT clause of 'queryInsertNgrams'.
-- Idiom fix: eta-reduce the redundant lambda around 'QualifiedIdentifier'.
insertNgrams :: [(Ngram, SizeN)] -> Cmd [DPS.Only Int]
insertNgrams ns = mkCmd $ \conn ->
    DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
  where
    -- Column types of the VALUES clause: terms :: text, n :: int4.
    fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
-- | Opaleye description of the @ngrams@ table (id / terms / n columns),
-- built with the 'Ngram' record constructor of 'NgramPoly'.
ngramTable :: Table NgramWrite NgramRead
ngramTable = Table "ngrams" (pNgram Ngram { ngram_id = optional "id"
                                          , ngram_terms = required "terms"
                                          , ngram_n = required "n"
                                          }
                            )
-- | Select every row of the ngrams table, unfiltered.
queryNgramTable :: Query NgramRead
queryNgramTable = queryTable ngramTable
-- | Debug helper: render the exact SQL that 'insertNgrams' would send,
-- without executing it ('DPS.formatQuery' only substitutes parameters).
-- Idiom fix: eta-reduce the redundant lambda around 'QualifiedIdentifier'.
insertNgrams_Debug :: [(Ngram, SizeN)] -> Cmd ByteString
insertNgrams_Debug ns = mkCmd $ \conn ->
    DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
  where
    -- Column types of the VALUES clause: terms :: text, n :: int4.
    fields = map (QualifiedIdentifier Nothing) ["text", "int4"]
--selectUsers :: Query UserRead
--selectUsers = proc () -> do
-- --user@(i, p, ll, is, un, fn, ln, m, iff, ive, dj) <- queryUserTable -< ()
-- row@(User i p ll is un fn ln m iff ive dj) <- queryUserTable -< ()
-- O.restrict -< i .== 1
-- --returnA -< User i p ll is un fn ln m iff ive dj
-- returnA -< row
--
-- | Find the first element whose projection under the given selector
-- equals the target value, e.g. @findWith fst key pairs@.
findWith :: (Eq a1, Foldable t) => (a -> a1) -> a1 -> t a -> Maybe a
findWith sel target = find ((== target) . sel)
----------------------
-- | Upsert query: insert (terms, n) rows and return (id, terms) for every
-- input row — newly inserted rows via RETURNING, pre-existing rows via the
-- final JOIN on the unique @terms@ index.
-- NOTE(review): the @--userWith...@ lines embedded in the query text below
-- look like diff residue from another module; as @--@ starts a SQL line
-- comment the server ignores them, but confirm they should be removed.
queryInsertNgrams :: DPS.Query
queryInsertNgrams = [sql|
WITH input_rows(terms,n) AS (?)
, ins AS (
INSERT INTO ngrams (terms,n)
SELECT * FROM input_rows
ON CONFLICT (terms) DO NOTHING -- unique index created here
RETURNING id,terms
)
--userWithUsername :: Text -> [User] -> Maybe User
--userWithUsername t xs = userWith userUsername t xs
--
--userWithId :: Integer -> [User] -> Maybe User
--userWithId t xs = userWith userUserId t xs
SELECT id, terms
FROM ins
UNION ALL
SELECT c.id, terms
FROM input_rows
JOIN ngrams c USING (terms); -- columns of unique index
|]
-- | not optimized (get all ngrams without filters)
-- Runs 'queryNgramTable' against the given connection and returns every
-- row. NOTE(review): the result type relies on the 'Ngram' synonym, which
-- has two conflicting declarations in this file — confirm which applies.
dbGetNgrams :: PGS.Connection -> IO [Ngram]
dbGetNgrams conn = runQuery conn queryNgramTable
......@@ -150,8 +150,9 @@ queryInsert = [sql|
|]
-- | Build one 'InputData' row per document for the bulk-insert query;
-- falls back to "No Title of Document" when the title is missing.
-- NOTE(review): two versions of this binding are interleaved here (diff
-- residue): the older one applied 'addUniqId' before 'toJSON'; confirm
-- the later one (plain @toJSON h@) is the intended survivor.
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document" identity $ _hyperdataDocument_title h)
(toJSON $ addUniqId h)
prepare uId pId = map (\h -> InputData tId uId pId
(maybe "No Title of Document" identity $ _hyperdataDocument_title h)
(toJSON h)
)
where
-- All prepared rows are typed as documents.
tId = nodeTypeId NodeDocument
......@@ -166,7 +167,7 @@ prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document"
-- | Outcome of inserting one document into the database.
-- NOTE(review): both the old (@Maybe Text@) and new (@Text@) 'reUniqId'
-- field lines are present below — diff residue; the plain 'Text' version
-- matches how 'toInserted' keys its map.
data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
, reId :: Int -- ^ always return the id of the document (even new or not new)
-- this is the uniq id in the database
, reUniqId :: Maybe Text -- ^ Hash Id with concatenation of hash parameters
, reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
} deriving (Show, Generic)
instance FromRow ReturnId where
......@@ -195,8 +196,8 @@ instance ToRow InputData where
---------------------------------------------------------------------------
-- * Uniqueness of document definition
addUniqId :: HyperdataDocument -> HyperdataDocument
addUniqId doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
addUniqIds :: HyperdataDocument -> HyperdataDocument
addUniqIds doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
$ set hyperdataDocument_uniqId (Just hash) doc
where
hash = uniqId $ DT.concat $ map ($ doc) hashParameters
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment