Commit 96920cfd authored by Alexandre Delanoë's avatar Alexandre Delanoë

[DB][FLOW] fix duplicate ngrams insertion.

parent 714462cc
......@@ -28,13 +28,13 @@ import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
import Gargantext.Database.Bashql (runCmd') -- , del)
import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)--, mkGraph, mkDashboard)--, mkAnnuaire)
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds, Hyper(HyperDocument))
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds, ToDbData(..))
import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.Node.Contact (HyperdataContact(..))
-- import Gargantext.Database.Node.Contact (HyperdataContact(..))
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Prelude
......@@ -47,19 +47,20 @@ type CorpusId = Int
flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
flowDatabase ff fp cName = do
-- Corus Flow
-- Corpus Flow
(masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
-- Documents Flow
hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
let hyperdataDocuments' = map (\h -> HyperDocument h) hyperdataDocuments
printDebug "hyperdataDocuments" hyperdataDocuments
let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
printDebug "hyperdataDocuments" (length hyperdataDocuments)
ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
--printDebug "Docs IDs : " (ids)
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
printDebug "Repeated Docs IDs : " (length idsRepeat)
-- printDebug "Docs IDs : " (ids)
-- idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
-- printDebug "Repeated Docs IDs : " (length idsRepeat)
let idsNotRepeated = filter (\r -> reInserted r == True) ids
-- {-
-- Ngrams Flow
-- todo: flow for new documents only
let tids = toInserted ids
......@@ -68,37 +69,38 @@ flowDatabase ff fp cName = do
let tihs = toInsert hyperdataDocuments
printDebug "toInsert hyperdataDocuments" (length tihs)
let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
printDebug "documentsWithId" documentsWithId
let documentsWithId = mergeData (toInserted idsNotRepeated) (toInsert hyperdataDocuments)
-- printDebug "documentsWithId" documentsWithId
-- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
printDebug "docsWithNgrams" docsWithNgrams
-- printDebug "docsWithNgrams" docsWithNgrams
let maps = mapNodeIdNgrams docsWithNgrams
printDebug "maps" (maps)
-- printDebug "maps" (maps)
indexedNgrams <- runCmd' $ indexNgrams maps
printDebug "inserted ngrams" indexedNgrams
-- printDebug "inserted ngrams" indexedNgrams
_ <- runCmd' $ insertToNodeNgrams indexedNgrams
-- List Flow
listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
printDebug "list id : " listId2
(userId, rootUserId, corpusId2) <- subFlowCorpus userArbitrary cName
--(userId, rootUserId, corpusId2) <- subFlowCorpus userArbitrary cName
--}
(userId, _, corpusId2) <- subFlowCorpus userArbitrary cName
userListId <- runCmd' $ listFlowUser userId corpusId2
printDebug "UserList : " userListId
inserted <- runCmd' $ add corpusId2 (map reId ids)
printDebug "Inserted : " (length inserted)
_ <- runCmd' $ mkDashboard corpusId2 userId
_ <- runCmd' $ mkGraph corpusId2 userId
--_ <- runCmd' $ mkDashboard corpusId2 userId
--_ <- runCmd' $ mkGraph corpusId2 userId
-- Annuaire Flow
annuaireId <- runCmd' $ mkAnnuaire rootUserId userId
-- _ <- runCmd' $ mkAnnuaire rootUserId userId
pure corpusId2
-- runCmd' $ del [corpusId2, corpusId]
......@@ -208,16 +210,16 @@ insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId
------------------------------------------------------------------------
listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
listFlow uId cId ngs = do
printDebug "ngs:" ngs
-- printDebug "ngs:" ngs
lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
printDebug "ngs" (DM.keys ngs)
--printDebug "ngs" (DM.keys ngs)
-- TODO add stemming equivalence of 2 ngrams
let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
_ <- insertGroups lId groupEd
-- compute Candidate / Map
let lists = ngrams2list ngs
printDebug "lists:" lists
-- printDebug "lists:" lists
is <- insertLists lId lists
printDebug "listNgrams inserted :" is
......
......@@ -24,6 +24,7 @@ Ngrams connection to the Database.
module Gargantext.Database.Ngrams where
-- import Opaleye
import Debug.Trace (trace)
import Prelude (Enum, Bounded, minBound, maxBound)
import Control.Lens (makeLenses, view)
import Data.ByteString.Internal (ByteString)
......@@ -223,9 +224,11 @@ getNgramsTableData :: DPS.Connection
-> NodeType -> NgramsType
-> NgramsTableParamUser -> NgramsTableParamMaster
-> IO [NgramsTableData]
getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
getNgramsTableData conn nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) = do
_ <- trace $ show (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w)
<$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
-- <$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
<$> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc)
where
nodeTId = nodeTypeId nodeT
ngrmTId = ngramsTypeId ngrmT
......@@ -235,23 +238,28 @@ querySelectTableNgrams :: DPS.Query
querySelectTableNgrams = [sql|
WITH tableUser AS (
SELECT ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
JOIN nodes n ON n.id = nn2.node_id
WHERE nn1.node_id = ? -- User listId
AND n.parent_id = ? -- User CorpusId or AnnuaireId
AND n.typename = ? -- both type of childs (Documents or Contacts)
AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
), tableMaster AS (
SELECT ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
JOIN nodes n ON n.id = nn2.node_id
WHERE nn1.node_id = ? -- Master listId
AND n.parent_id = ? -- Master CorpusId or AnnuaireId
AND n.typename = ? -- both type of childs (Documents or Contacts)
AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
JOIN nodes_ngrams list ON list.ngram_id = ngs.id
JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
JOIN nodes n ON n.id = corp.node_id
WHERE list.node_id = ? -- User listId
AND n.parent_id = ? -- User CorpusId or AnnuaireId
AND n.typename = ? -- both type of childs (Documents or Contacts)
AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms or...)
)
, tableMaster AS (
SELECT ngs.terms, ngs.n, list.ngrams_type, corp.weight FROM ngrams ngs
JOIN nodes_ngrams list ON list.ngram_id = ngs.id
JOIN nodes_ngrams corp ON corp.ngram_id = ngs.id
JOIN nodes n ON n.id = corp.node_id
JOIN nodes_nodes nn ON nn.node2_id = n.id
WHERE list.node_id = ? -- Master listId
AND n.parent_id = ? -- Master CorpusId or AnnuaireId
AND n.typename = ? -- Master childs (Documents or Contacts)
AND corp.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
AND nn.node1_id = ? -- User CorpusId or AnnuaireId
)
SELECT COALESCE(tu.terms,tm.terms) AS terms
......
......@@ -30,12 +30,7 @@ import GHC.Int (Int64)
import Control.Lens (set)
import Data.Maybe
import Data.Time (UTCTime)
import Database.PostgreSQL.Simple.FromField ( Conversion
, ResultError(ConversionFailed)
, FromField
, fromField
, returnError
)
import Database.PostgreSQL.Simple.FromField (FromField, fromField)
import Prelude hiding (null, id, map, sum)
import Gargantext.Core (Lang(..))
......@@ -46,7 +41,6 @@ import Gargantext.Database.Queries
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Prelude hiding (sum)
import Database.PostgreSQL.Simple.Internal (Field)
import Control.Applicative (Applicative)
import Control.Arrow (returnA)
import Control.Lens.TH (makeLensesWith, abbreviatedFields)
......@@ -56,7 +50,6 @@ import Data.Aeson
import Data.Maybe (Maybe, fromMaybe)
import Data.Text (Text)
import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
import Data.Typeable (Typeable)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Lazy as DBL
......
......@@ -17,30 +17,21 @@ Portability : POSIX
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Node.Contact (NodeContact,HyperdataContact, ContactWho, ContactWhere, ContactTouch)
module Gargantext.Database.Node.Contact
where
import GHC.Generics (Generic)
import Data.Aeson.TH (deriveJSON)
import Data.Text (Text)
import Data.Time (UTCTime)
import qualified Data.Text as DT
import Control.Lens (makeLenses)
import Database.PostgreSQL.Simple
import Opaleye (QueryRunnerColumnDefault
, queryRunnerColumnDefault, PGJsonb, fieldQueryRunnerColumn)
-- import Control.Lens (makeLenses)
import Opaleye (QueryRunnerColumnDefault, queryRunnerColumnDefault, PGJsonb, fieldQueryRunnerColumn)
import Gargantext.Database.Utils (fromField')
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Node (NodeWrite', AnnuaireId, UserId, Name, node)
import Gargantext.Prelude
import Gargantext.Database.Types.Node (Node,Hyperdata,NodeType(..))
import Data.Aeson (Result(Error,Success), fromJSON, FromJSON, ToJSON)
import Database.PostgreSQL.Simple.FromField ( Conversion
, ResultError(ConversionFailed)
, FromField
, fromField
, returnError
)
import Database.PostgreSQL.Simple.FromField (FromField, fromField)
------------------------------------------------------------------------
......
......@@ -60,27 +60,26 @@ the concatenation of the parameters defined by @hashParameters@.
module Gargantext.Database.Node.Document.Insert where
import Control.Lens (set)
import Data.Aeson (toJSON, Value, ToJSON)
import Data.ByteString.Internal (ByteString)
import Data.Aeson (toJSON, Value)
import Data.Maybe (maybe)
import Data.Text (Text)
import Data.Typeable (Typeable)
import Database.PostgreSQL.Simple (FromRow, Query, formatQuery, query, Only(..))
import Database.PostgreSQL.Simple (FromRow, Query, query, Only(..))
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
import Database.PostgreSQL.Simple.ToField (toField)
import Database.PostgreSQL.Simple.ToRow (ToRow(..))
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import Data.Text (Text)
import qualified Data.Text as DT (pack, unpack, concat, take)
import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
import GHC.Generics (Generic)
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Node (mkCmd, Cmd(..))
import Gargantext.Database.Node.Contact (HyperdataContact(..))
import Gargantext.Database.Types.Node
import Gargantext.Prelude
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
import qualified Data.Text as DT (pack, unpack, concat, take)
-- TODO : the import of Document constructor below does not work
-- import Gargantext.Database.Types.Node (Document)
--import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
......@@ -93,9 +92,11 @@ import Gargantext.Database.Types.Node
-- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
-- , NodeTypeId
-- )
import Gargantext.Prelude
{-| To Print result query
import Data.ByteString.Internal (ByteString)
import Database.PostgreSQL.Simple (formatQuery)
-}
import GHC.Generics (Generic)
---------------------------------------------------------------------------
-- * Main Insert functions
......@@ -108,16 +109,12 @@ import GHC.Generics (Generic)
-- ParentId : folder ID which is parent of the inserted documents
data Hyper = HyperDocument HyperdataDocument | HyperContact HyperdataContact
data ToDbData = ToDbDocument HyperdataDocument | ToDbContact HyperdataContact
insertDocuments :: UserId -> ParentId -> [Hyper] -> Cmd [ReturnId]
insertDocuments uId pId hs = mkCmd $ \c -> query c queryInsert (Only $ Values fields inputData)
insertDocuments :: UserId -> ParentId -> [ToDbData] -> Cmd [ReturnId]
insertDocuments uId pId hs = mkCmd $ \c -> query c queryInsert (Only $ Values fields $ prepare uId pId hs)
where
fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
inputData = case hs of
[HyperDocument _] -> prepare _hyperdataDocument_title uId pId $ map (\(HyperDocument h) -> h) hs
[HyperContact _] -> prepare (\_ -> Just "name") uId pId $ map (\(HyperContact h) -> h) hs
_ -> panic "error"
-- | Debug SQL function
--
......@@ -159,13 +156,18 @@ queryInsert = [sql|
JOIN nodes c USING (hyperdata); -- columns of unique index
|]
prepare :: (Hyperdata a, ToJSON a) => (a -> Maybe Text) -> UserId -> ParentId -> [a] -> [InputData]
prepare f uId pId = map (\h -> InputData tId uId pId
(DT.take 255 <$> maybe "No Title" identity $ f h)
(toJSON h)
)
prepare :: UserId -> ParentId -> [ToDbData] -> [InputData]
prepare uId pId = map (\h -> InputData tId uId pId (name h) (toJSON' h))
where
tId = nodeTypeId NodeDocument
tId = nodeTypeId NodeDocument
toJSON' (ToDbDocument hd) = toJSON hd
toJSON' (ToDbContact hc) = toJSON hc
name h = DT.take 255 <$> maybe "No Title" identity $ f h
where
f (ToDbDocument hd) = _hyperdataDocument_title hd
f (ToDbContact _ ) = Just "Contact" -- TODO view FirstName . LastName
------------------------------------------------------------------------
-- * Main Types used
......
......@@ -27,7 +27,6 @@ import Database.PostgreSQL.Simple.Internal (Field)
import qualified Data.ByteString as DB
import Database.PostgreSQL.Simple.FromField ( Conversion
, ResultError(ConversionFailed)
, FromField
, fromField
, returnError
)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment