[DBFLOW] Ngrams / NodeId / count

f6a473f2 · Alexandre Delanoë · 871b48ee · f6a473f2 · f6a473f2 · f6a473f2
Commit f6a473f2 authored Oct 24, 2018 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 169 additions and 66 deletions

Flow.hs src/Gargantext/Database/Flow.hs +80 -10

Ngram.hs src/Gargantext/Database/Ngram.hs +83 -51

Insert.hs src/Gargantext/Database/Node/Document/Insert.hs +6 -5

No files found.
--- a/src/Gargantext/Database/Flow.hs
+++ b/src/Gargantext/Database/Flow.hs
@@ -25,12 +25,17 @@ module Gargantext.Database.Flow
    where
 import System.FilePath (FilePath)
 import Data.Maybe (Maybe(..))
+import Data.Text (Text, unpack)
+import Data.Map (Map)
+import qualified Data.Map as DM
 import Gargantext.Core.Types (NodePoly(..))
 import Gargantext.Prelude
 import Gargantext.Database.Bashql (runCmd', del)
+import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus)
 import Gargantext.Database.User (getUser, UserLight(..), Username)
-import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(reId))
+import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
 import Gargantext.Database.Node.Document.Add    (add)
 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
@@ -41,23 +46,24 @@ type CorpusId = Int
 subFlow :: Username -> IO (UserId, RootId, CorpusId)
 subFlow username = do
  maybeUserId <- runCmd' (getUser username)
  let userId = case maybeUserId of
-        Nothing   -> panic "Error: User does not exist (yet)" -- mk NodeUser gargantua_id "Node Gargantua"
+        Nothing   -> panic "Error: User does not exist (yet)" 
+        -- mk NodeUser gargantua_id "Node Gargantua"
        Just user -> userLight_id user
  rootId' <- map _node_id <$> runCmd' (getRoot userId)
  rootId'' <- case rootId' of
        []  -> runCmd' (mkRoot userId)
        un  -> case length un >= 2 of
                 True  -> panic "Error: more than 1 userNode / user"
                 False -> pure rootId'
  let rootId = maybe (panic "error rootId") identity (head rootId'')
  corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
  let corpusId = maybe (panic "error corpusId") identity (head corpusId')
  printDebug "(username, userId, rootId, corpusId"
              (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)
@@ -68,17 +74,81 @@ flow fp = do
  (masterUserId, _, corpusId) <- subFlow "gargantua"
-  docs <- parseDocs WOS fp
+  docs <- map addUniqIds <$> parseDocs WOS fp
  ids  <- runCmd' $ insertDocuments masterUserId corpusId docs
  printDebug "Docs IDs : " ids
  idsRepeat  <- runCmd' $ insertDocuments masterUserId corpusId docs
  printDebug "Docs IDs : " idsRepeat
  (_, _, corpusId2) <- subFlow "alexandre"
  inserted <- runCmd' $ add corpusId2 (map reId ids)
  printDebug "Inserted : " inserted
-  runCmd' (del [corpusId2, corpusId])
+  runCmd' $ del [corpusId2, corpusId]
+----------------------------------------------------------------
+type HashId = Text
+type ToInsert = Map HashId HyperdataDocument
+type Inserted = Map HashId ReturnId
+toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
+toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
+  where
+    hash = maybe "Error" identity
+toInserted :: [ReturnId] -> Map HashId ReturnId
+toInserted rs = DM.fromList $ map    (\r ->  (reUniqId r, r)    )
+                            $ filter (\r -> reInserted r == True) rs
+data DocumentWithId = DocumentWithId { documentId   :: NodeId
+                             , documentData :: HyperdataDocument
+                             }
+type NodeId  = Int
+mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument
+          -> [DocumentWithId]
+mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
+  where
+    lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
+data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
+                                                 , document_ngrams :: Map Ngram Int
+                                                 }
+data NgramsType = Sources | Authors | Terms
+-- | Typed Ngrams
+data Ngrams   = Ngrams { ngramsType :: NgramsType
+                       , ngramsText :: Text
+                       , ngramsSize :: Int
+                       }
+type Ngram = Text
+type NgramId = Int
+documentIdWithNgrams :: (HyperdataDocument -> Map Ngram Int) -> [DocumentWithId] -> [DocumentIdWithNgrams]
+documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
+-- | TODO check optimization
+mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngram [(NodeId,Int)]
+mapNodeIdNgrams ds = DM.fromListWith (<>) xs
+  where
+    xs  = [(ngId, [(nId, i)]) | (nId, n2i') <- n2i ds, (ngId, i) <- DM.toList n2i']
+    n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
+--indexNgram :: [Ngram] -> 
+---- insert to NodeNgram
+---- using insertNgrams from 
+--indexNgram :: Map Ngram (Map NodeId Int) -> Map NgramId (Map NodeId Int)
+--indexNgram = undefined
+-- grouping here
--- a/src/Gargantext/Database/Ngram.hs
+++ b/src/Gargantext/Database/Ngram.hs
 {-|
-Module      : Gargantext.Databse.Ngram
+Module      : Gargantext.Database.Ngram
-Description : 
+Description : Ngram connection to the Database
 Copyright   : (c) CNRS, 2017-Present
 License     : AGPL + CECILL v3
 Maintainer  : team@gargantext.org
 Stability   : experimental
 Portability : POSIX
-Here is a longer description of this module, containing some
+Ngrams connection to the Database.
-commentary with @some markup@.
 -}
 {-# LANGUAGE Arrows                 #-}
+{-# LANGUAGE DeriveGeneric          #-}
 {-# LANGUAGE FlexibleInstances      #-}
 {-# LANGUAGE FunctionalDependencies #-}
 {-# LANGUAGE MultiParamTypeClasses  #-}
 {-# LANGUAGE NoImplicitPrelude      #-}
+{-# LANGUAGE OverloadedStrings      #-}
+{-# LANGUAGE QuasiQuotes            #-}
 {-# LANGUAGE TemplateHaskell        #-}
 module Gargantext.Database.Ngram where
-import Prelude
+import Control.Lens.TH (makeLensesWith, abbreviatedFields)
-import Data.Text (Text)
+import Data.List (find)
 import Data.Maybe (Maybe)
 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
-import Control.Lens.TH (makeLensesWith, abbreviatedFields)
+import GHC.Generics (Generic)
-import qualified Database.PostgreSQL.Simple as PGS
+import Data.ByteString.Internal (ByteString)
+import Data.Text (Text)
-import Opaleye
+import Database.PostgreSQL.Simple.FromRow (fromRow, field)
+import Database.PostgreSQL.Simple.SqlQQ
+import Gargantext.Database.Node (mkCmd, Cmd(..))
+-- import Opaleye
+import Prelude
-- Functions only
+import qualified Database.PostgreSQL.Simple as DPS
-import Data.List (find)
+import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
-data NgramPoly id terms n = Ngram { ngram_id    :: id
+--data NgramPoly id terms n = NgramDb { ngram_id    :: id
-                                  , ngram_terms :: terms
+--                                    , ngram_terms :: terms
-                                  , ngram_n     :: n
+--                                    , ngram_n     :: n
-                                  } deriving (Show)
+--                                    } deriving (Show)
+--
+--type NgramWrite = NgramPoly (Maybe (Column PGInt4))
+--                                   (Column PGText)
+--                                   (Column PGInt4)
+--
+--type NgramRead  = NgramPoly        (Column PGInt4)
+--                                   (Column PGText)
+--                                   (Column PGInt4)
+--
+----type Ngram = NgramPoly Int Text Int
+--
+-- $(makeAdaptorAndInstance "pNgram"    ''NgramPoly)
+-- $(makeLensesWith abbreviatedFields   ''NgramPoly)
+--
+--ngramTable :: Table NgramWrite NgramRead
+--ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id    = optional "id"
+--                                            , ngram_terms = required "terms"
+--                                            , ngram_n     = required "n"
+--                                            }
+--                                )
+--
+--queryNgramTable :: Query NgramRead
+--queryNgramTable = queryTable ngramTable
+--
+--dbGetNgrams :: PGS.Connection -> IO [NgramDb]
+--dbGetNgrams conn = runQuery conn queryNgramTable
-type NgramWrite = NgramPoly (Maybe (Column PGInt4))
+type Ngram = Text
-                                   (Column PGText)
+type NgramId = Int
-                                   (Column PGInt4)
+type SizeN = Int
-type NgramRead  = NgramPoly        (Column PGInt4)
+data NgramIds = NgramIds { ngramId :: Int
-                                   (Column PGText)
+                         , ngramTerms :: Text
-                                   (Column PGInt4)
+     } deriving (Show, Generic)
-type Ngram = NgramPoly Int Text Int
+instance DPS.FromRow NgramIds where
+  fromRow = NgramIds <$> field <*> field
-$(makeAdaptorAndInstance "pNgram"    ''NgramPoly)
+----------------------
-$(makeLensesWith abbreviatedFields   ''NgramPoly)
+insertNgrams :: [(Ngram, SizeN)] -> Cmd [DPS.Only Int]
+insertNgrams ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
+  where
+    fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
-ngramTable :: Table NgramWrite NgramRead
-ngramTable = Table "ngrams" (pNgram Ngram { ngram_id    = optional "id"
-                                            , ngram_terms = required "terms"
-                                            , ngram_n     = required "n"
-                                            }
-                                )
-queryNgramTable :: Query NgramRead
+insertNgrams_Debug :: [(Ngram, SizeN)] -> Cmd ByteString
-queryNgramTable = queryTable ngramTable
+insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
+  where
+    fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
--selectUsers :: Query UserRead
--selectUsers = proc () -> do
--      --user@(i, p, ll, is, un, fn, ln, m, iff, ive, dj) <- queryUserTable -< ()
--      row@(User i p ll is un fn ln m iff ive dj) <- queryUserTable -< ()
--      O.restrict -< i .== 1
--      --returnA -< User i p ll is un fn ln m iff ive dj
--      returnA -< row
--
-findWith :: (Eq a1, Foldable t) => (a -> a1) -> a1 -> t a -> Maybe a
+----------------------
-findWith f t = find (\x -> f x == t)
+queryInsertNgrams :: DPS.Query
+queryInsertNgrams = [sql|
+    WITH input_rows(terms,n) AS (?)
+    , ins AS (
+       INSERT INTO ngrams (terms,n)
+       SELECT * FROM input_rows
+       ON CONFLICT (terms) DO NOTHING -- unique index created here
+       RETURNING id,terms
+       )
--userWithUsername :: Text -> [User] -> Maybe User
+    SELECT id, terms
--userWithUsername t xs = userWith userUsername t xs
+    FROM   ins
--
+    UNION  ALL
--userWithId :: Integer -> [User] -> Maybe User
+    SELECT c.id, terms
--userWithId t xs = userWith userUserId t xs
+    FROM   input_rows
+    JOIN   ngrams c USING (terms);     -- columns of unique index
+           |]
-- | not optimized (get all ngrams without filters)
-dbGetNgrams :: PGS.Connection -> IO [Ngram]
-dbGetNgrams conn = runQuery conn queryNgramTable
--- a/src/Gargantext/Database/Node/Document/Insert.hs
+++ b/src/Gargantext/Database/Node/Document/Insert.hs
@@ -150,8 +150,9 @@ queryInsert = [sql|
           |]
 prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
-prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document" identity $ _hyperdataDocument_title h) 
+prepare uId pId = map (\h -> InputData tId uId pId 
-                                                   (toJSON $ addUniqId h)
+                            (maybe "No Title of Document" identity $ _hyperdataDocument_title h)
+                                                   (toJSON h)
                      )
  where
    tId  = nodeTypeId NodeDocument
@@ -166,7 +167,7 @@ prepare uId pId = map (\h -> InputData tId uId pId (maybe "No Title of Document"
 data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
                         , reId       :: Int  -- ^ always return the id of the document (even new or not new)
                                         --   this is the uniq id in the database
-                         , reUniqId   :: Maybe Text -- ^ Hash Id with concatenation of hash parameters
+                         , reUniqId   :: Text -- ^ Hash Id with concatenation of hash parameters
                         } deriving (Show, Generic)
 instance FromRow ReturnId where
@@ -195,8 +196,8 @@ instance ToRow InputData where
 ---------------------------------------------------------------------------
 -- * Uniqueness of document definition
-addUniqId :: HyperdataDocument -> HyperdataDocument
+addUniqIds :: HyperdataDocument -> HyperdataDocument
-addUniqId doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
+addUniqIds doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
              $ set hyperdataDocument_uniqId    (Just hash) doc
  where
    hash    = uniqId $ DT.concat $ map ($ doc) hashParameters