TextSearch.hs 9.45 KB
Newer Older
1 2
{-|
Module      : Gargantext.Database.TextSearch
Alexandre Delanoë's avatar
Alexandre Delanoë committed
3
Description : Postgres text search experimentation
4 5 6 7 8 9 10
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}

11
{-# LANGUAGE Arrows            #-}
12 13
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
14
{-# LANGUAGE RankNTypes        #-}
15 16 17 18

module Gargantext.Database.TextSearch where

import Data.Aeson
19
import Data.Map.Strict hiding (map, drop, take)
20
import Data.Maybe
21
import Data.List (intersperse, take, drop)
22
import Data.String (IsString(..))
23
import Data.Text (Text, words, unpack, intercalate)
24
import Data.Time (UTCTime)
25
import Database.PostgreSQL.Simple (Query)
26
import Database.PostgreSQL.Simple.ToField
27 28
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Types.Node (NodeType(..))
29
import Gargantext.Prelude
30
--import Gargantext.Database.Node.Contact
31
import Gargantext.Database.Facet
32
import Gargantext.Database.Schema.Node
33
import Gargantext.Database.Schema.Ngrams
34
import Gargantext.Database.Schema.NodeNode
35
import Gargantext.Database.Schema.NodeNgram
36
import Gargantext.Database.Queries.Join (leftJoin6)
37
import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery)
38
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
39 40 41 42
import Gargantext.Core.Types
import Control.Arrow (returnA)
import qualified Opaleye as O hiding (Order)
import Opaleye hiding (Query, Order)
43

44

45
------------------------------------------------------------------------
46 47
searchInDatabase :: ParentId -> Text -> Cmd err [(NodeId, HyperdataDocument)]
searchInDatabase p t = runOpaQuery (queryInDatabase p t)
48

49
-- | Global search query where ParentId is Master Node Corpus Id 
50 51
queryInDatabase :: ParentId -> Text -> O.Query (Column PGInt4, Column PGJsonb)
queryInDatabase _ q = proc () -> do
52 53 54 55
    row <- queryNodeSearchTable -< ()
    restrict -< (_ns_search row)    @@ (pgTSQuery (unpack q))
    restrict -< (_ns_typename row) .== (pgInt4 $ nodeTypeId NodeDocument)
    returnA  -< (_ns_id row, _ns_hyperdata row)
56 57 58

------------------------------------------------------------------------
-- | todo add limit and offset and order
59 60
searchInCorpus :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [FacetDoc]
searchInCorpus cId q o l order = runOpaQuery (filterWith o l order $ queryInCorpus cId q')
61
  where
62
    q' = intercalate " | " $ map stemIt q
63

64
queryInCorpus :: CorpusId -> Text -> O.Query FacetDocRead
65 66
queryInCorpus cId q = proc () -> do
  (n, nn) <- joinInCorpus -< ()
67
  restrict -< ( nodeNode_node1_id nn) .== (toNullable $ pgNodeId cId)
68 69
  restrict -< (_ns_search n)           @@ (pgTSQuery (unpack q))
  restrict -< (_ns_typename n)        .== (pgInt4 $ nodeTypeId NodeDocument)
70
  returnA  -< FacetDoc (_ns_id n) (_ns_date n) (_ns_name n) (_ns_hyperdata n) (pgBool True) (pgInt4 1)
71

72 73
joinInCorpus :: O.Query (NodeSearchRead, NodeNodeReadNull)
joinInCorpus = leftJoin queryNodeSearchTable queryNodeNodeTable cond
74
  where
75
    cond :: (NodeSearchRead, NodeNodeRead) -> Column PGBool
76
    cond (n, nn) = nodeNode_node2_id nn .== _ns_id n
77

78
------------------------------------------------------------------------
79
type AuthorName = Text
80

81
-- | TODO Optim: Offset and Limit in the Query
82 83
searchInCorpusWithContacts :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [FacetPaired Int UTCTime HyperdataDocument Int [Pair Int Text]]
searchInCorpusWithContacts cId q o l order = take (maybe 5 identity l) <$> drop (maybe 0 identity o)
84 85 86
  <$> map (\((i,u,h,s), ps) -> FacetPaired i u h s (catMaybes ps))
  <$> toList <$> fromListWith (<>)
  <$> map (\(FacetPaired i u h s p) -> ((i,u,h,s), [maybePair p]))
87
  <$> searchInCorpusWithContacts' cId q o l order
88 89 90 91
  where
    maybePair (Pair Nothing Nothing) = Nothing
    maybePair (Pair _ Nothing) = Nothing
    maybePair (Pair Nothing _) = Nothing
92
    maybePair (Pair (Just p_id) (Just p_label)) = Just $ Pair p_id p_label
93

94 95
searchInCorpusWithContacts' :: CorpusId -> [Text] -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> Cmd err [(FacetPaired Int UTCTime HyperdataDocument Int (Pair (Maybe Int) (Maybe Text)))]
searchInCorpusWithContacts' cId q o l order = runOpaQuery $ queryInCorpusWithContacts cId q' o l order
96
  where
97
    q' = intercalate " | " $ map stemIt q
98 99 100 101 102



queryInCorpusWithContacts :: CorpusId -> Text -> Maybe Offset -> Maybe Limit -> Maybe OrderBy -> O.Query FacetPairedRead
queryInCorpusWithContacts cId q _ _ _ = proc () -> do
103
  (docs, (corpusDoc, (docNgrams, (ngrams', (_, contacts))))) <- joinInCorpusWithContacts -< ()
104 105
  restrict -< (_ns_search docs)              @@ (pgTSQuery  $ unpack q  )
  restrict -< (_ns_typename docs)           .== (pgInt4 $ nodeTypeId NodeDocument)
106
  restrict -< (nodeNode_node1_id corpusDoc) .== (toNullable $ pgNodeId cId)
107
  restrict -< (_nn_listType docNgrams)      .== (toNullable $ pgNgramsType Authors)
108
  restrict -< (_node_typename contacts)     .== (toNullable $ pgInt4 $ nodeTypeId NodeContact)
109
  -- let contact_id    = ifThenElse (isNull $ _node_id contacts) (toNullable $ pgInt4 0) (_node_id contacts)
110
  returnA  -< FacetPaired (_ns_id docs) (_ns_date docs) (_ns_hyperdata docs) (pgInt4 0) (Pair (_node_id contacts) (ngrams_terms ngrams'))
111

112 113
joinInCorpusWithContacts :: O.Query (NodeSearchRead, (NodeNodeReadNull, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))))
joinInCorpusWithContacts = leftJoin6 queryNodeTable queryNodeNgramTable queryNgramsTable queryNodeNgramTable queryNodeNodeTable queryNodeSearchTable cond12 cond23 cond34 cond45 cond56
114 115
    where
         cond12 :: (NodeNgramRead, NodeRead) -> Column PGBool
116
         cond12 (ng3, n2) = _node_id n2 .== _nn_node_id ng3
117 118
---------
         cond23 :: (NgramsRead, (NodeNgramRead, NodeReadNull)) -> Column PGBool
119
         cond23 (ng2, (nng2, _)) = _nn_ngrams_id nng2 .== ngrams_id ng2
120 121
         
         cond34 :: (NodeNgramRead, (NgramsRead, (NodeNgramReadNull, NodeReadNull))) -> Column PGBool
122
         cond34 (nng, (ng, (_,_))) = ngrams_id ng .== _nn_ngrams_id nng
123 124
         
         cond45 :: (NodeNodeRead, (NodeNgramRead, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))) -> Column PGBool
125
         cond45 (nn, (nng, (_,(_,_)))) = _nn_node_id nng .== nodeNode_node2_id nn
126 127 128
         
         cond56 :: (NodeSearchRead, (NodeNodeRead, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull))))) -> Column PGBool
         cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nodeNode_node2_id nn
129 130


131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
{-
queryGraphCorpusAuthors' :: O.Query (NodeSearchRead, (NodeNodeReadNull, (NodeNgramReadNull, NgramsReadNull)))
queryGraphCorpusAuthors' = leftJoin4 queryNgramsTable queryNodeNgramTable queryNodeNodeTable queryNodeSearchTable cond12 cond23 cond34
    where
         cond23 :: (NgramsRead, (NodeNgramRead, NodeReadNull)) -> Column PGBool
         cond23 (ng2, (nng2, _)) = nodeNgram_ngrams_id nng2 .== ngrams_id ng2
         
         cond34 :: (NodeNgramRead, (NgramsRead, (NodeNgramReadNull, NodeReadNull))) -> Column PGBool
         cond34 (nng, (ng, (_,_))) = ngrams_id ng .== nodeNgram_ngrams_id nng
         
         cond45 :: (NodeNodeRead, (NodeNgramRead, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull)))) -> Column PGBool
         cond45 (nn, (nng, (_,(_,_)))) = nodeNgram_node_id nng .== nodeNode_node2_id nn
         
         cond56 :: (NodeSearchRead, (NodeNodeRead, (NodeNgramReadNull, (NgramsReadNull, (NodeNgramReadNull, NodeReadNull))))) -> Column PGBool
         cond56 (n, (nn, (_,(_,(_,_))))) = _ns_id n .== nodeNode_node2_id nn
-}
147 148 149



150
newtype TSQuery = UnsafeTSQuery [Text]
151

152
-- | TODO [""] -> panic "error"
153
toTSQuery :: [Text] -> TSQuery
154
toTSQuery txt = UnsafeTSQuery $ map stemIt txt
155

156

157 158 159 160 161 162 163 164 165 166 167 168 169 170
instance IsString TSQuery
  where
    fromString = UnsafeTSQuery . words . cs


instance ToField TSQuery
  where
    toField (UnsafeTSQuery xs)
      = Many  $ intersperse (Plain " && ")
              $ map (\q -> Many [ Plain "plainto_tsquery("
                                , Escape (cs q)
                                , Plain ")"
                                ]
                    ) xs
171

172
data Order    = Asc | Desc
173

174 175 176 177
instance ToField Order
  where
    toField Asc  = Plain "ASC"
    toField Desc = Plain "DESC"
178 179 180 181 182

-- TODO
-- FIX fav
-- ADD ngrams count
-- TESTS
183
textSearchQuery :: Query
184
textSearchQuery = "SELECT n.id, n.hyperdata->'publication_year'     \
185 186
\                   , n.hyperdata->'title'                          \
\                   , n.hyperdata->'source'                         \
187 188 189 190 191
\                   , n.hyperdata->'authors'                        \
\                   , COALESCE(nn.score,null)                       \
\                      FROM nodes n                                 \
\            LEFT JOIN nodes_nodes nn  ON nn.node2_id = n.id        \
\              WHERE                                                \
192
\                n.search @@ (?::tsquery)                           \
193 194
\                AND (n.parent_id = ? OR nn.node1_id = ?)           \
\                AND n.typename  = ?                                \
195
\                ORDER BY n.hyperdata -> 'publication_date' ?       \
196
\            offset ? limit ?;"
197

198 199 200
-- | Text Search Function for Master Corpus
-- TODO : text search for user corpus
-- Example:
201 202 203
-- textSearchTest :: ParentId -> TSQuery -> Cmd err [(Int, Value, Value, Value, Value, Maybe Int)]
-- textSearchTest pId q = textSearch q pId 5 0 Asc
textSearch :: TSQuery -> ParentId
204
           -> Limit -> Offset -> Order
205 206
           -> Cmd err [(Int,Value,Value,Value, Value, Maybe Int)]
textSearch q p l o ord = runPGSQuery textSearchQuery (q,p,p,typeId,ord,o,l)
207 208
  where
    typeId = nodeTypeId NodeDocument
209

210