{-|
Module      : Gargantext.Text.Search
Description : Search engine over documents (title and abstract fields).
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

This search engine was first made to clean a CSV file according to a query.

Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}

module Gargantext.Text.Search where

import Data.SearchEngine

import Data.Ix

-- Useful for stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)

import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Parsers.CSV

-- | Identifier of a document; used as the key type of 'DocSearchEngine'.
type DocId = Int

-- | 'SearchEngine' specialised to this module: documents are 'Doc' values
-- keyed by 'DocId', searchable fields are the 'DocField' constructors, and
-- no extra features are used ('NoFeatures').
type DocSearchEngine =
  SearchEngine Doc DocId DocField NoFeatures

-- | The fields of a document that are indexed and searched.
--
-- Constructor order matters: the derived 'Ord', 'Enum', 'Bounded' and 'Ix'
-- instances all follow it.
data DocField
  = TitleField     -- ^ The document's title.
  | AbstractField  -- ^ The document's abstract.
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)

-- | The search engine as produced by 'initSearchEngine' from
-- 'docSearchConfig' and 'defaultSearchRankParameters'.
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine = initSearchEngine config rankParams
  where
    config     = docSearchConfig
    rankParams = defaultSearchRankParameters

-- | Configuration of the document search engine.
--
-- * documents are keyed by 'd_docId';
-- * the terms of each field are obtained with 'monoTexts' from the title
--   or the abstract of the document;
-- * query tokens are normalised with @ST.stem ST.EN@, identically for
--   every field;
-- * no extra features are attached to documents ('noFeatures').
--
-- NOTE(review): query tokens are stemmed while document terms come straight
-- from 'monoTexts'. Confirm that 'monoTexts' yields tokens comparable with
-- stemmed ones; otherwise a stemmed query may fail to match indexed terms.
docSearchConfig :: SearchConfig Doc DocId DocField NoFeatures
docSearchConfig =
    SearchConfig {
      documentKey           = d_docId,
      extractDocumentTerms  = extractTerms,
      transformQueryTerm    = normaliseQueryToken,
      documentFeatureValue  = const noFeatures
  }
  where
    -- Terms to index for a document, one equation per searchable field.
    extractTerms :: Doc -> DocField -> [Text]
    extractTerms doc TitleField       = monoTexts (d_title doc)
    extractTerms doc AbstractField    = monoTexts (d_abstract doc)

    -- Every field currently uses the same normalisation, so the field
    -- argument is ignored; pattern match on it here if fields ever need
    -- different treatment.
    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok _field = ST.stem ST.EN tok

-- | Default ranking parameters used by 'initialDocSearchEngine'.
--
-- Title matches are weighted higher than abstract matches (20 vs 5), and
-- no feature weights or feature functions are used ('noFeatures').
-- NOTE(review): 'paramK1' / 'paramB' are presumably the usual BM25-style
-- tuning parameters of the search library — confirm against its docs.
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
    SearchRankParameters {
      paramK1                         = k1,
      paramB                          = bFor,
      paramFieldWeights               = weightFor,
      paramFeatureWeights             = noFeatures,
      paramFeatureFunctions           = noFeatures,
      paramResultsetSoftLimit         = 2000,
      paramResultsetHardLimit         = 4000,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    -- Single global K1 value.
    k1 :: Float
    k1 = 1.5

    -- Per-field B value.
    bFor :: DocField -> Float
    bFor field = case field of
      TitleField    -> 0.9
      AbstractField -> 0.5

    -- Per-field weight: a title hit counts four times an abstract hit.
    weightFor :: DocField -> Float
    weightFor field = case field of
      TitleField    -> 20
      AbstractField -> 5