Search.hs 2.62 KB
Newer Older
1
{-|
2
Module      : Gargantext.Core.Text.Search
3 4 5 6 7 8 9 10 11 12 13 14 15
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

This search Engine is first made to clean CSV file according to a query.

Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}

16
module Gargantext.Core.Text.Search where
17 18 19 20 21 22 23 24 25 26 27

import Data.SearchEngine

import Data.Ix

-- Usefull to use stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)

import Gargantext.Prelude
28 29 30
import Gargantext.Core.Text.Terms.Mono (monoTexts)
import Gargantext.Core.Text.Terms.Mono.Stem as ST
import Gargantext.Core.Text.Corpus.Parsers.CSV
31 32 33 34

type DocId = Int

type DocSearchEngine = SearchEngine
35
                         CsvGargV3
36 37 38 39 40 41 42 43 44 45 46 47
                         DocId
                         DocField
                         NoFeatures

data DocField = TitleField
              | AbstractField
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)

initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
    initSearchEngine docSearchConfig defaultSearchRankParameters

48
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
49 50 51
docSearchConfig =
    SearchConfig {
      documentKey           = d_docId,
52
      extractDocumentTerms  = extractTerms,
53 54 55 56
      transformQueryTerm    = normaliseQueryToken,
      documentFeatureValue  = const noFeatures
  }
  where
57
    extractTerms :: CsvGargV3 -> DocField -> [Text]
58 59
    extractTerms doc TitleField       = monoTexts (d_title doc)
    extractTerms doc AbstractField    = monoTexts (d_abstract doc)
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92

    normaliseQueryToken :: Text -> DocField -> Text
    normaliseQueryToken tok =
      let tokStem = ST.stem ST.EN
       in \field -> case field of
                      TitleField    -> tokStem tok
                      AbstractField -> tokStem tok

defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
    SearchRankParameters {
      paramK1,
      paramB,
      paramFieldWeights,
      paramFeatureWeights     = noFeatures,
      paramFeatureFunctions   = noFeatures,
      paramResultsetSoftLimit = 2000,
      paramResultsetHardLimit = 4000,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    paramK1 :: Float
    paramK1 = 1.5

    paramB :: DocField -> Float
    paramB TitleField      = 0.9
    paramB AbstractField   = 0.5

    paramFieldWeights :: DocField -> Float
    paramFieldWeights TitleField    = 20
    paramFieldWeights AbstractField = 5