1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
{-|
Module : Gargantext.Text.Search
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This search Engine is first made to clean CSV file according to a query.
Starting from this model, a specific Gargantext engine will be made
(using more metrics scores/features).
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings, NamedFieldPuns #-}
module Gargantext.Text.Search where
import Data.SearchEngine
import Data.Ix
-- Usefull to use stopwords
-- import Data.Set (Set)
-- import qualified Data.Set as Set
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Text.Terms.Mono (monoTexts)
import Gargantext.Text.Terms.Mono.Stem as ST
import Gargantext.Text.Corpus.Parsers.CSV
type DocId = Int
type DocSearchEngine = SearchEngine
CsvGargV3
DocId
DocField
NoFeatures
data DocField = TitleField
| AbstractField
deriving (Eq, Ord, Enum, Bounded, Ix, Show)
initialDocSearchEngine :: DocSearchEngine
initialDocSearchEngine =
initSearchEngine docSearchConfig defaultSearchRankParameters
docSearchConfig :: SearchConfig CsvGargV3 DocId DocField NoFeatures
docSearchConfig =
SearchConfig {
documentKey = d_docId,
extractDocumentTerms = extractTerms,
transformQueryTerm = normaliseQueryToken,
documentFeatureValue = const noFeatures
}
where
extractTerms :: CsvGargV3 -> DocField -> [Text]
extractTerms doc TitleField = monoTexts (d_title doc)
extractTerms doc AbstractField = monoTexts (d_abstract doc)
normaliseQueryToken :: Text -> DocField -> Text
normaliseQueryToken tok =
let tokStem = ST.stem ST.EN
in \field -> case field of
TitleField -> tokStem tok
AbstractField -> tokStem tok
defaultSearchRankParameters :: SearchRankParameters DocField NoFeatures
defaultSearchRankParameters =
SearchRankParameters {
paramK1,
paramB,
paramFieldWeights,
paramFeatureWeights = noFeatures,
paramFeatureFunctions = noFeatures,
paramResultsetSoftLimit = 2000,
paramResultsetHardLimit = 4000,
paramAutosuggestPrefilterLimit = 500,
paramAutosuggestPostfilterLimit = 500
}
where
paramK1 :: Float
paramK1 = 1.5
paramB :: DocField -> Float
paramB TitleField = 0.9
paramB AbstractField = 0.5
paramFieldWeights :: DocField -> Float
paramFieldWeights TitleField = 20
paramFieldWeights AbstractField = 5