Commit 93e711b1 authored by Alexandre Delanoë

[FIX][DOC] Indexing issue

parent 439d3771
Pipeline #3420 passed with stage
in 92 minutes and 38 seconds
## Version 0.0.6.8.5.1
* [BACK][FIX] Indexing issue: take all matching terms instead of only the longest one when an ngram is included in another
* [FRONT][FIX][Disconnection of instance causes a blank page (#464)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/464)
* [BACK][FIX] ArXiv search in Abstracts by default
## Version 0.0.6.8.5
* [BACK][FIX][Ngrams Table, page sort / limit (#149)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/149)
* [FRONT][FIX][Security Issue with Teams (#452)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/452)
......
cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.34.7.
-- This file has been generated from package.yaml by hpack version 0.35.0.
--
-- see: https://github.com/sol/hpack
name: gargantext
version: 0.0.6.8.5
version: 0.0.6.8.5.1
synopsis: Search, map, share
description: Please see README.md
category: Data
......
......@@ -6,7 +6,7 @@ name: gargantext
# | | | +----- Layers * : New versions with API additions
# | | | | +--- Layers * : New versions without API breaking changes
# | | | | |
version: '0.0.6.8.5'
version: '0.0.6.8.5.1'
synopsis: Search, map, share
description: Please see README.md
category: Data
......
......@@ -130,26 +130,23 @@ reIndexWith :: ( HasNodeStory env err m
-> Set ListType
-> m ()
reIndexWith cId lId nt lts = do
printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- Getting [NgramsTerm]
ts <- List.concat
<$> map (\(k,vs) -> k:vs)
<$> HashMap.toList
<$> getTermsWith identity [lId] nt lts
-- printDebug "ts" ts
-- Taking the ngrams with 0 occurrences only (orphans)
-- occs <- getOccByNgramsOnlyFast' cId lId nt ts
-- printDebug "occs" occs
let orphans = ts {- List.concat
$ map (\t -> case HashMap.lookup t occs of
Nothing -> [t]
Just n -> if n <= 1 then [t] else [ ]
) ts
-}
-- printDebug "orphans" orphans
-}
printDebug "orphans" orphans
-- Get all documents of the corpus
docs <- selectDocNodes cId
......@@ -171,12 +168,12 @@ reIndexWith cId lId nt lts = do
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
) docs
-- printDebug "ngramsByDoc" ngramsByDoc
printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc
pure () -- ngramsByDoc
pure ()
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n
......
......@@ -37,8 +37,11 @@ data Pattern = Pattern
type Patterns = [Pattern]
------------------------------------------------------------------------
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
-- | Strategy selecting how 'replaceTerms' builds its map from match index to
-- @(length, term)@ when several patterns match at the same index:
--
-- * 'KeepAll' builds the map with @IntMap.fromList@.
-- * 'LongestOnly' builds it with @IntMap.fromListWith merge@, where @merge@
--   keeps the entry with the greater length.
--
-- NOTE(review): @IntMap.fromList@ keeps a single value per key (the last one
-- in the list), so 'KeepAll' does not literally retain every match at a given
-- index — confirm this is the intended behaviour.
data ReplaceTerms = KeepAll | LongestOnly
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0
where
terms_len = length terms
......@@ -49,15 +52,17 @@ replaceTerms pats terms = go 0
Just (len, term) ->
term : go (ix + len)
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
m = toMap
[ (ix, (len, term))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ]
toMap = case rplaceTerms of
KeepAll -> IntMap.fromList
LongestOnly -> IntMap.fromListWith merge
where
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
......@@ -82,14 +87,14 @@ termsInText pats txt = groupWithCounts
--------------------------------------------------------------------------
-- | Extract terms from a text using the given patterns: the text is split
-- with 'monoTextsBySentence' and 'replaceTerms' is applied to each resulting
-- element.
--
-- NOTE(review): two bodies follow, one calling @replaceTerms pats@ and one
-- calling @replaceTerms KeepAll pats@; these look like the before/after lines
-- of a change — confirm that only the 'KeepAll' variant is current.
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
-- | Extract terms
--
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence
--------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment