Commit 93e711b1 authored by Alexandre Delanoë

[FIX][DOC] Indexing issue

parent 439d3771
## Version 0.0.6.8.5.1
* [BACK][FIX] Indexing issue: take all matching terms instead of only the longest one when some ngrams are included in others
* [FRONT][FIX][Disconnection of instance causes a blank page (#464)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/464)
* [BACK][FIX] ArXiv search in Abstracts by default
## Version 0.0.6.8.5 ## Version 0.0.6.8.5
* [BACK][FIX][Ngrams Table, page sort / limit (#149)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/149) * [BACK][FIX][Ngrams Table, page sort / limit (#149)](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/149)
* [FRONT][FIX][Security Issue with Teams (#452)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/452) * [FRONT][FIX][Security Issue with Teams (#452)](https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/452)
......
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.34.7. -- This file has been generated from package.yaml by hpack version 0.35.0.
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
name: gargantext name: gargantext
version: 0.0.6.8.5 version: 0.0.6.8.5.1
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
......
...@@ -6,7 +6,7 @@ name: gargantext ...@@ -6,7 +6,7 @@ name: gargantext
# | | | +----- Layers * : New versions with API additions # | | | +----- Layers * : New versions with API additions
# | | | | +--- Layers * : New versions without API breaking changes # | | | | +--- Layers * : New versions without API breaking changes
# | | | | | # | | | | |
version: '0.0.6.8.5' version: '0.0.6.8.5.1'
synopsis: Search, map, share synopsis: Search, map, share
description: Please see README.md description: Please see README.md
category: Data category: Data
......
...@@ -130,26 +130,23 @@ reIndexWith :: ( HasNodeStory env err m ...@@ -130,26 +130,23 @@ reIndexWith :: ( HasNodeStory env err m
-> Set ListType -> Set ListType
-> m () -> m ()
reIndexWith cId lId nt lts = do reIndexWith cId lId nt lts = do
printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- Getting [NgramsTerm] -- Getting [NgramsTerm]
ts <- List.concat ts <- List.concat
<$> map (\(k,vs) -> k:vs) <$> map (\(k,vs) -> k:vs)
<$> HashMap.toList <$> HashMap.toList
<$> getTermsWith identity [lId] nt lts <$> getTermsWith identity [lId] nt lts
-- printDebug "ts" ts
-- Taking the ngrams with 0 occurrences only (orphans)
-- occs <- getOccByNgramsOnlyFast' cId lId nt ts
-- printDebug "occs" occs
let orphans = ts {- List.concat let orphans = ts {- List.concat
$ map (\t -> case HashMap.lookup t occs of $ map (\t -> case HashMap.lookup t occs of
Nothing -> [t] Nothing -> [t]
Just n -> if n <= 1 then [t] else [ ] Just n -> if n <= 1 then [t] else [ ]
) ts ) ts
-} -}
-- printDebug "orphans" orphans
printDebug "orphans" orphans
-- Get all documents of the corpus -- Get all documents of the corpus
docs <- selectDocNodes cId docs <- selectDocNodes cId
...@@ -171,12 +168,12 @@ reIndexWith cId lId nt lts = do ...@@ -171,12 +168,12 @@ reIndexWith cId lId nt lts = do
(List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]]) (List.cycle [Map.fromList $ [(nt, Map.singleton (doc ^. context_id) 1 )]])
) docs ) docs
-- printDebug "ngramsByDoc" ngramsByDoc printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database -- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
pure () -- ngramsByDoc pure ()
toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams) toIndexedNgrams :: HashMap Text NgramsId -> Text -> Maybe (Indexed Int Ngrams)
toIndexedNgrams m t = Indexed <$> i <*> n toIndexedNgrams m t = Indexed <$> i <*> n
......
...@@ -37,8 +37,11 @@ data Pattern = Pattern ...@@ -37,8 +37,11 @@ data Pattern = Pattern
type Patterns = [Pattern] type Patterns = [Pattern]
------------------------------------------------------------------------ ------------------------------------------------------------------------
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0 data ReplaceTerms = KeepAll | LongestOnly
replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
replaceTerms rplaceTerms pats terms = go 0
where where
terms_len = length terms terms_len = length terms
...@@ -49,15 +52,17 @@ replaceTerms pats terms = go 0 ...@@ -49,15 +52,17 @@ replaceTerms pats terms = go 0
Just (len, term) -> Just (len, term) ->
term : go (ix + len) term : go (ix + len)
m = toMap
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
m =
IntMap.fromListWith merge
[ (ix, (len, term)) [ (ix, (len, term))
| Pattern pat len term <- pats, ix <- KMP.match pat terms ] | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
toMap = case rplaceTerms of
KeepAll -> IntMap.fromList
LongestOnly -> IntMap.fromListWith merge
where
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatterns :: TermList -> Patterns buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where where
...@@ -82,14 +87,14 @@ termsInText pats txt = groupWithCounts ...@@ -82,14 +87,14 @@ termsInText pats txt = groupWithCounts
-------------------------------------------------------------------------- --------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text] extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
-- | Extract terms -- | Extract terms
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"] -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
-- ["chat blanc"] -- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text] extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats) extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence . monoTextsBySentence
-------------------------------------------------------------------------- --------------------------------------------------------------------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment