Commit 777cf4cc authored by Alexandre Delanoë

[FIX] clean groups + handling duplicates (TODO inside)

parent e55c6044
Pipeline #1360 failed with stage
......@@ -59,7 +59,7 @@ CREATE TABLE public.ngrams_postag (
lemm_id INTEGER NOT NULL,
score INTEGER DEFAULT 1 ::integer NOT NULL,
FOREIGN KEY (ngrams_id) REFERENCES public.ngrams(id) ON DELETE CASCADE,
FOREIGN KEY (lemm_id) REFERENCES public.ngrams(id) ON DELETE CASCADE
FOREIGN KEY (lemm_id) REFERENCES public.ngrams(id) ON DELETE CASCADE
);
ALTER TABLE public.ngrams_postag OWNER TO gargantua;
......
......@@ -127,6 +127,7 @@ instance (ToJSONKey a, ToSchema a) => ToSchema (MSet a) where
-- | A single ngrams term: a newtype wrapper around its 'Text' form,
-- unwrapped with 'unNgramsTerm'.
newtype NgramsTerm = NgramsTerm { unNgramsTerm :: Text }
deriving (Ord, Eq, Show, Generic, ToJSONKey, ToJSON, FromJSON, Semigroup, Arbitrary, Serialise, ToSchema, Hashable)
-- | Hash a term by hashing the 'Text' it wraps.
instance IsHashable NgramsTerm where
hash (NgramsTerm t) = hash t
......
......@@ -88,8 +88,10 @@ groupWith (GroupParams l _m _n _) t =
-- | This lemmatization group done with CoreNLP algo (or others)
groupWith (GroupWithPosTag _ _ m) t =
case HashMap.lookup (unNgramsTerm t) m of
Nothing -> t
Just t' -> NgramsTerm t'
Nothing -> clean t
Just t' -> clean $ NgramsTerm t'
where
clean (NgramsTerm t) = NgramsTerm $ Text.replace "-" " " t
--------------------------------------------------------------------
stemPatches :: GroupParams
......
......@@ -140,11 +140,12 @@ queryInsertNgramsPostag = [sql|
FROM input_rows ir
JOIN ins_form_ret form ON form.terms = ir.form
JOIN ins_lem_ret lem ON lem.terms = ir.lem
-- GROUP BY ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id
-- GROUP BY ir.lang_id, ir.algo_id, ir.postag, form.id, lem.id
-- ORDER BY s DESC
-- LIMIT 1
ON CONFLICT (lang_id,algo_id,postag,ngrams_id,lemm_id)
DO UPDATE SET score = ngrams_postag.score + 1
DO NOTHING -- acceptable for now since we are using NP mainly
-- DO UPDATE SET score = ngrams_postag.score + 1
)
SELECT terms,id FROM ins_form_ret
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment