From 5149e7ced7cfb4284e5ea1c13840b559256c12bb Mon Sep 17 00:00:00 2001 From: rloth <romain.loth@iscpif.fr> Date: Tue, 8 Dec 2015 10:00:22 +0100 Subject: [PATCH] <p> dans la stoplist via regex --- ngram/stop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ngram/stop.py b/ngram/stop.py index 26be40d0..cb4e28b3 100644 --- a/ngram/stop.py +++ b/ngram/stop.py @@ -56,6 +56,7 @@ def isStopWord(ngram, stop_words=None): , "(.*)\d(.*)" , "(.*)(\.)(.*)" , "(.*)(\,)(.*)" + , "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes , "(.*)(study)(.*)" , "(.*)(xx|xi|xv)(.*)" , "(.*)(result)(.*)" -- 2.21.0