Commit 7fc403c9 authored by Yoelis Acourt

fix(cleanTextForNLP): removes transformation for hyphenated words

parent 5bb981be
Pipeline #6585 failed with stages
......@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1.
-}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where
import Data.Attoparsec.Text as DAT ( digit, space, notChar, string )
import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
......@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens
-- TODO: make tests here
-- | Clean a 'Text': strip URL-like tokens first, then collapse whitespace.
-- The pipeline is a function composition, so it runs right to left.
cleanTextForNLP :: Text -> Text
-- Pre-commit pipeline: additionally deleted hyphens (with any adjacent digits)
-- between URL removal and whitespace unification.
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
-- Post-commit pipeline: the hyphen/digit step is dropped.
cleanTextForNLP = unifySpaces . removeUrls
where
-- Replace every match of parser x with the empty string.
-- NOTE(review): RAT is imported outside this excerpt; presumably
-- Replace.Attoparsec.Text from replace-attoparsec - confirm.
remove x = RAT.streamEdit x (const "")
-- Rewrite each match of `many DAT.space` to a single space.
-- NOTE(review): `many` also succeeds on empty input; confirm how streamEdit
-- treats zero-width matches.
unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
-- Pre-commit helper: deletes zero or more digits, the literal x, then zero or
-- more digits. Because both digit runs may be empty, a bare x matches too, so
-- with x = "-" the hyphen inside a hyphenated word was deleted as well.
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
-- Tokens starting with "www" are removed first, then those starting with "http".
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
-- Delete the literal w, every following character that is not ' ', and any
-- whitespace after that. Only a literal space ends the token, so a newline or
-- tab directly after a URL is consumed as part of the match.
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment