Commit e53d4b86 authored by Yoelis Acourt

fix(cleanTextForNLP): removes transformation for hyphenated words

parent 4e21f839
Pipeline #6580 failed with stages
in 13 minutes and 24 seconds
......@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1.
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
import Data.Attoparsec.Text as DAT ( digit, space, notChar, string )
import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
......@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens
-- TODO: make tests here
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
cleanTextForNLP = unifySpaces . removeUrls
remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment