Commit e53d4b86 authored by Yoelis Acourt

fix(cleanTextForNLP): removes transformation for hyphenated words

parent 4e21f839
...@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1. ...@@ -11,10 +11,10 @@ Multi-terms are ngrams where n > 1.
-} -}
module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP) module Gargantext.Core.Text.Terms.Multi (multiterms, Terms(..), tokenTag2terms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
where where
import Data.Attoparsec.Text as DAT ( digit, space, notChar, string ) import Data.Attoparsec.Text as DAT (space, notChar, string )
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..)) import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En import Gargantext.Core.Text.Terms.Multi.Lang.En qualified as En
import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr import Gargantext.Core.Text.Terms.Multi.Lang.Fr qualified as Fr
...@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens ...@@ -82,12 +82,10 @@ groupTokens _ = Fr.groupTokens
-- TODO: make tests here -- TODO: make tests here
cleanTextForNLP :: Text -> Text cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls cleanTextForNLP = unifySpaces . removeUrls
where where
remove x = RAT.streamEdit x (const "") remove x = RAT.streamEdit x (const "")
unifySpaces = RAT.streamEdit (many DAT.space) (const " ") unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
removeUrls = removeUrlsWith "http" . removeUrlsWith "www" removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space) removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment