......@@ -76,6 +76,7 @@ library
......@@ -100,6 +100,7 @@ library:
- Gargantext.Core.Text.Metrics.TFICF
- Gargantext.Core.Text.Metrics.CharByChar
- Gargantext.Core.Text.Metrics.Count
- Gargantext.Core.Text.Prepare
- Gargantext.Core.Text.Search
- Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Mono
......@@ -20,22 +20,63 @@ that could be the incarnation of the mythic Gargantua.
module Gargantext.Core.Text.Clean
import Gargantext.Prelude
import Data.Text (Text)
import qualified Data.Text as Text
import Gargantext.Core.Text (sentences)
import Gargantext.Prelude
import qualified Data.List as List
import qualified Data.Text as Text
-- | Placeholder for line grouping: only the case of a list with at least two
-- elements is declared, and it evaluates to 'undefined'.
-- NOTE(review): a complete definition of 'groupLines' appears later in this
-- listing; this stub looks like the superseded version — confirm before
-- keeping both.
groupLines :: [Text] -> [Text]
groupLines (a:x:xs) = undefined
-- | Turn a raw text into a list of non-empty paragraphs.
--
-- The text is first stripped of carriage returns and of "--" markers, then
-- split into lines, regrouped with 'toParagraphs', and finally every empty
-- entry is dropped.
cleanText :: Text -> [Text]
cleanText = List.filter (/= "") . toParagraphs . Text.lines . stripNoise
  where
    -- Carriage returns are removed first, then the "--" markers
    -- (used as bullets in dialogues).
    stripNoise = Text.replace "--" "" . Text.replace "\xd" ""
-- | Normalise a raw text and regroup it according to the given 'Paragraph'
-- strategy.
--
-- Stages, in order: remove carriage returns, remove "--" markers, replace
-- underscores with spaces, split into lines, regroup with 'toParagraphs',
-- drop empty entries, and hand the result to 'groupText'.
prepareText :: Paragraph -> Text -> [Text]
prepareText p txt = groupText p paragraphs
  where
    -- Character-level clean-up, applied right to left.
    normalised = Text.replace "_" " "   -- some texts seem to be underlined
               . Text.replace "--" ""   -- bullets used in dialogues
               . Text.replace "\xd" ""  -- carriage returns
               $ txt
    -- Line-level regrouping; empty entries are discarded afterwards.
    paragraphs = List.filter (/= "") . toParagraphs $ Text.lines normalised
-- | Select the grouping function that corresponds to a 'Paragraph' strategy:
-- 'groupUniform' (with its grain and step) for 'Uniform', 'groupLines' for
-- 'AuthorLike'.
groupText :: Paragraph -> [Text] -> [Text]
groupText strategy = case strategy of
  Uniform g s -> groupUniform g s
  AuthorLike  -> groupLines
-- | Strategy used by 'groupText' to regroup a text into blocks.
data Paragraph = Uniform Grain Step | AuthorLike
-- Uniform: does not preserve the author's paragraphs, but the resulting blocks have a uniform length.
-- AuthorLike: preserves the author's paragraphs, so the block lengths are not uniform.
-- Grain: number of sentences per block of text.
-- Step: overlap, in sentences, between adjacent blocks of text
--       (NOTE(review): the value is passed straight to chunkAlong — confirm there that it is an overlap and not a stride).
-- | Regroup a text into blocks of sentences.
--
-- The input pieces are concatenated, split with 'sentences', cut into
-- blocks by @chunkAlong g s@, and the sentences of each block are joined
-- with a single space.
groupUniform :: Grain -> Step -> [Text] -> [Text]
groupUniform g s ts = map joinBlock blocks
  where
    joinBlock = Text.intercalate " "
    blocks    = chunkAlong g s . sentences $ Text.concat ts
-- | Merge consecutive short lines into longer blocks.
--
-- At each step the threshold @moyenne@ is the rounded mean length of the
-- lines still to be processed (it is recomputed on every recursive call,
-- including after a merge). Then:
--
-- * if the first line is longer than the threshold it is emitted as is;
-- * otherwise it is joined to the next line with a single space; the merged
--   line is emitted if it exceeds the threshold, or put back at the head of
--   the list to be merged further.
--
-- A single remaining line is returned unchanged and the empty list yields
-- the empty list.
groupLines :: [Text] -> [Text]
groupLines xxx@(a:b:xs) =
  if Text.length a > moyenne
    then a : groupLines (b:xs)
    else if Text.length ab > moyenne
           then ab : groupLines xs
           else groupLines (ab : xs)
  where
    -- The first two lines glued together with a separating space.
    ab = a <> " " <> b
    -- Rounded average length of the lines passed to this call.
    moyenne = round
            $ mean
            $ (map (fromIntegral . Text.length) xxx :: [Double])
groupLines [a] = [a]
groupLines []  = []
-- | Sample run of 'groupLines' over 'theData'. No expected value is
-- asserted here; the result is meant to be inspected by hand.
groupLines_test :: [Text]
groupLines_test = groupLines theData

-- | Small fixture mixing long and short lines, used by 'groupLines_test'.
theData :: [Text]
theData = ["abxxxx", "bc", "cxxx", "d"]
toParagraphs :: [Text] -> [Text]
toParagraphs (a:x:xs) =
if a == ""
......@@ -46,4 +87,15 @@ toParagraphs (a:x:xs) =
toParagraphs [a] = [a]
toParagraphs [] = []
-- Tests
-- TODO (internship task): add property tests
-- | Example-based check of 'toParagraphs': consecutive non-empty lines are
-- expected to be joined with a space, while empty lines are kept as
-- separators.
toParagraphs_test :: Bool
toParagraphs_test = actual == expected
  where
    actual   = toParagraphs ["a","b","","c","d","d","","e","f","","g","h",""]
    expected = [ "a b", "", "c d d", "", "e f", "", "g h", ""]
