Commit 58ad4a3e authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Prepare text from Gutenberg

parent 916be24b
Pipeline #3132 failed with stage
in 43 minutes and 35 seconds
......@@ -76,6 +76,7 @@ library
Gargantext.Core.Text.Metrics.TFICF
Gargantext.Core.Text.Metrics.CharByChar
Gargantext.Core.Text.Metrics.Count
Gargantext.Core.Text.Prepare
Gargantext.Core.Text.Search
Gargantext.Core.Text.Terms
Gargantext.Core.Text.Terms.Mono
......
......@@ -100,6 +100,7 @@ library:
- Gargantext.Core.Text.Metrics.TFICF
- Gargantext.Core.Text.Metrics.CharByChar
- Gargantext.Core.Text.Metrics.Count
- Gargantext.Core.Text.Prepare
- Gargantext.Core.Text.Search
- Gargantext.Core.Text.Terms
- Gargantext.Core.Text.Terms.Mono
......
......@@ -20,22 +20,63 @@ that could be the incarnation of the mythic Gargantua.
module Gargantext.Core.Text.Clean
where
import Gargantext.Prelude
import Data.Text (Text)
import qualified Data.Text as Text
import Gargantext.Core.Text (sentences)
import Gargantext.Prelude
import qualified Data.List as List
import qualified Data.Text as Text
-- | Placeholder for line grouping: the only equation matches lists of at
-- least two elements and evaluates to 'undefined'; shorter lists are not
-- matched at all.
--
-- NOTE(review): a complete definition with the same name and signature
-- appears further down in this view; this stub looks like the pre-change
-- side of the diff -- confirm that only one definition is kept.
groupLines :: [Text] -> [Text]
groupLines (a:x:xs) = undefined
-- | Turn a raw text into its list of non-empty paragraphs.
--
-- The pipeline (read bottom-up) strips carriage returns, strips the @--@
-- markers, cuts the text into lines, hands the lines to 'toParagraphs' and
-- finally drops every empty result.
cleanText :: Text -> [Text]
cleanText = List.filter (/= "")
          . toParagraphs
          . Text.lines
          . Text.replace "--" ""   -- remove the "--" bullets that introduce dialogue lines
          . Text.replace "\xd" ""  -- remove carriage returns (code point 0xD)
---------------------------------------------------------------------
-- | Clean a raw text and cut it into blocks according to the given
-- 'Paragraph' strategy.
--
-- The text is normalised (carriage returns and @--@ markers removed,
-- underscores turned into spaces), split into lines, regrouped with
-- 'toParagraphs', stripped of empty entries, and then handed to 'groupText'.
prepareText :: Paragraph -> Text -> [Text]
prepareText p txt = groupText p paragraphs
  where
    -- Remove carriage returns (code point 0xD).
    withoutCR      = Text.replace "\xd" "" txt
    -- Remove the "--" bullets that introduce dialogue lines.
    withoutBullets = Text.replace "--" "" withoutCR
    -- Some texts use underscores as underlining; turn them into spaces.
    normalised     = Text.replace "_" " " withoutBullets
    paragraphs     = List.filter (/= "") (toParagraphs (Text.lines normalised))
---------------------------------------------------------------------
-- | Select the grouping function that corresponds to a 'Paragraph' strategy:
-- 'Uniform' delegates to 'groupUniform' with its grain and step, 'AuthorLike'
-- delegates to 'groupLines'.
groupText :: Paragraph -> [Text] -> [Text]
groupText strategy = case strategy of
  Uniform grain step -> groupUniform grain step
  AuthorLike         -> groupLines
---------------------------------------------------------------------
-- | Strategy used to cut a text into blocks.
--
-- * 'Uniform': does not preserve the paragraphs of the author, but the
--   length of the resulting blocks is uniform.
-- * 'AuthorLike': preserves the paragraphs of the author, so the length of
--   the resulting blocks is not uniform.
--
-- Grain: number of sentences per block of text.
-- Step : overlap, in sentences, between adjacent blocks of text.
data Paragraph = Uniform Grain Step | AuthorLike
-- | Regroup a list of texts into blocks built from whole sentences.
--
-- The input texts are joined into one text, split with 'sentences', and the
-- sentences are chunked by 'chunkAlong' using @g@ and @s@; the sentences of
-- each chunk are then joined with single spaces.
--
-- The inputs are joined with a space rather than concatenated directly:
-- gluing them together would fuse the last word of one text with the first
-- word of the next (e.g. @"end.Start"@) before sentence splitting.
groupUniform :: Grain -> Step -> [Text] -> [Text]
groupUniform g s ts = map Text.unwords
                    $ chunkAlong g s
                    $ sentences
                    $ Text.unwords ts
-- | Merge consecutive short lines into longer ones.
--
-- At each step the first line is compared with a threshold: the rounded
-- 'mean' of the lengths of the lines still to be processed.
--
-- * A first line longer than the threshold is emitted unchanged.
-- * Otherwise it is joined (with a space) to the following line; the merged
--   line is emitted if it exceeds the threshold, or pushed back in front of
--   the remaining lines to be merged further.
--
-- NOTE(review): the threshold is recomputed on every recursive call over the
-- remaining (possibly already merged) lines, not once over the whole input
-- -- confirm this is intended.
groupLines :: [Text] -> [Text]
groupLines []       = []
groupLines [single] = [single]
groupLines remaining@(first:second:rest)
  | Text.length first  > threshold = first  : groupLines (second : rest)
  | Text.length merged > threshold = merged : groupLines rest
  | otherwise                      = groupLines (merged : rest)
  where
    merged = first <> " " <> second

    threshold = round (mean lineLengths)

    lineLengths :: [Double]
    lineLengths = map (fromIntegral . Text.length) remaining
-- | Sample run of 'groupLines' on a small fixed input, for manual inspection.
groupLines_test :: [Text]
groupLines_test = groupLines ["abxxxx", "bc", "cxxx", "d"]
---------------------------------------------------------------------
toParagraphs :: [Text] -> [Text]
toParagraphs (a:x:xs) =
if a == ""
......@@ -46,4 +87,15 @@ toParagraphs (a:x:xs) =
toParagraphs [a] = [a]
toParagraphs [] = []
-- Tests
-- TODO for internships: Property tests
-- | Example-based check of 'toParagraphs': 'True' when the sample lines are
-- regrouped into the expected paragraphs, with the empty separators kept.
toParagraphs_test :: Bool
toParagraphs_test = actual == expected
  where
    actual   = toParagraphs ["a","b","","c","d","d","","e","f","","g","h",""]
    expected = [ "a b", "", "c d d", "", "e f", "", "g h", ""]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment