formatting so no lines do more than 72 clomuns #4

54aa9ba5 · Mael NICOLAS · 04133e20 · 54aa9ba5
Commit 54aa9ba5 authored Oct 04, 2018 by Mael NICOLAS
Show whitespace changes
Inline Side-by-side

Showing with 50 additions and 15 deletions

Wikimedia.hs src/Gargantext/Text/Parsers/Wikimedia.hs +50 -15

No files found.
--- a/src/Gargantext/Text/Parsers/Wikimedia.hs
+++ b/src/Gargantext/Text/Parsers/Wikimedia.hs
+{-|
+Module      : Gargantext.Text.Parsers.WOS
+Description : 
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+@Gargantext.Text.Parsers.Wikimedia@:
+This module provide a parser for wikipedia dump.
+This include an xml parser for wikipedia's xml
+and an wikimedia to plaintext converter for the wikipedia text field
+-}
+
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NoImplicitPrelude #-}

@@ -9,16 +24,20 @@ import Data.Conduit
 import Data.XML.Types (Event, Name)
 import Text.Pandoc
 import Data.Text as T
+import Data.Either

-- | This module provide a parser for wikipedia dump.
-- This include an xml parser for wikipedia's xml and an wikimedia to plaintext converter for the wikipedia text field
 -- | Use case
 -- >>> :{
 --  wikimediaFile <- BL.readFile "text.xml"
--  _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM mediawikiPageToPlain
+--  _ <- runConduit $ parseLBS def wikimediaFile
+--        .| force "mediawiki required" parseMediawiki
+--        .| CL.mapM mediawikiPageToPlain
+--        .| CL.mapM_ print
 -- :}

-- | A simple "Page" type, for the moment it take only text and title (since there is no abstract) will see if other datas are relevant.
+-- | A simple "Page" type.
+-- For the moment it take only text and title
+--  (since there is no abstract) will see if other datas are relevant.
 data Page = Page
  {
    _title :: T.Text
@@ -27,16 +46,23 @@ data Page = Page
  deriving (Show)

 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
-parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
-  text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
-  many_ $ ignoreAnyTreeContent
+parseRevision =
+  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
+  text <-
+    force "text is missing" $ ignoreExcept
+    "{http://www.mediawiki.org/xml/export-0.10/}text" content
+  many_
+    $ ignoreAnyTreeContent
  return text

 tagUntil :: Name -> NameMatcher Name
 tagUntil name = matching (/= name)

-- | Utility function that parse nothing but the tag given, usefull because we have to consume every data.
-ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
+-- | Utility function that parse nothing but the tag given,
+-- usefull because we have to consume every data.
+ignoreExcept :: MonadThrow m => Name
+  -> ConduitT Event o m b
+  -> ConduitT Event o m (Maybe b)
 ignoreExcept name f = do
  _ <- consumeExcept name
  tagIgnoreAttrs (matching (==name)) f
@@ -45,15 +71,22 @@ consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
 consumeExcept = many_ . ignoreTreeContent . tagUntil

 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
-parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
-  title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
-  _ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
-  revision <- force "revision is missing" $ parseRevision
+parsePage =
+  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
+  title <-
+    force "title is missing" $
+    tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
+  _ <-
+    consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
+  revision <-
+    force "revision is missing" $ parseRevision
  many_ $ ignoreAnyTreeContent
  return $ Page title revision

 parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
-parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage
+parseMediawiki =
+  tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
+  $ manyYield' parsePage

 -- | Need to wrap the result in IO to parse and to combine it.
 mediawikiPageToPlain :: Page -> IO Page
@@ -65,4 +98,6 @@ mediawikiPageToPlain page = do
          res <- runIO $ do
            doc <- readMediaWiki def media
            writePlain def doc
-          handleError res
+          case res of
+            (Left _) -> return media
+            (Right r) -> return r